Diffstat (limited to 'arch/x86/kernel')
225 files changed, 97774 insertions, 0 deletions
diff --git a/arch/x86/kernel/.gitignore b/arch/x86/kernel/.gitignore
new file mode 100644
index 00000000000..08f4fd73146
--- /dev/null
+++ b/arch/x86/kernel/.gitignore
@@ -0,0 +1,3 @@
+vsyscall.lds
+vsyscall_32.lds
+vmlinux.lds
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
new file mode 100644
index 00000000000..047f9ff2e36
--- /dev/null
+++ b/arch/x86/kernel/Makefile
@@ -0,0 +1,120 @@
+#
+# Makefile for the linux kernel.
+#
+
+extra-y := head_$(BITS).o head$(BITS).o head.o vmlinux.lds
+
+CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
+
+ifdef CONFIG_FUNCTION_TRACER
+# Do not profile debug and lowlevel utilities
+CFLAGS_REMOVE_tsc.o = -pg
+CFLAGS_REMOVE_paravirt-spinlocks.o = -pg
+CFLAGS_REMOVE_pvclock.o = -pg
+CFLAGS_REMOVE_kvmclock.o = -pg
+CFLAGS_REMOVE_ftrace.o = -pg
+CFLAGS_REMOVE_early_printk.o = -pg
+endif
+
+CFLAGS_irq.o := -I$(src)/../include/asm/trace
+
+obj-y := process_$(BITS).o signal.o entry_$(BITS).o
+obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
+obj-y += time.o ioport.o ldt.o dumpstack.o nmi.o
+obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o
+obj-$(CONFIG_IRQ_WORK) += irq_work.o
+obj-y += probe_roms.o
+obj-$(CONFIG_X86_32) += i386_ksyms_32.o
+obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
+obj-$(CONFIG_X86_64) += mcount_64.o
+obj-y += syscall_$(BITS).o vsyscall_gtod.o
+obj-$(CONFIG_X86_64) += vsyscall_64.o
+obj-$(CONFIG_X86_64) += vsyscall_emu_64.o
+obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o
+obj-$(CONFIG_SYSFS) += ksysfs.o
+obj-y += bootflag.o e820.o
+obj-y += pci-dma.o quirks.o topology.o kdebugfs.o
+obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o
+obj-y += tsc.o tsc_msr.o io_delay.o rtc.o
+obj-y += pci-iommu_table.o
+obj-y += resource.o
+
+obj-$(CONFIG_PREEMPT) += preempt.o
+
+obj-y += process.o
+obj-y += i387.o xsave.o
+obj-y += ptrace.o
+obj-$(CONFIG_X86_32) += tls.o
+obj-$(CONFIG_IA32_EMULATION) += tls.o
+obj-y += step.o
+obj-$(CONFIG_INTEL_TXT) += tboot.o
+obj-$(CONFIG_ISA_DMA_API) += i8237.o
+obj-$(CONFIG_STACKTRACE) += stacktrace.o
+obj-y += cpu/
+obj-y += acpi/
+obj-y += reboot.o
+obj-$(CONFIG_X86_MSR) += msr.o
+obj-$(CONFIG_X86_CPUID) += cpuid.o
+obj-$(CONFIG_PCI) += early-quirks.o
+apm-y := apm_32.o
+obj-$(CONFIG_APM) += apm.o
+obj-$(CONFIG_SMP) += smp.o
+obj-$(CONFIG_SMP) += smpboot.o
+obj-$(CONFIG_SMP) += tsc_sync.o
+obj-$(CONFIG_SMP) += setup_percpu.o
+obj-$(CONFIG_X86_MPPARSE) += mpparse.o
+obj-y += apic/
+obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
+obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
+obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
+obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o
+obj-$(CONFIG_X86_TSC) += trace_clock.o
+obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o
+obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
+obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
+obj-y += kprobes/
+obj-$(CONFIG_MODULES) += module.o
+obj-$(CONFIG_DOUBLEFAULT) += doublefault.o
+obj-$(CONFIG_KGDB) += kgdb.o
+obj-$(CONFIG_VM86) += vm86_32.o
+obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
+
+obj-$(CONFIG_HPET_TIMER) += hpet.o
+obj-$(CONFIG_APB_TIMER) += apb_timer.o
+
+obj-$(CONFIG_AMD_NB) += amd_nb.o
+obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o
+obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o
+obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o
+
+obj-$(CONFIG_KVM_GUEST) += kvm.o kvmclock.o
+obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
+obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o
+obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
+
+obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o
+
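[Annotation, not part of the commit: the list above (continued below) is kbuild's object-list idiom. "obj-y" entries are always linked into vmlinux, while "obj-$(CONFIG_FOO)" expands to "obj-y", "obj-m", or nothing depending on the Kconfig symbol, so one line both declares an object and makes it conditional. A minimal sketch of the same pattern, using a hypothetical driver and config symbol:

	# Hypothetical kbuild fragment, for illustration only
	obj-y               += core.o        # always built into vmlinux
	obj-$(CONFIG_MYDRV) += mydrv.o       # only if CONFIG_MYDRV is y (or m)
	mydrv-y             := mydrv_main.o mydrv_hw.o   # composite object
]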
+obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
+
+obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o
+obj-$(CONFIG_OF) += devicetree.o
+obj-$(CONFIG_UPROBES) += uprobes.o
+obj-y += sysfb.o
+obj-$(CONFIG_X86_SYSFB) += sysfb_simplefb.o
+obj-$(CONFIG_EFI) += sysfb_efi.o
+
+obj-$(CONFIG_PERF_EVENTS) += perf_regs.o
+obj-$(CONFIG_TRACING) += tracepoint.o
+obj-$(CONFIG_IOSF_MBI) += iosf_mbi.o
+
+###
+# 64 bit specific files
+ifeq ($(CONFIG_X86_64),y)
+	obj-$(CONFIG_AUDIT) += audit_64.o
+
+	obj-$(CONFIG_GART_IOMMU) += amd_gart_64.o aperture_64.o
+	obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o
+
+	obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o
+	obj-y += vsmp_64.o
+endif
diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile
new file mode 100644
index 00000000000..163b2258147
--- /dev/null
+++ b/arch/x86/kernel/acpi/Makefile
@@ -0,0 +1,7 @@
+obj-$(CONFIG_ACPI) += boot.o
+obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o
+
+ifneq ($(CONFIG_ACPI_PROCESSOR),)
+obj-y += cstate.o
+endif
+
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
new file mode 100644
index 00000000000..86281ffb96d
--- /dev/null
+++ b/arch/x86/kernel/acpi/boot.c
@@ -0,0 +1,1656 @@
+/*
+ * boot.c - Architecture-Specific Low-Level ACPI Boot Support
+ *
+ * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
+ * Copyright (C) 2001 Jun Nakajima <jun.nakajima@intel.com>
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + */ + +#include <linux/init.h> +#include <linux/acpi.h> +#include <linux/acpi_pmtmr.h> +#include <linux/efi.h> +#include <linux/cpumask.h> +#include <linux/module.h> +#include <linux/dmi.h> +#include <linux/irq.h> +#include <linux/slab.h> +#include <linux/bootmem.h> +#include <linux/ioport.h> +#include <linux/pci.h> + +#include <asm/pci_x86.h> +#include <asm/pgtable.h> +#include <asm/io_apic.h> +#include <asm/apic.h> +#include <asm/io.h> +#include <asm/mpspec.h> +#include <asm/smp.h> + +#include "sleep.h" /* To include x86_acpi_suspend_lowlevel */ +static int __initdata acpi_force = 0; +int acpi_disabled; +EXPORT_SYMBOL(acpi_disabled); + +#ifdef CONFIG_X86_64 +# include <asm/proto.h> +#endif /* X86 */ + +#define PREFIX "ACPI: " + +int acpi_noirq; /* skip ACPI IRQ initialization */ +int acpi_pci_disabled; /* skip ACPI PCI scan and IRQ initialization */ +EXPORT_SYMBOL(acpi_pci_disabled); + +int acpi_lapic; +int acpi_ioapic; +int acpi_strict; +int acpi_disable_cmcff; + +u8 acpi_sci_flags __initdata; +int acpi_sci_override_gsi __initdata; +int acpi_skip_timer_override __initdata; +int acpi_use_timer_override __initdata; +int acpi_fix_pin2_polarity __initdata; + +#ifdef CONFIG_X86_LOCAL_APIC +static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; +#endif + +#ifndef __HAVE_ARCH_CMPXCHG +#warning ACPI uses CMPXCHG, i486 and later hardware +#endif + +/* -------------------------------------------------------------------------- + Boot-time Configuration + -------------------------------------------------------------------------- */ + +/* + * The default interrupt routing model is PIC (8259). This gets + * overridden if IOAPICs are enumerated (below). + */ +enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_PIC; + + +/* + * ISA irqs by default are the first 16 gsis but can be + * any gsi as specified by an interrupt source override. + */ +static u32 isa_irq_to_gsi[NR_IRQS_LEGACY] __read_mostly = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +}; + +static unsigned int gsi_to_irq(unsigned int gsi) +{ + unsigned int irq = gsi + NR_IRQS_LEGACY; + unsigned int i; + + for (i = 0; i < NR_IRQS_LEGACY; i++) { + if (isa_irq_to_gsi[i] == gsi) { + return i; + } + } + + /* Provide an identity mapping of gsi == irq + * except on truly weird platforms that have + * non isa irqs in the first 16 gsis. + */ + if (gsi >= NR_IRQS_LEGACY) + irq = gsi; + else + irq = gsi_top + gsi; + + return irq; +} + +static u32 irq_to_gsi(int irq) +{ + unsigned int gsi; + + if (irq < NR_IRQS_LEGACY) + gsi = isa_irq_to_gsi[irq]; + else if (irq < gsi_top) + gsi = irq; + else if (irq < (gsi_top + NR_IRQS_LEGACY)) + gsi = irq - gsi_top; + else + gsi = 0xffffffff; + + return gsi; +} + +/* + * This is just a simple wrapper around early_ioremap(), + * with sanity checks for phys == 0 and size == 0. 
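[Annotation, not part of the commit: __acpi_map_table() and __acpi_unmap_table() above are the arch hooks the ACPI core uses to inspect firmware tables before the normal ioremap machinery is available. A minimal sketch of the calling pattern they support, with a hypothetical caller:

	/* Illustrative only: map a table header, read it, unmap it again */
	static int __init peek_table(unsigned long phys)
	{
		struct acpi_table_header *hdr;

		hdr = (struct acpi_table_header *)__acpi_map_table(phys, sizeof(*hdr));
		if (!hdr)
			return -ENOMEM;
		printk(KERN_DEBUG "table %.4s, length %u\n", hdr->signature, hdr->length);
		__acpi_unmap_table((char *)hdr, sizeof(*hdr));
		return 0;
	}
]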
+ */ +char *__init __acpi_map_table(unsigned long phys, unsigned long size) +{ + + if (!phys || !size) + return NULL; + + return early_ioremap(phys, size); +} + +void __init __acpi_unmap_table(char *map, unsigned long size) +{ + if (!map || !size) + return; + + early_iounmap(map, size); +} + +#ifdef CONFIG_X86_LOCAL_APIC +static int __init acpi_parse_madt(struct acpi_table_header *table) +{ + struct acpi_table_madt *madt = NULL; + + if (!cpu_has_apic) + return -EINVAL; + + madt = (struct acpi_table_madt *)table; + if (!madt) { + printk(KERN_WARNING PREFIX "Unable to map MADT\n"); + return -ENODEV; + } + + if (madt->address) { + acpi_lapic_addr = (u64) madt->address; + + printk(KERN_DEBUG PREFIX "Local APIC address 0x%08x\n", + madt->address); + } + + default_acpi_madt_oem_check(madt->header.oem_id, + madt->header.oem_table_id); + + return 0; +} + +/** + * acpi_register_lapic - register a local apic and generates a logic cpu number + * @id: local apic id to register + * @enabled: this cpu is enabled or not + * + * Returns the logic cpu number which maps to the local apic + */ +static int acpi_register_lapic(int id, u8 enabled) +{ + unsigned int ver = 0; + + if (id >= MAX_LOCAL_APIC) { + printk(KERN_INFO PREFIX "skipped apicid that is too big\n"); + return -EINVAL; + } + + if (!enabled) { + ++disabled_cpus; + return -EINVAL; + } + + if (boot_cpu_physical_apicid != -1U) + ver = apic_version[boot_cpu_physical_apicid]; + + return generic_processor_info(id, ver); +} + +static int __init +acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end) +{ + struct acpi_madt_local_x2apic *processor = NULL; + int apic_id; + u8 enabled; + + processor = (struct acpi_madt_local_x2apic *)header; + + if (BAD_MADT_ENTRY(processor, end)) + return -EINVAL; + + acpi_table_print_madt_entry(header); + + apic_id = processor->local_apic_id; + enabled = processor->lapic_flags & ACPI_MADT_ENABLED; +#ifdef CONFIG_X86_X2APIC + /* + * We need to register disabled CPU as well to permit + * counting disabled CPUs. This allows us to size + * cpus_possible_map more accurately, to permit + * to not preallocating memory for all NR_CPUS + * when we use CPU hotplug. + */ + if (!apic->apic_id_valid(apic_id) && enabled) + printk(KERN_WARNING PREFIX "x2apic entry ignored\n"); + else + acpi_register_lapic(apic_id, enabled); +#else + printk(KERN_WARNING PREFIX "x2apic entry ignored\n"); +#endif + + return 0; +} + +static int __init +acpi_parse_lapic(struct acpi_subtable_header * header, const unsigned long end) +{ + struct acpi_madt_local_apic *processor = NULL; + + processor = (struct acpi_madt_local_apic *)header; + + if (BAD_MADT_ENTRY(processor, end)) + return -EINVAL; + + acpi_table_print_madt_entry(header); + + /* + * We need to register disabled CPU as well to permit + * counting disabled CPUs. This allows us to size + * cpus_possible_map more accurately, to permit + * to not preallocating memory for all NR_CPUS + * when we use CPU hotplug. 
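[Annotation, not part of the commit: the LAPIC, x2APIC and SAPIC parsers all funnel into acpi_register_lapic() above, which turns the MADT "enabled" flag into either a logical CPU number or a bump of disabled_cpus. The resulting flow, sketched with a hypothetical entry:

	/* Illustrative only: a MADT entry with APIC id 4 */
	int cpu = acpi_register_lapic(4, lapic_flags & ACPI_MADT_ENABLED);
	if (cpu < 0) {
		/* disabled entry (counted in disabled_cpus) or id >= MAX_LOCAL_APIC */
	} else {
		/* cpu is the logical id handed out by generic_processor_info() */
	}
]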
+ */ + acpi_register_lapic(processor->id, /* APIC ID */ + processor->lapic_flags & ACPI_MADT_ENABLED); + + return 0; +} + +static int __init +acpi_parse_sapic(struct acpi_subtable_header *header, const unsigned long end) +{ + struct acpi_madt_local_sapic *processor = NULL; + + processor = (struct acpi_madt_local_sapic *)header; + + if (BAD_MADT_ENTRY(processor, end)) + return -EINVAL; + + acpi_table_print_madt_entry(header); + + acpi_register_lapic((processor->id << 8) | processor->eid,/* APIC ID */ + processor->lapic_flags & ACPI_MADT_ENABLED); + + return 0; +} + +static int __init +acpi_parse_lapic_addr_ovr(struct acpi_subtable_header * header, + const unsigned long end) +{ + struct acpi_madt_local_apic_override *lapic_addr_ovr = NULL; + + lapic_addr_ovr = (struct acpi_madt_local_apic_override *)header; + + if (BAD_MADT_ENTRY(lapic_addr_ovr, end)) + return -EINVAL; + + acpi_lapic_addr = lapic_addr_ovr->address; + + return 0; +} + +static int __init +acpi_parse_x2apic_nmi(struct acpi_subtable_header *header, + const unsigned long end) +{ + struct acpi_madt_local_x2apic_nmi *x2apic_nmi = NULL; + + x2apic_nmi = (struct acpi_madt_local_x2apic_nmi *)header; + + if (BAD_MADT_ENTRY(x2apic_nmi, end)) + return -EINVAL; + + acpi_table_print_madt_entry(header); + + if (x2apic_nmi->lint != 1) + printk(KERN_WARNING PREFIX "NMI not connected to LINT 1!\n"); + + return 0; +} + +static int __init +acpi_parse_lapic_nmi(struct acpi_subtable_header * header, const unsigned long end) +{ + struct acpi_madt_local_apic_nmi *lapic_nmi = NULL; + + lapic_nmi = (struct acpi_madt_local_apic_nmi *)header; + + if (BAD_MADT_ENTRY(lapic_nmi, end)) + return -EINVAL; + + acpi_table_print_madt_entry(header); + + if (lapic_nmi->lint != 1) + printk(KERN_WARNING PREFIX "NMI not connected to LINT 1!\n"); + + return 0; +} + +#endif /*CONFIG_X86_LOCAL_APIC */ + +#ifdef CONFIG_X86_IO_APIC + +static int __init +acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end) +{ + struct acpi_madt_io_apic *ioapic = NULL; + + ioapic = (struct acpi_madt_io_apic *)header; + + if (BAD_MADT_ENTRY(ioapic, end)) + return -EINVAL; + + acpi_table_print_madt_entry(header); + + mp_register_ioapic(ioapic->id, + ioapic->address, ioapic->global_irq_base); + + return 0; +} + +/* + * Parse Interrupt Source Override for the ACPI SCI + */ +static void __init acpi_sci_ioapic_setup(u8 bus_irq, u16 polarity, u16 trigger, u32 gsi) +{ + if (trigger == 0) /* compatible SCI trigger is level */ + trigger = 3; + + if (polarity == 0) /* compatible SCI polarity is low */ + polarity = 3; + + /* Command-line over-ride via acpi_sci= */ + if (acpi_sci_flags & ACPI_MADT_TRIGGER_MASK) + trigger = (acpi_sci_flags & ACPI_MADT_TRIGGER_MASK) >> 2; + + if (acpi_sci_flags & ACPI_MADT_POLARITY_MASK) + polarity = acpi_sci_flags & ACPI_MADT_POLARITY_MASK; + + /* + * mp_config_acpi_legacy_irqs() already setup IRQs < 16 + * If GSI is < 16, this will update its flags, + * else it will create a new mp_irqs[] entry. 
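[Annotation, not part of the commit: acpi_sci_ioapic_setup() above first maps the MADT "conforms" encoding (0) onto the SCI compatibility defaults, level trigger (3) and low polarity (3), then lets an acpi_sci= boot override replace either value. Worked through with hypothetical firmware input:

	/* Illustrative only: firmware flags say "conforms", user booted with acpi_sci=edge */
	trigger = 0; polarity = 0;	/* raw MADT flags                            */
	/* defaults applied:  trigger = 3 (level), polarity = 3 (low)               */
	/* acpi_sci_flags carries ACPI_MADT_TRIGGER_EDGE, so trigger becomes 1      */
]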
+ */ + mp_override_legacy_irq(bus_irq, polarity, trigger, gsi); + + /* + * stash over-ride to indicate we've been here + * and for later update of acpi_gbl_FADT + */ + acpi_sci_override_gsi = gsi; + return; +} + +static int __init +acpi_parse_int_src_ovr(struct acpi_subtable_header * header, + const unsigned long end) +{ + struct acpi_madt_interrupt_override *intsrc = NULL; + + intsrc = (struct acpi_madt_interrupt_override *)header; + + if (BAD_MADT_ENTRY(intsrc, end)) + return -EINVAL; + + acpi_table_print_madt_entry(header); + + if (intsrc->source_irq == acpi_gbl_FADT.sci_interrupt) { + acpi_sci_ioapic_setup(intsrc->source_irq, + intsrc->inti_flags & ACPI_MADT_POLARITY_MASK, + (intsrc->inti_flags & ACPI_MADT_TRIGGER_MASK) >> 2, + intsrc->global_irq); + return 0; + } + + if (intsrc->source_irq == 0) { + if (acpi_skip_timer_override) { + printk(PREFIX "BIOS IRQ0 override ignored.\n"); + return 0; + } + + if ((intsrc->global_irq == 2) && acpi_fix_pin2_polarity + && (intsrc->inti_flags & ACPI_MADT_POLARITY_MASK)) { + intsrc->inti_flags &= ~ACPI_MADT_POLARITY_MASK; + printk(PREFIX "BIOS IRQ0 pin2 override: forcing polarity to high active.\n"); + } + } + + mp_override_legacy_irq(intsrc->source_irq, + intsrc->inti_flags & ACPI_MADT_POLARITY_MASK, + (intsrc->inti_flags & ACPI_MADT_TRIGGER_MASK) >> 2, + intsrc->global_irq); + + return 0; +} + +static int __init +acpi_parse_nmi_src(struct acpi_subtable_header * header, const unsigned long end) +{ + struct acpi_madt_nmi_source *nmi_src = NULL; + + nmi_src = (struct acpi_madt_nmi_source *)header; + + if (BAD_MADT_ENTRY(nmi_src, end)) + return -EINVAL; + + acpi_table_print_madt_entry(header); + + /* TBD: Support nimsrc entries? */ + + return 0; +} + +#endif /* CONFIG_X86_IO_APIC */ + +/* + * acpi_pic_sci_set_trigger() + * + * use ELCR to set PIC-mode trigger type for SCI + * + * If a PIC-mode SCI is not recognized or gives spurious IRQ7's + * it may require Edge Trigger -- use "acpi_sci=edge" + * + * Port 0x4d0-4d1 are ECLR1 and ECLR2, the Edge/Level Control Registers + * for the 8259 PIC. bit[n] = 1 means irq[n] is Level, otherwise Edge. + * ECLR1 is IRQs 0-7 (IRQ 0, 1, 2 must be 0) + * ECLR2 is IRQs 8-15 (IRQ 8, 13 must be 0) + */ + +void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger) +{ + unsigned int mask = 1 << irq; + unsigned int old, new; + + /* Real old ELCR mask */ + old = inb(0x4d0) | (inb(0x4d1) << 8); + + /* + * If we use ACPI to set PCI IRQs, then we should clear ELCR + * since we will set it correctly as we enable the PCI irq + * routing. + */ + new = acpi_noirq ? old : 0; + + /* + * Update SCI information in the ELCR, it isn't in the PCI + * routing tables.. + */ + switch (trigger) { + case 1: /* Edge - clear */ + new &= ~mask; + break; + case 3: /* Level - set */ + new |= mask; + break; + } + + if (old == new) + return; + + printk(PREFIX "setting ELCR to %04x (from %04x)\n", new, old); + outb(new, 0x4d0); + outb(new >> 8, 0x4d1); +} + +int acpi_gsi_to_irq(u32 gsi, unsigned int *irq) +{ + *irq = gsi_to_irq(gsi); + +#ifdef CONFIG_X86_IO_APIC + if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) + setup_IO_APIC_irq_extra(gsi); +#endif + + return 0; +} +EXPORT_SYMBOL_GPL(acpi_gsi_to_irq); + +int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi) +{ + if (isa_irq >= 16) + return -1; + *gsi = irq_to_gsi(isa_irq); + return 0; +} + +static int acpi_register_gsi_pic(struct device *dev, u32 gsi, + int trigger, int polarity) +{ +#ifdef CONFIG_PCI + /* + * Make sure all (legacy) PCI IRQs are set as level-triggered. 
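[Annotation, not part of the commit: in PIC mode, "registering" a GSI only means keeping the ELCR (I/O ports 0x4d0/0x4d1, one bit per IRQ, 1 = level) consistent with the ACPI trigger mode, which is what eisa_set_level_irq() below and acpi_pic_sci_set_trigger() above do. The underlying bit manipulation, shown for a hypothetical IRQ:

	/* Illustrative only: mark IRQ 9 level-triggered in the ELCR */
	unsigned int irq = 9, mask = 1 << irq;
	unsigned int elcr = inb(0x4d0) | (inb(0x4d1) << 8);

	elcr |= mask;			/* bit set => level, clear => edge */
	outb(elcr & 0xff, 0x4d0);	/* IRQs 0-7  */
	outb(elcr >> 8, 0x4d1);		/* IRQs 8-15 */
]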
+ */ + if (trigger == ACPI_LEVEL_SENSITIVE) + eisa_set_level_irq(gsi); +#endif + + return gsi; +} + +static int acpi_register_gsi_ioapic(struct device *dev, u32 gsi, + int trigger, int polarity) +{ +#ifdef CONFIG_X86_IO_APIC + gsi = mp_register_gsi(dev, gsi, trigger, polarity); +#endif + + return gsi; +} + +int (*__acpi_register_gsi)(struct device *dev, u32 gsi, + int trigger, int polarity) = acpi_register_gsi_pic; + +#ifdef CONFIG_ACPI_SLEEP +int (*acpi_suspend_lowlevel)(void) = x86_acpi_suspend_lowlevel; +#else +int (*acpi_suspend_lowlevel)(void); +#endif + +/* + * success: return IRQ number (>=0) + * failure: return < 0 + */ +int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) +{ + unsigned int irq; + unsigned int plat_gsi = gsi; + + plat_gsi = (*__acpi_register_gsi)(dev, gsi, trigger, polarity); + irq = gsi_to_irq(plat_gsi); + + return irq; +} +EXPORT_SYMBOL_GPL(acpi_register_gsi); + +void acpi_unregister_gsi(u32 gsi) +{ +} +EXPORT_SYMBOL_GPL(acpi_unregister_gsi); + +void __init acpi_set_irq_model_pic(void) +{ + acpi_irq_model = ACPI_IRQ_MODEL_PIC; + __acpi_register_gsi = acpi_register_gsi_pic; + acpi_ioapic = 0; +} + +void __init acpi_set_irq_model_ioapic(void) +{ + acpi_irq_model = ACPI_IRQ_MODEL_IOAPIC; + __acpi_register_gsi = acpi_register_gsi_ioapic; + acpi_ioapic = 1; +} + +/* + * ACPI based hotplug support for CPU + */ +#ifdef CONFIG_ACPI_HOTPLUG_CPU +#include <acpi/processor.h> + +static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) +{ +#ifdef CONFIG_ACPI_NUMA + int nid; + + nid = acpi_get_node(handle); + if (nid != -1) { + set_apicid_to_node(physid, nid); + numa_set_node(cpu, nid); + } +#endif +} + +static int _acpi_map_lsapic(acpi_handle handle, int physid, int *pcpu) +{ + int cpu; + + cpu = acpi_register_lapic(physid, ACPI_MADT_ENABLED); + if (cpu < 0) { + pr_info(PREFIX "Unable to map lapic to logical cpu number\n"); + return cpu; + } + + acpi_processor_set_pdc(handle); + acpi_map_cpu2node(handle, cpu, physid); + + *pcpu = cpu; + return 0; +} + +/* wrapper to silence section mismatch warning */ +int __ref acpi_map_lsapic(acpi_handle handle, int physid, int *pcpu) +{ + return _acpi_map_lsapic(handle, physid, pcpu); +} +EXPORT_SYMBOL(acpi_map_lsapic); + +int acpi_unmap_lsapic(int cpu) +{ +#ifdef CONFIG_ACPI_NUMA + set_apicid_to_node(per_cpu(x86_cpu_to_apicid, cpu), NUMA_NO_NODE); +#endif + + per_cpu(x86_cpu_to_apicid, cpu) = -1; + set_cpu_present(cpu, false); + num_processors--; + + return (0); +} + +EXPORT_SYMBOL(acpi_unmap_lsapic); +#endif /* CONFIG_ACPI_HOTPLUG_CPU */ + +int acpi_register_ioapic(acpi_handle handle, u64 phys_addr, u32 gsi_base) +{ + /* TBD */ + return -EINVAL; +} + +EXPORT_SYMBOL(acpi_register_ioapic); + +int acpi_unregister_ioapic(acpi_handle handle, u32 gsi_base) +{ + /* TBD */ + return -EINVAL; +} + +EXPORT_SYMBOL(acpi_unregister_ioapic); + +static int __init acpi_parse_sbf(struct acpi_table_header *table) +{ + struct acpi_table_boot *sb; + + sb = (struct acpi_table_boot *)table; + if (!sb) { + printk(KERN_WARNING PREFIX "Unable to map SBF\n"); + return -ENODEV; + } + + sbf_port = sb->cmos_index; /* Save CMOS port */ + + return 0; +} + +#ifdef CONFIG_HPET_TIMER +#include <asm/hpet.h> + +static struct resource *hpet_res __initdata; + +static int __init acpi_parse_hpet(struct acpi_table_header *table) +{ + struct acpi_table_hpet *hpet_tbl; + + hpet_tbl = (struct acpi_table_hpet *)table; + if (!hpet_tbl) { + printk(KERN_WARNING PREFIX "Unable to map HPET\n"); + return -ENODEV; + } + + if 
(hpet_tbl->address.space_id != ACPI_SPACE_MEM) { + printk(KERN_WARNING PREFIX "HPET timers must be located in " + "memory.\n"); + return -1; + } + + hpet_address = hpet_tbl->address.address; + hpet_blockid = hpet_tbl->sequence; + + /* + * Some broken BIOSes advertise HPET at 0x0. We really do not + * want to allocate a resource there. + */ + if (!hpet_address) { + printk(KERN_WARNING PREFIX + "HPET id: %#x base: %#lx is invalid\n", + hpet_tbl->id, hpet_address); + return 0; + } +#ifdef CONFIG_X86_64 + /* + * Some even more broken BIOSes advertise HPET at + * 0xfed0000000000000 instead of 0xfed00000. Fix it up and add + * some noise: + */ + if (hpet_address == 0xfed0000000000000UL) { + if (!hpet_force_user) { + printk(KERN_WARNING PREFIX "HPET id: %#x " + "base: 0xfed0000000000000 is bogus\n " + "try hpet=force on the kernel command line to " + "fix it up to 0xfed00000.\n", hpet_tbl->id); + hpet_address = 0; + return 0; + } + printk(KERN_WARNING PREFIX + "HPET id: %#x base: 0xfed0000000000000 fixed up " + "to 0xfed00000.\n", hpet_tbl->id); + hpet_address >>= 32; + } +#endif + printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n", + hpet_tbl->id, hpet_address); + + /* + * Allocate and initialize the HPET firmware resource for adding into + * the resource tree during the lateinit timeframe. + */ +#define HPET_RESOURCE_NAME_SIZE 9 + hpet_res = alloc_bootmem(sizeof(*hpet_res) + HPET_RESOURCE_NAME_SIZE); + + hpet_res->name = (void *)&hpet_res[1]; + hpet_res->flags = IORESOURCE_MEM; + snprintf((char *)hpet_res->name, HPET_RESOURCE_NAME_SIZE, "HPET %u", + hpet_tbl->sequence); + + hpet_res->start = hpet_address; + hpet_res->end = hpet_address + (1 * 1024) - 1; + + return 0; +} + +/* + * hpet_insert_resource inserts the HPET resources used into the resource + * tree. + */ +static __init int hpet_insert_resource(void) +{ + if (!hpet_res) + return 1; + + return insert_resource(&iomem_resource, hpet_res); +} + +late_initcall(hpet_insert_resource); + +#else +#define acpi_parse_hpet NULL +#endif + +static int __init acpi_parse_fadt(struct acpi_table_header *table) +{ + +#ifdef CONFIG_X86_PM_TIMER + /* detect the location of the ACPI PM Timer */ + if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID) { + /* FADT rev. 2 */ + if (acpi_gbl_FADT.xpm_timer_block.space_id != + ACPI_ADR_SPACE_SYSTEM_IO) + return 0; + + pmtmr_ioport = acpi_gbl_FADT.xpm_timer_block.address; + /* + * "X" fields are optional extensions to the original V1.0 + * fields, so we must selectively expand V1.0 fields if the + * corresponding X field is zero. + */ + if (!pmtmr_ioport) + pmtmr_ioport = acpi_gbl_FADT.pm_timer_block; + } else { + /* FADT rev. 1 */ + pmtmr_ioport = acpi_gbl_FADT.pm_timer_block; + } + if (pmtmr_ioport) + printk(KERN_INFO PREFIX "PM-Timer IO Port: %#x\n", + pmtmr_ioport); +#endif + return 0; +} + +#ifdef CONFIG_X86_LOCAL_APIC +/* + * Parse LAPIC entries in MADT + * returns 0 on success, < 0 on error + */ + +static int __init early_acpi_parse_madt_lapic_addr_ovr(void) +{ + int count; + + if (!cpu_has_apic) + return -ENODEV; + + /* + * Note that the LAPIC address is obtained from the MADT (32-bit value) + * and (optionally) overriden by a LAPIC_ADDR_OVR entry (64-bit value). 
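[Annotation, not part of the commit: the MADT header itself can only carry a 32-bit local APIC address, so firmware that maps the LAPIC higher supplies a LOCAL_APIC_OVERRIDE subtable with the full 64-bit value; acpi_parse_lapic_addr_ovr() stores it into acpi_lapic_addr before register_lapic_address() runs. Schematically, with hypothetical addresses:

	/* Illustrative only */
	acpi_lapic_addr = 0xfee00000;			/* 32-bit MADT header value */
	/* ... LOCAL_APIC_OVERRIDE subtable parsed ... */
	acpi_lapic_addr = 0x00000004fee00000ULL;	/* 64-bit override wins     */
	register_lapic_address(acpi_lapic_addr);
]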
+ */ + + count = + acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE, + acpi_parse_lapic_addr_ovr, 0); + if (count < 0) { + printk(KERN_ERR PREFIX + "Error parsing LAPIC address override entry\n"); + return count; + } + + register_lapic_address(acpi_lapic_addr); + + return count; +} + +static int __init acpi_parse_madt_lapic_entries(void) +{ + int count; + int x2count = 0; + + if (!cpu_has_apic) + return -ENODEV; + + /* + * Note that the LAPIC address is obtained from the MADT (32-bit value) + * and (optionally) overriden by a LAPIC_ADDR_OVR entry (64-bit value). + */ + + count = + acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE, + acpi_parse_lapic_addr_ovr, 0); + if (count < 0) { + printk(KERN_ERR PREFIX + "Error parsing LAPIC address override entry\n"); + return count; + } + + register_lapic_address(acpi_lapic_addr); + + count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_SAPIC, + acpi_parse_sapic, MAX_LOCAL_APIC); + + if (!count) { + x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC, + acpi_parse_x2apic, MAX_LOCAL_APIC); + count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC, + acpi_parse_lapic, MAX_LOCAL_APIC); + } + if (!count && !x2count) { + printk(KERN_ERR PREFIX "No LAPIC entries present\n"); + /* TBD: Cleanup to allow fallback to MPS */ + return -ENODEV; + } else if (count < 0 || x2count < 0) { + printk(KERN_ERR PREFIX "Error parsing LAPIC entry\n"); + /* TBD: Cleanup to allow fallback to MPS */ + return count; + } + + x2count = + acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC_NMI, + acpi_parse_x2apic_nmi, 0); + count = + acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_NMI, acpi_parse_lapic_nmi, 0); + if (count < 0 || x2count < 0) { + printk(KERN_ERR PREFIX "Error parsing LAPIC NMI entry\n"); + /* TBD: Cleanup to allow fallback to MPS */ + return count; + } + return 0; +} +#endif /* CONFIG_X86_LOCAL_APIC */ + +#ifdef CONFIG_X86_IO_APIC +#define MP_ISA_BUS 0 + +void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi) +{ + int ioapic; + int pin; + struct mpc_intsrc mp_irq; + + /* + * Convert 'gsi' to 'ioapic.pin'. + */ + ioapic = mp_find_ioapic(gsi); + if (ioapic < 0) + return; + pin = mp_find_ioapic_pin(ioapic, gsi); + + /* + * TBD: This check is for faulty timer entries, where the override + * erroneously sets the trigger to level, resulting in a HUGE + * increase of timer interrupts! + */ + if ((bus_irq == 0) && (trigger == 3)) + trigger = 1; + + mp_irq.type = MP_INTSRC; + mp_irq.irqtype = mp_INT; + mp_irq.irqflag = (trigger << 2) | polarity; + mp_irq.srcbus = MP_ISA_BUS; + mp_irq.srcbusirq = bus_irq; /* IRQ */ + mp_irq.dstapic = mpc_ioapic_id(ioapic); /* APIC ID */ + mp_irq.dstirq = pin; /* INTIN# */ + + mp_save_irq(&mp_irq); + + isa_irq_to_gsi[bus_irq] = gsi; +} + +void __init mp_config_acpi_legacy_irqs(void) +{ + int i; + struct mpc_intsrc mp_irq; + +#ifdef CONFIG_EISA + /* + * Fabricate the legacy ISA bus (bus #31). + */ + mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA; +#endif + set_bit(MP_ISA_BUS, mp_bus_not_pci); + pr_debug("Bus #%d is ISA\n", MP_ISA_BUS); + + /* + * Use the default configuration for the IRQs 0-15. Unless + * overridden by (MADT) interrupt source override entries. + */ + for (i = 0; i < 16; i++) { + int ioapic, pin; + unsigned int dstapic; + int idx; + u32 gsi; + + /* Locate the gsi that irq i maps to. */ + if (acpi_isa_irq_to_gsi(i, &gsi)) + continue; + + /* + * Locate the IOAPIC that manages the ISA IRQ. 
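[Annotation, not part of the commit: mp_find_ioapic() scans the registered I/O APICs for the one whose GSI window contains the given gsi, and mp_find_ioapic_pin() subtracts that window's base. On a machine with a single I/O APIC whose global_irq_base is 0 (a common PC layout, assumed here purely for illustration):

	/* Illustrative only */
	u32 gsi = 3;					/* identity-mapped ISA IRQ 3 */
	int ioapic = mp_find_ioapic(gsi);		/* -> 0                      */
	int pin = mp_find_ioapic_pin(ioapic, gsi);	/* -> 3 - 0 = 3              */
]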
+ */ + ioapic = mp_find_ioapic(gsi); + if (ioapic < 0) + continue; + pin = mp_find_ioapic_pin(ioapic, gsi); + dstapic = mpc_ioapic_id(ioapic); + + for (idx = 0; idx < mp_irq_entries; idx++) { + struct mpc_intsrc *irq = mp_irqs + idx; + + /* Do we already have a mapping for this ISA IRQ? */ + if (irq->srcbus == MP_ISA_BUS && irq->srcbusirq == i) + break; + + /* Do we already have a mapping for this IOAPIC pin */ + if (irq->dstapic == dstapic && irq->dstirq == pin) + break; + } + + if (idx != mp_irq_entries) { + printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i); + continue; /* IRQ already used */ + } + + mp_irq.type = MP_INTSRC; + mp_irq.irqflag = 0; /* Conforming */ + mp_irq.srcbus = MP_ISA_BUS; + mp_irq.dstapic = dstapic; + mp_irq.irqtype = mp_INT; + mp_irq.srcbusirq = i; /* Identity mapped */ + mp_irq.dstirq = pin; + + mp_save_irq(&mp_irq); + } +} + +static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger, + int polarity) +{ +#ifdef CONFIG_X86_MPPARSE + struct mpc_intsrc mp_irq; + struct pci_dev *pdev; + unsigned char number; + unsigned int devfn; + int ioapic; + u8 pin; + + if (!acpi_ioapic) + return 0; + if (!dev || !dev_is_pci(dev)) + return 0; + + pdev = to_pci_dev(dev); + number = pdev->bus->number; + devfn = pdev->devfn; + pin = pdev->pin; + /* print the entry should happen on mptable identically */ + mp_irq.type = MP_INTSRC; + mp_irq.irqtype = mp_INT; + mp_irq.irqflag = (trigger == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) | + (polarity == ACPI_ACTIVE_HIGH ? 1 : 3); + mp_irq.srcbus = number; + mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3); + ioapic = mp_find_ioapic(gsi); + mp_irq.dstapic = mpc_ioapic_id(ioapic); + mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi); + + mp_save_irq(&mp_irq); +#endif + return 0; +} + +int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) +{ + int ioapic; + int ioapic_pin; + struct io_apic_irq_attr irq_attr; + int ret; + + if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) + return gsi; + + /* Don't set up the ACPI SCI because it's already set up */ + if (acpi_gbl_FADT.sci_interrupt == gsi) + return gsi; + + ioapic = mp_find_ioapic(gsi); + if (ioapic < 0) { + printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi); + return gsi; + } + + ioapic_pin = mp_find_ioapic_pin(ioapic, gsi); + + if (ioapic_pin > MP_MAX_IOAPIC_PIN) { + printk(KERN_ERR "Invalid reference to IOAPIC pin " + "%d-%d\n", mpc_ioapic_id(ioapic), + ioapic_pin); + return gsi; + } + + if (enable_update_mptable) + mp_config_acpi_gsi(dev, gsi, trigger, polarity); + + set_io_apic_irq_attr(&irq_attr, ioapic, ioapic_pin, + trigger == ACPI_EDGE_SENSITIVE ? 0 : 1, + polarity == ACPI_ACTIVE_HIGH ? 0 : 1); + ret = io_apic_set_pci_routing(dev, gsi_to_irq(gsi), &irq_attr); + if (ret < 0) + gsi = INT_MIN; + + return gsi; +} + +/* + * Parse IOAPIC related entries in MADT + * returns 0 on success, < 0 on error + */ +static int __init acpi_parse_madt_ioapic_entries(void) +{ + int count; + + /* + * ACPI interpreter is required to complete interrupt setup, + * so if it is off, don't enumerate the io-apics with ACPI. 
+ * If MPS is present, it will handle them, + * otherwise the system will stay in PIC mode + */ + if (acpi_disabled || acpi_noirq) + return -ENODEV; + + if (!cpu_has_apic) + return -ENODEV; + + /* + * if "noapic" boot option, don't look for IO-APICs + */ + if (skip_ioapic_setup) { + printk(KERN_INFO PREFIX "Skipping IOAPIC probe " + "due to 'noapic' option.\n"); + return -ENODEV; + } + + count = + acpi_table_parse_madt(ACPI_MADT_TYPE_IO_APIC, acpi_parse_ioapic, + MAX_IO_APICS); + if (!count) { + printk(KERN_ERR PREFIX "No IOAPIC entries present\n"); + return -ENODEV; + } else if (count < 0) { + printk(KERN_ERR PREFIX "Error parsing IOAPIC entry\n"); + return count; + } + + count = + acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, acpi_parse_int_src_ovr, + nr_irqs); + if (count < 0) { + printk(KERN_ERR PREFIX + "Error parsing interrupt source overrides entry\n"); + /* TBD: Cleanup to allow fallback to MPS */ + return count; + } + + /* + * If BIOS did not supply an INT_SRC_OVR for the SCI + * pretend we got one so we can set the SCI flags. + */ + if (!acpi_sci_override_gsi) + acpi_sci_ioapic_setup(acpi_gbl_FADT.sci_interrupt, 0, 0, + acpi_gbl_FADT.sci_interrupt); + + /* Fill in identity legacy mappings where no override */ + mp_config_acpi_legacy_irqs(); + + count = + acpi_table_parse_madt(ACPI_MADT_TYPE_NMI_SOURCE, acpi_parse_nmi_src, + nr_irqs); + if (count < 0) { + printk(KERN_ERR PREFIX "Error parsing NMI SRC entry\n"); + /* TBD: Cleanup to allow fallback to MPS */ + return count; + } + + return 0; +} +#else +static inline int acpi_parse_madt_ioapic_entries(void) +{ + return -1; +} +#endif /* !CONFIG_X86_IO_APIC */ + +static void __init early_acpi_process_madt(void) +{ +#ifdef CONFIG_X86_LOCAL_APIC + int error; + + if (!acpi_table_parse(ACPI_SIG_MADT, acpi_parse_madt)) { + + /* + * Parse MADT LAPIC entries + */ + error = early_acpi_parse_madt_lapic_addr_ovr(); + if (!error) { + acpi_lapic = 1; + smp_found_config = 1; + } + if (error == -EINVAL) { + /* + * Dell Precision Workstation 410, 610 come here. + */ + printk(KERN_ERR PREFIX + "Invalid BIOS MADT, disabling ACPI\n"); + disable_acpi(); + } + } +#endif +} + +static void __init acpi_process_madt(void) +{ +#ifdef CONFIG_X86_LOCAL_APIC + int error; + + if (!acpi_table_parse(ACPI_SIG_MADT, acpi_parse_madt)) { + + /* + * Parse MADT LAPIC entries + */ + error = acpi_parse_madt_lapic_entries(); + if (!error) { + acpi_lapic = 1; + + /* + * Parse MADT IO-APIC entries + */ + error = acpi_parse_madt_ioapic_entries(); + if (!error) { + acpi_set_irq_model_ioapic(); + + smp_found_config = 1; + } + } + if (error == -EINVAL) { + /* + * Dell Precision Workstation 410, 610 come here. + */ + printk(KERN_ERR PREFIX + "Invalid BIOS MADT, disabling ACPI\n"); + disable_acpi(); + } + } else { + /* + * ACPI found no MADT, and so ACPI wants UP PIC mode. + * In the event an MPS table was found, forget it. + * Boot with "acpi=off" to use MPS on such a system. + */ + if (smp_found_config) { + printk(KERN_WARNING PREFIX + "No APIC-table, disabling MPS\n"); + smp_found_config = 0; + } + } + + /* + * ACPI supports both logical (e.g. Hyper-Threading) and physical + * processors, where MPS only supports physical. 
+ */ + if (acpi_lapic && acpi_ioapic) + printk(KERN_INFO "Using ACPI (MADT) for SMP configuration " + "information\n"); + else if (acpi_lapic) + printk(KERN_INFO "Using ACPI for processor (LAPIC) " + "configuration information\n"); +#endif + return; +} + +static int __init disable_acpi_irq(const struct dmi_system_id *d) +{ + if (!acpi_force) { + printk(KERN_NOTICE "%s detected: force use of acpi=noirq\n", + d->ident); + acpi_noirq_set(); + } + return 0; +} + +static int __init disable_acpi_pci(const struct dmi_system_id *d) +{ + if (!acpi_force) { + printk(KERN_NOTICE "%s detected: force use of pci=noacpi\n", + d->ident); + acpi_disable_pci(); + } + return 0; +} + +static int __init dmi_disable_acpi(const struct dmi_system_id *d) +{ + if (!acpi_force) { + printk(KERN_NOTICE "%s detected: acpi off\n", d->ident); + disable_acpi(); + } else { + printk(KERN_NOTICE + "Warning: DMI blacklist says broken, but acpi forced\n"); + } + return 0; +} + +/* + * Force ignoring BIOS IRQ0 override + */ +static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d) +{ + if (!acpi_skip_timer_override) { + pr_notice("%s detected: Ignoring BIOS IRQ0 override\n", + d->ident); + acpi_skip_timer_override = 1; + } + return 0; +} + +/* + * If your system is blacklisted here, but you find that acpi=force + * works for you, please contact linux-acpi@vger.kernel.org + */ +static struct dmi_system_id __initdata acpi_dmi_table[] = { + /* + * Boxes that need ACPI disabled + */ + { + .callback = dmi_disable_acpi, + .ident = "IBM Thinkpad", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), + DMI_MATCH(DMI_BOARD_NAME, "2629H1G"), + }, + }, + + /* + * Boxes that need ACPI PCI IRQ routing disabled + */ + { + .callback = disable_acpi_irq, + .ident = "ASUS A7V", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC"), + DMI_MATCH(DMI_BOARD_NAME, "<A7V>"), + /* newer BIOS, Revision 1011, does work */ + DMI_MATCH(DMI_BIOS_VERSION, + "ASUS A7V ACPI BIOS Revision 1007"), + }, + }, + { + /* + * Latest BIOS for IBM 600E (1.16) has bad pcinum + * for LPC bridge, which is needed for the PCI + * interrupt links to work. DSDT fix is in bug 5966. + * 2645, 2646 model numbers are shared with 600/600E/600X + */ + .callback = disable_acpi_irq, + .ident = "IBM Thinkpad 600 Series 2645", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), + DMI_MATCH(DMI_BOARD_NAME, "2645"), + }, + }, + { + .callback = disable_acpi_irq, + .ident = "IBM Thinkpad 600 Series 2646", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), + DMI_MATCH(DMI_BOARD_NAME, "2646"), + }, + }, + /* + * Boxes that need ACPI PCI IRQ routing and PCI scan disabled + */ + { /* _BBN 0 bug */ + .callback = disable_acpi_pci, + .ident = "ASUS PR-DLS", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), + DMI_MATCH(DMI_BOARD_NAME, "PR-DLS"), + DMI_MATCH(DMI_BIOS_VERSION, + "ASUS PR-DLS ACPI BIOS Revision 1010"), + DMI_MATCH(DMI_BIOS_DATE, "03/21/2003") + }, + }, + { + .callback = disable_acpi_pci, + .ident = "Acer TravelMate 36x Laptop", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Acer"), + DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"), + }, + }, + {} +}; + +/* second table for DMI checks that should run after early-quirks */ +static struct dmi_system_id __initdata acpi_dmi_table_late[] = { + /* + * HP laptops which use a DSDT reporting as HP/SB400/10000, + * which includes some code which overrides all temperature + * trip points to 16C if the INTIN2 input of the I/O APIC + * is enabled. 
This input is incorrectly designated the + * ISA IRQ 0 via an interrupt source override even though + * it is wired to the output of the master 8259A and INTIN0 + * is not connected at all. Force ignoring BIOS IRQ0 + * override in that cases. + */ + { + .callback = dmi_ignore_irq0_timer_override, + .ident = "HP nx6115 laptop", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), + DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6115"), + }, + }, + { + .callback = dmi_ignore_irq0_timer_override, + .ident = "HP NX6125 laptop", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), + DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6125"), + }, + }, + { + .callback = dmi_ignore_irq0_timer_override, + .ident = "HP NX6325 laptop", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), + DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6325"), + }, + }, + { + .callback = dmi_ignore_irq0_timer_override, + .ident = "HP 6715b laptop", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), + DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq 6715b"), + }, + }, + { + .callback = dmi_ignore_irq0_timer_override, + .ident = "FUJITSU SIEMENS", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU SIEMENS"), + DMI_MATCH(DMI_PRODUCT_NAME, "AMILO PRO V2030"), + }, + }, + {} +}; + +/* + * acpi_boot_table_init() and acpi_boot_init() + * called from setup_arch(), always. + * 1. checksums all tables + * 2. enumerates lapics + * 3. enumerates io-apics + * + * acpi_table_init() is separate to allow reading SRAT without + * other side effects. + * + * side effects of acpi_boot_init: + * acpi_lapic = 1 if LAPIC found + * acpi_ioapic = 1 if IOAPIC found + * if (acpi_lapic && acpi_ioapic) smp_found_config = 1; + * if acpi_blacklisted() acpi_disabled = 1; + * acpi_irq_model=... + * ... + */ + +void __init acpi_boot_table_init(void) +{ + dmi_check_system(acpi_dmi_table); + + /* + * If acpi_disabled, bail out + */ + if (acpi_disabled) + return; + + /* + * Initialize the ACPI boot-time table parser. 
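[Annotation, not part of the commit: the comment block above describes three entry points that setup_arch() invokes in a fixed order; a condensed sketch of that sequence follows (setup_arch() itself is outside this hunk, so treat the ordering as an assumption about the caller):

	/* Illustrative, condensed call order in setup_arch() */
	acpi_boot_table_init();		/* DMI quirks, acpi_table_init(), blacklist */
	early_acpi_boot_init();		/* early MADT pass (LAPIC address)          */
	/* ... memory and platform setup ... */
	acpi_boot_init();		/* FADT, full MADT pass, HPET               */
]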
+ */
+	if (acpi_table_init()) {
+		disable_acpi();
+		return;
+	}
+
+	acpi_table_parse(ACPI_SIG_BOOT, acpi_parse_sbf);
+
+	/*
+	 * blacklist may disable ACPI entirely
+	 */
+	if (acpi_blacklisted()) {
+		if (acpi_force) {
+			printk(KERN_WARNING PREFIX "acpi=force override\n");
+		} else {
+			printk(KERN_WARNING PREFIX "Disabling ACPI support\n");
+			disable_acpi();
+			return;
+		}
+	}
+}
+
+int __init early_acpi_boot_init(void)
+{
+	/*
+	 * If acpi_disabled, bail out
+	 */
+	if (acpi_disabled)
+		return 1;
+
+	/*
+	 * Process the Multiple APIC Description Table (MADT), if present
+	 */
+	early_acpi_process_madt();
+
+	return 0;
+}
+
+int __init acpi_boot_init(void)
+{
+	/* those are executed after early-quirks are executed */
+	dmi_check_system(acpi_dmi_table_late);
+
+	/*
+	 * If acpi_disabled, bail out
+	 */
+	if (acpi_disabled)
+		return 1;
+
+	acpi_table_parse(ACPI_SIG_BOOT, acpi_parse_sbf);
+
+	/*
+	 * set sci_int and PM timer address
+	 */
+	acpi_table_parse(ACPI_SIG_FADT, acpi_parse_fadt);
+
+	/*
+	 * Process the Multiple APIC Description Table (MADT), if present
+	 */
+	acpi_process_madt();
+
+	acpi_table_parse(ACPI_SIG_HPET, acpi_parse_hpet);
+
+	if (!acpi_noirq)
+		x86_init.pci.init = pci_acpi_init;
+
+	return 0;
+}
+
+static int __init parse_acpi(char *arg)
+{
+	if (!arg)
+		return -EINVAL;
+
+	/* "acpi=off" disables both ACPI table parsing and interpreter */
+	if (strcmp(arg, "off") == 0) {
+		disable_acpi();
+	}
+	/* acpi=force to over-ride black-list */
+	else if (strcmp(arg, "force") == 0) {
+		acpi_force = 1;
+		acpi_disabled = 0;
+	}
+	/* acpi=strict disables out-of-spec workarounds */
+	else if (strcmp(arg, "strict") == 0) {
+		acpi_strict = 1;
+	}
+	/* acpi=rsdt use RSDT instead of XSDT */
+	else if (strcmp(arg, "rsdt") == 0) {
+		acpi_gbl_do_not_use_xsdt = TRUE;
+	}
+	/* "acpi=noirq" disables ACPI interrupt routing */
+	else if (strcmp(arg, "noirq") == 0) {
+		acpi_noirq_set();
+	}
+	/* "acpi=copy_dsdt" copies DSDT */
+	else if (strcmp(arg, "copy_dsdt") == 0) {
+		acpi_gbl_copy_dsdt_locally = 1;
+	}
+	/* "acpi=nocmcff" disables FF mode for corrected errors */
+	else if (strcmp(arg, "nocmcff") == 0) {
+		acpi_disable_cmcff = 1;
+	} else {
+		/* Core will printk when we return error. */
+		return -EINVAL;
+	}
+	return 0;
+}
+early_param("acpi", parse_acpi);
+
+/* FIXME: Using pci= for an ACPI parameter is a travesty.
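[Annotation, not part of the commit: taken together, parse_acpi() above and parse_pci()/setup_acpi_sci() below accept the following command-line forms, summarized from the string comparisons in this file:

	acpi=off | force | strict | rsdt | noirq | copy_dsdt | nocmcff
	acpi_sci=edge | level | high | low
	pci=noacpi
]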
*/ +static int __init parse_pci(char *arg) +{ + if (arg && strcmp(arg, "noacpi") == 0) + acpi_disable_pci(); + return 0; +} +early_param("pci", parse_pci); + +int __init acpi_mps_check(void) +{ +#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_X86_MPPARSE) +/* mptable code is not built-in*/ + if (acpi_disabled || acpi_noirq) { + printk(KERN_WARNING "MPS support code is not built-in.\n" + "Using acpi=off or acpi=noirq or pci=noacpi " + "may have problem\n"); + return 1; + } +#endif + return 0; +} + +#ifdef CONFIG_X86_IO_APIC +static int __init parse_acpi_skip_timer_override(char *arg) +{ + acpi_skip_timer_override = 1; + return 0; +} +early_param("acpi_skip_timer_override", parse_acpi_skip_timer_override); + +static int __init parse_acpi_use_timer_override(char *arg) +{ + acpi_use_timer_override = 1; + return 0; +} +early_param("acpi_use_timer_override", parse_acpi_use_timer_override); +#endif /* CONFIG_X86_IO_APIC */ + +static int __init setup_acpi_sci(char *s) +{ + if (!s) + return -EINVAL; + if (!strcmp(s, "edge")) + acpi_sci_flags = ACPI_MADT_TRIGGER_EDGE | + (acpi_sci_flags & ~ACPI_MADT_TRIGGER_MASK); + else if (!strcmp(s, "level")) + acpi_sci_flags = ACPI_MADT_TRIGGER_LEVEL | + (acpi_sci_flags & ~ACPI_MADT_TRIGGER_MASK); + else if (!strcmp(s, "high")) + acpi_sci_flags = ACPI_MADT_POLARITY_ACTIVE_HIGH | + (acpi_sci_flags & ~ACPI_MADT_POLARITY_MASK); + else if (!strcmp(s, "low")) + acpi_sci_flags = ACPI_MADT_POLARITY_ACTIVE_LOW | + (acpi_sci_flags & ~ACPI_MADT_POLARITY_MASK); + else + return -EINVAL; + return 0; +} +early_param("acpi_sci", setup_acpi_sci); + +int __acpi_acquire_global_lock(unsigned int *lock) +{ + unsigned int old, new, val; + do { + old = *lock; + new = (((old & ~0x3) + 2) + ((old >> 1) & 0x1)); + val = cmpxchg(lock, old, new); + } while (unlikely (val != old)); + return (new < 3) ? -1 : 0; +} + +int __acpi_release_global_lock(unsigned int *lock) +{ + unsigned int old, new, val; + do { + old = *lock; + new = old & ~0x3; + val = cmpxchg(lock, old, new); + } while (unlikely (val != old)); + return old & 0x1; +} + +void __init arch_reserve_mem_area(acpi_physical_address addr, size_t size) +{ + e820_add_region(addr, size, E820_ACPI); + update_e820(); +} diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c new file mode 100644 index 00000000000..4b28159e042 --- /dev/null +++ b/arch/x86/kernel/acpi/cstate.c @@ -0,0 +1,183 @@ +/* + * Copyright (C) 2005 Intel Corporation + * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> + * - Added _PDC for SMP C-states on Intel CPUs + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/acpi.h> +#include <linux/cpu.h> +#include <linux/sched.h> + +#include <acpi/processor.h> +#include <asm/acpi.h> +#include <asm/mwait.h> +#include <asm/special_insns.h> + +/* + * Initialize bm_flags based on the CPU cache properties + * On SMP it depends on cache configuration + * - When cache is not shared among all CPUs, we flush cache + * before entering C3. + * - When cache is shared among all CPUs, we use bm_check + * mechanism as in UP case + * + * This routine is called only after all the CPUs are online + */ +void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags, + unsigned int cpu) +{ + struct cpuinfo_x86 *c = &cpu_data(cpu); + + flags->bm_check = 0; + if (num_online_cpus() == 1) + flags->bm_check = 1; + else if (c->x86_vendor == X86_VENDOR_INTEL) { + /* + * Today all MP CPUs that support C3 share cache. 
+ * And caches should not be flushed by software while + * entering C3 type state. + */ + flags->bm_check = 1; + } + + /* + * On all recent Intel platforms, ARB_DISABLE is a nop. + * So, set bm_control to zero to indicate that ARB_DISABLE + * is not required while entering C3 type state on + * P4, Core and beyond CPUs + */ + if (c->x86_vendor == X86_VENDOR_INTEL && + (c->x86 > 0xf || (c->x86 == 6 && c->x86_model >= 0x0f))) + flags->bm_control = 0; +} +EXPORT_SYMBOL(acpi_processor_power_init_bm_check); + +/* The code below handles cstate entry with monitor-mwait pair on Intel*/ + +struct cstate_entry { + struct { + unsigned int eax; + unsigned int ecx; + } states[ACPI_PROCESSOR_MAX_POWER]; +}; +static struct cstate_entry __percpu *cpu_cstate_entry; /* per CPU ptr */ + +static short mwait_supported[ACPI_PROCESSOR_MAX_POWER]; + +#define NATIVE_CSTATE_BEYOND_HALT (2) + +static long acpi_processor_ffh_cstate_probe_cpu(void *_cx) +{ + struct acpi_processor_cx *cx = _cx; + long retval; + unsigned int eax, ebx, ecx, edx; + unsigned int edx_part; + unsigned int cstate_type; /* C-state type and not ACPI C-state type */ + unsigned int num_cstate_subtype; + + cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx); + + /* Check whether this particular cx_type (in CST) is supported or not */ + cstate_type = ((cx->address >> MWAIT_SUBSTATE_SIZE) & + MWAIT_CSTATE_MASK) + 1; + edx_part = edx >> (cstate_type * MWAIT_SUBSTATE_SIZE); + num_cstate_subtype = edx_part & MWAIT_SUBSTATE_MASK; + + retval = 0; + /* If the HW does not support any sub-states in this C-state */ + if (num_cstate_subtype == 0) { + pr_warn(FW_BUG "ACPI MWAIT C-state 0x%x not supported by HW (0x%x)\n", cx->address, edx_part); + retval = -1; + goto out; + } + + /* mwait ecx extensions INTERRUPT_BREAK should be supported for C2/C3 */ + if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) || + !(ecx & CPUID5_ECX_INTERRUPT_BREAK)) { + retval = -1; + goto out; + } + + if (!mwait_supported[cstate_type]) { + mwait_supported[cstate_type] = 1; + printk(KERN_DEBUG + "Monitor-Mwait will be used to enter C-%d " + "state\n", cx->type); + } + snprintf(cx->desc, + ACPI_CX_DESC_LEN, "ACPI FFH INTEL MWAIT 0x%x", + cx->address); +out: + return retval; +} + +int acpi_processor_ffh_cstate_probe(unsigned int cpu, + struct acpi_processor_cx *cx, struct acpi_power_register *reg) +{ + struct cstate_entry *percpu_entry; + struct cpuinfo_x86 *c = &cpu_data(cpu); + long retval; + + if (!cpu_cstate_entry || c->cpuid_level < CPUID_MWAIT_LEAF) + return -1; + + if (reg->bit_offset != NATIVE_CSTATE_BEYOND_HALT) + return -1; + + percpu_entry = per_cpu_ptr(cpu_cstate_entry, cpu); + percpu_entry->states[cx->index].eax = 0; + percpu_entry->states[cx->index].ecx = 0; + + /* Make sure we are running on right CPU */ + + retval = work_on_cpu(cpu, acpi_processor_ffh_cstate_probe_cpu, cx); + if (retval == 0) { + /* Use the hint in CST */ + percpu_entry->states[cx->index].eax = cx->address; + percpu_entry->states[cx->index].ecx = MWAIT_ECX_INTERRUPT_BREAK; + } + + /* + * For _CST FFH on Intel, if GAS.access_size bit 1 is cleared, + * then we should skip checking BM_STS for this C-state. 
+ * ref: "Intel Processor Vendor-Specific ACPI Interface Specification" + */ + if ((c->x86_vendor == X86_VENDOR_INTEL) && !(reg->access_size & 0x2)) + cx->bm_sts_skip = 1; + + return retval; +} +EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe); + +void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx) +{ + unsigned int cpu = smp_processor_id(); + struct cstate_entry *percpu_entry; + + percpu_entry = per_cpu_ptr(cpu_cstate_entry, cpu); + mwait_idle_with_hints(percpu_entry->states[cx->index].eax, + percpu_entry->states[cx->index].ecx); +} +EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_enter); + +static int __init ffh_cstate_init(void) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + if (c->x86_vendor != X86_VENDOR_INTEL) + return -1; + + cpu_cstate_entry = alloc_percpu(struct cstate_entry); + return 0; +} + +static void __exit ffh_cstate_exit(void) +{ + free_percpu(cpu_cstate_entry); + cpu_cstate_entry = NULL; +} + +arch_initcall(ffh_cstate_init); +__exitcall(ffh_cstate_exit); diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c new file mode 100644 index 00000000000..31368207837 --- /dev/null +++ b/arch/x86/kernel/acpi/sleep.c @@ -0,0 +1,140 @@ +/* + * sleep.c - x86-specific ACPI sleep support. + * + * Copyright (C) 2001-2003 Patrick Mochel + * Copyright (C) 2001-2003 Pavel Machek <pavel@ucw.cz> + */ + +#include <linux/acpi.h> +#include <linux/bootmem.h> +#include <linux/memblock.h> +#include <linux/dmi.h> +#include <linux/cpumask.h> +#include <asm/segment.h> +#include <asm/desc.h> +#include <asm/pgtable.h> +#include <asm/cacheflush.h> +#include <asm/realmode.h> + +#include "../../realmode/rm/wakeup.h" +#include "sleep.h" + +unsigned long acpi_realmode_flags; + +#if defined(CONFIG_SMP) && defined(CONFIG_64BIT) +static char temp_stack[4096]; +#endif + +/** + * x86_acpi_enter_sleep_state - enter sleep state + * @state: Sleep state to enter. + * + * Wrapper around acpi_enter_sleep_state() to be called by assmebly. + */ +acpi_status asmlinkage __visible x86_acpi_enter_sleep_state(u8 state) +{ + return acpi_enter_sleep_state(state); +} + +/** + * x86_acpi_suspend_lowlevel - save kernel state + * + * Create an identity mapped page table and copy the wakeup routine to + * low memory. + */ +int x86_acpi_suspend_lowlevel(void) +{ + struct wakeup_header *header = + (struct wakeup_header *) __va(real_mode_header->wakeup_header); + + if (header->signature != WAKEUP_HEADER_SIGNATURE) { + printk(KERN_ERR "wakeup header does not match\n"); + return -EINVAL; + } + + header->video_mode = saved_video_mode; + + header->pmode_behavior = 0; + +#ifndef CONFIG_64BIT + native_store_gdt((struct desc_ptr *)&header->pmode_gdt); + + /* + * We have to check that we can write back the value, and not + * just read it. At least on 90 nm Pentium M (Family 6, Model + * 13), reading an invalid MSR is not guaranteed to trap, see + * Erratum X4 in "Intel Pentium M Processor on 90 nm Process + * with 2-MB L2 Cache and Intel® Processor A100 and A110 on 90 + * nm process with 512-KB L2 Cache Specification Update". 
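[Annotation, not part of the commit: the check below only trusts an MSR that can be both read and written back without faulting; rdmsr_safe()/wrmsr_safe() return nonzero when the access traps, so a clean round trip proves the register really exists. The same defensive probe in isolation:

	/* Illustrative only: probe an MSR before saving/restoring it */
	u32 lo, hi;
	if (!rdmsr_safe(MSR_EFER, &lo, &hi) && !wrmsr_safe(MSR_EFER, lo, hi)) {
		/* MSR exists and is writable; safe to save and restore */
	}
]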
+ */ + if (!rdmsr_safe(MSR_EFER, + &header->pmode_efer_low, + &header->pmode_efer_high) && + !wrmsr_safe(MSR_EFER, + header->pmode_efer_low, + header->pmode_efer_high)) + header->pmode_behavior |= (1 << WAKEUP_BEHAVIOR_RESTORE_EFER); +#endif /* !CONFIG_64BIT */ + + header->pmode_cr0 = read_cr0(); + if (__this_cpu_read(cpu_info.cpuid_level) >= 0) { + header->pmode_cr4 = read_cr4(); + header->pmode_behavior |= (1 << WAKEUP_BEHAVIOR_RESTORE_CR4); + } + if (!rdmsr_safe(MSR_IA32_MISC_ENABLE, + &header->pmode_misc_en_low, + &header->pmode_misc_en_high) && + !wrmsr_safe(MSR_IA32_MISC_ENABLE, + header->pmode_misc_en_low, + header->pmode_misc_en_high)) + header->pmode_behavior |= + (1 << WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE); + header->realmode_flags = acpi_realmode_flags; + header->real_magic = 0x12345678; + +#ifndef CONFIG_64BIT + header->pmode_entry = (u32)&wakeup_pmode_return; + header->pmode_cr3 = (u32)__pa_symbol(initial_page_table); + saved_magic = 0x12345678; +#else /* CONFIG_64BIT */ +#ifdef CONFIG_SMP + stack_start = (unsigned long)temp_stack + sizeof(temp_stack); + early_gdt_descr.address = + (unsigned long)get_cpu_gdt_table(smp_processor_id()); + initial_gs = per_cpu_offset(smp_processor_id()); +#endif + initial_code = (unsigned long)wakeup_long64; + saved_magic = 0x123456789abcdef0L; +#endif /* CONFIG_64BIT */ + + do_suspend_lowlevel(); + return 0; +} + +static int __init acpi_sleep_setup(char *str) +{ + while ((str != NULL) && (*str != '\0')) { + if (strncmp(str, "s3_bios", 7) == 0) + acpi_realmode_flags |= 1; + if (strncmp(str, "s3_mode", 7) == 0) + acpi_realmode_flags |= 2; + if (strncmp(str, "s3_beep", 7) == 0) + acpi_realmode_flags |= 4; +#ifdef CONFIG_HIBERNATION + if (strncmp(str, "s4_nohwsig", 10) == 0) + acpi_no_s4_hw_signature(); +#endif + if (strncmp(str, "nonvs", 5) == 0) + acpi_nvs_nosave(); + if (strncmp(str, "nonvs_s3", 8) == 0) + acpi_nvs_nosave_s3(); + if (strncmp(str, "old_ordering", 12) == 0) + acpi_old_suspend_ordering(); + str = strchr(str, ','); + if (str != NULL) + str += strspn(str, ", \t"); + } + return 1; +} + +__setup("acpi_sleep=", acpi_sleep_setup); diff --git a/arch/x86/kernel/acpi/sleep.h b/arch/x86/kernel/acpi/sleep.h new file mode 100644 index 00000000000..65c7b606b60 --- /dev/null +++ b/arch/x86/kernel/acpi/sleep.h @@ -0,0 +1,21 @@ +/* + * Variables and functions used by the code in sleep.c + */ + +#include <asm/realmode.h> + +extern unsigned long saved_video_mode; +extern long saved_magic; + +extern int wakeup_pmode_return; + +extern u8 wake_sleep_flags; + +extern unsigned long acpi_copy_wakeup_routine(unsigned long); +extern void wakeup_long64(void); + +extern void do_suspend_lowlevel(void); + +extern int x86_acpi_suspend_lowlevel(void); + +acpi_status asmlinkage x86_acpi_enter_sleep_state(u8 state); diff --git a/arch/x86/kernel/acpi/wakeup_32.S b/arch/x86/kernel/acpi/wakeup_32.S new file mode 100644 index 00000000000..665c6b7d2ea --- /dev/null +++ b/arch/x86/kernel/acpi/wakeup_32.S @@ -0,0 +1,97 @@ + .text +#include <linux/linkage.h> +#include <asm/segment.h> +#include <asm/page_types.h> + +# Copyright 2003, 2008 Pavel Machek <pavel@suse.cz>, distribute under GPLv2 + + .code32 + ALIGN + +ENTRY(wakeup_pmode_return) +wakeup_pmode_return: + movw $__KERNEL_DS, %ax + movw %ax, %ss + movw %ax, %ds + movw %ax, %es + movw %ax, %fs + movw %ax, %gs + + # reload the gdt, as we need the full 32 bit address + lidt saved_idt + lldt saved_ldt + ljmp $(__KERNEL_CS), $1f +1: + movl %cr3, %eax + movl %eax, %cr3 + wbinvd + + # and restore the stack ... 
but you need gdt for this to work + movl saved_context_esp, %esp + + movl %cs:saved_magic, %eax + cmpl $0x12345678, %eax + jne bogus_magic + + # jump to place where we left off + movl saved_eip, %eax + jmp *%eax + +bogus_magic: + jmp bogus_magic + + + +save_registers: + sidt saved_idt + sldt saved_ldt + str saved_tss + + leal 4(%esp), %eax + movl %eax, saved_context_esp + movl %ebx, saved_context_ebx + movl %ebp, saved_context_ebp + movl %esi, saved_context_esi + movl %edi, saved_context_edi + pushfl + popl saved_context_eflags + + movl $ret_point, saved_eip + ret + + +restore_registers: + movl saved_context_ebp, %ebp + movl saved_context_ebx, %ebx + movl saved_context_esi, %esi + movl saved_context_edi, %edi + pushl saved_context_eflags + popfl + ret + +ENTRY(do_suspend_lowlevel) + call save_processor_state + call save_registers + pushl $3 + call x86_acpi_enter_sleep_state + addl $4, %esp + +# In case of S3 failure, we'll emerge here. Jump +# to ret_point to recover + jmp ret_point + .p2align 4,,7 +ret_point: + call restore_registers + call restore_processor_state + ret + +.data +ALIGN +ENTRY(saved_magic) .long 0 +ENTRY(saved_eip) .long 0 + +# saved registers +saved_idt: .long 0,0 +saved_ldt: .long 0 +saved_tss: .long 0 + diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S new file mode 100644 index 00000000000..ae693b51ed8 --- /dev/null +++ b/arch/x86/kernel/acpi/wakeup_64.S @@ -0,0 +1,124 @@ +.text +#include <linux/linkage.h> +#include <asm/segment.h> +#include <asm/pgtable_types.h> +#include <asm/page_types.h> +#include <asm/msr.h> +#include <asm/asm-offsets.h> + +# Copyright 2003 Pavel Machek <pavel@suse.cz>, distribute under GPLv2 + +.code64 + /* + * Hooray, we are in Long 64-bit mode (but still running in low memory) + */ +ENTRY(wakeup_long64) + movq saved_magic, %rax + movq $0x123456789abcdef0, %rdx + cmpq %rdx, %rax + jne bogus_64_magic + + movw $__KERNEL_DS, %ax + movw %ax, %ss + movw %ax, %ds + movw %ax, %es + movw %ax, %fs + movw %ax, %gs + movq saved_rsp, %rsp + + movq saved_rbx, %rbx + movq saved_rdi, %rdi + movq saved_rsi, %rsi + movq saved_rbp, %rbp + + movq saved_rip, %rax + jmp *%rax +ENDPROC(wakeup_long64) + +bogus_64_magic: + jmp bogus_64_magic + +ENTRY(do_suspend_lowlevel) + subq $8, %rsp + xorl %eax, %eax + call save_processor_state + + movq $saved_context, %rax + movq %rsp, pt_regs_sp(%rax) + movq %rbp, pt_regs_bp(%rax) + movq %rsi, pt_regs_si(%rax) + movq %rdi, pt_regs_di(%rax) + movq %rbx, pt_regs_bx(%rax) + movq %rcx, pt_regs_cx(%rax) + movq %rdx, pt_regs_dx(%rax) + movq %r8, pt_regs_r8(%rax) + movq %r9, pt_regs_r9(%rax) + movq %r10, pt_regs_r10(%rax) + movq %r11, pt_regs_r11(%rax) + movq %r12, pt_regs_r12(%rax) + movq %r13, pt_regs_r13(%rax) + movq %r14, pt_regs_r14(%rax) + movq %r15, pt_regs_r15(%rax) + pushfq + popq pt_regs_flags(%rax) + + movq $resume_point, saved_rip(%rip) + + movq %rsp, saved_rsp + movq %rbp, saved_rbp + movq %rbx, saved_rbx + movq %rdi, saved_rdi + movq %rsi, saved_rsi + + addq $8, %rsp + movl $3, %edi + xorl %eax, %eax + call x86_acpi_enter_sleep_state + /* in case something went wrong, restore the machine status and go on */ + jmp resume_point + + .align 4 +resume_point: + /* We don't restore %rax, it must be 0 anyway */ + movq $saved_context, %rax + movq saved_context_cr4(%rax), %rbx + movq %rbx, %cr4 + movq saved_context_cr3(%rax), %rbx + movq %rbx, %cr3 + movq saved_context_cr2(%rax), %rbx + movq %rbx, %cr2 + movq saved_context_cr0(%rax), %rbx + movq %rbx, %cr0 + pushq pt_regs_flags(%rax) + popfq + movq 
pt_regs_sp(%rax), %rsp + movq pt_regs_bp(%rax), %rbp + movq pt_regs_si(%rax), %rsi + movq pt_regs_di(%rax), %rdi + movq pt_regs_bx(%rax), %rbx + movq pt_regs_cx(%rax), %rcx + movq pt_regs_dx(%rax), %rdx + movq pt_regs_r8(%rax), %r8 + movq pt_regs_r9(%rax), %r9 + movq pt_regs_r10(%rax), %r10 + movq pt_regs_r11(%rax), %r11 + movq pt_regs_r12(%rax), %r12 + movq pt_regs_r13(%rax), %r13 + movq pt_regs_r14(%rax), %r14 + movq pt_regs_r15(%rax), %r15 + + xorl %eax, %eax + addq $8, %rsp + jmp restore_processor_state +ENDPROC(do_suspend_lowlevel) + +.data +ENTRY(saved_rbp) .quad 0 +ENTRY(saved_rsi) .quad 0 +ENTRY(saved_rdi) .quad 0 +ENTRY(saved_rbx) .quad 0 + +ENTRY(saved_rip) .quad 0 +ENTRY(saved_rsp) .quad 0 + +ENTRY(saved_magic) .quad 0 diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c new file mode 100644 index 00000000000..703130f469e --- /dev/null +++ b/arch/x86/kernel/alternative.c @@ -0,0 +1,677 @@ +#define pr_fmt(fmt) "SMP alternatives: " fmt + +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/mutex.h> +#include <linux/list.h> +#include <linux/stringify.h> +#include <linux/mm.h> +#include <linux/vmalloc.h> +#include <linux/memory.h> +#include <linux/stop_machine.h> +#include <linux/slab.h> +#include <linux/kdebug.h> +#include <asm/alternative.h> +#include <asm/sections.h> +#include <asm/pgtable.h> +#include <asm/mce.h> +#include <asm/nmi.h> +#include <asm/cacheflush.h> +#include <asm/tlbflush.h> +#include <asm/io.h> +#include <asm/fixmap.h> + +#define MAX_PATCH_LEN (255-1) + +static int __initdata_or_module debug_alternative; + +static int __init debug_alt(char *str) +{ + debug_alternative = 1; + return 1; +} +__setup("debug-alternative", debug_alt); + +static int noreplace_smp; + +static int __init setup_noreplace_smp(char *str) +{ + noreplace_smp = 1; + return 1; +} +__setup("noreplace-smp", setup_noreplace_smp); + +#ifdef CONFIG_PARAVIRT +static int __initdata_or_module noreplace_paravirt = 0; + +static int __init setup_noreplace_paravirt(char *str) +{ + noreplace_paravirt = 1; + return 1; +} +__setup("noreplace-paravirt", setup_noreplace_paravirt); +#endif + +#define DPRINTK(fmt, ...) \ +do { \ + if (debug_alternative) \ + printk(KERN_DEBUG fmt, ##__VA_ARGS__); \ +} while (0) + +/* + * Each GENERIC_NOPX is of X bytes, and defined as an array of bytes + * that correspond to that nop. Getting from one nop to the next, we + * add to the array the offset that is equal to the sum of all sizes of + * nops preceding the one we are after. + * + * Note: The GENERIC_NOP5_ATOMIC is at the end, as it breaks the + * nice symmetry of sizes of the previous nops. 
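+ *
+ * Illustrative note (not part of the tables themselves): the offsets
+ * mean that, e.g., intel_nops[3] == intelnops + 1 + 2, which skips the
+ * 1-byte and 2-byte nops and lands on GENERIC_NOP3 -- so nops[len] is
+ * always a ready-made nop of exactly len bytes (len <= ASM_NOP_MAX).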
+ */ +#if defined(GENERIC_NOP1) && !defined(CONFIG_X86_64) +static const unsigned char intelnops[] = +{ + GENERIC_NOP1, + GENERIC_NOP2, + GENERIC_NOP3, + GENERIC_NOP4, + GENERIC_NOP5, + GENERIC_NOP6, + GENERIC_NOP7, + GENERIC_NOP8, + GENERIC_NOP5_ATOMIC +}; +static const unsigned char * const intel_nops[ASM_NOP_MAX+2] = +{ + NULL, + intelnops, + intelnops + 1, + intelnops + 1 + 2, + intelnops + 1 + 2 + 3, + intelnops + 1 + 2 + 3 + 4, + intelnops + 1 + 2 + 3 + 4 + 5, + intelnops + 1 + 2 + 3 + 4 + 5 + 6, + intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7, + intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8, +}; +#endif + +#ifdef K8_NOP1 +static const unsigned char k8nops[] = +{ + K8_NOP1, + K8_NOP2, + K8_NOP3, + K8_NOP4, + K8_NOP5, + K8_NOP6, + K8_NOP7, + K8_NOP8, + K8_NOP5_ATOMIC +}; +static const unsigned char * const k8_nops[ASM_NOP_MAX+2] = +{ + NULL, + k8nops, + k8nops + 1, + k8nops + 1 + 2, + k8nops + 1 + 2 + 3, + k8nops + 1 + 2 + 3 + 4, + k8nops + 1 + 2 + 3 + 4 + 5, + k8nops + 1 + 2 + 3 + 4 + 5 + 6, + k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, + k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8, +}; +#endif + +#if defined(K7_NOP1) && !defined(CONFIG_X86_64) +static const unsigned char k7nops[] = +{ + K7_NOP1, + K7_NOP2, + K7_NOP3, + K7_NOP4, + K7_NOP5, + K7_NOP6, + K7_NOP7, + K7_NOP8, + K7_NOP5_ATOMIC +}; +static const unsigned char * const k7_nops[ASM_NOP_MAX+2] = +{ + NULL, + k7nops, + k7nops + 1, + k7nops + 1 + 2, + k7nops + 1 + 2 + 3, + k7nops + 1 + 2 + 3 + 4, + k7nops + 1 + 2 + 3 + 4 + 5, + k7nops + 1 + 2 + 3 + 4 + 5 + 6, + k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, + k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8, +}; +#endif + +#ifdef P6_NOP1 +static const unsigned char p6nops[] = +{ + P6_NOP1, + P6_NOP2, + P6_NOP3, + P6_NOP4, + P6_NOP5, + P6_NOP6, + P6_NOP7, + P6_NOP8, + P6_NOP5_ATOMIC +}; +static const unsigned char * const p6_nops[ASM_NOP_MAX+2] = +{ + NULL, + p6nops, + p6nops + 1, + p6nops + 1 + 2, + p6nops + 1 + 2 + 3, + p6nops + 1 + 2 + 3 + 4, + p6nops + 1 + 2 + 3 + 4 + 5, + p6nops + 1 + 2 + 3 + 4 + 5 + 6, + p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, + p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8, +}; +#endif + +/* Initialize these to a safe default */ +#ifdef CONFIG_X86_64 +const unsigned char * const *ideal_nops = p6_nops; +#else +const unsigned char * const *ideal_nops = intel_nops; +#endif + +void __init arch_init_ideal_nops(void) +{ + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_INTEL: + /* + * Due to a decoder implementation quirk, some + * specific Intel CPUs actually perform better with + * the "k8_nops" than with the SDM-recommended NOPs. + */ + if (boot_cpu_data.x86 == 6 && + boot_cpu_data.x86_model >= 0x0f && + boot_cpu_data.x86_model != 0x1c && + boot_cpu_data.x86_model != 0x26 && + boot_cpu_data.x86_model != 0x27 && + boot_cpu_data.x86_model < 0x30) { + ideal_nops = k8_nops; + } else if (boot_cpu_has(X86_FEATURE_NOPL)) { + ideal_nops = p6_nops; + } else { +#ifdef CONFIG_X86_64 + ideal_nops = k8_nops; +#else + ideal_nops = intel_nops; +#endif + } + break; + default: +#ifdef CONFIG_X86_64 + ideal_nops = k8_nops; +#else + if (boot_cpu_has(X86_FEATURE_K8)) + ideal_nops = k8_nops; + else if (boot_cpu_has(X86_FEATURE_K7)) + ideal_nops = k7_nops; + else + ideal_nops = intel_nops; +#endif + } +} + +/* Use this to add nops to a buffer, then text_poke the whole buffer. 
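+ *
+ * A minimal usage sketch (illustrative only; "dst" is a made-up patch
+ * target):
+ *
+ *	u8 buf[7];
+ *	add_nops(buf, sizeof(buf));		/* one 7-byte ideal nop */
+ *	text_poke_early(dst, buf, sizeof(buf));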
*/ +static void __init_or_module add_nops(void *insns, unsigned int len) +{ + while (len > 0) { + unsigned int noplen = len; + if (noplen > ASM_NOP_MAX) + noplen = ASM_NOP_MAX; + memcpy(insns, ideal_nops[noplen], noplen); + insns += noplen; + len -= noplen; + } +} + +extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; +extern s32 __smp_locks[], __smp_locks_end[]; +void *text_poke_early(void *addr, const void *opcode, size_t len); + +/* Replace instructions with better alternatives for this CPU type. + This runs before SMP is initialized to avoid SMP problems with + self modifying code. This implies that asymmetric systems where + APs have less capabilities than the boot processor are not handled. + Tough. Make sure you disable such features by hand. */ + +void __init_or_module apply_alternatives(struct alt_instr *start, + struct alt_instr *end) +{ + struct alt_instr *a; + u8 *instr, *replacement; + u8 insnbuf[MAX_PATCH_LEN]; + + DPRINTK("%s: alt table %p -> %p\n", __func__, start, end); + /* + * The scan order should be from start to end. A later scanned + * alternative code can overwrite a previous scanned alternative code. + * Some kernel functions (e.g. memcpy, memset, etc) use this order to + * patch code. + * + * So be careful if you want to change the scan order to any other + * order. + */ + for (a = start; a < end; a++) { + instr = (u8 *)&a->instr_offset + a->instr_offset; + replacement = (u8 *)&a->repl_offset + a->repl_offset; + BUG_ON(a->replacementlen > a->instrlen); + BUG_ON(a->instrlen > sizeof(insnbuf)); + BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32); + if (!boot_cpu_has(a->cpuid)) + continue; + + memcpy(insnbuf, replacement, a->replacementlen); + + /* 0xe8 is a relative jump; fix the offset. */ + if (*insnbuf == 0xe8 && a->replacementlen == 5) + *(s32 *)(insnbuf + 1) += replacement - instr; + + add_nops(insnbuf + a->replacementlen, + a->instrlen - a->replacementlen); + + text_poke_early(instr, insnbuf, a->instrlen); + } +} + +#ifdef CONFIG_SMP + +static void alternatives_smp_lock(const s32 *start, const s32 *end, + u8 *text, u8 *text_end) +{ + const s32 *poff; + + mutex_lock(&text_mutex); + for (poff = start; poff < end; poff++) { + u8 *ptr = (u8 *)poff + *poff; + + if (!*poff || ptr < text || ptr >= text_end) + continue; + /* turn DS segment override prefix into lock prefix */ + if (*ptr == 0x3e) + text_poke(ptr, ((unsigned char []){0xf0}), 1); + } + mutex_unlock(&text_mutex); +} + +static void alternatives_smp_unlock(const s32 *start, const s32 *end, + u8 *text, u8 *text_end) +{ + const s32 *poff; + + mutex_lock(&text_mutex); + for (poff = start; poff < end; poff++) { + u8 *ptr = (u8 *)poff + *poff; + + if (!*poff || ptr < text || ptr >= text_end) + continue; + /* turn lock prefix into DS segment override prefix */ + if (*ptr == 0xf0) + text_poke(ptr, ((unsigned char []){0x3E}), 1); + } + mutex_unlock(&text_mutex); +} + +struct smp_alt_module { + /* what is this ??? 
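+		   (apparently the module that owns this patch site, or
+		   NULL for the core kernel --
+		   alternatives_smp_module_del() matches entries by it)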
*/ + struct module *mod; + char *name; + + /* ptrs to lock prefixes */ + const s32 *locks; + const s32 *locks_end; + + /* .text segment, needed to avoid patching init code ;) */ + u8 *text; + u8 *text_end; + + struct list_head next; +}; +static LIST_HEAD(smp_alt_modules); +static DEFINE_MUTEX(smp_alt); +static bool uniproc_patched = false; /* protected by smp_alt */ + +void __init_or_module alternatives_smp_module_add(struct module *mod, + char *name, + void *locks, void *locks_end, + void *text, void *text_end) +{ + struct smp_alt_module *smp; + + mutex_lock(&smp_alt); + if (!uniproc_patched) + goto unlock; + + if (num_possible_cpus() == 1) + /* Don't bother remembering, we'll never have to undo it. */ + goto smp_unlock; + + smp = kzalloc(sizeof(*smp), GFP_KERNEL); + if (NULL == smp) + /* we'll run the (safe but slow) SMP code then ... */ + goto unlock; + + smp->mod = mod; + smp->name = name; + smp->locks = locks; + smp->locks_end = locks_end; + smp->text = text; + smp->text_end = text_end; + DPRINTK("%s: locks %p -> %p, text %p -> %p, name %s\n", + __func__, smp->locks, smp->locks_end, + smp->text, smp->text_end, smp->name); + + list_add_tail(&smp->next, &smp_alt_modules); +smp_unlock: + alternatives_smp_unlock(locks, locks_end, text, text_end); +unlock: + mutex_unlock(&smp_alt); +} + +void __init_or_module alternatives_smp_module_del(struct module *mod) +{ + struct smp_alt_module *item; + + mutex_lock(&smp_alt); + list_for_each_entry(item, &smp_alt_modules, next) { + if (mod != item->mod) + continue; + list_del(&item->next); + kfree(item); + break; + } + mutex_unlock(&smp_alt); +} + +void alternatives_enable_smp(void) +{ + struct smp_alt_module *mod; + + /* Why bother if there are no other CPUs? */ + BUG_ON(num_possible_cpus() == 1); + + mutex_lock(&smp_alt); + + if (uniproc_patched) { + pr_info("switching to SMP code\n"); + BUG_ON(num_online_cpus() != 1); + clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP); + clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP); + list_for_each_entry(mod, &smp_alt_modules, next) + alternatives_smp_lock(mod->locks, mod->locks_end, + mod->text, mod->text_end); + uniproc_patched = false; + } + mutex_unlock(&smp_alt); +} + +/* Return 1 if the address range is reserved for smp-alternatives */ +int alternatives_text_reserved(void *start, void *end) +{ + struct smp_alt_module *mod; + const s32 *poff; + u8 *text_start = start; + u8 *text_end = end; + + list_for_each_entry(mod, &smp_alt_modules, next) { + if (mod->text > text_end || mod->text_end < text_start) + continue; + for (poff = mod->locks; poff < mod->locks_end; poff++) { + const u8 *ptr = (const u8 *)poff + *poff; + + if (text_start <= ptr && text_end > ptr) + return 1; + } + } + + return 0; +} +#endif + +#ifdef CONFIG_PARAVIRT +void __init_or_module apply_paravirt(struct paravirt_patch_site *start, + struct paravirt_patch_site *end) +{ + struct paravirt_patch_site *p; + char insnbuf[MAX_PATCH_LEN]; + + if (noreplace_paravirt) + return; + + for (p = start; p < end; p++) { + unsigned int used; + + BUG_ON(p->len > MAX_PATCH_LEN); + /* prep the buffer with the original instructions */ + memcpy(insnbuf, p->instr, p->len); + used = pv_init_ops.patch(p->instrtype, p->clobbers, insnbuf, + (unsigned long)p->instr, p->len); + + BUG_ON(used > p->len); + + /* Pad the rest with nops */ + add_nops(insnbuf + used, p->len - used); + text_poke_early(p->instr, insnbuf, p->len); + } +} +extern struct paravirt_patch_site __start_parainstructions[], + __stop_parainstructions[]; +#endif /* CONFIG_PARAVIRT */ + +void __init 
alternative_instructions(void)
+{
+	/* The patching is not fully atomic, so try to avoid local
+	   interrupts that might execute the code being patched.
+	   Other CPUs are not running. */
+	stop_nmi();
+
+	/*
+	 * Don't stop machine check exceptions while patching.
+	 * MCEs only happen when something got corrupted and in this
+	 * case we must do something about the corruption.
+	 * Ignoring it is worse than an unlikely patching race.
+	 * Also machine checks tend to be broadcast and if one CPU
+	 * goes into machine check the others follow quickly, so we don't
+	 * expect a machine check to cause undue problems during code
+	 * patching.
+	 */
+
+	apply_alternatives(__alt_instructions, __alt_instructions_end);
+
+#ifdef CONFIG_SMP
+	/* Patch to UP if other CPUs are not imminent. */
+	if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) {
+		uniproc_patched = true;
+		alternatives_smp_module_add(NULL, "core kernel",
+					    __smp_locks, __smp_locks_end,
+					    _text, _etext);
+	}
+
+	if (!uniproc_patched || num_possible_cpus() == 1)
+		free_init_pages("SMP alternatives",
+				(unsigned long)__smp_locks,
+				(unsigned long)__smp_locks_end);
+#endif
+
+	apply_paravirt(__parainstructions, __parainstructions_end);
+
+	restart_nmi();
+}
+
+/**
+ * text_poke_early - Update instructions on a live kernel at boot time
+ * @addr: address to modify
+ * @opcode: source of the copy
+ * @len: length to copy
+ *
+ * When you use this code to patch more than one byte of an instruction
+ * you need to make sure that other CPUs cannot execute this code in parallel.
+ * Also no thread must be currently preempted in the middle of these
+ * instructions. And on the local CPU you need to be protected against NMI
+ * or MCE handlers seeing an inconsistent instruction while you patch.
+ */
+void *__init_or_module text_poke_early(void *addr, const void *opcode,
+				       size_t len)
+{
+	unsigned long flags;
+	local_irq_save(flags);
+	memcpy(addr, opcode, len);
+	sync_core();
+	local_irq_restore(flags);
+	/* Could also do a CLFLUSH here to speed up CPU recovery; but
+	   that causes hangs on some VIA CPUs. */
+	return addr;
+}
+
+/**
+ * text_poke - Update instructions on a live kernel
+ * @addr: address to modify
+ * @opcode: source of the copy
+ * @len: length to copy
+ *
+ * Only atomic text poke/set should be allowed when not doing early patching.
+ * It means the size must be writable atomically and the address must be aligned
+ * in a way that permits an atomic write. It also makes sure we fit on a single
+ * page.
+ *
+ * Note: Must be called under text_mutex.
+ */
+void *text_poke(void *addr, const void *opcode, size_t len)
+{
+	unsigned long flags;
+	char *vaddr;
+	struct page *pages[2];
+	int i;
+
+	if (!core_kernel_text((unsigned long)addr)) {
+		pages[0] = vmalloc_to_page(addr);
+		pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
+	} else {
+		pages[0] = virt_to_page(addr);
+		WARN_ON(!PageReserved(pages[0]));
+		pages[1] = virt_to_page(addr + PAGE_SIZE);
+	}
+	BUG_ON(!pages[0]);
+	local_irq_save(flags);
+	set_fixmap(FIX_TEXT_POKE0, page_to_phys(pages[0]));
+	if (pages[1])
+		set_fixmap(FIX_TEXT_POKE1, page_to_phys(pages[1]));
+	vaddr = (char *)fix_to_virt(FIX_TEXT_POKE0);
+	memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len);
+	clear_fixmap(FIX_TEXT_POKE0);
+	if (pages[1])
+		clear_fixmap(FIX_TEXT_POKE1);
+	local_flush_tlb();
+	sync_core();
+	/* Could also do a CLFLUSH here to speed up CPU recovery; but
+	   that causes hangs on some VIA CPUs.
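+	   The loop below is a self-check: re-read the patched bytes and
+	   BUG if they do not match what was written.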
*/
+	for (i = 0; i < len; i++)
+		BUG_ON(((char *)addr)[i] != ((char *)opcode)[i]);
+	local_irq_restore(flags);
+	return addr;
+}
+
+static void do_sync_core(void *info)
+{
+	sync_core();
+}
+
+static bool bp_patching_in_progress;
+static void *bp_int3_handler, *bp_int3_addr;
+
+int poke_int3_handler(struct pt_regs *regs)
+{
+	/* bp_patching_in_progress */
+	smp_rmb();
+
+	if (likely(!bp_patching_in_progress))
+		return 0;
+
+	if (user_mode_vm(regs) || regs->ip != (unsigned long)bp_int3_addr)
+		return 0;
+
+	/* set up the specified breakpoint handler */
+	regs->ip = (unsigned long) bp_int3_handler;
+
+	return 1;
+}
+
+/**
+ * text_poke_bp() -- update instructions on a live kernel on SMP
+ * @addr:	address to patch
+ * @opcode:	opcode of new instruction
+ * @len:	length to copy
+ * @handler:	address to jump to when the temporary breakpoint is hit
+ *
+ * Modify a multi-byte instruction by using an int3 breakpoint on SMP.
+ * We completely avoid stop_machine() here, and achieve the
+ * synchronization using an int3 breakpoint.
+ *
+ * The way it is done:
+ *	- add an int3 trap to the address that will be patched
+ *	- sync cores
+ *	- update all but the first byte of the patched range
+ *	- sync cores
+ *	- replace the first byte (int3) by the first byte of the
+ *	  replacing opcode
+ *	- sync cores
+ *
+ * Note: must be called under text_mutex.
+ */
+void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
+{
+	unsigned char int3 = 0xcc;
+
+	bp_int3_handler = handler;
+	bp_int3_addr = (u8 *)addr + sizeof(int3);
+	bp_patching_in_progress = true;
+	/*
+	 * Corresponding read barrier in the int3 notifier for
+	 * making sure the in_progress flag is correctly ordered wrt.
+	 * patching
+	 */
+	smp_wmb();
+
+	text_poke(addr, &int3, sizeof(int3));
+
+	on_each_cpu(do_sync_core, NULL, 1);
+
+	if (len - sizeof(int3) > 0) {
+		/* patch all but the first byte */
+		text_poke((char *)addr + sizeof(int3),
+			  (const char *) opcode + sizeof(int3),
+			  len - sizeof(int3));
+		/*
+		 * According to Intel, this core syncing is very likely
+		 * not necessary and we'd be safe even without it. But
+		 * better safe than sorry (plus there's not only Intel).
+		 */
+		on_each_cpu(do_sync_core, NULL, 1);
+	}
+
+	/* patch the first byte */
+	text_poke(addr, opcode, sizeof(int3));
+
+	on_each_cpu(do_sync_core, NULL, 1);
+
+	bp_patching_in_progress = false;
+	smp_wmb();
+
+	return addr;
+}
+
 diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c new file mode 100644 index 00000000000..8e3842fc8be --- /dev/null +++ b/arch/x86/kernel/amd_gart_64.c @@ -0,0 +1,898 @@ +/*
+ * Dynamic DMA mapping support for AMD Hammer.
+ *
+ * Use the integrated AGP GART in the Hammer northbridge as an IOMMU for PCI.
+ * This allows the use of PCI devices that only support 32-bit addresses on
+ * systems with more than 4GB.
+ *
+ * See Documentation/DMA-API-HOWTO.txt for the interface specification.
+ *
+ * Copyright 2002 Andi Kleen, SuSE Labs.
+ * Subject to the GNU General Public License v2 only.
+ */
+
+#include <linux/types.h>
+#include <linux/ctype.h>
+#include <linux/agp_backend.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/spinlock.h>
+#include <linux/pci.h>
+#include <linux/module.h>
+#include <linux/topology.h>
+#include <linux/interrupt.h>
+#include <linux/bitmap.h>
+#include <linux/kdebug.h>
+#include <linux/scatterlist.h>
+#include <linux/iommu-helper.h>
+#include <linux/syscore_ops.h>
+#include <linux/io.h>
+#include <linux/gfp.h>
+#include <linux/atomic.h>
+#include <asm/mtrr.h>
+#include <asm/pgtable.h>
+#include <asm/proto.h>
+#include <asm/iommu.h>
+#include <asm/gart.h>
+#include <asm/cacheflush.h>
+#include <asm/swiotlb.h>
+#include <asm/dma.h>
+#include <asm/amd_nb.h>
+#include <asm/x86_init.h>
+#include <asm/iommu_table.h>
+
+static unsigned long iommu_bus_base;	/* GART remapping area (physical) */
+static unsigned long iommu_size;	/* size of remapping area in bytes */
+static unsigned long iommu_pages;	/* .. and in pages */
+
+static u32 *iommu_gatt_base;		/* Remapping table */
+
+static dma_addr_t bad_dma_addr;
+
+/*
+ * If this is disabled the IOMMU will use an optimized flushing strategy
+ * of only flushing when a mapping is reused. With it true the GART is
+ * flushed for every mapping. Problem is that doing the lazy flush seems
+ * to trigger bugs with some popular PCI cards, in particular 3ware (but
+ * has also been seen with Qlogic at least).
+ */
+static int iommu_fullflush = 1;
+
+/* Allocation bitmap for the remapping area: */
+static DEFINE_SPINLOCK(iommu_bitmap_lock);
+/* Guarded by iommu_bitmap_lock: */
+static unsigned long *iommu_gart_bitmap;
+
+static u32 gart_unmapped_entry;
+
+#define GPTE_VALID    1
+#define GPTE_COHERENT 2
+#define GPTE_ENCODE(x) \
+	(((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT)
+#define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28))
+
+#define EMERGENCY_PAGES 32 /* = 128KB */
+
+#ifdef CONFIG_AGP
+#define AGPEXTERN extern
+#else
+#define AGPEXTERN
+#endif
+
+/* GART can only remap to physical addresses < 1TB */
+#define GART_MAX_PHYS_ADDR	(1ULL << 40)
+
+/* backdoor interface to AGP driver */
+AGPEXTERN int agp_memory_reserved;
+AGPEXTERN __u32 *agp_gatt_table;
+
+static unsigned long next_bit;	/* protected by iommu_bitmap_lock */
+static bool need_flush;		/* global flush state.
set for each gart wrap */ + +static unsigned long alloc_iommu(struct device *dev, int size, + unsigned long align_mask) +{ + unsigned long offset, flags; + unsigned long boundary_size; + unsigned long base_index; + + base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev), + PAGE_SIZE) >> PAGE_SHIFT; + boundary_size = ALIGN((u64)dma_get_seg_boundary(dev) + 1, + PAGE_SIZE) >> PAGE_SHIFT; + + spin_lock_irqsave(&iommu_bitmap_lock, flags); + offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, next_bit, + size, base_index, boundary_size, align_mask); + if (offset == -1) { + need_flush = true; + offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, 0, + size, base_index, boundary_size, + align_mask); + } + if (offset != -1) { + next_bit = offset+size; + if (next_bit >= iommu_pages) { + next_bit = 0; + need_flush = true; + } + } + if (iommu_fullflush) + need_flush = true; + spin_unlock_irqrestore(&iommu_bitmap_lock, flags); + + return offset; +} + +static void free_iommu(unsigned long offset, int size) +{ + unsigned long flags; + + spin_lock_irqsave(&iommu_bitmap_lock, flags); + bitmap_clear(iommu_gart_bitmap, offset, size); + if (offset >= next_bit) + next_bit = offset + size; + spin_unlock_irqrestore(&iommu_bitmap_lock, flags); +} + +/* + * Use global flush state to avoid races with multiple flushers. + */ +static void flush_gart(void) +{ + unsigned long flags; + + spin_lock_irqsave(&iommu_bitmap_lock, flags); + if (need_flush) { + amd_flush_garts(); + need_flush = false; + } + spin_unlock_irqrestore(&iommu_bitmap_lock, flags); +} + +#ifdef CONFIG_IOMMU_LEAK +/* Debugging aid for drivers that don't free their IOMMU tables */ +static int leak_trace; +static int iommu_leak_pages = 20; + +static void dump_leak(void) +{ + static int dump; + + if (dump) + return; + dump = 1; + + show_stack(NULL, NULL); + debug_dma_dump_mappings(NULL); +} +#endif + +static void iommu_full(struct device *dev, size_t size, int dir) +{ + /* + * Ran out of IOMMU space for this operation. This is very bad. + * Unfortunately the drivers cannot handle this operation properly. + * Return some non mapped prereserved space in the aperture and + * let the Northbridge deal with it. This will result in garbage + * in the IO operation. When the size exceeds the prereserved space + * memory corruption will occur or random memory will be DMAed + * out. Hopefully no network devices use single mappings that big. + */ + + dev_err(dev, "PCI-DMA: Out of IOMMU space for %lu bytes\n", size); + + if (size > PAGE_SIZE*EMERGENCY_PAGES) { + if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL) + panic("PCI-DMA: Memory would be corrupted\n"); + if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL) + panic(KERN_ERR + "PCI-DMA: Random memory would be DMAed\n"); + } +#ifdef CONFIG_IOMMU_LEAK + dump_leak(); +#endif +} + +static inline int +need_iommu(struct device *dev, unsigned long addr, size_t size) +{ + return force_iommu || !dma_capable(dev, addr, size); +} + +static inline int +nonforced_iommu(struct device *dev, unsigned long addr, size_t size) +{ + return !dma_capable(dev, addr, size); +} + +/* Map a single continuous physical area into the IOMMU. + * Caller needs to check if the iommu is needed and flush. 
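+ *
+ * A hedged sketch of the calling convention, mirroring gart_map_page()
+ * below:
+ *
+ *	if (need_iommu(dev, paddr, size)) {
+ *		bus = dma_map_area(dev, paddr, size, dir, 0);
+ *		flush_gart();
+ *	}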
+ */ +static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, + size_t size, int dir, unsigned long align_mask) +{ + unsigned long npages = iommu_num_pages(phys_mem, size, PAGE_SIZE); + unsigned long iommu_page; + int i; + + if (unlikely(phys_mem + size > GART_MAX_PHYS_ADDR)) + return bad_dma_addr; + + iommu_page = alloc_iommu(dev, npages, align_mask); + if (iommu_page == -1) { + if (!nonforced_iommu(dev, phys_mem, size)) + return phys_mem; + if (panic_on_overflow) + panic("dma_map_area overflow %lu bytes\n", size); + iommu_full(dev, size, dir); + return bad_dma_addr; + } + + for (i = 0; i < npages; i++) { + iommu_gatt_base[iommu_page + i] = GPTE_ENCODE(phys_mem); + phys_mem += PAGE_SIZE; + } + return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK); +} + +/* Map a single area into the IOMMU */ +static dma_addr_t gart_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + unsigned long bus; + phys_addr_t paddr = page_to_phys(page) + offset; + + if (!dev) + dev = &x86_dma_fallback_dev; + + if (!need_iommu(dev, paddr, size)) + return paddr; + + bus = dma_map_area(dev, paddr, size, dir, 0); + flush_gart(); + + return bus; +} + +/* + * Free a DMA mapping. + */ +static void gart_unmap_page(struct device *dev, dma_addr_t dma_addr, + size_t size, enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + unsigned long iommu_page; + int npages; + int i; + + if (dma_addr < iommu_bus_base + EMERGENCY_PAGES*PAGE_SIZE || + dma_addr >= iommu_bus_base + iommu_size) + return; + + iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT; + npages = iommu_num_pages(dma_addr, size, PAGE_SIZE); + for (i = 0; i < npages; i++) { + iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; + } + free_iommu(iommu_page, npages); +} + +/* + * Wrapper for pci_unmap_single working with scatterlists. + */ +static void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, + enum dma_data_direction dir, struct dma_attrs *attrs) +{ + struct scatterlist *s; + int i; + + for_each_sg(sg, s, nents, i) { + if (!s->dma_length || !s->length) + break; + gart_unmap_page(dev, s->dma_address, s->dma_length, dir, NULL); + } +} + +/* Fallback for dma_map_sg in case of overflow */ +static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg, + int nents, int dir) +{ + struct scatterlist *s; + int i; + +#ifdef CONFIG_IOMMU_DEBUG + pr_debug("dma_map_sg overflow\n"); +#endif + + for_each_sg(sg, s, nents, i) { + unsigned long addr = sg_phys(s); + + if (nonforced_iommu(dev, addr, s->length)) { + addr = dma_map_area(dev, addr, s->length, dir, 0); + if (addr == bad_dma_addr) { + if (i > 0) + gart_unmap_sg(dev, sg, i, dir, NULL); + nents = 0; + sg[0].dma_length = 0; + break; + } + } + s->dma_address = addr; + s->dma_length = s->length; + } + flush_gart(); + + return nents; +} + +/* Map multiple scatterlist entries continuous into the first. 
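+ * On success, sout describes a single contiguous bus-address range
+ * covering all nelems input entries; the loop below fills the GATT one
+ * page at a time and accumulates sout->dma_length.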
*/ +static int __dma_map_cont(struct device *dev, struct scatterlist *start, + int nelems, struct scatterlist *sout, + unsigned long pages) +{ + unsigned long iommu_start = alloc_iommu(dev, pages, 0); + unsigned long iommu_page = iommu_start; + struct scatterlist *s; + int i; + + if (iommu_start == -1) + return -1; + + for_each_sg(start, s, nelems, i) { + unsigned long pages, addr; + unsigned long phys_addr = s->dma_address; + + BUG_ON(s != start && s->offset); + if (s == start) { + sout->dma_address = iommu_bus_base; + sout->dma_address += iommu_page*PAGE_SIZE + s->offset; + sout->dma_length = s->length; + } else { + sout->dma_length += s->length; + } + + addr = phys_addr; + pages = iommu_num_pages(s->offset, s->length, PAGE_SIZE); + while (pages--) { + iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); + addr += PAGE_SIZE; + iommu_page++; + } + } + BUG_ON(iommu_page - iommu_start != pages); + + return 0; +} + +static inline int +dma_map_cont(struct device *dev, struct scatterlist *start, int nelems, + struct scatterlist *sout, unsigned long pages, int need) +{ + if (!need) { + BUG_ON(nelems != 1); + sout->dma_address = start->dma_address; + sout->dma_length = start->length; + return 0; + } + return __dma_map_cont(dev, start, nelems, sout, pages); +} + +/* + * DMA map all entries in a scatterlist. + * Merge chunks that have page aligned sizes into a continuous mapping. + */ +static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, + enum dma_data_direction dir, struct dma_attrs *attrs) +{ + struct scatterlist *s, *ps, *start_sg, *sgmap; + int need = 0, nextneed, i, out, start; + unsigned long pages = 0; + unsigned int seg_size; + unsigned int max_seg_size; + + if (nents == 0) + return 0; + + if (!dev) + dev = &x86_dma_fallback_dev; + + out = 0; + start = 0; + start_sg = sg; + sgmap = sg; + seg_size = 0; + max_seg_size = dma_get_max_seg_size(dev); + ps = NULL; /* shut up gcc */ + + for_each_sg(sg, s, nents, i) { + dma_addr_t addr = sg_phys(s); + + s->dma_address = addr; + BUG_ON(s->length == 0); + + nextneed = need_iommu(dev, addr, s->length); + + /* Handle the previous not yet processed entries */ + if (i > start) { + /* + * Can only merge when the last chunk ends on a + * page boundary and the new one doesn't have an + * offset. 
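			 *
			 * Restated (illustration only), segment s is
			 * merged into the current run only when:
			 *
			 *	iommu_merge && nextneed && need &&
			 *	!s->offset &&
			 *	s->length + seg_size <= max_seg_size &&
			 *	(ps->offset + ps->length) % PAGE_SIZE == 0
			 *
			 * otherwise the run so far is flushed out via
			 * dma_map_cont().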
+ */ + if (!iommu_merge || !nextneed || !need || s->offset || + (s->length + seg_size > max_seg_size) || + (ps->offset + ps->length) % PAGE_SIZE) { + if (dma_map_cont(dev, start_sg, i - start, + sgmap, pages, need) < 0) + goto error; + out++; + + seg_size = 0; + sgmap = sg_next(sgmap); + pages = 0; + start = i; + start_sg = s; + } + } + + seg_size += s->length; + need = nextneed; + pages += iommu_num_pages(s->offset, s->length, PAGE_SIZE); + ps = s; + } + if (dma_map_cont(dev, start_sg, i - start, sgmap, pages, need) < 0) + goto error; + out++; + flush_gart(); + if (out < nents) { + sgmap = sg_next(sgmap); + sgmap->dma_length = 0; + } + return out; + +error: + flush_gart(); + gart_unmap_sg(dev, sg, out, dir, NULL); + + /* When it was forced or merged try again in a dumb way */ + if (force_iommu || iommu_merge) { + out = dma_map_sg_nonforce(dev, sg, nents, dir); + if (out > 0) + return out; + } + if (panic_on_overflow) + panic("dma_map_sg: overflow on %lu pages\n", pages); + + iommu_full(dev, pages << PAGE_SHIFT, dir); + for_each_sg(sg, s, nents, i) + s->dma_address = bad_dma_addr; + return 0; +} + +/* allocate and map a coherent mapping */ +static void * +gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, + gfp_t flag, struct dma_attrs *attrs) +{ + dma_addr_t paddr; + unsigned long align_mask; + struct page *page; + + if (force_iommu && !(flag & GFP_DMA)) { + flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); + page = alloc_pages(flag | __GFP_ZERO, get_order(size)); + if (!page) + return NULL; + + align_mask = (1UL << get_order(size)) - 1; + paddr = dma_map_area(dev, page_to_phys(page), size, + DMA_BIDIRECTIONAL, align_mask); + + flush_gart(); + if (paddr != bad_dma_addr) { + *dma_addr = paddr; + return page_address(page); + } + __free_pages(page, get_order(size)); + } else + return dma_generic_alloc_coherent(dev, size, dma_addr, flag, + attrs); + + return NULL; +} + +/* free a coherent mapping */ +static void +gart_free_coherent(struct device *dev, size_t size, void *vaddr, + dma_addr_t dma_addr, struct dma_attrs *attrs) +{ + gart_unmap_page(dev, dma_addr, size, DMA_BIDIRECTIONAL, NULL); + dma_generic_free_coherent(dev, size, vaddr, dma_addr, attrs); +} + +static int gart_mapping_error(struct device *dev, dma_addr_t dma_addr) +{ + return (dma_addr == bad_dma_addr); +} + +static int no_agp; + +static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) +{ + unsigned long a; + + if (!iommu_size) { + iommu_size = aper_size; + if (!no_agp) + iommu_size /= 2; + } + + a = aper + iommu_size; + iommu_size -= round_up(a, PMD_PAGE_SIZE) - a; + + if (iommu_size < 64*1024*1024) { + pr_warning( + "PCI-DMA: Warning: Small IOMMU %luMB." 
+ " Consider increasing the AGP aperture in BIOS\n", + iommu_size >> 20); + } + + return iommu_size; +} + +static __init unsigned read_aperture(struct pci_dev *dev, u32 *size) +{ + unsigned aper_size = 0, aper_base_32, aper_order; + u64 aper_base; + + pci_read_config_dword(dev, AMD64_GARTAPERTUREBASE, &aper_base_32); + pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &aper_order); + aper_order = (aper_order >> 1) & 7; + + aper_base = aper_base_32 & 0x7fff; + aper_base <<= 25; + + aper_size = (32 * 1024 * 1024) << aper_order; + if (aper_base + aper_size > 0x100000000UL || !aper_size) + aper_base = 0; + + *size = aper_size; + return aper_base; +} + +static void enable_gart_translations(void) +{ + int i; + + if (!amd_nb_has_feature(AMD_NB_GART)) + return; + + for (i = 0; i < amd_nb_num(); i++) { + struct pci_dev *dev = node_to_amd_nb(i)->misc; + + enable_gart_translation(dev, __pa(agp_gatt_table)); + } + + /* Flush the GART-TLB to remove stale entries */ + amd_flush_garts(); +} + +/* + * If fix_up_north_bridges is set, the north bridges have to be fixed up on + * resume in the same way as they are handled in gart_iommu_hole_init(). + */ +static bool fix_up_north_bridges; +static u32 aperture_order; +static u32 aperture_alloc; + +void set_up_gart_resume(u32 aper_order, u32 aper_alloc) +{ + fix_up_north_bridges = true; + aperture_order = aper_order; + aperture_alloc = aper_alloc; +} + +static void gart_fixup_northbridges(void) +{ + int i; + + if (!fix_up_north_bridges) + return; + + if (!amd_nb_has_feature(AMD_NB_GART)) + return; + + pr_info("PCI-DMA: Restoring GART aperture settings\n"); + + for (i = 0; i < amd_nb_num(); i++) { + struct pci_dev *dev = node_to_amd_nb(i)->misc; + + /* + * Don't enable translations just yet. That is the next + * step. Restore the pre-suspend aperture settings. + */ + gart_set_size_and_enable(dev, aperture_order); + pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE, aperture_alloc >> 25); + } +} + +static void gart_resume(void) +{ + pr_info("PCI-DMA: Resuming GART IOMMU\n"); + + gart_fixup_northbridges(); + + enable_gart_translations(); +} + +static struct syscore_ops gart_syscore_ops = { + .resume = gart_resume, + +}; + +/* + * Private Northbridge GATT initialization in case we cannot use the + * AGP driver for some reason. 
+ */ +static __init int init_amd_gatt(struct agp_kern_info *info) +{ + unsigned aper_size, gatt_size, new_aper_size; + unsigned aper_base, new_aper_base; + struct pci_dev *dev; + void *gatt; + int i; + + pr_info("PCI-DMA: Disabling AGP.\n"); + + aper_size = aper_base = info->aper_size = 0; + dev = NULL; + for (i = 0; i < amd_nb_num(); i++) { + dev = node_to_amd_nb(i)->misc; + new_aper_base = read_aperture(dev, &new_aper_size); + if (!new_aper_base) + goto nommu; + + if (!aper_base) { + aper_size = new_aper_size; + aper_base = new_aper_base; + } + if (aper_size != new_aper_size || aper_base != new_aper_base) + goto nommu; + } + if (!aper_base) + goto nommu; + + info->aper_base = aper_base; + info->aper_size = aper_size >> 20; + + gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32); + gatt = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + get_order(gatt_size)); + if (!gatt) + panic("Cannot allocate GATT table"); + if (set_memory_uc((unsigned long)gatt, gatt_size >> PAGE_SHIFT)) + panic("Could not set GART PTEs to uncacheable pages"); + + agp_gatt_table = gatt; + + register_syscore_ops(&gart_syscore_ops); + + flush_gart(); + + pr_info("PCI-DMA: aperture base @ %x size %u KB\n", + aper_base, aper_size>>10); + + return 0; + + nommu: + /* Should not happen anymore */ + pr_warning("PCI-DMA: More than 4GB of RAM and no IOMMU\n" + "falling back to iommu=soft.\n"); + return -1; +} + +static struct dma_map_ops gart_dma_ops = { + .map_sg = gart_map_sg, + .unmap_sg = gart_unmap_sg, + .map_page = gart_map_page, + .unmap_page = gart_unmap_page, + .alloc = gart_alloc_coherent, + .free = gart_free_coherent, + .mapping_error = gart_mapping_error, +}; + +static void gart_iommu_shutdown(void) +{ + struct pci_dev *dev; + int i; + + /* don't shutdown it if there is AGP installed */ + if (!no_agp) + return; + + if (!amd_nb_has_feature(AMD_NB_GART)) + return; + + for (i = 0; i < amd_nb_num(); i++) { + u32 ctl; + + dev = node_to_amd_nb(i)->misc; + pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl); + + ctl &= ~GARTEN; + + pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl); + } +} + +int __init gart_iommu_init(void) +{ + struct agp_kern_info info; + unsigned long iommu_start; + unsigned long aper_base, aper_size; + unsigned long start_pfn, end_pfn; + unsigned long scratch; + long i; + + if (!amd_nb_has_feature(AMD_NB_GART)) + return 0; + +#ifndef CONFIG_AGP_AMD64 + no_agp = 1; +#else + /* Makefile puts PCI initialization via subsys_initcall first. 
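+	   so the AGP driver probing below can already rely on the PCI
+	   core being up.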
*/
+	/* Add other AMD AGP bridge drivers here */
+	no_agp = no_agp ||
+		 (agp_amd64_init() < 0) ||
+		 (agp_copy_info(agp_bridge, &info) < 0);
+#endif
+
+	if (no_iommu ||
+	    (!force_iommu && max_pfn <= MAX_DMA32_PFN) ||
+	    !gart_iommu_aperture ||
+	    (no_agp && init_amd_gatt(&info) < 0)) {
+		if (max_pfn > MAX_DMA32_PFN) {
+			pr_warning("More than 4GB of memory but GART IOMMU not available.\n");
+			pr_warning("falling back to iommu=soft.\n");
+		}
+		return 0;
+	}
+
+	/* need to map that range */
+	aper_size	= info.aper_size << 20;
+	aper_base	= info.aper_base;
+	end_pfn		= (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT);
+
+	start_pfn = PFN_DOWN(aper_base);
+	if (!pfn_range_is_mapped(start_pfn, end_pfn))
+		init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
+
+	pr_info("PCI-DMA: using GART IOMMU.\n");
+	iommu_size = check_iommu_size(info.aper_base, aper_size);
+	iommu_pages = iommu_size >> PAGE_SHIFT;
+
+	iommu_gart_bitmap = (void *) __get_free_pages(GFP_KERNEL | __GFP_ZERO,
+						      get_order(iommu_pages/8));
+	if (!iommu_gart_bitmap)
+		panic("Cannot allocate iommu bitmap\n");
+
+#ifdef CONFIG_IOMMU_LEAK
+	if (leak_trace) {
+		int ret;
+
+		ret = dma_debug_resize_entries(iommu_pages);
+		if (ret)
+			pr_debug("PCI-DMA: Cannot trace all the entries\n");
+	}
+#endif
+
+	/*
+	 * Out of IOMMU space handling.
+	 * Reserve some invalid pages at the beginning of the GART.
+	 */
+	bitmap_set(iommu_gart_bitmap, 0, EMERGENCY_PAGES);
+
+	pr_info("PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n",
+		iommu_size >> 20);
+
+	agp_memory_reserved	= iommu_size;
+	iommu_start		= aper_size - iommu_size;
+	iommu_bus_base		= info.aper_base + iommu_start;
+	bad_dma_addr		= iommu_bus_base;
+	iommu_gatt_base		= agp_gatt_table + (iommu_start>>PAGE_SHIFT);
+
+	/*
+	 * Unmap the IOMMU part of the GART. The alias of the page is
+	 * always mapped with cache enabled and there is no full cache
+	 * coherency across the GART remapping. The unmapping avoids
+	 * automatic prefetches from the CPU allocating cache lines in
+	 * there. All CPU accesses are done via the direct mapping to
+	 * the backing memory. The GART address is only used by PCI
+	 * devices.
+	 */
+	set_memory_np((unsigned long)__va(iommu_bus_base),
+		      iommu_size >> PAGE_SHIFT);
+	/*
+	 * Tricky. The GART table remaps the physical memory range,
+	 * so the CPU won't notice potential aliases and if the memory
+	 * is remapped to UC later on, we might surprise the PCI devices
+	 * with a stray writeout of a cacheline. So play it safe and
+	 * do an explicit, full-scale wbinvd() _after_ having marked all
+	 * the pages as Not-Present:
+	 */
+	wbinvd();
+
+	/*
+	 * Now all caches are flushed and we can safely enable
+	 * GART hardware. Doing it early leaves the possibility
+	 * of stale cache entries that can lead to GART PTE
+	 * errors.
+	 */
+	enable_gart_translations();
+
+	/*
+	 * Try to work around a bug (thanks to BenH):
+	 * Set unmapped entries to a scratch page instead of 0.
+	 * Any prefetches that hit unmapped entries won't get a bus abort
+	 * then. (P2P bridge may be prefetching on DMA reads).
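	 * A stray prefetch then merely reads the zeroed scratch page
	 * instead of faulting, which is harmless.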
+ */ + scratch = get_zeroed_page(GFP_KERNEL); + if (!scratch) + panic("Cannot allocate iommu scratch page"); + gart_unmapped_entry = GPTE_ENCODE(__pa(scratch)); + for (i = EMERGENCY_PAGES; i < iommu_pages; i++) + iommu_gatt_base[i] = gart_unmapped_entry; + + flush_gart(); + dma_ops = &gart_dma_ops; + x86_platform.iommu_shutdown = gart_iommu_shutdown; + swiotlb = 0; + + return 0; +} + +void __init gart_parse_options(char *p) +{ + int arg; + +#ifdef CONFIG_IOMMU_LEAK + if (!strncmp(p, "leak", 4)) { + leak_trace = 1; + p += 4; + if (*p == '=') + ++p; + if (isdigit(*p) && get_option(&p, &arg)) + iommu_leak_pages = arg; + } +#endif + if (isdigit(*p) && get_option(&p, &arg)) + iommu_size = arg; + if (!strncmp(p, "fullflush", 9)) + iommu_fullflush = 1; + if (!strncmp(p, "nofullflush", 11)) + iommu_fullflush = 0; + if (!strncmp(p, "noagp", 5)) + no_agp = 1; + if (!strncmp(p, "noaperture", 10)) + fix_aperture = 0; + /* duplicated from pci-dma.c */ + if (!strncmp(p, "force", 5)) + gart_iommu_aperture_allowed = 1; + if (!strncmp(p, "allowed", 7)) + gart_iommu_aperture_allowed = 1; + if (!strncmp(p, "memaper", 7)) { + fallback_aper_force = 1; + p += 7; + if (*p == '=') { + ++p; + if (get_option(&p, &arg)) + fallback_aper_order = arg; + } + } +} +IOMMU_INIT_POST(gart_iommu_hole_init); diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c new file mode 100644 index 00000000000..f04dbb3069b --- /dev/null +++ b/arch/x86/kernel/amd_nb.c @@ -0,0 +1,297 @@ +/* + * Shared support code for AMD K8 northbridges and derivates. + * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/init.h> +#include <linux/errno.h> +#include <linux/module.h> +#include <linux/spinlock.h> +#include <asm/amd_nb.h> + +static u32 *flush_words; + +const struct pci_device_id amd_nb_misc_ids[] = { + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M10H_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F3) }, + {} +}; +EXPORT_SYMBOL(amd_nb_misc_ids); + +static const struct pci_device_id amd_nb_link_ids[] = { + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F4) }, + {} +}; + +const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[] __initconst = { + { 0x00, 0x18, 0x20 }, + { 0xff, 0x00, 0x20 }, + { 0xfe, 0x00, 0x20 }, + { } +}; + +struct amd_northbridge_info amd_northbridges; +EXPORT_SYMBOL(amd_northbridges); + +static struct pci_dev *next_northbridge(struct pci_dev *dev, + const struct pci_device_id *ids) +{ + do { + dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev); + if (!dev) + break; + } while (!pci_match_id(ids, dev)); + return dev; +} + +int amd_cache_northbridges(void) +{ + u16 i = 0; + struct amd_northbridge *nb; + struct pci_dev *misc, *link; + + if (amd_nb_num()) + return 0; + + misc = NULL; + while ((misc = next_northbridge(misc, amd_nb_misc_ids)) != NULL) + i++; + + if (i == 0) + return 0; + + nb = kzalloc(i * 
sizeof(struct amd_northbridge), GFP_KERNEL); + if (!nb) + return -ENOMEM; + + amd_northbridges.nb = nb; + amd_northbridges.num = i; + + link = misc = NULL; + for (i = 0; i != amd_nb_num(); i++) { + node_to_amd_nb(i)->misc = misc = + next_northbridge(misc, amd_nb_misc_ids); + node_to_amd_nb(i)->link = link = + next_northbridge(link, amd_nb_link_ids); + } + + /* GART present only on Fam15h upto model 0fh */ + if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 || + (boot_cpu_data.x86 == 0x15 && boot_cpu_data.x86_model < 0x10)) + amd_northbridges.flags |= AMD_NB_GART; + + /* + * Check for L3 cache presence. + */ + if (!cpuid_edx(0x80000006)) + return 0; + + /* + * Some CPU families support L3 Cache Index Disable. There are some + * limitations because of E382 and E388 on family 0x10. + */ + if (boot_cpu_data.x86 == 0x10 && + boot_cpu_data.x86_model >= 0x8 && + (boot_cpu_data.x86_model > 0x9 || + boot_cpu_data.x86_mask >= 0x1)) + amd_northbridges.flags |= AMD_NB_L3_INDEX_DISABLE; + + if (boot_cpu_data.x86 == 0x15) + amd_northbridges.flags |= AMD_NB_L3_INDEX_DISABLE; + + /* L3 cache partitioning is supported on family 0x15 */ + if (boot_cpu_data.x86 == 0x15) + amd_northbridges.flags |= AMD_NB_L3_PARTITIONING; + + return 0; +} +EXPORT_SYMBOL_GPL(amd_cache_northbridges); + +/* + * Ignores subdevice/subvendor but as far as I can figure out + * they're useless anyways + */ +bool __init early_is_amd_nb(u32 device) +{ + const struct pci_device_id *id; + u32 vendor = device & 0xffff; + + device >>= 16; + for (id = amd_nb_misc_ids; id->vendor; id++) + if (vendor == id->vendor && device == id->device) + return true; + return false; +} + +struct resource *amd_get_mmconfig_range(struct resource *res) +{ + u32 address; + u64 base, msr; + unsigned segn_busn_bits; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) + return NULL; + + /* assume all cpus from fam10h have mmconfig */ + if (boot_cpu_data.x86 < 0x10) + return NULL; + + address = MSR_FAM10H_MMIO_CONF_BASE; + rdmsrl(address, msr); + + /* mmconfig is not enabled */ + if (!(msr & FAM10H_MMIO_CONF_ENABLE)) + return NULL; + + base = msr & (FAM10H_MMIO_CONF_BASE_MASK<<FAM10H_MMIO_CONF_BASE_SHIFT); + + segn_busn_bits = (msr >> FAM10H_MMIO_CONF_BUSRANGE_SHIFT) & + FAM10H_MMIO_CONF_BUSRANGE_MASK; + + res->flags = IORESOURCE_MEM; + res->start = base; + res->end = base + (1ULL<<(segn_busn_bits + 20)) - 1; + return res; +} + +int amd_get_subcaches(int cpu) +{ + struct pci_dev *link = node_to_amd_nb(amd_get_nb_id(cpu))->link; + unsigned int mask; + int cuid; + + if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + return 0; + + pci_read_config_dword(link, 0x1d4, &mask); + + cuid = cpu_data(cpu).compute_unit_id; + return (mask >> (4 * cuid)) & 0xf; +} + +int amd_set_subcaches(int cpu, unsigned long mask) +{ + static unsigned int reset, ban; + struct amd_northbridge *nb = node_to_amd_nb(amd_get_nb_id(cpu)); + unsigned int reg; + int cuid; + + if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING) || mask > 0xf) + return -EINVAL; + + /* if necessary, collect reset state of L3 partitioning and BAN mode */ + if (reset == 0) { + pci_read_config_dword(nb->link, 0x1d4, &reset); + pci_read_config_dword(nb->misc, 0x1b8, &ban); + ban &= 0x180000; + } + + /* deactivate BAN mode if any subcaches are to be disabled */ + if (mask != 0xf) { + pci_read_config_dword(nb->misc, 0x1b8, ®); + pci_write_config_dword(nb->misc, 0x1b8, reg & ~0x180000); + } + + cuid = cpu_data(cpu).compute_unit_id; + mask <<= 4 * cuid; + mask |= (0xf ^ (1 << cuid)) << 26; + + 
pci_write_config_dword(nb->link, 0x1d4, mask); + + /* reset BAN mode if L3 partitioning returned to reset state */ + pci_read_config_dword(nb->link, 0x1d4, ®); + if (reg == reset) { + pci_read_config_dword(nb->misc, 0x1b8, ®); + reg &= ~0x180000; + pci_write_config_dword(nb->misc, 0x1b8, reg | ban); + } + + return 0; +} + +static int amd_cache_gart(void) +{ + u16 i; + + if (!amd_nb_has_feature(AMD_NB_GART)) + return 0; + + flush_words = kmalloc(amd_nb_num() * sizeof(u32), GFP_KERNEL); + if (!flush_words) { + amd_northbridges.flags &= ~AMD_NB_GART; + return -ENOMEM; + } + + for (i = 0; i != amd_nb_num(); i++) + pci_read_config_dword(node_to_amd_nb(i)->misc, 0x9c, + &flush_words[i]); + + return 0; +} + +void amd_flush_garts(void) +{ + int flushed, i; + unsigned long flags; + static DEFINE_SPINLOCK(gart_lock); + + if (!amd_nb_has_feature(AMD_NB_GART)) + return; + + /* Avoid races between AGP and IOMMU. In theory it's not needed + but I'm not sure if the hardware won't lose flush requests + when another is pending. This whole thing is so expensive anyways + that it doesn't matter to serialize more. -AK */ + spin_lock_irqsave(&gart_lock, flags); + flushed = 0; + for (i = 0; i < amd_nb_num(); i++) { + pci_write_config_dword(node_to_amd_nb(i)->misc, 0x9c, + flush_words[i] | 1); + flushed++; + } + for (i = 0; i < amd_nb_num(); i++) { + u32 w; + /* Make sure the hardware actually executed the flush*/ + for (;;) { + pci_read_config_dword(node_to_amd_nb(i)->misc, + 0x9c, &w); + if (!(w & 1)) + break; + cpu_relax(); + } + } + spin_unlock_irqrestore(&gart_lock, flags); + if (!flushed) + pr_notice("nothing to flush?\n"); +} +EXPORT_SYMBOL_GPL(amd_flush_garts); + +static __init int init_amd_nbs(void) +{ + int err = 0; + + err = amd_cache_northbridges(); + + if (err < 0) + pr_notice("Cannot enumerate AMD northbridges\n"); + + if (amd_cache_gart() < 0) + pr_notice("Cannot initialize GART flush words, GART support disabled\n"); + + return err; +} + +/* This has to go after the PCI subsystem */ +fs_initcall(init_amd_nbs); diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c new file mode 100644 index 00000000000..af5b08ab3b7 --- /dev/null +++ b/arch/x86/kernel/apb_timer.c @@ -0,0 +1,427 @@ +/* + * apb_timer.c: Driver for Langwell APB timers + * + * (C) Copyright 2009 Intel Corporation + * Author: Jacob Pan (jacob.jun.pan@intel.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + * + * Note: + * Langwell is the south complex of Intel Moorestown MID platform. There are + * eight external timers in total that can be used by the operating system. + * The timer information, such as frequency and addresses, is provided to the + * OS via SFI tables. + * Timer interrupts are routed via FW/HW emulated IOAPIC independently via + * individual redirection table entries (RTE). + * Unlike HPET, there is no master counter, therefore one of the timers are + * used as clocksource. The overall allocation looks like: + * - timer 0 - NR_CPUs for per cpu timer + * - one timer for clocksource + * - one timer for watchdog driver. + * It is also worth notice that APB timer does not support true one-shot mode, + * free-running mode will be used here to emulate one-shot mode. + * APB timer can also be used as broadcast timer along with per cpu local APIC + * timer, but by default APB timer has higher rating than local APIC timers. 
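+ * (The ratings are the APBT_CLOCKEVENT_RATING/APBT_CLOCKSOURCE_RATING
+ * constants below; the clockevent rating is dropped by 100 when the
+ * command line asks for LAPIC timers via INTEL_MID_TIMER_LAPIC_APBT.)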
+ */ + +#include <linux/delay.h> +#include <linux/dw_apb_timer.h> +#include <linux/errno.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/pm.h> +#include <linux/sfi.h> +#include <linux/interrupt.h> +#include <linux/cpu.h> +#include <linux/irq.h> + +#include <asm/fixmap.h> +#include <asm/apb_timer.h> +#include <asm/intel-mid.h> +#include <asm/time.h> + +#define APBT_CLOCKEVENT_RATING 110 +#define APBT_CLOCKSOURCE_RATING 250 + +#define APBT_CLOCKEVENT0_NUM (0) +#define APBT_CLOCKSOURCE_NUM (2) + +static phys_addr_t apbt_address; +static int apb_timer_block_enabled; +static void __iomem *apbt_virt_address; + +/* + * Common DW APB timer info + */ +static unsigned long apbt_freq; + +struct apbt_dev { + struct dw_apb_clock_event_device *timer; + unsigned int num; + int cpu; + unsigned int irq; + char name[10]; +}; + +static struct dw_apb_clocksource *clocksource_apbt; + +static inline void __iomem *adev_virt_addr(struct apbt_dev *adev) +{ + return apbt_virt_address + adev->num * APBTMRS_REG_SIZE; +} + +static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev); + +#ifdef CONFIG_SMP +static unsigned int apbt_num_timers_used; +#endif + +static inline void apbt_set_mapping(void) +{ + struct sfi_timer_table_entry *mtmr; + int phy_cs_timer_id = 0; + + if (apbt_virt_address) { + pr_debug("APBT base already mapped\n"); + return; + } + mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM); + if (mtmr == NULL) { + printk(KERN_ERR "Failed to get MTMR %d from SFI\n", + APBT_CLOCKEVENT0_NUM); + return; + } + apbt_address = (phys_addr_t)mtmr->phys_addr; + if (!apbt_address) { + printk(KERN_WARNING "No timer base from SFI, use default\n"); + apbt_address = APBT_DEFAULT_BASE; + } + apbt_virt_address = ioremap_nocache(apbt_address, APBT_MMAP_SIZE); + if (!apbt_virt_address) { + pr_debug("Failed mapping APBT phy address at %lu\n",\ + (unsigned long)apbt_address); + goto panic_noapbt; + } + apbt_freq = mtmr->freq_hz; + sfi_free_mtmr(mtmr); + + /* Now figure out the physical timer id for clocksource device */ + mtmr = sfi_get_mtmr(APBT_CLOCKSOURCE_NUM); + if (mtmr == NULL) + goto panic_noapbt; + + /* Now figure out the physical timer id */ + pr_debug("Use timer %d for clocksource\n", + (int)(mtmr->phys_addr & 0xff) / APBTMRS_REG_SIZE); + phy_cs_timer_id = (unsigned int)(mtmr->phys_addr & 0xff) / + APBTMRS_REG_SIZE; + + clocksource_apbt = dw_apb_clocksource_init(APBT_CLOCKSOURCE_RATING, + "apbt0", apbt_virt_address + phy_cs_timer_id * + APBTMRS_REG_SIZE, apbt_freq); + return; + +panic_noapbt: + panic("Failed to setup APB system timer\n"); + +} + +static inline void apbt_clear_mapping(void) +{ + iounmap(apbt_virt_address); + apbt_virt_address = NULL; +} + +/* + * APBT timer interrupt enable / disable + */ +static inline int is_apbt_capable(void) +{ + return apbt_virt_address ? 1 : 0; +} + +static int __init apbt_clockevent_register(void) +{ + struct sfi_timer_table_entry *mtmr; + struct apbt_dev *adev = &__get_cpu_var(cpu_apbt_dev); + + mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM); + if (mtmr == NULL) { + printk(KERN_ERR "Failed to get MTMR %d from SFI\n", + APBT_CLOCKEVENT0_NUM); + return -ENODEV; + } + + adev->num = smp_processor_id(); + adev->timer = dw_apb_clockevent_init(smp_processor_id(), "apbt0", + intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT ? + APBT_CLOCKEVENT_RATING - 100 : APBT_CLOCKEVENT_RATING, + adev_virt_addr(adev), 0, apbt_freq); + /* Firmware does EOI handling for us. 
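	   (which is why ->eoi is set to NULL just below and the
	   clockevent code never issues an EOI write)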
*/
+	adev->timer->eoi = NULL;
+
+	if (intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT) {
+		global_clock_event = &adev->timer->ced;
+		printk(KERN_DEBUG "%s clockevent registered as global\n",
+		       global_clock_event->name);
+	}
+
+	dw_apb_clockevent_register(adev->timer);
+
+	sfi_free_mtmr(mtmr);
+	return 0;
+}
+
+#ifdef CONFIG_SMP
+
+static void apbt_setup_irq(struct apbt_dev *adev)
+{
+	/* timer0 irq has been setup early */
+	if (adev->irq == 0)
+		return;
+
+	irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT);
+	irq_set_affinity(adev->irq, cpumask_of(adev->cpu));
+	/* APB timer irqs are set up as mp_irqs, timer is edge type */
+	__irq_set_handler(adev->irq, handle_edge_irq, 0, "edge");
+}
+
+/* Should be called on the CPU whose per cpu timer is being set up */
+void apbt_setup_secondary_clock(void)
+{
+	struct apbt_dev *adev;
+	int cpu;
+
+	/* Don't register boot CPU clockevent */
+	cpu = smp_processor_id();
+	if (!cpu)
+		return;
+
+	adev = &__get_cpu_var(cpu_apbt_dev);
+	if (!adev->timer) {
+		adev->timer = dw_apb_clockevent_init(cpu, adev->name,
+			APBT_CLOCKEVENT_RATING, adev_virt_addr(adev),
+			adev->irq, apbt_freq);
+		adev->timer->eoi = NULL;
+	} else {
+		dw_apb_clockevent_resume(adev->timer);
+	}
+
+	printk(KERN_INFO "Registering CPU %d clockevent device %s, cpu %08x\n",
+	       cpu, adev->name, adev->cpu);
+
+	apbt_setup_irq(adev);
+	dw_apb_clockevent_register(adev->timer);
+
+	return;
+}
+
+/*
+ * This notifier handler processes CPU hotplug events. In case of S0i3,
+ * non-boot cpus are disabled/enabled frequently; for performance reasons,
+ * we keep the per cpu timer irq registered so that we do not need to do
+ * free_irq/request_irq.
+ *
+ * TODO: it might be more reliable to directly disable the percpu
+ * clockevent device without the notifier chain. Currently, cpu 0 may get
+ * interrupts from other cpu timers during the offline process due to the
+ * ordering of notification. The extra interrupt is harmless.
+ */
+static int apbt_cpuhp_notify(struct notifier_block *n,
+			     unsigned long action, void *hcpu)
+{
+	unsigned long cpu = (unsigned long)hcpu;
+	struct apbt_dev *adev = &per_cpu(cpu_apbt_dev, cpu);
+
+	switch (action & 0xf) {
+	case CPU_DEAD:
+		dw_apb_clockevent_pause(adev->timer);
+		if (system_state == SYSTEM_RUNNING) {
+			pr_debug("skipping APBT CPU %lu offline\n", cpu);
+		} else {
+			pr_debug("APBT clockevent for cpu %lu offline\n", cpu);
+			dw_apb_clockevent_stop(adev->timer);
+		}
+		break;
+	default:
+		pr_debug("APBT notified %lu, no action\n", action);
+	}
+	return NOTIFY_OK;
+}
+
+static __init int apbt_late_init(void)
+{
+	if (intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT ||
+	    !apb_timer_block_enabled)
+		return 0;
+	/* This notifier should be called after workqueue is ready */
+	hotcpu_notifier(apbt_cpuhp_notify, -20);
+	return 0;
+}
+fs_initcall(apbt_late_init);
+#else
+
+void apbt_setup_secondary_clock(void) {}
+
+#endif /* CONFIG_SMP */
+
+static int apbt_clocksource_register(void)
+{
+	u64 start, now;
+	cycle_t t1;
+
+	/* Start the counter, use timer 2 as source, timer 0/1 for event */
+	dw_apb_clocksource_start(clocksource_apbt);
+
+	/* Verify whether apbt counter works */
+	t1 = dw_apb_clocksource_read(clocksource_apbt);
+	rdtscll(start);
+
+	/*
+	 * We don't know the TSC frequency yet, but waiting for
+	 * 200000 TSC cycles is safe:
+	 * 4 GHz == 50us
+	 * 1 GHz == 200us
+	 */
+	do {
+		rep_nop();
+		rdtscll(now);
+	} while ((now - start) < 200000UL);
+
+	/* APBT is the only always-on clocksource, it has to work! */
+	if (t1 == dw_apb_clocksource_read(clocksource_apbt))
+		panic("APBT counter not counting.
APBT disabled\n"); + + dw_apb_clocksource_register(clocksource_apbt); + + return 0; +} + +/* + * Early setup the APBT timer, only use timer 0 for booting then switch to + * per CPU timer if possible. + * returns 1 if per cpu apbt is setup + * returns 0 if no per cpu apbt is chosen + * panic if set up failed, this is the only platform timer on Moorestown. + */ +void __init apbt_time_init(void) +{ +#ifdef CONFIG_SMP + int i; + struct sfi_timer_table_entry *p_mtmr; + struct apbt_dev *adev; +#endif + + if (apb_timer_block_enabled) + return; + apbt_set_mapping(); + if (!apbt_virt_address) + goto out_noapbt; + /* + * Read the frequency and check for a sane value, for ESL model + * we extend the possible clock range to allow time scaling. + */ + + if (apbt_freq < APBT_MIN_FREQ || apbt_freq > APBT_MAX_FREQ) { + pr_debug("APBT has invalid freq 0x%lx\n", apbt_freq); + goto out_noapbt; + } + if (apbt_clocksource_register()) { + pr_debug("APBT has failed to register clocksource\n"); + goto out_noapbt; + } + if (!apbt_clockevent_register()) + apb_timer_block_enabled = 1; + else { + pr_debug("APBT has failed to register clockevent\n"); + goto out_noapbt; + } +#ifdef CONFIG_SMP + /* kernel cmdline disable apb timer, so we will use lapic timers */ + if (intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT) { + printk(KERN_INFO "apbt: disabled per cpu timer\n"); + return; + } + pr_debug("%s: %d CPUs online\n", __func__, num_online_cpus()); + if (num_possible_cpus() <= sfi_mtimer_num) + apbt_num_timers_used = num_possible_cpus(); + else + apbt_num_timers_used = 1; + pr_debug("%s: %d APB timers used\n", __func__, apbt_num_timers_used); + + /* here we set up per CPU timer data structure */ + for (i = 0; i < apbt_num_timers_used; i++) { + adev = &per_cpu(cpu_apbt_dev, i); + adev->num = i; + adev->cpu = i; + p_mtmr = sfi_get_mtmr(i); + if (p_mtmr) + adev->irq = p_mtmr->irq; + else + printk(KERN_ERR "Failed to get timer for cpu %d\n", i); + snprintf(adev->name, sizeof(adev->name) - 1, "apbt%d", i); + } +#endif + + return; + +out_noapbt: + apbt_clear_mapping(); + apb_timer_block_enabled = 0; + panic("failed to enable APB timer\n"); +} + +/* called before apb_timer_enable, use early map */ +unsigned long apbt_quick_calibrate(void) +{ + int i, scale; + u64 old, new; + cycle_t t1, t2; + unsigned long khz = 0; + u32 loop, shift; + + apbt_set_mapping(); + dw_apb_clocksource_start(clocksource_apbt); + + /* check if the timer can count down, otherwise return */ + old = dw_apb_clocksource_read(clocksource_apbt); + i = 10000; + while (--i) { + if (old != dw_apb_clocksource_read(clocksource_apbt)) + break; + } + if (!i) + goto failed; + + /* count 16 ms */ + loop = (apbt_freq / 1000) << 4; + + /* restart the timer to ensure it won't get to 0 in the calibration */ + dw_apb_clocksource_start(clocksource_apbt); + + old = dw_apb_clocksource_read(clocksource_apbt); + old += loop; + + t1 = __native_read_tsc(); + + do { + new = dw_apb_clocksource_read(clocksource_apbt); + } while (new < old); + + t2 = __native_read_tsc(); + + shift = 5; + if (unlikely(loop >> shift == 0)) { + printk(KERN_INFO + "APBT TSC calibration failed, not enough resolution\n"); + return 0; + } + scale = (int)div_u64((t2 - t1), loop >> shift); + khz = (scale * (apbt_freq / 1000)) >> shift; + printk(KERN_INFO "TSC freq calculated by APB timer is %lu khz\n", khz); + return khz; +failed: + return 0; +} diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c new file mode 100644 index 00000000000..76164e173a2 --- /dev/null +++ 
b/arch/x86/kernel/aperture_64.c
@@ -0,0 +1,502 @@
+/*
+ * Firmware replacement code.
+ *
+ * Work around broken BIOSes that don't set an aperture, only set the
+ * aperture in the AGP bridge, or set too small an aperture.
+ *
+ * If all else fails, map the aperture over some low memory. This is cheaper
+ * than doing bounce buffering. The memory is lost. This is done at early boot
+ * because only the bootmem allocator can allocate 32+MB.
+ *
+ * Copyright 2002 Andi Kleen, SuSE Labs.
+ */
+#define pr_fmt(fmt) "AGP: " fmt
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/memblock.h>
+#include <linux/mmzone.h>
+#include <linux/pci_ids.h>
+#include <linux/pci.h>
+#include <linux/bitops.h>
+#include <linux/suspend.h>
+#include <asm/e820.h>
+#include <asm/io.h>
+#include <asm/iommu.h>
+#include <asm/gart.h>
+#include <asm/pci-direct.h>
+#include <asm/dma.h>
+#include <asm/amd_nb.h>
+#include <asm/x86_init.h>
+
+/*
+ * Use 512M as the goal, in case a kexec will load a kernel_big
+ * that does in-place decompression and could overlap with the
+ * gart aperture that is in use.
+ * Sequence:
+ * kernel_small
+ * ==> kexec (with kdump trigger path or gart still enabled)
+ * ==> kernel_small (gart area becomes e820_reserved)
+ * ==> kexec (with kdump trigger path or gart still enabled)
+ * ==> kernel_big (uncompressed size will be bigger than 64M or 128M)
+ * So don't place the gart iommu below 512M; leave that space for the
+ * kernel code, to be safe.
+ */
+#define GART_MIN_ADDR	(512ULL << 20)
+#define GART_MAX_ADDR	(1ULL << 32)
+
+int gart_iommu_aperture;
+int gart_iommu_aperture_disabled __initdata;
+int gart_iommu_aperture_allowed __initdata;
+
+int fallback_aper_order __initdata = 1; /* 64MB */
+int fallback_aper_force __initdata;
+
+int fix_aperture __initdata = 1;
+
+/*
+ * This code runs before the PCI subsystem is initialized, so just
+ * access the northbridge directly.
+ */
+
+static u32 __init allocate_aperture(void)
+{
+	u32 aper_size;
+	unsigned long addr;
+
+	/* aper_size should be <= 1G */
+	if (fallback_aper_order > 5)
+		fallback_aper_order = 5;
+	aper_size = (32 * 1024 * 1024) << fallback_aper_order;
+
+	/*
+	 * Aperture has to be naturally aligned. This means a 2GB aperture
+	 * won't have much chance of finding a place in the lower 4GB of
+	 * memory. Unfortunately we cannot move it up because that would
+	 * make the IOMMU useless.
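+	 *
+	 * ("Naturally aligned" means the base address is a multiple of
+	 * the aperture size. With the default fallback_aper_order of 1,
+	 * for instance:
+	 *
+	 *	aper_size = (32 * 1024 * 1024) << 1;	== 64MB
+	 *	addr & (aper_size - 1) == 0		must hold
+	 *
+	 * which is what the size-aligned memblock search below enforces.)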
+ */ + addr = memblock_find_in_range(GART_MIN_ADDR, GART_MAX_ADDR, + aper_size, aper_size); + if (!addr) { + pr_err("Cannot allocate aperture memory hole [mem %#010lx-%#010lx] (%uKB)\n", + addr, addr + aper_size - 1, aper_size >> 10); + return 0; + } + memblock_reserve(addr, aper_size); + pr_info("Mapping aperture over RAM [mem %#010lx-%#010lx] (%uKB)\n", + addr, addr + aper_size - 1, aper_size >> 10); + register_nosave_region(addr >> PAGE_SHIFT, + (addr+aper_size) >> PAGE_SHIFT); + + return (u32)addr; +} + + +/* Find a PCI capability */ +static u32 __init find_cap(int bus, int slot, int func, int cap) +{ + int bytes; + u8 pos; + + if (!(read_pci_config_16(bus, slot, func, PCI_STATUS) & + PCI_STATUS_CAP_LIST)) + return 0; + + pos = read_pci_config_byte(bus, slot, func, PCI_CAPABILITY_LIST); + for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) { + u8 id; + + pos &= ~3; + id = read_pci_config_byte(bus, slot, func, pos+PCI_CAP_LIST_ID); + if (id == 0xff) + break; + if (id == cap) + return pos; + pos = read_pci_config_byte(bus, slot, func, + pos+PCI_CAP_LIST_NEXT); + } + return 0; +} + +/* Read a standard AGPv3 bridge header */ +static u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order) +{ + u32 apsize; + u32 apsizereg; + int nbits; + u32 aper_low, aper_hi; + u64 aper; + u32 old_order; + + pr_info("pci 0000:%02x:%02x:%02x: AGP bridge\n", bus, slot, func); + apsizereg = read_pci_config_16(bus, slot, func, cap + 0x14); + if (apsizereg == 0xffffffff) { + pr_err("pci 0000:%02x:%02x.%d: APSIZE unreadable\n", + bus, slot, func); + return 0; + } + + /* old_order could be the value from NB gart setting */ + old_order = *order; + + apsize = apsizereg & 0xfff; + /* Some BIOS use weird encodings not in the AGPv3 table. */ + if (apsize & 0xff) + apsize |= 0xf00; + nbits = hweight16(apsize); + *order = 7 - nbits; + if ((int)*order < 0) /* < 32MB */ + *order = 0; + + aper_low = read_pci_config(bus, slot, func, 0x10); + aper_hi = read_pci_config(bus, slot, func, 0x14); + aper = (aper_low & ~((1<<22)-1)) | ((u64)aper_hi << 32); + + /* + * On some sick chips, APSIZE is 0. It means it wants 4G + * so let double check that order, and lets trust AMD NB settings: + */ + pr_info("pci 0000:%02x:%02x.%d: AGP aperture [bus addr %#010Lx-%#010Lx] (old size %uMB)\n", + bus, slot, func, aper, aper + (32ULL << (old_order + 20)) - 1, + 32 << old_order); + if (aper + (32ULL<<(20 + *order)) > 0x100000000ULL) { + pr_info("pci 0000:%02x:%02x.%d: AGP aperture size %uMB (APSIZE %#x) is not right, using settings from NB\n", + bus, slot, func, 32 << *order, apsizereg); + *order = old_order; + } + + pr_info("pci 0000:%02x:%02x.%d: AGP aperture [bus addr %#010Lx-%#010Lx] (%uMB, APSIZE %#x)\n", + bus, slot, func, aper, aper + (32ULL << (*order + 20)) - 1, + 32 << *order, apsizereg); + + if (!aperture_valid(aper, (32*1024*1024) << *order, 32<<20)) + return 0; + return (u32)aper; +} + +/* + * Look for an AGP bridge. Windows only expects the aperture in the + * AGP bridge and some BIOS forget to initialize the Northbridge too. + * Work around this here. + * + * Do an PCI bus scan by hand because we're running before the PCI + * subsystem. + * + * All AMD AGP bridges are AGPv3 compliant, so we can do this scan + * generically. It's probably overkill to always scan all slots because + * the AGP bridges should be always an own bus on the HT hierarchy, + * but do it here for future safety. 
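+ *
+ * read_pci_config() and friends work this early because they use the
+ * legacy type 1 configuration mechanism (I/O ports 0xCF8/0xCFC)
+ * directly; roughly (an illustrative sketch only):
+ *
+ *	outl(0x80000000 | (bus << 16) | (slot << 11) | (func << 8) | off,
+ *	     0xCF8);
+ *	val = inl(0xCFC);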
+ */
+static u32 __init search_agp_bridge(u32 *order, int *valid_agp)
+{
+	int bus, slot, func;
+
+	/* Poor man's PCI discovery */
+	for (bus = 0; bus < 256; bus++) {
+		for (slot = 0; slot < 32; slot++) {
+			for (func = 0; func < 8; func++) {
+				u32 class, cap;
+				u8 type;
+				class = read_pci_config(bus, slot, func,
+							PCI_CLASS_REVISION);
+				if (class == 0xffffffff)
+					break;
+
+				switch (class >> 16) {
+				case PCI_CLASS_BRIDGE_HOST:
+				case PCI_CLASS_BRIDGE_OTHER: /* needed? */
+					/* AGP bridge? */
+					cap = find_cap(bus, slot, func,
+						       PCI_CAP_ID_AGP);
+					if (!cap)
+						break;
+					*valid_agp = 1;
+					return read_agp(bus, slot, func, cap,
+							order);
+				}
+
+				/* No multi-function device? */
+				type = read_pci_config_byte(bus, slot, func,
+							    PCI_HEADER_TYPE);
+				if (!(type & 0x80))
+					break;
+			}
+		}
+	}
+	pr_info("No AGP bridge found\n");
+
+	return 0;
+}
+
+static int gart_fix_e820 __initdata = 1;
+
+static int __init parse_gart_mem(char *p)
+{
+	if (!p)
+		return -EINVAL;
+
+	if (!strncmp(p, "off", 3))
+		gart_fix_e820 = 0;
+	else if (!strncmp(p, "on", 2))
+		gart_fix_e820 = 1;
+
+	return 0;
+}
+early_param("gart_fix_e820", parse_gart_mem);
+
+void __init early_gart_iommu_check(void)
+{
+	/*
+	 * The GART may already be enabled, especially for kexec/kdump,
+	 * where the previous kernel enabled it. The memset called by
+	 * allocate_aperture/__alloc_bootmem_nopanic over an active GART
+	 * causes a restart. The second kernel may also place the GART
+	 * hole at a different position and use as RAM a hole that is
+	 * still used by the GART set up by the first kernel, or the BIOS
+	 * may forget to mark the hole reserved.
+	 * Try to update e820 so that the region is marked reserved.
+	 */
+	u32 agp_aper_order = 0;
+	int i, fix, slot, valid_agp = 0;
+	u32 ctl;
+	u32 aper_size = 0, aper_order = 0, last_aper_order = 0;
+	u64 aper_base = 0, last_aper_base = 0;
+	int aper_enabled = 0, last_aper_enabled = 0, last_valid = 0;
+
+	if (!early_pci_allowed())
+		return;
+
+	/* This is mostly a duplicate of gart_iommu_hole_init() */
+	search_agp_bridge(&agp_aper_order, &valid_agp);
+
+	fix = 0;
+	for (i = 0; amd_nb_bus_dev_ranges[i].dev_limit; i++) {
+		int bus;
+		int dev_base, dev_limit;
+
+		bus = amd_nb_bus_dev_ranges[i].bus;
+		dev_base = amd_nb_bus_dev_ranges[i].dev_base;
+		dev_limit = amd_nb_bus_dev_ranges[i].dev_limit;
+
+		for (slot = dev_base; slot < dev_limit; slot++) {
+			if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
+				continue;
+
+			ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
+			aper_enabled = ctl & GARTEN;
+			aper_order = (ctl >> 1) & 7;
+			aper_size = (32 * 1024 * 1024) << aper_order;
+			aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff;
+			aper_base <<= 25;
+
+			if (last_valid) {
+				if ((aper_order != last_aper_order) ||
+				    (aper_base != last_aper_base) ||
+				    (aper_enabled != last_aper_enabled)) {
+					fix = 1;
+					break;
+				}
+			}
+
+			last_aper_order = aper_order;
+			last_aper_base = aper_base;
+			last_aper_enabled = aper_enabled;
+			last_valid = 1;
+		}
+	}
+
+	if (!fix && !aper_enabled)
+		return;
+
+	if (!aper_base || !aper_size || aper_base + aper_size > 0x100000000UL)
+		fix = 1;
+
+	if (gart_fix_e820 && !fix && aper_enabled) {
+		if (e820_any_mapped(aper_base, aper_base + aper_size,
+				    E820_RAM)) {
+			/* reserve it, so we can reuse it in the second kernel */
+			pr_info("e820: reserve [mem %#010Lx-%#010Lx] for GART\n",
+				aper_base, aper_base + aper_size - 1);
+			e820_add_region(aper_base, aper_size, E820_RESERVED);
+			update_e820();
+		}
+	}
+
+	if (valid_agp)
+		return;
+
+	/* disable them all at first */
+	for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) {
+		int bus;
+
int dev_base, dev_limit; + + bus = amd_nb_bus_dev_ranges[i].bus; + dev_base = amd_nb_bus_dev_ranges[i].dev_base; + dev_limit = amd_nb_bus_dev_ranges[i].dev_limit; + + for (slot = dev_base; slot < dev_limit; slot++) { + if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00))) + continue; + + ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL); + ctl &= ~GARTEN; + write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl); + } + } + +} + +static int __initdata printed_gart_size_msg; + +int __init gart_iommu_hole_init(void) +{ + u32 agp_aper_base = 0, agp_aper_order = 0; + u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0; + u64 aper_base, last_aper_base = 0; + int fix, slot, valid_agp = 0; + int i, node; + + if (gart_iommu_aperture_disabled || !fix_aperture || + !early_pci_allowed()) + return -ENODEV; + + pr_info("Checking aperture...\n"); + + if (!fallback_aper_force) + agp_aper_base = search_agp_bridge(&agp_aper_order, &valid_agp); + + fix = 0; + node = 0; + for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) { + int bus; + int dev_base, dev_limit; + u32 ctl; + + bus = amd_nb_bus_dev_ranges[i].bus; + dev_base = amd_nb_bus_dev_ranges[i].dev_base; + dev_limit = amd_nb_bus_dev_ranges[i].dev_limit; + + for (slot = dev_base; slot < dev_limit; slot++) { + if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00))) + continue; + + iommu_detected = 1; + gart_iommu_aperture = 1; + x86_init.iommu.iommu_init = gart_iommu_init; + + ctl = read_pci_config(bus, slot, 3, + AMD64_GARTAPERTURECTL); + + /* + * Before we do anything else disable the GART. It may + * still be enabled if we boot into a crash-kernel here. + * Reconfiguring the GART while it is enabled could have + * unknown side-effects. + */ + ctl &= ~GARTEN; + write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl); + + aper_order = (ctl >> 1) & 7; + aper_size = (32 * 1024 * 1024) << aper_order; + aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff; + aper_base <<= 25; + + pr_info("Node %d: aperture [bus addr %#010Lx-%#010Lx] (%uMB)\n", + node, aper_base, aper_base + aper_size - 1, + aper_size >> 20); + node++; + + if (!aperture_valid(aper_base, aper_size, 64<<20)) { + if (valid_agp && agp_aper_base && + agp_aper_base == aper_base && + agp_aper_order == aper_order) { + /* the same between two setting from NB and agp */ + if (!no_iommu && + max_pfn > MAX_DMA32_PFN && + !printed_gart_size_msg) { + pr_err("you are using iommu with agp, but GART size is less than 64MB\n"); + pr_err("please increase GART size in your BIOS setup\n"); + pr_err("if BIOS doesn't have that option, contact your HW vendor!\n"); + printed_gart_size_msg = 1; + } + } else { + fix = 1; + goto out; + } + } + + if ((last_aper_order && aper_order != last_aper_order) || + (last_aper_base && aper_base != last_aper_base)) { + fix = 1; + goto out; + } + last_aper_order = aper_order; + last_aper_base = aper_base; + } + } + +out: + if (!fix && !fallback_aper_force) { + if (last_aper_base) + return 1; + return 0; + } + + if (!fallback_aper_force) { + aper_alloc = agp_aper_base; + aper_order = agp_aper_order; + } + + if (aper_alloc) { + /* Got the aperture from the AGP bridge */ + } else if ((!no_iommu && max_pfn > MAX_DMA32_PFN) || + force_iommu || + valid_agp || + fallback_aper_force) { + pr_info("Your BIOS doesn't leave a aperture memory hole\n"); + pr_info("Please enable the IOMMU option in the BIOS setup\n"); + pr_info("This costs you %dMB of RAM\n", + 32 << fallback_aper_order); + + aper_order = fallback_aper_order; + 
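+		/* No usable AGP aperture: carve the hole out of low RAM instead */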
aper_alloc = allocate_aperture(); + if (!aper_alloc) { + /* + * Could disable AGP and IOMMU here, but it's + * probably not worth it. But the later users + * cannot deal with bad apertures and turning + * on the aperture over memory causes very + * strange problems, so it's better to panic + * early. + */ + panic("Not enough memory for aperture"); + } + } else { + return 0; + } + + /* Fix up the north bridges */ + for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) { + int bus, dev_base, dev_limit; + + /* + * Don't enable translation yet but enable GART IO and CPU + * accesses and set DISTLBWALKPRB since GART table memory is UC. + */ + u32 ctl = aper_order << 1; + + bus = amd_nb_bus_dev_ranges[i].bus; + dev_base = amd_nb_bus_dev_ranges[i].dev_base; + dev_limit = amd_nb_bus_dev_ranges[i].dev_limit; + for (slot = dev_base; slot < dev_limit; slot++) { + if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00))) + continue; + + write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl); + write_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE, aper_alloc >> 25); + } + } + + set_up_gart_resume(aper_order, aper_alloc); + + return 1; +} diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile new file mode 100644 index 00000000000..dcb5b15401c --- /dev/null +++ b/arch/x86/kernel/apic/Makefile @@ -0,0 +1,24 @@ +# +# Makefile for local APIC drivers and for the IO-APIC code +# + +obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o ipi.o +obj-y += hw_nmi.o + +obj-$(CONFIG_X86_IO_APIC) += io_apic.o +obj-$(CONFIG_SMP) += ipi.o + +ifeq ($(CONFIG_X86_64),y) +# APIC probe will depend on the listing order here +obj-$(CONFIG_X86_NUMACHIP) += apic_numachip.o +obj-$(CONFIG_X86_UV) += x2apic_uv_x.o +obj-$(CONFIG_X86_X2APIC) += x2apic_phys.o +obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o +obj-y += apic_flat_64.o +endif + +# APIC probe will depend on the listing order here +obj-$(CONFIG_X86_BIGSMP) += bigsmp_32.o + +# For 32bit, probe_32 need to be listed last +obj-$(CONFIG_X86_LOCAL_APIC) += probe_$(BITS).o diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c new file mode 100644 index 00000000000..ad28db7e6bd --- /dev/null +++ b/arch/x86/kernel/apic/apic.c @@ -0,0 +1,2649 @@ +/* + * Local APIC handling, local APIC timers + * + * (c) 1999, 2000, 2009 Ingo Molnar <mingo@redhat.com> + * + * Fixes + * Maciej W. Rozycki : Bits for genuine 82489DX APICs; + * thanks to Eric Gilmore + * and Rolf G. Tews + * for testing these extensively. + * Maciej W. Rozycki : Various updates and fixes. + * Mikael Pettersson : Power Management for UP-APIC. + * Pavel Machek and + * Mikael Pettersson : PM converted to driver model. 
+ */ + +#include <linux/perf_event.h> +#include <linux/kernel_stat.h> +#include <linux/mc146818rtc.h> +#include <linux/acpi_pmtmr.h> +#include <linux/clockchips.h> +#include <linux/interrupt.h> +#include <linux/bootmem.h> +#include <linux/ftrace.h> +#include <linux/ioport.h> +#include <linux/module.h> +#include <linux/syscore_ops.h> +#include <linux/delay.h> +#include <linux/timex.h> +#include <linux/i8253.h> +#include <linux/dmar.h> +#include <linux/init.h> +#include <linux/cpu.h> +#include <linux/dmi.h> +#include <linux/smp.h> +#include <linux/mm.h> + +#include <asm/trace/irq_vectors.h> +#include <asm/irq_remapping.h> +#include <asm/perf_event.h> +#include <asm/x86_init.h> +#include <asm/pgalloc.h> +#include <linux/atomic.h> +#include <asm/mpspec.h> +#include <asm/i8259.h> +#include <asm/proto.h> +#include <asm/apic.h> +#include <asm/io_apic.h> +#include <asm/desc.h> +#include <asm/hpet.h> +#include <asm/idle.h> +#include <asm/mtrr.h> +#include <asm/time.h> +#include <asm/smp.h> +#include <asm/mce.h> +#include <asm/tsc.h> +#include <asm/hypervisor.h> + +unsigned int num_processors; + +unsigned disabled_cpus; + +/* Processor that is doing the boot up */ +unsigned int boot_cpu_physical_apicid = -1U; +EXPORT_SYMBOL_GPL(boot_cpu_physical_apicid); + +/* + * The highest APIC ID seen during enumeration. + */ +unsigned int max_physical_apicid; + +/* + * Bitmask of physically existing CPUs: + */ +physid_mask_t phys_cpu_present_map; + +/* + * Processor to be disabled specified by kernel parameter + * disable_cpu_apicid=<int>, mostly used for the kdump 2nd kernel to + * avoid undefined behaviour caused by sending INIT from AP to BSP. + */ +static unsigned int disabled_cpu_apicid __read_mostly = BAD_APICID; + +/* + * Map cpu index to physical APIC ID + */ +DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid, BAD_APICID); +DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid, BAD_APICID); +EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid); +EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid); + +#ifdef CONFIG_X86_32 + +/* + * On x86_32, the mapping between cpu and logical apicid may vary + * depending on apic in use. The following early percpu variable is + * used for the mapping. This is where the behaviors of x86_64 and 32 + * actually diverge. Let's keep it ugly for now. + */ +DEFINE_EARLY_PER_CPU_READ_MOSTLY(int, x86_cpu_to_logical_apicid, BAD_APICID); + +/* Local APIC was disabled by the BIOS and enabled by the kernel */ +static int enabled_via_apicbase; + +/* + * Handle interrupt mode configuration register (IMCR). + * This register controls whether the interrupt signals + * that reach the BSP come from the master PIC or from the + * local APIC. Before entering Symmetric I/O Mode, either + * the BIOS or the operating system must switch out of + * PIC Mode by changing the IMCR. + */ +static inline void imcr_pic_to_apic(void) +{ + /* select IMCR register */ + outb(0x70, 0x22); + /* NMI and 8259 INTR go through APIC */ + outb(0x01, 0x23); +} + +static inline void imcr_apic_to_pic(void) +{ + /* select IMCR register */ + outb(0x70, 0x22); + /* NMI and 8259 INTR go directly to BSP */ + outb(0x00, 0x23); +} +#endif + +/* + * Knob to control our willingness to enable the local APIC. 
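+ *
+ *  0=default (leave detection to CPUID/BIOS)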
+ * + * +1=force-enable + */ +static int force_enable_local_apic __initdata; + +/* Control whether x2APIC mode is enabled or not */ +static bool nox2apic __initdata; + +/* + * APIC command line parameters + */ +static int __init parse_lapic(char *arg) +{ + if (config_enabled(CONFIG_X86_32) && !arg) + force_enable_local_apic = 1; + else if (arg && !strncmp(arg, "notscdeadline", 13)) + setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); + return 0; +} +early_param("lapic", parse_lapic); + +#ifdef CONFIG_X86_64 +static int apic_calibrate_pmtmr __initdata; +static __init int setup_apicpmtimer(char *s) +{ + apic_calibrate_pmtmr = 1; + notsc_setup(NULL); + return 0; +} +__setup("apicpmtimer", setup_apicpmtimer); +#endif + +int x2apic_mode; +#ifdef CONFIG_X86_X2APIC +/* x2apic enabled before OS handover */ +int x2apic_preenabled; +static int x2apic_disabled; +static int __init setup_nox2apic(char *str) +{ + if (x2apic_enabled()) { + int apicid = native_apic_msr_read(APIC_ID); + + if (apicid >= 255) { + pr_warning("Apicid: %08x, cannot enforce nox2apic\n", + apicid); + return 0; + } + + pr_warning("x2apic already enabled. will disable it\n"); + } else + setup_clear_cpu_cap(X86_FEATURE_X2APIC); + + nox2apic = true; + + return 0; +} +early_param("nox2apic", setup_nox2apic); +#endif + +unsigned long mp_lapic_addr; +int disable_apic; +/* Disable local APIC timer from the kernel commandline or via dmi quirk */ +static int disable_apic_timer __initdata; +/* Local APIC timer works in C2 */ +int local_apic_timer_c2_ok; +EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); + +int first_system_vector = 0xfe; + +/* + * Debug level, exported for io_apic.c + */ +unsigned int apic_verbosity; + +int pic_mode; + +/* Have we found an MP table */ +int smp_found_config; + +static struct resource lapic_resource = { + .name = "Local APIC", + .flags = IORESOURCE_MEM | IORESOURCE_BUSY, +}; + +unsigned int lapic_timer_frequency = 0; + +static void apic_pm_activate(void); + +static unsigned long apic_phys; + +/* + * Get the LAPIC version + */ +static inline int lapic_get_version(void) +{ + return GET_APIC_VERSION(apic_read(APIC_LVR)); +} + +/* + * Check, if the APIC is integrated or a separate chip + */ +static inline int lapic_is_integrated(void) +{ +#ifdef CONFIG_X86_64 + return 1; +#else + return APIC_INTEGRATED(lapic_get_version()); +#endif +} + +/* + * Check, whether this is a modern or a first generation APIC + */ +static int modern_apic(void) +{ + /* AMD systems use old APIC versions, so check the CPU */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && + boot_cpu_data.x86 >= 0xf) + return 1; + return lapic_get_version() >= 0x14; +} + +/* + * right after this call apic become NOOP driven + * so apic->write/read doesn't do anything + */ +static void __init apic_disable(void) +{ + pr_info("APIC: switched to apic NOOP\n"); + apic = &apic_noop; +} + +void native_apic_wait_icr_idle(void) +{ + while (apic_read(APIC_ICR) & APIC_ICR_BUSY) + cpu_relax(); +} + +u32 native_safe_apic_wait_icr_idle(void) +{ + u32 send_status; + int timeout; + + timeout = 0; + do { + send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; + if (!send_status) + break; + inc_irq_stat(icr_read_retry_count); + udelay(100); + } while (timeout++ < 1000); + + return send_status; +} + +void native_apic_icr_write(u32 low, u32 id) +{ + unsigned long flags; + + local_irq_save(flags); + apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id)); + apic_write(APIC_ICR, low); + local_irq_restore(flags); +} + +u64 native_apic_icr_read(void) +{ + u32 icr1, icr2; + + icr2 = 
apic_read(APIC_ICR2); + icr1 = apic_read(APIC_ICR); + + return icr1 | ((u64)icr2 << 32); +} + +#ifdef CONFIG_X86_32 +/** + * get_physical_broadcast - Get number of physical broadcast IDs + */ +int get_physical_broadcast(void) +{ + return modern_apic() ? 0xff : 0xf; +} +#endif + +/** + * lapic_get_maxlvt - get the maximum number of local vector table entries + */ +int lapic_get_maxlvt(void) +{ + unsigned int v; + + v = apic_read(APIC_LVR); + /* + * - we always have APIC integrated on 64bit mode + * - 82489DXs do not report # of LVT entries + */ + return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2; +} + +/* + * Local APIC timer + */ + +/* Clock divisor */ +#define APIC_DIVISOR 16 +#define TSC_DIVISOR 32 + +/* + * This function sets up the local APIC timer, with a timeout of + * 'clocks' APIC bus clock. During calibration we actually call + * this function twice on the boot CPU, once with a bogus timeout + * value, second time for real. The other (noncalibrating) CPUs + * call this function only once, with the real, calibrated value. + * + * We do reads before writes even if unnecessary, to get around the + * P5 APIC double write bug. + */ +static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) +{ + unsigned int lvtt_value, tmp_value; + + lvtt_value = LOCAL_TIMER_VECTOR; + if (!oneshot) + lvtt_value |= APIC_LVT_TIMER_PERIODIC; + else if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) + lvtt_value |= APIC_LVT_TIMER_TSCDEADLINE; + + if (!lapic_is_integrated()) + lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV); + + if (!irqen) + lvtt_value |= APIC_LVT_MASKED; + + apic_write(APIC_LVTT, lvtt_value); + + if (lvtt_value & APIC_LVT_TIMER_TSCDEADLINE) { + printk_once(KERN_DEBUG "TSC deadline timer enabled\n"); + return; + } + + /* + * Divide PICLK by 16 + */ + tmp_value = apic_read(APIC_TDCR); + apic_write(APIC_TDCR, + (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) | + APIC_TDR_DIV_16); + + if (!oneshot) + apic_write(APIC_TMICT, clocks / APIC_DIVISOR); +} + +/* + * Setup extended LVT, AMD specific + * + * Software should use the LVT offsets the BIOS provides. The offsets + * are determined by the subsystems using it like those for MCE + * threshold or IBS. On K8 only offset 0 (APIC500) and MCE interrupts + * are supported. Beginning with family 10h at least 4 offsets are + * available. + * + * Since the offsets must be consistent for all cores, we keep track + * of the LVT offsets in software and reserve the offset for the same + * vector also to be used on other cores. An offset is freed by + * setting the entry to APIC_EILVT_MASKED. + * + * If the BIOS is right, there should be no conflicts. Otherwise a + * "[Firmware Bug]: ..." error message is generated. However, if + * software does not properly determines the offsets, it is not + * necessarily a BIOS bug. 
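+ *
+ * For example (offset and vector numbers made up for illustration):
+ * if one core reserves offset 1 for IBS with vector 0xf8,
+ * reserve_eilvt_offset() records 0xf8 in eilvt_offsets[1]; a later
+ * caller asking for offset 1 with a different vector gets the
+ * already-reserved value back, and setup_APIC_eilvt() fails with an
+ * error instead of silently reprogramming the entry.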
+ */ + +static atomic_t eilvt_offsets[APIC_EILVT_NR_MAX]; + +static inline int eilvt_entry_is_changeable(unsigned int old, unsigned int new) +{ + return (old & APIC_EILVT_MASKED) + || (new == APIC_EILVT_MASKED) + || ((new & ~APIC_EILVT_MASKED) == old); +} + +static unsigned int reserve_eilvt_offset(int offset, unsigned int new) +{ + unsigned int rsvd, vector; + + if (offset >= APIC_EILVT_NR_MAX) + return ~0; + + rsvd = atomic_read(&eilvt_offsets[offset]); + do { + vector = rsvd & ~APIC_EILVT_MASKED; /* 0: unassigned */ + if (vector && !eilvt_entry_is_changeable(vector, new)) + /* may not change if vectors are different */ + return rsvd; + rsvd = atomic_cmpxchg(&eilvt_offsets[offset], rsvd, new); + } while (rsvd != new); + + rsvd &= ~APIC_EILVT_MASKED; + if (rsvd && rsvd != vector) + pr_info("LVT offset %d assigned for vector 0x%02x\n", + offset, rsvd); + + return new; +} + +/* + * If mask=1, the LVT entry does not generate interrupts while mask=0 + * enables the vector. See also the BKDGs. Must be called with + * preemption disabled. + */ + +int setup_APIC_eilvt(u8 offset, u8 vector, u8 msg_type, u8 mask) +{ + unsigned long reg = APIC_EILVTn(offset); + unsigned int new, old, reserved; + + new = (mask << 16) | (msg_type << 8) | vector; + old = apic_read(reg); + reserved = reserve_eilvt_offset(offset, new); + + if (reserved != new) { + pr_err(FW_BUG "cpu %d, try to use APIC%lX (LVT offset %d) for " + "vector 0x%x, but the register is already in use for " + "vector 0x%x on another cpu\n", + smp_processor_id(), reg, offset, new, reserved); + return -EINVAL; + } + + if (!eilvt_entry_is_changeable(old, new)) { + pr_err(FW_BUG "cpu %d, try to use APIC%lX (LVT offset %d) for " + "vector 0x%x, but the register is already in use for " + "vector 0x%x on this cpu\n", + smp_processor_id(), reg, offset, new, old); + return -EBUSY; + } + + apic_write(reg, new); + + return 0; +} +EXPORT_SYMBOL_GPL(setup_APIC_eilvt); + +/* + * Program the next event, relative to now + */ +static int lapic_next_event(unsigned long delta, + struct clock_event_device *evt) +{ + apic_write(APIC_TMICT, delta); + return 0; +} + +static int lapic_next_deadline(unsigned long delta, + struct clock_event_device *evt) +{ + u64 tsc; + + rdtscll(tsc); + wrmsrl(MSR_IA32_TSC_DEADLINE, tsc + (((u64) delta) * TSC_DIVISOR)); + return 0; +} + +/* + * Setup the lapic timer in periodic or oneshot mode + */ +static void lapic_timer_setup(enum clock_event_mode mode, + struct clock_event_device *evt) +{ + unsigned long flags; + unsigned int v; + + /* Lapic used as dummy for broadcast ? */ + if (evt->features & CLOCK_EVT_FEAT_DUMMY) + return; + + local_irq_save(flags); + + switch (mode) { + case CLOCK_EVT_MODE_PERIODIC: + case CLOCK_EVT_MODE_ONESHOT: + __setup_APIC_LVTT(lapic_timer_frequency, + mode != CLOCK_EVT_MODE_PERIODIC, 1); + break; + case CLOCK_EVT_MODE_UNUSED: + case CLOCK_EVT_MODE_SHUTDOWN: + v = apic_read(APIC_LVTT); + v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); + apic_write(APIC_LVTT, v); + apic_write(APIC_TMICT, 0); + break; + case CLOCK_EVT_MODE_RESUME: + /* Nothing to do here */ + break; + } + + local_irq_restore(flags); +} + +/* + * Local APIC timer broadcast function + */ +static void lapic_timer_broadcast(const struct cpumask *mask) +{ +#ifdef CONFIG_SMP + apic->send_IPI_mask(mask, LOCAL_TIMER_VECTOR); +#endif +} + + +/* + * The local apic timer can be used for any function which is CPU local. 
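+ *
+ * Expiry times are converted with the usual clockevents fixed-point
+ * pair, roughly (a sketch; shift is 32 in the structure below):
+ *
+ *	ticks = ((u64)ns * mult) >> shift;
+ *
+ * mult is filled in later by calibrate_APIC_clock() via div_sc().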
+ */
+static struct clock_event_device lapic_clockevent = {
+	.name		= "lapic",
+	.features	= CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT
+			| CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY,
+	.shift		= 32,
+	.set_mode	= lapic_timer_setup,
+	.set_next_event	= lapic_next_event,
+	.broadcast	= lapic_timer_broadcast,
+	.rating		= 100,
+	.irq		= -1,
+};
+static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
+
+/*
+ * Setup the local APIC timer for this CPU. Copy the initialized values
+ * of the boot CPU and register the clock event in the framework.
+ */
+static void setup_APIC_timer(void)
+{
+	struct clock_event_device *levt = &__get_cpu_var(lapic_events);
+
+	if (this_cpu_has(X86_FEATURE_ARAT)) {
+		lapic_clockevent.features &= ~CLOCK_EVT_FEAT_C3STOP;
+		/* Make the LAPIC timer preferable over the percpu HPET */
+		lapic_clockevent.rating = 150;
+	}
+
+	memcpy(levt, &lapic_clockevent, sizeof(*levt));
+	levt->cpumask = cpumask_of(smp_processor_id());
+
+	if (this_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) {
+		levt->features &= ~(CLOCK_EVT_FEAT_PERIODIC |
+				    CLOCK_EVT_FEAT_DUMMY);
+		levt->set_next_event = lapic_next_deadline;
+		clockevents_config_and_register(levt,
+						(tsc_khz / TSC_DIVISOR) * 1000,
+						0xF, ~0UL);
+	} else
+		clockevents_register_device(levt);
+}
+
+/*
+ * In this function we calibrate the APIC bus clocks to the external timer.
+ *
+ * We want to do the calibration only once, since we want to have local timer
+ * irqs synchronized. CPUs connected by the same APIC bus have the very same
+ * bus frequency.
+ *
+ * This was previously done by reading the PIT/HPET and waiting for a
+ * wraparound to find out that a tick has elapsed. I have a box where the PIT
+ * readout is broken, so it never gets out of the wait loop again. This was
+ * also reported by others.
+ *
+ * Monitoring the jiffies value is inaccurate, and the clockevents
+ * infrastructure allows us to do a simple substitution of the interrupt
+ * handler instead.
+ *
+ * The calibration routine also uses the pm_timer when possible, as the PIT
+ * happens to run way too slow (factor 2.3 on my VAIO CoreDuo, which goes
+ * back to normal later in the boot process).
+ */
+
+#define LAPIC_CAL_LOOPS		(HZ/10)
+
+static __initdata int lapic_cal_loops = -1;
+static __initdata long lapic_cal_t1, lapic_cal_t2;
+static __initdata unsigned long long lapic_cal_tsc1, lapic_cal_tsc2;
+static __initdata unsigned long lapic_cal_pm1, lapic_cal_pm2;
+static __initdata unsigned long lapic_cal_j1, lapic_cal_j2;
+
+/*
+ * Temporary interrupt handler.
+ */
+static void __init lapic_cal_handler(struct clock_event_device *dev)
+{
+	unsigned long long tsc = 0;
+	long tapic = apic_read(APIC_TMCCT);
+	unsigned long pm = acpi_pm_read_early();
+
+	if (cpu_has_tsc)
+		rdtscll(tsc);
+
+	switch (lapic_cal_loops++) {
+	case 0:
+		lapic_cal_t1 = tapic;
+		lapic_cal_tsc1 = tsc;
+		lapic_cal_pm1 = pm;
+		lapic_cal_j1 = jiffies;
+		break;
+
+	case LAPIC_CAL_LOOPS:
+		lapic_cal_t2 = tapic;
+		lapic_cal_tsc2 = tsc;
+		if (pm < lapic_cal_pm1)
+			pm += ACPI_PM_OVRRUN;
+		lapic_cal_pm2 = pm;
+		lapic_cal_j2 = jiffies;
+		break;
+	}
+}
+
+static int __init
+calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc)
+{
+	const long pm_100ms = PMTMR_TICKS_PER_SEC / 10;
+	const long pm_thresh = pm_100ms / 100;
+	unsigned long mult;
+	u64 res;
+
+#ifndef CONFIG_X86_PM_TIMER
+	return -1;
+#endif
+
+	apic_printk(APIC_VERBOSE, "... 
PM-Timer delta = %ld\n", deltapm); + + /* Check, if the PM timer is available */ + if (!deltapm) + return -1; + + mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, 22); + + if (deltapm > (pm_100ms - pm_thresh) && + deltapm < (pm_100ms + pm_thresh)) { + apic_printk(APIC_VERBOSE, "... PM-Timer result ok\n"); + return 0; + } + + res = (((u64)deltapm) * mult) >> 22; + do_div(res, 1000000); + pr_warning("APIC calibration not consistent " + "with PM-Timer: %ldms instead of 100ms\n",(long)res); + + /* Correct the lapic counter value */ + res = (((u64)(*delta)) * pm_100ms); + do_div(res, deltapm); + pr_info("APIC delta adjusted to PM-Timer: " + "%lu (%ld)\n", (unsigned long)res, *delta); + *delta = (long)res; + + /* Correct the tsc counter value */ + if (cpu_has_tsc) { + res = (((u64)(*deltatsc)) * pm_100ms); + do_div(res, deltapm); + apic_printk(APIC_VERBOSE, "TSC delta adjusted to " + "PM-Timer: %lu (%ld)\n", + (unsigned long)res, *deltatsc); + *deltatsc = (long)res; + } + + return 0; +} + +static int __init calibrate_APIC_clock(void) +{ + struct clock_event_device *levt = &__get_cpu_var(lapic_events); + void (*real_handler)(struct clock_event_device *dev); + unsigned long deltaj; + long delta, deltatsc; + int pm_referenced = 0; + + /** + * check if lapic timer has already been calibrated by platform + * specific routine, such as tsc calibration code. if so, we just fill + * in the clockevent structure and return. + */ + + if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) { + return 0; + } else if (lapic_timer_frequency) { + apic_printk(APIC_VERBOSE, "lapic timer already calibrated %d\n", + lapic_timer_frequency); + lapic_clockevent.mult = div_sc(lapic_timer_frequency/APIC_DIVISOR, + TICK_NSEC, lapic_clockevent.shift); + lapic_clockevent.max_delta_ns = + clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); + lapic_clockevent.min_delta_ns = + clockevent_delta2ns(0xF, &lapic_clockevent); + lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; + return 0; + } + + apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n" + "calibrating APIC timer ...\n"); + + local_irq_disable(); + + /* Replace the global interrupt handler */ + real_handler = global_clock_event->event_handler; + global_clock_event->event_handler = lapic_cal_handler; + + /* + * Setup the APIC counter to maximum. There is no way the lapic + * can underflow in the 100ms detection time frame + */ + __setup_APIC_LVTT(0xffffffff, 0, 0); + + /* Let the interrupts run */ + local_irq_enable(); + + while (lapic_cal_loops <= LAPIC_CAL_LOOPS) + cpu_relax(); + + local_irq_disable(); + + /* Restore the real event handler */ + global_clock_event->event_handler = real_handler; + + /* Build delta t1-t2 as apic timer counts down */ + delta = lapic_cal_t1 - lapic_cal_t2; + apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta); + + deltatsc = (long)(lapic_cal_tsc2 - lapic_cal_tsc1); + + /* we trust the PM based calibration if possible */ + pm_referenced = !calibrate_by_pmtimer(lapic_cal_pm2 - lapic_cal_pm1, + &delta, &deltatsc); + + /* Calculate the scaled math multiplication factor */ + lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS, + lapic_clockevent.shift); + lapic_clockevent.max_delta_ns = + clockevent_delta2ns(0x7FFFFFFF, &lapic_clockevent); + lapic_clockevent.min_delta_ns = + clockevent_delta2ns(0xF, &lapic_clockevent); + + lapic_timer_frequency = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS; + + apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta); + apic_printk(APIC_VERBOSE, "..... 
mult: %u\n", lapic_clockevent.mult); + apic_printk(APIC_VERBOSE, "..... calibration result: %u\n", + lapic_timer_frequency); + + if (cpu_has_tsc) { + apic_printk(APIC_VERBOSE, "..... CPU clock speed is " + "%ld.%04ld MHz.\n", + (deltatsc / LAPIC_CAL_LOOPS) / (1000000 / HZ), + (deltatsc / LAPIC_CAL_LOOPS) % (1000000 / HZ)); + } + + apic_printk(APIC_VERBOSE, "..... host bus clock speed is " + "%u.%04u MHz.\n", + lapic_timer_frequency / (1000000 / HZ), + lapic_timer_frequency % (1000000 / HZ)); + + /* + * Do a sanity check on the APIC calibration result + */ + if (lapic_timer_frequency < (1000000 / HZ)) { + local_irq_enable(); + pr_warning("APIC frequency too slow, disabling apic timer\n"); + return -1; + } + + levt->features &= ~CLOCK_EVT_FEAT_DUMMY; + + /* + * PM timer calibration failed or not turned on + * so lets try APIC timer based calibration + */ + if (!pm_referenced) { + apic_printk(APIC_VERBOSE, "... verify APIC timer\n"); + + /* + * Setup the apic timer manually + */ + levt->event_handler = lapic_cal_handler; + lapic_timer_setup(CLOCK_EVT_MODE_PERIODIC, levt); + lapic_cal_loops = -1; + + /* Let the interrupts run */ + local_irq_enable(); + + while (lapic_cal_loops <= LAPIC_CAL_LOOPS) + cpu_relax(); + + /* Stop the lapic timer */ + lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, levt); + + /* Jiffies delta */ + deltaj = lapic_cal_j2 - lapic_cal_j1; + apic_printk(APIC_VERBOSE, "... jiffies delta = %lu\n", deltaj); + + /* Check, if the jiffies result is consistent */ + if (deltaj >= LAPIC_CAL_LOOPS-2 && deltaj <= LAPIC_CAL_LOOPS+2) + apic_printk(APIC_VERBOSE, "... jiffies result ok\n"); + else + levt->features |= CLOCK_EVT_FEAT_DUMMY; + } else + local_irq_enable(); + + if (levt->features & CLOCK_EVT_FEAT_DUMMY) { + pr_warning("APIC timer disabled due to verification failure\n"); + return -1; + } + + return 0; +} + +/* + * Setup the boot APIC + * + * Calibrate and verify the result. + */ +void __init setup_boot_APIC_clock(void) +{ + /* + * The local apic timer can be disabled via the kernel + * commandline or from the CPU detection code. Register the lapic + * timer as a dummy clock event source on SMP systems, so the + * broadcast mechanism is used. On UP systems simply ignore it. + */ + if (disable_apic_timer) { + pr_info("Disabling APIC timer\n"); + /* No broadcast on UP ! */ + if (num_possible_cpus() > 1) { + lapic_clockevent.mult = 1; + setup_APIC_timer(); + } + return; + } + + if (calibrate_APIC_clock()) { + /* No broadcast on UP ! */ + if (num_possible_cpus() > 1) + setup_APIC_timer(); + return; + } + + /* + * If nmi_watchdog is set to IO_APIC, we need the + * PIT/HPET going. Otherwise register lapic as a dummy + * device. + */ + lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; + + /* Setup the lapic or request the broadcast */ + setup_APIC_timer(); +} + +void setup_secondary_APIC_clock(void) +{ + setup_APIC_timer(); +} + +/* + * The guts of the apic timer interrupt + */ +static void local_apic_timer_interrupt(void) +{ + int cpu = smp_processor_id(); + struct clock_event_device *evt = &per_cpu(lapic_events, cpu); + + /* + * Normally we should not be here till LAPIC has been initialized but + * in some cases like kdump, its possible that there is a pending LAPIC + * timer interrupt from previous kernel's context and is delivered in + * new kernel the moment interrupts are enabled. + * + * Interrupts are enabled early and LAPIC is setup much later, hence + * its possible that when we get here evt->event_handler is NULL. 
+ * Check for event_handler being NULL and discard the interrupt as + * spurious. + */ + if (!evt->event_handler) { + pr_warning("Spurious LAPIC timer interrupt on cpu %d\n", cpu); + /* Switch it off */ + lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt); + return; + } + + /* + * the NMI deadlock-detector uses this. + */ + inc_irq_stat(apic_timer_irqs); + + evt->event_handler(evt); +} + +/* + * Local APIC timer interrupt. This is the most natural way for doing + * local interrupts, but local timer interrupts can be emulated by + * broadcast interrupts too. [in case the hw doesn't support APIC timers] + * + * [ if a single-CPU system runs an SMP kernel then we call the local + * interrupt as well. Thus we cannot inline the local irq ... ] + */ +__visible void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + /* + * NOTE! We'd better ACK the irq immediately, + * because timer handling can be slow. + * + * update_process_times() expects us to have done irq_enter(). + * Besides, if we don't timer interrupts ignore the global + * interrupt lock, which is the WrongThing (tm) to do. + */ + entering_ack_irq(); + local_apic_timer_interrupt(); + exiting_irq(); + + set_irq_regs(old_regs); +} + +__visible void __irq_entry smp_trace_apic_timer_interrupt(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + /* + * NOTE! We'd better ACK the irq immediately, + * because timer handling can be slow. + * + * update_process_times() expects us to have done irq_enter(). + * Besides, if we don't timer interrupts ignore the global + * interrupt lock, which is the WrongThing (tm) to do. + */ + entering_ack_irq(); + trace_local_timer_entry(LOCAL_TIMER_VECTOR); + local_apic_timer_interrupt(); + trace_local_timer_exit(LOCAL_TIMER_VECTOR); + exiting_irq(); + + set_irq_regs(old_regs); +} + +int setup_profiling_timer(unsigned int multiplier) +{ + return -EINVAL; +} + +/* + * Local APIC start and shutdown + */ + +/** + * clear_local_APIC - shutdown the local APIC + * + * This is called, when a CPU is disabled and before rebooting, so the state of + * the local APIC has no dangling leftovers. Also used to cleanout any BIOS + * leftovers during boot. + */ +void clear_local_APIC(void) +{ + int maxlvt; + u32 v; + + /* APIC hasn't been mapped yet */ + if (!x2apic_mode && !apic_phys) + return; + + maxlvt = lapic_get_maxlvt(); + /* + * Masking an LVT entry can trigger a local APIC error + * if the vector is zero. Mask LVTERR first to prevent this. + */ + if (maxlvt >= 3) { + v = ERROR_APIC_VECTOR; /* any non-zero vector will do */ + apic_write(APIC_LVTERR, v | APIC_LVT_MASKED); + } + /* + * Careful: we have to set masks only first to deassert + * any level-triggered sources. 
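+	 *
+	 * So each LVT entry is handled in two phases, sketched as:
+	 *
+	 *	v = apic_read(reg);
+	 *	apic_write(reg, v | APIC_LVT_MASKED);	1) mask in place
+	 *	...
+	 *	apic_write(reg, APIC_LVT_MASKED);	2) clean state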
+ */ + v = apic_read(APIC_LVTT); + apic_write(APIC_LVTT, v | APIC_LVT_MASKED); + v = apic_read(APIC_LVT0); + apic_write(APIC_LVT0, v | APIC_LVT_MASKED); + v = apic_read(APIC_LVT1); + apic_write(APIC_LVT1, v | APIC_LVT_MASKED); + if (maxlvt >= 4) { + v = apic_read(APIC_LVTPC); + apic_write(APIC_LVTPC, v | APIC_LVT_MASKED); + } + + /* lets not touch this if we didn't frob it */ +#ifdef CONFIG_X86_THERMAL_VECTOR + if (maxlvt >= 5) { + v = apic_read(APIC_LVTTHMR); + apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED); + } +#endif +#ifdef CONFIG_X86_MCE_INTEL + if (maxlvt >= 6) { + v = apic_read(APIC_LVTCMCI); + if (!(v & APIC_LVT_MASKED)) + apic_write(APIC_LVTCMCI, v | APIC_LVT_MASKED); + } +#endif + + /* + * Clean APIC state for other OSs: + */ + apic_write(APIC_LVTT, APIC_LVT_MASKED); + apic_write(APIC_LVT0, APIC_LVT_MASKED); + apic_write(APIC_LVT1, APIC_LVT_MASKED); + if (maxlvt >= 3) + apic_write(APIC_LVTERR, APIC_LVT_MASKED); + if (maxlvt >= 4) + apic_write(APIC_LVTPC, APIC_LVT_MASKED); + + /* Integrated APIC (!82489DX) ? */ + if (lapic_is_integrated()) { + if (maxlvt > 3) + /* Clear ESR due to Pentium errata 3AP and 11AP */ + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); + } +} + +/** + * disable_local_APIC - clear and disable the local APIC + */ +void disable_local_APIC(void) +{ + unsigned int value; + + /* APIC hasn't been mapped yet */ + if (!x2apic_mode && !apic_phys) + return; + + clear_local_APIC(); + + /* + * Disable APIC (implies clearing of registers + * for 82489DX!). + */ + value = apic_read(APIC_SPIV); + value &= ~APIC_SPIV_APIC_ENABLED; + apic_write(APIC_SPIV, value); + +#ifdef CONFIG_X86_32 + /* + * When LAPIC was disabled by the BIOS and enabled by the kernel, + * restore the disabled state. + */ + if (enabled_via_apicbase) { + unsigned int l, h; + + rdmsr(MSR_IA32_APICBASE, l, h); + l &= ~MSR_IA32_APICBASE_ENABLE; + wrmsr(MSR_IA32_APICBASE, l, h); + } +#endif +} + +/* + * If Linux enabled the LAPIC against the BIOS default disable it down before + * re-entering the BIOS on shutdown. Otherwise the BIOS may get confused and + * not power-off. Additionally clear all LVT entries before disable_local_APIC + * for the case where Linux didn't enable the LAPIC. + */ +void lapic_shutdown(void) +{ + unsigned long flags; + + if (!cpu_has_apic && !apic_from_smp_config()) + return; + + local_irq_save(flags); + +#ifdef CONFIG_X86_32 + if (!enabled_via_apicbase) + clear_local_APIC(); + else +#endif + disable_local_APIC(); + + + local_irq_restore(flags); +} + +/* + * This is to verify that we're looking at a real local APIC. + * Check these against your board if the CPUs aren't getting + * started for no apparent reason. + */ +int __init verify_local_APIC(void) +{ + unsigned int reg0, reg1; + + /* + * The version register is read-only in a real APIC. + */ + reg0 = apic_read(APIC_LVR); + apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0); + apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK); + reg1 = apic_read(APIC_LVR); + apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1); + + /* + * The two version reads above should print the same + * numbers. If the second one is different, then we + * poke at a non-APIC. + */ + if (reg1 != reg0) + return 0; + + /* + * Check if the version looks reasonably. + */ + reg1 = GET_APIC_VERSION(reg0); + if (reg1 == 0x00 || reg1 == 0xff) + return 0; + reg1 = lapic_get_maxlvt(); + if (reg1 < 0x02 || reg1 == 0xff) + return 0; + + /* + * The ID register is read/write in a real APIC. 
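+	 * The probe below XOR-flips the writable ID bits and checks that
+	 * the flip sticks, i.e. in essence:
+	 *
+	 *	apic_write(APIC_ID, reg0 ^ apic->apic_id_mask);
+	 *	ok = (apic_read(APIC_ID) == (reg0 ^ apic->apic_id_mask));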
+ */ + reg0 = apic_read(APIC_ID); + apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0); + apic_write(APIC_ID, reg0 ^ apic->apic_id_mask); + reg1 = apic_read(APIC_ID); + apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1); + apic_write(APIC_ID, reg0); + if (reg1 != (reg0 ^ apic->apic_id_mask)) + return 0; + + /* + * The next two are just to see if we have sane values. + * They're only really relevant if we're in Virtual Wire + * compatibility mode, but most boxes are anymore. + */ + reg0 = apic_read(APIC_LVT0); + apic_printk(APIC_DEBUG, "Getting LVT0: %x\n", reg0); + reg1 = apic_read(APIC_LVT1); + apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1); + + return 1; +} + +/** + * sync_Arb_IDs - synchronize APIC bus arbitration IDs + */ +void __init sync_Arb_IDs(void) +{ + /* + * Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 And not + * needed on AMD. + */ + if (modern_apic() || boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + return; + + /* + * Wait for idle. + */ + apic_wait_icr_idle(); + + apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); + apic_write(APIC_ICR, APIC_DEST_ALLINC | + APIC_INT_LEVELTRIG | APIC_DM_INIT); +} + +/* + * An initial setup of the virtual wire mode. + */ +void __init init_bsp_APIC(void) +{ + unsigned int value; + + /* + * Don't do the setup now if we have a SMP BIOS as the + * through-I/O-APIC virtual wire mode might be active. + */ + if (smp_found_config || !cpu_has_apic) + return; + + /* + * Do not trust the local APIC being empty at bootup. + */ + clear_local_APIC(); + + /* + * Enable APIC. + */ + value = apic_read(APIC_SPIV); + value &= ~APIC_VECTOR_MASK; + value |= APIC_SPIV_APIC_ENABLED; + +#ifdef CONFIG_X86_32 + /* This bit is reserved on P4/Xeon and should be cleared */ + if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && + (boot_cpu_data.x86 == 15)) + value &= ~APIC_SPIV_FOCUS_DISABLED; + else +#endif + value |= APIC_SPIV_FOCUS_DISABLED; + value |= SPURIOUS_APIC_VECTOR; + apic_write(APIC_SPIV, value); + + /* + * Set up the virtual wire mode. + */ + apic_write(APIC_LVT0, APIC_DM_EXTINT); + value = APIC_DM_NMI; + if (!lapic_is_integrated()) /* 82489DX */ + value |= APIC_LVT_LEVEL_TRIGGER; + apic_write(APIC_LVT1, value); +} + +static void lapic_setup_esr(void) +{ + unsigned int oldvalue, value, maxlvt; + + if (!lapic_is_integrated()) { + pr_info("No ESR for 82489DX.\n"); + return; + } + + if (apic->disable_esr) { + /* + * Something untraceable is creating bad interrupts on + * secondary quads ... for the moment, just leave the + * ESR disabled - we can't do anything useful with the + * errors anyway - mbligh + */ + pr_info("Leaving ESR disabled.\n"); + return; + } + + maxlvt = lapic_get_maxlvt(); + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ + apic_write(APIC_ESR, 0); + oldvalue = apic_read(APIC_ESR); + + /* enables sending errors */ + value = ERROR_APIC_VECTOR; + apic_write(APIC_LVTERR, value); + + /* + * spec says clear errors after enabling vector. + */ + if (maxlvt > 3) + apic_write(APIC_ESR, 0); + value = apic_read(APIC_ESR); + if (value != oldvalue) + apic_printk(APIC_VERBOSE, "ESR value before enabling " + "vector: 0x%08x after: 0x%08x\n", + oldvalue, value); +} + +/** + * setup_local_APIC - setup the local APIC + * + * Used to setup local APIC while initializing BSP or bringin up APs. + * Always called with preemption disabled. 
+ */ +void setup_local_APIC(void) +{ + int cpu = smp_processor_id(); + unsigned int value, queued; + int i, j, acked = 0; + unsigned long long tsc = 0, ntsc; + long long max_loops = cpu_khz; + + if (cpu_has_tsc) + rdtscll(tsc); + + if (disable_apic) { + disable_ioapic_support(); + return; + } + +#ifdef CONFIG_X86_32 + /* Pound the ESR really hard over the head with a big hammer - mbligh */ + if (lapic_is_integrated() && apic->disable_esr) { + apic_write(APIC_ESR, 0); + apic_write(APIC_ESR, 0); + apic_write(APIC_ESR, 0); + apic_write(APIC_ESR, 0); + } +#endif + perf_events_lapic_init(); + + /* + * Double-check whether this APIC is really registered. + * This is meaningless in clustered apic mode, so we skip it. + */ + BUG_ON(!apic->apic_id_registered()); + + /* + * Intel recommends to set DFR, LDR and TPR before enabling + * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel + * document number 292116). So here it goes... + */ + apic->init_apic_ldr(); + +#ifdef CONFIG_X86_32 + /* + * APIC LDR is initialized. If logical_apicid mapping was + * initialized during get_smp_config(), make sure it matches the + * actual value. + */ + i = early_per_cpu(x86_cpu_to_logical_apicid, cpu); + WARN_ON(i != BAD_APICID && i != logical_smp_processor_id()); + /* always use the value from LDR */ + early_per_cpu(x86_cpu_to_logical_apicid, cpu) = + logical_smp_processor_id(); + + /* + * Some NUMA implementations (NUMAQ) don't initialize apicid to + * node mapping during NUMA init. Now that logical apicid is + * guaranteed to be known, give it another chance. This is already + * a bit too late - percpu allocation has already happened without + * proper NUMA affinity. + */ + if (apic->x86_32_numa_cpu_node) + set_apicid_to_node(early_per_cpu(x86_cpu_to_apicid, cpu), + apic->x86_32_numa_cpu_node(cpu)); +#endif + + /* + * Set Task Priority to 'accept all'. We never change this + * later on. + */ + value = apic_read(APIC_TASKPRI); + value &= ~APIC_TPRI_MASK; + apic_write(APIC_TASKPRI, value); + + /* + * After a crash, we no longer service the interrupts and a pending + * interrupt from previous kernel might still have ISR bit set. + * + * Most probably by now CPU has serviced that pending interrupt and + * it might not have done the ack_APIC_irq() because it thought, + * interrupt came from i8259 as ExtInt. LAPIC did not get EOI so it + * does not clear the ISR bit and cpu thinks it has already serivced + * the interrupt. Hence a vector might get locked. It was noticed + * for timer irq (vector 0x31). Issue an extra EOI to clear ISR. + */ + do { + queued = 0; + for (i = APIC_ISR_NR - 1; i >= 0; i--) + queued |= apic_read(APIC_IRR + i*0x10); + + for (i = APIC_ISR_NR - 1; i >= 0; i--) { + value = apic_read(APIC_ISR + i*0x10); + for (j = 31; j >= 0; j--) { + if (value & (1<<j)) { + ack_APIC_irq(); + acked++; + } + } + } + if (acked > 256) { + printk(KERN_ERR "LAPIC pending interrupts after %d EOI\n", + acked); + break; + } + if (queued) { + if (cpu_has_tsc) { + rdtscll(ntsc); + max_loops = (cpu_khz << 10) - (ntsc - tsc); + } else + max_loops--; + } + } while (queued && max_loops > 0); + WARN_ON(max_loops <= 0); + + /* + * Now that we are all set up, enable the APIC + */ + value = apic_read(APIC_SPIV); + value &= ~APIC_VECTOR_MASK; + /* + * Enable APIC + */ + value |= APIC_SPIV_APIC_ENABLED; + +#ifdef CONFIG_X86_32 + /* + * Some unknown Intel IO/APIC (or APIC) errata is biting us with + * certain networking cards. 
If high frequency interrupts are + * happening on a particular IOAPIC pin, plus the IOAPIC routing + * entry is masked/unmasked at a high rate as well then sooner or + * later IOAPIC line gets 'stuck', no more interrupts are received + * from the device. If focus CPU is disabled then the hang goes + * away, oh well :-( + * + * [ This bug can be reproduced easily with a level-triggered + * PCI Ne2000 networking cards and PII/PIII processors, dual + * BX chipset. ] + */ + /* + * Actually disabling the focus CPU check just makes the hang less + * frequent as it makes the interrupt distributon model be more + * like LRU than MRU (the short-term load is more even across CPUs). + * See also the comment in end_level_ioapic_irq(). --macro + */ + + /* + * - enable focus processor (bit==0) + * - 64bit mode always use processor focus + * so no need to set it + */ + value &= ~APIC_SPIV_FOCUS_DISABLED; +#endif + + /* + * Set spurious IRQ vector + */ + value |= SPURIOUS_APIC_VECTOR; + apic_write(APIC_SPIV, value); + + /* + * Set up LVT0, LVT1: + * + * set up through-local-APIC on the BP's LINT0. This is not + * strictly necessary in pure symmetric-IO mode, but sometimes + * we delegate interrupts to the 8259A. + */ + /* + * TODO: set up through-local-APIC from through-I/O-APIC? --macro + */ + value = apic_read(APIC_LVT0) & APIC_LVT_MASKED; + if (!cpu && (pic_mode || !value)) { + value = APIC_DM_EXTINT; + apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", cpu); + } else { + value = APIC_DM_EXTINT | APIC_LVT_MASKED; + apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", cpu); + } + apic_write(APIC_LVT0, value); + + /* + * only the BP should see the LINT1 NMI signal, obviously. + */ + if (!cpu) + value = APIC_DM_NMI; + else + value = APIC_DM_NMI | APIC_LVT_MASKED; + if (!lapic_is_integrated()) /* 82489DX */ + value |= APIC_LVT_LEVEL_TRIGGER; + apic_write(APIC_LVT1, value); + +#ifdef CONFIG_X86_MCE_INTEL + /* Recheck CMCI information after local APIC is up on CPU #0 */ + if (!cpu) + cmci_recheck(); +#endif +} + +void end_local_APIC_setup(void) +{ + lapic_setup_esr(); + +#ifdef CONFIG_X86_32 + { + unsigned int value; + /* Disable the local apic timer */ + value = apic_read(APIC_LVTT); + value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); + apic_write(APIC_LVTT, value); + } +#endif + + apic_pm_activate(); +} + +void __init bsp_end_local_APIC_setup(void) +{ + end_local_APIC_setup(); + + /* + * Now that local APIC setup is completed for BP, configure the fault + * handling for interrupt remapping. 
+ */ + irq_remap_enable_fault_handling(); + +} + +#ifdef CONFIG_X86_X2APIC +/* + * Need to disable xapic and x2apic at the same time and then enable xapic mode + */ +static inline void __disable_x2apic(u64 msr) +{ + wrmsrl(MSR_IA32_APICBASE, + msr & ~(X2APIC_ENABLE | XAPIC_ENABLE)); + wrmsrl(MSR_IA32_APICBASE, msr & ~X2APIC_ENABLE); +} + +static __init void disable_x2apic(void) +{ + u64 msr; + + if (!cpu_has_x2apic) + return; + + rdmsrl(MSR_IA32_APICBASE, msr); + if (msr & X2APIC_ENABLE) { + u32 x2apic_id = read_apic_id(); + + if (x2apic_id >= 255) + panic("Cannot disable x2apic, id: %08x\n", x2apic_id); + + pr_info("Disabling x2apic\n"); + __disable_x2apic(msr); + + if (nox2apic) { + clear_cpu_cap(&cpu_data(0), X86_FEATURE_X2APIC); + setup_clear_cpu_cap(X86_FEATURE_X2APIC); + } + + x2apic_disabled = 1; + x2apic_mode = 0; + + register_lapic_address(mp_lapic_addr); + } +} + +void check_x2apic(void) +{ + if (x2apic_enabled()) { + pr_info("x2apic enabled by BIOS, switching to x2apic ops\n"); + x2apic_preenabled = x2apic_mode = 1; + } +} + +void enable_x2apic(void) +{ + u64 msr; + + rdmsrl(MSR_IA32_APICBASE, msr); + if (x2apic_disabled) { + __disable_x2apic(msr); + return; + } + + if (!x2apic_mode) + return; + + if (!(msr & X2APIC_ENABLE)) { + printk_once(KERN_INFO "Enabling x2apic\n"); + wrmsrl(MSR_IA32_APICBASE, msr | X2APIC_ENABLE); + } +} +#endif /* CONFIG_X86_X2APIC */ + +int __init enable_IR(void) +{ +#ifdef CONFIG_IRQ_REMAP + if (!irq_remapping_supported()) { + pr_debug("intr-remapping not supported\n"); + return -1; + } + + if (!x2apic_preenabled && skip_ioapic_setup) { + pr_info("Skipped enabling intr-remap because of skipping " + "io-apic setup\n"); + return -1; + } + + return irq_remapping_enable(); +#endif + return -1; +} + +void __init enable_IR_x2apic(void) +{ + unsigned long flags; + int ret, x2apic_enabled = 0; + int hardware_init_ret; + + /* Make sure irq_remap_ops are initialized */ + setup_irq_remapping_ops(); + + hardware_init_ret = irq_remapping_prepare(); + if (hardware_init_ret && !x2apic_supported()) + return; + + ret = save_ioapic_entries(); + if (ret) { + pr_info("Saving IO-APIC state failed: %d\n", ret); + return; + } + + local_irq_save(flags); + legacy_pic->mask_all(); + mask_ioapic_entries(); + + if (x2apic_preenabled && nox2apic) + disable_x2apic(); + + if (hardware_init_ret) + ret = -1; + else + ret = enable_IR(); + + if (!x2apic_supported()) + goto skip_x2apic; + + if (ret < 0) { + /* IR is required if there is APIC ID > 255 even when running + * under KVM + */ + if (max_physical_apicid > 255 || + !hypervisor_x2apic_available()) { + if (x2apic_preenabled) + disable_x2apic(); + goto skip_x2apic; + } + /* + * without IR all CPUs can be addressed by IOAPIC/MSI + * only in physical mode + */ + x2apic_force_phys(); + } + + if (ret == IRQ_REMAP_XAPIC_MODE) { + pr_info("x2apic not enabled, IRQ remapping is in xapic mode\n"); + goto skip_x2apic; + } + + x2apic_enabled = 1; + + if (x2apic_supported() && !x2apic_mode) { + x2apic_mode = 1; + enable_x2apic(); + pr_info("Enabled x2apic\n"); + } + +skip_x2apic: + if (ret < 0) /* IR enabling failed */ + restore_ioapic_entries(); + legacy_pic->restore_mask(); + local_irq_restore(flags); +} + +#ifdef CONFIG_X86_64 +/* + * Detect and enable local APICs on non-SMP boards. + * Original code written by Keir Fraser. + * On AMD64 we trust the BIOS - if it says no APIC it is likely + * not correctly set up (usually the APIC timer won't work etc.) 
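+ *
+ * For reference, a minimal sketch of probing the APIC base MSR, as
+ * the 32-bit apic_verify()/apic_force_enable() paths below do
+ * (illustrative only; macros as used elsewhere in this file):
+ *
+ * u32 l, h;
+ * rdmsr(MSR_IA32_APICBASE, l, h);
+ * if (l & MSR_IA32_APICBASE_ENABLE) - bit 11: globally enabled
+ * addr = l & MSR_IA32_APICBASE_BASE; - 0xfee00000 by default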
+ */ +static int __init detect_init_APIC(void) +{ + if (!cpu_has_apic) { + pr_info("No local APIC present\n"); + return -1; + } + + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; + return 0; +} +#else + +static int __init apic_verify(void) +{ + u32 features, h, l; + + /* + * The APIC feature bit should now be enabled + * in `cpuid' + */ + features = cpuid_edx(1); + if (!(features & (1 << X86_FEATURE_APIC))) { + pr_warning("Could not enable APIC!\n"); + return -1; + } + set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; + + /* The BIOS may have set up the APIC at some other address */ + if (boot_cpu_data.x86 >= 6) { + rdmsr(MSR_IA32_APICBASE, l, h); + if (l & MSR_IA32_APICBASE_ENABLE) + mp_lapic_addr = l & MSR_IA32_APICBASE_BASE; + } + + pr_info("Found and enabled local APIC!\n"); + return 0; +} + +int __init apic_force_enable(unsigned long addr) +{ + u32 h, l; + + if (disable_apic) + return -1; + + /* + * Some BIOSes disable the local APIC in the APIC_BASE + * MSR. This can only be done in software for Intel P6 or later + * and AMD K7 (Model > 1) or later. + */ + if (boot_cpu_data.x86 >= 6) { + rdmsr(MSR_IA32_APICBASE, l, h); + if (!(l & MSR_IA32_APICBASE_ENABLE)) { + pr_info("Local APIC disabled by BIOS -- reenabling.\n"); + l &= ~MSR_IA32_APICBASE_BASE; + l |= MSR_IA32_APICBASE_ENABLE | addr; + wrmsr(MSR_IA32_APICBASE, l, h); + enabled_via_apicbase = 1; + } + } + return apic_verify(); +} + +/* + * Detect and initialize APIC + */ +static int __init detect_init_APIC(void) +{ + /* Disabled by kernel option? */ + if (disable_apic) + return -1; + + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model > 1) || + (boot_cpu_data.x86 >= 15)) + break; + goto no_apic; + case X86_VENDOR_INTEL: + if (boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15 || + (boot_cpu_data.x86 == 5 && cpu_has_apic)) + break; + goto no_apic; + default: + goto no_apic; + } + + if (!cpu_has_apic) { + /* + * Over-ride BIOS and try to enable the local APIC only if + * "lapic" specified. + */ + if (!force_enable_local_apic) { + pr_info("Local APIC disabled by BIOS -- " + "you can enable it with \"lapic\"\n"); + return -1; + } + if (apic_force_enable(APIC_DEFAULT_PHYS_BASE)) + return -1; + } else { + if (apic_verify()) + return -1; + } + + apic_pm_activate(); + + return 0; + +no_apic: + pr_info("No local APIC present or hardware disabled\n"); + return -1; +} +#endif + +/** + * init_apic_mappings - initialize APIC mappings + */ +void __init init_apic_mappings(void) +{ + unsigned int new_apicid; + + if (x2apic_mode) { + boot_cpu_physical_apicid = read_apic_id(); + return; + } + + /* If no local APIC can be found return early */ + if (!smp_found_config && detect_init_APIC()) { + /* lets NOP'ify apic operations */ + pr_info("APIC: disable apic facility\n"); + apic_disable(); + } else { + apic_phys = mp_lapic_addr; + + /* + * acpi lapic path already maps that address in + * acpi_register_lapic_address() + */ + if (!acpi_lapic && !smp_found_config) + register_lapic_address(apic_phys); + } + + /* + * Fetch the APIC ID of the BSP in case we have a + * default configuration (or the MP table is broken). 
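+ *
+ * read_apic_id() goes through the active driver, roughly (a sketch;
+ * compare read_xapic_id() in apic_flat_64.c further down):
+ *
+ * return apic->get_apic_id(apic_read(APIC_ID));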
+ */ + new_apicid = read_apic_id(); + if (boot_cpu_physical_apicid != new_apicid) { + boot_cpu_physical_apicid = new_apicid; + /* + * yeah -- we lie about apic_version + * in case if apic was disabled via boot option + * but it's not a problem for SMP compiled kernel + * since smp_sanity_check is prepared for such a case + * and disable smp mode + */ + apic_version[new_apicid] = + GET_APIC_VERSION(apic_read(APIC_LVR)); + } +} + +void __init register_lapic_address(unsigned long address) +{ + mp_lapic_addr = address; + + if (!x2apic_mode) { + set_fixmap_nocache(FIX_APIC_BASE, address); + apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", + APIC_BASE, mp_lapic_addr); + } + if (boot_cpu_physical_apicid == -1U) { + boot_cpu_physical_apicid = read_apic_id(); + apic_version[boot_cpu_physical_apicid] = + GET_APIC_VERSION(apic_read(APIC_LVR)); + } +} + +/* + * This initializes the IO-APIC and APIC hardware if this is + * a UP kernel. + */ +int apic_version[MAX_LOCAL_APIC]; + +int __init APIC_init_uniprocessor(void) +{ + if (disable_apic) { + pr_info("Apic disabled\n"); + return -1; + } +#ifdef CONFIG_X86_64 + if (!cpu_has_apic) { + disable_apic = 1; + pr_info("Apic disabled by BIOS\n"); + return -1; + } +#else + if (!smp_found_config && !cpu_has_apic) + return -1; + + /* + * Complain if the BIOS pretends there is one. + */ + if (!cpu_has_apic && + APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { + pr_err("BIOS bug, local APIC 0x%x not detected!...\n", + boot_cpu_physical_apicid); + return -1; + } +#endif + + default_setup_apic_routing(); + + verify_local_APIC(); + connect_bsp_APIC(); + +#ifdef CONFIG_X86_64 + apic_write(APIC_ID, SET_APIC_ID(boot_cpu_physical_apicid)); +#else + /* + * Hack: In case of kdump, after a crash, kernel might be booting + * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid + * might be zero if read from MP tables. Get it from LAPIC. + */ +# ifdef CONFIG_CRASH_DUMP + boot_cpu_physical_apicid = read_apic_id(); +# endif +#endif + physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); + setup_local_APIC(); + +#ifdef CONFIG_X86_IO_APIC + /* + * Now enable IO-APICs, actually call clear_IO_APIC + * We need clear_IO_APIC before enabling error vector + */ + if (!skip_ioapic_setup && nr_ioapics) + enable_IO_APIC(); +#endif + + bsp_end_local_APIC_setup(); + +#ifdef CONFIG_X86_IO_APIC + if (smp_found_config && !skip_ioapic_setup && nr_ioapics) + setup_IO_APIC(); + else { + nr_ioapics = 0; + } +#endif + + x86_init.timers.setup_percpu_clockev(); + return 0; +} + +/* + * Local APIC interrupts + */ + +/* + * This interrupt should _never_ happen with our APIC/SMP architecture + */ +static inline void __smp_spurious_interrupt(void) +{ + u32 v; + + /* + * Check if this really is a spurious interrupt and ACK it + * if it is a vectored one. Just in case... + * Spurious interrupts should not be ACKed. 
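+ *
+ * Worked example of the ISR indexing below, assuming the usual
+ * SPURIOUS_APIC_VECTOR of 0xff: the ISR is a bank of 32-bit
+ * registers spaced 0x10 apart, so vector v lives in register
+ * APIC_ISR + ((v & ~0x1f) >> 1), i.e. (0xff & ~0x1f) >> 1 = 0x70,
+ * at bit (v & 0x1f) = 31.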
+ */ + v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1)); + if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) + ack_APIC_irq(); + + inc_irq_stat(irq_spurious_count); + + /* see sw-dev-man vol 3, chapter 7.4.13.5 */ + pr_info("spurious APIC interrupt on CPU#%d, " + "should never happen.\n", smp_processor_id()); +} + +__visible void smp_spurious_interrupt(struct pt_regs *regs) +{ + entering_irq(); + __smp_spurious_interrupt(); + exiting_irq(); +} + +__visible void smp_trace_spurious_interrupt(struct pt_regs *regs) +{ + entering_irq(); + trace_spurious_apic_entry(SPURIOUS_APIC_VECTOR); + __smp_spurious_interrupt(); + trace_spurious_apic_exit(SPURIOUS_APIC_VECTOR); + exiting_irq(); +} + +/* + * This interrupt should never happen with our APIC/SMP architecture + */ +static inline void __smp_error_interrupt(struct pt_regs *regs) +{ + u32 v; + u32 i = 0; + static const char * const error_interrupt_reason[] = { + "Send CS error", /* APIC Error Bit 0 */ + "Receive CS error", /* APIC Error Bit 1 */ + "Send accept error", /* APIC Error Bit 2 */ + "Receive accept error", /* APIC Error Bit 3 */ + "Redirectable IPI", /* APIC Error Bit 4 */ + "Send illegal vector", /* APIC Error Bit 5 */ + "Received illegal vector", /* APIC Error Bit 6 */ + "Illegal register address", /* APIC Error Bit 7 */ + }; + + /* First tickle the hardware, only then report what went on. -- REW */ + if (lapic_get_maxlvt() > 3) /* Due to the Pentium erratum 3AP. */ + apic_write(APIC_ESR, 0); + v = apic_read(APIC_ESR); + ack_APIC_irq(); + atomic_inc(&irq_err_count); + + apic_printk(APIC_DEBUG, KERN_DEBUG "APIC error on CPU%d: %02x", + smp_processor_id(), v); + + v &= 0xff; + while (v) { + if (v & 0x1) + apic_printk(APIC_DEBUG, KERN_CONT " : %s", error_interrupt_reason[i]); + i++; + v >>= 1; + } + + apic_printk(APIC_DEBUG, KERN_CONT "\n"); + +} + +__visible void smp_error_interrupt(struct pt_regs *regs) +{ + entering_irq(); + __smp_error_interrupt(regs); + exiting_irq(); +} + +__visible void smp_trace_error_interrupt(struct pt_regs *regs) +{ + entering_irq(); + trace_error_apic_entry(ERROR_APIC_VECTOR); + __smp_error_interrupt(regs); + trace_error_apic_exit(ERROR_APIC_VECTOR); + exiting_irq(); +} + +/** + * connect_bsp_APIC - attach the APIC to the interrupt system + */ +void __init connect_bsp_APIC(void) +{ +#ifdef CONFIG_X86_32 + if (pic_mode) { + /* + * Do not trust the local APIC being empty at bootup. + */ + clear_local_APIC(); + /* + * PIC mode, enable APIC mode in the IMCR, i.e. connect BSP's + * local APIC to INT and NMI lines. + */ + apic_printk(APIC_VERBOSE, "leaving PIC mode, " + "enabling APIC mode.\n"); + imcr_pic_to_apic(); + } +#endif + if (apic->enable_apic_mode) + apic->enable_apic_mode(); +} + +/** + * disconnect_bsp_APIC - detach the APIC from the interrupt system + * @virt_wire_setup: indicates, whether virtual wire mode is selected + * + * Virtual wire mode is necessary to deliver legacy interrupts even when the + * APIC is disabled. + */ +void disconnect_bsp_APIC(int virt_wire_setup) +{ + unsigned int value; + +#ifdef CONFIG_X86_32 + if (pic_mode) { + /* + * Put the board back into PIC mode (has an effect only on + * certain older boards). Note that APIC interrupts, including + * IPIs, won't work beyond this point! The only exception are + * INIT IPIs. 
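+ *
+ * The IMCR sits behind I/O ports 0x22/0x23, so imcr_apic_to_pic()
+ * below boils down to (a sketch per the MP spec, not new code):
+ *
+ * outb(0x70, 0x22); - select the IMCR
+ * outb(0x00, 0x23); - 0x00 = route INT/NMI via the PIC, 0x01 = APIC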
+ */ + apic_printk(APIC_VERBOSE, "disabling APIC mode, " + "entering PIC mode.\n"); + imcr_apic_to_pic(); + return; + } +#endif + + /* Go back to Virtual Wire compatibility mode */ + + /* For the spurious interrupt use vector F, and enable it */ + value = apic_read(APIC_SPIV); + value &= ~APIC_VECTOR_MASK; + value |= APIC_SPIV_APIC_ENABLED; + value |= 0xf; + apic_write(APIC_SPIV, value); + + if (!virt_wire_setup) { + /* + * For LVT0 make it edge triggered, active high, + * external and enabled + */ + value = apic_read(APIC_LVT0); + value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); + apic_write(APIC_LVT0, value); + } else { + /* Disable LVT0 */ + apic_write(APIC_LVT0, APIC_LVT_MASKED); + } + + /* + * For LVT1 make it edge triggered, active high, + * nmi and enabled + */ + value = apic_read(APIC_LVT1); + value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); + apic_write(APIC_LVT1, value); +} + +int generic_processor_info(int apicid, int version) +{ + int cpu, max = nr_cpu_ids; + bool boot_cpu_detected = physid_isset(boot_cpu_physical_apicid, + phys_cpu_present_map); + + /* + * boot_cpu_physical_apicid is designed to have the apicid + * returned by read_apic_id(), i.e, the apicid of the + * currently booting-up processor. However, on some platforms, + * it is temporarily modified by the apicid reported as BSP + * through MP table. Concretely: + * + * - arch/x86/kernel/mpparse.c: MP_processor_info() + * - arch/x86/mm/amdtopology.c: amd_numa_init() + * + * This function is executed with the modified + * boot_cpu_physical_apicid. So, disabled_cpu_apicid kernel + * parameter doesn't work to disable APs on kdump 2nd kernel. + * + * Since fixing handling of boot_cpu_physical_apicid requires + * another discussion and tests on each platform, we leave it + * for now and here we use read_apic_id() directly in this + * function, generic_processor_info(). + */ + if (disabled_cpu_apicid != BAD_APICID && + disabled_cpu_apicid != read_apic_id() && + disabled_cpu_apicid == apicid) { + int thiscpu = num_processors + disabled_cpus; + + pr_warning("APIC: Disabling requested cpu." + " Processor %d/0x%x ignored.\n", + thiscpu, apicid); + + disabled_cpus++; + return -ENODEV; + } + + /* + * If boot cpu has not been detected yet, then only allow upto + * nr_cpu_ids - 1 processors and keep one slot free for boot cpu + */ + if (!boot_cpu_detected && num_processors >= nr_cpu_ids - 1 && + apicid != boot_cpu_physical_apicid) { + int thiscpu = max + disabled_cpus - 1; + + pr_warning( + "ACPI: NR_CPUS/possible_cpus limit of %i almost" + " reached. Keeping one slot for boot cpu." + " Processor %d/0x%x ignored.\n", max, thiscpu, apicid); + + disabled_cpus++; + return -ENODEV; + } + + if (num_processors >= nr_cpu_ids) { + int thiscpu = max + disabled_cpus; + + pr_warning( + "ACPI: NR_CPUS/possible_cpus limit of %i reached." + " Processor %d/0x%x ignored.\n", max, thiscpu, apicid); + + disabled_cpus++; + return -EINVAL; + } + + num_processors++; + if (apicid == boot_cpu_physical_apicid) { + /* + * x86_bios_cpu_apicid is required to have processors listed + * in same order as logical cpu numbers. Hence the first + * entry is BSP, and so on. 
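+ *
+ * (Example: with the BSP occupying cpu 0, the first AP reported here
+ * gets cpu = cpumask_next_zero(-1, cpu_present_mask) = 1, the next
+ * one cpu 2, and so on, matching enumeration order.)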
+ * boot_cpu_init() already hold bit 0 in cpu_present_mask + * for BSP. + */ + cpu = 0; + } else + cpu = cpumask_next_zero(-1, cpu_present_mask); + + /* + * Validate version + */ + if (version == 0x0) { + pr_warning("BIOS bug: APIC version is 0 for CPU %d/0x%x, fixing up to 0x10\n", + cpu, apicid); + version = 0x10; + } + apic_version[apicid] = version; + + if (version != apic_version[boot_cpu_physical_apicid]) { + pr_warning("BIOS bug: APIC version mismatch, boot CPU: %x, CPU %d: version %x\n", + apic_version[boot_cpu_physical_apicid], cpu, version); + } + + physid_set(apicid, phys_cpu_present_map); + if (apicid > max_physical_apicid) + max_physical_apicid = apicid; + +#if defined(CONFIG_SMP) || defined(CONFIG_X86_64) + early_per_cpu(x86_cpu_to_apicid, cpu) = apicid; + early_per_cpu(x86_bios_cpu_apicid, cpu) = apicid; +#endif +#ifdef CONFIG_X86_32 + early_per_cpu(x86_cpu_to_logical_apicid, cpu) = + apic->x86_32_early_logical_apicid(cpu); +#endif + set_cpu_possible(cpu, true); + set_cpu_present(cpu, true); + + return cpu; +} + +int hard_smp_processor_id(void) +{ + return read_apic_id(); +} + +void default_init_apic_ldr(void) +{ + unsigned long val; + + apic_write(APIC_DFR, APIC_DFR_VALUE); + val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; + val |= SET_APIC_LOGICAL_ID(1UL << smp_processor_id()); + apic_write(APIC_LDR, val); +} + +int default_cpu_mask_to_apicid_and(const struct cpumask *cpumask, + const struct cpumask *andmask, + unsigned int *apicid) +{ + unsigned int cpu; + + for_each_cpu_and(cpu, cpumask, andmask) { + if (cpumask_test_cpu(cpu, cpu_online_mask)) + break; + } + + if (likely(cpu < nr_cpu_ids)) { + *apicid = per_cpu(x86_cpu_to_apicid, cpu); + return 0; + } + + return -EINVAL; +} + +/* + * Override the generic EOI implementation with an optimized version. + * Only called during early boot when only one CPU is active and with + * interrupts disabled, so we know this does not race with actual APIC driver + * use. 
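+ *
+ * A typical caller is a paravirt fast path registering itself during
+ * early boot, e.g. (from the KVM guest PV-EOI setup, quoted from
+ * memory as an example):
+ *
+ * apic_set_eoi_write(kvm_guest_apic_eoi_write);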
+ */ +void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v)) +{ + struct apic **drv; + + for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { + /* Should happen once for each apic */ + WARN_ON((*drv)->eoi_write == eoi_write); + (*drv)->eoi_write = eoi_write; + } +} + +/* + * Power management + */ +#ifdef CONFIG_PM + +static struct { + /* + * 'active' is true if the local APIC was enabled by us and + * not the BIOS; this signifies that we are also responsible + * for disabling it before entering apm/acpi suspend + */ + int active; + /* r/w apic fields */ + unsigned int apic_id; + unsigned int apic_taskpri; + unsigned int apic_ldr; + unsigned int apic_dfr; + unsigned int apic_spiv; + unsigned int apic_lvtt; + unsigned int apic_lvtpc; + unsigned int apic_lvt0; + unsigned int apic_lvt1; + unsigned int apic_lvterr; + unsigned int apic_tmict; + unsigned int apic_tdcr; + unsigned int apic_thmr; +} apic_pm_state; + +static int lapic_suspend(void) +{ + unsigned long flags; + int maxlvt; + + if (!apic_pm_state.active) + return 0; + + maxlvt = lapic_get_maxlvt(); + + apic_pm_state.apic_id = apic_read(APIC_ID); + apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI); + apic_pm_state.apic_ldr = apic_read(APIC_LDR); + apic_pm_state.apic_dfr = apic_read(APIC_DFR); + apic_pm_state.apic_spiv = apic_read(APIC_SPIV); + apic_pm_state.apic_lvtt = apic_read(APIC_LVTT); + if (maxlvt >= 4) + apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC); + apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0); + apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1); + apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); + apic_pm_state.apic_tmict = apic_read(APIC_TMICT); + apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); +#ifdef CONFIG_X86_THERMAL_VECTOR + if (maxlvt >= 5) + apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); +#endif + + local_irq_save(flags); + disable_local_APIC(); + + irq_remapping_disable(); + + local_irq_restore(flags); + return 0; +} + +static void lapic_resume(void) +{ + unsigned int l, h; + unsigned long flags; + int maxlvt; + + if (!apic_pm_state.active) + return; + + local_irq_save(flags); + + /* + * IO-APIC and PIC have their own resume routines. + * We just mask them here to make sure the interrupt + * subsystem is completely quiet while we enable x2apic + * and interrupt-remapping. + */ + mask_ioapic_entries(); + legacy_pic->mask_all(); + + if (x2apic_mode) + enable_x2apic(); + else { + /* + * Make sure the APICBASE points to the right address + * + * FIXME! This will be wrong if we ever support suspend on + * SMP! We'll need to do this as part of the CPU restore! 
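+ *
+ * (Note the ordering below: APIC_LVTERR is written masked first so
+ * that no error interrupt fires while the LVTs are half-restored;
+ * the saved LVTERR value is written back, and the ESR cleared, only
+ * at the very end.)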
+ */ + if (boot_cpu_data.x86 >= 6) { + rdmsr(MSR_IA32_APICBASE, l, h); + l &= ~MSR_IA32_APICBASE_BASE; + l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; + wrmsr(MSR_IA32_APICBASE, l, h); + } + } + + maxlvt = lapic_get_maxlvt(); + apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); + apic_write(APIC_ID, apic_pm_state.apic_id); + apic_write(APIC_DFR, apic_pm_state.apic_dfr); + apic_write(APIC_LDR, apic_pm_state.apic_ldr); + apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri); + apic_write(APIC_SPIV, apic_pm_state.apic_spiv); + apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); + apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); +#if defined(CONFIG_X86_MCE_INTEL) + if (maxlvt >= 5) + apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); +#endif + if (maxlvt >= 4) + apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc); + apic_write(APIC_LVTT, apic_pm_state.apic_lvtt); + apic_write(APIC_TDCR, apic_pm_state.apic_tdcr); + apic_write(APIC_TMICT, apic_pm_state.apic_tmict); + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); + apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); + + irq_remapping_reenable(x2apic_mode); + + local_irq_restore(flags); +} + +/* + * This device has no shutdown method - fully functioning local APICs + * are needed on every CPU up until machine_halt/restart/poweroff. + */ + +static struct syscore_ops lapic_syscore_ops = { + .resume = lapic_resume, + .suspend = lapic_suspend, +}; + +static void apic_pm_activate(void) +{ + apic_pm_state.active = 1; +} + +static int __init init_lapic_sysfs(void) +{ + /* XXX: remove suspend/resume procs if !apic_pm_state.active? */ + if (cpu_has_apic) + register_syscore_ops(&lapic_syscore_ops); + + return 0; +} + +/* local apic needs to resume before other devices access its registers. */ +core_initcall(init_lapic_sysfs); + +#else /* CONFIG_PM */ + +static void apic_pm_activate(void) { } + +#endif /* CONFIG_PM */ + +#ifdef CONFIG_X86_64 + +static int apic_cluster_num(void) +{ + int i, clusters, zeros; + unsigned id; + u16 *bios_cpu_apicid; + DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS); + + bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid); + bitmap_zero(clustermap, NUM_APIC_CLUSTERS); + + for (i = 0; i < nr_cpu_ids; i++) { + /* are we being called early in kernel startup? */ + if (bios_cpu_apicid) { + id = bios_cpu_apicid[i]; + } else if (i < nr_cpu_ids) { + if (cpu_present(i)) + id = per_cpu(x86_bios_cpu_apicid, i); + else + continue; + } else + break; + + if (id != BAD_APICID) + __set_bit(APIC_CLUSTERID(id), clustermap); + } + + /* Problem: Partially populated chassis may not have CPUs in some of + * the APIC clusters they have been allocated. Only present CPUs have + * x86_bios_cpu_apicid entries, thus causing zeroes in the bitmap. + * Since clusters are allocated sequentially, count zeros only if + * they are bounded by ones. 
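+ *
+ * Worked example: a clustermap of 1,0,0,1 counts as 4 clusters --
+ * the two interior zeros are bounded by ones and are counted --
+ * while any zeros after the last set bit are ignored.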
+ */ + clusters = 0; + zeros = 0; + for (i = 0; i < NUM_APIC_CLUSTERS; i++) { + if (test_bit(i, clustermap)) { + clusters += 1 + zeros; + zeros = 0; + } else + ++zeros; + } + + return clusters; +} + +static int multi_checked; +static int multi; + +static int set_multi(const struct dmi_system_id *d) +{ + if (multi) + return 0; + pr_info("APIC: %s detected, Multi Chassis\n", d->ident); + multi = 1; + return 0; +} + +static const struct dmi_system_id multi_dmi_table[] = { + { + .callback = set_multi, + .ident = "IBM System Summit2", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "IBM"), + DMI_MATCH(DMI_PRODUCT_NAME, "Summit2"), + }, + }, + {} +}; + +static void dmi_check_multi(void) +{ + if (multi_checked) + return; + + dmi_check_system(multi_dmi_table); + multi_checked = 1; +} + +/* + * apic_is_clustered_box() -- Check if we can expect good TSC + * + * Thus far, the major user of this is IBM's Summit2 series: + * Clustered boxes may have unsynced TSC problems if they are + * multi-chassis. + * Use DMI to check them + */ +int apic_is_clustered_box(void) +{ + dmi_check_multi(); + if (multi) + return 1; + + if (!is_vsmp_box()) + return 0; + + /* + * ScaleMP vSMPowered boxes have one cluster per board and TSCs are + * not guaranteed to be synced between boards + */ + if (apic_cluster_num() > 1) + return 1; + + return 0; +} +#endif + +/* + * APIC command line parameters + */ +static int __init setup_disableapic(char *arg) +{ + disable_apic = 1; + setup_clear_cpu_cap(X86_FEATURE_APIC); + return 0; +} +early_param("disableapic", setup_disableapic); + +/* same as disableapic, for compatibility */ +static int __init setup_nolapic(char *arg) +{ + return setup_disableapic(arg); +} +early_param("nolapic", setup_nolapic); + +static int __init parse_lapic_timer_c2_ok(char *arg) +{ + local_apic_timer_c2_ok = 1; + return 0; +} +early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok); + +static int __init parse_disable_apic_timer(char *arg) +{ + disable_apic_timer = 1; + return 0; +} +early_param("noapictimer", parse_disable_apic_timer); + +static int __init parse_nolapic_timer(char *arg) +{ + disable_apic_timer = 1; + return 0; +} +early_param("nolapic_timer", parse_nolapic_timer); + +static int __init apic_set_verbosity(char *arg) +{ + if (!arg) { +#ifdef CONFIG_X86_64 + skip_ioapic_setup = 0; + return 0; +#endif + return -EINVAL; + } + + if (strcmp("debug", arg) == 0) + apic_verbosity = APIC_DEBUG; + else if (strcmp("verbose", arg) == 0) + apic_verbosity = APIC_VERBOSE; + else { + pr_warning("APIC Verbosity level %s not recognised" + " use apic=verbose or apic=debug\n", arg); + return -EINVAL; + } + + return 0; +} +early_param("apic", apic_set_verbosity); + +static int __init lapic_insert_resource(void) +{ + if (!apic_phys) + return -1; + + /* Put local APIC into the resource map. 
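+ *
+ * After this, /proc/iomem gains a one-page entry at the APIC base,
+ * e.g. with the default base something like (name per the
+ * lapic_resource definition earlier in this file):
+ *
+ * fee00000-fee00fff : Local APIC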
*/ + lapic_resource.start = apic_phys; + lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1; + insert_resource(&iomem_resource, &lapic_resource); + + return 0; +} + +/* + * need call insert after e820_reserve_resources() + * that is using request_resource + */ +late_initcall(lapic_insert_resource); + +static int __init apic_set_disabled_cpu_apicid(char *arg) +{ + if (!arg || !get_option(&arg, &disabled_cpu_apicid)) + return -EINVAL; + + return 0; +} +early_param("disable_cpu_apicid", apic_set_disabled_cpu_apicid); diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c new file mode 100644 index 00000000000..7c1b2947951 --- /dev/null +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -0,0 +1,333 @@ +/* + * Copyright 2004 James Cleverdon, IBM. + * Subject to the GNU Public License, v.2 + * + * Flat APIC subarch code. + * + * Hacked for x86-64 by James Cleverdon from i386 architecture code by + * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and + * James Cleverdon. + */ +#include <linux/errno.h> +#include <linux/threads.h> +#include <linux/cpumask.h> +#include <linux/string.h> +#include <linux/kernel.h> +#include <linux/ctype.h> +#include <linux/hardirq.h> +#include <linux/module.h> +#include <asm/smp.h> +#include <asm/apic.h> +#include <asm/ipi.h> + +#include <linux/acpi.h> + +static struct apic apic_physflat; +static struct apic apic_flat; + +struct apic __read_mostly *apic = &apic_flat; +EXPORT_SYMBOL_GPL(apic); + +static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ + return 1; +} + +/* + * Set up the logical destination ID. + * + * Intel recommends to set DFR, LDR and TPR before enabling + * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel + * document number 292116). So here it goes... 
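+ *
+ * In the flat model each CPU owns one bit of the 8-bit logical ID;
+ * a worked example for cpu 3, per the code below:
+ *
+ * id = 1UL << 3; - 0x08
+ * APIC_LDR = SET_APIC_LOGICAL_ID(0x08); - i.e. 0x08 << 24
+ *
+ * which is why logical flat addressing tops out at 8 CPUs (see the
+ * physflat comment further down).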
+ */ +void flat_init_apic_ldr(void) +{ + unsigned long val; + unsigned long num, id; + + num = smp_processor_id(); + id = 1UL << num; + apic_write(APIC_DFR, APIC_DFR_FLAT); + val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; + val |= SET_APIC_LOGICAL_ID(id); + apic_write(APIC_LDR, val); +} + +static inline void _flat_send_IPI_mask(unsigned long mask, int vector) +{ + unsigned long flags; + + local_irq_save(flags); + __default_send_IPI_dest_field(mask, vector, apic->dest_logical); + local_irq_restore(flags); +} + +static void flat_send_IPI_mask(const struct cpumask *cpumask, int vector) +{ + unsigned long mask = cpumask_bits(cpumask)[0]; + + _flat_send_IPI_mask(mask, vector); +} + +static void +flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) +{ + unsigned long mask = cpumask_bits(cpumask)[0]; + int cpu = smp_processor_id(); + + if (cpu < BITS_PER_LONG) + clear_bit(cpu, &mask); + + _flat_send_IPI_mask(mask, vector); +} + +static void flat_send_IPI_allbutself(int vector) +{ + int cpu = smp_processor_id(); +#ifdef CONFIG_HOTPLUG_CPU + int hotplug = 1; +#else + int hotplug = 0; +#endif + if (hotplug || vector == NMI_VECTOR) { + if (!cpumask_equal(cpu_online_mask, cpumask_of(cpu))) { + unsigned long mask = cpumask_bits(cpu_online_mask)[0]; + + if (cpu < BITS_PER_LONG) + clear_bit(cpu, &mask); + + _flat_send_IPI_mask(mask, vector); + } + } else if (num_online_cpus() > 1) { + __default_send_IPI_shortcut(APIC_DEST_ALLBUT, + vector, apic->dest_logical); + } +} + +static void flat_send_IPI_all(int vector) +{ + if (vector == NMI_VECTOR) { + flat_send_IPI_mask(cpu_online_mask, vector); + } else { + __default_send_IPI_shortcut(APIC_DEST_ALLINC, + vector, apic->dest_logical); + } +} + +static unsigned int flat_get_apic_id(unsigned long x) +{ + unsigned int id; + + id = (((x)>>24) & 0xFFu); + + return id; +} + +static unsigned long set_apic_id(unsigned int id) +{ + unsigned long x; + + x = ((id & 0xFFu)<<24); + return x; +} + +static unsigned int read_xapic_id(void) +{ + unsigned int id; + + id = flat_get_apic_id(apic_read(APIC_ID)); + return id; +} + +static int flat_apic_id_registered(void) +{ + return physid_isset(read_xapic_id(), phys_cpu_present_map); +} + +static int flat_phys_pkg_id(int initial_apic_id, int index_msb) +{ + return initial_apic_id >> index_msb; +} + +static int flat_probe(void) +{ + return 1; +} + +static struct apic apic_flat = { + .name = "flat", + .probe = flat_probe, + .acpi_madt_oem_check = flat_acpi_madt_oem_check, + .apic_id_valid = default_apic_id_valid, + .apic_id_registered = flat_apic_id_registered, + + .irq_delivery_mode = dest_LowestPrio, + .irq_dest_mode = 1, /* logical */ + + .target_cpus = online_target_cpus, + .disable_esr = 0, + .dest_logical = APIC_DEST_LOGICAL, + .check_apicid_used = NULL, + .check_apicid_present = NULL, + + .vector_allocation_domain = flat_vector_allocation_domain, + .init_apic_ldr = flat_init_apic_ldr, + + .ioapic_phys_id_map = NULL, + .setup_apic_routing = NULL, + .multi_timer_check = NULL, + .cpu_present_to_apicid = default_cpu_present_to_apicid, + .apicid_to_cpu_present = NULL, + .setup_portio_remap = NULL, + .check_phys_apicid_present = default_check_phys_apicid_present, + .enable_apic_mode = NULL, + .phys_pkg_id = flat_phys_pkg_id, + .mps_oem_check = NULL, + + .get_apic_id = flat_get_apic_id, + .set_apic_id = set_apic_id, + .apic_id_mask = 0xFFu << 24, + + .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, + + .send_IPI_mask = flat_send_IPI_mask, + .send_IPI_mask_allbutself = flat_send_IPI_mask_allbutself, + 
.send_IPI_allbutself = flat_send_IPI_allbutself, + .send_IPI_all = flat_send_IPI_all, + .send_IPI_self = apic_send_IPI_self, + + .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, + .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, + .wait_for_init_deassert = false, + .smp_callin_clear_local_apic = NULL, + .inquire_remote_apic = default_inquire_remote_apic, + + .read = native_apic_mem_read, + .write = native_apic_mem_write, + .eoi_write = native_apic_mem_write, + .icr_read = native_apic_icr_read, + .icr_write = native_apic_icr_write, + .wait_icr_idle = native_apic_wait_icr_idle, + .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, +}; + +/* + * Physflat mode is used when there are more than 8 CPUs on a system. + * We cannot use logical delivery in this case because the mask + * overflows, so use physical mode. + */ +static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ +#ifdef CONFIG_ACPI + /* + * Quirk: some x86_64 machines can only use physical APIC mode + * regardless of how many processors are present (x86_64 ES7000 + * is an example). + */ + if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID && + (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) { + printk(KERN_DEBUG "system APIC only can use physical flat"); + return 1; + } + + if (!strncmp(oem_id, "IBM", 3) && !strncmp(oem_table_id, "EXA", 3)) { + printk(KERN_DEBUG "IBM Summit detected, will use apic physical"); + return 1; + } +#endif + + return 0; +} + +static void physflat_send_IPI_mask(const struct cpumask *cpumask, int vector) +{ + default_send_IPI_mask_sequence_phys(cpumask, vector); +} + +static void physflat_send_IPI_mask_allbutself(const struct cpumask *cpumask, + int vector) +{ + default_send_IPI_mask_allbutself_phys(cpumask, vector); +} + +static void physflat_send_IPI_allbutself(int vector) +{ + default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector); +} + +static void physflat_send_IPI_all(int vector) +{ + physflat_send_IPI_mask(cpu_online_mask, vector); +} + +static int physflat_probe(void) +{ + if (apic == &apic_physflat || num_possible_cpus() > 8) + return 1; + + return 0; +} + +static struct apic apic_physflat = { + + .name = "physical flat", + .probe = physflat_probe, + .acpi_madt_oem_check = physflat_acpi_madt_oem_check, + .apic_id_valid = default_apic_id_valid, + .apic_id_registered = flat_apic_id_registered, + + .irq_delivery_mode = dest_Fixed, + .irq_dest_mode = 0, /* physical */ + + .target_cpus = online_target_cpus, + .disable_esr = 0, + .dest_logical = 0, + .check_apicid_used = NULL, + .check_apicid_present = NULL, + + .vector_allocation_domain = default_vector_allocation_domain, + /* not needed, but shouldn't hurt: */ + .init_apic_ldr = flat_init_apic_ldr, + + .ioapic_phys_id_map = NULL, + .setup_apic_routing = NULL, + .multi_timer_check = NULL, + .cpu_present_to_apicid = default_cpu_present_to_apicid, + .apicid_to_cpu_present = NULL, + .setup_portio_remap = NULL, + .check_phys_apicid_present = default_check_phys_apicid_present, + .enable_apic_mode = NULL, + .phys_pkg_id = flat_phys_pkg_id, + .mps_oem_check = NULL, + + .get_apic_id = flat_get_apic_id, + .set_apic_id = set_apic_id, + .apic_id_mask = 0xFFu << 24, + + .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, + + .send_IPI_mask = physflat_send_IPI_mask, + .send_IPI_mask_allbutself = physflat_send_IPI_mask_allbutself, + .send_IPI_allbutself = physflat_send_IPI_allbutself, + .send_IPI_all = physflat_send_IPI_all, + .send_IPI_self = apic_send_IPI_self, + + .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, 
+ .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, + .wait_for_init_deassert = false, + .smp_callin_clear_local_apic = NULL, + .inquire_remote_apic = default_inquire_remote_apic, + + .read = native_apic_mem_read, + .write = native_apic_mem_write, + .eoi_write = native_apic_mem_write, + .icr_read = native_apic_icr_read, + .icr_write = native_apic_icr_write, + .wait_icr_idle = native_apic_wait_icr_idle, + .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, +}; + +/* + * We need to check for physflat first, so this order is important. + */ +apic_drivers(apic_physflat, apic_flat); diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c new file mode 100644 index 00000000000..8c7c98249c2 --- /dev/null +++ b/arch/x86/kernel/apic/apic_noop.c @@ -0,0 +1,190 @@ +/* + * NOOP APIC driver. + * + * Does almost nothing and should be substituted by a real apic driver via + * probe routine. + * + * Though in case if apic is disabled (for some reason) we try + * to not uglify the caller's code and allow to call (some) apic routines + * like self-ipi, etc... + */ + +#include <linux/threads.h> +#include <linux/cpumask.h> +#include <linux/module.h> +#include <linux/string.h> +#include <linux/kernel.h> +#include <linux/ctype.h> +#include <linux/errno.h> +#include <asm/fixmap.h> +#include <asm/mpspec.h> +#include <asm/apicdef.h> +#include <asm/apic.h> +#include <asm/setup.h> + +#include <linux/smp.h> +#include <asm/ipi.h> + +#include <linux/interrupt.h> +#include <asm/acpi.h> +#include <asm/e820.h> + +static void noop_init_apic_ldr(void) { } +static void noop_send_IPI_mask(const struct cpumask *cpumask, int vector) { } +static void noop_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) { } +static void noop_send_IPI_allbutself(int vector) { } +static void noop_send_IPI_all(int vector) { } +static void noop_send_IPI_self(int vector) { } +static void noop_apic_wait_icr_idle(void) { } +static void noop_apic_icr_write(u32 low, u32 id) { } + +static int noop_wakeup_secondary_cpu(int apicid, unsigned long start_eip) +{ + return -1; +} + +static u32 noop_safe_apic_wait_icr_idle(void) +{ + return 0; +} + +static u64 noop_apic_icr_read(void) +{ + return 0; +} + +static int noop_phys_pkg_id(int cpuid_apic, int index_msb) +{ + return 0; +} + +static unsigned int noop_get_apic_id(unsigned long x) +{ + return 0; +} + +static int noop_probe(void) +{ + /* + * NOOP apic should not ever be + * enabled via probe routine + */ + return 0; +} + +static int noop_apic_id_registered(void) +{ + /* + * if we would be really "pedantic" + * we should pass read_apic_id() here + * but since NOOP suppose APIC ID = 0 + * lets save a few cycles + */ + return physid_isset(0, phys_cpu_present_map); +} + +static const struct cpumask *noop_target_cpus(void) +{ + /* only BSP here */ + return cpumask_of(0); +} + +static unsigned long noop_check_apicid_used(physid_mask_t *map, int apicid) +{ + return physid_isset(apicid, *map); +} + +static unsigned long noop_check_apicid_present(int bit) +{ + return physid_isset(bit, phys_cpu_present_map); +} + +static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask, + const struct cpumask *mask) +{ + if (cpu != 0) + pr_warning("APIC: Vector allocated for non-BSP cpu\n"); + cpumask_copy(retmask, cpumask_of(cpu)); +} + +static u32 noop_apic_read(u32 reg) +{ + WARN_ON_ONCE((cpu_has_apic && !disable_apic)); + return 0; +} + +static void noop_apic_write(u32 reg, u32 v) +{ + WARN_ON_ONCE(cpu_has_apic && !disable_apic); +} + +struct apic apic_noop = { + 
.name = "noop", + .probe = noop_probe, + .acpi_madt_oem_check = NULL, + + .apic_id_valid = default_apic_id_valid, + .apic_id_registered = noop_apic_id_registered, + + .irq_delivery_mode = dest_LowestPrio, + /* logical delivery broadcast to all CPUs: */ + .irq_dest_mode = 1, + + .target_cpus = noop_target_cpus, + .disable_esr = 0, + .dest_logical = APIC_DEST_LOGICAL, + .check_apicid_used = noop_check_apicid_used, + .check_apicid_present = noop_check_apicid_present, + + .vector_allocation_domain = noop_vector_allocation_domain, + .init_apic_ldr = noop_init_apic_ldr, + + .ioapic_phys_id_map = default_ioapic_phys_id_map, + .setup_apic_routing = NULL, + .multi_timer_check = NULL, + + .cpu_present_to_apicid = default_cpu_present_to_apicid, + .apicid_to_cpu_present = physid_set_mask_of_physid, + + .setup_portio_remap = NULL, + .check_phys_apicid_present = default_check_phys_apicid_present, + .enable_apic_mode = NULL, + + .phys_pkg_id = noop_phys_pkg_id, + + .mps_oem_check = NULL, + + .get_apic_id = noop_get_apic_id, + .set_apic_id = NULL, + .apic_id_mask = 0x0F << 24, + + .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, + + .send_IPI_mask = noop_send_IPI_mask, + .send_IPI_mask_allbutself = noop_send_IPI_mask_allbutself, + .send_IPI_allbutself = noop_send_IPI_allbutself, + .send_IPI_all = noop_send_IPI_all, + .send_IPI_self = noop_send_IPI_self, + + .wakeup_secondary_cpu = noop_wakeup_secondary_cpu, + + /* should be safe */ + .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, + .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, + + .wait_for_init_deassert = false, + .smp_callin_clear_local_apic = NULL, + .inquire_remote_apic = NULL, + + .read = noop_apic_read, + .write = noop_apic_write, + .eoi_write = noop_apic_write, + .icr_read = noop_apic_icr_read, + .icr_write = noop_apic_icr_write, + .wait_icr_idle = noop_apic_wait_icr_idle, + .safe_wait_icr_idle = noop_safe_apic_wait_icr_idle, + +#ifdef CONFIG_X86_32 + .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid, +#endif +}; diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c new file mode 100644 index 00000000000..a5b45df8bc8 --- /dev/null +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -0,0 +1,264 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Numascale NumaConnect-Specific APIC Code + * + * Copyright (C) 2011 Numascale AS. All rights reserved. 
+ * + * Send feedback to <support@numascale.com> + * + */ + +#include <linux/errno.h> +#include <linux/threads.h> +#include <linux/cpumask.h> +#include <linux/string.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/ctype.h> +#include <linux/init.h> +#include <linux/hardirq.h> +#include <linux/delay.h> + +#include <asm/numachip/numachip.h> +#include <asm/numachip/numachip_csr.h> +#include <asm/smp.h> +#include <asm/apic.h> +#include <asm/ipi.h> +#include <asm/apic_flat_64.h> +#include <asm/pgtable.h> + +static int numachip_system __read_mostly; + +static const struct apic apic_numachip __read_mostly; + +static unsigned int get_apic_id(unsigned long x) +{ + unsigned long value; + unsigned int id; + + rdmsrl(MSR_FAM10H_NODE_ID, value); + id = ((x >> 24) & 0xffU) | ((value << 2) & 0x3f00U); + + return id; +} + +static unsigned long set_apic_id(unsigned int id) +{ + unsigned long x; + + x = ((id & 0xffU) << 24); + return x; +} + +static unsigned int read_xapic_id(void) +{ + return get_apic_id(apic_read(APIC_ID)); +} + +static int numachip_apic_id_valid(int apicid) +{ + /* Trust what bootloader passes in MADT */ + return 1; +} + +static int numachip_apic_id_registered(void) +{ + return physid_isset(read_xapic_id(), phys_cpu_present_map); +} + +static int numachip_phys_pkg_id(int initial_apic_id, int index_msb) +{ + return initial_apic_id >> index_msb; +} + +static int numachip_wakeup_secondary(int phys_apicid, unsigned long start_rip) +{ + union numachip_csr_g3_ext_irq_gen int_gen; + + int_gen.s._destination_apic_id = phys_apicid; + int_gen.s._vector = 0; + int_gen.s._msgtype = APIC_DM_INIT >> 8; + int_gen.s._index = 0; + + write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v); + + int_gen.s._msgtype = APIC_DM_STARTUP >> 8; + int_gen.s._vector = start_rip >> 12; + + write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v); + + atomic_set(&init_deasserted, 1); + return 0; +} + +static void numachip_send_IPI_one(int cpu, int vector) +{ + union numachip_csr_g3_ext_irq_gen int_gen; + int apicid = per_cpu(x86_cpu_to_apicid, cpu); + + int_gen.s._destination_apic_id = apicid; + int_gen.s._vector = vector; + int_gen.s._msgtype = (vector == NMI_VECTOR ? 
APIC_DM_NMI : APIC_DM_FIXED) >> 8; + int_gen.s._index = 0; + + write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v); +} + +static void numachip_send_IPI_mask(const struct cpumask *mask, int vector) +{ + unsigned int cpu; + + for_each_cpu(cpu, mask) + numachip_send_IPI_one(cpu, vector); +} + +static void numachip_send_IPI_mask_allbutself(const struct cpumask *mask, + int vector) +{ + unsigned int this_cpu = smp_processor_id(); + unsigned int cpu; + + for_each_cpu(cpu, mask) { + if (cpu != this_cpu) + numachip_send_IPI_one(cpu, vector); + } +} + +static void numachip_send_IPI_allbutself(int vector) +{ + unsigned int this_cpu = smp_processor_id(); + unsigned int cpu; + + for_each_online_cpu(cpu) { + if (cpu != this_cpu) + numachip_send_IPI_one(cpu, vector); + } +} + +static void numachip_send_IPI_all(int vector) +{ + numachip_send_IPI_mask(cpu_online_mask, vector); +} + +static void numachip_send_IPI_self(int vector) +{ + __default_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL); +} + +static int __init numachip_probe(void) +{ + return apic == &apic_numachip; +} + +static void __init map_csrs(void) +{ + printk(KERN_INFO "NumaChip: Mapping local CSR space (%016llx - %016llx)\n", + NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_BASE + NUMACHIP_LCSR_SIZE - 1); + init_extra_mapping_uc(NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_SIZE); + + printk(KERN_INFO "NumaChip: Mapping global CSR space (%016llx - %016llx)\n", + NUMACHIP_GCSR_BASE, NUMACHIP_GCSR_BASE + NUMACHIP_GCSR_SIZE - 1); + init_extra_mapping_uc(NUMACHIP_GCSR_BASE, NUMACHIP_GCSR_SIZE); +} + +static void fixup_cpu_id(struct cpuinfo_x86 *c, int node) +{ + + if (c->phys_proc_id != node) { + c->phys_proc_id = node; + per_cpu(cpu_llc_id, smp_processor_id()) = node; + } +} + +static int __init numachip_system_init(void) +{ + unsigned int val; + + if (!numachip_system) + return 0; + + x86_cpuinit.fixup_cpu_id = fixup_cpu_id; + x86_init.pci.arch_init = pci_numachip_init; + + map_csrs(); + + val = read_lcsr(CSR_G0_NODE_IDS); + printk(KERN_INFO "NumaChip: Local NodeID = %08x\n", val); + + return 0; +} +early_initcall(numachip_system_init); + +static int numachip_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ + if (!strncmp(oem_id, "NUMASC", 6)) { + numachip_system = 1; + return 1; + } + + return 0; +} + +static const struct apic apic_numachip __refconst = { + + .name = "NumaConnect system", + .probe = numachip_probe, + .acpi_madt_oem_check = numachip_acpi_madt_oem_check, + .apic_id_valid = numachip_apic_id_valid, + .apic_id_registered = numachip_apic_id_registered, + + .irq_delivery_mode = dest_Fixed, + .irq_dest_mode = 0, /* physical */ + + .target_cpus = online_target_cpus, + .disable_esr = 0, + .dest_logical = 0, + .check_apicid_used = NULL, + .check_apicid_present = NULL, + + .vector_allocation_domain = default_vector_allocation_domain, + .init_apic_ldr = flat_init_apic_ldr, + + .ioapic_phys_id_map = NULL, + .setup_apic_routing = NULL, + .multi_timer_check = NULL, + .cpu_present_to_apicid = default_cpu_present_to_apicid, + .apicid_to_cpu_present = NULL, + .setup_portio_remap = NULL, + .check_phys_apicid_present = default_check_phys_apicid_present, + .enable_apic_mode = NULL, + .phys_pkg_id = numachip_phys_pkg_id, + .mps_oem_check = NULL, + + .get_apic_id = get_apic_id, + .set_apic_id = set_apic_id, + .apic_id_mask = 0xffU << 24, + + .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, + + .send_IPI_mask = numachip_send_IPI_mask, + .send_IPI_mask_allbutself = numachip_send_IPI_mask_allbutself, + .send_IPI_allbutself = numachip_send_IPI_allbutself, + 
.send_IPI_all = numachip_send_IPI_all, + .send_IPI_self = numachip_send_IPI_self, + + .wakeup_secondary_cpu = numachip_wakeup_secondary, + .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, + .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, + .wait_for_init_deassert = false, + .smp_callin_clear_local_apic = NULL, + .inquire_remote_apic = NULL, /* REMRD not supported */ + + .read = native_apic_mem_read, + .write = native_apic_mem_write, + .eoi_write = native_apic_mem_write, + .icr_read = native_apic_icr_read, + .icr_write = native_apic_icr_write, + .wait_icr_idle = native_apic_wait_icr_idle, + .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, +}; +apic_driver(apic_numachip); + diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c new file mode 100644 index 00000000000..e4840aa7a25 --- /dev/null +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -0,0 +1,237 @@ +/* + * APIC driver for "bigsmp" xAPIC machines with more than 8 virtual CPUs. + * + * Drives the local APIC in "clustered mode". + */ +#include <linux/threads.h> +#include <linux/cpumask.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/dmi.h> +#include <linux/smp.h> + +#include <asm/apicdef.h> +#include <asm/fixmap.h> +#include <asm/mpspec.h> +#include <asm/apic.h> +#include <asm/ipi.h> + +static unsigned bigsmp_get_apic_id(unsigned long x) +{ + return (x >> 24) & 0xFF; +} + +static int bigsmp_apic_id_registered(void) +{ + return 1; +} + +static unsigned long bigsmp_check_apicid_used(physid_mask_t *map, int apicid) +{ + return 0; +} + +static unsigned long bigsmp_check_apicid_present(int bit) +{ + return 1; +} + +static int bigsmp_early_logical_apicid(int cpu) +{ + /* on bigsmp, logical apicid is the same as physical */ + return early_per_cpu(x86_cpu_to_apicid, cpu); +} + +static inline unsigned long calculate_ldr(int cpu) +{ + unsigned long val, id; + + val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; + id = per_cpu(x86_bios_cpu_apicid, cpu); + val |= SET_APIC_LOGICAL_ID(id); + + return val; +} + +/* + * Set up the logical destination ID. + * + * Intel recommends to set DFR, LDR and TPR before enabling + * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel + * document number 292116). So here it goes... + */ +static void bigsmp_init_apic_ldr(void) +{ + unsigned long val; + int cpu = smp_processor_id(); + + apic_write(APIC_DFR, APIC_DFR_FLAT); + val = calculate_ldr(cpu); + apic_write(APIC_LDR, val); +} + +static void bigsmp_setup_apic_routing(void) +{ + printk(KERN_INFO + "Enabling APIC mode: Physflat. 
Using %d I/O APICs\n", + nr_ioapics); +} + +static int bigsmp_cpu_present_to_apicid(int mps_cpu) +{ + if (mps_cpu < nr_cpu_ids) + return (int) per_cpu(x86_bios_cpu_apicid, mps_cpu); + + return BAD_APICID; +} + +static void bigsmp_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) +{ + /* For clustered we don't have a good way to do this yet - hack */ + physids_promote(0xFFL, retmap); +} + +static int bigsmp_check_phys_apicid_present(int phys_apicid) +{ + return 1; +} + +static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb) +{ + return cpuid_apic >> index_msb; +} + +static inline void bigsmp_send_IPI_mask(const struct cpumask *mask, int vector) +{ + default_send_IPI_mask_sequence_phys(mask, vector); +} + +static void bigsmp_send_IPI_allbutself(int vector) +{ + default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector); +} + +static void bigsmp_send_IPI_all(int vector) +{ + bigsmp_send_IPI_mask(cpu_online_mask, vector); +} + +static int dmi_bigsmp; /* can be set by dmi scanners */ + +static int hp_ht_bigsmp(const struct dmi_system_id *d) +{ + printk(KERN_NOTICE "%s detected: force use of apic=bigsmp\n", d->ident); + dmi_bigsmp = 1; + + return 0; +} + + +static const struct dmi_system_id bigsmp_dmi_table[] = { + { hp_ht_bigsmp, "HP ProLiant DL760 G2", + { DMI_MATCH(DMI_BIOS_VENDOR, "HP"), + DMI_MATCH(DMI_BIOS_VERSION, "P44-"), + } + }, + + { hp_ht_bigsmp, "HP ProLiant DL740", + { DMI_MATCH(DMI_BIOS_VENDOR, "HP"), + DMI_MATCH(DMI_BIOS_VERSION, "P47-"), + } + }, + { } /* NULL entry stops DMI scanning */ +}; + +static int probe_bigsmp(void) +{ + if (def_to_bigsmp) + dmi_bigsmp = 1; + else + dmi_check_system(bigsmp_dmi_table); + + return dmi_bigsmp; +} + +static struct apic apic_bigsmp = { + + .name = "bigsmp", + .probe = probe_bigsmp, + .acpi_madt_oem_check = NULL, + .apic_id_valid = default_apic_id_valid, + .apic_id_registered = bigsmp_apic_id_registered, + + .irq_delivery_mode = dest_Fixed, + /* phys delivery to target CPU: */ + .irq_dest_mode = 0, + + .target_cpus = default_target_cpus, + .disable_esr = 1, + .dest_logical = 0, + .check_apicid_used = bigsmp_check_apicid_used, + .check_apicid_present = bigsmp_check_apicid_present, + + .vector_allocation_domain = default_vector_allocation_domain, + .init_apic_ldr = bigsmp_init_apic_ldr, + + .ioapic_phys_id_map = bigsmp_ioapic_phys_id_map, + .setup_apic_routing = bigsmp_setup_apic_routing, + .multi_timer_check = NULL, + .cpu_present_to_apicid = bigsmp_cpu_present_to_apicid, + .apicid_to_cpu_present = physid_set_mask_of_physid, + .setup_portio_remap = NULL, + .check_phys_apicid_present = bigsmp_check_phys_apicid_present, + .enable_apic_mode = NULL, + .phys_pkg_id = bigsmp_phys_pkg_id, + .mps_oem_check = NULL, + + .get_apic_id = bigsmp_get_apic_id, + .set_apic_id = NULL, + .apic_id_mask = 0xFF << 24, + + .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, + + .send_IPI_mask = bigsmp_send_IPI_mask, + .send_IPI_mask_allbutself = NULL, + .send_IPI_allbutself = bigsmp_send_IPI_allbutself, + .send_IPI_all = bigsmp_send_IPI_all, + .send_IPI_self = default_send_IPI_self, + + .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, + .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, + + .wait_for_init_deassert = true, + .smp_callin_clear_local_apic = NULL, + .inquire_remote_apic = default_inquire_remote_apic, + + .read = native_apic_mem_read, + .write = native_apic_mem_write, + .eoi_write = native_apic_mem_write, + .icr_read = native_apic_icr_read, + .icr_write = native_apic_icr_write, + .wait_icr_idle = 
native_apic_wait_icr_idle, + .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, + + .x86_32_early_logical_apicid = bigsmp_early_logical_apicid, +}; + +void __init generic_bigsmp_probe(void) +{ + unsigned int cpu; + + if (!probe_bigsmp()) + return; + + apic = &apic_bigsmp; + + for_each_possible_cpu(cpu) { + if (early_per_cpu(x86_cpu_to_logical_apicid, + cpu) == BAD_APICID) + continue; + early_per_cpu(x86_cpu_to_logical_apicid, cpu) = + bigsmp_early_logical_apicid(cpu); + } + + pr_info("Overriding APIC driver with %s\n", apic_bigsmp.name); +} + +apic_driver(apic_bigsmp); diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c new file mode 100644 index 00000000000..6a1e71bde32 --- /dev/null +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -0,0 +1,102 @@ +/* + * HW NMI watchdog support + * + * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc. + * + * Arch specific calls to support NMI watchdog + * + * Bits copied from original nmi.c file + * + */ +#include <asm/apic.h> +#include <asm/nmi.h> + +#include <linux/cpumask.h> +#include <linux/kdebug.h> +#include <linux/notifier.h> +#include <linux/kprobes.h> +#include <linux/nmi.h> +#include <linux/module.h> +#include <linux/delay.h> + +#ifdef CONFIG_HARDLOCKUP_DETECTOR +u64 hw_nmi_get_sample_period(int watchdog_thresh) +{ + return (u64)(cpu_khz) * 1000 * watchdog_thresh; +} +#endif + +#ifdef arch_trigger_all_cpu_backtrace +/* For reliability, we're prepared to waste bits here. */ +static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; + +/* "in progress" flag of arch_trigger_all_cpu_backtrace */ +static unsigned long backtrace_flag; + +void arch_trigger_all_cpu_backtrace(bool include_self) +{ + int i; + int cpu = get_cpu(); + + if (test_and_set_bit(0, &backtrace_flag)) { + /* + * If there is already a trigger_all_cpu_backtrace() in progress + * (backtrace_flag == 1), don't output double cpu dump infos. + */ + put_cpu(); + return; + } + + cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask); + if (!include_self) + cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); + + if (!cpumask_empty(to_cpumask(backtrace_mask))) { + pr_info("sending NMI to %s CPUs:\n", + (include_self ? 
"all" : "other")); + apic->send_IPI_mask(to_cpumask(backtrace_mask), NMI_VECTOR); + } + + /* Wait for up to 10 seconds for all CPUs to do the backtrace */ + for (i = 0; i < 10 * 1000; i++) { + if (cpumask_empty(to_cpumask(backtrace_mask))) + break; + mdelay(1); + touch_softlockup_watchdog(); + } + + clear_bit(0, &backtrace_flag); + smp_mb__after_atomic(); + put_cpu(); +} + +static int +arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs) +{ + int cpu; + + cpu = smp_processor_id(); + + if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { + static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED; + + arch_spin_lock(&lock); + printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); + show_regs(regs); + arch_spin_unlock(&lock); + cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); + return NMI_HANDLED; + } + + return NMI_DONE; +} +NOKPROBE_SYMBOL(arch_trigger_all_cpu_backtrace_handler); + +static int __init register_trigger_all_cpu_backtrace(void) +{ + register_nmi_handler(NMI_LOCAL, arch_trigger_all_cpu_backtrace_handler, + 0, "arch_bt"); + return 0; +} +early_initcall(register_trigger_all_cpu_backtrace); +#endif diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c new file mode 100644 index 00000000000..81e08eff05e --- /dev/null +++ b/arch/x86/kernel/apic/io_apic.c @@ -0,0 +1,3797 @@ +/* + * Intel IO-APIC support for multi-Pentium hosts. + * + * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo + * + * Many thanks to Stig Venaas for trying out countless experimental + * patches and reporting/debugging problems patiently! + * + * (c) 1999, Multiple IO-APIC support, developed by + * Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and + * Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>, + * further tested and cleaned up by Zach Brown <zab@redhat.com> + * and Ingo Molnar <mingo@redhat.com> + * + * Fixes + * Maciej W. Rozycki : Bits for genuine 82489DX APICs; + * thanks to Eric Gilmore + * and Rolf G. Tews + * for testing these extensively + * Paul Diefenbaugh : Added full ACPI support + */ + +#include <linux/mm.h> +#include <linux/interrupt.h> +#include <linux/init.h> +#include <linux/delay.h> +#include <linux/sched.h> +#include <linux/pci.h> +#include <linux/mc146818rtc.h> +#include <linux/compiler.h> +#include <linux/acpi.h> +#include <linux/module.h> +#include <linux/syscore_ops.h> +#include <linux/msi.h> +#include <linux/htirq.h> +#include <linux/freezer.h> +#include <linux/kthread.h> +#include <linux/jiffies.h> /* time_after() */ +#include <linux/slab.h> +#include <linux/bootmem.h> +#include <linux/dmar.h> +#include <linux/hpet.h> + +#include <asm/idle.h> +#include <asm/io.h> +#include <asm/smp.h> +#include <asm/cpu.h> +#include <asm/desc.h> +#include <asm/proto.h> +#include <asm/acpi.h> +#include <asm/dma.h> +#include <asm/timer.h> +#include <asm/i8259.h> +#include <asm/msidef.h> +#include <asm/hypertransport.h> +#include <asm/setup.h> +#include <asm/irq_remapping.h> +#include <asm/hpet.h> +#include <asm/hw_irq.h> + +#include <asm/apic.h> + +#define __apicdebuginit(type) static type __init + +#define for_each_irq_pin(entry, head) \ + for (entry = head; entry; entry = entry->next) + +/* + * Is the SiS APIC rmw bug present ? 
+ * -1 = don't know, 0 = no, 1 = yes + */ +int sis_apic_bug = -1; + +static DEFINE_RAW_SPINLOCK(ioapic_lock); +static DEFINE_RAW_SPINLOCK(vector_lock); + +static struct ioapic { + /* + * # of IRQ routing registers + */ + int nr_registers; + /* + * Saved state during suspend/resume, or while enabling intr-remap. + */ + struct IO_APIC_route_entry *saved_registers; + /* I/O APIC config */ + struct mpc_ioapic mp_config; + /* IO APIC gsi routing info */ + struct mp_ioapic_gsi gsi_config; + DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1); +} ioapics[MAX_IO_APICS]; + +#define mpc_ioapic_ver(ioapic_idx) ioapics[ioapic_idx].mp_config.apicver + +int mpc_ioapic_id(int ioapic_idx) +{ + return ioapics[ioapic_idx].mp_config.apicid; +} + +unsigned int mpc_ioapic_addr(int ioapic_idx) +{ + return ioapics[ioapic_idx].mp_config.apicaddr; +} + +struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int ioapic_idx) +{ + return &ioapics[ioapic_idx].gsi_config; +} + +int nr_ioapics; + +/* The one past the highest gsi number used */ +u32 gsi_top; + +/* MP IRQ source entries */ +struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES]; + +/* # of MP IRQ source entries */ +int mp_irq_entries; + +/* GSI interrupts */ +static int nr_irqs_gsi = NR_IRQS_LEGACY; + +#ifdef CONFIG_EISA +int mp_bus_id_to_type[MAX_MP_BUSSES]; +#endif + +DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); + +int skip_ioapic_setup; + +/** + * disable_ioapic_support() - disables ioapic support at runtime + */ +void disable_ioapic_support(void) +{ +#ifdef CONFIG_PCI + noioapicquirk = 1; + noioapicreroute = -1; +#endif + skip_ioapic_setup = 1; +} + +static int __init parse_noapic(char *str) +{ + /* disable IO-APIC */ + disable_ioapic_support(); + return 0; +} +early_param("noapic", parse_noapic); + +static int io_apic_setup_irq_pin(unsigned int irq, int node, + struct io_apic_irq_attr *attr); + +/* Will be called in mpparse/acpi/sfi codes for saving IRQ info */ +void mp_save_irq(struct mpc_intsrc *m) +{ + int i; + + apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x," + " IRQ %02x, APIC ID %x, APIC INT %02x\n", + m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbus, + m->srcbusirq, m->dstapic, m->dstirq); + + for (i = 0; i < mp_irq_entries; i++) { + if (!memcmp(&mp_irqs[i], m, sizeof(*m))) + return; + } + + memcpy(&mp_irqs[mp_irq_entries], m, sizeof(*m)); + if (++mp_irq_entries == MAX_IRQ_SOURCES) + panic("Max # of irq sources exceeded!!\n"); +} + +struct irq_pin_list { + int apic, pin; + struct irq_pin_list *next; +}; + +static struct irq_pin_list *alloc_irq_pin_list(int node) +{ + return kzalloc_node(sizeof(struct irq_pin_list), GFP_KERNEL, node); +} + + +/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ +static struct irq_cfg irq_cfgx[NR_IRQS_LEGACY]; + +int __init arch_early_irq_init(void) +{ + struct irq_cfg *cfg; + int count, node, i; + + if (!legacy_pic->nr_legacy_irqs) + io_apic_irqs = ~0UL; + + for (i = 0; i < nr_ioapics; i++) { + ioapics[i].saved_registers = + kzalloc(sizeof(struct IO_APIC_route_entry) * + ioapics[i].nr_registers, GFP_KERNEL); + if (!ioapics[i].saved_registers) + pr_err("IOAPIC %d: suspend/resume impossible!\n", i); + } + + cfg = irq_cfgx; + count = ARRAY_SIZE(irq_cfgx); + node = cpu_to_node(0); + + for (i = 0; i < count; i++) { + irq_set_chip_data(i, &cfg[i]); + zalloc_cpumask_var_node(&cfg[i].domain, GFP_KERNEL, node); + zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_KERNEL, node); + /* + * For legacy IRQ's, start with assigning irq0 to irq15 to + * IRQ0_VECTOR to IRQ15_VECTOR for all cpu's. 
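+		 * As a sketch of the resulting fixed mapping (the exact
+		 * vector values come from this tree's irq_vectors.h;
+		 * IRQ0_VECTOR conventionally sits at 0x30, though the
+		 * precise value is configuration dependent):
+		 *
+		 *	irq  0 -> IRQ0_VECTOR + 0
+		 *	irq  1 -> IRQ0_VECTOR + 1
+		 *	...
+		 *	irq 15 -> IRQ0_VECTOR + 15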
+ */ + if (i < legacy_pic->nr_legacy_irqs) { + cfg[i].vector = IRQ0_VECTOR + i; + cpumask_setall(cfg[i].domain); + } + } + + return 0; +} + +static struct irq_cfg *irq_cfg(unsigned int irq) +{ + return irq_get_chip_data(irq); +} + +static struct irq_cfg *alloc_irq_cfg(unsigned int irq, int node) +{ + struct irq_cfg *cfg; + + cfg = kzalloc_node(sizeof(*cfg), GFP_KERNEL, node); + if (!cfg) + return NULL; + if (!zalloc_cpumask_var_node(&cfg->domain, GFP_KERNEL, node)) + goto out_cfg; + if (!zalloc_cpumask_var_node(&cfg->old_domain, GFP_KERNEL, node)) + goto out_domain; + return cfg; +out_domain: + free_cpumask_var(cfg->domain); +out_cfg: + kfree(cfg); + return NULL; +} + +static void free_irq_cfg(unsigned int at, struct irq_cfg *cfg) +{ + if (!cfg) + return; + irq_set_chip_data(at, NULL); + free_cpumask_var(cfg->domain); + free_cpumask_var(cfg->old_domain); + kfree(cfg); +} + +static struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node) +{ + int res = irq_alloc_desc_at(at, node); + struct irq_cfg *cfg; + + if (res < 0) { + if (res != -EEXIST) + return NULL; + cfg = irq_get_chip_data(at); + if (cfg) + return cfg; + } + + cfg = alloc_irq_cfg(at, node); + if (cfg) + irq_set_chip_data(at, cfg); + else + irq_free_desc(at); + return cfg; +} + +struct io_apic { + unsigned int index; + unsigned int unused[3]; + unsigned int data; + unsigned int unused2[11]; + unsigned int eoi; +}; + +static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) +{ + return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx) + + (mpc_ioapic_addr(idx) & ~PAGE_MASK); +} + +void io_apic_eoi(unsigned int apic, unsigned int vector) +{ + struct io_apic __iomem *io_apic = io_apic_base(apic); + writel(vector, &io_apic->eoi); +} + +unsigned int native_io_apic_read(unsigned int apic, unsigned int reg) +{ + struct io_apic __iomem *io_apic = io_apic_base(apic); + writel(reg, &io_apic->index); + return readl(&io_apic->data); +} + +void native_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) +{ + struct io_apic __iomem *io_apic = io_apic_base(apic); + + writel(reg, &io_apic->index); + writel(value, &io_apic->data); +} + +/* + * Re-write a value: to be used for read-modify-write + * cycles where the read already set up the index register. + * + * Older SiS APIC requires we rewrite the index register + */ +void native_io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) +{ + struct io_apic __iomem *io_apic = io_apic_base(apic); + + if (sis_apic_bug) + writel(reg, &io_apic->index); + writel(value, &io_apic->data); +} + +union entry_union { + struct { u32 w1, w2; }; + struct IO_APIC_route_entry entry; +}; + +static struct IO_APIC_route_entry __ioapic_read_entry(int apic, int pin) +{ + union entry_union eu; + + eu.w1 = io_apic_read(apic, 0x10 + 2 * pin); + eu.w2 = io_apic_read(apic, 0x11 + 2 * pin); + + return eu.entry; +} + +static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) +{ + union entry_union eu; + unsigned long flags; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + eu.entry = __ioapic_read_entry(apic, pin); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + + return eu.entry; +} + +/* + * When we write a new IO APIC routing entry, we need to write the high + * word first! If the mask bit in the low word is clear, we will enable + * the interrupt, and we need to make sure the entry is fully populated + * before that happens. 
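+ *
+ * Concretely: with the mask bit clear, writing the low word first
+ * would arm the pin while the destination and vector in the high
+ * word were still stale, so an interrupt asserted in that window
+ * could be delivered to a bogus vector.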
+ */ +static void __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) +{ + union entry_union eu = {{0, 0}}; + + eu.entry = e; + io_apic_write(apic, 0x11 + 2*pin, eu.w2); + io_apic_write(apic, 0x10 + 2*pin, eu.w1); +} + +static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + __ioapic_write_entry(apic, pin, e); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); +} + +/* + * When we mask an IO APIC routing entry, we need to write the low + * word first, in order to set the mask bit before we change the + * high bits! + */ +static void ioapic_mask_entry(int apic, int pin) +{ + unsigned long flags; + union entry_union eu = { .entry.mask = 1 }; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(apic, 0x10 + 2*pin, eu.w1); + io_apic_write(apic, 0x11 + 2*pin, eu.w2); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); +} + +/* + * The common case is 1:1 IRQ<->pin mappings. Sometimes there are + * shared ISA-space IRQs, so we have to support them. We are super + * fast in the common case, and fast for shared ISA-space IRQs. + */ +static int __add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) +{ + struct irq_pin_list **last, *entry; + + /* don't allow duplicates */ + last = &cfg->irq_2_pin; + for_each_irq_pin(entry, cfg->irq_2_pin) { + if (entry->apic == apic && entry->pin == pin) + return 0; + last = &entry->next; + } + + entry = alloc_irq_pin_list(node); + if (!entry) { + pr_err("can not alloc irq_pin_list (%d,%d,%d)\n", + node, apic, pin); + return -ENOMEM; + } + entry->apic = apic; + entry->pin = pin; + + *last = entry; + return 0; +} + +static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) +{ + if (__add_pin_to_irq_node(cfg, node, apic, pin)) + panic("IO-APIC: failed to add irq-pin. Can not proceed\n"); +} + +/* + * Reroute an IRQ to a different pin. + */ +static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node, + int oldapic, int oldpin, + int newapic, int newpin) +{ + struct irq_pin_list *entry; + + for_each_irq_pin(entry, cfg->irq_2_pin) { + if (entry->apic == oldapic && entry->pin == oldpin) { + entry->apic = newapic; + entry->pin = newpin; + /* every one is different, right? 
*/ + return; + } + } + + /* old apic/pin didn't exist, so just add new ones */ + add_pin_to_irq_node(cfg, node, newapic, newpin); +} + +static void __io_apic_modify_irq(struct irq_pin_list *entry, + int mask_and, int mask_or, + void (*final)(struct irq_pin_list *entry)) +{ + unsigned int reg, pin; + + pin = entry->pin; + reg = io_apic_read(entry->apic, 0x10 + pin * 2); + reg &= mask_and; + reg |= mask_or; + io_apic_modify(entry->apic, 0x10 + pin * 2, reg); + if (final) + final(entry); +} + +static void io_apic_modify_irq(struct irq_cfg *cfg, + int mask_and, int mask_or, + void (*final)(struct irq_pin_list *entry)) +{ + struct irq_pin_list *entry; + + for_each_irq_pin(entry, cfg->irq_2_pin) + __io_apic_modify_irq(entry, mask_and, mask_or, final); +} + +static void io_apic_sync(struct irq_pin_list *entry) +{ + /* + * Synchronize the IO-APIC and the CPU by doing + * a dummy read from the IO-APIC + */ + struct io_apic __iomem *io_apic; + + io_apic = io_apic_base(entry->apic); + readl(&io_apic->data); +} + +static void mask_ioapic(struct irq_cfg *cfg) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); +} + +static void mask_ioapic_irq(struct irq_data *data) +{ + mask_ioapic(data->chip_data); +} + +static void __unmask_ioapic(struct irq_cfg *cfg) +{ + io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL); +} + +static void unmask_ioapic(struct irq_cfg *cfg) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + __unmask_ioapic(cfg); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); +} + +static void unmask_ioapic_irq(struct irq_data *data) +{ + unmask_ioapic(data->chip_data); +} + +/* + * IO-APIC versions below 0x20 don't support EOI register. + * For the record, here is the information about various versions: + * 0Xh 82489DX + * 1Xh I/OAPIC or I/O(x)APIC which are not PCI 2.2 Compliant + * 2Xh I/O(x)APIC which is PCI 2.2 Compliant + * 30h-FFh Reserved + * + * Some of the Intel ICH Specs (ICH2 to ICH5) documents the io-apic + * version as 0x2. This is an error with documentation and these ICH chips + * use io-apic's of version 0x20. + * + * For IO-APIC's with EOI register, we use that to do an explicit EOI. + * Otherwise, we simulate the EOI message manually by changing the trigger + * mode to edge and then back to level, with RTE being masked during this. + */ +void native_eoi_ioapic_pin(int apic, int pin, int vector) +{ + if (mpc_ioapic_ver(apic) >= 0x20) { + io_apic_eoi(apic, vector); + } else { + struct IO_APIC_route_entry entry, entry1; + + entry = entry1 = __ioapic_read_entry(apic, pin); + + /* + * Mask the entry and change the trigger mode to edge. + */ + entry1.mask = 1; + entry1.trigger = IOAPIC_EDGE; + + __ioapic_write_entry(apic, pin, entry1); + + /* + * Restore the previous level triggered entry. 
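+		 * In other words, the manual-EOI sequence for these
+		 * older parts is two RTE writes: mask+edge (which drops
+		 * the remote-IRR bit) followed by the original entry
+		 * restored verbatim.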
+ */ + __ioapic_write_entry(apic, pin, entry); + } +} + +void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) +{ + struct irq_pin_list *entry; + unsigned long flags; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + for_each_irq_pin(entry, cfg->irq_2_pin) + x86_io_apic_ops.eoi_ioapic_pin(entry->apic, entry->pin, + cfg->vector); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); +} + +static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) +{ + struct IO_APIC_route_entry entry; + + /* Check delivery_mode to be sure we're not clearing an SMI pin */ + entry = ioapic_read_entry(apic, pin); + if (entry.delivery_mode == dest_SMI) + return; + + /* + * Make sure the entry is masked and re-read the contents to check + * if it is a level triggered pin and if the remote-IRR is set. + */ + if (!entry.mask) { + entry.mask = 1; + ioapic_write_entry(apic, pin, entry); + entry = ioapic_read_entry(apic, pin); + } + + if (entry.irr) { + unsigned long flags; + + /* + * Make sure the trigger mode is set to level. Explicit EOI + * doesn't clear the remote-IRR if the trigger mode is not + * set to level. + */ + if (!entry.trigger) { + entry.trigger = IOAPIC_LEVEL; + ioapic_write_entry(apic, pin, entry); + } + + raw_spin_lock_irqsave(&ioapic_lock, flags); + x86_io_apic_ops.eoi_ioapic_pin(apic, pin, entry.vector); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + } + + /* + * Clear the rest of the bits in the IO-APIC RTE except for the mask + * bit. + */ + ioapic_mask_entry(apic, pin); + entry = ioapic_read_entry(apic, pin); + if (entry.irr) + pr_err("Unable to reset IRR for apic: %d, pin :%d\n", + mpc_ioapic_id(apic), pin); +} + +static void clear_IO_APIC (void) +{ + int apic, pin; + + for (apic = 0; apic < nr_ioapics; apic++) + for (pin = 0; pin < ioapics[apic].nr_registers; pin++) + clear_IO_APIC_pin(apic, pin); +} + +#ifdef CONFIG_X86_32 +/* + * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to + * specific CPU-side IRQs. + */ + +#define MAX_PIRQS 8 +static int pirq_entries[MAX_PIRQS] = { + [0 ... MAX_PIRQS - 1] = -1 +}; + +static int __init ioapic_pirq_setup(char *str) +{ + int i, max; + int ints[MAX_PIRQS+1]; + + get_options(str, ARRAY_SIZE(ints), ints); + + apic_printk(APIC_VERBOSE, KERN_INFO + "PIRQ redirection, working around broken MP-BIOS.\n"); + max = MAX_PIRQS; + if (ints[0] < MAX_PIRQS) + max = ints[0]; + + for (i = 0; i < max; i++) { + apic_printk(APIC_VERBOSE, KERN_DEBUG + "... PIRQ%d -> IRQ %d\n", i, ints[i+1]); + /* + * PIRQs are mapped upside down, usually. + */ + pirq_entries[MAX_PIRQS-i-1] = ints[i+1]; + } + return 1; +} + +__setup("pirq=", ioapic_pirq_setup); +#endif /* CONFIG_X86_32 */ + +/* + * Saves all the IO-APIC RTE's + */ +int save_ioapic_entries(void) +{ + int apic, pin; + int err = 0; + + for (apic = 0; apic < nr_ioapics; apic++) { + if (!ioapics[apic].saved_registers) { + err = -ENOMEM; + continue; + } + + for (pin = 0; pin < ioapics[apic].nr_registers; pin++) + ioapics[apic].saved_registers[pin] = + ioapic_read_entry(apic, pin); + } + + return err; +} + +/* + * Mask all IO APIC entries. 
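+ *
+ * Note that this works from the copies made by save_ioapic_entries():
+ * an IO-APIC for which no saved state could be allocated is skipped
+ * rather than masked.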
+ */ +void mask_ioapic_entries(void) +{ + int apic, pin; + + for (apic = 0; apic < nr_ioapics; apic++) { + if (!ioapics[apic].saved_registers) + continue; + + for (pin = 0; pin < ioapics[apic].nr_registers; pin++) { + struct IO_APIC_route_entry entry; + + entry = ioapics[apic].saved_registers[pin]; + if (!entry.mask) { + entry.mask = 1; + ioapic_write_entry(apic, pin, entry); + } + } + } +} + +/* + * Restore IO APIC entries which was saved in the ioapic structure. + */ +int restore_ioapic_entries(void) +{ + int apic, pin; + + for (apic = 0; apic < nr_ioapics; apic++) { + if (!ioapics[apic].saved_registers) + continue; + + for (pin = 0; pin < ioapics[apic].nr_registers; pin++) + ioapic_write_entry(apic, pin, + ioapics[apic].saved_registers[pin]); + } + return 0; +} + +/* + * Find the IRQ entry number of a certain pin. + */ +static int find_irq_entry(int ioapic_idx, int pin, int type) +{ + int i; + + for (i = 0; i < mp_irq_entries; i++) + if (mp_irqs[i].irqtype == type && + (mp_irqs[i].dstapic == mpc_ioapic_id(ioapic_idx) || + mp_irqs[i].dstapic == MP_APIC_ALL) && + mp_irqs[i].dstirq == pin) + return i; + + return -1; +} + +/* + * Find the pin to which IRQ[irq] (ISA) is connected + */ +static int __init find_isa_irq_pin(int irq, int type) +{ + int i; + + for (i = 0; i < mp_irq_entries; i++) { + int lbus = mp_irqs[i].srcbus; + + if (test_bit(lbus, mp_bus_not_pci) && + (mp_irqs[i].irqtype == type) && + (mp_irqs[i].srcbusirq == irq)) + + return mp_irqs[i].dstirq; + } + return -1; +} + +static int __init find_isa_irq_apic(int irq, int type) +{ + int i; + + for (i = 0; i < mp_irq_entries; i++) { + int lbus = mp_irqs[i].srcbus; + + if (test_bit(lbus, mp_bus_not_pci) && + (mp_irqs[i].irqtype == type) && + (mp_irqs[i].srcbusirq == irq)) + break; + } + + if (i < mp_irq_entries) { + int ioapic_idx; + + for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) + if (mpc_ioapic_id(ioapic_idx) == mp_irqs[i].dstapic) + return ioapic_idx; + } + + return -1; +} + +#ifdef CONFIG_EISA +/* + * EISA Edge/Level control register, ELCR + */ +static int EISA_ELCR(unsigned int irq) +{ + if (irq < legacy_pic->nr_legacy_irqs) { + unsigned int port = 0x4d0 + (irq >> 3); + return (inb(port) >> (irq & 7)) & 1; + } + apic_printk(APIC_VERBOSE, KERN_INFO + "Broken MPtable reports ISA irq %d\n", irq); + return 0; +} + +#endif + +/* ISA interrupts are always polarity zero edge triggered, + * when listed as conforming in the MP table. */ + +#define default_ISA_trigger(idx) (0) +#define default_ISA_polarity(idx) (0) + +/* EISA interrupts are always polarity zero and can be edge or level + * trigger depending on the ELCR value. If an interrupt is listed as + * EISA conforming in the MP table, that means its trigger type must + * be read in from the ELCR */ + +#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].srcbusirq)) +#define default_EISA_polarity(idx) default_ISA_polarity(idx) + +/* PCI interrupts are always polarity one level triggered, + * when listed as conforming in the MP table. */ + +#define default_PCI_trigger(idx) (1) +#define default_PCI_polarity(idx) (1) + +static int irq_polarity(int idx) +{ + int bus = mp_irqs[idx].srcbus; + int polarity; + + /* + * Determine IRQ line polarity (high active or low active): + */ + switch (mp_irqs[idx].irqflag & 3) + { + case 0: /* conforms, ie. 
bus-type dependent polarity */ + if (test_bit(bus, mp_bus_not_pci)) + polarity = default_ISA_polarity(idx); + else + polarity = default_PCI_polarity(idx); + break; + case 1: /* high active */ + { + polarity = 0; + break; + } + case 2: /* reserved */ + { + pr_warn("broken BIOS!!\n"); + polarity = 1; + break; + } + case 3: /* low active */ + { + polarity = 1; + break; + } + default: /* invalid */ + { + pr_warn("broken BIOS!!\n"); + polarity = 1; + break; + } + } + return polarity; +} + +static int irq_trigger(int idx) +{ + int bus = mp_irqs[idx].srcbus; + int trigger; + + /* + * Determine IRQ trigger mode (edge or level sensitive): + */ + switch ((mp_irqs[idx].irqflag>>2) & 3) + { + case 0: /* conforms, ie. bus-type dependent */ + if (test_bit(bus, mp_bus_not_pci)) + trigger = default_ISA_trigger(idx); + else + trigger = default_PCI_trigger(idx); +#ifdef CONFIG_EISA + switch (mp_bus_id_to_type[bus]) { + case MP_BUS_ISA: /* ISA pin */ + { + /* set before the switch */ + break; + } + case MP_BUS_EISA: /* EISA pin */ + { + trigger = default_EISA_trigger(idx); + break; + } + case MP_BUS_PCI: /* PCI pin */ + { + /* set before the switch */ + break; + } + default: + { + pr_warn("broken BIOS!!\n"); + trigger = 1; + break; + } + } +#endif + break; + case 1: /* edge */ + { + trigger = 0; + break; + } + case 2: /* reserved */ + { + pr_warn("broken BIOS!!\n"); + trigger = 1; + break; + } + case 3: /* level */ + { + trigger = 1; + break; + } + default: /* invalid */ + { + pr_warn("broken BIOS!!\n"); + trigger = 0; + break; + } + } + return trigger; +} + +static int pin_2_irq(int idx, int apic, int pin) +{ + int irq; + int bus = mp_irqs[idx].srcbus; + struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(apic); + + /* + * Debugging check, we are in big trouble if this message pops up! + */ + if (mp_irqs[idx].dstirq != pin) + pr_err("broken BIOS or MPTABLE parser, ayiee!!\n"); + + if (test_bit(bus, mp_bus_not_pci)) { + irq = mp_irqs[idx].srcbusirq; + } else { + u32 gsi = gsi_cfg->gsi_base + pin; + + if (gsi >= NR_IRQS_LEGACY) + irq = gsi; + else + irq = gsi_top + gsi; + } + +#ifdef CONFIG_X86_32 + /* + * PCI IRQ command line redirection. Yes, limits are hardcoded. + */ + if ((pin >= 16) && (pin <= 23)) { + if (pirq_entries[pin-16] != -1) { + if (!pirq_entries[pin-16]) { + apic_printk(APIC_VERBOSE, KERN_DEBUG + "disabling PIRQ%d\n", pin-16); + } else { + irq = pirq_entries[pin-16]; + apic_printk(APIC_VERBOSE, KERN_DEBUG + "using PIRQ%d -> IRQ %d\n", + pin-16, irq); + } + } + } +#endif + + return irq; +} + +/* + * Find a specific PCI IRQ entry. 
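+ * A hypothetical caller resolving INTA (pin 0 in this encoding) of
+ * the device in slot 0x1f on bus 0 might look like (values made up
+ * for illustration):
+ *
+ *	struct io_apic_irq_attr attr;
+ *	int irq = IO_APIC_get_PCI_irq_vector(0, 0x1f, 0, &attr);
+ *
+ *	if (irq >= 0)
+ *		io_apic_setup_irq_pin_once(irq, cpu_to_node(0), &attr);
+ *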
+ * Not an __init, possibly needed by modules
+ */
+int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin,
+				struct io_apic_irq_attr *irq_attr)
+{
+	int ioapic_idx, i, best_guess = -1;
+
+	apic_printk(APIC_DEBUG,
+		    "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
+		    bus, slot, pin);
+	if (test_bit(bus, mp_bus_not_pci)) {
+		apic_printk(APIC_VERBOSE,
+			    "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
+		return -1;
+	}
+	for (i = 0; i < mp_irq_entries; i++) {
+		int lbus = mp_irqs[i].srcbus;
+
+		for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++)
+			if (mpc_ioapic_id(ioapic_idx) == mp_irqs[i].dstapic ||
+			    mp_irqs[i].dstapic == MP_APIC_ALL)
+				break;
+
+		if (!test_bit(lbus, mp_bus_not_pci) &&
+		    !mp_irqs[i].irqtype &&
+		    (bus == lbus) &&
+		    (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) {
+			int irq = pin_2_irq(i, ioapic_idx, mp_irqs[i].dstirq);
+
+			if (!(ioapic_idx || IO_APIC_IRQ(irq)))
+				continue;
+
+			if (pin == (mp_irqs[i].srcbusirq & 3)) {
+				set_io_apic_irq_attr(irq_attr, ioapic_idx,
+						     mp_irqs[i].dstirq,
+						     irq_trigger(i),
+						     irq_polarity(i));
+				return irq;
+			}
+			/*
+			 * Use the first all-but-pin matching entry as a
+			 * best-guess fuzzy result for broken mptables.
+			 */
+			if (best_guess < 0) {
+				set_io_apic_irq_attr(irq_attr, ioapic_idx,
+						     mp_irqs[i].dstirq,
+						     irq_trigger(i),
+						     irq_polarity(i));
+				best_guess = irq;
+			}
+		}
+	}
+	return best_guess;
+}
+EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
+
+void lock_vector_lock(void)
+{
+	/* Used to ensure that the online set of cpus does not change
+	 * during assign_irq_vector.
+	 */
+	raw_spin_lock(&vector_lock);
+}
+
+void unlock_vector_lock(void)
+{
+	raw_spin_unlock(&vector_lock);
+}
+
+static int
+__assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
+{
+	/*
+	 * NOTE! The local APIC isn't very good at handling
+	 * multiple interrupts at the same interrupt level.
+	 * As the interrupt level is determined by taking the
+	 * vector number and shifting that right by 4, we
+	 * want to spread these out a bit so that they don't
+	 * all fall in the same interrupt level.
+	 *
+	 * Also, we've got to be careful not to trash gate
+	 * 0x80, because int 0x80 is hm, kind of importantish. ;)
+	 */
+	static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START;
+	static int current_offset = VECTOR_OFFSET_START % 16;
+	int cpu, err;
+	cpumask_var_t tmp_mask;
+
+	if (cfg->move_in_progress)
+		return -EBUSY;
+
+	if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC))
+		return -ENOMEM;
+
+	/* Only try to allocate irqs on cpus that are present */
+	err = -ENOSPC;
+	cpumask_clear(cfg->old_domain);
+	cpu = cpumask_first_and(mask, cpu_online_mask);
+	while (cpu < nr_cpu_ids) {
+		int new_cpu, vector, offset;
+
+		apic->vector_allocation_domain(cpu, tmp_mask, mask);
+
+		if (cpumask_subset(tmp_mask, cfg->domain)) {
+			err = 0;
+			if (cpumask_equal(tmp_mask, cfg->domain))
+				break;
+			/*
+			 * The new cpumask using the vector is a proper
+			 * subset of the current in-use mask, so clean up
+			 * the vector allocation for the members that are
+			 * no longer used.
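+			 * E.g. if cfg->domain was {0,1,2,3} and the new
+			 * tmp_mask is {0,1}, cpus 2 and 3 are parked in
+			 * old_domain here so that their stale vector can
+			 * be reclaimed by the cleanup IPI later.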
+ */ + cpumask_andnot(cfg->old_domain, cfg->domain, tmp_mask); + cfg->move_in_progress = + cpumask_intersects(cfg->old_domain, cpu_online_mask); + cpumask_and(cfg->domain, cfg->domain, tmp_mask); + break; + } + + vector = current_vector; + offset = current_offset; +next: + vector += 16; + if (vector >= first_system_vector) { + offset = (offset + 1) % 16; + vector = FIRST_EXTERNAL_VECTOR + offset; + } + + if (unlikely(current_vector == vector)) { + cpumask_or(cfg->old_domain, cfg->old_domain, tmp_mask); + cpumask_andnot(tmp_mask, mask, cfg->old_domain); + cpu = cpumask_first_and(tmp_mask, cpu_online_mask); + continue; + } + + if (test_bit(vector, used_vectors)) + goto next; + + for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) { + if (per_cpu(vector_irq, new_cpu)[vector] > VECTOR_UNDEFINED) + goto next; + } + /* Found one! */ + current_vector = vector; + current_offset = offset; + if (cfg->vector) { + cpumask_copy(cfg->old_domain, cfg->domain); + cfg->move_in_progress = + cpumask_intersects(cfg->old_domain, cpu_online_mask); + } + for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) + per_cpu(vector_irq, new_cpu)[vector] = irq; + cfg->vector = vector; + cpumask_copy(cfg->domain, tmp_mask); + err = 0; + break; + } + free_cpumask_var(tmp_mask); + return err; +} + +int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) +{ + int err; + unsigned long flags; + + raw_spin_lock_irqsave(&vector_lock, flags); + err = __assign_irq_vector(irq, cfg, mask); + raw_spin_unlock_irqrestore(&vector_lock, flags); + return err; +} + +static void __clear_irq_vector(int irq, struct irq_cfg *cfg) +{ + int cpu, vector; + + BUG_ON(!cfg->vector); + + vector = cfg->vector; + for_each_cpu_and(cpu, cfg->domain, cpu_online_mask) + per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED; + + cfg->vector = 0; + cpumask_clear(cfg->domain); + + if (likely(!cfg->move_in_progress)) + return; + for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) { + for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { + if (per_cpu(vector_irq, cpu)[vector] != irq) + continue; + per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED; + break; + } + } + cfg->move_in_progress = 0; +} + +void __setup_vector_irq(int cpu) +{ + /* Initialize vector_irq on a new cpu */ + int irq, vector; + struct irq_cfg *cfg; + + /* + * vector_lock will make sure that we don't run into irq vector + * assignments that might be happening on another cpu in parallel, + * while we setup our initial vector to irq mappings. 
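+	 * The two passes below then (1) claim, on this cpu, every
+	 * vector whose irq domain already includes us, and (2) scrub
+	 * any leftover vector_irq slot whose irq no longer targets
+	 * this cpu.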
+ */ + raw_spin_lock(&vector_lock); + /* Mark the inuse vectors */ + for_each_active_irq(irq) { + cfg = irq_get_chip_data(irq); + if (!cfg) + continue; + + if (!cpumask_test_cpu(cpu, cfg->domain)) + continue; + vector = cfg->vector; + per_cpu(vector_irq, cpu)[vector] = irq; + } + /* Mark the free vectors */ + for (vector = 0; vector < NR_VECTORS; ++vector) { + irq = per_cpu(vector_irq, cpu)[vector]; + if (irq <= VECTOR_UNDEFINED) + continue; + + cfg = irq_cfg(irq); + if (!cpumask_test_cpu(cpu, cfg->domain)) + per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED; + } + raw_spin_unlock(&vector_lock); +} + +static struct irq_chip ioapic_chip; + +#ifdef CONFIG_X86_32 +static inline int IO_APIC_irq_trigger(int irq) +{ + int apic, idx, pin; + + for (apic = 0; apic < nr_ioapics; apic++) { + for (pin = 0; pin < ioapics[apic].nr_registers; pin++) { + idx = find_irq_entry(apic, pin, mp_INT); + if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin))) + return irq_trigger(idx); + } + } + /* + * nonexistent IRQs are edge default + */ + return 0; +} +#else +static inline int IO_APIC_irq_trigger(int irq) +{ + return 1; +} +#endif + +static void ioapic_register_intr(unsigned int irq, struct irq_cfg *cfg, + unsigned long trigger) +{ + struct irq_chip *chip = &ioapic_chip; + irq_flow_handler_t hdl; + bool fasteoi; + + if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || + trigger == IOAPIC_LEVEL) { + irq_set_status_flags(irq, IRQ_LEVEL); + fasteoi = true; + } else { + irq_clear_status_flags(irq, IRQ_LEVEL); + fasteoi = false; + } + + if (setup_remapped_irq(irq, cfg, chip)) + fasteoi = trigger != 0; + + hdl = fasteoi ? handle_fasteoi_irq : handle_edge_irq; + irq_set_chip_and_handler_name(irq, chip, hdl, + fasteoi ? "fasteoi" : "edge"); +} + +int native_setup_ioapic_entry(int irq, struct IO_APIC_route_entry *entry, + unsigned int destination, int vector, + struct io_apic_irq_attr *attr) +{ + memset(entry, 0, sizeof(*entry)); + + entry->delivery_mode = apic->irq_delivery_mode; + entry->dest_mode = apic->irq_dest_mode; + entry->dest = destination; + entry->vector = vector; + entry->mask = 0; /* enable IRQ */ + entry->trigger = attr->trigger; + entry->polarity = attr->polarity; + + /* + * Mask level triggered irqs. + * Use IRQ_DELAYED_DISABLE for edge triggered irqs. 
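+	 * A level-triggered RTE is therefore handed back masked and is
+	 * left for the irq startup path (startup_ioapic_irq()) to
+	 * unmask later.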
+ */ + if (attr->trigger) + entry->mask = 1; + + return 0; +} + +static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg, + struct io_apic_irq_attr *attr) +{ + struct IO_APIC_route_entry entry; + unsigned int dest; + + if (!IO_APIC_IRQ(irq)) + return; + + if (assign_irq_vector(irq, cfg, apic->target_cpus())) + return; + + if (apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus(), + &dest)) { + pr_warn("Failed to obtain apicid for ioapic %d, pin %d\n", + mpc_ioapic_id(attr->ioapic), attr->ioapic_pin); + __clear_irq_vector(irq, cfg); + + return; + } + + apic_printk(APIC_VERBOSE,KERN_DEBUG + "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " + "IRQ %d Mode:%i Active:%i Dest:%d)\n", + attr->ioapic, mpc_ioapic_id(attr->ioapic), attr->ioapic_pin, + cfg->vector, irq, attr->trigger, attr->polarity, dest); + + if (x86_io_apic_ops.setup_entry(irq, &entry, dest, cfg->vector, attr)) { + pr_warn("Failed to setup ioapic entry for ioapic %d, pin %d\n", + mpc_ioapic_id(attr->ioapic), attr->ioapic_pin); + __clear_irq_vector(irq, cfg); + + return; + } + + ioapic_register_intr(irq, cfg, attr->trigger); + if (irq < legacy_pic->nr_legacy_irqs) + legacy_pic->mask(irq); + + ioapic_write_entry(attr->ioapic, attr->ioapic_pin, entry); +} + +static bool __init io_apic_pin_not_connected(int idx, int ioapic_idx, int pin) +{ + if (idx != -1) + return false; + + apic_printk(APIC_VERBOSE, KERN_DEBUG " apic %d pin %d not connected\n", + mpc_ioapic_id(ioapic_idx), pin); + return true; +} + +static void __init __io_apic_setup_irqs(unsigned int ioapic_idx) +{ + int idx, node = cpu_to_node(0); + struct io_apic_irq_attr attr; + unsigned int pin, irq; + + for (pin = 0; pin < ioapics[ioapic_idx].nr_registers; pin++) { + idx = find_irq_entry(ioapic_idx, pin, mp_INT); + if (io_apic_pin_not_connected(idx, ioapic_idx, pin)) + continue; + + irq = pin_2_irq(idx, ioapic_idx, pin); + + if ((ioapic_idx > 0) && (irq > 16)) + continue; + + /* + * Skip the timer IRQ if there's a quirk handler + * installed and if it returns 1: + */ + if (apic->multi_timer_check && + apic->multi_timer_check(ioapic_idx, irq)) + continue; + + set_io_apic_irq_attr(&attr, ioapic_idx, pin, irq_trigger(idx), + irq_polarity(idx)); + + io_apic_setup_irq_pin(irq, node, &attr); + } +} + +static void __init setup_IO_APIC_irqs(void) +{ + unsigned int ioapic_idx; + + apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); + + for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) + __io_apic_setup_irqs(ioapic_idx); +} + +/* + * for the gsit that is not in first ioapic + * but could not use acpi_register_gsi() + * like some special sci in IBM x3330 + */ +void setup_IO_APIC_irq_extra(u32 gsi) +{ + int ioapic_idx = 0, pin, idx, irq, node = cpu_to_node(0); + struct io_apic_irq_attr attr; + + /* + * Convert 'gsi' to 'ioapic.pin'. + */ + ioapic_idx = mp_find_ioapic(gsi); + if (ioapic_idx < 0) + return; + + pin = mp_find_ioapic_pin(ioapic_idx, gsi); + idx = find_irq_entry(ioapic_idx, pin, mp_INT); + if (idx == -1) + return; + + irq = pin_2_irq(idx, ioapic_idx, pin); + + /* Only handle the non legacy irqs on secondary ioapics */ + if (ioapic_idx == 0 || irq < NR_IRQS_LEGACY) + return; + + set_io_apic_irq_attr(&attr, ioapic_idx, pin, irq_trigger(idx), + irq_polarity(idx)); + + io_apic_setup_irq_pin_once(irq, node, &attr); +} + +/* + * Set up the timer pin, possibly with the 8259A-master behind. 
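+ * ("Behind" as in a virtual-wire setup, where the 8259A's INT output
+ * feeds this IO-APIC pin rather than the PIT driving it directly.)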
+ */ +static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx, + unsigned int pin, int vector) +{ + struct IO_APIC_route_entry entry; + unsigned int dest; + + memset(&entry, 0, sizeof(entry)); + + /* + * We use logical delivery to get the timer IRQ + * to the first CPU. + */ + if (unlikely(apic->cpu_mask_to_apicid_and(apic->target_cpus(), + apic->target_cpus(), &dest))) + dest = BAD_APICID; + + entry.dest_mode = apic->irq_dest_mode; + entry.mask = 0; /* don't mask IRQ for edge */ + entry.dest = dest; + entry.delivery_mode = apic->irq_delivery_mode; + entry.polarity = 0; + entry.trigger = 0; + entry.vector = vector; + + /* + * The timer IRQ doesn't have to know that behind the + * scene we may have a 8259A-master in AEOI mode ... + */ + irq_set_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, + "edge"); + + /* + * Add it to the IO-APIC irq-routing table: + */ + ioapic_write_entry(ioapic_idx, pin, entry); +} + +void native_io_apic_print_entries(unsigned int apic, unsigned int nr_entries) +{ + int i; + + pr_debug(" NR Dst Mask Trig IRR Pol Stat Dmod Deli Vect:\n"); + + for (i = 0; i <= nr_entries; i++) { + struct IO_APIC_route_entry entry; + + entry = ioapic_read_entry(apic, i); + + pr_debug(" %02x %02X ", i, entry.dest); + pr_cont("%1d %1d %1d %1d %1d " + "%1d %1d %02X\n", + entry.mask, + entry.trigger, + entry.irr, + entry.polarity, + entry.delivery_status, + entry.dest_mode, + entry.delivery_mode, + entry.vector); + } +} + +void intel_ir_io_apic_print_entries(unsigned int apic, + unsigned int nr_entries) +{ + int i; + + pr_debug(" NR Indx Fmt Mask Trig IRR Pol Stat Indx2 Zero Vect:\n"); + + for (i = 0; i <= nr_entries; i++) { + struct IR_IO_APIC_route_entry *ir_entry; + struct IO_APIC_route_entry entry; + + entry = ioapic_read_entry(apic, i); + + ir_entry = (struct IR_IO_APIC_route_entry *)&entry; + + pr_debug(" %02x %04X ", i, ir_entry->index); + pr_cont("%1d %1d %1d %1d %1d " + "%1d %1d %X %02X\n", + ir_entry->format, + ir_entry->mask, + ir_entry->trigger, + ir_entry->irr, + ir_entry->polarity, + ir_entry->delivery_status, + ir_entry->index2, + ir_entry->zero, + ir_entry->vector); + } +} + +void ioapic_zap_locks(void) +{ + raw_spin_lock_init(&ioapic_lock); +} + +__apicdebuginit(void) print_IO_APIC(int ioapic_idx) +{ + union IO_APIC_reg_00 reg_00; + union IO_APIC_reg_01 reg_01; + union IO_APIC_reg_02 reg_02; + union IO_APIC_reg_03 reg_03; + unsigned long flags; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(ioapic_idx, 0); + reg_01.raw = io_apic_read(ioapic_idx, 1); + if (reg_01.bits.version >= 0x10) + reg_02.raw = io_apic_read(ioapic_idx, 2); + if (reg_01.bits.version >= 0x20) + reg_03.raw = io_apic_read(ioapic_idx, 3); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + + printk(KERN_DEBUG "IO APIC #%d......\n", mpc_ioapic_id(ioapic_idx)); + printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); + printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); + printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); + printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS); + + printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)®_01); + printk(KERN_DEBUG "....... : max redirection entries: %02X\n", + reg_01.bits.entries); + + printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); + printk(KERN_DEBUG "....... : IO APIC version: %02X\n", + reg_01.bits.version); + + /* + * Some Intel chipsets with IO APIC VERSION of 0x1? 
don't have reg_02, + * but the value of reg_02 is read as the previous read register + * value, so ignore it if reg_02 == reg_01. + */ + if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) { + printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); + printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); + } + + /* + * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02 + * or reg_03, but the value of reg_0[23] is read as the previous read + * register value, so ignore it if reg_03 == reg_0[12]. + */ + if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw && + reg_03.raw != reg_01.raw) { + printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw); + printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT); + } + + printk(KERN_DEBUG ".... IRQ redirection table:\n"); + + x86_io_apic_ops.print_entries(ioapic_idx, reg_01.bits.entries); +} + +__apicdebuginit(void) print_IO_APICs(void) +{ + int ioapic_idx; + struct irq_cfg *cfg; + unsigned int irq; + struct irq_chip *chip; + + printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); + for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) + printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", + mpc_ioapic_id(ioapic_idx), + ioapics[ioapic_idx].nr_registers); + + /* + * We are a bit conservative about what we expect. We have to + * know about every hardware change ASAP. + */ + printk(KERN_INFO "testing the IO APIC.......................\n"); + + for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) + print_IO_APIC(ioapic_idx); + + printk(KERN_DEBUG "IRQ to pin mappings:\n"); + for_each_active_irq(irq) { + struct irq_pin_list *entry; + + chip = irq_get_chip(irq); + if (chip != &ioapic_chip) + continue; + + cfg = irq_get_chip_data(irq); + if (!cfg) + continue; + entry = cfg->irq_2_pin; + if (!entry) + continue; + printk(KERN_DEBUG "IRQ%d ", irq); + for_each_irq_pin(entry, cfg->irq_2_pin) + pr_cont("-> %d:%d", entry->apic, entry->pin); + pr_cont("\n"); + } + + printk(KERN_INFO ".................................... done.\n"); +} + +__apicdebuginit(void) print_APIC_field(int base) +{ + int i; + + printk(KERN_DEBUG); + + for (i = 0; i < 8; i++) + pr_cont("%08x", apic_read(base + i*0x10)); + + pr_cont("\n"); +} + +__apicdebuginit(void) print_local_APIC(void *dummy) +{ + unsigned int i, v, ver, maxlvt; + u64 icr; + + printk(KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", + smp_processor_id(), hard_smp_processor_id()); + v = apic_read(APIC_ID); + printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, read_apic_id()); + v = apic_read(APIC_LVR); + printk(KERN_INFO "... APIC VERSION: %08x\n", v); + ver = GET_APIC_VERSION(v); + maxlvt = lapic_get_maxlvt(); + + v = apic_read(APIC_TASKPRI); + printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK); + + if (APIC_INTEGRATED(ver)) { /* !82489DX */ + if (!APIC_XAPIC(ver)) { + v = apic_read(APIC_ARBPRI); + printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v, + v & APIC_ARBPRI_MASK); + } + v = apic_read(APIC_PROCPRI); + printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v); + } + + /* + * Remote read supported only in the 82489DX and local APIC for + * Pentium processors. + */ + if (!APIC_INTEGRATED(ver) || maxlvt == 3) { + v = apic_read(APIC_RRR); + printk(KERN_DEBUG "... APIC RRR: %08x\n", v); + } + + v = apic_read(APIC_LDR); + printk(KERN_DEBUG "... APIC LDR: %08x\n", v); + if (!x2apic_enabled()) { + v = apic_read(APIC_DFR); + printk(KERN_DEBUG "... 
APIC DFR: %08x\n", v); + } + v = apic_read(APIC_SPIV); + printk(KERN_DEBUG "... APIC SPIV: %08x\n", v); + + printk(KERN_DEBUG "... APIC ISR field:\n"); + print_APIC_field(APIC_ISR); + printk(KERN_DEBUG "... APIC TMR field:\n"); + print_APIC_field(APIC_TMR); + printk(KERN_DEBUG "... APIC IRR field:\n"); + print_APIC_field(APIC_IRR); + + if (APIC_INTEGRATED(ver)) { /* !82489DX */ + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ + apic_write(APIC_ESR, 0); + + v = apic_read(APIC_ESR); + printk(KERN_DEBUG "... APIC ESR: %08x\n", v); + } + + icr = apic_icr_read(); + printk(KERN_DEBUG "... APIC ICR: %08x\n", (u32)icr); + printk(KERN_DEBUG "... APIC ICR2: %08x\n", (u32)(icr >> 32)); + + v = apic_read(APIC_LVTT); + printk(KERN_DEBUG "... APIC LVTT: %08x\n", v); + + if (maxlvt > 3) { /* PC is LVT#4. */ + v = apic_read(APIC_LVTPC); + printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v); + } + v = apic_read(APIC_LVT0); + printk(KERN_DEBUG "... APIC LVT0: %08x\n", v); + v = apic_read(APIC_LVT1); + printk(KERN_DEBUG "... APIC LVT1: %08x\n", v); + + if (maxlvt > 2) { /* ERR is LVT#3. */ + v = apic_read(APIC_LVTERR); + printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v); + } + + v = apic_read(APIC_TMICT); + printk(KERN_DEBUG "... APIC TMICT: %08x\n", v); + v = apic_read(APIC_TMCCT); + printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v); + v = apic_read(APIC_TDCR); + printk(KERN_DEBUG "... APIC TDCR: %08x\n", v); + + if (boot_cpu_has(X86_FEATURE_EXTAPIC)) { + v = apic_read(APIC_EFEAT); + maxlvt = (v >> 16) & 0xff; + printk(KERN_DEBUG "... APIC EFEAT: %08x\n", v); + v = apic_read(APIC_ECTRL); + printk(KERN_DEBUG "... APIC ECTRL: %08x\n", v); + for (i = 0; i < maxlvt; i++) { + v = apic_read(APIC_EILVTn(i)); + printk(KERN_DEBUG "... APIC EILVT%d: %08x\n", i, v); + } + } + pr_cont("\n"); +} + +__apicdebuginit(void) print_local_APICs(int maxcpu) +{ + int cpu; + + if (!maxcpu) + return; + + preempt_disable(); + for_each_online_cpu(cpu) { + if (cpu >= maxcpu) + break; + smp_call_function_single(cpu, print_local_APIC, NULL, 1); + } + preempt_enable(); +} + +__apicdebuginit(void) print_PIC(void) +{ + unsigned int v; + unsigned long flags; + + if (!legacy_pic->nr_legacy_irqs) + return; + + printk(KERN_DEBUG "\nprinting PIC contents\n"); + + raw_spin_lock_irqsave(&i8259A_lock, flags); + + v = inb(0xa1) << 8 | inb(0x21); + printk(KERN_DEBUG "... PIC IMR: %04x\n", v); + + v = inb(0xa0) << 8 | inb(0x20); + printk(KERN_DEBUG "... PIC IRR: %04x\n", v); + + outb(0x0b,0xa0); + outb(0x0b,0x20); + v = inb(0xa0) << 8 | inb(0x20); + outb(0x0a,0xa0); + outb(0x0a,0x20); + + raw_spin_unlock_irqrestore(&i8259A_lock, flags); + + printk(KERN_DEBUG "... PIC ISR: %04x\n", v); + + v = inb(0x4d1) << 8 | inb(0x4d0); + printk(KERN_DEBUG "... 
PIC ELCR: %04x\n", v);
+}
+
+static int __initdata show_lapic = 1;
+static __init int setup_show_lapic(char *arg)
+{
+	int num = -1;
+
+	if (strcmp(arg, "all") == 0) {
+		show_lapic = CONFIG_NR_CPUS;
+	} else {
+		get_option(&arg, &num);
+		if (num >= 0)
+			show_lapic = num;
+	}
+
+	return 1;
+}
+__setup("show_lapic=", setup_show_lapic);
+
+__apicdebuginit(int) print_ICs(void)
+{
+	if (apic_verbosity == APIC_QUIET)
+		return 0;
+
+	print_PIC();
+
+	/* don't print out if the apic is not there */
+	if (!cpu_has_apic && !apic_from_smp_config())
+		return 0;
+
+	print_local_APICs(show_lapic);
+	print_IO_APICs();
+
+	return 0;
+}
+
+late_initcall(print_ICs);
+
+
+/* Where, if anywhere, the i8259 is connected in ExtINT mode */
+static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
+
+void __init enable_IO_APIC(void)
+{
+	int i8259_apic, i8259_pin;
+	int apic;
+
+	if (!legacy_pic->nr_legacy_irqs)
+		return;
+
+	for (apic = 0; apic < nr_ioapics; apic++) {
+		int pin;
+		/* See if any of the pins is in ExtINT mode */
+		for (pin = 0; pin < ioapics[apic].nr_registers; pin++) {
+			struct IO_APIC_route_entry entry;
+			entry = ioapic_read_entry(apic, pin);
+
+			/* If the interrupt line is enabled and in ExtINT
+			 * mode, we have found the pin where the i8259 is
+			 * connected.
+			 */
+			if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) {
+				ioapic_i8259.apic = apic;
+				ioapic_i8259.pin = pin;
+				goto found_i8259;
+			}
+		}
+	}
+found_i8259:
+	/* See if the MP table has reported the ExtINT */
+	/* If we could not find the appropriate pin by looking at the ioapic,
+	 * the i8259 is probably not connected to the ioapic, but give the
+	 * mptable a chance anyway.
+	 */
+	i8259_pin = find_isa_irq_pin(0, mp_ExtINT);
+	i8259_apic = find_isa_irq_apic(0, mp_ExtINT);
+	/* Trust the MP table if nothing is set up in the hardware */
+	if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) {
+		printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n");
+		ioapic_i8259.pin = i8259_pin;
+		ioapic_i8259.apic = i8259_apic;
+	}
+	/* Complain if the MP table and the hardware disagree */
+	if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) &&
+	    (i8259_pin >= 0) && (ioapic_i8259.pin >= 0))
+		printk(KERN_WARNING "ExtINT in hardware and MP table differ\n");
+
+	/*
+	 * Do not trust the IO-APIC being empty at bootup
+	 */
+	clear_IO_APIC();
+}
+
+void native_disable_io_apic(void)
+{
+	/*
+	 * If the i8259 is routed through an IOAPIC, put that IOAPIC in
+	 * virtual wire mode so that legacy interrupts can be delivered.
+	 */
+	if (ioapic_i8259.pin != -1) {
+		struct IO_APIC_route_entry entry;
+
+		memset(&entry, 0, sizeof(entry));
+		entry.mask = 0; /* Enabled */
+		entry.trigger = 0; /* Edge */
+		entry.irr = 0;
+		entry.polarity = 0; /* High */
+		entry.delivery_status = 0;
+		entry.dest_mode = 0; /* Physical */
+		entry.delivery_mode = dest_ExtINT; /* ExtInt */
+		entry.vector = 0;
+		entry.dest = read_apic_id();
+
+		/*
+		 * Add it to the IO-APIC irq-routing table:
+		 */
+		ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry);
+	}
+
+	if (cpu_has_apic || apic_from_smp_config())
+		disconnect_bsp_APIC(ioapic_i8259.pin != -1);
+}
+
+/*
+ * Not an __init, needed by the reboot code
+ */
+void disable_IO_APIC(void)
+{
+	/*
+	 * Clear the IO-APIC before rebooting:
+	 */
+	clear_IO_APIC();
+
+	if (!legacy_pic->nr_legacy_irqs)
+		return;
+
+	x86_io_apic_ops.disable();
+}
+
+#ifdef CONFIG_X86_32
+/*
+ * Set the IO-APIC physical IDs based on the
+ * values stored in the MPC table.
+ * + * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999 + */ +void __init setup_ioapic_ids_from_mpc_nocheck(void) +{ + union IO_APIC_reg_00 reg_00; + physid_mask_t phys_id_present_map; + int ioapic_idx; + int i; + unsigned char old_id; + unsigned long flags; + + /* + * This is broken; anything with a real cpu count has to + * circumvent this idiocy regardless. + */ + apic->ioapic_phys_id_map(&phys_cpu_present_map, &phys_id_present_map); + + /* + * Set the IOAPIC ID to the value stored in the MPC table. + */ + for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) { + /* Read the register 0 value */ + raw_spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(ioapic_idx, 0); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + + old_id = mpc_ioapic_id(ioapic_idx); + + if (mpc_ioapic_id(ioapic_idx) >= get_physical_broadcast()) { + printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n", + ioapic_idx, mpc_ioapic_id(ioapic_idx)); + printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", + reg_00.bits.ID); + ioapics[ioapic_idx].mp_config.apicid = reg_00.bits.ID; + } + + /* + * Sanity check, is the ID really free? Every APIC in a + * system must have a unique ID or we get lots of nice + * 'stuck on smp_invalidate_needed IPI wait' messages. + */ + if (apic->check_apicid_used(&phys_id_present_map, + mpc_ioapic_id(ioapic_idx))) { + printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", + ioapic_idx, mpc_ioapic_id(ioapic_idx)); + for (i = 0; i < get_physical_broadcast(); i++) + if (!physid_isset(i, phys_id_present_map)) + break; + if (i >= get_physical_broadcast()) + panic("Max APIC ID exceeded!\n"); + printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", + i); + physid_set(i, phys_id_present_map); + ioapics[ioapic_idx].mp_config.apicid = i; + } else { + physid_mask_t tmp; + apic->apicid_to_cpu_present(mpc_ioapic_id(ioapic_idx), + &tmp); + apic_printk(APIC_VERBOSE, "Setting %d in the " + "phys_id_present_map\n", + mpc_ioapic_id(ioapic_idx)); + physids_or(phys_id_present_map, phys_id_present_map, tmp); + } + + /* + * We need to adjust the IRQ routing table + * if the ID changed. + */ + if (old_id != mpc_ioapic_id(ioapic_idx)) + for (i = 0; i < mp_irq_entries; i++) + if (mp_irqs[i].dstapic == old_id) + mp_irqs[i].dstapic + = mpc_ioapic_id(ioapic_idx); + + /* + * Update the ID register according to the right value + * from the MPC table if they are different. + */ + if (mpc_ioapic_id(ioapic_idx) == reg_00.bits.ID) + continue; + + apic_printk(APIC_VERBOSE, KERN_INFO + "...changing IO-APIC physical APIC ID to %d ...", + mpc_ioapic_id(ioapic_idx)); + + reg_00.bits.ID = mpc_ioapic_id(ioapic_idx); + raw_spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(ioapic_idx, 0, reg_00.raw); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + + /* + * Sanity check + */ + raw_spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(ioapic_idx, 0); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + if (reg_00.bits.ID != mpc_ioapic_id(ioapic_idx)) + pr_cont("could not set ID!\n"); + else + apic_printk(APIC_VERBOSE, " ok.\n"); + } +} + +void __init setup_ioapic_ids_from_mpc(void) +{ + + if (acpi_ioapic) + return; + /* + * Don't check I/O APIC IDs for xAPIC systems. They have + * no meaning without the serial APIC bus. 
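+	 * In practice this restricts the fixup to genuine Intel parts
+	 * whose local APIC still reports a pre-xAPIC version
+	 * (APIC_XAPIC() is true from version 0x14 upwards).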
+ */
+	if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+	    || APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
+		return;
+	setup_ioapic_ids_from_mpc_nocheck();
+}
+#endif
+
+int no_timer_check __initdata;
+
+static int __init notimercheck(char *s)
+{
+	no_timer_check = 1;
+	return 1;
+}
+__setup("no_timer_check", notimercheck);
+
+/*
+ * There is a nasty bug in some older SMP boards: their mptable lies
+ * about the timer IRQ. We do the following to work around the situation:
+ *
+ *	- timer IRQ defaults to IO-APIC IRQ
+ *	- if this function detects that timer IRQs are defunct, then we fall
+ *	  back to ISA timer IRQs
+ */
+static int __init timer_irq_works(void)
+{
+	unsigned long t1 = jiffies;
+	unsigned long flags;
+
+	if (no_timer_check)
+		return 1;
+
+	local_save_flags(flags);
+	local_irq_enable();
+	/* Let ten ticks pass... */
+	mdelay((10 * 1000) / HZ);
+	local_irq_restore(flags);
+
+	/*
+	 * Expect a few ticks at least, to be sure some possible
+	 * glue logic does not lock up after one or two first
+	 * ticks in a non-ExtINT mode. Also the local APIC
+	 * might have cached one ExtINT interrupt. Finally, at
+	 * least one tick may be lost due to delays.
+	 */
+
+	/* jiffies wrap? */
+	if (time_after(jiffies, t1 + 4))
+		return 1;
+	return 0;
+}
+
+/*
+ * In the SMP+IOAPIC case it might happen that there are an unspecified
+ * number of pending IRQ events unhandled. These cases are very rare,
+ * so we 'resend' these IRQs via IPIs, to the same CPU. It's much
+ * better to do it this way, as we then do not have to be aware of
+ * 'pending' interrupts in the IRQ path, except at this point.
+ */
+/*
+ * Edge-triggered handling needs to resend any interrupt
+ * that was delayed, but this is now handled in the
+ * device-independent code.
+ */
+
+/*
+ * Starting up an edge-triggered IO-APIC interrupt is
+ * nasty - we need to make sure that we get the edge.
+ * If it is already asserted for some reason, we need
+ * to return 1 to indicate that it was pending.
+ *
+ * This is not complete - we should be able to fake
+ * an edge even if it isn't on the 8259A...
+ */
+
+static unsigned int startup_ioapic_irq(struct irq_data *data)
+{
+	int was_pending = 0, irq = data->irq;
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&ioapic_lock, flags);
+	if (irq < legacy_pic->nr_legacy_irqs) {
+		legacy_pic->mask(irq);
+		if (legacy_pic->irq_pending(irq))
+			was_pending = 1;
+	}
+	__unmask_ioapic(data->chip_data);
+	raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+
+	return was_pending;
+}
+
+static int ioapic_retrigger_irq(struct irq_data *data)
+{
+	struct irq_cfg *cfg = data->chip_data;
+	unsigned long flags;
+	int cpu;
+
+	raw_spin_lock_irqsave(&vector_lock, flags);
+	cpu = cpumask_first_and(cfg->domain, cpu_online_mask);
+	apic->send_IPI_mask(cpumask_of(cpu), cfg->vector);
+	raw_spin_unlock_irqrestore(&vector_lock, flags);
+
+	return 1;
+}
+
+/*
+ * Level- and edge-triggered IO-APIC interrupts need different handling,
+ * so we use two separate IRQ descriptors. Edge-triggered IRQs can be
+ * handled with the level-triggered descriptor, but that one has slightly
+ * more overhead. Level-triggered interrupts cannot be handled with the
+ * edge-triggered handler, without risking IRQ storms and other ugly
+ * races.
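+ *
+ * Concretely: handle_edge_irq acks up front and may re-run the
+ * handler for interrupts arriving mid-flight, while
+ * handle_fasteoi_irq defers its ->irq_eoi (ack_apic_level below)
+ * until the handler has finished - which is what a level-triggered
+ * source needs to avoid retriggering itself.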
+ */ + +#ifdef CONFIG_SMP +void send_cleanup_vector(struct irq_cfg *cfg) +{ + cpumask_var_t cleanup_mask; + + if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) { + unsigned int i; + for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) + apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR); + } else { + cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask); + apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); + free_cpumask_var(cleanup_mask); + } + cfg->move_in_progress = 0; +} + +asmlinkage __visible void smp_irq_move_cleanup_interrupt(void) +{ + unsigned vector, me; + + ack_APIC_irq(); + irq_enter(); + exit_idle(); + + me = smp_processor_id(); + for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { + int irq; + unsigned int irr; + struct irq_desc *desc; + struct irq_cfg *cfg; + irq = __this_cpu_read(vector_irq[vector]); + + if (irq <= VECTOR_UNDEFINED) + continue; + + desc = irq_to_desc(irq); + if (!desc) + continue; + + cfg = irq_cfg(irq); + if (!cfg) + continue; + + raw_spin_lock(&desc->lock); + + /* + * Check if the irq migration is in progress. If so, we + * haven't received the cleanup request yet for this irq. + */ + if (cfg->move_in_progress) + goto unlock; + + if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) + goto unlock; + + irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); + /* + * Check if the vector that needs to be cleanedup is + * registered at the cpu's IRR. If so, then this is not + * the best time to clean it up. Lets clean it up in the + * next attempt by sending another IRQ_MOVE_CLEANUP_VECTOR + * to myself. + */ + if (irr & (1 << (vector % 32))) { + apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR); + goto unlock; + } + __this_cpu_write(vector_irq[vector], -1); +unlock: + raw_spin_unlock(&desc->lock); + } + + irq_exit(); +} + +static void __irq_complete_move(struct irq_cfg *cfg, unsigned vector) +{ + unsigned me; + + if (likely(!cfg->move_in_progress)) + return; + + me = smp_processor_id(); + + if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) + send_cleanup_vector(cfg); +} + +static void irq_complete_move(struct irq_cfg *cfg) +{ + __irq_complete_move(cfg, ~get_irq_regs()->orig_ax); +} + +void irq_force_complete_move(int irq) +{ + struct irq_cfg *cfg = irq_get_chip_data(irq); + + if (!cfg) + return; + + __irq_complete_move(cfg, cfg->vector); +} +#else +static inline void irq_complete_move(struct irq_cfg *cfg) { } +#endif + +static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg) +{ + int apic, pin; + struct irq_pin_list *entry; + u8 vector = cfg->vector; + + for_each_irq_pin(entry, cfg->irq_2_pin) { + unsigned int reg; + + apic = entry->apic; + pin = entry->pin; + + io_apic_write(apic, 0x11 + pin*2, dest); + reg = io_apic_read(apic, 0x10 + pin*2); + reg &= ~IO_APIC_REDIR_VECTOR_MASK; + reg |= vector; + io_apic_modify(apic, 0x10 + pin*2, reg); + } +} + +/* + * Either sets data->affinity to a valid value, and returns + * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and + * leaves data->affinity untouched. 
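+ *
+ * The apicid returned in *dest_id is raw; callers such as
+ * native_ioapic_set_affinity() below still fold it into the RTE
+ * destination format (e.g. SET_APIC_LOGICAL_ID()) themselves.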
+ */ +int __ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, + unsigned int *dest_id) +{ + struct irq_cfg *cfg = data->chip_data; + unsigned int irq = data->irq; + int err; + + if (!config_enabled(CONFIG_SMP)) + return -EPERM; + + if (!cpumask_intersects(mask, cpu_online_mask)) + return -EINVAL; + + err = assign_irq_vector(irq, cfg, mask); + if (err) + return err; + + err = apic->cpu_mask_to_apicid_and(mask, cfg->domain, dest_id); + if (err) { + if (assign_irq_vector(irq, cfg, data->affinity)) + pr_err("Failed to recover vector for irq %d\n", irq); + return err; + } + + cpumask_copy(data->affinity, mask); + + return 0; +} + + +int native_ioapic_set_affinity(struct irq_data *data, + const struct cpumask *mask, + bool force) +{ + unsigned int dest, irq = data->irq; + unsigned long flags; + int ret; + + if (!config_enabled(CONFIG_SMP)) + return -EPERM; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + ret = __ioapic_set_affinity(data, mask, &dest); + if (!ret) { + /* Only the high 8 bits are valid. */ + dest = SET_APIC_LOGICAL_ID(dest); + __target_IO_APIC_irq(irq, dest, data->chip_data); + ret = IRQ_SET_MASK_OK_NOCOPY; + } + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + return ret; +} + +static void ack_apic_edge(struct irq_data *data) +{ + irq_complete_move(data->chip_data); + irq_move_irq(data); + ack_APIC_irq(); +} + +atomic_t irq_mis_count; + +#ifdef CONFIG_GENERIC_PENDING_IRQ +static bool io_apic_level_ack_pending(struct irq_cfg *cfg) +{ + struct irq_pin_list *entry; + unsigned long flags; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + for_each_irq_pin(entry, cfg->irq_2_pin) { + unsigned int reg; + int pin; + + pin = entry->pin; + reg = io_apic_read(entry->apic, 0x10 + pin*2); + /* Is the remote IRR bit set? */ + if (reg & IO_APIC_REDIR_REMOTE_IRR) { + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + return true; + } + } + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + + return false; +} + +static inline bool ioapic_irqd_mask(struct irq_data *data, struct irq_cfg *cfg) +{ + /* If we are moving the irq we need to mask it */ + if (unlikely(irqd_is_setaffinity_pending(data))) { + mask_ioapic(cfg); + return true; + } + return false; +} + +static inline void ioapic_irqd_unmask(struct irq_data *data, + struct irq_cfg *cfg, bool masked) +{ + if (unlikely(masked)) { + /* Only migrate the irq if the ack has been received. + * + * On rare occasions the broadcast level triggered ack gets + * delayed going to ioapics, and if we reprogram the + * vector while Remote IRR is still set the irq will never + * fire again. + * + * To prevent this scenario we read the Remote IRR bit + * of the ioapic. This has two effects. + * - On any sane system the read of the ioapic will + * flush writes (and acks) going to the ioapic from + * this cpu. + * - We get to see if the ACK has actually been delivered. + * + * Based on failed experiments of reprogramming the + * ioapic entry from outside of irq context starting + * with masking the ioapic entry and then polling until + * Remote IRR was clear before reprogramming the + * ioapic I don't trust the Remote IRR bit to be + * completey accurate. + * + * However there appears to be no other way to plug + * this race, so if the Remote IRR bit is not + * accurate and is causing problems then it is a hardware bug + * and you can go talk to the chipset vendor about it. 
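+		 * In short: the unmask path below only performs the
+		 * deferred migration once a remote-IRR read confirms
+		 * that the ack actually reached the IO-APIC.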
+ */ + if (!io_apic_level_ack_pending(cfg)) + irq_move_masked_irq(data); + unmask_ioapic(cfg); + } +} +#else +static inline bool ioapic_irqd_mask(struct irq_data *data, struct irq_cfg *cfg) +{ + return false; +} +static inline void ioapic_irqd_unmask(struct irq_data *data, + struct irq_cfg *cfg, bool masked) +{ +} +#endif + +static void ack_apic_level(struct irq_data *data) +{ + struct irq_cfg *cfg = data->chip_data; + int i, irq = data->irq; + unsigned long v; + bool masked; + + irq_complete_move(cfg); + masked = ioapic_irqd_mask(data, cfg); + + /* + * It appears there is an erratum which affects at least version 0x11 + * of I/O APIC (that's the 82093AA and cores integrated into various + * chipsets). Under certain conditions a level-triggered interrupt is + * erroneously delivered as edge-triggered one but the respective IRR + * bit gets set nevertheless. As a result the I/O unit expects an EOI + * message but it will never arrive and further interrupts are blocked + * from the source. The exact reason is so far unknown, but the + * phenomenon was observed when two consecutive interrupt requests + * from a given source get delivered to the same CPU and the source is + * temporarily disabled in between. + * + * A workaround is to simulate an EOI message manually. We achieve it + * by setting the trigger mode to edge and then to level when the edge + * trigger mode gets detected in the TMR of a local APIC for a + * level-triggered interrupt. We mask the source for the time of the + * operation to prevent an edge-triggered interrupt escaping meanwhile. + * The idea is from Manfred Spraul. --macro + * + * Also in the case when cpu goes offline, fixup_irqs() will forward + * any unhandled interrupt on the offlined cpu to the new cpu + * destination that is handling the corresponding interrupt. This + * interrupt forwarding is done via IPI's. Hence, in this case also + * level-triggered io-apic interrupt will be seen as an edge + * interrupt in the IRR. And we can't rely on the cpu's EOI + * to be broadcasted to the IO-APIC's which will clear the remoteIRR + * corresponding to the level-triggered interrupt. Hence on IO-APIC's + * supporting EOI register, we do an explicit EOI to clear the + * remote IRR and on IO-APIC's which don't have an EOI register, + * we use the above logic (mask+edge followed by unmask+level) from + * Manfred Spraul to clear the remote IRR. + */ + i = cfg->vector; + v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); + + /* + * We must acknowledge the irq before we move it or the acknowledge will + * not propagate properly. + */ + ack_APIC_irq(); + + /* + * Tail end of clearing remote IRR bit (either by delivering the EOI + * message via io-apic EOI register write or simulating it using + * mask+edge followed by unnask+level logic) manually when the + * level triggered interrupt is seen as the edge triggered interrupt + * at the cpu. + */ + if (!(v & (1 << (i & 0x1f)))) { + atomic_inc(&irq_mis_count); + + eoi_ioapic_irq(irq, cfg); + } + + ioapic_irqd_unmask(data, cfg, masked); +} + +static struct irq_chip ioapic_chip __read_mostly = { + .name = "IO-APIC", + .irq_startup = startup_ioapic_irq, + .irq_mask = mask_ioapic_irq, + .irq_unmask = unmask_ioapic_irq, + .irq_ack = ack_apic_edge, + .irq_eoi = ack_apic_level, + .irq_set_affinity = native_ioapic_set_affinity, + .irq_retrigger = ioapic_retrigger_irq, +}; + +static inline void init_IO_APIC_traps(void) +{ + struct irq_cfg *cfg; + unsigned int irq; + + /* + * NOTE! 
The local APIC isn't very good at handling + * multiple interrupts at the same interrupt level. + * As the interrupt level is determined by taking the + * vector number and shifting that right by 4, we + * want to spread these out a bit so that they don't + * all fall in the same interrupt level. + * + * Also, we've got to be careful not to trash gate + * 0x80, because int 0x80 is hm, kind of importantish. ;) + */ + for_each_active_irq(irq) { + cfg = irq_get_chip_data(irq); + if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) { + /* + * Hmm.. We don't have an entry for this, + * so default to an old-fashioned 8259 + * interrupt if we can.. + */ + if (irq < legacy_pic->nr_legacy_irqs) + legacy_pic->make_irq(irq); + else + /* Strange. Oh, well.. */ + irq_set_chip(irq, &no_irq_chip); + } + } +} + +/* + * The local APIC irq-chip implementation: + */ + +static void mask_lapic_irq(struct irq_data *data) +{ + unsigned long v; + + v = apic_read(APIC_LVT0); + apic_write(APIC_LVT0, v | APIC_LVT_MASKED); +} + +static void unmask_lapic_irq(struct irq_data *data) +{ + unsigned long v; + + v = apic_read(APIC_LVT0); + apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); +} + +static void ack_lapic_irq(struct irq_data *data) +{ + ack_APIC_irq(); +} + +static struct irq_chip lapic_chip __read_mostly = { + .name = "local-APIC", + .irq_mask = mask_lapic_irq, + .irq_unmask = unmask_lapic_irq, + .irq_ack = ack_lapic_irq, +}; + +static void lapic_register_intr(int irq) +{ + irq_clear_status_flags(irq, IRQ_LEVEL); + irq_set_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, + "edge"); +} + +/* + * This looks a bit hackish but it's about the only one way of sending + * a few INTA cycles to 8259As and any associated glue logic. ICR does + * not support the ExtINT mode, unfortunately. We need to send these + * cycles as some i82489DX-based boards have glue logic that keeps the + * 8259A interrupt line asserted until INTA. 
--macro + */ +static inline void __init unlock_ExtINT_logic(void) +{ + int apic, pin, i; + struct IO_APIC_route_entry entry0, entry1; + unsigned char save_control, save_freq_select; + + pin = find_isa_irq_pin(8, mp_INT); + if (pin == -1) { + WARN_ON_ONCE(1); + return; + } + apic = find_isa_irq_apic(8, mp_INT); + if (apic == -1) { + WARN_ON_ONCE(1); + return; + } + + entry0 = ioapic_read_entry(apic, pin); + clear_IO_APIC_pin(apic, pin); + + memset(&entry1, 0, sizeof(entry1)); + + entry1.dest_mode = 0; /* physical delivery */ + entry1.mask = 0; /* unmask IRQ now */ + entry1.dest = hard_smp_processor_id(); + entry1.delivery_mode = dest_ExtINT; + entry1.polarity = entry0.polarity; + entry1.trigger = 0; + entry1.vector = 0; + + ioapic_write_entry(apic, pin, entry1); + + save_control = CMOS_READ(RTC_CONTROL); + save_freq_select = CMOS_READ(RTC_FREQ_SELECT); + CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6, + RTC_FREQ_SELECT); + CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL); + + i = 100; + while (i-- > 0) { + mdelay(10); + if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF) + i -= 10; + } + + CMOS_WRITE(save_control, RTC_CONTROL); + CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); + clear_IO_APIC_pin(apic, pin); + + ioapic_write_entry(apic, pin, entry0); +} + +static int disable_timer_pin_1 __initdata; +/* Actually the next is obsolete, but keep it for paranoid reasons -AK */ +static int __init disable_timer_pin_setup(char *arg) +{ + disable_timer_pin_1 = 1; + return 0; +} +early_param("disable_timer_pin_1", disable_timer_pin_setup); + +int timer_through_8259 __initdata; + +/* + * This code may look a bit paranoid, but it's supposed to cooperate with + * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ + * is so screwy. Thanks to Brian Perkins for testing/hacking this beast + * fanatically on his truly buggy board. + * + * FIXME: really need to revamp this for all platforms. + */ +static inline void __init check_timer(void) +{ + struct irq_cfg *cfg = irq_get_chip_data(0); + int node = cpu_to_node(0); + int apic1, pin1, apic2, pin2; + unsigned long flags; + int no_pin1 = 0; + + local_irq_save(flags); + + /* + * get/set the timer IRQ vector: + */ + legacy_pic->mask(0); + assign_irq_vector(0, cfg, apic->target_cpus()); + + /* + * As IRQ0 is to be enabled in the 8259A, the virtual + * wire has to be disabled in the local APIC. Also + * timer interrupts need to be acknowledged manually in + * the 8259A for the i82489DX when using the NMI + * watchdog as that APIC treats NMIs as level-triggered. + * The AEOI mode will finish them in the 8259A + * automatically. + */ + apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); + legacy_pic->init(1); + + pin1 = find_isa_irq_pin(0, mp_INT); + apic1 = find_isa_irq_apic(0, mp_INT); + pin2 = ioapic_i8259.pin; + apic2 = ioapic_i8259.apic; + + apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X " + "apic1=%d pin1=%d apic2=%d pin2=%d\n", + cfg->vector, apic1, pin1, apic2, pin2); + + /* + * Some BIOS writers are clueless and report the ExtINTA + * I/O APIC input from the cascaded 8259A as the timer + * interrupt input. So just in case, if only one pin + * was found above, try it both directly and through the + * 8259A. + */ + if (pin1 == -1) { + panic_if_irq_remap("BIOS bug: timer not connected to IO-APIC"); + pin1 = pin2; + apic1 = apic2; + no_pin1 = 1; + } else if (pin2 == -1) { + pin2 = pin1; + apic2 = apic1; + } + + if (pin1 != -1) { + /* + * Ok, does IRQ0 through the IOAPIC work? 
+ */ + if (no_pin1) { + add_pin_to_irq_node(cfg, node, apic1, pin1); + setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); + } else { + /* for edge trigger, setup_ioapic_irq already + * leave it unmasked. + * so only need to unmask if it is level-trigger + * do we really have level trigger timer? + */ + int idx; + idx = find_irq_entry(apic1, pin1, mp_INT); + if (idx != -1 && irq_trigger(idx)) + unmask_ioapic(cfg); + } + if (timer_irq_works()) { + if (disable_timer_pin_1 > 0) + clear_IO_APIC_pin(0, pin1); + goto out; + } + panic_if_irq_remap("timer doesn't work through Interrupt-remapped IO-APIC"); + local_irq_disable(); + clear_IO_APIC_pin(apic1, pin1); + if (!no_pin1) + apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: " + "8254 timer not connected to IO-APIC\n"); + + apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer " + "(IRQ0) through the 8259A ...\n"); + apic_printk(APIC_QUIET, KERN_INFO + "..... (found apic %d pin %d) ...\n", apic2, pin2); + /* + * legacy devices should be connected to IO APIC #0 + */ + replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2); + setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); + legacy_pic->unmask(0); + if (timer_irq_works()) { + apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); + timer_through_8259 = 1; + goto out; + } + /* + * Cleanup, just in case ... + */ + local_irq_disable(); + legacy_pic->mask(0); + clear_IO_APIC_pin(apic2, pin2); + apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); + } + + apic_printk(APIC_QUIET, KERN_INFO + "...trying to set up timer as Virtual Wire IRQ...\n"); + + lapic_register_intr(0); + apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ + legacy_pic->unmask(0); + + if (timer_irq_works()) { + apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); + goto out; + } + local_irq_disable(); + legacy_pic->mask(0); + apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); + apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n"); + + apic_printk(APIC_QUIET, KERN_INFO + "...trying to set up timer as ExtINT IRQ...\n"); + + legacy_pic->init(0); + legacy_pic->make_irq(0); + apic_write(APIC_LVT0, APIC_DM_EXTINT); + + unlock_ExtINT_logic(); + + if (timer_irq_works()) { + apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); + goto out; + } + local_irq_disable(); + apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n"); + if (x2apic_preenabled) + apic_printk(APIC_QUIET, KERN_INFO + "Perhaps problem with the pre-enabled x2apic mode\n" + "Try booting with x2apic and interrupt-remapping disabled in the bios.\n"); + panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " + "report. Then try booting with the 'noapic' option.\n"); +out: + local_irq_restore(flags); +} + +/* + * Traditionally ISA IRQ2 is the cascade IRQ, and is not available + * to devices. However there may be an I/O APIC pin available for + * this interrupt regardless. The pin may be left unconnected, but + * typically it will be reused as an ExtINT cascade interrupt for + * the master 8259A. In the MPS case such a pin will normally be + * reported as an ExtINT interrupt in the MP table. With ACPI + * there is no provision for ExtINT interrupts, and in the absence + * of an override it would be treated as an ordinary ISA I/O APIC + * interrupt, that is edge-triggered and unmasked by default. 
We + * used to do this, but it caused problems on some systems because + * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using + * the same ExtINT cascade interrupt to drive the local APIC of the + * bootstrap processor. Therefore we refrain from routing IRQ2 to + * the I/O APIC in all cases now. No actual device should request + * it anyway. --macro + */ +#define PIC_IRQS (1UL << PIC_CASCADE_IR) + +void __init setup_IO_APIC(void) +{ + + /* + * calling enable_IO_APIC() is moved to setup_local_APIC for BP + */ + io_apic_irqs = legacy_pic->nr_legacy_irqs ? ~PIC_IRQS : ~0UL; + + apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); + /* + * Set up IO-APIC IRQ routing. + */ + x86_init.mpparse.setup_ioapic_ids(); + + sync_Arb_IDs(); + setup_IO_APIC_irqs(); + init_IO_APIC_traps(); + if (legacy_pic->nr_legacy_irqs) + check_timer(); +} + +/* + * Called after all the initialization is done. If we didn't find any + * APIC bugs then we can allow the modify fast path + */ + +static int __init io_apic_bug_finalize(void) +{ + if (sis_apic_bug == -1) + sis_apic_bug = 0; + return 0; +} + +late_initcall(io_apic_bug_finalize); + +static void resume_ioapic_id(int ioapic_idx) +{ + unsigned long flags; + union IO_APIC_reg_00 reg_00; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(ioapic_idx, 0); + if (reg_00.bits.ID != mpc_ioapic_id(ioapic_idx)) { + reg_00.bits.ID = mpc_ioapic_id(ioapic_idx); + io_apic_write(ioapic_idx, 0, reg_00.raw); + } + raw_spin_unlock_irqrestore(&ioapic_lock, flags); +} + +static void ioapic_resume(void) +{ + int ioapic_idx; + + for (ioapic_idx = nr_ioapics - 1; ioapic_idx >= 0; ioapic_idx--) + resume_ioapic_id(ioapic_idx); + + restore_ioapic_entries(); +} + +static struct syscore_ops ioapic_syscore_ops = { + .suspend = save_ioapic_entries, + .resume = ioapic_resume, +}; + +static int __init ioapic_init_ops(void) +{ + register_syscore_ops(&ioapic_syscore_ops); + + return 0; +} + +device_initcall(ioapic_init_ops); + +/* + * Dynamic irq allocate and deallocation. Should be replaced by irq domains! + */ +int arch_setup_hwirq(unsigned int irq, int node) +{ + struct irq_cfg *cfg; + unsigned long flags; + int ret; + + cfg = alloc_irq_cfg(irq, node); + if (!cfg) + return -ENOMEM; + + raw_spin_lock_irqsave(&vector_lock, flags); + ret = __assign_irq_vector(irq, cfg, apic->target_cpus()); + raw_spin_unlock_irqrestore(&vector_lock, flags); + + if (!ret) + irq_set_chip_data(irq, cfg); + else + free_irq_cfg(irq, cfg); + return ret; +} + +void arch_teardown_hwirq(unsigned int irq) +{ + struct irq_cfg *cfg = irq_get_chip_data(irq); + unsigned long flags; + + free_remapped_irq(irq); + raw_spin_lock_irqsave(&vector_lock, flags); + __clear_irq_vector(irq, cfg); + raw_spin_unlock_irqrestore(&vector_lock, flags); + free_irq_cfg(irq, cfg); +} + +/* + * MSI message composition + */ +void native_compose_msi_msg(struct pci_dev *pdev, + unsigned int irq, unsigned int dest, + struct msi_msg *msg, u8 hpet_id) +{ + struct irq_cfg *cfg = irq_cfg(irq); + + msg->address_hi = MSI_ADDR_BASE_HI; + + if (x2apic_enabled()) + msg->address_hi |= MSI_ADDR_EXT_DEST_ID(dest); + + msg->address_lo = + MSI_ADDR_BASE_LO | + ((apic->irq_dest_mode == 0) ? + MSI_ADDR_DEST_MODE_PHYSICAL: + MSI_ADDR_DEST_MODE_LOGICAL) | + ((apic->irq_delivery_mode != dest_LowestPrio) ? + MSI_ADDR_REDIRECTION_CPU: + MSI_ADDR_REDIRECTION_LOWPRI) | + MSI_ADDR_DEST_ID(dest); + + msg->data = + MSI_DATA_TRIGGER_EDGE | + MSI_DATA_LEVEL_ASSERT | + ((apic->irq_delivery_mode != dest_LowestPrio) ? 
+ MSI_DATA_DELIVERY_FIXED: + MSI_DATA_DELIVERY_LOWPRI) | + MSI_DATA_VECTOR(cfg->vector); +} + +#ifdef CONFIG_PCI_MSI +static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, + struct msi_msg *msg, u8 hpet_id) +{ + struct irq_cfg *cfg; + int err; + unsigned dest; + + if (disable_apic) + return -ENXIO; + + cfg = irq_cfg(irq); + err = assign_irq_vector(irq, cfg, apic->target_cpus()); + if (err) + return err; + + err = apic->cpu_mask_to_apicid_and(cfg->domain, + apic->target_cpus(), &dest); + if (err) + return err; + + x86_msi.compose_msi_msg(pdev, irq, dest, msg, hpet_id); + + return 0; +} + +static int +msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) +{ + struct irq_cfg *cfg = data->chip_data; + struct msi_msg msg; + unsigned int dest; + int ret; + + ret = __ioapic_set_affinity(data, mask, &dest); + if (ret) + return ret; + + __get_cached_msi_msg(data->msi_desc, &msg); + + msg.data &= ~MSI_DATA_VECTOR_MASK; + msg.data |= MSI_DATA_VECTOR(cfg->vector); + msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; + msg.address_lo |= MSI_ADDR_DEST_ID(dest); + + __write_msi_msg(data->msi_desc, &msg); + + return IRQ_SET_MASK_OK_NOCOPY; +} + +/* + * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, + * which implement the MSI or MSI-X Capability Structure. + */ +static struct irq_chip msi_chip = { + .name = "PCI-MSI", + .irq_unmask = unmask_msi_irq, + .irq_mask = mask_msi_irq, + .irq_ack = ack_apic_edge, + .irq_set_affinity = msi_set_affinity, + .irq_retrigger = ioapic_retrigger_irq, +}; + +int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, + unsigned int irq_base, unsigned int irq_offset) +{ + struct irq_chip *chip = &msi_chip; + struct msi_msg msg; + unsigned int irq = irq_base + irq_offset; + int ret; + + ret = msi_compose_msg(dev, irq, &msg, -1); + if (ret < 0) + return ret; + + irq_set_msi_desc_off(irq_base, irq_offset, msidesc); + + /* + * MSI-X message is written per-IRQ, the offset is always 0. + * MSI message denotes a contiguous group of IRQs, written for 0th IRQ. 
+ */ + if (!irq_offset) + write_msi_msg(irq, &msg); + + setup_remapped_irq(irq, irq_get_chip_data(irq), chip); + + irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge"); + + dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq); + + return 0; +} + +int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) +{ + struct msi_desc *msidesc; + unsigned int irq; + int node, ret; + + /* Multiple MSI vectors only supported with interrupt remapping */ + if (type == PCI_CAP_ID_MSI && nvec > 1) + return 1; + + node = dev_to_node(&dev->dev); + + list_for_each_entry(msidesc, &dev->msi_list, list) { + irq = irq_alloc_hwirq(node); + if (!irq) + return -ENOSPC; + + ret = setup_msi_irq(dev, msidesc, irq, 0); + if (ret < 0) { + irq_free_hwirq(irq); + return ret; + } + + } + return 0; +} + +void native_teardown_msi_irq(unsigned int irq) +{ + irq_free_hwirq(irq); +} + +#ifdef CONFIG_DMAR_TABLE +static int +dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask, + bool force) +{ + struct irq_cfg *cfg = data->chip_data; + unsigned int dest, irq = data->irq; + struct msi_msg msg; + int ret; + + ret = __ioapic_set_affinity(data, mask, &dest); + if (ret) + return ret; + + dmar_msi_read(irq, &msg); + + msg.data &= ~MSI_DATA_VECTOR_MASK; + msg.data |= MSI_DATA_VECTOR(cfg->vector); + msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; + msg.address_lo |= MSI_ADDR_DEST_ID(dest); + msg.address_hi = MSI_ADDR_BASE_HI | MSI_ADDR_EXT_DEST_ID(dest); + + dmar_msi_write(irq, &msg); + + return IRQ_SET_MASK_OK_NOCOPY; +} + +static struct irq_chip dmar_msi_type = { + .name = "DMAR_MSI", + .irq_unmask = dmar_msi_unmask, + .irq_mask = dmar_msi_mask, + .irq_ack = ack_apic_edge, + .irq_set_affinity = dmar_msi_set_affinity, + .irq_retrigger = ioapic_retrigger_irq, +}; + +int arch_setup_dmar_msi(unsigned int irq) +{ + int ret; + struct msi_msg msg; + + ret = msi_compose_msg(NULL, irq, &msg, -1); + if (ret < 0) + return ret; + dmar_msi_write(irq, &msg); + irq_set_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq, + "edge"); + return 0; +} +#endif + +#ifdef CONFIG_HPET_TIMER + +static int hpet_msi_set_affinity(struct irq_data *data, + const struct cpumask *mask, bool force) +{ + struct irq_cfg *cfg = data->chip_data; + struct msi_msg msg; + unsigned int dest; + int ret; + + ret = __ioapic_set_affinity(data, mask, &dest); + if (ret) + return ret; + + hpet_msi_read(data->handler_data, &msg); + + msg.data &= ~MSI_DATA_VECTOR_MASK; + msg.data |= MSI_DATA_VECTOR(cfg->vector); + msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; + msg.address_lo |= MSI_ADDR_DEST_ID(dest); + + hpet_msi_write(data->handler_data, &msg); + + return IRQ_SET_MASK_OK_NOCOPY; +} + +static struct irq_chip hpet_msi_type = { + .name = "HPET_MSI", + .irq_unmask = hpet_msi_unmask, + .irq_mask = hpet_msi_mask, + .irq_ack = ack_apic_edge, + .irq_set_affinity = hpet_msi_set_affinity, + .irq_retrigger = ioapic_retrigger_irq, +}; + +int default_setup_hpet_msi(unsigned int irq, unsigned int id) +{ + struct irq_chip *chip = &hpet_msi_type; + struct msi_msg msg; + int ret; + + ret = msi_compose_msg(NULL, irq, &msg, id); + if (ret < 0) + return ret; + + hpet_msi_write(irq_get_handler_data(irq), &msg); + irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); + setup_remapped_irq(irq, irq_get_chip_data(irq), chip); + + irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge"); + return 0; +} +#endif + +#endif /* CONFIG_PCI_MSI */ +/* + * Hypertransport interrupt support + */ +#ifdef CONFIG_HT_IRQ + +static void target_ht_irq(unsigned int irq, 
unsigned int dest, u8 vector) +{ + struct ht_irq_msg msg; + fetch_ht_irq_msg(irq, &msg); + + msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK | HT_IRQ_LOW_DEST_ID_MASK); + msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK); + + msg.address_lo |= HT_IRQ_LOW_VECTOR(vector) | HT_IRQ_LOW_DEST_ID(dest); + msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest); + + write_ht_irq_msg(irq, &msg); +} + +static int +ht_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) +{ + struct irq_cfg *cfg = data->chip_data; + unsigned int dest; + int ret; + + ret = __ioapic_set_affinity(data, mask, &dest); + if (ret) + return ret; + + target_ht_irq(data->irq, dest, cfg->vector); + return IRQ_SET_MASK_OK_NOCOPY; +} + +static struct irq_chip ht_irq_chip = { + .name = "PCI-HT", + .irq_mask = mask_ht_irq, + .irq_unmask = unmask_ht_irq, + .irq_ack = ack_apic_edge, + .irq_set_affinity = ht_set_affinity, + .irq_retrigger = ioapic_retrigger_irq, +}; + +int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) +{ + struct irq_cfg *cfg; + struct ht_irq_msg msg; + unsigned dest; + int err; + + if (disable_apic) + return -ENXIO; + + cfg = irq_cfg(irq); + err = assign_irq_vector(irq, cfg, apic->target_cpus()); + if (err) + return err; + + err = apic->cpu_mask_to_apicid_and(cfg->domain, + apic->target_cpus(), &dest); + if (err) + return err; + + msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); + + msg.address_lo = + HT_IRQ_LOW_BASE | + HT_IRQ_LOW_DEST_ID(dest) | + HT_IRQ_LOW_VECTOR(cfg->vector) | + ((apic->irq_dest_mode == 0) ? + HT_IRQ_LOW_DM_PHYSICAL : + HT_IRQ_LOW_DM_LOGICAL) | + HT_IRQ_LOW_RQEOI_EDGE | + ((apic->irq_delivery_mode != dest_LowestPrio) ? + HT_IRQ_LOW_MT_FIXED : + HT_IRQ_LOW_MT_ARBITRATED) | + HT_IRQ_LOW_IRQ_MASKED; + + write_ht_irq_msg(irq, &msg); + + irq_set_chip_and_handler_name(irq, &ht_irq_chip, + handle_edge_irq, "edge"); + + dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq); + + return 0; +} +#endif /* CONFIG_HT_IRQ */ + +static int +io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr) +{ + struct irq_cfg *cfg = alloc_irq_and_cfg_at(irq, node); + int ret; + + if (!cfg) + return -EINVAL; + ret = __add_pin_to_irq_node(cfg, node, attr->ioapic, attr->ioapic_pin); + if (!ret) + setup_ioapic_irq(irq, cfg, attr); + return ret; +} + +int io_apic_setup_irq_pin_once(unsigned int irq, int node, + struct io_apic_irq_attr *attr) +{ + unsigned int ioapic_idx = attr->ioapic, pin = attr->ioapic_pin; + int ret; + struct IO_APIC_route_entry orig_entry; + + /* Avoid redundant programming */ + if (test_bit(pin, ioapics[ioapic_idx].pin_programmed)) { + pr_debug("Pin %d-%d already programmed\n", mpc_ioapic_id(ioapic_idx), pin); + orig_entry = ioapic_read_entry(attr->ioapic, pin); + if (attr->trigger == orig_entry.trigger && attr->polarity == orig_entry.polarity) + return 0; + return -EBUSY; + } + ret = io_apic_setup_irq_pin(irq, node, attr); + if (!ret) + set_bit(pin, ioapics[ioapic_idx].pin_programmed); + return ret; +} + +static int __init io_apic_get_redir_entries(int ioapic) +{ + union IO_APIC_reg_01 reg_01; + unsigned long flags; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + reg_01.raw = io_apic_read(ioapic, 1); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + + /* The register returns the maximum index redir index + * supported, which is one less than the total number of redir + * entries. 
+ */ + return reg_01.bits.entries + 1; +} + +static void __init probe_nr_irqs_gsi(void) +{ + int nr; + + nr = gsi_top + NR_IRQS_LEGACY; + if (nr > nr_irqs_gsi) + nr_irqs_gsi = nr; + + printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi); +} + +unsigned int arch_dynirq_lower_bound(unsigned int from) +{ + return from < nr_irqs_gsi ? nr_irqs_gsi : from; +} + +int __init arch_probe_nr_irqs(void) +{ + int nr; + + if (nr_irqs > (NR_VECTORS * nr_cpu_ids)) + nr_irqs = NR_VECTORS * nr_cpu_ids; + + nr = nr_irqs_gsi + 8 * nr_cpu_ids; +#if defined(CONFIG_PCI_MSI) || defined(CONFIG_HT_IRQ) + /* + * for MSI and HT dyn irq + */ + nr += nr_irqs_gsi * 16; +#endif + if (nr < nr_irqs) + nr_irqs = nr; + + return NR_IRQS_LEGACY; +} + +int io_apic_set_pci_routing(struct device *dev, int irq, + struct io_apic_irq_attr *irq_attr) +{ + int node; + + if (!IO_APIC_IRQ(irq)) { + apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", + irq_attr->ioapic); + return -EINVAL; + } + + node = dev ? dev_to_node(dev) : cpu_to_node(0); + + return io_apic_setup_irq_pin_once(irq, node, irq_attr); +} + +#ifdef CONFIG_X86_32 +static int __init io_apic_get_unique_id(int ioapic, int apic_id) +{ + union IO_APIC_reg_00 reg_00; + static physid_mask_t apic_id_map = PHYSID_MASK_NONE; + physid_mask_t tmp; + unsigned long flags; + int i = 0; + + /* + * The P4 platform supports up to 256 APIC IDs on two separate APIC + * buses (one for LAPICs, one for IOAPICs), where predecessors only + * supports up to 16 on one shared APIC bus. + * + * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full + * advantage of new APIC bus architecture. + */ + + if (physids_empty(apic_id_map)) + apic->ioapic_phys_id_map(&phys_cpu_present_map, &apic_id_map); + + raw_spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(ioapic, 0); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + + if (apic_id >= get_physical_broadcast()) { + printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying " + "%d\n", ioapic, apic_id, reg_00.bits.ID); + apic_id = reg_00.bits.ID; + } + + /* + * Every APIC in a system must have a unique ID or we get lots of nice + * 'stuck on smp_invalidate_needed IPI wait' messages. 
+ */ + if (apic->check_apicid_used(&apic_id_map, apic_id)) { + + for (i = 0; i < get_physical_broadcast(); i++) { + if (!apic->check_apicid_used(&apic_id_map, i)) + break; + } + + if (i == get_physical_broadcast()) + panic("Max apic_id exceeded!\n"); + + printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, " + "trying %d\n", ioapic, apic_id, i); + + apic_id = i; + } + + apic->apicid_to_cpu_present(apic_id, &tmp); + physids_or(apic_id_map, apic_id_map, tmp); + + if (reg_00.bits.ID != apic_id) { + reg_00.bits.ID = apic_id; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(ioapic, 0, reg_00.raw); + reg_00.raw = io_apic_read(ioapic, 0); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + + /* Sanity check */ + if (reg_00.bits.ID != apic_id) { + pr_err("IOAPIC[%d]: Unable to change apic_id!\n", + ioapic); + return -1; + } + } + + apic_printk(APIC_VERBOSE, KERN_INFO + "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id); + + return apic_id; +} + +static u8 __init io_apic_unique_id(u8 id) +{ + if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && + !APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) + return io_apic_get_unique_id(nr_ioapics, id); + else + return id; +} +#else +static u8 __init io_apic_unique_id(u8 id) +{ + int i; + DECLARE_BITMAP(used, 256); + + bitmap_zero(used, 256); + for (i = 0; i < nr_ioapics; i++) { + __set_bit(mpc_ioapic_id(i), used); + } + if (!test_bit(id, used)) + return id; + return find_first_zero_bit(used, 256); +} +#endif + +static int __init io_apic_get_version(int ioapic) +{ + union IO_APIC_reg_01 reg_01; + unsigned long flags; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + reg_01.raw = io_apic_read(ioapic, 1); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + + return reg_01.bits.version; +} + +int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity) +{ + int ioapic, pin, idx; + + if (skip_ioapic_setup) + return -1; + + ioapic = mp_find_ioapic(gsi); + if (ioapic < 0) + return -1; + + pin = mp_find_ioapic_pin(ioapic, gsi); + if (pin < 0) + return -1; + + idx = find_irq_entry(ioapic, pin, mp_INT); + if (idx < 0) + return -1; + + *trigger = irq_trigger(idx); + *polarity = irq_polarity(idx); + return 0; +} + +/* + * This function currently is only a helper for the i386 smp boot process where + * we need to reprogram the ioredtbls to cater for the cpus which have come online + * so mask in all cases should simply be apic->target_cpus() + */ +#ifdef CONFIG_SMP +void __init setup_ioapic_dest(void) +{ + int pin, ioapic, irq, irq_entry; + const struct cpumask *mask; + struct irq_data *idata; + + if (skip_ioapic_setup == 1) + return; + + for (ioapic = 0; ioapic < nr_ioapics; ioapic++) + for (pin = 0; pin < ioapics[ioapic].nr_registers; pin++) { + irq_entry = find_irq_entry(ioapic, pin, mp_INT); + if (irq_entry == -1) + continue; + irq = pin_2_irq(irq_entry, ioapic, pin); + + if ((ioapic > 0) && (irq > 16)) + continue; + + idata = irq_get_irq_data(irq); + + /* + * Honour affinities which have been set in early boot + */ + if (!irqd_can_balance(idata) || irqd_affinity_was_set(idata)) + mask = idata->affinity; + else + mask = apic->target_cpus(); + + x86_io_apic_ops.set_affinity(idata, mask, false); + } + +} +#endif + +#define IOAPIC_RESOURCE_NAME_SIZE 11 + +static struct resource *ioapic_resources; + +static struct resource * __init ioapic_setup_resources(int nr_ioapics) +{ + unsigned long n; + struct resource *res; + char *mem; + int i; + + if (nr_ioapics <= 0) + return NULL; + + n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource); + n 
*= nr_ioapics; + + mem = alloc_bootmem(n); + res = (void *)mem; + + mem += sizeof(struct resource) * nr_ioapics; + + for (i = 0; i < nr_ioapics; i++) { + res[i].name = mem; + res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; + snprintf(mem, IOAPIC_RESOURCE_NAME_SIZE, "IOAPIC %u", i); + mem += IOAPIC_RESOURCE_NAME_SIZE; + } + + ioapic_resources = res; + + return res; +} + +void __init native_io_apic_init_mappings(void) +{ + unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; + struct resource *ioapic_res; + int i; + + ioapic_res = ioapic_setup_resources(nr_ioapics); + for (i = 0; i < nr_ioapics; i++) { + if (smp_found_config) { + ioapic_phys = mpc_ioapic_addr(i); +#ifdef CONFIG_X86_32 + if (!ioapic_phys) { + printk(KERN_ERR + "WARNING: bogus zero IO-APIC " + "address found in MPTABLE, " + "disabling IO/APIC support!\n"); + smp_found_config = 0; + skip_ioapic_setup = 1; + goto fake_ioapic_page; + } +#endif + } else { +#ifdef CONFIG_X86_32 +fake_ioapic_page: +#endif + ioapic_phys = (unsigned long)alloc_bootmem_pages(PAGE_SIZE); + ioapic_phys = __pa(ioapic_phys); + } + set_fixmap_nocache(idx, ioapic_phys); + apic_printk(APIC_VERBOSE, "mapped IOAPIC to %08lx (%08lx)\n", + __fix_to_virt(idx) + (ioapic_phys & ~PAGE_MASK), + ioapic_phys); + idx++; + + ioapic_res->start = ioapic_phys; + ioapic_res->end = ioapic_phys + IO_APIC_SLOT_SIZE - 1; + ioapic_res++; + } + + probe_nr_irqs_gsi(); +} + +void __init ioapic_insert_resources(void) +{ + int i; + struct resource *r = ioapic_resources; + + if (!r) { + if (nr_ioapics > 0) + printk(KERN_ERR + "IO APIC resources couldn't be allocated.\n"); + return; + } + + for (i = 0; i < nr_ioapics; i++) { + insert_resource(&iomem_resource, r); + r++; + } +} + +int mp_find_ioapic(u32 gsi) +{ + int i = 0; + + if (nr_ioapics == 0) + return -1; + + /* Find the IOAPIC that manages this GSI. 
*/ + for (i = 0; i < nr_ioapics; i++) { + struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(i); + if ((gsi >= gsi_cfg->gsi_base) + && (gsi <= gsi_cfg->gsi_end)) + return i; + } + + printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi); + return -1; +} + +int mp_find_ioapic_pin(int ioapic, u32 gsi) +{ + struct mp_ioapic_gsi *gsi_cfg; + + if (WARN_ON(ioapic == -1)) + return -1; + + gsi_cfg = mp_ioapic_gsi_routing(ioapic); + if (WARN_ON(gsi > gsi_cfg->gsi_end)) + return -1; + + return gsi - gsi_cfg->gsi_base; +} + +static __init int bad_ioapic(unsigned long address) +{ + if (nr_ioapics >= MAX_IO_APICS) { + pr_warn("WARNING: Max # of I/O APICs (%d) exceeded (found %d), skipping\n", + MAX_IO_APICS, nr_ioapics); + return 1; + } + if (!address) { + pr_warn("WARNING: Bogus (zero) I/O APIC address found in table, skipping!\n"); + return 1; + } + return 0; +} + +static __init int bad_ioapic_register(int idx) +{ + union IO_APIC_reg_00 reg_00; + union IO_APIC_reg_01 reg_01; + union IO_APIC_reg_02 reg_02; + + reg_00.raw = io_apic_read(idx, 0); + reg_01.raw = io_apic_read(idx, 1); + reg_02.raw = io_apic_read(idx, 2); + + if (reg_00.raw == -1 && reg_01.raw == -1 && reg_02.raw == -1) { + pr_warn("I/O APIC 0x%x registers return all ones, skipping!\n", + mpc_ioapic_addr(idx)); + return 1; + } + + return 0; +} + +void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) +{ + int idx = 0; + int entries; + struct mp_ioapic_gsi *gsi_cfg; + + if (bad_ioapic(address)) + return; + + idx = nr_ioapics; + + ioapics[idx].mp_config.type = MP_IOAPIC; + ioapics[idx].mp_config.flags = MPC_APIC_USABLE; + ioapics[idx].mp_config.apicaddr = address; + + set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); + + if (bad_ioapic_register(idx)) { + clear_fixmap(FIX_IO_APIC_BASE_0 + idx); + return; + } + + ioapics[idx].mp_config.apicid = io_apic_unique_id(id); + ioapics[idx].mp_config.apicver = io_apic_get_version(idx); + + /* + * Build basic GSI lookup table to facilitate gsi->io_apic lookups + * and to prevent reprogramming of IOAPIC pins (PCI GSIs). 
+ */ + entries = io_apic_get_redir_entries(idx); + gsi_cfg = mp_ioapic_gsi_routing(idx); + gsi_cfg->gsi_base = gsi_base; + gsi_cfg->gsi_end = gsi_base + entries - 1; + + /* + * The number of IO-APIC IRQ registers (== #pins): + */ + ioapics[idx].nr_registers = entries; + + if (gsi_cfg->gsi_end >= gsi_top) + gsi_top = gsi_cfg->gsi_end + 1; + + pr_info("IOAPIC[%d]: apic_id %d, version %d, address 0x%x, GSI %d-%d\n", + idx, mpc_ioapic_id(idx), + mpc_ioapic_ver(idx), mpc_ioapic_addr(idx), + gsi_cfg->gsi_base, gsi_cfg->gsi_end); + + nr_ioapics++; +} + +/* Enable IOAPIC early just for system timer */ +void __init pre_init_apic_IRQ0(void) +{ + struct io_apic_irq_attr attr = { 0, 0, 0, 0 }; + + printk(KERN_INFO "Early APIC setup for system timer0\n"); +#ifndef CONFIG_SMP + physid_set_mask_of_physid(boot_cpu_physical_apicid, + &phys_cpu_present_map); +#endif + setup_local_APIC(); + + io_apic_setup_irq_pin(0, 0, &attr); + irq_set_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, + "edge"); +} diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c new file mode 100644 index 00000000000..62071569bd5 --- /dev/null +++ b/arch/x86/kernel/apic/ipi.c @@ -0,0 +1,166 @@ +#include <linux/cpumask.h> +#include <linux/interrupt.h> + +#include <linux/mm.h> +#include <linux/delay.h> +#include <linux/spinlock.h> +#include <linux/kernel_stat.h> +#include <linux/mc146818rtc.h> +#include <linux/cache.h> +#include <linux/cpu.h> +#include <linux/module.h> + +#include <asm/smp.h> +#include <asm/mtrr.h> +#include <asm/tlbflush.h> +#include <asm/mmu_context.h> +#include <asm/apic.h> +#include <asm/proto.h> +#include <asm/ipi.h> + +void default_send_IPI_mask_sequence_phys(const struct cpumask *mask, int vector) +{ + unsigned long query_cpu; + unsigned long flags; + + /* + * Hack. The clustered APIC addressing mode doesn't allow us to send + * to an arbitrary mask, so I do a unicast to each CPU instead. + * - mbligh + */ + local_irq_save(flags); + for_each_cpu(query_cpu, mask) { + __default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid, + query_cpu), vector, APIC_DEST_PHYSICAL); + } + local_irq_restore(flags); +} + +void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask, + int vector) +{ + unsigned int this_cpu = smp_processor_id(); + unsigned int query_cpu; + unsigned long flags; + + /* See Hack comment above */ + + local_irq_save(flags); + for_each_cpu(query_cpu, mask) { + if (query_cpu == this_cpu) + continue; + __default_send_IPI_dest_field(per_cpu(x86_cpu_to_apicid, + query_cpu), vector, APIC_DEST_PHYSICAL); + } + local_irq_restore(flags); +} + +#ifdef CONFIG_X86_32 + +void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, + int vector) +{ + unsigned long flags; + unsigned int query_cpu; + + /* + * Hack. The clustered APIC addressing mode doesn't allow us to send + * to an arbitrary mask, so I do a unicasts to each CPU instead. 
This + * should be modified to do 1 message per cluster ID - mbligh + */ + + local_irq_save(flags); + for_each_cpu(query_cpu, mask) + __default_send_IPI_dest_field( + early_per_cpu(x86_cpu_to_logical_apicid, query_cpu), + vector, apic->dest_logical); + local_irq_restore(flags); +} + +void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask, + int vector) +{ + unsigned long flags; + unsigned int query_cpu; + unsigned int this_cpu = smp_processor_id(); + + /* See Hack comment above */ + + local_irq_save(flags); + for_each_cpu(query_cpu, mask) { + if (query_cpu == this_cpu) + continue; + __default_send_IPI_dest_field( + early_per_cpu(x86_cpu_to_logical_apicid, query_cpu), + vector, apic->dest_logical); + } + local_irq_restore(flags); +} + +/* + * This is only used on smaller machines. + */ +void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector) +{ + unsigned long mask = cpumask_bits(cpumask)[0]; + unsigned long flags; + + if (!mask) + return; + + local_irq_save(flags); + WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]); + __default_send_IPI_dest_field(mask, vector, apic->dest_logical); + local_irq_restore(flags); +} + +void default_send_IPI_allbutself(int vector) +{ + /* + * if there are no other CPUs in the system then we get an APIC send + * error if we try to broadcast, thus avoid sending IPIs in this case. + */ + if (!(num_online_cpus() > 1)) + return; + + __default_local_send_IPI_allbutself(vector); +} + +void default_send_IPI_all(int vector) +{ + __default_local_send_IPI_all(vector); +} + +void default_send_IPI_self(int vector) +{ + __default_send_IPI_shortcut(APIC_DEST_SELF, vector, apic->dest_logical); +} + +/* must come after the send_IPI functions above for inlining */ +static int convert_apicid_to_cpu(int apic_id) +{ + int i; + + for_each_possible_cpu(i) { + if (per_cpu(x86_cpu_to_apicid, i) == apic_id) + return i; + } + return -1; +} + +int safe_smp_processor_id(void) +{ + int apicid, cpuid; + + if (!cpu_has_apic) + return 0; + + apicid = hard_smp_processor_id(); + if (apicid == BAD_APICID) + return 0; + + cpuid = convert_apicid_to_cpu(apicid); + + return cpuid >= 0 ? cpuid : 0; +} +#endif diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c new file mode 100644 index 00000000000..cceb352c968 --- /dev/null +++ b/arch/x86/kernel/apic/probe_32.c @@ -0,0 +1,258 @@ +/* + * Default generic APIC driver. This handles up to 8 CPUs. + * + * Copyright 2003 Andi Kleen, SuSE Labs. + * Subject to the GNU Public License, v.2 + * + * Generic x86 APIC driver probe layer. + */ +#include <linux/threads.h> +#include <linux/cpumask.h> +#include <linux/module.h> +#include <linux/string.h> +#include <linux/kernel.h> +#include <linux/ctype.h> +#include <linux/init.h> +#include <linux/errno.h> +#include <asm/fixmap.h> +#include <asm/mpspec.h> +#include <asm/apicdef.h> +#include <asm/apic.h> +#include <asm/setup.h> + +#include <linux/smp.h> +#include <asm/ipi.h> + +#include <linux/interrupt.h> +#include <asm/acpi.h> +#include <asm/e820.h> + +#ifdef CONFIG_HOTPLUG_CPU +#define DEFAULT_SEND_IPI (1) +#else +#define DEFAULT_SEND_IPI (0) +#endif + +int no_broadcast = DEFAULT_SEND_IPI; + +static __init int no_ipi_broadcast(char *str) +{ + get_option(&str, &no_broadcast); + pr_info("Using %s mode\n", + no_broadcast ? "No IPI Broadcast" : "IPI Broadcast"); + return 1; +} +__setup("no_ipi_broadcast=", no_ipi_broadcast); + +static int __init print_ipi_mode(void) +{ + pr_info("Using IPI %s mode\n", + no_broadcast ? 
"No-Shortcut" : "Shortcut"); + return 0; +} +late_initcall(print_ipi_mode); + +static int default_x86_32_early_logical_apicid(int cpu) +{ + return 1 << cpu; +} + +static void setup_apic_flat_routing(void) +{ +#ifdef CONFIG_X86_IO_APIC + printk(KERN_INFO + "Enabling APIC mode: Flat. Using %d I/O APICs\n", + nr_ioapics); +#endif +} + +/* should be called last. */ +static int probe_default(void) +{ + return 1; +} + +static struct apic apic_default = { + + .name = "default", + .probe = probe_default, + .acpi_madt_oem_check = NULL, + .apic_id_valid = default_apic_id_valid, + .apic_id_registered = default_apic_id_registered, + + .irq_delivery_mode = dest_LowestPrio, + /* logical delivery broadcast to all CPUs: */ + .irq_dest_mode = 1, + + .target_cpus = default_target_cpus, + .disable_esr = 0, + .dest_logical = APIC_DEST_LOGICAL, + .check_apicid_used = default_check_apicid_used, + .check_apicid_present = default_check_apicid_present, + + .vector_allocation_domain = flat_vector_allocation_domain, + .init_apic_ldr = default_init_apic_ldr, + + .ioapic_phys_id_map = default_ioapic_phys_id_map, + .setup_apic_routing = setup_apic_flat_routing, + .multi_timer_check = NULL, + .cpu_present_to_apicid = default_cpu_present_to_apicid, + .apicid_to_cpu_present = physid_set_mask_of_physid, + .setup_portio_remap = NULL, + .check_phys_apicid_present = default_check_phys_apicid_present, + .enable_apic_mode = NULL, + .phys_pkg_id = default_phys_pkg_id, + .mps_oem_check = NULL, + + .get_apic_id = default_get_apic_id, + .set_apic_id = NULL, + .apic_id_mask = 0x0F << 24, + + .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, + + .send_IPI_mask = default_send_IPI_mask_logical, + .send_IPI_mask_allbutself = default_send_IPI_mask_allbutself_logical, + .send_IPI_allbutself = default_send_IPI_allbutself, + .send_IPI_all = default_send_IPI_all, + .send_IPI_self = default_send_IPI_self, + + .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, + .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, + + .wait_for_init_deassert = true, + .smp_callin_clear_local_apic = NULL, + .inquire_remote_apic = default_inquire_remote_apic, + + .read = native_apic_mem_read, + .write = native_apic_mem_write, + .eoi_write = native_apic_mem_write, + .icr_read = native_apic_icr_read, + .icr_write = native_apic_icr_write, + .wait_icr_idle = native_apic_wait_icr_idle, + .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, + + .x86_32_early_logical_apicid = default_x86_32_early_logical_apicid, +}; + +apic_driver(apic_default); + +struct apic *apic = &apic_default; +EXPORT_SYMBOL_GPL(apic); + +static int cmdline_apic __initdata; +static int __init parse_apic(char *arg) +{ + struct apic **drv; + + if (!arg) + return -EINVAL; + + for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { + if (!strcmp((*drv)->name, arg)) { + apic = *drv; + cmdline_apic = 1; + return 0; + } + } + + /* Parsed again by __setup for debug/verbose */ + return 0; +} +early_param("apic", parse_apic); + +void __init default_setup_apic_routing(void) +{ + int version = apic_version[boot_cpu_physical_apicid]; + + if (num_possible_cpus() > 8) { + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_INTEL: + if (!APIC_XAPIC(version)) { + def_to_bigsmp = 0; + break; + } + /* If P4 and above fall through */ + case X86_VENDOR_AMD: + def_to_bigsmp = 1; + } + } + +#ifdef CONFIG_X86_BIGSMP + /* + * This is used to switch to bigsmp mode when + * - There is no apic= option specified by the user + * - generic_apic_probe() has chosen apic_default as the sub_arch + * - we find more 
than 8 CPUs in acpi LAPIC listing with xAPIC support + */ + + if (!cmdline_apic && apic == &apic_default) + generic_bigsmp_probe(); +#endif + + if (apic->setup_apic_routing) + apic->setup_apic_routing(); + + if (x86_platform.apic_post_init) + x86_platform.apic_post_init(); +} + +void __init generic_apic_probe(void) +{ + if (!cmdline_apic) { + struct apic **drv; + + for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { + if ((*drv)->probe()) { + apic = *drv; + break; + } + } + /* Not visible without early console */ + if (drv == __apicdrivers_end) + panic("Didn't find an APIC driver"); + } + printk(KERN_INFO "Using APIC driver %s\n", apic->name); +} + +/* These functions can switch the APIC even after the initial ->probe() */ + +int __init +generic_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid) +{ + struct apic **drv; + + for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { + if (!((*drv)->mps_oem_check)) + continue; + if (!(*drv)->mps_oem_check(mpc, oem, productid)) + continue; + + if (!cmdline_apic) { + apic = *drv; + printk(KERN_INFO "Switched to APIC driver `%s'.\n", + apic->name); + } + return 1; + } + return 0; +} + +int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ + struct apic **drv; + + for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { + if (!(*drv)->acpi_madt_oem_check) + continue; + if (!(*drv)->acpi_madt_oem_check(oem_id, oem_table_id)) + continue; + + if (!cmdline_apic) { + apic = *drv; + printk(KERN_INFO "Switched to APIC driver `%s'.\n", + apic->name); + } + return 1; + } + return 0; +} diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c new file mode 100644 index 00000000000..1793dba7a74 --- /dev/null +++ b/arch/x86/kernel/apic/probe_64.c @@ -0,0 +1,72 @@ +/* + * Copyright 2004 James Cleverdon, IBM. + * Subject to the GNU Public License, v.2 + * + * Generic APIC sub-arch probe layer. + * + * Hacked for x86-64 by James Cleverdon from i386 architecture code by + * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and + * James Cleverdon. + */ +#include <linux/threads.h> +#include <linux/cpumask.h> +#include <linux/string.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/ctype.h> +#include <linux/init.h> +#include <linux/hardirq.h> +#include <linux/dmar.h> + +#include <asm/smp.h> +#include <asm/apic.h> +#include <asm/ipi.h> +#include <asm/setup.h> + +/* + * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode. + */ +void __init default_setup_apic_routing(void) +{ + struct apic **drv; + + enable_IR_x2apic(); + + for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { + if ((*drv)->probe && (*drv)->probe()) { + if (apic != *drv) { + apic = *drv; + pr_info("Switched APIC routing to %s.\n", + apic->name); + } + break; + } + } + + if (x86_platform.apic_post_init) + x86_platform.apic_post_init(); +} + +/* Same for both flat and physical. 
*/ + +void apic_send_IPI_self(int vector) +{ + __default_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL); +} + +int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ + struct apic **drv; + + for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { + if ((*drv)->acpi_madt_oem_check(oem_id, oem_table_id)) { + if (apic != *drv) { + apic = *drv; + pr_info("Setting APIC routing to %s.\n", + apic->name); + } + return 1; + } + } + return 0; +} diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c new file mode 100644 index 00000000000..e66766bf164 --- /dev/null +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -0,0 +1,295 @@ +#include <linux/threads.h> +#include <linux/cpumask.h> +#include <linux/string.h> +#include <linux/kernel.h> +#include <linux/ctype.h> +#include <linux/dmar.h> +#include <linux/cpu.h> + +#include <asm/smp.h> +#include <asm/x2apic.h> + +static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid); +static DEFINE_PER_CPU(cpumask_var_t, cpus_in_cluster); +static DEFINE_PER_CPU(cpumask_var_t, ipi_mask); + +static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ + return x2apic_enabled(); +} + +static inline u32 x2apic_cluster(int cpu) +{ + return per_cpu(x86_cpu_to_logical_apicid, cpu) >> 16; +} + +static void +__x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest) +{ + struct cpumask *cpus_in_cluster_ptr; + struct cpumask *ipi_mask_ptr; + unsigned int cpu, this_cpu; + unsigned long flags; + u32 dest; + + x2apic_wrmsr_fence(); + + local_irq_save(flags); + + this_cpu = smp_processor_id(); + + /* + * We are to modify mask, so we need an own copy + * and be sure it's manipulated with irq off. + */ + ipi_mask_ptr = __raw_get_cpu_var(ipi_mask); + cpumask_copy(ipi_mask_ptr, mask); + + /* + * The idea is to send one IPI per cluster. + */ + for_each_cpu(cpu, ipi_mask_ptr) { + unsigned long i; + + cpus_in_cluster_ptr = per_cpu(cpus_in_cluster, cpu); + dest = 0; + + /* Collect cpus in cluster. */ + for_each_cpu_and(i, ipi_mask_ptr, cpus_in_cluster_ptr) { + if (apic_dest == APIC_DEST_ALLINC || i != this_cpu) + dest |= per_cpu(x86_cpu_to_logical_apicid, i); + } + + if (!dest) + continue; + + __x2apic_send_IPI_dest(dest, vector, apic->dest_logical); + /* + * Cluster sibling cpus should be discared now so + * we would not send IPI them second time. 
+ */ + cpumask_andnot(ipi_mask_ptr, ipi_mask_ptr, cpus_in_cluster_ptr); + } + + local_irq_restore(flags); +} + +static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector) +{ + __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLINC); +} + +static void +x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) +{ + __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT); +} + +static void x2apic_send_IPI_allbutself(int vector) +{ + __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLBUT); +} + +static void x2apic_send_IPI_all(int vector) +{ + __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC); +} + +static int +x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask, + const struct cpumask *andmask, + unsigned int *apicid) +{ + u32 dest = 0; + u16 cluster; + int i; + + for_each_cpu_and(i, cpumask, andmask) { + if (!cpumask_test_cpu(i, cpu_online_mask)) + continue; + dest = per_cpu(x86_cpu_to_logical_apicid, i); + cluster = x2apic_cluster(i); + break; + } + + if (!dest) + return -EINVAL; + + for_each_cpu_and(i, cpumask, andmask) { + if (!cpumask_test_cpu(i, cpu_online_mask)) + continue; + if (cluster != x2apic_cluster(i)) + continue; + dest |= per_cpu(x86_cpu_to_logical_apicid, i); + } + + *apicid = dest; + + return 0; +} + +static void init_x2apic_ldr(void) +{ + unsigned int this_cpu = smp_processor_id(); + unsigned int cpu; + + per_cpu(x86_cpu_to_logical_apicid, this_cpu) = apic_read(APIC_LDR); + + __cpu_set(this_cpu, per_cpu(cpus_in_cluster, this_cpu)); + for_each_online_cpu(cpu) { + if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu)) + continue; + __cpu_set(this_cpu, per_cpu(cpus_in_cluster, cpu)); + __cpu_set(cpu, per_cpu(cpus_in_cluster, this_cpu)); + } +} + + /* + * At CPU state changes, update the x2apic cluster sibling info. + */ +static int +update_clusterinfo(struct notifier_block *nfb, unsigned long action, void *hcpu) +{ + unsigned int this_cpu = (unsigned long)hcpu; + unsigned int cpu; + int err = 0; + + switch (action) { + case CPU_UP_PREPARE: + if (!zalloc_cpumask_var(&per_cpu(cpus_in_cluster, this_cpu), + GFP_KERNEL)) { + err = -ENOMEM; + } else if (!zalloc_cpumask_var(&per_cpu(ipi_mask, this_cpu), + GFP_KERNEL)) { + free_cpumask_var(per_cpu(cpus_in_cluster, this_cpu)); + err = -ENOMEM; + } + break; + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + case CPU_DEAD: + for_each_online_cpu(cpu) { + if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu)) + continue; + __cpu_clear(this_cpu, per_cpu(cpus_in_cluster, cpu)); + __cpu_clear(cpu, per_cpu(cpus_in_cluster, this_cpu)); + } + free_cpumask_var(per_cpu(cpus_in_cluster, this_cpu)); + free_cpumask_var(per_cpu(ipi_mask, this_cpu)); + break; + } + + return notifier_from_errno(err); +} + +static struct notifier_block __refdata x2apic_cpu_notifier = { + .notifier_call = update_clusterinfo, +}; + +static int x2apic_init_cpu_notifier(void) +{ + int cpu = smp_processor_id(); + + zalloc_cpumask_var(&per_cpu(cpus_in_cluster, cpu), GFP_KERNEL); + zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL); + + BUG_ON(!per_cpu(cpus_in_cluster, cpu) || !per_cpu(ipi_mask, cpu)); + + __cpu_set(cpu, per_cpu(cpus_in_cluster, cpu)); + register_hotcpu_notifier(&x2apic_cpu_notifier); + return 1; +} + +static int x2apic_cluster_probe(void) +{ + if (x2apic_mode) + return x2apic_init_cpu_notifier(); + else + return 0; +} + +static const struct cpumask *x2apic_cluster_target_cpus(void) +{ + return cpu_all_mask; +} + +/* + * Each x2apic cluster is an allocation domain. 
+ */ +static void cluster_vector_allocation_domain(int cpu, struct cpumask *retmask, + const struct cpumask *mask) +{ + /* + * To minimize vector pressure, default case of boot, device bringup + * etc will use a single cpu for the interrupt destination. + * + * On explicit migration requests coming from irqbalance etc, + * interrupts will be routed to the x2apic cluster (cluster-id + * derived from the first cpu in the mask) members specified + * in the mask. + */ + if (mask == x2apic_cluster_target_cpus()) + cpumask_copy(retmask, cpumask_of(cpu)); + else + cpumask_and(retmask, mask, per_cpu(cpus_in_cluster, cpu)); +} + +static struct apic apic_x2apic_cluster = { + + .name = "cluster x2apic", + .probe = x2apic_cluster_probe, + .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, + .apic_id_valid = x2apic_apic_id_valid, + .apic_id_registered = x2apic_apic_id_registered, + + .irq_delivery_mode = dest_LowestPrio, + .irq_dest_mode = 1, /* logical */ + + .target_cpus = x2apic_cluster_target_cpus, + .disable_esr = 0, + .dest_logical = APIC_DEST_LOGICAL, + .check_apicid_used = NULL, + .check_apicid_present = NULL, + + .vector_allocation_domain = cluster_vector_allocation_domain, + .init_apic_ldr = init_x2apic_ldr, + + .ioapic_phys_id_map = NULL, + .setup_apic_routing = NULL, + .multi_timer_check = NULL, + .cpu_present_to_apicid = default_cpu_present_to_apicid, + .apicid_to_cpu_present = NULL, + .setup_portio_remap = NULL, + .check_phys_apicid_present = default_check_phys_apicid_present, + .enable_apic_mode = NULL, + .phys_pkg_id = x2apic_phys_pkg_id, + .mps_oem_check = NULL, + + .get_apic_id = x2apic_get_apic_id, + .set_apic_id = x2apic_set_apic_id, + .apic_id_mask = 0xFFFFFFFFu, + + .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and, + + .send_IPI_mask = x2apic_send_IPI_mask, + .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself, + .send_IPI_allbutself = x2apic_send_IPI_allbutself, + .send_IPI_all = x2apic_send_IPI_all, + .send_IPI_self = x2apic_send_IPI_self, + + .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, + .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, + .wait_for_init_deassert = false, + .smp_callin_clear_local_apic = NULL, + .inquire_remote_apic = NULL, + + .read = native_apic_msr_read, + .write = native_apic_msr_write, + .eoi_write = native_apic_msr_eoi_write, + .icr_read = native_x2apic_icr_read, + .icr_write = native_x2apic_icr_write, + .wait_icr_idle = native_x2apic_wait_icr_idle, + .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle, +}; + +apic_driver(apic_x2apic_cluster); diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c new file mode 100644 index 00000000000..6d600ebf6c1 --- /dev/null +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -0,0 +1,149 @@ +#include <linux/threads.h> +#include <linux/cpumask.h> +#include <linux/string.h> +#include <linux/kernel.h> +#include <linux/ctype.h> +#include <linux/dmar.h> + +#include <asm/smp.h> +#include <asm/x2apic.h> + +int x2apic_phys; + +static struct apic apic_x2apic_phys; + +static int set_x2apic_phys_mode(char *arg) +{ + x2apic_phys = 1; + return 0; +} +early_param("x2apic_phys", set_x2apic_phys_mode); + +static bool x2apic_fadt_phys(void) +{ + if ((acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID) && + (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) { + printk(KERN_DEBUG "System requires x2apic physical mode\n"); + return true; + } + return false; +} + +static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ + return x2apic_enabled() && (x2apic_phys || 
x2apic_fadt_phys()); +} + +static void +__x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest) +{ + unsigned long query_cpu; + unsigned long this_cpu; + unsigned long flags; + + x2apic_wrmsr_fence(); + + local_irq_save(flags); + + this_cpu = smp_processor_id(); + for_each_cpu(query_cpu, mask) { + if (apic_dest == APIC_DEST_ALLBUT && this_cpu == query_cpu) + continue; + __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu), + vector, APIC_DEST_PHYSICAL); + } + local_irq_restore(flags); +} + +static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector) +{ + __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLINC); +} + +static void + x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) +{ + __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT); +} + +static void x2apic_send_IPI_allbutself(int vector) +{ + __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLBUT); +} + +static void x2apic_send_IPI_all(int vector) +{ + __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC); +} + +static void init_x2apic_ldr(void) +{ +} + +static int x2apic_phys_probe(void) +{ + if (x2apic_mode && (x2apic_phys || x2apic_fadt_phys())) + return 1; + + return apic == &apic_x2apic_phys; +} + +static struct apic apic_x2apic_phys = { + + .name = "physical x2apic", + .probe = x2apic_phys_probe, + .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, + .apic_id_valid = x2apic_apic_id_valid, + .apic_id_registered = x2apic_apic_id_registered, + + .irq_delivery_mode = dest_Fixed, + .irq_dest_mode = 0, /* physical */ + + .target_cpus = online_target_cpus, + .disable_esr = 0, + .dest_logical = 0, + .check_apicid_used = NULL, + .check_apicid_present = NULL, + + .vector_allocation_domain = default_vector_allocation_domain, + .init_apic_ldr = init_x2apic_ldr, + + .ioapic_phys_id_map = NULL, + .setup_apic_routing = NULL, + .multi_timer_check = NULL, + .cpu_present_to_apicid = default_cpu_present_to_apicid, + .apicid_to_cpu_present = NULL, + .setup_portio_remap = NULL, + .check_phys_apicid_present = default_check_phys_apicid_present, + .enable_apic_mode = NULL, + .phys_pkg_id = x2apic_phys_pkg_id, + .mps_oem_check = NULL, + + .get_apic_id = x2apic_get_apic_id, + .set_apic_id = x2apic_set_apic_id, + .apic_id_mask = 0xFFFFFFFFu, + + .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, + + .send_IPI_mask = x2apic_send_IPI_mask, + .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself, + .send_IPI_allbutself = x2apic_send_IPI_allbutself, + .send_IPI_all = x2apic_send_IPI_all, + .send_IPI_self = x2apic_send_IPI_self, + + .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, + .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, + .wait_for_init_deassert = false, + .smp_callin_clear_local_apic = NULL, + .inquire_remote_apic = NULL, + + .read = native_apic_msr_read, + .write = native_apic_msr_write, + .eoi_write = native_apic_msr_eoi_write, + .icr_read = native_x2apic_icr_read, + .icr_write = native_x2apic_icr_write, + .wait_icr_idle = native_x2apic_wait_icr_idle, + .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle, +}; + +apic_driver(apic_x2apic_phys); diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c new file mode 100644 index 00000000000..293b41df54e --- /dev/null +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -0,0 +1,1012 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. 
+ * + * SGI UV APIC functions (note: not an Intel compatible APIC) + * + * Copyright (C) 2007-2014 Silicon Graphics, Inc. All rights reserved. + */ +#include <linux/cpumask.h> +#include <linux/hardirq.h> +#include <linux/proc_fs.h> +#include <linux/threads.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/string.h> +#include <linux/ctype.h> +#include <linux/sched.h> +#include <linux/timer.h> +#include <linux/slab.h> +#include <linux/cpu.h> +#include <linux/init.h> +#include <linux/io.h> +#include <linux/pci.h> +#include <linux/kdebug.h> +#include <linux/delay.h> +#include <linux/crash_dump.h> +#include <linux/reboot.h> + +#include <asm/uv/uv_mmrs.h> +#include <asm/uv/uv_hub.h> +#include <asm/current.h> +#include <asm/pgtable.h> +#include <asm/uv/bios.h> +#include <asm/uv/uv.h> +#include <asm/apic.h> +#include <asm/ipi.h> +#include <asm/smp.h> +#include <asm/x86_init.h> +#include <asm/nmi.h> + +DEFINE_PER_CPU(int, x2apic_extra_bits); + +#define PR_DEVEL(fmt, args...) pr_devel("%s: " fmt, __func__, args) + +static enum uv_system_type uv_system_type; +static u64 gru_start_paddr, gru_end_paddr; +static u64 gru_dist_base, gru_first_node_paddr = -1LL, gru_last_node_paddr; +static u64 gru_dist_lmask, gru_dist_umask; +static union uvh_apicid uvh_apicid; +int uv_min_hub_revision_id; +EXPORT_SYMBOL_GPL(uv_min_hub_revision_id); +unsigned int uv_apicid_hibits; +EXPORT_SYMBOL_GPL(uv_apicid_hibits); + +static struct apic apic_x2apic_uv_x; + +static unsigned long __init uv_early_read_mmr(unsigned long addr) +{ + unsigned long val, *mmr; + + mmr = early_ioremap(UV_LOCAL_MMR_BASE | addr, sizeof(*mmr)); + val = *mmr; + early_iounmap(mmr, sizeof(*mmr)); + return val; +} + +static inline bool is_GRU_range(u64 start, u64 end) +{ + if (gru_dist_base) { + u64 su = start & gru_dist_umask; /* upper (incl pnode) bits */ + u64 sl = start & gru_dist_lmask; /* base offset bits */ + u64 eu = end & gru_dist_umask; + u64 el = end & gru_dist_lmask; + + /* Must reside completely within a single GRU range */ + return (sl == gru_dist_base && el == gru_dist_base && + su >= gru_first_node_paddr && + su <= gru_last_node_paddr && + eu == su); + } else { + return start >= gru_start_paddr && end <= gru_end_paddr; + } +} + +static bool uv_is_untracked_pat_range(u64 start, u64 end) +{ + return is_ISA_range(start, end) || is_GRU_range(start, end); +} + +static int __init early_get_pnodeid(void) +{ + union uvh_node_id_u node_id; + union uvh_rh_gam_config_mmr_u m_n_config; + int pnode; + + /* Currently, all blades have same revision number */ + node_id.v = uv_early_read_mmr(UVH_NODE_ID); + m_n_config.v = uv_early_read_mmr(UVH_RH_GAM_CONFIG_MMR); + uv_min_hub_revision_id = node_id.s.revision; + + switch (node_id.s.part_number) { + case UV2_HUB_PART_NUMBER: + case UV2_HUB_PART_NUMBER_X: + uv_min_hub_revision_id += UV2_HUB_REVISION_BASE - 1; + break; + case UV3_HUB_PART_NUMBER: + case UV3_HUB_PART_NUMBER_X: + uv_min_hub_revision_id += UV3_HUB_REVISION_BASE; + break; + } + + uv_hub_info->hub_revision = uv_min_hub_revision_id; + pnode = (node_id.s.node_id >> 1) & ((1 << m_n_config.s.n_skt) - 1); + return pnode; +} + +static void __init early_get_apic_pnode_shift(void) +{ + uvh_apicid.v = uv_early_read_mmr(UVH_APICID); + if (!uvh_apicid.v) + /* + * Old bios, use default value + */ + uvh_apicid.s.pnode_shift = UV_APIC_PNODE_SHIFT; +} + +/* + * Add an extra bit as dictated by bios to the destination apicid of + * interrupts potentially passing through the UV HUB. 
This prevents + * a deadlock between interrupts and IO port operations. + */ +static void __init uv_set_apicid_hibit(void) +{ + union uv1h_lb_target_physical_apic_id_mask_u apicid_mask; + + if (is_uv1_hub()) { + apicid_mask.v = + uv_early_read_mmr(UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK); + uv_apicid_hibits = + apicid_mask.s1.bit_enables & UV_APICID_HIBIT_MASK; + } +} + +static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ + int pnodeid, is_uv1, is_uv2, is_uv3; + + is_uv1 = !strcmp(oem_id, "SGI"); + is_uv2 = !strcmp(oem_id, "SGI2"); + is_uv3 = !strncmp(oem_id, "SGI3", 4); /* there are varieties of UV3 */ + if (is_uv1 || is_uv2 || is_uv3) { + uv_hub_info->hub_revision = + (is_uv1 ? UV1_HUB_REVISION_BASE : + (is_uv2 ? UV2_HUB_REVISION_BASE : + UV3_HUB_REVISION_BASE)); + pnodeid = early_get_pnodeid(); + early_get_apic_pnode_shift(); + x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range; + x86_platform.nmi_init = uv_nmi_init; + if (!strcmp(oem_table_id, "UVL")) + uv_system_type = UV_LEGACY_APIC; + else if (!strcmp(oem_table_id, "UVX")) + uv_system_type = UV_X2APIC; + else if (!strcmp(oem_table_id, "UVH")) { + __this_cpu_write(x2apic_extra_bits, + pnodeid << uvh_apicid.s.pnode_shift); + uv_system_type = UV_NON_UNIQUE_APIC; + uv_set_apicid_hibit(); + return 1; + } + } + return 0; +} + +enum uv_system_type get_uv_system_type(void) +{ + return uv_system_type; +} + +int is_uv_system(void) +{ + return uv_system_type != UV_NONE; +} +EXPORT_SYMBOL_GPL(is_uv_system); + +DEFINE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); +EXPORT_PER_CPU_SYMBOL_GPL(__uv_hub_info); + +struct uv_blade_info *uv_blade_info; +EXPORT_SYMBOL_GPL(uv_blade_info); + +short *uv_node_to_blade; +EXPORT_SYMBOL_GPL(uv_node_to_blade); + +short *uv_cpu_to_blade; +EXPORT_SYMBOL_GPL(uv_cpu_to_blade); + +short uv_possible_blades; +EXPORT_SYMBOL_GPL(uv_possible_blades); + +unsigned long sn_rtc_cycles_per_second; +EXPORT_SYMBOL(sn_rtc_cycles_per_second); + +static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip) +{ +#ifdef CONFIG_SMP + unsigned long val; + int pnode; + + pnode = uv_apicid_to_pnode(phys_apicid); + phys_apicid |= uv_apicid_hibits; + val = (1UL << UVH_IPI_INT_SEND_SHFT) | + (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) | + ((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) | + APIC_DM_INIT; + uv_write_global_mmr64(pnode, UVH_IPI_INT, val); + + val = (1UL << UVH_IPI_INT_SEND_SHFT) | + (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) | + ((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) | + APIC_DM_STARTUP; + uv_write_global_mmr64(pnode, UVH_IPI_INT, val); + + atomic_set(&init_deasserted, 1); +#endif + return 0; +} + +static void uv_send_IPI_one(int cpu, int vector) +{ + unsigned long apicid; + int pnode; + + apicid = per_cpu(x86_cpu_to_apicid, cpu); + pnode = uv_apicid_to_pnode(apicid); + uv_hub_send_ipi(pnode, apicid, vector); +} + +static void uv_send_IPI_mask(const struct cpumask *mask, int vector) +{ + unsigned int cpu; + + for_each_cpu(cpu, mask) + uv_send_IPI_one(cpu, vector); +} + +static void uv_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) +{ + unsigned int this_cpu = smp_processor_id(); + unsigned int cpu; + + for_each_cpu(cpu, mask) { + if (cpu != this_cpu) + uv_send_IPI_one(cpu, vector); + } +} + +static void uv_send_IPI_allbutself(int vector) +{ + unsigned int this_cpu = smp_processor_id(); + unsigned int cpu; + + for_each_online_cpu(cpu) { + if (cpu != this_cpu) + uv_send_IPI_one(cpu, vector); + } +} + +static void uv_send_IPI_all(int vector) +{ + 
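/* Unlike uv_send_IPI_allbutself() above, this broadcast includes the sending CPU. */ +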
uv_send_IPI_mask(cpu_online_mask, vector); +} + +static int uv_apic_id_valid(int apicid) +{ + return 1; +} + +static int uv_apic_id_registered(void) +{ + return 1; +} + +static void uv_init_apic_ldr(void) +{ +} + +static int +uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask, + const struct cpumask *andmask, + unsigned int *apicid) +{ + int unsigned cpu; + + /* + * We're using fixed IRQ delivery, can only return one phys APIC ID. + * May as well be the first. + */ + for_each_cpu_and(cpu, cpumask, andmask) { + if (cpumask_test_cpu(cpu, cpu_online_mask)) + break; + } + + if (likely(cpu < nr_cpu_ids)) { + *apicid = per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits; + return 0; + } + + return -EINVAL; +} + +static unsigned int x2apic_get_apic_id(unsigned long x) +{ + unsigned int id; + + WARN_ON(preemptible() && num_online_cpus() > 1); + id = x | __this_cpu_read(x2apic_extra_bits); + + return id; +} + +static unsigned long set_apic_id(unsigned int id) +{ + unsigned long x; + + /* maskout x2apic_extra_bits ? */ + x = id; + return x; +} + +static unsigned int uv_read_apic_id(void) +{ + + return x2apic_get_apic_id(apic_read(APIC_ID)); +} + +static int uv_phys_pkg_id(int initial_apicid, int index_msb) +{ + return uv_read_apic_id() >> index_msb; +} + +static void uv_send_IPI_self(int vector) +{ + apic_write(APIC_SELF_IPI, vector); +} + +static int uv_probe(void) +{ + return apic == &apic_x2apic_uv_x; +} + +static struct apic __refdata apic_x2apic_uv_x = { + + .name = "UV large system", + .probe = uv_probe, + .acpi_madt_oem_check = uv_acpi_madt_oem_check, + .apic_id_valid = uv_apic_id_valid, + .apic_id_registered = uv_apic_id_registered, + + .irq_delivery_mode = dest_Fixed, + .irq_dest_mode = 0, /* physical */ + + .target_cpus = online_target_cpus, + .disable_esr = 0, + .dest_logical = APIC_DEST_LOGICAL, + .check_apicid_used = NULL, + .check_apicid_present = NULL, + + .vector_allocation_domain = default_vector_allocation_domain, + .init_apic_ldr = uv_init_apic_ldr, + + .ioapic_phys_id_map = NULL, + .setup_apic_routing = NULL, + .multi_timer_check = NULL, + .cpu_present_to_apicid = default_cpu_present_to_apicid, + .apicid_to_cpu_present = NULL, + .setup_portio_remap = NULL, + .check_phys_apicid_present = default_check_phys_apicid_present, + .enable_apic_mode = NULL, + .phys_pkg_id = uv_phys_pkg_id, + .mps_oem_check = NULL, + + .get_apic_id = x2apic_get_apic_id, + .set_apic_id = set_apic_id, + .apic_id_mask = 0xFFFFFFFFu, + + .cpu_mask_to_apicid_and = uv_cpu_mask_to_apicid_and, + + .send_IPI_mask = uv_send_IPI_mask, + .send_IPI_mask_allbutself = uv_send_IPI_mask_allbutself, + .send_IPI_allbutself = uv_send_IPI_allbutself, + .send_IPI_all = uv_send_IPI_all, + .send_IPI_self = uv_send_IPI_self, + + .wakeup_secondary_cpu = uv_wakeup_secondary, + .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, + .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, + .wait_for_init_deassert = false, + .smp_callin_clear_local_apic = NULL, + .inquire_remote_apic = NULL, + + .read = native_apic_msr_read, + .write = native_apic_msr_write, + .eoi_write = native_apic_msr_eoi_write, + .icr_read = native_x2apic_icr_read, + .icr_write = native_x2apic_icr_write, + .wait_icr_idle = native_x2apic_wait_icr_idle, + .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle, +}; + +static void set_x2apic_extra_bits(int pnode) +{ + __this_cpu_write(x2apic_extra_bits, pnode << uvh_apicid.s.pnode_shift); +} + +/* + * Called on boot cpu. 
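+ * Scans uv_blade_info[] for the blade that owns the given pnode; BUG()s if no blade matches.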
+ */ +static __init int boot_pnode_to_blade(int pnode) +{ + int blade; + + for (blade = 0; blade < uv_num_possible_blades(); blade++) + if (pnode == uv_blade_info[blade].pnode) + return blade; + BUG(); +} + +struct redir_addr { + unsigned long redirect; + unsigned long alias; +}; + +#define DEST_SHIFT UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT + +static __initdata struct redir_addr redir_addrs[] = { + {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR, UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR}, + {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR, UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR}, + {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR, UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR}, +}; + +static unsigned char get_n_lshift(int m_val) +{ + union uv3h_gr0_gam_gr_config_u m_gr_config; + + if (is_uv1_hub()) + return m_val; + + if (is_uv2_hub()) + return m_val == 40 ? 40 : 39; + + m_gr_config.v = uv_read_local_mmr(UV3H_GR0_GAM_GR_CONFIG); + return m_gr_config.s3.m_skt; +} + +static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size) +{ + union uvh_rh_gam_alias210_overlay_config_2_mmr_u alias; + union uvh_rh_gam_alias210_redirect_config_2_mmr_u redirect; + int i; + + for (i = 0; i < ARRAY_SIZE(redir_addrs); i++) { + alias.v = uv_read_local_mmr(redir_addrs[i].alias); + if (alias.s.enable && alias.s.base == 0) { + *size = (1UL << alias.s.m_alias); + redirect.v = uv_read_local_mmr(redir_addrs[i].redirect); + *base = (unsigned long)redirect.s.dest_base << DEST_SHIFT; + return; + } + } + *base = *size = 0; +} + +enum map_type {map_wb, map_uc}; + +static __init void map_high(char *id, unsigned long base, int pshift, + int bshift, int max_pnode, enum map_type map_type) +{ + unsigned long bytes, paddr; + + paddr = base << pshift; + bytes = (1UL << bshift) * (max_pnode + 1); + if (!paddr) { + pr_info("UV: Map %s_HI base address NULL\n", id); + return; + } + pr_debug("UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr, paddr + bytes); + if (map_type == map_uc) + init_extra_mapping_uc(paddr, bytes); + else + init_extra_mapping_wb(paddr, bytes); +} + +static __init void map_gru_distributed(unsigned long c) +{ + union uvh_rh_gam_gru_overlay_config_mmr_u gru; + u64 paddr; + unsigned long bytes; + int nid; + + gru.v = c; + /* only base bits 42:28 relevant in dist mode */ + gru_dist_base = gru.v & 0x000007fff0000000UL; + if (!gru_dist_base) { + pr_info("UV: Map GRU_DIST base address NULL\n"); + return; + } + bytes = 1UL << UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT; + gru_dist_lmask = ((1UL << uv_hub_info->m_val) - 1) & ~(bytes - 1); + gru_dist_umask = ~((1UL << uv_hub_info->m_val) - 1); + gru_dist_base &= gru_dist_lmask; /* Clear bits above M */ + for_each_online_node(nid) { + paddr = ((u64)uv_node_to_pnode(nid) << uv_hub_info->m_val) | + gru_dist_base; + init_extra_mapping_wb(paddr, bytes); + gru_first_node_paddr = min(paddr, gru_first_node_paddr); + gru_last_node_paddr = max(paddr, gru_last_node_paddr); + } + /* Save upper (63:M) bits of address only for is_GRU_range */ + gru_first_node_paddr &= gru_dist_umask; + gru_last_node_paddr &= gru_dist_umask; + pr_debug("UV: Map GRU_DIST base 0x%016llx 0x%016llx - 0x%016llx\n", + gru_dist_base, gru_first_node_paddr, gru_last_node_paddr); +} + +static __init void map_gru_high(int max_pnode) +{ + union uvh_rh_gam_gru_overlay_config_mmr_u gru; + int shift = UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT; + + gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR); + if (!gru.s.enable) { + pr_info("UV: GRU disabled\n"); + return; + } + + if (is_uv3_hub() && 
gru.s3.mode) { + map_gru_distributed(gru.v); + return; + } + map_high("GRU", gru.s.base, shift, shift, max_pnode, map_wb); + gru_start_paddr = ((u64)gru.s.base << shift); + gru_end_paddr = gru_start_paddr + (1UL << shift) * (max_pnode + 1); +} + +static __init void map_mmr_high(int max_pnode) +{ + union uvh_rh_gam_mmr_overlay_config_mmr_u mmr; + int shift = UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT; + + mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR); + if (mmr.s.enable) + map_high("MMR", mmr.s.base, shift, shift, max_pnode, map_uc); + else + pr_info("UV: MMR disabled\n"); +} + +/* + * This commonality works because both 0 & 1 versions of the MMIOH OVERLAY + * and REDIRECT MMR regs are exactly the same on UV3. + */ +struct mmioh_config { + unsigned long overlay; + unsigned long redirect; + char *id; +}; + +static __initdata struct mmioh_config mmiohs[] = { + { + UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR, + UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR, + "MMIOH0" + }, + { + UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR, + UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR, + "MMIOH1" + }, +}; + +static __init void map_mmioh_high_uv3(int index, int min_pnode, int max_pnode) +{ + union uv3h_rh_gam_mmioh_overlay_config0_mmr_u overlay; + unsigned long mmr; + unsigned long base; + int i, n, shift, m_io, max_io; + int nasid, lnasid, fi, li; + char *id; + + id = mmiohs[index].id; + overlay.v = uv_read_local_mmr(mmiohs[index].overlay); + pr_info("UV: %s overlay 0x%lx base:0x%x m_io:%d\n", + id, overlay.v, overlay.s3.base, overlay.s3.m_io); + if (!overlay.s3.enable) { + pr_info("UV: %s disabled\n", id); + return; + } + + shift = UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_BASE_SHFT; + base = (unsigned long)overlay.s3.base; + m_io = overlay.s3.m_io; + mmr = mmiohs[index].redirect; + n = UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH; + min_pnode *= 2; /* convert to NASID */ + max_pnode *= 2; + max_io = lnasid = fi = li = -1; + + for (i = 0; i < n; i++) { + union uv3h_rh_gam_mmioh_redirect_config0_mmr_u redirect; + + redirect.v = uv_read_local_mmr(mmr + i * 8); + nasid = redirect.s3.nasid; + if (nasid < min_pnode || max_pnode < nasid) + nasid = -1; /* invalid NASID */ + + if (nasid == lnasid) { + li = i; + if (i != n-1) /* last entry check */ + continue; + } + + /* check if we have a cached (or last) redirect to print */ + if (lnasid != -1 || (i == n-1 && nasid != -1)) { + unsigned long addr1, addr2; + int f, l; + + if (lnasid == -1) { + f = l = i; + lnasid = nasid; + } else { + f = fi; + l = li; + } + addr1 = (base << shift) + + f * (unsigned long)(1 << m_io); + addr2 = (base << shift) + + (l + 1) * (unsigned long)(1 << m_io); + pr_info("UV: %s[%03d..%03d] NASID 0x%04x ADDR 0x%016lx - 0x%016lx\n", + id, fi, li, lnasid, addr1, addr2); + if (max_io < l) + max_io = l; + } + fi = li = i; + lnasid = nasid; + } + + pr_info("UV: %s base:0x%lx shift:%d M_IO:%d MAX_IO:%d\n", + id, base, shift, m_io, max_io); + + if (max_io >= 0) + map_high(id, base, shift, m_io, max_io, map_uc); +} + +static __init void map_mmioh_high(int min_pnode, int max_pnode) +{ + union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh; + unsigned long mmr, base; + int shift, enable, m_io, n_io; + + if (is_uv3_hub()) { + /* Map both MMIOH Regions */ + map_mmioh_high_uv3(0, min_pnode, max_pnode); + map_mmioh_high_uv3(1, min_pnode, max_pnode); + return; + } + + if (is_uv1_hub()) { + mmr = UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR; + shift = UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT; + mmioh.v = uv_read_local_mmr(mmr); + enable = !!mmioh.s1.enable; + base = 
mmioh.s1.base; + m_io = mmioh.s1.m_io; + n_io = mmioh.s1.n_io; + } else if (is_uv2_hub()) { + mmr = UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR; + shift = UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT; + mmioh.v = uv_read_local_mmr(mmr); + enable = !!mmioh.s2.enable; + base = mmioh.s2.base; + m_io = mmioh.s2.m_io; + n_io = mmioh.s2.n_io; + } else + return; + + if (enable) { + max_pnode &= (1 << n_io) - 1; + pr_info( + "UV: base:0x%lx shift:%d N_IO:%d M_IO:%d max_pnode:0x%x\n", + base, shift, m_io, n_io, max_pnode); + map_high("MMIOH", base, shift, m_io, max_pnode, map_uc); + } else { + pr_info("UV: MMIOH disabled\n"); + } +} + +static __init void map_low_mmrs(void) +{ + init_extra_mapping_uc(UV_GLOBAL_MMR32_BASE, UV_GLOBAL_MMR32_SIZE); + init_extra_mapping_uc(UV_LOCAL_MMR_BASE, UV_LOCAL_MMR_SIZE); +} + +static __init void uv_rtc_init(void) +{ + long status; + u64 ticks_per_sec; + + status = uv_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK, + &ticks_per_sec); + if (status != BIOS_STATUS_SUCCESS || ticks_per_sec < 100000) { + printk(KERN_WARNING + "unable to determine platform RTC clock frequency, " + "guessing.\n"); + /* BIOS gives wrong value for clock freq. so guess */ + sn_rtc_cycles_per_second = 1000000000000UL / 30000UL; + } else + sn_rtc_cycles_per_second = ticks_per_sec; +} + +/* + * percpu heartbeat timer + */ +static void uv_heartbeat(unsigned long ignored) +{ + struct timer_list *timer = &uv_hub_info->scir.timer; + unsigned char bits = uv_hub_info->scir.state; + + /* flip heartbeat bit */ + bits ^= SCIR_CPU_HEARTBEAT; + + /* is this cpu idle? */ + if (idle_cpu(raw_smp_processor_id())) + bits &= ~SCIR_CPU_ACTIVITY; + else + bits |= SCIR_CPU_ACTIVITY; + + /* update system controller interface reg */ + uv_set_scir_bits(bits); + + /* enable next timer period */ + mod_timer_pinned(timer, jiffies + SCIR_CPU_HB_INTERVAL); +} + +static void uv_heartbeat_enable(int cpu) +{ + while (!uv_cpu_hub_info(cpu)->scir.enabled) { + struct timer_list *timer = &uv_cpu_hub_info(cpu)->scir.timer; + + uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY); + setup_timer(timer, uv_heartbeat, cpu); + timer->expires = jiffies + SCIR_CPU_HB_INTERVAL; + add_timer_on(timer, cpu); + uv_cpu_hub_info(cpu)->scir.enabled = 1; + + /* also ensure that boot cpu is enabled */ + cpu = 0; + } +} + +#ifdef CONFIG_HOTPLUG_CPU +static void uv_heartbeat_disable(int cpu) +{ + if (uv_cpu_hub_info(cpu)->scir.enabled) { + uv_cpu_hub_info(cpu)->scir.enabled = 0; + del_timer(&uv_cpu_hub_info(cpu)->scir.timer); + } + uv_set_cpu_scir_bits(cpu, 0xff); +} + +/* + * cpu hotplug notifier + */ +static int uv_scir_cpu_notify(struct notifier_block *self, unsigned long action, + void *hcpu) +{ + long cpu = (long)hcpu; + + switch (action) { + case CPU_ONLINE: + uv_heartbeat_enable(cpu); + break; + case CPU_DOWN_PREPARE: + uv_heartbeat_disable(cpu); + break; + default: + break; + } + return NOTIFY_OK; +} + +static __init void uv_scir_register_cpu_notifier(void) +{ + hotcpu_notifier(uv_scir_cpu_notify, 0); +} + +#else /* !CONFIG_HOTPLUG_CPU */ + +static __init void uv_scir_register_cpu_notifier(void) +{ +} + +static __init int uv_init_heartbeat(void) +{ + int cpu; + + if (is_uv_system()) + for_each_online_cpu(cpu) + uv_heartbeat_enable(cpu); + return 0; +} + +late_initcall(uv_init_heartbeat); + +#endif /* !CONFIG_HOTPLUG_CPU */ + +/* Direct Legacy VGA I/O traffic to designated IOH */ +int uv_set_vga_state(struct pci_dev *pdev, bool decode, + unsigned int command_bits, u32 flags) +{ + int domain, bus, rc; + + PR_DEVEL("devfn %x decode %d cmd 
%x flags %d\n", + pdev->devfn, decode, command_bits, flags); + + if (!(flags & PCI_VGA_STATE_CHANGE_BRIDGE)) + return 0; + + if ((command_bits & PCI_COMMAND_IO) == 0) + return 0; + + domain = pci_domain_nr(pdev->bus); + bus = pdev->bus->number; + + rc = uv_bios_set_legacy_vga_target(decode, domain, bus); + PR_DEVEL("vga decode %d %x:%x, rc: %d\n", decode, domain, bus, rc); + + return rc; +} + +/* + * Called on each cpu to initialize the per_cpu UV data area. + * FIXME: hotplug not supported yet + */ +void uv_cpu_init(void) +{ + /* CPU 0 initilization will be done via uv_system_init. */ + if (!uv_blade_info) + return; + + uv_blade_info[uv_numa_blade_id()].nr_online_cpus++; + + if (get_uv_system_type() == UV_NON_UNIQUE_APIC) + set_x2apic_extra_bits(uv_hub_info->pnode); +} + +void __init uv_system_init(void) +{ + union uvh_rh_gam_config_mmr_u m_n_config; + union uvh_node_id_u node_id; + unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size; + int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val; + int gnode_extra, min_pnode = 999999, max_pnode = -1; + unsigned long mmr_base, present, paddr; + unsigned short pnode_mask; + unsigned char n_lshift; + char *hub = (is_uv1_hub() ? "UV1" : + (is_uv2_hub() ? "UV2" : + "UV3")); + + pr_info("UV: Found %s hub\n", hub); + map_low_mmrs(); + + m_n_config.v = uv_read_local_mmr(UVH_RH_GAM_CONFIG_MMR ); + m_val = m_n_config.s.m_skt; + n_val = m_n_config.s.n_skt; + pnode_mask = (1 << n_val) - 1; + n_lshift = get_n_lshift(m_val); + mmr_base = + uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) & + ~UV_MMR_ENABLE; + + node_id.v = uv_read_local_mmr(UVH_NODE_ID); + gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1; + gnode_upper = ((unsigned long)gnode_extra << m_val); + pr_info("UV: N:%d M:%d pnode_mask:0x%x gnode_upper/extra:0x%lx/0x%x n_lshift 0x%x\n", + n_val, m_val, pnode_mask, gnode_upper, gnode_extra, + n_lshift); + + pr_info("UV: global MMR base 0x%lx\n", mmr_base); + + for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) + uv_possible_blades += + hweight64(uv_read_local_mmr( UVH_NODE_PRESENT_TABLE + i * 8)); + + /* uv_num_possible_blades() is really the hub count */ + pr_info("UV: Found %d blades, %d hubs\n", + is_uv1_hub() ? 
uv_num_possible_blades() : + (uv_num_possible_blades() + 1) / 2, + uv_num_possible_blades()); + + bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades(); + uv_blade_info = kzalloc(bytes, GFP_KERNEL); + BUG_ON(!uv_blade_info); + + for (blade = 0; blade < uv_num_possible_blades(); blade++) + uv_blade_info[blade].memory_nid = -1; + + get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size); + + bytes = sizeof(uv_node_to_blade[0]) * num_possible_nodes(); + uv_node_to_blade = kmalloc(bytes, GFP_KERNEL); + BUG_ON(!uv_node_to_blade); + memset(uv_node_to_blade, 255, bytes); + + bytes = sizeof(uv_cpu_to_blade[0]) * num_possible_cpus(); + uv_cpu_to_blade = kmalloc(bytes, GFP_KERNEL); + BUG_ON(!uv_cpu_to_blade); + memset(uv_cpu_to_blade, 255, bytes); + + blade = 0; + for (i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) { + present = uv_read_local_mmr(UVH_NODE_PRESENT_TABLE + i * 8); + for (j = 0; j < 64; j++) { + if (!test_bit(j, &present)) + continue; + pnode = (i * 64 + j) & pnode_mask; + uv_blade_info[blade].pnode = pnode; + uv_blade_info[blade].nr_possible_cpus = 0; + uv_blade_info[blade].nr_online_cpus = 0; + spin_lock_init(&uv_blade_info[blade].nmi_lock); + min_pnode = min(pnode, min_pnode); + max_pnode = max(pnode, max_pnode); + blade++; + } + } + + uv_bios_init(); + uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, &sn_coherency_id, + &sn_region_size, &system_serial_number); + uv_rtc_init(); + + for_each_present_cpu(cpu) { + int apicid = per_cpu(x86_cpu_to_apicid, cpu); + + nid = cpu_to_node(cpu); + /* + * apic_pnode_shift must be set before calling uv_apicid_to_pnode(); + */ + uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask; + uv_cpu_hub_info(cpu)->apic_pnode_shift = uvh_apicid.s.pnode_shift; + uv_cpu_hub_info(cpu)->hub_revision = uv_hub_info->hub_revision; + + uv_cpu_hub_info(cpu)->m_shift = 64 - m_val; + uv_cpu_hub_info(cpu)->n_lshift = n_lshift; + + pnode = uv_apicid_to_pnode(apicid); + blade = boot_pnode_to_blade(pnode); + lcpu = uv_blade_info[blade].nr_possible_cpus; + uv_blade_info[blade].nr_possible_cpus++; + + /* Any node on the blade, else will contain -1. 
*/ + uv_blade_info[blade].memory_nid = nid; + + uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base; + uv_cpu_hub_info(cpu)->lowmem_remap_top = lowmem_redir_size; + uv_cpu_hub_info(cpu)->m_val = m_val; + uv_cpu_hub_info(cpu)->n_val = n_val; + uv_cpu_hub_info(cpu)->numa_blade_id = blade; + uv_cpu_hub_info(cpu)->blade_processor_id = lcpu; + uv_cpu_hub_info(cpu)->pnode = pnode; + uv_cpu_hub_info(cpu)->gpa_mask = (1UL << (m_val + n_val)) - 1; + uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; + uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra; + uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; + uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id; + uv_cpu_hub_info(cpu)->scir.offset = uv_scir_offset(apicid); + uv_node_to_blade[nid] = blade; + uv_cpu_to_blade[cpu] = blade; + } + + /* Add blade/pnode info for nodes without cpus */ + for_each_online_node(nid) { + if (uv_node_to_blade[nid] >= 0) + continue; + paddr = node_start_pfn(nid) << PAGE_SHIFT; + pnode = uv_gpa_to_pnode(uv_soc_phys_ram_to_gpa(paddr)); + blade = boot_pnode_to_blade(pnode); + uv_node_to_blade[nid] = blade; + } + + map_gru_high(max_pnode); + map_mmr_high(max_pnode); + map_mmioh_high(min_pnode, max_pnode); + + uv_nmi_setup(); + uv_cpu_init(); + uv_scir_register_cpu_notifier(); + proc_mkdir("sgi_uv", NULL); + + /* register Legacy VGA I/O redirection handler */ + pci_register_set_vga_state(uv_set_vga_state); + + /* + * For a kdump kernel the reset must be BOOT_ACPI, not BOOT_EFI, as + * EFI is not enabled in the kdump kernel. + */ + if (is_kdump_kernel()) + reboot_type = BOOT_ACPI; +} + +apic_driver(apic_x2apic_uv_x); diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c new file mode 100644 index 00000000000..58487445141 --- /dev/null +++ b/arch/x86/kernel/apm_32.c @@ -0,0 +1,2449 @@ +/* -*- linux-c -*- + * APM BIOS driver for Linux + * Copyright 1994-2001 Stephen Rothwell (sfr@canb.auug.org.au) + * + * Initial development of this driver was funded by NEC Australia P/L + * and NEC Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * October 1995, Rik Faith (faith@cs.unc.edu): + * Minor enhancements and updates (to the patch set) for 1.3.x + * Documentation + * January 1996, Rik Faith (faith@cs.unc.edu): + * Make /proc/apm easy to format (bump driver version) + * March 1996, Rik Faith (faith@cs.unc.edu): + * Prohibit APM BIOS calls unless apm_enabled. 
+ * (Thanks to Ulrich Windl <Ulrich.Windl@rz.uni-regensburg.de>) + * April 1996, Stephen Rothwell (sfr@canb.auug.org.au) + * Version 1.0 and 1.1 + * May 1996, Version 1.2 + * Feb 1998, Version 1.3 + * Feb 1998, Version 1.4 + * Aug 1998, Version 1.5 + * Sep 1998, Version 1.6 + * Nov 1998, Version 1.7 + * Jan 1999, Version 1.8 + * Jan 1999, Version 1.9 + * Oct 1999, Version 1.10 + * Nov 1999, Version 1.11 + * Jan 2000, Version 1.12 + * Feb 2000, Version 1.13 + * Nov 2000, Version 1.14 + * Oct 2001, Version 1.15 + * Jan 2002, Version 1.16 + * Oct 2002, Version 1.16ac + * + * History: + * 0.6b: first version in official kernel, Linux 1.3.46 + * 0.7: changed /proc/apm format, Linux 1.3.58 + * 0.8: fixed gcc 2.7.[12] compilation problems, Linux 1.3.59 + * 0.9: only call bios if bios is present, Linux 1.3.72 + * 1.0: use fixed device number, consolidate /proc/apm into this file, + * Linux 1.3.85 + * 1.1: support user-space standby and suspend, power off after system + * halted, Linux 1.3.98 + * 1.2: When resetting RTC after resume, take care so that the time + * is only incorrect by 30-60mS (vs. 1S previously) (Gabor J. Toth + * <jtoth@princeton.edu>); improve interaction between + * screen-blanking and gpm (Stephen Rothwell); Linux 1.99.4 + * 1.2a: Simple change to stop mysterious bug reports with SMP; also added + * levels to the printk calls. APM is not defined for SMP machines. + * The new replacement for it is, but Linux doesn't yet support this. + * Alan Cox Linux 2.1.55 + * 1.3: Set up a valid data descriptor 0x40 for buggy BIOS's + * 1.4: Upgraded to support APM 1.2. Integrated ThinkPad suspend patch by + * Dean Gaudet <dgaudet@arctic.org>. + * C. Scott Ananian <cananian@alumni.princeton.edu> Linux 2.1.87 + * 1.5: Fix segment register reloading (in case of bad segments saved + * across BIOS call). + * Stephen Rothwell + * 1.6: Cope with compiler/assembler differences. + * Only try to turn off the first display device. + * Fix OOPS at power off with no APM BIOS by Jan Echternach + * <echter@informatik.uni-rostock.de> + * Stephen Rothwell + * 1.7: Modify driver's cached copy of the disabled/disengaged flags + * to reflect current state of APM BIOS. + * Chris Rankin <rankinc@bellsouth.net> + * Reset interrupt 0 timer to 100Hz after suspend + * Chad Miller <cmiller@surfsouth.com> + * Add CONFIG_APM_IGNORE_SUSPEND_BOUNCE + * Richard Gooch <rgooch@atnf.csiro.au> + * Allow boot time disabling of APM + * Make boot messages far less verbose by default + * Make asm safer + * Stephen Rothwell + * 1.8: Add CONFIG_APM_RTC_IS_GMT + * Richard Gooch <rgooch@atnf.csiro.au> + * change APM_NOINTS to CONFIG_APM_ALLOW_INTS + * remove dependency on CONFIG_PROC_FS + * Stephen Rothwell + * 1.9: Fix small typo. <laslo@wodip.opole.pl> + * Try to cope with BIOS's that need to have all display + * devices blanked and not just the first one. + * Ross Paterson <ross@soi.city.ac.uk> + * Fix segment limit setting; it has always been wrong, as + * the segments needed to have byte granularity. + * Mark a few things __init. + * Add hack to allow power off of SMP systems by popular request. + * Use CONFIG_SMP instead of __SMP__ + * Ignore BOUNCES for three seconds. + * Stephen Rothwell + * 1.10: Fix for Thinkpad return code. + * Merge 2.2 and 2.3 drivers. + * Remove APM dependencies in arch/i386/kernel/process.c + * Remove APM dependencies in drivers/char/sysrq.c + * Reset time across standby. + * Allow more initialisation on SMP. + * Remove CONFIG_APM_POWER_OFF and make it boot time + * configurable (default on). 
+ * Make debug only a boot time parameter (remove APM_DEBUG). + * Try to blank all devices on any error. + * 1.11: Remove APM dependencies in drivers/char/console.c + * Check nr_running to detect if we are idle (from + * Borislav Deianov <borislav@lix.polytechnique.fr>) + * Fix for bioses that don't zero the top part of the + * entrypoint offset (Mario Sitta <sitta@al.unipmn.it>) + * (reported by Panos Katsaloulis <teras@writeme.com>). + * Real mode power off patch (Walter Hofmann + * <Walter.Hofmann@physik.stud.uni-erlangen.de>). + * 1.12: Remove CONFIG_SMP as the compiler will optimize + * the code away anyway (smp_num_cpus == 1 in UP) + * noted by Artur Skawina <skawina@geocities.com>. + * Make power off under SMP work again. + * Fix thinko with initial engaging of BIOS. + * Make sure power off only happens on CPU 0 + * (Paul "Rusty" Russell <rusty@rustcorp.com.au>). + * Do error notification to user mode if BIOS calls fail. + * Move entrypoint offset fix to ...boot/setup.S + * where it belongs (Cosmos <gis88564@cis.nctu.edu.tw>). + * Remove smp-power-off. SMP users must now specify + * "apm=power-off" on the kernel command line. Suggested + * by Jim Avera <jima@hal.com>, modified by Alan Cox + * <alan@lxorguk.ukuu.org.uk>. + * Register the /proc/apm entry even on SMP so that + * scripts that check for it before doing power off + * work (Jim Avera <jima@hal.com>). + * 1.13: Changes for new pm_ interfaces (Andy Henroid + * <andy_henroid@yahoo.com>). + * Modularize the code. + * Fix the Thinkpad (again) :-( (CONFIG_APM_IGNORE_MULTIPLE_SUSPENDS + * is now the way life works). + * Fix thinko in suspend() (wrong return). + * Notify drivers on critical suspend. + * Make kapmd absorb more idle time (Pavel Machek <pavel@ucw.cz> + * modified by sfr). + * Disable interrupts while we are suspended (Andy Henroid + * <andy_henroid@yahoo.com> fixed by sfr). + * Make power off work on SMP again (Tony Hoyle + * <tmh@magenta-logic.com> and <zlatko@iskon.hr>) modified by sfr. + * Remove CONFIG_APM_SUSPEND_BOUNCE. The bounce ignore + * interval is now configurable. + * 1.14: Make connection version persist across module unload/load. + * Enable and engage power management earlier. + * Disengage power management on module unload. + * Changed to use the sysrq-register hack for registering the + * power off function called by magic sysrq based upon discussions + * in irc://irc.openprojects.net/#kernelnewbies + * (Crutcher Dunnavant <crutcher+kernel@datastacks.com>). + * Make CONFIG_APM_REAL_MODE_POWER_OFF run time configurable. + * (Arjan van de Ven <arjanv@redhat.com>) modified by sfr. + * Work around byte swap bug in one of the Vaio's BIOS's + * (Marc Boucher <marc@mbsi.ca>). + * Exposed the disable flag to dmi so that we can handle known + * broken APM (Alan Cox <alan@lxorguk.ukuu.org.uk>). + * 1.14ac: If the BIOS says "I slowed the CPU down" then don't spin + * calling it - instead idle. (Alan Cox <alan@lxorguk.ukuu.org.uk>) + * If an APM idle fails log it and idle sensibly + * 1.15: Don't queue events to clients who open the device O_WRONLY. + * Don't expect replies from clients who open the device O_RDONLY. + * (Idea from Thomas Hood) + * Minor waitqueue cleanups. (John Fremlin <chief@bandits.org>) + * 1.16: Fix idle calling. (Andreas Steinmetz <ast@domdv.de> et al.) + * Notify listeners of standby or suspend events before notifying + * drivers. Return EBUSY to ioctl() if suspend is rejected. 
+ * (Russell King <rmk@arm.linux.org.uk> and Thomas Hood) + * Ignore first resume after we generate our own resume event + * after a suspend (Thomas Hood) + * Daemonize now gets rid of our controlling terminal (sfr). + * CONFIG_APM_CPU_IDLE now just affects the default value of + * idle_threshold (sfr). + * Change name of kernel apm daemon (as it no longer idles) (sfr). + * 1.16ac: Fix up SMP support somewhat. You can now force SMP on and we + * make _all_ APM calls on the CPU#0. Fix unsafe sign bug. + * TODO: determine if its "boot CPU" or "CPU0" we want to lock to. + * + * APM 1.1 Reference: + * + * Intel Corporation, Microsoft Corporation. Advanced Power Management + * (APM) BIOS Interface Specification, Revision 1.1, September 1993. + * Intel Order Number 241704-001. Microsoft Part Number 781-110-X01. + * + * [This document is available free from Intel by calling 800.628.8686 (fax + * 916.356.6100) or 800.548.4725; or from + * http://www.microsoft.com/whdc/archive/amp_12.mspx It is also + * available from Microsoft by calling 206.882.8080.] + * + * APM 1.2 Reference: + * Intel Corporation, Microsoft Corporation. Advanced Power Management + * (APM) BIOS Interface Specification, Revision 1.2, February 1996. + * + * [This document is available from Microsoft at: + * http://www.microsoft.com/whdc/archive/amp_12.mspx] + */ + +#define pr_fmt(fmt) "apm: " fmt + +#include <linux/module.h> + +#include <linux/poll.h> +#include <linux/types.h> +#include <linux/stddef.h> +#include <linux/timer.h> +#include <linux/fcntl.h> +#include <linux/slab.h> +#include <linux/stat.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/miscdevice.h> +#include <linux/apm_bios.h> +#include <linux/init.h> +#include <linux/time.h> +#include <linux/sched.h> +#include <linux/pm.h> +#include <linux/capability.h> +#include <linux/device.h> +#include <linux/kernel.h> +#include <linux/freezer.h> +#include <linux/smp.h> +#include <linux/dmi.h> +#include <linux/suspend.h> +#include <linux/kthread.h> +#include <linux/jiffies.h> +#include <linux/acpi.h> +#include <linux/syscore_ops.h> +#include <linux/i8253.h> +#include <linux/cpuidle.h> + +#include <asm/uaccess.h> +#include <asm/desc.h> +#include <asm/olpc.h> +#include <asm/paravirt.h> +#include <asm/reboot.h> + +#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT) +extern int (*console_blank_hook)(int); +#endif + +/* + * The apm_bios device is one of the misc char devices. + * This is its minor number. + */ +#define APM_MINOR_DEV 134 + +/* + * Various options can be changed at boot time as follows: + * (We allow underscores for compatibility with the modules code) + * apm=on/off enable/disable APM + * [no-]allow[-_]ints allow interrupts during BIOS calls + * [no-]broken[-_]psr BIOS has a broken GetPowerStatus call + * [no-]realmode[-_]power[-_]off switch to real mode before + * powering off + * [no-]debug log some debugging messages + * [no-]power[-_]off power off on shutdown + * [no-]smp Use apm even on an SMP box + * bounce[-_]interval=<n> number of ticks to ignore suspend + * bounces + * idle[-_]threshold=<n> System idle percentage above which to + * make APM BIOS idle calls. Set it to + * 100 to disable. + * idle[-_]period=<n> Period (in 1/100s of a second) over + * which the idle percentage is + * calculated. 
+ */ + +/* KNOWN PROBLEM MACHINES: + * + * U: TI 4000M TravelMate: BIOS is *NOT* APM compliant + * [Confirmed by TI representative] + * ?: ACER 486DX4/75: uses dseg 0040, in violation of APM specification + * [Confirmed by BIOS disassembly] + * [This may work now ...] + * P: Toshiba 1950S: battery life information only gets updated after resume + * P: Midwest Micro Soundbook Elite DX2/66 monochrome: screen blanking + * broken in BIOS [Reported by Garst R. Reese <reese@isn.net>] + * ?: AcerNote-950: oops on reading /proc/apm - workaround is a WIP + * Neale Banks <neale@lowendale.com.au> December 2000 + * + * Legend: U = unusable with APM patches + * P = partially usable with APM patches + */ + +/* + * Define as 1 to make the driver always call the APM BIOS busy + * routine even if the clock was not reported as slowed by the + * idle routine. Otherwise, define as 0. + */ +#define ALWAYS_CALL_BUSY 1 + +/* + * Define to make the APM BIOS calls zero all data segment registers (so + * that an incorrect BIOS implementation will cause a kernel panic if it + * tries to write to arbitrary memory). + */ +#define APM_ZERO_SEGS + +#include <asm/apm.h> + +/* + * Define to re-initialize the interrupt 0 timer to 100 Hz after a suspend. + * This patched by Chad Miller <cmiller@surfsouth.com>, original code by + * David Chen <chen@ctpa04.mit.edu> + */ +#undef INIT_TIMER_AFTER_SUSPEND + +#ifdef INIT_TIMER_AFTER_SUSPEND +#include <linux/timex.h> +#include <asm/io.h> +#include <linux/delay.h> +#endif + +/* + * Need to poll the APM BIOS every second + */ +#define APM_CHECK_TIMEOUT (HZ) + +/* + * Ignore suspend events for this amount of time after a resume + */ +#define DEFAULT_BOUNCE_INTERVAL (3 * HZ) + +/* + * Maximum number of events stored + */ +#define APM_MAX_EVENTS 20 + +/* + * The per-file APM data + */ +struct apm_user { + int magic; + struct apm_user *next; + unsigned int suser: 1; + unsigned int writer: 1; + unsigned int reader: 1; + unsigned int suspend_wait: 1; + int suspend_result; + int suspends_pending; + int standbys_pending; + int suspends_read; + int standbys_read; + int event_head; + int event_tail; + apm_event_t events[APM_MAX_EVENTS]; +}; + +/* + * The magic number in apm_user + */ +#define APM_BIOS_MAGIC 0x4101 + +/* + * idle percentage above which bios idle calls are done + */ +#ifdef CONFIG_APM_CPU_IDLE +#define DEFAULT_IDLE_THRESHOLD 95 +#else +#define DEFAULT_IDLE_THRESHOLD 100 +#endif +#define DEFAULT_IDLE_PERIOD (100 / 3) + +static int apm_cpu_idle(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int index); + +static struct cpuidle_driver apm_idle_driver = { + .name = "apm_idle", + .owner = THIS_MODULE, + .states = { + { /* entry 0 is for polling */ }, + { /* entry 1 is for APM idle */ + .name = "APM", + .desc = "APM idle", + .flags = CPUIDLE_FLAG_TIME_VALID, + .exit_latency = 250, /* WAG */ + .target_residency = 500, /* WAG */ + .enter = &apm_cpu_idle + }, + }, + .state_count = 2, +}; + +static struct cpuidle_device apm_cpuidle_device; + +/* + * Local variables + */ +__visible struct { + unsigned long offset; + unsigned short segment; +} apm_bios_entry; +static int clock_slowed; +static int idle_threshold __read_mostly = DEFAULT_IDLE_THRESHOLD; +static int idle_period __read_mostly = DEFAULT_IDLE_PERIOD; +static int suspends_pending; +static int standbys_pending; +static int ignore_sys_suspend; +static int ignore_normal_resume; +static int bounce_interval __read_mostly = DEFAULT_BOUNCE_INTERVAL; + +static bool debug __read_mostly; +static bool smp __read_mostly; +static 
int apm_disabled = -1; +#ifdef CONFIG_SMP +static bool power_off; +#else +static bool power_off = 1; +#endif +static bool realmode_power_off; +#ifdef CONFIG_APM_ALLOW_INTS +static bool allow_ints = 1; +#else +static bool allow_ints; +#endif +static bool broken_psr; + +static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue); +static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue); +static struct apm_user *user_list; +static DEFINE_SPINLOCK(user_list_lock); +static DEFINE_MUTEX(apm_mutex); + +/* + * Set up a segment that references the real mode segment 0x40 + * that extends up to the end of page zero (that we have reserved). + * This is for buggy BIOS's that refer to (real mode) segment 0x40 + * even though they are called in protected mode. + */ +static struct desc_struct bad_bios_desc = GDT_ENTRY_INIT(0x4092, + (unsigned long)__va(0x400UL), PAGE_SIZE - 0x400 - 1); + +static const char driver_version[] = "1.16ac"; /* no spaces */ + +static struct task_struct *kapmd_task; + +/* + * APM event names taken from the APM 1.2 specification. These are + * the message codes that the BIOS uses to tell us about events + */ +static const char * const apm_event_name[] = { + "system standby", + "system suspend", + "normal resume", + "critical resume", + "low battery", + "power status change", + "update time", + "critical suspend", + "user standby", + "user suspend", + "system standby resume", + "capabilities change" +}; +#define NR_APM_EVENT_NAME ARRAY_SIZE(apm_event_name) + +typedef struct lookup_t { + int key; + char *msg; +} lookup_t; + +/* + * The BIOS returns a set of standard error codes in AX when the + * carry flag is set. + */ + +static const lookup_t error_table[] = { +/* N/A { APM_SUCCESS, "Operation succeeded" }, */ + { APM_DISABLED, "Power management disabled" }, + { APM_CONNECTED, "Real mode interface already connected" }, + { APM_NOT_CONNECTED, "Interface not connected" }, + { APM_16_CONNECTED, "16 bit interface already connected" }, +/* N/A { APM_16_UNSUPPORTED, "16 bit interface not supported" }, */ + { APM_32_CONNECTED, "32 bit interface already connected" }, + { APM_32_UNSUPPORTED, "32 bit interface not supported" }, + { APM_BAD_DEVICE, "Unrecognized device ID" }, + { APM_BAD_PARAM, "Parameter out of range" }, + { APM_NOT_ENGAGED, "Interface not engaged" }, + { APM_BAD_FUNCTION, "Function not supported" }, + { APM_RESUME_DISABLED, "Resume timer disabled" }, + { APM_BAD_STATE, "Unable to enter requested state" }, +/* N/A { APM_NO_EVENTS, "No events pending" }, */ + { APM_NO_ERROR, "BIOS did not set a return code" }, + { APM_NOT_PRESENT, "No APM present" } +}; +#define ERROR_COUNT ARRAY_SIZE(error_table) + +/** + * apm_error - display an APM error + * @str: information string + * @err: APM BIOS return code + * + * Write a meaningful log entry to the kernel log in the event of + * an APM error. Note that this also handles (negative) kernel errors. + */ + +static void apm_error(char *str, int err) +{ + int i; + + for (i = 0; i < ERROR_COUNT; i++) + if (error_table[i].key == err) + break; + if (i < ERROR_COUNT) + pr_notice("%s: %s\n", str, error_table[i].msg); + else if (err < 0) + pr_notice("%s: linux error code %i\n", str, err); + else + pr_notice("%s: unknown error code %#2.2x\n", + str, err); +} + +/* + * These are the actual BIOS calls. Depending on APM_ZERO_SEGS and + * apm_info.allow_ints, we are being really paranoid here! 
Not only + * are interrupts disabled, but all the segment registers (except SS) + * are saved and zeroed; this means that if the BIOS tries to reference + * any data without explicitly loading the segment registers, the kernel + * will fault immediately rather than cause some unforeseen circumstances + * for the rest of the kernel. And it will be very obvious! :-) Doing + * this depends on CS referring to the same physical memory as DS so that + * DS can be zeroed before the call. Unfortunately, we can't do anything + * about the stack segment/pointer. Also, we tell the compiler that + * everything could change. + * + * Also, we KNOW that for the non-error case of apm_bios_call, there + * is no useful data returned in the low order 8 bits of eax. + */ + +static inline unsigned long __apm_irq_save(void) +{ + unsigned long flags; + local_save_flags(flags); + if (apm_info.allow_ints) { + if (irqs_disabled_flags(flags)) + local_irq_enable(); + } else + local_irq_disable(); + + return flags; +} + +#define apm_irq_save(flags) \ + do { flags = __apm_irq_save(); } while (0) + +static inline void apm_irq_restore(unsigned long flags) +{ + if (irqs_disabled_flags(flags)) + local_irq_disable(); + else if (irqs_disabled()) + local_irq_enable(); +} + +#ifdef APM_ZERO_SEGS +# define APM_DECL_SEGS \ + unsigned int saved_fs; unsigned int saved_gs; +# define APM_DO_SAVE_SEGS \ + savesegment(fs, saved_fs); savesegment(gs, saved_gs) +# define APM_DO_RESTORE_SEGS \ + loadsegment(fs, saved_fs); loadsegment(gs, saved_gs) +#else +# define APM_DECL_SEGS +# define APM_DO_SAVE_SEGS +# define APM_DO_RESTORE_SEGS +#endif + +struct apm_bios_call { + u32 func; + /* In and out */ + u32 ebx; + u32 ecx; + /* Out only */ + u32 eax; + u32 edx; + u32 esi; + + /* Error: -ENOMEM, or bits 8-15 of eax */ + int err; +}; + +/** + * __apm_bios_call - Make an APM BIOS 32bit call + * @_call: pointer to struct apm_bios_call. + * + * Make an APM call using the 32bit protected mode interface. The + * caller is responsible for knowing if APM BIOS is configured and + * enabled. This call can disable interrupts for a long period of + * time on some laptops. The return value is in AH and the carry + * flag is loaded into AL. If there is an error, then the error + * code is returned in AH (bits 8-15 of eax) and this function + * returns non-zero. + * + * Note: this makes the call on the current CPU. + */ +static long __apm_bios_call(void *_call) +{ + APM_DECL_SEGS + unsigned long flags; + int cpu; + struct desc_struct save_desc_40; + struct desc_struct *gdt; + struct apm_bios_call *call = _call; + + cpu = get_cpu(); + BUG_ON(cpu != 0); + gdt = get_cpu_gdt_table(cpu); + save_desc_40 = gdt[0x40 / 8]; + gdt[0x40 / 8] = bad_bios_desc; + + apm_irq_save(flags); + APM_DO_SAVE_SEGS; + apm_bios_call_asm(call->func, call->ebx, call->ecx, + &call->eax, &call->ebx, &call->ecx, &call->edx, + &call->esi); + APM_DO_RESTORE_SEGS; + apm_irq_restore(flags); + gdt[0x40 / 8] = save_desc_40; + put_cpu(); + + return call->eax & 0xff; +} + +/* Run __apm_bios_call or __apm_bios_call_simple on CPU 0 */ +static int on_cpu0(long (*fn)(void *), struct apm_bios_call *call) +{ + int ret; + + /* Don't bother with work_on_cpu in the common case, so we don't + * have to worry about OOM or overhead. 
*/ + if (get_cpu() == 0) { + ret = fn(call); + put_cpu(); + } else { + put_cpu(); + ret = work_on_cpu(0, fn, call); + } + + /* work_on_cpu can fail with -ENOMEM */ + if (ret < 0) + call->err = ret; + else + call->err = (call->eax >> 8) & 0xff; + + return ret; +} + +/** + * apm_bios_call - Make an APM BIOS 32bit call (on CPU 0) + * @call: the apm_bios_call registers. + * + * If there is an error, it is returned in @call.err. + */ +static int apm_bios_call(struct apm_bios_call *call) +{ + return on_cpu0(__apm_bios_call, call); +} + +/** + * __apm_bios_call_simple - Make an APM BIOS 32bit call (on CPU 0) + * @_call: pointer to struct apm_bios_call. + * + * Make a BIOS call that returns one value only, or just status. + * If there is an error, then the error code is returned in AH + * (bits 8-15 of eax) and this function returns non-zero (it can + * also return -ENOMEM). This is used for simpler BIOS operations. + * This call may hold interrupts off for a long time on some laptops. + * + * Note: this makes the call on the current CPU. + */ +static long __apm_bios_call_simple(void *_call) +{ + u8 error; + APM_DECL_SEGS + unsigned long flags; + int cpu; + struct desc_struct save_desc_40; + struct desc_struct *gdt; + struct apm_bios_call *call = _call; + + cpu = get_cpu(); + BUG_ON(cpu != 0); + gdt = get_cpu_gdt_table(cpu); + save_desc_40 = gdt[0x40 / 8]; + gdt[0x40 / 8] = bad_bios_desc; + + apm_irq_save(flags); + APM_DO_SAVE_SEGS; + error = apm_bios_call_simple_asm(call->func, call->ebx, call->ecx, + &call->eax); + APM_DO_RESTORE_SEGS; + apm_irq_restore(flags); + gdt[0x40 / 8] = save_desc_40; + put_cpu(); + return error; +} + +/** + * apm_bios_call_simple - make a simple APM BIOS 32bit call + * @func: APM function to invoke + * @ebx_in: EBX register value for BIOS call + * @ecx_in: ECX register value for BIOS call + * @eax: EAX register on return from the BIOS call + * @err: APM error code (bits 8-15 of EAX) or a negative errno on return + * + * Make a BIOS call that returns one value only, or just status. + * If there is an error, then the error code is returned in @err + * and this function returns non-zero. This is used for simpler + * BIOS operations. This call may hold interrupts off for a long + * time on some laptops. + */ +static int apm_bios_call_simple(u32 func, u32 ebx_in, u32 ecx_in, u32 *eax, + int *err) +{ + struct apm_bios_call call; + int ret; + + call.func = func; + call.ebx = ebx_in; + call.ecx = ecx_in; + + ret = on_cpu0(__apm_bios_call_simple, &call); + *eax = call.eax; + *err = call.err; + return ret; +} + +/** + * apm_driver_version - APM driver version + * @val: loaded with the APM version on return + * + * Retrieve the APM version supported by the BIOS. This is only + * supported for APM 1.1 or higher. An error indicates APM 1.0 is + * probably present. + * + * On entry val should point to a value indicating the APM driver + * version with the high byte being the major and the low byte the + * minor number, both in BCD. + * + * On return it will hold the BIOS revision supported in the + * same format. + */ + +static int apm_driver_version(u_short *val) +{ + u32 eax; + int err; + + if (apm_bios_call_simple(APM_FUNC_VERSION, 0, *val, &eax, &err)) + return err; + *val = eax; + return APM_SUCCESS; +} + +/** + * apm_get_event - get an APM event from the BIOS + * @event: pointer to the event + * @info: pointer to the event information + * + * The APM BIOS provides a polled interface for event + * reporting. The BIOS expects to be polled at least every second + * when events are pending. 
When a message is found the caller should + * poll until no more messages are present. However, this causes + * problems on some laptops where a suspend event notification is + * not cleared until it is acknowledged. + * + * Additional information is returned in the info pointer, provided + * that APM 1.2 is in use. If no messages are pending the value 0x80 + * is returned (No power management events pending). + */ +static int apm_get_event(apm_event_t *event, apm_eventinfo_t *info) +{ + struct apm_bios_call call; + + call.func = APM_FUNC_GET_EVENT; + call.ebx = call.ecx = 0; + + if (apm_bios_call(&call)) + return call.err; + + *event = call.ebx; + if (apm_info.connection_version < 0x0102) + *info = ~0; /* indicate info not valid */ + else + *info = call.ecx; + return APM_SUCCESS; +} + +/** + * set_power_state - set the power management state + * @what: which items to transition + * @state: state to transition to + * + * Request an APM change of state for one or more system devices. The + * processor state must be transitioned last of all. what holds the + * class of device in the upper byte and the device number (0xFF for + * all) for the object to be transitioned. + * + * The state holds the state to transition to, which may in fact + * be an acceptance of a BIOS requested state change. + */ + +static int set_power_state(u_short what, u_short state) +{ + u32 eax; + int err; + + if (apm_bios_call_simple(APM_FUNC_SET_STATE, what, state, &eax, &err)) + return err; + return APM_SUCCESS; +} + +/** + * set_system_power_state - set system wide power state + * @state: which state to enter + * + * Transition the entire system into a new APM power state. + */ + +static int set_system_power_state(u_short state) +{ + return set_power_state(APM_DEVICE_ALL, state); +} + +/** + * apm_do_idle - perform power saving + * + * This function notifies the BIOS that the processor is (in the view + * of the OS) idle. It returns -1 in the event that the BIOS refuses + * to handle the idle request. On success the function returns 1 + * if the BIOS did clock slowing or 0 otherwise. + */ + +static int apm_do_idle(void) +{ + u32 eax; + u8 ret = 0; + int idled = 0; + int err = 0; + + if (!need_resched()) { + idled = 1; + ret = apm_bios_call_simple(APM_FUNC_IDLE, 0, 0, &eax, &err); + } + + if (!idled) + return 0; + + if (ret) { + static unsigned long t; + + /* This always fails on some SMP boards running UP kernels. + * Only report the failure the first 5 times. + */ + if (++t < 5) { + printk(KERN_DEBUG "apm_do_idle failed (%d)\n", err); + t = jiffies; + } + return -1; + } + clock_slowed = (apm_info.bios.flags & APM_IDLE_SLOWS_CLOCK) != 0; + return clock_slowed; +} + +/** + * apm_do_busy - inform the BIOS the CPU is busy + * + * Request that the BIOS brings the CPU back to full performance. + */ + +static void apm_do_busy(void) +{ + u32 dummy; + int err; + + if (clock_slowed || ALWAYS_CALL_BUSY) { + (void)apm_bios_call_simple(APM_FUNC_BUSY, 0, 0, &dummy, &err); + clock_slowed = 0; + } +} + +/* + * If no process has really been interested in + * the CPU for some time, we want to call BIOS + * power management - we probably want + * to conserve power. + */ +#define IDLE_CALC_LIMIT (HZ * 100) +#define IDLE_LEAKY_MAX 16 + +/** + * apm_cpu_idle - cpu idling for APM capable Linux + * + * This is the idling function the kernel executes when APM is available. It + * tries to do BIOS power management based on the average system idle time. + * Furthermore it calls the system default idle routine. 
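+ * (apm_do_idle() above returns -1 when the BIOS refuses the idle request, + * 1 when the BIOS slowed the clock and 0 otherwise; the leaky-bucket logic + * below keys off that convention.)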
+ */ + +static int apm_cpu_idle(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int index) +{ + static int use_apm_idle; /* = 0 */ + static unsigned int last_jiffies; /* = 0 */ + static unsigned int last_stime; /* = 0 */ + cputime_t stime; + + int apm_idle_done = 0; + unsigned int jiffies_since_last_check = jiffies - last_jiffies; + unsigned int bucket; + +recalc: + task_cputime(current, NULL, &stime); + if (jiffies_since_last_check > IDLE_CALC_LIMIT) { + use_apm_idle = 0; + } else if (jiffies_since_last_check > idle_period) { + unsigned int idle_percentage; + + idle_percentage = stime - last_stime; + idle_percentage *= 100; + idle_percentage /= jiffies_since_last_check; + use_apm_idle = (idle_percentage > idle_threshold); + if (apm_info.forbid_idle) + use_apm_idle = 0; + } + + last_jiffies = jiffies; + last_stime = stime; + + bucket = IDLE_LEAKY_MAX; + + while (!need_resched()) { + if (use_apm_idle) { + unsigned int t; + + t = jiffies; + switch (apm_do_idle()) { + case 0: + apm_idle_done = 1; + if (t != jiffies) { + if (bucket) { + bucket = IDLE_LEAKY_MAX; + continue; + } + } else if (bucket) { + bucket--; + continue; + } + break; + case 1: + apm_idle_done = 1; + break; + default: /* BIOS refused */ + break; + } + } + default_idle(); + local_irq_disable(); + jiffies_since_last_check = jiffies - last_jiffies; + if (jiffies_since_last_check > idle_period) + goto recalc; + } + + if (apm_idle_done) + apm_do_busy(); + + return index; +} + +/** + * apm_power_off - ask the BIOS to power off + * + * Handle the power off sequence. This is the one piece of code we + * will execute even on SMP machines. In order to deal with BIOS + * bugs we support real mode APM BIOS power off calls. We also make + * the SMP call on CPU0 as some systems will only honour this call + * on their first cpu. + */ + +static void apm_power_off(void) +{ + /* Some bioses don't like being called from CPU != 0 */ + if (apm_info.realmode_power_off) { + set_cpus_allowed_ptr(current, cpumask_of(0)); + machine_real_restart(MRR_APM); + } else { + (void)set_system_power_state(APM_STATE_OFF); + } +} + +#ifdef CONFIG_APM_DO_ENABLE + +/** + * apm_enable_power_management - enable BIOS APM power management + * @enable: enable yes/no + * + * Enable or disable the APM BIOS power services. + */ + +static int apm_enable_power_management(int enable) +{ + u32 eax; + int err; + + if ((enable == 0) && (apm_info.bios.flags & APM_BIOS_DISENGAGED)) + return APM_NOT_ENGAGED; + if (apm_bios_call_simple(APM_FUNC_ENABLE_PM, APM_DEVICE_BALL, + enable, &eax, &err)) + return err; + if (enable) + apm_info.bios.flags &= ~APM_BIOS_DISABLED; + else + apm_info.bios.flags |= APM_BIOS_DISABLED; + return APM_SUCCESS; +} +#endif + +/** + * apm_get_power_status - get current power state + * @status: returned status + * @bat: battery info + * @life: estimated life + * + * Obtain the current power status from the APM BIOS. We return a + * status which gives the rough battery status, and current power + * source. The bat value returned gives an estimate as a percentage + * of life and a status value for the battery. The estimated life, + * if reported, is a lifetime in seconds/minutes at current power + * consumption. 
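+ * (Per the 1.14 changelog entry above, at least one Vaio BIOS byte-swaps + * the life value and reports minutes; the get_power_status_swabinminutes + * quirk handling below un-swaps it and sets the in-minutes flag, assumed + * here to be bit 15.)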
+ */
+
+static int apm_get_power_status(u_short *status, u_short *bat, u_short *life)
+{
+	struct apm_bios_call call;
+
+	call.func = APM_FUNC_GET_STATUS;
+	call.ebx = APM_DEVICE_ALL;
+	call.ecx = 0;
+
+	if (apm_info.get_power_status_broken)
+		return APM_32_UNSUPPORTED;
+	if (apm_bios_call(&call))
+		return call.err;
+	*status = call.ebx;
+	*bat = call.ecx;
+	if (apm_info.get_power_status_swabinminutes) {
+		*life = swab16((u16)call.edx);
+		*life |= 0x8000;
+	} else
+		*life = call.edx;
+	return APM_SUCCESS;
+}
+
+#if 0
+static int apm_get_battery_status(u_short which, u_short *status,
+				  u_short *bat, u_short *life, u_short *nbat)
+{
+	u32 eax;
+	u32 ebx;
+	u32 ecx;
+	u32 edx;
+	u32 esi;
+
+	if (apm_info.connection_version < 0x0102) {
+		/* pretend we only have one battery. */
+		if (which != 1)
+			return APM_BAD_DEVICE;
+		*nbat = 1;
+		return apm_get_power_status(status, bat, life);
+	}
+
+	if (apm_bios_call(APM_FUNC_GET_STATUS, (0x8000 | (which)), 0, &eax,
+			  &ebx, &ecx, &edx, &esi))
+		return (eax >> 8) & 0xff;
+	*status = ebx;
+	*bat = ecx;
+	*life = edx;
+	*nbat = esi;
+	return APM_SUCCESS;
+}
+#endif
+
+/**
+ * apm_engage_power_management - enable PM on a device
+ * @device: identity of device
+ * @enable: on/off
+ *
+ * Activate or deactivate power management on either a specific device
+ * or the entire system (%APM_DEVICE_ALL).
+ */
+
+static int apm_engage_power_management(u_short device, int enable)
+{
+	u32 eax;
+	int err;
+
+	if ((enable == 0) && (device == APM_DEVICE_ALL)
+	    && (apm_info.bios.flags & APM_BIOS_DISABLED))
+		return APM_DISABLED;
+	if (apm_bios_call_simple(APM_FUNC_ENGAGE_PM, device, enable,
+				 &eax, &err))
+		return err;
+	if (device == APM_DEVICE_ALL) {
+		if (enable)
+			apm_info.bios.flags &= ~APM_BIOS_DISENGAGED;
+		else
+			apm_info.bios.flags |= APM_BIOS_DISENGAGED;
+	}
+	return APM_SUCCESS;
+}
+
+#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT)
+
+/**
+ * apm_console_blank - blank the display
+ * @blank: on/off
+ *
+ * Attempt to blank the console, firstly by blanking just video device
+ * zero, and if that fails (some BIOSes don't support it) then it blanks
+ * all video devices. Typically the BIOS will do laptop backlight and
+ * monitor powerdown for us.
+ */
+
+static int apm_console_blank(int blank)
+{
+	int error = APM_NOT_ENGAGED; /* silence gcc */
+	int i;
+	u_short state;
+	static const u_short dev[3] = { 0x100, 0x1FF, 0x101 };
+
+	state = blank ?
APM_STATE_STANDBY : APM_STATE_READY; + + for (i = 0; i < ARRAY_SIZE(dev); i++) { + error = set_power_state(dev[i], state); + + if ((error == APM_SUCCESS) || (error == APM_NO_ERROR)) + return 1; + + if (error == APM_NOT_ENGAGED) + break; + } + + if (error == APM_NOT_ENGAGED) { + static int tried; + int eng_error; + if (tried++ == 0) { + eng_error = apm_engage_power_management(APM_DEVICE_ALL, 1); + if (eng_error) { + apm_error("set display", error); + apm_error("engage interface", eng_error); + return 0; + } else + return apm_console_blank(blank); + } + } + apm_error("set display", error); + return 0; +} +#endif + +static int queue_empty(struct apm_user *as) +{ + return as->event_head == as->event_tail; +} + +static apm_event_t get_queued_event(struct apm_user *as) +{ + if (++as->event_tail >= APM_MAX_EVENTS) + as->event_tail = 0; + return as->events[as->event_tail]; +} + +static void queue_event(apm_event_t event, struct apm_user *sender) +{ + struct apm_user *as; + + spin_lock(&user_list_lock); + if (user_list == NULL) + goto out; + for (as = user_list; as != NULL; as = as->next) { + if ((as == sender) || (!as->reader)) + continue; + if (++as->event_head >= APM_MAX_EVENTS) + as->event_head = 0; + + if (as->event_head == as->event_tail) { + static int notified; + + if (notified++ == 0) + pr_err("an event queue overflowed\n"); + if (++as->event_tail >= APM_MAX_EVENTS) + as->event_tail = 0; + } + as->events[as->event_head] = event; + if (!as->suser || !as->writer) + continue; + switch (event) { + case APM_SYS_SUSPEND: + case APM_USER_SUSPEND: + as->suspends_pending++; + suspends_pending++; + break; + + case APM_SYS_STANDBY: + case APM_USER_STANDBY: + as->standbys_pending++; + standbys_pending++; + break; + } + } + wake_up_interruptible(&apm_waitqueue); +out: + spin_unlock(&user_list_lock); +} + +static void reinit_timer(void) +{ +#ifdef INIT_TIMER_AFTER_SUSPEND + unsigned long flags; + + raw_spin_lock_irqsave(&i8253_lock, flags); + /* set the clock to HZ */ + outb_p(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */ + udelay(10); + outb_p(LATCH & 0xff, PIT_CH0); /* LSB */ + udelay(10); + outb_p(LATCH >> 8, PIT_CH0); /* MSB */ + udelay(10); + raw_spin_unlock_irqrestore(&i8253_lock, flags); +#endif +} + +static int suspend(int vetoable) +{ + int err; + struct apm_user *as; + + dpm_suspend_start(PMSG_SUSPEND); + dpm_suspend_end(PMSG_SUSPEND); + + local_irq_disable(); + syscore_suspend(); + + local_irq_enable(); + + save_processor_state(); + err = set_system_power_state(APM_STATE_SUSPEND); + ignore_normal_resume = 1; + restore_processor_state(); + + local_irq_disable(); + reinit_timer(); + + if (err == APM_NO_ERROR) + err = APM_SUCCESS; + if (err != APM_SUCCESS) + apm_error("suspend", err); + err = (err == APM_SUCCESS) ? 
0 : -EIO; + + syscore_resume(); + local_irq_enable(); + + dpm_resume_start(PMSG_RESUME); + dpm_resume_end(PMSG_RESUME); + + queue_event(APM_NORMAL_RESUME, NULL); + spin_lock(&user_list_lock); + for (as = user_list; as != NULL; as = as->next) { + as->suspend_wait = 0; + as->suspend_result = err; + } + spin_unlock(&user_list_lock); + wake_up_interruptible(&apm_suspend_waitqueue); + return err; +} + +static void standby(void) +{ + int err; + + dpm_suspend_end(PMSG_SUSPEND); + + local_irq_disable(); + syscore_suspend(); + local_irq_enable(); + + err = set_system_power_state(APM_STATE_STANDBY); + if ((err != APM_SUCCESS) && (err != APM_NO_ERROR)) + apm_error("standby", err); + + local_irq_disable(); + syscore_resume(); + local_irq_enable(); + + dpm_resume_start(PMSG_RESUME); +} + +static apm_event_t get_event(void) +{ + int error; + apm_event_t event = APM_NO_EVENTS; /* silence gcc */ + apm_eventinfo_t info; + + static int notified; + + /* we don't use the eventinfo */ + error = apm_get_event(&event, &info); + if (error == APM_SUCCESS) + return event; + + if ((error != APM_NO_EVENTS) && (notified++ == 0)) + apm_error("get_event", error); + + return 0; +} + +static void check_events(void) +{ + apm_event_t event; + static unsigned long last_resume; + static int ignore_bounce; + + while ((event = get_event()) != 0) { + if (debug) { + if (event <= NR_APM_EVENT_NAME) + printk(KERN_DEBUG "apm: received %s notify\n", + apm_event_name[event - 1]); + else + printk(KERN_DEBUG "apm: received unknown " + "event 0x%02x\n", event); + } + if (ignore_bounce + && (time_after(jiffies, last_resume + bounce_interval))) + ignore_bounce = 0; + + switch (event) { + case APM_SYS_STANDBY: + case APM_USER_STANDBY: + queue_event(event, NULL); + if (standbys_pending <= 0) + standby(); + break; + + case APM_USER_SUSPEND: +#ifdef CONFIG_APM_IGNORE_USER_SUSPEND + if (apm_info.connection_version > 0x100) + set_system_power_state(APM_STATE_REJECT); + break; +#endif + case APM_SYS_SUSPEND: + if (ignore_bounce) { + if (apm_info.connection_version > 0x100) + set_system_power_state(APM_STATE_REJECT); + break; + } + /* + * If we are already processing a SUSPEND, + * then further SUSPEND events from the BIOS + * will be ignored. We also return here to + * cope with the fact that the Thinkpads keep + * sending a SUSPEND event until something else + * happens! + */ + if (ignore_sys_suspend) + return; + ignore_sys_suspend = 1; + queue_event(event, NULL); + if (suspends_pending <= 0) + (void) suspend(1); + break; + + case APM_NORMAL_RESUME: + case APM_CRITICAL_RESUME: + case APM_STANDBY_RESUME: + ignore_sys_suspend = 0; + last_resume = jiffies; + ignore_bounce = 1; + if ((event != APM_NORMAL_RESUME) + || (ignore_normal_resume == 0)) { + dpm_resume_end(PMSG_RESUME); + queue_event(event, NULL); + } + ignore_normal_resume = 0; + break; + + case APM_CAPABILITY_CHANGE: + case APM_LOW_BATTERY: + case APM_POWER_STATUS_CHANGE: + queue_event(event, NULL); + /* If needed, notify drivers here */ + break; + + case APM_UPDATE_TIME: + break; + + case APM_CRITICAL_SUSPEND: + /* + * We are not allowed to reject a critical suspend. 
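+			 * APM offers no veto for this one: the BIOS is about
+			 * to cut power regardless, which is why suspend() is
+			 * called below with its vetoable argument cleared.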
+ */ + (void)suspend(0); + break; + } + } +} + +static void apm_event_handler(void) +{ + static int pending_count = 4; + int err; + + if ((standbys_pending > 0) || (suspends_pending > 0)) { + if ((apm_info.connection_version > 0x100) && + (pending_count-- <= 0)) { + pending_count = 4; + if (debug) + printk(KERN_DEBUG "apm: setting state busy\n"); + err = set_system_power_state(APM_STATE_BUSY); + if (err) + apm_error("busy", err); + } + } else + pending_count = 4; + check_events(); +} + +/* + * This is the APM thread main loop. + */ + +static void apm_mainloop(void) +{ + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue(&apm_waitqueue, &wait); + set_current_state(TASK_INTERRUPTIBLE); + for (;;) { + schedule_timeout(APM_CHECK_TIMEOUT); + if (kthread_should_stop()) + break; + /* + * Ok, check all events, check for idle (and mark us sleeping + * so as not to count towards the load average).. + */ + set_current_state(TASK_INTERRUPTIBLE); + apm_event_handler(); + } + remove_wait_queue(&apm_waitqueue, &wait); +} + +static int check_apm_user(struct apm_user *as, const char *func) +{ + if (as == NULL || as->magic != APM_BIOS_MAGIC) { + pr_err("%s passed bad filp\n", func); + return 1; + } + return 0; +} + +static ssize_t do_read(struct file *fp, char __user *buf, size_t count, loff_t *ppos) +{ + struct apm_user *as; + int i; + apm_event_t event; + + as = fp->private_data; + if (check_apm_user(as, "read")) + return -EIO; + if ((int)count < sizeof(apm_event_t)) + return -EINVAL; + if ((queue_empty(as)) && (fp->f_flags & O_NONBLOCK)) + return -EAGAIN; + wait_event_interruptible(apm_waitqueue, !queue_empty(as)); + i = count; + while ((i >= sizeof(event)) && !queue_empty(as)) { + event = get_queued_event(as); + if (copy_to_user(buf, &event, sizeof(event))) { + if (i < count) + break; + return -EFAULT; + } + switch (event) { + case APM_SYS_SUSPEND: + case APM_USER_SUSPEND: + as->suspends_read++; + break; + + case APM_SYS_STANDBY: + case APM_USER_STANDBY: + as->standbys_read++; + break; + } + buf += sizeof(event); + i -= sizeof(event); + } + if (i < count) + return count - i; + if (signal_pending(current)) + return -ERESTARTSYS; + return 0; +} + +static unsigned int do_poll(struct file *fp, poll_table *wait) +{ + struct apm_user *as; + + as = fp->private_data; + if (check_apm_user(as, "poll")) + return 0; + poll_wait(fp, &apm_waitqueue, wait); + if (!queue_empty(as)) + return POLLIN | POLLRDNORM; + return 0; +} + +static long do_ioctl(struct file *filp, u_int cmd, u_long arg) +{ + struct apm_user *as; + int ret; + + as = filp->private_data; + if (check_apm_user(as, "ioctl")) + return -EIO; + if (!as->suser || !as->writer) + return -EPERM; + switch (cmd) { + case APM_IOC_STANDBY: + mutex_lock(&apm_mutex); + if (as->standbys_read > 0) { + as->standbys_read--; + as->standbys_pending--; + standbys_pending--; + } else + queue_event(APM_USER_STANDBY, as); + if (standbys_pending <= 0) + standby(); + mutex_unlock(&apm_mutex); + break; + case APM_IOC_SUSPEND: + mutex_lock(&apm_mutex); + if (as->suspends_read > 0) { + as->suspends_read--; + as->suspends_pending--; + suspends_pending--; + } else + queue_event(APM_USER_SUSPEND, as); + if (suspends_pending <= 0) { + ret = suspend(1); + mutex_unlock(&apm_mutex); + } else { + as->suspend_wait = 1; + mutex_unlock(&apm_mutex); + wait_event_interruptible(apm_suspend_waitqueue, + as->suspend_wait == 0); + ret = as->suspend_result; + } + return ret; + default: + return -ENOTTY; + } + return 0; +} + +static int do_release(struct inode *inode, struct file *filp) +{ + struct 
apm_user *as; + + as = filp->private_data; + if (check_apm_user(as, "release")) + return 0; + filp->private_data = NULL; + if (as->standbys_pending > 0) { + standbys_pending -= as->standbys_pending; + if (standbys_pending <= 0) + standby(); + } + if (as->suspends_pending > 0) { + suspends_pending -= as->suspends_pending; + if (suspends_pending <= 0) + (void) suspend(1); + } + spin_lock(&user_list_lock); + if (user_list == as) + user_list = as->next; + else { + struct apm_user *as1; + + for (as1 = user_list; + (as1 != NULL) && (as1->next != as); + as1 = as1->next) + ; + if (as1 == NULL) + pr_err("filp not in user list\n"); + else + as1->next = as->next; + } + spin_unlock(&user_list_lock); + kfree(as); + return 0; +} + +static int do_open(struct inode *inode, struct file *filp) +{ + struct apm_user *as; + + as = kmalloc(sizeof(*as), GFP_KERNEL); + if (as == NULL) + return -ENOMEM; + + as->magic = APM_BIOS_MAGIC; + as->event_tail = as->event_head = 0; + as->suspends_pending = as->standbys_pending = 0; + as->suspends_read = as->standbys_read = 0; + /* + * XXX - this is a tiny bit broken, when we consider BSD + * process accounting. If the device is opened by root, we + * instantly flag that we used superuser privs. Who knows, + * we might close the device immediately without doing a + * privileged operation -- cevans + */ + as->suser = capable(CAP_SYS_ADMIN); + as->writer = (filp->f_mode & FMODE_WRITE) == FMODE_WRITE; + as->reader = (filp->f_mode & FMODE_READ) == FMODE_READ; + spin_lock(&user_list_lock); + as->next = user_list; + user_list = as; + spin_unlock(&user_list_lock); + filp->private_data = as; + return 0; +} + +static int proc_apm_show(struct seq_file *m, void *v) +{ + unsigned short bx; + unsigned short cx; + unsigned short dx; + int error; + unsigned short ac_line_status = 0xff; + unsigned short battery_status = 0xff; + unsigned short battery_flag = 0xff; + int percentage = -1; + int time_units = -1; + char *units = "?"; + + if ((num_online_cpus() == 1) && + !(error = apm_get_power_status(&bx, &cx, &dx))) { + ac_line_status = (bx >> 8) & 0xff; + battery_status = bx & 0xff; + if ((cx & 0xff) != 0xff) + percentage = cx & 0xff; + + if (apm_info.connection_version > 0x100) { + battery_flag = (cx >> 8) & 0xff; + if (dx != 0xffff) { + units = (dx & 0x8000) ? "min" : "sec"; + time_units = dx & 0x7fff; + } + } + } + /* Arguments, with symbols from linux/apm_bios.h. Information is + from the Get Power Status (0x0a) call unless otherwise noted. + + 0) Linux driver version (this will change if format changes) + 1) APM BIOS Version. Usually 1.0, 1.1 or 1.2. 
+	   2) APM flags from APM Installation Check (0x00):
+	      bit 0: APM_16_BIT_SUPPORT
+	      bit 1: APM_32_BIT_SUPPORT
+	      bit 2: APM_IDLE_SLOWS_CLOCK
+	      bit 3: APM_BIOS_DISABLED
+	      bit 4: APM_BIOS_DISENGAGED
+	   3) AC line status
+	      0x00: Off-line
+	      0x01: On-line
+	      0x02: On backup power (BIOS >= 1.1 only)
+	      0xff: Unknown
+	   4) Battery status
+	      0x00: High
+	      0x01: Low
+	      0x02: Critical
+	      0x03: Charging
+	      0x04: Selected battery not present (BIOS >= 1.2 only)
+	      0xff: Unknown
+	   5) Battery flag
+	      bit 0: High
+	      bit 1: Low
+	      bit 2: Critical
+	      bit 3: Charging
+	      bit 7: No system battery
+	      0xff: Unknown
+	   6) Remaining battery life (percentage of charge):
+	      0-100: valid
+	      -1: Unknown
+	   7) Remaining battery life (time units):
+	      Number of remaining minutes or seconds
+	      -1: Unknown
+	   8) min = minutes; sec = seconds */
+
+	seq_printf(m, "%s %d.%d 0x%02x 0x%02x 0x%02x 0x%02x %d%% %d %s\n",
+		   driver_version,
+		   (apm_info.bios.version >> 8) & 0xff,
+		   apm_info.bios.version & 0xff,
+		   apm_info.bios.flags,
+		   ac_line_status,
+		   battery_status,
+		   battery_flag,
+		   percentage,
+		   time_units,
+		   units);
+	return 0;
+}
+
+static int proc_apm_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, proc_apm_show, NULL);
+}
+
+static const struct file_operations apm_file_ops = {
+	.owner		= THIS_MODULE,
+	.open		= proc_apm_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int apm(void *unused)
+{
+	unsigned short bx;
+	unsigned short cx;
+	unsigned short dx;
+	int error;
+	char *power_stat;
+	char *bat_stat;
+
+	/* 2002/08/01 - WT
+	 * This is to avoid random crashes at boot time during initialization
+	 * on SMP systems in case of "apm=power-off" mode. Seen on ASUS A7M266D.
+	 * Some bioses don't like being called from CPU != 0.
+	 * Method suggested by Ingo Molnar.
+	 */
+	set_cpus_allowed_ptr(current, cpumask_of(0));
+	BUG_ON(smp_processor_id() != 0);
+
+	if (apm_info.connection_version == 0) {
+		apm_info.connection_version = apm_info.bios.version;
+		if (apm_info.connection_version > 0x100) {
+			/*
+			 * We only support BIOSes up to version 1.2
+			 */
+			if (apm_info.connection_version > 0x0102)
+				apm_info.connection_version = 0x0102;
+			error = apm_driver_version(&apm_info.connection_version);
+			if (error != APM_SUCCESS) {
+				apm_error("driver version", error);
+				/* Fall back to an APM 1.0 connection. */
+				apm_info.connection_version = 0x100;
+			}
+		}
+	}
+
+	if (debug)
+		printk(KERN_INFO "apm: Connection version %d.%d\n",
+		       (apm_info.connection_version >> 8) & 0xff,
+		       apm_info.connection_version & 0xff);
+
+#ifdef CONFIG_APM_DO_ENABLE
+	if (apm_info.bios.flags & APM_BIOS_DISABLED) {
+		/*
+		 * This call causes my NEC UltraLite Versa 33/C to hang if it
+		 * is booted with PM disabled but not in the docking station.
+		 * Unfortunate ...
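+		 * That risk is why the call sits behind the optional
+		 * CONFIG_APM_DO_ENABLE instead of being made
+		 * unconditionally.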
+ */ + error = apm_enable_power_management(1); + if (error) { + apm_error("enable power management", error); + return -1; + } + } +#endif + + if ((apm_info.bios.flags & APM_BIOS_DISENGAGED) + && (apm_info.connection_version > 0x0100)) { + error = apm_engage_power_management(APM_DEVICE_ALL, 1); + if (error) { + apm_error("engage power management", error); + return -1; + } + } + + if (debug && (num_online_cpus() == 1 || smp)) { + error = apm_get_power_status(&bx, &cx, &dx); + if (error) + printk(KERN_INFO "apm: power status not available\n"); + else { + switch ((bx >> 8) & 0xff) { + case 0: + power_stat = "off line"; + break; + case 1: + power_stat = "on line"; + break; + case 2: + power_stat = "on backup power"; + break; + default: + power_stat = "unknown"; + break; + } + switch (bx & 0xff) { + case 0: + bat_stat = "high"; + break; + case 1: + bat_stat = "low"; + break; + case 2: + bat_stat = "critical"; + break; + case 3: + bat_stat = "charging"; + break; + default: + bat_stat = "unknown"; + break; + } + printk(KERN_INFO + "apm: AC %s, battery status %s, battery life ", + power_stat, bat_stat); + if ((cx & 0xff) == 0xff) + printk("unknown\n"); + else + printk("%d%%\n", cx & 0xff); + if (apm_info.connection_version > 0x100) { + printk(KERN_INFO + "apm: battery flag 0x%02x, battery life ", + (cx >> 8) & 0xff); + if (dx == 0xffff) + printk("unknown\n"); + else + printk("%d %s\n", dx & 0x7fff, + (dx & 0x8000) ? + "minutes" : "seconds"); + } + } + } + + /* Install our power off handler.. */ + if (power_off) + pm_power_off = apm_power_off; + + if (num_online_cpus() == 1 || smp) { +#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT) + console_blank_hook = apm_console_blank; +#endif + apm_mainloop(); +#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT) + console_blank_hook = NULL; +#endif + } + + return 0; +} + +#ifndef MODULE +static int __init apm_setup(char *str) +{ + int invert; + + while ((str != NULL) && (*str != '\0')) { + if (strncmp(str, "off", 3) == 0) + apm_disabled = 1; + if (strncmp(str, "on", 2) == 0) + apm_disabled = 0; + if ((strncmp(str, "bounce-interval=", 16) == 0) || + (strncmp(str, "bounce_interval=", 16) == 0)) + bounce_interval = simple_strtol(str + 16, NULL, 0); + if ((strncmp(str, "idle-threshold=", 15) == 0) || + (strncmp(str, "idle_threshold=", 15) == 0)) + idle_threshold = simple_strtol(str + 15, NULL, 0); + if ((strncmp(str, "idle-period=", 12) == 0) || + (strncmp(str, "idle_period=", 12) == 0)) + idle_period = simple_strtol(str + 12, NULL, 0); + invert = (strncmp(str, "no-", 3) == 0) || + (strncmp(str, "no_", 3) == 0); + if (invert) + str += 3; + if (strncmp(str, "debug", 5) == 0) + debug = !invert; + if ((strncmp(str, "power-off", 9) == 0) || + (strncmp(str, "power_off", 9) == 0)) + power_off = !invert; + if (strncmp(str, "smp", 3) == 0) { + smp = !invert; + idle_threshold = 100; + } + if ((strncmp(str, "allow-ints", 10) == 0) || + (strncmp(str, "allow_ints", 10) == 0)) + apm_info.allow_ints = !invert; + if ((strncmp(str, "broken-psr", 10) == 0) || + (strncmp(str, "broken_psr", 10) == 0)) + apm_info.get_power_status_broken = !invert; + if ((strncmp(str, "realmode-power-off", 18) == 0) || + (strncmp(str, "realmode_power_off", 18) == 0)) + apm_info.realmode_power_off = !invert; + str = strchr(str, ','); + if (str != NULL) + str += strspn(str, ", \t"); + } + return 1; +} + +__setup("apm=", apm_setup); +#endif + +static const struct file_operations apm_bios_fops = { + .owner = THIS_MODULE, + .read = do_read, + .poll = do_poll, + .unlocked_ioctl = 
do_ioctl, + .open = do_open, + .release = do_release, + .llseek = noop_llseek, +}; + +static struct miscdevice apm_device = { + APM_MINOR_DEV, + "apm_bios", + &apm_bios_fops +}; + + +/* Simple "print if true" callback */ +static int __init print_if_true(const struct dmi_system_id *d) +{ + printk("%s\n", d->ident); + return 0; +} + +/* + * Some Bioses enable the PS/2 mouse (touchpad) at resume, even if it was + * disabled before the suspend. Linux used to get terribly confused by that. + */ +static int __init broken_ps2_resume(const struct dmi_system_id *d) +{ + printk(KERN_INFO "%s machine detected. Mousepad Resume Bug " + "workaround hopefully not needed.\n", d->ident); + return 0; +} + +/* Some bioses have a broken protected mode poweroff and need to use realmode */ +static int __init set_realmode_power_off(const struct dmi_system_id *d) +{ + if (apm_info.realmode_power_off == 0) { + apm_info.realmode_power_off = 1; + printk(KERN_INFO "%s bios detected. " + "Using realmode poweroff only.\n", d->ident); + } + return 0; +} + +/* Some laptops require interrupts to be enabled during APM calls */ +static int __init set_apm_ints(const struct dmi_system_id *d) +{ + if (apm_info.allow_ints == 0) { + apm_info.allow_ints = 1; + printk(KERN_INFO "%s machine detected. " + "Enabling interrupts during APM calls.\n", d->ident); + } + return 0; +} + +/* Some APM bioses corrupt memory or just plain do not work */ +static int __init apm_is_horked(const struct dmi_system_id *d) +{ + if (apm_info.disabled == 0) { + apm_info.disabled = 1; + printk(KERN_INFO "%s machine detected. " + "Disabling APM.\n", d->ident); + } + return 0; +} + +static int __init apm_is_horked_d850md(const struct dmi_system_id *d) +{ + if (apm_info.disabled == 0) { + apm_info.disabled = 1; + printk(KERN_INFO "%s machine detected. " + "Disabling APM.\n", d->ident); + printk(KERN_INFO "This bug is fixed in bios P15 which is available for\n"); + printk(KERN_INFO "download from support.intel.com\n"); + } + return 0; +} + +/* Some APM bioses hang on APM idle calls */ +static int __init apm_likes_to_melt(const struct dmi_system_id *d) +{ + if (apm_info.forbid_idle == 0) { + apm_info.forbid_idle = 1; + printk(KERN_INFO "%s machine detected. " + "Disabling APM idle calls.\n", d->ident); + } + return 0; +} + +/* + * Check for clue free BIOS implementations who use + * the following QA technique + * + * [ Write BIOS Code ]<------ + * | ^ + * < Does it Compile >----N-- + * |Y ^ + * < Does it Boot Win98 >-N-- + * |Y + * [Ship It] + * + * Phoenix A04 08/24/2000 is known bad (Dell Inspiron 5000e) + * Phoenix A07 09/29/2000 is known good (Dell Inspiron 5000) + */ +static int __init broken_apm_power(const struct dmi_system_id *d) +{ + apm_info.get_power_status_broken = 1; + printk(KERN_WARNING "BIOS strings suggest APM bugs, " + "disabling power status reporting.\n"); + return 0; +} + +/* + * This bios swaps the APM minute reporting bytes over (Many sony laptops + * have this problem). 
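+ * A worked example (editorial, not from the original patch): 300
+ * minutes remaining should be reported as 0x812C (bit 15 flags
+ * "minutes", 0x012C == 300), but an affected BIOS hands back the
+ * count byte-swapped and without the flag, e.g. 0x2C01; swab16() in
+ * apm_get_power_status() restores 0x012C and the explicit |= 0x8000
+ * re-asserts the units bit the swap destroyed.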
+ */ +static int __init swab_apm_power_in_minutes(const struct dmi_system_id *d) +{ + apm_info.get_power_status_swabinminutes = 1; + printk(KERN_WARNING "BIOS strings suggest APM reports battery life " + "in minutes and wrong byte order.\n"); + return 0; +} + +static struct dmi_system_id __initdata apm_dmi_table[] = { + { + print_if_true, + KERN_WARNING "IBM T23 - BIOS 1.03b+ and controller firmware 1.02+ may be needed for Linux APM.", + { DMI_MATCH(DMI_SYS_VENDOR, "IBM"), + DMI_MATCH(DMI_BIOS_VERSION, "1AET38WW (1.01b)"), }, + }, + { /* Handle problems with APM on the C600 */ + broken_ps2_resume, "Dell Latitude C600", + { DMI_MATCH(DMI_SYS_VENDOR, "Dell"), + DMI_MATCH(DMI_PRODUCT_NAME, "Latitude C600"), }, + }, + { /* Allow interrupts during suspend on Dell Latitude laptops*/ + set_apm_ints, "Dell Latitude", + { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), + DMI_MATCH(DMI_PRODUCT_NAME, "Latitude C510"), } + }, + { /* APM crashes */ + apm_is_horked, "Dell Inspiron 2500", + { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), + DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 2500"), + DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "A11"), }, + }, + { /* Allow interrupts during suspend on Dell Inspiron laptops*/ + set_apm_ints, "Dell Inspiron", { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), + DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 4000"), }, + }, + { /* Handle problems with APM on Inspiron 5000e */ + broken_apm_power, "Dell Inspiron 5000e", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "A04"), + DMI_MATCH(DMI_BIOS_DATE, "08/24/2000"), }, + }, + { /* Handle problems with APM on Inspiron 2500 */ + broken_apm_power, "Dell Inspiron 2500", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "A12"), + DMI_MATCH(DMI_BIOS_DATE, "02/04/2002"), }, + }, + { /* APM crashes */ + apm_is_horked, "Dell Dimension 4100", + { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), + DMI_MATCH(DMI_PRODUCT_NAME, "XPS-Z"), + DMI_MATCH(DMI_BIOS_VENDOR, "Intel Corp."), + DMI_MATCH(DMI_BIOS_VERSION, "A11"), }, + }, + { /* Allow interrupts during suspend on Compaq Laptops*/ + set_apm_ints, "Compaq 12XL125", + { DMI_MATCH(DMI_SYS_VENDOR, "Compaq"), + DMI_MATCH(DMI_PRODUCT_NAME, "Compaq PC"), + DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "4.06"), }, + }, + { /* Allow interrupts during APM or the clock goes slow */ + set_apm_ints, "ASUSTeK", + { DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK Computer Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "L8400K series Notebook PC"), }, + }, + { /* APM blows on shutdown */ + apm_is_horked, "ABIT KX7-333[R]", + { DMI_MATCH(DMI_BOARD_VENDOR, "ABIT"), + DMI_MATCH(DMI_BOARD_NAME, "VT8367-8233A (KX7-333[R])"), }, + }, + { /* APM crashes */ + apm_is_horked, "Trigem Delhi3", + { DMI_MATCH(DMI_SYS_VENDOR, "TriGem Computer, Inc"), + DMI_MATCH(DMI_PRODUCT_NAME, "Delhi3"), }, + }, + { /* APM crashes */ + apm_is_horked, "Fujitsu-Siemens", + { DMI_MATCH(DMI_BIOS_VENDOR, "hoenix/FUJITSU SIEMENS"), + DMI_MATCH(DMI_BIOS_VERSION, "Version1.01"), }, + }, + { /* APM crashes */ + apm_is_horked_d850md, "Intel D850MD", + { DMI_MATCH(DMI_BIOS_VENDOR, "Intel Corp."), + DMI_MATCH(DMI_BIOS_VERSION, "MV85010A.86A.0016.P07.0201251536"), }, + }, + { /* APM crashes */ + apm_is_horked, "Intel D810EMO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Intel Corp."), + DMI_MATCH(DMI_BIOS_VERSION, "MO81010A.86A.0008.P04.0004170800"), }, + }, + { /* APM crashes */ + 
apm_is_horked, "Dell XPS-Z", + { DMI_MATCH(DMI_BIOS_VENDOR, "Intel Corp."), + DMI_MATCH(DMI_BIOS_VERSION, "A11"), + DMI_MATCH(DMI_PRODUCT_NAME, "XPS-Z"), }, + }, + { /* APM crashes */ + apm_is_horked, "Sharp PC-PJ/AX", + { DMI_MATCH(DMI_SYS_VENDOR, "SHARP"), + DMI_MATCH(DMI_PRODUCT_NAME, "PC-PJ/AX"), + DMI_MATCH(DMI_BIOS_VENDOR, "SystemSoft"), + DMI_MATCH(DMI_BIOS_VERSION, "Version R2.08"), }, + }, + { /* APM crashes */ + apm_is_horked, "Dell Inspiron 2500", + { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), + DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 2500"), + DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "A11"), }, + }, + { /* APM idle hangs */ + apm_likes_to_melt, "Jabil AMD", + { DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."), + DMI_MATCH(DMI_BIOS_VERSION, "0AASNP06"), }, + }, + { /* APM idle hangs */ + apm_likes_to_melt, "AMI Bios", + { DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."), + DMI_MATCH(DMI_BIOS_VERSION, "0AASNP05"), }, + }, + { /* Handle problems with APM on Sony Vaio PCG-N505X(DE) */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "R0206H"), + DMI_MATCH(DMI_BIOS_DATE, "08/23/99"), }, + }, + { /* Handle problems with APM on Sony Vaio PCG-N505VX */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "W2K06H0"), + DMI_MATCH(DMI_BIOS_DATE, "02/03/00"), }, + }, + { /* Handle problems with APM on Sony Vaio PCG-XG29 */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "R0117A0"), + DMI_MATCH(DMI_BIOS_DATE, "04/25/00"), }, + }, + { /* Handle problems with APM on Sony Vaio PCG-Z600NE */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "R0121Z1"), + DMI_MATCH(DMI_BIOS_DATE, "05/11/00"), }, + }, + { /* Handle problems with APM on Sony Vaio PCG-Z600NE */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "WME01Z1"), + DMI_MATCH(DMI_BIOS_DATE, "08/11/00"), }, + }, + { /* Handle problems with APM on Sony Vaio PCG-Z600LEK(DE) */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "R0206Z3"), + DMI_MATCH(DMI_BIOS_DATE, "12/25/00"), }, + }, + { /* Handle problems with APM on Sony Vaio PCG-Z505LS */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "R0203D0"), + DMI_MATCH(DMI_BIOS_DATE, "05/12/00"), }, + }, + { /* Handle problems with APM on Sony Vaio PCG-Z505LS */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "R0203Z3"), + DMI_MATCH(DMI_BIOS_DATE, "08/25/00"), }, + }, + { /* Handle problems with APM on Sony Vaio PCG-Z505LS (with updated BIOS) */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "R0209Z3"), + DMI_MATCH(DMI_BIOS_DATE, "05/12/01"), }, + }, + { /* Handle problems with APM on Sony Vaio PCG-F104K */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "R0204K2"), + DMI_MATCH(DMI_BIOS_DATE, "08/28/00"), }, + }, + + { 
/* Handle problems with APM on Sony Vaio PCG-C1VN/C1VE */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "R0208P1"), + DMI_MATCH(DMI_BIOS_DATE, "11/09/00"), }, + }, + { /* Handle problems with APM on Sony Vaio PCG-C1VE */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "R0204P1"), + DMI_MATCH(DMI_BIOS_DATE, "09/12/00"), }, + }, + { /* Handle problems with APM on Sony Vaio PCG-C1VE */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "WXPO1Z3"), + DMI_MATCH(DMI_BIOS_DATE, "10/26/01"), }, + }, + { /* broken PM poweroff bios */ + set_realmode_power_off, "Award Software v4.60 PGMA", + { DMI_MATCH(DMI_BIOS_VENDOR, "Award Software International, Inc."), + DMI_MATCH(DMI_BIOS_VERSION, "4.60 PGMA"), + DMI_MATCH(DMI_BIOS_DATE, "134526184"), }, + }, + + /* Generic per vendor APM settings */ + + { /* Allow interrupts during suspend on IBM laptops */ + set_apm_ints, "IBM", + { DMI_MATCH(DMI_SYS_VENDOR, "IBM"), }, + }, + + { } +}; + +/* + * Just start the APM thread. We do NOT want to do APM BIOS + * calls from anything but the APM thread, if for no other reason + * than the fact that we don't trust the APM BIOS. This way, + * most common APM BIOS problems that lead to protection errors + * etc will have at least some level of being contained... + * + * In short, if something bad happens, at least we have a choice + * of just killing the apm thread.. + */ +static int __init apm_init(void) +{ + struct desc_struct *gdt; + int err; + + dmi_check_system(apm_dmi_table); + + if (apm_info.bios.version == 0 || paravirt_enabled() || machine_is_olpc()) { + printk(KERN_INFO "apm: BIOS not found.\n"); + return -ENODEV; + } + printk(KERN_INFO + "apm: BIOS version %d.%d Flags 0x%02x (Driver version %s)\n", + ((apm_info.bios.version >> 8) & 0xff), + (apm_info.bios.version & 0xff), + apm_info.bios.flags, + driver_version); + if ((apm_info.bios.flags & APM_32_BIT_SUPPORT) == 0) { + printk(KERN_INFO "apm: no 32 bit BIOS support\n"); + return -ENODEV; + } + + if (allow_ints) + apm_info.allow_ints = 1; + if (broken_psr) + apm_info.get_power_status_broken = 1; + if (realmode_power_off) + apm_info.realmode_power_off = 1; + /* User can override, but default is to trust DMI */ + if (apm_disabled != -1) + apm_info.disabled = apm_disabled; + + /* + * Fix for the Compaq Contura 3/25c which reports BIOS version 0.1 + * but is reportedly a 1.0 BIOS. 
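+	 * (The version word is two BCD-style bytes, major.minor, so the
+	 * check below rewrites the bogus 0x001, i.e. "0.1", to 0x100,
+	 * i.e. "1.0".)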
+ */ + if (apm_info.bios.version == 0x001) + apm_info.bios.version = 0x100; + + /* BIOS < 1.2 doesn't set cseg_16_len */ + if (apm_info.bios.version < 0x102) + apm_info.bios.cseg_16_len = 0; /* 64k */ + + if (debug) { + printk(KERN_INFO "apm: entry %x:%x cseg16 %x dseg %x", + apm_info.bios.cseg, apm_info.bios.offset, + apm_info.bios.cseg_16, apm_info.bios.dseg); + if (apm_info.bios.version > 0x100) + printk(" cseg len %x, dseg len %x", + apm_info.bios.cseg_len, + apm_info.bios.dseg_len); + if (apm_info.bios.version > 0x101) + printk(" cseg16 len %x", apm_info.bios.cseg_16_len); + printk("\n"); + } + + if (apm_info.disabled) { + pr_notice("disabled on user request.\n"); + return -ENODEV; + } + if ((num_online_cpus() > 1) && !power_off && !smp) { + pr_notice("disabled - APM is not SMP safe.\n"); + apm_info.disabled = 1; + return -ENODEV; + } + if (!acpi_disabled) { + pr_notice("overridden by ACPI.\n"); + apm_info.disabled = 1; + return -ENODEV; + } + + /* + * Set up the long jump entry point to the APM BIOS, which is called + * from inline assembly. + */ + apm_bios_entry.offset = apm_info.bios.offset; + apm_bios_entry.segment = APM_CS; + + /* + * The APM 1.1 BIOS is supposed to provide limit information that it + * recognizes. Many machines do this correctly, but many others do + * not restrict themselves to their claimed limit. When this happens, + * they will cause a segmentation violation in the kernel at boot time. + * Most BIOS's, however, will respect a 64k limit, so we use that. + * + * Note we only set APM segments on CPU zero, since we pin the APM + * code to that CPU. + */ + gdt = get_cpu_gdt_table(0); + set_desc_base(&gdt[APM_CS >> 3], + (unsigned long)__va((unsigned long)apm_info.bios.cseg << 4)); + set_desc_base(&gdt[APM_CS_16 >> 3], + (unsigned long)__va((unsigned long)apm_info.bios.cseg_16 << 4)); + set_desc_base(&gdt[APM_DS >> 3], + (unsigned long)__va((unsigned long)apm_info.bios.dseg << 4)); + + proc_create("apm", 0, NULL, &apm_file_ops); + + kapmd_task = kthread_create(apm, NULL, "kapmd"); + if (IS_ERR(kapmd_task)) { + pr_err("disabled - Unable to start kernel thread\n"); + err = PTR_ERR(kapmd_task); + kapmd_task = NULL; + remove_proc_entry("apm", NULL); + return err; + } + wake_up_process(kapmd_task); + + if (num_online_cpus() > 1 && !smp) { + printk(KERN_NOTICE + "apm: disabled - APM is not SMP safe (power off active).\n"); + return 0; + } + + /* + * Note we don't actually care if the misc_device cannot be registered. + * this driver can do its job without it, even if userspace can't + * control it. 
just log the error.
+	 */
+	if (misc_register(&apm_device))
+		printk(KERN_WARNING "apm: Could not register misc device.\n");
+
+	if (HZ != 100)
+		idle_period = (idle_period * HZ) / 100;
+	if (idle_threshold < 100) {
+		if (!cpuidle_register_driver(&apm_idle_driver))
+			if (cpuidle_register_device(&apm_cpuidle_device))
+				cpuidle_unregister_driver(&apm_idle_driver);
+	}
+
+	return 0;
+}
+
+static void __exit apm_exit(void)
+{
+	int error;
+
+	cpuidle_unregister_device(&apm_cpuidle_device);
+	cpuidle_unregister_driver(&apm_idle_driver);
+
+	if (((apm_info.bios.flags & APM_BIOS_DISENGAGED) == 0)
+	    && (apm_info.connection_version > 0x0100)) {
+		error = apm_engage_power_management(APM_DEVICE_ALL, 0);
+		if (error)
+			apm_error("disengage power management", error);
+	}
+	misc_deregister(&apm_device);
+	remove_proc_entry("apm", NULL);
+	if (power_off)
+		pm_power_off = NULL;
+	if (kapmd_task) {
+		kthread_stop(kapmd_task);
+		kapmd_task = NULL;
+	}
+}
+
+module_init(apm_init);
+module_exit(apm_exit);
+
+MODULE_AUTHOR("Stephen Rothwell");
+MODULE_DESCRIPTION("Advanced Power Management");
+MODULE_LICENSE("GPL");
+module_param(debug, bool, 0644);
+MODULE_PARM_DESC(debug, "Enable debug mode");
+module_param(power_off, bool, 0444);
+MODULE_PARM_DESC(power_off, "Enable power off");
+module_param(bounce_interval, int, 0444);
+MODULE_PARM_DESC(bounce_interval,
+		 "Set the number of ticks to ignore suspend bounces");
+module_param(allow_ints, bool, 0444);
+MODULE_PARM_DESC(allow_ints, "Allow interrupts during BIOS calls");
+module_param(broken_psr, bool, 0444);
+MODULE_PARM_DESC(broken_psr, "BIOS has a broken GetPowerStatus call");
+module_param(realmode_power_off, bool, 0444);
+MODULE_PARM_DESC(realmode_power_off,
+		 "Switch to real mode before powering off");
+module_param(idle_threshold, int, 0444);
+MODULE_PARM_DESC(idle_threshold,
+		 "System idle percentage above which to make APM BIOS idle calls");
+module_param(idle_period, int, 0444);
+MODULE_PARM_DESC(idle_period,
+		 "Period (in sec/100) over which to calculate the idle percentage");
+module_param(smp, bool, 0444);
+MODULE_PARM_DESC(smp,
+		 "Set this to enable APM use on an SMP platform. Use with caution on older systems");
+MODULE_ALIAS_MISCDEV(APM_MINOR_DEV);
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
new file mode 100644
index 00000000000..9f6b9341950
--- /dev/null
+++ b/arch/x86/kernel/asm-offsets.c
@@ -0,0 +1,74 @@
+/*
+ * Generate definitions needed by assembly language modules.
+ * This code generates raw asm output which is post-processed to extract
+ * and format the required data.
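+ *
+ * As a rough sketch (editorial; the macro bodies are paraphrased from
+ * include/linux/kbuild.h, they are not defined in this file):
+ *
+ *	#define DEFINE(sym, val) \
+ *		asm volatile("\n->" #sym " %0 " #val : : "i" (val))
+ *	#define OFFSET(sym, str, mem) \
+ *		DEFINE(sym, offsetof(struct str, mem))
+ *
+ * Compiling this file to assembly therefore leaves "->SYMBOL value"
+ * marker lines in the output, which the build scripts turn into a
+ * generated header of "#define SYMBOL value" lines for use from .S
+ * files.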
+ */ +#define COMPILE_OFFSETS + +#include <linux/crypto.h> +#include <linux/sched.h> +#include <linux/stddef.h> +#include <linux/hardirq.h> +#include <linux/suspend.h> +#include <linux/kbuild.h> +#include <asm/processor.h> +#include <asm/thread_info.h> +#include <asm/sigframe.h> +#include <asm/bootparam.h> +#include <asm/suspend.h> + +#ifdef CONFIG_XEN +#include <xen/interface/xen.h> +#endif + +#ifdef CONFIG_X86_32 +# include "asm-offsets_32.c" +#else +# include "asm-offsets_64.c" +#endif + +void common(void) { + BLANK(); + OFFSET(TI_flags, thread_info, flags); + OFFSET(TI_status, thread_info, status); + OFFSET(TI_addr_limit, thread_info, addr_limit); + + BLANK(); + OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); + + BLANK(); + OFFSET(pbe_address, pbe, address); + OFFSET(pbe_orig_address, pbe, orig_address); + OFFSET(pbe_next, pbe, next); + +#ifdef CONFIG_PARAVIRT + BLANK(); + OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled); + OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops); + OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops); + OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable); + OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable); + OFFSET(PV_CPU_iret, pv_cpu_ops, iret); + OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit); + OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0); + OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2); +#endif + +#ifdef CONFIG_XEN + BLANK(); + OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask); + OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending); +#endif + + BLANK(); + OFFSET(BP_scratch, boot_params, scratch); + OFFSET(BP_loadflags, boot_params, hdr.loadflags); + OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); + OFFSET(BP_version, boot_params, hdr.version); + OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment); + OFFSET(BP_pref_address, boot_params, hdr.pref_address); + OFFSET(BP_code32_start, boot_params, hdr.code32_start); + + BLANK(); + DEFINE(PTREGS_SIZE, sizeof(struct pt_regs)); +} diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c new file mode 100644 index 00000000000..d67c4be3e8b --- /dev/null +++ b/arch/x86/kernel/asm-offsets_32.c @@ -0,0 +1,89 @@ +#include <asm/ucontext.h> + +#include <linux/lguest.h> +#include "../../../drivers/lguest/lg.h" + +#define __SYSCALL_I386(nr, sym, compat) [nr] = 1, +static char syscalls[] = { +#include <asm/syscalls_32.h> +}; + +/* workaround for a warning with -Wmissing-prototypes */ +void foo(void); + +void foo(void) +{ + OFFSET(IA32_SIGCONTEXT_ax, sigcontext, ax); + OFFSET(IA32_SIGCONTEXT_bx, sigcontext, bx); + OFFSET(IA32_SIGCONTEXT_cx, sigcontext, cx); + OFFSET(IA32_SIGCONTEXT_dx, sigcontext, dx); + OFFSET(IA32_SIGCONTEXT_si, sigcontext, si); + OFFSET(IA32_SIGCONTEXT_di, sigcontext, di); + OFFSET(IA32_SIGCONTEXT_bp, sigcontext, bp); + OFFSET(IA32_SIGCONTEXT_sp, sigcontext, sp); + OFFSET(IA32_SIGCONTEXT_ip, sigcontext, ip); + BLANK(); + + OFFSET(CPUINFO_x86, cpuinfo_x86, x86); + OFFSET(CPUINFO_x86_vendor, cpuinfo_x86, x86_vendor); + OFFSET(CPUINFO_x86_model, cpuinfo_x86, x86_model); + OFFSET(CPUINFO_x86_mask, cpuinfo_x86, x86_mask); + OFFSET(CPUINFO_cpuid_level, cpuinfo_x86, cpuid_level); + OFFSET(CPUINFO_x86_capability, cpuinfo_x86, x86_capability); + OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id); + BLANK(); + + OFFSET(TI_sysenter_return, thread_info, sysenter_return); + OFFSET(TI_cpu, thread_info, cpu); + BLANK(); + + OFFSET(PT_EBX, pt_regs, bx); + OFFSET(PT_ECX, 
pt_regs, cx);
+	OFFSET(PT_EDX, pt_regs, dx);
+	OFFSET(PT_ESI, pt_regs, si);
+	OFFSET(PT_EDI, pt_regs, di);
+	OFFSET(PT_EBP, pt_regs, bp);
+	OFFSET(PT_EAX, pt_regs, ax);
+	OFFSET(PT_DS, pt_regs, ds);
+	OFFSET(PT_ES, pt_regs, es);
+	OFFSET(PT_FS, pt_regs, fs);
+	OFFSET(PT_GS, pt_regs, gs);
+	OFFSET(PT_ORIG_EAX, pt_regs, orig_ax);
+	OFFSET(PT_EIP, pt_regs, ip);
+	OFFSET(PT_CS, pt_regs, cs);
+	OFFSET(PT_EFLAGS, pt_regs, flags);
+	OFFSET(PT_OLDESP, pt_regs, sp);
+	OFFSET(PT_OLDSS, pt_regs, ss);
+	BLANK();
+
+	OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext);
+	BLANK();
+
+	OFFSET(saved_context_gdt_desc, saved_context, gdt_desc);
+	BLANK();
+
+	/* Offset from the sysenter stack to tss.sp0 */
+	DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) -
+	       sizeof(struct tss_struct));
+
+#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE)
+	BLANK();
+	OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
+	OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending);
+
+	BLANK();
+	OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc);
+	OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc);
+	OFFSET(LGUEST_PAGES_host_cr3, lguest_pages, state.host_cr3);
+	OFFSET(LGUEST_PAGES_host_sp, lguest_pages, state.host_sp);
+	OFFSET(LGUEST_PAGES_guest_gdt_desc, lguest_pages, state.guest_gdt_desc);
+	OFFSET(LGUEST_PAGES_guest_idt_desc, lguest_pages, state.guest_idt_desc);
+	OFFSET(LGUEST_PAGES_guest_gdt, lguest_pages, state.guest_gdt);
+	OFFSET(LGUEST_PAGES_regs_trapnum, lguest_pages, regs.trapnum);
+	OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode);
+	OFFSET(LGUEST_PAGES_regs, lguest_pages, regs);
+#endif
+	BLANK();
+	DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
+	DEFINE(NR_syscalls, sizeof(syscalls));
+}
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
new file mode 100644
index 00000000000..e7c798b354f
--- /dev/null
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -0,0 +1,90 @@
+#include <asm/ia32.h>
+
+#define __SYSCALL_64(nr, sym, compat) [nr] = 1,
+#define __SYSCALL_COMMON(nr, sym, compat) [nr] = 1,
+#ifdef CONFIG_X86_X32_ABI
+# define __SYSCALL_X32(nr, sym, compat) [nr] = 1,
+#else
+# define __SYSCALL_X32(nr, sym, compat) /* nothing */
+#endif
+static char syscalls_64[] = {
+#include <asm/syscalls_64.h>
+};
+#define __SYSCALL_I386(nr, sym, compat) [nr] = 1,
+static char syscalls_ia32[] = {
+#include <asm/syscalls_32.h>
+};
+
+int main(void)
+{
+#ifdef CONFIG_PARAVIRT
+	OFFSET(PV_IRQ_adjust_exception_frame, pv_irq_ops, adjust_exception_frame);
+	OFFSET(PV_CPU_usergs_sysret32, pv_cpu_ops, usergs_sysret32);
+	OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64);
+	OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs);
+	BLANK();
+#endif
+
+#ifdef CONFIG_IA32_EMULATION
+	OFFSET(TI_sysenter_return, thread_info, sysenter_return);
+	BLANK();
+
+#define ENTRY(entry) OFFSET(IA32_SIGCONTEXT_ ## entry, sigcontext_ia32, entry)
+	ENTRY(ax);
+	ENTRY(bx);
+	ENTRY(cx);
+	ENTRY(dx);
+	ENTRY(si);
+	ENTRY(di);
+	ENTRY(bp);
+	ENTRY(sp);
+	ENTRY(ip);
+	BLANK();
+#undef ENTRY
+
+	OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe_ia32, uc.uc_mcontext);
+	BLANK();
+#endif
+
+#define ENTRY(entry) OFFSET(pt_regs_ ## entry, pt_regs, entry)
+	ENTRY(bx);
+	ENTRY(cx);
+	ENTRY(dx);
+	ENTRY(sp);
+	ENTRY(bp);
+	ENTRY(si);
+	ENTRY(di);
+	ENTRY(r8);
+	ENTRY(r9);
+	ENTRY(r10);
+	ENTRY(r11);
+	ENTRY(r12);
+	ENTRY(r13);
+	ENTRY(r14);
+	ENTRY(r15);
+	ENTRY(flags);
+	BLANK();
+#undef ENTRY
+
+#define
ENTRY(entry) OFFSET(saved_context_ ## entry, saved_context, entry) + ENTRY(cr0); + ENTRY(cr2); + ENTRY(cr3); + ENTRY(cr4); + ENTRY(cr8); + ENTRY(gdt_desc); + BLANK(); +#undef ENTRY + + OFFSET(TSS_ist, tss_struct, x86_tss.ist); + BLANK(); + + DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1); + DEFINE(NR_syscalls, sizeof(syscalls_64)); + + DEFINE(__NR_ia32_syscall_max, sizeof(syscalls_ia32) - 1); + DEFINE(IA32_NR_syscalls, sizeof(syscalls_ia32)); + + return 0; +} diff --git a/arch/x86/kernel/audit_64.c b/arch/x86/kernel/audit_64.c new file mode 100644 index 00000000000..06d3e5a14d9 --- /dev/null +++ b/arch/x86/kernel/audit_64.c @@ -0,0 +1,81 @@ +#include <linux/init.h> +#include <linux/types.h> +#include <linux/audit.h> +#include <asm/unistd.h> + +static unsigned dir_class[] = { +#include <asm-generic/audit_dir_write.h> +~0U +}; + +static unsigned read_class[] = { +#include <asm-generic/audit_read.h> +~0U +}; + +static unsigned write_class[] = { +#include <asm-generic/audit_write.h> +~0U +}; + +static unsigned chattr_class[] = { +#include <asm-generic/audit_change_attr.h> +~0U +}; + +static unsigned signal_class[] = { +#include <asm-generic/audit_signal.h> +~0U +}; + +int audit_classify_arch(int arch) +{ +#ifdef CONFIG_IA32_EMULATION + if (arch == AUDIT_ARCH_I386) + return 1; +#endif + return 0; +} + +int audit_classify_syscall(int abi, unsigned syscall) +{ +#ifdef CONFIG_IA32_EMULATION + extern int ia32_classify_syscall(unsigned); + if (abi == AUDIT_ARCH_I386) + return ia32_classify_syscall(syscall); +#endif + switch(syscall) { + case __NR_open: + return 2; + case __NR_openat: + return 3; + case __NR_execve: + return 5; + default: + return 0; + } +} + +static int __init audit_classes_init(void) +{ +#ifdef CONFIG_IA32_EMULATION + extern __u32 ia32_dir_class[]; + extern __u32 ia32_write_class[]; + extern __u32 ia32_read_class[]; + extern __u32 ia32_chattr_class[]; + extern __u32 ia32_signal_class[]; + audit_register_class(AUDIT_CLASS_WRITE_32, ia32_write_class); + audit_register_class(AUDIT_CLASS_READ_32, ia32_read_class); + audit_register_class(AUDIT_CLASS_DIR_WRITE_32, ia32_dir_class); + audit_register_class(AUDIT_CLASS_CHATTR_32, ia32_chattr_class); + audit_register_class(AUDIT_CLASS_SIGNAL_32, ia32_signal_class); +#endif + audit_register_class(AUDIT_CLASS_WRITE, write_class); + audit_register_class(AUDIT_CLASS_READ, read_class); + audit_register_class(AUDIT_CLASS_DIR_WRITE, dir_class); + audit_register_class(AUDIT_CLASS_CHATTR, chattr_class); + audit_register_class(AUDIT_CLASS_SIGNAL, signal_class); + return 0; +} + +__initcall(audit_classes_init); diff --git a/arch/x86/kernel/bootflag.c b/arch/x86/kernel/bootflag.c new file mode 100644 index 00000000000..5de7f4c5697 --- /dev/null +++ b/arch/x86/kernel/bootflag.c @@ -0,0 +1,101 @@ +/* + * Implement 'Simple Boot Flag Specification 2.0' + */ +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/string.h> +#include <linux/spinlock.h> +#include <linux/acpi.h> +#include <asm/io.h> + +#include <linux/mc146818rtc.h> + +#define SBF_RESERVED (0x78) +#define SBF_PNPOS (1<<0) +#define SBF_BOOTING (1<<1) +#define SBF_DIAG (1<<2) +#define SBF_PARITY (1<<7) + +int sbf_port __initdata = -1; /* set via acpi_boot_init() */ + +static int __init parity(u8 v) +{ + int x = 0; + int i; + + for (i = 0; i < 8; i++) { + x ^= (v & 1); + v >>= 1; + } + + return x; +} + +static void __init sbf_write(u8 v) +{ + unsigned long flags; + + if (sbf_port != -1) { + v &= ~SBF_PARITY; + if (!parity(v)) + v |= SBF_PARITY; + + 
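+		/*
+		 * The Simple Boot Flag spec asks for odd parity over the
+		 * byte: SBF_PARITY (bit 7) was cleared above and is set
+		 * again only when parity() of the remaining bits comes
+		 * back even, so the XOR of all eight bits is always 1.
+		 */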
printk(KERN_INFO "Simple Boot Flag at 0x%x set to 0x%x\n", + sbf_port, v); + + spin_lock_irqsave(&rtc_lock, flags); + CMOS_WRITE(v, sbf_port); + spin_unlock_irqrestore(&rtc_lock, flags); + } +} + +static u8 __init sbf_read(void) +{ + unsigned long flags; + u8 v; + + if (sbf_port == -1) + return 0; + + spin_lock_irqsave(&rtc_lock, flags); + v = CMOS_READ(sbf_port); + spin_unlock_irqrestore(&rtc_lock, flags); + + return v; +} + +static int __init sbf_value_valid(u8 v) +{ + if (v & SBF_RESERVED) /* Reserved bits */ + return 0; + if (!parity(v)) + return 0; + + return 1; +} + +static int __init sbf_init(void) +{ + u8 v; + + if (sbf_port == -1) + return 0; + + v = sbf_read(); + if (!sbf_value_valid(v)) { + printk(KERN_WARNING "Simple Boot Flag value 0x%x read from " + "CMOS RAM was invalid\n", v); + } + + v &= ~SBF_RESERVED; + v &= ~SBF_BOOTING; + v &= ~SBF_DIAG; +#if defined(CONFIG_ISAPNP) + v |= SBF_PNPOS; +#endif + sbf_write(v); + + return 0; +} +module_init(sbf_init); diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c new file mode 100644 index 00000000000..83a7995625a --- /dev/null +++ b/arch/x86/kernel/check.c @@ -0,0 +1,167 @@ +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/kthread.h> +#include <linux/workqueue.h> +#include <linux/memblock.h> + +#include <asm/proto.h> + +/* + * Some BIOSes seem to corrupt the low 64k of memory during events + * like suspend/resume and unplugging an HDMI cable. Reserve all + * remaining free memory in that area and fill it with a distinct + * pattern. + */ +#define MAX_SCAN_AREAS 8 + +static int __read_mostly memory_corruption_check = -1; + +static unsigned __read_mostly corruption_check_size = 64*1024; +static unsigned __read_mostly corruption_check_period = 60; /* seconds */ + +static struct scan_area { + u64 addr; + u64 size; +} scan_areas[MAX_SCAN_AREAS]; +static int num_scan_areas; + +static __init int set_corruption_check(char *arg) +{ + ssize_t ret; + unsigned long val; + + ret = kstrtoul(arg, 10, &val); + if (ret) + return ret; + + memory_corruption_check = val; + return 0; +} +early_param("memory_corruption_check", set_corruption_check); + +static __init int set_corruption_check_period(char *arg) +{ + ssize_t ret; + unsigned long val; + + ret = kstrtoul(arg, 10, &val); + if (ret) + return ret; + + corruption_check_period = val; + return 0; +} +early_param("memory_corruption_check_period", set_corruption_check_period); + +static __init int set_corruption_check_size(char *arg) +{ + char *end; + unsigned size; + + size = memparse(arg, &end); + + if (*end == '\0') + corruption_check_size = size; + + return (size == corruption_check_size) ? 
0 : -EINVAL; +} +early_param("memory_corruption_check_size", set_corruption_check_size); + + +void __init setup_bios_corruption_check(void) +{ + phys_addr_t start, end; + u64 i; + + if (memory_corruption_check == -1) { + memory_corruption_check = +#ifdef CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK + 1 +#else + 0 +#endif + ; + } + + if (corruption_check_size == 0) + memory_corruption_check = 0; + + if (!memory_corruption_check) + return; + + corruption_check_size = round_up(corruption_check_size, PAGE_SIZE); + + for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL) { + start = clamp_t(phys_addr_t, round_up(start, PAGE_SIZE), + PAGE_SIZE, corruption_check_size); + end = clamp_t(phys_addr_t, round_down(end, PAGE_SIZE), + PAGE_SIZE, corruption_check_size); + if (start >= end) + continue; + + memblock_reserve(start, end - start); + scan_areas[num_scan_areas].addr = start; + scan_areas[num_scan_areas].size = end - start; + + /* Assume we've already mapped this early memory */ + memset(__va(start), 0, end - start); + + if (++num_scan_areas >= MAX_SCAN_AREAS) + break; + } + + if (num_scan_areas) + printk(KERN_INFO "Scanning %d areas for low memory corruption\n", num_scan_areas); +} + + +void check_for_bios_corruption(void) +{ + int i; + int corruption = 0; + + if (!memory_corruption_check) + return; + + for (i = 0; i < num_scan_areas; i++) { + unsigned long *addr = __va(scan_areas[i].addr); + unsigned long size = scan_areas[i].size; + + for (; size; addr++, size -= sizeof(unsigned long)) { + if (!*addr) + continue; + printk(KERN_ERR "Corrupted low memory at %p (%lx phys) = %08lx\n", + addr, __pa(addr), *addr); + corruption = 1; + *addr = 0; + } + } + + WARN_ONCE(corruption, KERN_ERR "Memory corruption detected in low memory\n"); +} + +static void check_corruption(struct work_struct *dummy); +static DECLARE_DELAYED_WORK(bios_check_work, check_corruption); + +static void check_corruption(struct work_struct *dummy) +{ + check_for_bios_corruption(); + schedule_delayed_work(&bios_check_work, + round_jiffies_relative(corruption_check_period*HZ)); +} + +static int start_periodic_check_for_corruption(void) +{ + if (!num_scan_areas || !memory_corruption_check || corruption_check_period == 0) + return 0; + + printk(KERN_INFO "Scanning for low memory corruption every %d seconds\n", + corruption_check_period); + + /* First time we run the checks right away */ + schedule_delayed_work(&bios_check_work, 0); + return 0; +} + +module_init(start_periodic_check_for_corruption); + diff --git a/arch/x86/kernel/cpu/.gitignore b/arch/x86/kernel/cpu/.gitignore new file mode 100644 index 00000000000..667df55a439 --- /dev/null +++ b/arch/x86/kernel/cpu/.gitignore @@ -0,0 +1 @@ +capflags.c diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile new file mode 100644 index 00000000000..7fd54f09b01 --- /dev/null +++ b/arch/x86/kernel/cpu/Makefile @@ -0,0 +1,58 @@ +# +# Makefile for x86-compatible CPU details, features and quirks +# + +# Don't trace early stages of a secondary CPU boot +ifdef CONFIG_FUNCTION_TRACER +CFLAGS_REMOVE_common.o = -pg +CFLAGS_REMOVE_perf_event.o = -pg +endif + +# Make sure load_percpu_segment has no stackprotector +nostackp := $(call cc-option, -fno-stack-protector) +CFLAGS_common.o := $(nostackp) + +obj-y := intel_cacheinfo.o scattered.o topology.o +obj-y += proc.o capflags.o powerflags.o common.o +obj-y += rdrand.o +obj-y += match.o + +obj-$(CONFIG_X86_32) += bugs.o +obj-$(CONFIG_X86_64) += bugs_64.o + +obj-$(CONFIG_CPU_SUP_INTEL) += intel.o +obj-$(CONFIG_CPU_SUP_AMD) += 
amd.o
+obj-$(CONFIG_CPU_SUP_CYRIX_32)		+= cyrix.o
+obj-$(CONFIG_CPU_SUP_CENTAUR)		+= centaur.o
+obj-$(CONFIG_CPU_SUP_TRANSMETA_32)	+= transmeta.o
+obj-$(CONFIG_CPU_SUP_UMC_32)		+= umc.o
+
+obj-$(CONFIG_PERF_EVENTS)		+= perf_event.o
+
+ifdef CONFIG_PERF_EVENTS
+obj-$(CONFIG_CPU_SUP_AMD)		+= perf_event_amd.o perf_event_amd_uncore.o
+ifdef CONFIG_AMD_IOMMU
+obj-$(CONFIG_CPU_SUP_AMD)		+= perf_event_amd_iommu.o
+endif
+obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_p6.o perf_event_knc.o perf_event_p4.o
+obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o
+obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_intel_uncore.o perf_event_intel_rapl.o
+endif
+
+
+obj-$(CONFIG_X86_MCE)			+= mcheck/
+obj-$(CONFIG_MTRR)			+= mtrr/
+obj-$(CONFIG_MICROCODE)			+= microcode/
+
+obj-$(CONFIG_X86_LOCAL_APIC)		+= perfctr-watchdog.o perf_event_amd_ibs.o
+
+obj-$(CONFIG_HYPERVISOR_GUEST)		+= vmware.o hypervisor.o mshyperv.o
+
+quiet_cmd_mkcapflags = MKCAP   $@
+      cmd_mkcapflags = $(CONFIG_SHELL) $(srctree)/$(src)/mkcapflags.sh $< $@
+
+cpufeature = $(src)/../../include/asm/cpufeature.h
+
+targets += capflags.c
+$(obj)/capflags.c: $(cpufeature) $(src)/mkcapflags.sh FORCE
+	$(call if_changed,mkcapflags)
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
new file mode 100644
index 00000000000..ce8b8ff0e0e
--- /dev/null
+++ b/arch/x86/kernel/cpu/amd.c
@@ -0,0 +1,889 @@
+#include <linux/export.h>
+#include <linux/bitops.h>
+#include <linux/elf.h>
+#include <linux/mm.h>
+
+#include <linux/io.h>
+#include <linux/sched.h>
+#include <asm/processor.h>
+#include <asm/apic.h>
+#include <asm/cpu.h>
+#include <asm/pci-direct.h>
+
+#ifdef CONFIG_X86_64
+# include <asm/mmconfig.h>
+# include <asm/cacheflush.h>
+#endif
+
+#include "cpu.h"
+
+static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p)
+{
+	u32 gprs[8] = { 0 };
+	int err;
+
+	WARN_ONCE((boot_cpu_data.x86 != 0xf),
+		  "%s should only be used on K8!\n", __func__);
+
+	gprs[1] = msr;
+	gprs[7] = 0x9c5a203a;
+
+	err = rdmsr_safe_regs(gprs);
+
+	*p = gprs[0] | ((u64)gprs[2] << 32);
+
+	return err;
+}
+
+static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val)
+{
+	u32 gprs[8] = { 0 };
+
+	WARN_ONCE((boot_cpu_data.x86 != 0xf),
+		  "%s should only be used on K8!\n", __func__);
+
+	gprs[0] = (u32)val;
+	gprs[1] = msr;
+	gprs[2] = val >> 32;
+	gprs[7] = 0x9c5a203a;
+
+	return wrmsr_safe_regs(gprs);
+}
+
+#ifdef CONFIG_X86_32
+/*
+ * B step AMD K6 before B 9730xxxx have hardware bugs that can cause
+ * misexecution of code under Linux. Owners of such processors should
+ * contact AMD for precise details and a CPU swap.
+ *
+ * See	http://www.multimania.com/poulot/k6bug.html
+ * and	section 2.6.2 of "AMD-K6 Processor Revision Guide - Model 6"
+ *	(Publication # 21266 Issue Date: August 1998)
+ *
+ * The following test is erm.. interesting. AMD neglected to up
+ * the chip setting when fixing the bug but they also tweaked some
+ * performance at the same time..
+ */
+
+extern __visible void vide(void);
+__asm__(".globl vide\n\t.align 4\nvide: ret");
+
+static void init_amd_k5(struct cpuinfo_x86 *c)
+{
+/*
+ * General Systems BIOSen alias the cpu frequency registers
+ * of the Elan at 0x000df000. Unfortunately, one of the Linux
+ * drivers subsequently pokes it, and changes the CPU speed.
+ * Workaround: Remove the unneeded alias.
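+ * (How the code below does that: if the alias is currently enabled,
+ * i.e. CBAR_ENB reads back set, it writes back just the magic key
+ * byte, 0 | CBAR_KEY, leaving the enable bit clear and switching the
+ * alias off.)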
+ */ +#define CBAR (0xfffc) /* Configuration Base Address (32-bit) */ +#define CBAR_ENB (0x80000000) +#define CBAR_KEY (0X000000CB) + if (c->x86_model == 9 || c->x86_model == 10) { + if (inl(CBAR) & CBAR_ENB) + outl(0 | CBAR_KEY, CBAR); + } +} + + +static void init_amd_k6(struct cpuinfo_x86 *c) +{ + u32 l, h; + int mbytes = get_num_physpages() >> (20-PAGE_SHIFT); + + if (c->x86_model < 6) { + /* Based on AMD doc 20734R - June 2000 */ + if (c->x86_model == 0) { + clear_cpu_cap(c, X86_FEATURE_APIC); + set_cpu_cap(c, X86_FEATURE_PGE); + } + return; + } + + if (c->x86_model == 6 && c->x86_mask == 1) { + const int K6_BUG_LOOP = 1000000; + int n; + void (*f_vide)(void); + unsigned long d, d2; + + printk(KERN_INFO "AMD K6 stepping B detected - "); + + /* + * It looks like AMD fixed the 2.6.2 bug and improved indirect + * calls at the same time. + */ + + n = K6_BUG_LOOP; + f_vide = vide; + rdtscl(d); + while (n--) + f_vide(); + rdtscl(d2); + d = d2-d; + + if (d > 20*K6_BUG_LOOP) + printk(KERN_CONT + "system stability may be impaired when more than 32 MB are used.\n"); + else + printk(KERN_CONT "probably OK (after B9730xxxx).\n"); + } + + /* K6 with old style WHCR */ + if (c->x86_model < 8 || + (c->x86_model == 8 && c->x86_mask < 8)) { + /* We can only write allocate on the low 508Mb */ + if (mbytes > 508) + mbytes = 508; + + rdmsr(MSR_K6_WHCR, l, h); + if ((l&0x0000FFFF) == 0) { + unsigned long flags; + l = (1<<0)|((mbytes/4)<<1); + local_irq_save(flags); + wbinvd(); + wrmsr(MSR_K6_WHCR, l, h); + local_irq_restore(flags); + printk(KERN_INFO "Enabling old style K6 write allocation for %d Mb\n", + mbytes); + } + return; + } + + if ((c->x86_model == 8 && c->x86_mask > 7) || + c->x86_model == 9 || c->x86_model == 13) { + /* The more serious chips .. */ + + if (mbytes > 4092) + mbytes = 4092; + + rdmsr(MSR_K6_WHCR, l, h); + if ((l&0xFFFF0000) == 0) { + unsigned long flags; + l = ((mbytes>>2)<<22)|(1<<16); + local_irq_save(flags); + wbinvd(); + wrmsr(MSR_K6_WHCR, l, h); + local_irq_restore(flags); + printk(KERN_INFO "Enabling new style K6 write allocation for %d Mb\n", + mbytes); + } + + return; + } + + if (c->x86_model == 10) { + /* AMD Geode LX is model 10 */ + /* placeholder for any needed mods */ + return; + } +} + +static void amd_k7_smp_check(struct cpuinfo_x86 *c) +{ + /* calling is from identify_secondary_cpu() ? */ + if (!c->cpu_index) + return; + + /* + * Certain Athlons might work (for various values of 'work') in SMP + * but they are not certified as MP capable. + */ + /* Athlon 660/661 is valid. */ + if ((c->x86_model == 6) && ((c->x86_mask == 0) || + (c->x86_mask == 1))) + return; + + /* Duron 670 is valid */ + if ((c->x86_model == 7) && (c->x86_mask == 0)) + return; + + /* + * Athlon 662, Duron 671, and Athlon >model 7 have capability + * bit. It's worth noting that the A5 stepping (662) of some + * Athlon XP's have the MP bit set. + * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for + * more. + */ + if (((c->x86_model == 6) && (c->x86_mask >= 2)) || + ((c->x86_model == 7) && (c->x86_mask >= 1)) || + (c->x86_model > 7)) + if (cpu_has_mp) + return; + + /* If we get here, not a certified SMP capable AMD system. 
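 */

/*
 * Editorial sketch, not part of the patch: the certification whitelist that
 * amd_k7_smp_check() above walks through, folded into a single predicate.
 * 'has_mp' stands in for the cpu_has_mp capability test; the function name
 * is the editor's own.
 */
static int k7_mp_certified(int model, int stepping, int has_mp)
{
	if (model == 6 && (stepping == 0 || stepping == 1))
		return 1;			/* Athlon 660/661 */
	if (model == 7 && stepping == 0)
		return 1;			/* Duron 670 */
	if (((model == 6 && stepping >= 2) ||
	     (model == 7 && stepping >= 1) ||
	     model > 7) && has_mp)
		return 1;			/* MP capability bit present */
	return 0;				/* not certified for SMP */
}

/*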
*/ + + /* + * Don't taint if we are running SMP kernel on a single non-MP + * approved Athlon + */ + WARN_ONCE(1, "WARNING: This combination of AMD" + " processors is not suitable for SMP.\n"); + add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_NOW_UNRELIABLE); +} + +static void init_amd_k7(struct cpuinfo_x86 *c) +{ + u32 l, h; + + /* + * Bit 15 of Athlon specific MSR 15, needs to be 0 + * to enable SSE on Palomino/Morgan/Barton CPU's. + * If the BIOS didn't enable it already, enable it here. + */ + if (c->x86_model >= 6 && c->x86_model <= 10) { + if (!cpu_has(c, X86_FEATURE_XMM)) { + printk(KERN_INFO "Enabling disabled K7/SSE Support.\n"); + msr_clear_bit(MSR_K7_HWCR, 15); + set_cpu_cap(c, X86_FEATURE_XMM); + } + } + + /* + * It's been determined by AMD that Athlons since model 8 stepping 1 + * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx + * As per AMD technical note 27212 0.2 + */ + if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) { + rdmsr(MSR_K7_CLK_CTL, l, h); + if ((l & 0xfff00000) != 0x20000000) { + printk(KERN_INFO + "CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", + l, ((l & 0x000fffff)|0x20000000)); + wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h); + } + } + + set_cpu_cap(c, X86_FEATURE_K7); + + amd_k7_smp_check(c); +} +#endif + +#ifdef CONFIG_NUMA +/* + * To workaround broken NUMA config. Read the comment in + * srat_detect_node(). + */ +static int nearby_node(int apicid) +{ + int i, node; + + for (i = apicid - 1; i >= 0; i--) { + node = __apicid_to_node[i]; + if (node != NUMA_NO_NODE && node_online(node)) + return node; + } + for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) { + node = __apicid_to_node[i]; + if (node != NUMA_NO_NODE && node_online(node)) + return node; + } + return first_node(node_online_map); /* Shouldn't happen */ +} +#endif + +/* + * Fixup core topology information for + * (1) AMD multi-node processors + * Assumption: Number of cores in each internal node is the same. + * (2) AMD processors supporting compute units + */ +#ifdef CONFIG_X86_HT +static void amd_get_topology(struct cpuinfo_x86 *c) +{ + u32 nodes, cores_per_cu = 1; + u8 node_id; + int cpu = smp_processor_id(); + + /* get information required for multi-node processors */ + if (cpu_has_topoext) { + u32 eax, ebx, ecx, edx; + + cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); + nodes = ((ecx >> 8) & 7) + 1; + node_id = ecx & 7; + + /* get compute unit information */ + smp_num_siblings = ((ebx >> 8) & 3) + 1; + c->compute_unit_id = ebx & 0xff; + cores_per_cu += ((ebx >> 8) & 3); + } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) { + u64 value; + + rdmsrl(MSR_FAM10H_NODE_ID, value); + nodes = ((value >> 3) & 7) + 1; + node_id = value & 7; + } else + return; + + /* fixup multi-node processor information */ + if (nodes > 1) { + u32 cores_per_node; + u32 cus_per_node; + + set_cpu_cap(c, X86_FEATURE_AMD_DCM); + cores_per_node = c->x86_max_cores / nodes; + cus_per_node = cores_per_node / cores_per_cu; + + /* store NodeID, use llc_shared_map to store sibling info */ + per_cpu(cpu_llc_id, cpu) = node_id; + + /* core id has to be in the [0 .. cores_per_node - 1] range */ + c->cpu_core_id %= cores_per_node; + c->compute_unit_id %= cus_per_node; + } +} +#endif + +/* + * On a AMD dual core setup the lower bits of the APIC id distinguish the cores. + * Assumes number of cores is a power of two. 
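 */

/*
 * Editorial sketch, not part of the patch: the CPUID 0x8000001e field
 * layout that amd_get_topology() above relies on, pulled out as plain
 * decoders (names are the editor's own).
 */
#include <stdint.h>

static unsigned int topoext_nodes_per_processor(uint32_t ecx)
{
	return ((ecx >> 8) & 7) + 1;	/* ECX[10:8] + 1 */
}

static unsigned int topoext_node_id(uint32_t ecx)
{
	return ecx & 7;			/* ECX[2:0] */
}

static unsigned int topoext_cores_per_cu(uint32_t ebx)
{
	return ((ebx >> 8) & 3) + 1;	/* EBX[9:8] + 1 */
}

static unsigned int topoext_compute_unit_id(uint32_t ebx)
{
	return ebx & 0xff;		/* EBX[7:0] */
}

/*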
+ */ +static void amd_detect_cmp(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_HT + unsigned bits; + int cpu = smp_processor_id(); + + bits = c->x86_coreid_bits; + /* Low order bits define the core id (index of core in socket) */ + c->cpu_core_id = c->initial_apicid & ((1 << bits)-1); + /* Convert the initial APIC ID into the socket ID */ + c->phys_proc_id = c->initial_apicid >> bits; + /* use socket ID also for last level cache */ + per_cpu(cpu_llc_id, cpu) = c->phys_proc_id; + amd_get_topology(c); +#endif +} + +u16 amd_get_nb_id(int cpu) +{ + u16 id = 0; +#ifdef CONFIG_SMP + id = per_cpu(cpu_llc_id, cpu); +#endif + return id; +} +EXPORT_SYMBOL_GPL(amd_get_nb_id); + +static void srat_detect_node(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_NUMA + int cpu = smp_processor_id(); + int node; + unsigned apicid = c->apicid; + + node = numa_cpu_node(cpu); + if (node == NUMA_NO_NODE) + node = per_cpu(cpu_llc_id, cpu); + + /* + * On multi-fabric platform (e.g. Numascale NumaChip) a + * platform-specific handler needs to be called to fixup some + * IDs of the CPU. + */ + if (x86_cpuinit.fixup_cpu_id) + x86_cpuinit.fixup_cpu_id(c, node); + + if (!node_online(node)) { + /* + * Two possibilities here: + * + * - The CPU is missing memory and no node was created. In + * that case try picking one from a nearby CPU. + * + * - The APIC IDs differ from the HyperTransport node IDs + * which the K8 northbridge parsing fills in. Assume + * they are all increased by a constant offset, but in + * the same order as the HT nodeids. If that doesn't + * result in a usable node fall back to the path for the + * previous case. + * + * This workaround operates directly on the mapping between + * APIC ID and NUMA node, assuming certain relationship + * between APIC ID, HT node ID and NUMA topology. As going + * through CPU mapping may alter the outcome, directly + * access __apicid_to_node[]. + */ + int ht_nodeid = c->initial_apicid; + + if (ht_nodeid >= 0 && + __apicid_to_node[ht_nodeid] != NUMA_NO_NODE) + node = __apicid_to_node[ht_nodeid]; + /* Pick a nearby node */ + if (!node_online(node)) + node = nearby_node(apicid); + } + numa_set_node(cpu, node); +#endif +} + +static void early_init_amd_mc(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_HT + unsigned bits, ecx; + + /* Multi core CPU? */ + if (c->extended_cpuid_level < 0x80000008) + return; + + ecx = cpuid_ecx(0x80000008); + + c->x86_max_cores = (ecx & 0xff) + 1; + + /* CPU telling us the core id bits shift? */ + bits = (ecx >> 12) & 0xF; + + /* Otherwise recompute */ + if (bits == 0) { + while ((1 << bits) < c->x86_max_cores) + bits++; + } + + c->x86_coreid_bits = bits; +#endif +} + +static void bsp_init_amd(struct cpuinfo_x86 *c) +{ + if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) { + + if (c->x86 > 0x10 || + (c->x86 == 0x10 && c->x86_model >= 0x2)) { + u64 val; + + rdmsrl(MSR_K7_HWCR, val); + if (!(val & BIT(24))) + printk(KERN_WARNING FW_BUG "TSC doesn't count " + "with P0 frequency!\n"); + } + } + + if (c->x86 == 0x15) { + unsigned long upperbit; + u32 cpuid, assoc; + + cpuid = cpuid_edx(0x80000005); + assoc = cpuid >> 16 & 0xff; + upperbit = ((cpuid >> 24) << 10) / assoc; + + va_align.mask = (upperbit - 1) & PAGE_MASK; + va_align.flags = ALIGN_VA_32 | ALIGN_VA_64; + } +} + +static void early_init_amd(struct cpuinfo_x86 *c) +{ + early_init_amd_mc(c); + + /* + * c->x86_power is 8000_0007 edx. 
Bit 8 is TSC runs at constant rate + * with P/T states and does not stop in deep C-states + */ + if (c->x86_power & (1 << 8)) { + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); + if (!check_tsc_unstable()) + set_sched_clock_stable(); + } + +#ifdef CONFIG_X86_64 + set_cpu_cap(c, X86_FEATURE_SYSCALL32); +#else + /* Set MTRR capability flag if appropriate */ + if (c->x86 == 5) + if (c->x86_model == 13 || c->x86_model == 9 || + (c->x86_model == 8 && c->x86_mask >= 8)) + set_cpu_cap(c, X86_FEATURE_K6_MTRR); +#endif +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI) + /* check CPU config space for extended APIC ID */ + if (cpu_has_apic && c->x86 >= 0xf) { + unsigned int val; + val = read_pci_config(0, 24, 0, 0x68); + if ((val & ((1 << 17) | (1 << 18))) == ((1 << 17) | (1 << 18))) + set_cpu_cap(c, X86_FEATURE_EXTD_APICID); + } +#endif + + /* F16h erratum 793, CVE-2013-6885 */ + if (c->x86 == 0x16 && c->x86_model <= 0xf) + msr_set_bit(MSR_AMD64_LS_CFG, 15); +} + +static const int amd_erratum_383[]; +static const int amd_erratum_400[]; +static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum); + +static void init_amd(struct cpuinfo_x86 *c) +{ + u32 dummy; + unsigned long long value; + +#ifdef CONFIG_SMP + /* + * Disable TLB flush filter by setting HWCR.FFDIS on K8 + * bit 6 of msr C001_0015 + * + * Errata 63 for SH-B3 steppings + * Errata 122 for all steppings (F+ have it disabled by default) + */ + if (c->x86 == 0xf) + msr_set_bit(MSR_K7_HWCR, 6); +#endif + + early_init_amd(c); + + /* + * Bit 31 in normal CPUID used for nonstandard 3DNow ID; + * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway + */ + clear_cpu_cap(c, 0*32+31); + +#ifdef CONFIG_X86_64 + /* On C+ stepping K8 rep microcode works well for copy/memset */ + if (c->x86 == 0xf) { + u32 level; + + level = cpuid_eax(1); + if ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58) + set_cpu_cap(c, X86_FEATURE_REP_GOOD); + + /* + * Some BIOSes incorrectly force this feature, but only K8 + * revision D (model = 0x14) and later actually support it. + * (AMD Erratum #110, docId: 25759). + */ + if (c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM)) { + clear_cpu_cap(c, X86_FEATURE_LAHF_LM); + if (!rdmsrl_amd_safe(0xc001100d, &value)) { + value &= ~(1ULL << 32); + wrmsrl_amd_safe(0xc001100d, value); + } + } + + } + if (c->x86 >= 0x10) + set_cpu_cap(c, X86_FEATURE_REP_GOOD); + + /* get apicid instead of initial apic id from cpuid */ + c->apicid = hard_smp_processor_id(); +#else + + /* + * FIXME: We should handle the K5 here. Set up the write + * range and also turn on MSR 83 bits 4 and 31 (write alloc, + * no bus pipeline) + */ + + switch (c->x86) { + case 4: + init_amd_k5(c); + break; + case 5: + init_amd_k6(c); + break; + case 6: /* An Athlon/Duron */ + init_amd_k7(c); + break; + } + + /* K6s reports MCEs but don't actually have all the MSRs */ + if (c->x86 < 6) + clear_cpu_cap(c, X86_FEATURE_MCE); +#endif + + /* Enable workaround for FXSAVE leak */ + if (c->x86 >= 6) + set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK); + + if (!c->x86_model_id[0]) { + switch (c->x86) { + case 0xf: + /* Should distinguish Models here, but this is only + a fallback anyways. 
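 */

/*
 * Editorial sketch, not part of the patch: how amd_detect_cmp() and
 * early_init_amd_mc() earlier in this hunk carve the initial APIC ID into
 * core and socket IDs. The bit count comes from CPUID 0x80000008
 * ECX[15:12], or is recomputed from the core count when the CPU reports
 * zero. Function names are the editor's own.
 */
static unsigned int coreid_bits(unsigned int reported, unsigned int max_cores)
{
	unsigned int bits = reported;

	if (bits == 0)
		while ((1u << bits) < max_cores)
			bits++;
	return bits;
}

static void split_apicid(unsigned int apicid, unsigned int bits,
			 unsigned int *core_id, unsigned int *socket_id)
{
	*core_id = apicid & ((1u << bits) - 1);	/* low bits: core in socket */
	*socket_id = apicid >> bits;		/* remaining bits: socket */
}

/*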
*/ + strcpy(c->x86_model_id, "Hammer"); + break; + } + } + + /* re-enable TopologyExtensions if switched off by BIOS */ + if ((c->x86 == 0x15) && + (c->x86_model >= 0x10) && (c->x86_model <= 0x1f) && + !cpu_has(c, X86_FEATURE_TOPOEXT)) { + + if (msr_set_bit(0xc0011005, 54) > 0) { + rdmsrl(0xc0011005, value); + if (value & BIT_64(54)) { + set_cpu_cap(c, X86_FEATURE_TOPOEXT); + pr_info(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n"); + } + } + } + + /* + * The way access filter has a performance penalty on some workloads. + * Disable it on the affected CPUs. + */ + if ((c->x86 == 0x15) && + (c->x86_model >= 0x02) && (c->x86_model < 0x20)) { + + if (!rdmsrl_safe(0xc0011021, &value) && !(value & 0x1E)) { + value |= 0x1E; + wrmsrl_safe(0xc0011021, value); + } + } + + cpu_detect_cache_sizes(c); + + /* Multi core CPU? */ + if (c->extended_cpuid_level >= 0x80000008) { + amd_detect_cmp(c); + srat_detect_node(c); + } + +#ifdef CONFIG_X86_32 + detect_ht(c); +#endif + + init_amd_cacheinfo(c); + + if (c->x86 >= 0xf) + set_cpu_cap(c, X86_FEATURE_K8); + + if (cpu_has_xmm2) { + /* MFENCE stops RDTSC speculation */ + set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); + } + +#ifdef CONFIG_X86_64 + if (c->x86 == 0x10) { + /* do this for boot cpu */ + if (c == &boot_cpu_data) + check_enable_amd_mmconf_dmi(); + + fam10h_check_enable_mmcfg(); + } + + if (c == &boot_cpu_data && c->x86 >= 0xf) { + unsigned long long tseg; + + /* + * Split up direct mapping around the TSEG SMM area. + * Don't do it for gbpages because there seems very little + * benefit in doing so. + */ + if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) { + unsigned long pfn = tseg >> PAGE_SHIFT; + + printk(KERN_DEBUG "tseg: %010llx\n", tseg); + if (pfn_range_is_mapped(pfn, pfn + 1)) + set_memory_4k((unsigned long)__va(tseg), 1); + } + } +#endif + + /* + * Family 0x12 and above processors have APIC timer + * running in deep C states. + */ + if (c->x86 > 0x11) + set_cpu_cap(c, X86_FEATURE_ARAT); + + if (c->x86 == 0x10) { + /* + * Disable GART TLB Walk Errors on Fam10h. We do this here + * because this is always needed when GART is enabled, even in a + * kernel which has no MCE support built in. + * BIOS should disable GartTlbWlk Errors already. If + * it doesn't, do it here as suggested by the BKDG. + * + * Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=33012 + */ + msr_set_bit(MSR_AMD64_MCx_MASK(4), 10); + + /* + * On family 10h BIOS may not have properly enabled WC+ support, + * causing it to be converted to CD memtype. This may result in + * performance degradation for certain nested-paging guests. + * Prevent this conversion by clearing bit 24 in + * MSR_AMD64_BU_CFG2. + * + * NOTE: we want to use the _safe accessors so as not to #GP kvm + * guests on older kvm hosts. 
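 */

/*
 * Editorial sketch, not part of the patch: the erratum checks just below
 * use the AMD_MODEL_RANGE() encoding defined near the end of this file.
 * Each range packs family/model/stepping bounds into one int; the CPU's
 * own model and stepping are packed the same way before comparing. The
 * predicate name is the editor's own.
 */
static int ms_in_erratum_range(unsigned int family, unsigned int model,
			       unsigned int stepping, unsigned int range)
{
	unsigned int ms = (model << 4) | stepping;	/* 12-bit model:stepping */
	unsigned int r_family = (range >> 24) & 0xff;
	unsigned int r_start  = (range >> 12) & 0xfff;	/* m_start:s_start */
	unsigned int r_end    = range & 0xfff;		/* m_end:s_end */

	return family == r_family && ms >= r_start && ms <= r_end;
}

/*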
+ */ + msr_clear_bit(MSR_AMD64_BU_CFG2, 24); + + if (cpu_has_amd_erratum(c, amd_erratum_383)) + set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH); + } + + if (cpu_has_amd_erratum(c, amd_erratum_400)) + set_cpu_bug(c, X86_BUG_AMD_APIC_C1E); + + rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); +} + +#ifdef CONFIG_X86_32 +static unsigned int amd_size_cache(struct cpuinfo_x86 *c, unsigned int size) +{ + /* AMD errata T13 (order #21922) */ + if ((c->x86 == 6)) { + /* Duron Rev A0 */ + if (c->x86_model == 3 && c->x86_mask == 0) + size = 64; + /* Tbird rev A1/A2 */ + if (c->x86_model == 4 && + (c->x86_mask == 0 || c->x86_mask == 1)) + size = 256; + } + return size; +} +#endif + +static void cpu_set_tlb_flushall_shift(struct cpuinfo_x86 *c) +{ + tlb_flushall_shift = 6; +} + +static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c) +{ + u32 ebx, eax, ecx, edx; + u16 mask = 0xfff; + + if (c->x86 < 0xf) + return; + + if (c->extended_cpuid_level < 0x80000006) + return; + + cpuid(0x80000006, &eax, &ebx, &ecx, &edx); + + tlb_lld_4k[ENTRIES] = (ebx >> 16) & mask; + tlb_lli_4k[ENTRIES] = ebx & mask; + + /* + * K8 doesn't have 2M/4M entries in the L2 TLB so read out the L1 TLB + * characteristics from the CPUID function 0x80000005 instead. + */ + if (c->x86 == 0xf) { + cpuid(0x80000005, &eax, &ebx, &ecx, &edx); + mask = 0xff; + } + + /* Handle DTLB 2M and 4M sizes, fall back to L1 if L2 is disabled */ + if (!((eax >> 16) & mask)) + tlb_lld_2m[ENTRIES] = (cpuid_eax(0x80000005) >> 16) & 0xff; + else + tlb_lld_2m[ENTRIES] = (eax >> 16) & mask; + + /* a 4M entry uses two 2M entries */ + tlb_lld_4m[ENTRIES] = tlb_lld_2m[ENTRIES] >> 1; + + /* Handle ITLB 2M and 4M sizes, fall back to L1 if L2 is disabled */ + if (!(eax & mask)) { + /* Erratum 658 */ + if (c->x86 == 0x15 && c->x86_model <= 0x1f) { + tlb_lli_2m[ENTRIES] = 1024; + } else { + cpuid(0x80000005, &eax, &ebx, &ecx, &edx); + tlb_lli_2m[ENTRIES] = eax & 0xff; + } + } else + tlb_lli_2m[ENTRIES] = eax & mask; + + tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1; + + cpu_set_tlb_flushall_shift(c); +} + +static const struct cpu_dev amd_cpu_dev = { + .c_vendor = "AMD", + .c_ident = { "AuthenticAMD" }, +#ifdef CONFIG_X86_32 + .legacy_models = { + { .family = 4, .model_names = + { + [3] = "486 DX/2", + [7] = "486 DX/2-WB", + [8] = "486 DX/4", + [9] = "486 DX/4-WB", + [14] = "Am5x86-WT", + [15] = "Am5x86-WB" + } + }, + }, + .legacy_cache_size = amd_size_cache, +#endif + .c_early_init = early_init_amd, + .c_detect_tlb = cpu_detect_tlb_amd, + .c_bsp_init = bsp_init_amd, + .c_init = init_amd, + .c_x86_vendor = X86_VENDOR_AMD, +}; + +cpu_dev_register(amd_cpu_dev); + +/* + * AMD errata checking + * + * Errata are defined as arrays of ints using the AMD_LEGACY_ERRATUM() or + * AMD_OSVW_ERRATUM() macros. The latter is intended for newer errata that + * have an OSVW id assigned, which it takes as first argument. Both take a + * variable number of family-specific model-stepping ranges created by + * AMD_MODEL_RANGE(). + * + * Example: + * + * const int amd_erratum_319[] = + * AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0x4, 0x2), + * AMD_MODEL_RANGE(0x10, 0x8, 0x0, 0x8, 0x0), + * AMD_MODEL_RANGE(0x10, 0x9, 0x0, 0x9, 0x0)); + */ + +#define AMD_LEGACY_ERRATUM(...) { -1, __VA_ARGS__, 0 } +#define AMD_OSVW_ERRATUM(osvw_id, ...) 
{ osvw_id, __VA_ARGS__, 0 } +#define AMD_MODEL_RANGE(f, m_start, s_start, m_end, s_end) \ + ((f << 24) | (m_start << 16) | (s_start << 12) | (m_end << 4) | (s_end)) +#define AMD_MODEL_RANGE_FAMILY(range) (((range) >> 24) & 0xff) +#define AMD_MODEL_RANGE_START(range) (((range) >> 12) & 0xfff) +#define AMD_MODEL_RANGE_END(range) ((range) & 0xfff) + +static const int amd_erratum_400[] = + AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf), + AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf)); + +static const int amd_erratum_383[] = + AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf)); + + +static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum) +{ + int osvw_id = *erratum++; + u32 range; + u32 ms; + + if (osvw_id >= 0 && osvw_id < 65536 && + cpu_has(cpu, X86_FEATURE_OSVW)) { + u64 osvw_len; + + rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, osvw_len); + if (osvw_id < osvw_len) { + u64 osvw_bits; + + rdmsrl(MSR_AMD64_OSVW_STATUS + (osvw_id >> 6), + osvw_bits); + return osvw_bits & (1ULL << (osvw_id & 0x3f)); + } + } + + /* OSVW unavailable or ID unknown, match family-model-stepping range */ + ms = (cpu->x86_model << 4) | cpu->x86_mask; + while ((range = *erratum++)) + if ((cpu->x86 == AMD_MODEL_RANGE_FAMILY(range)) && + (ms >= AMD_MODEL_RANGE_START(range)) && + (ms <= AMD_MODEL_RANGE_END(range))) + return true; + + return false; +} diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c new file mode 100644 index 00000000000..03445346ee0 --- /dev/null +++ b/arch/x86/kernel/cpu/bugs.c @@ -0,0 +1,94 @@ +/* + * Copyright (C) 1994 Linus Torvalds + * + * Cyrix stuff, June 1998 by: + * - Rafael R. Reilova (moved everything from head.S), + * <rreilova@ececs.uc.edu> + * - Channing Corn (tests & fixes), + * - Andrew D. Balsa (code cleanup). + */ +#include <linux/init.h> +#include <linux/utsname.h> +#include <asm/bugs.h> +#include <asm/processor.h> +#include <asm/processor-flags.h> +#include <asm/i387.h> +#include <asm/msr.h> +#include <asm/paravirt.h> +#include <asm/alternative.h> + +static double __initdata x = 4195835.0; +static double __initdata y = 3145727.0; + +/* + * This used to check for exceptions.. + * However, it turns out that to support that, + * the XMM trap handlers basically had to + * be buggy. So let's have a correct XMM trap + * handler, and forget about printing out + * some status at boot. + * + * We should really only care about bugs here + * anyway. Not features. + */ +static void __init check_fpu(void) +{ + s32 fdiv_bug; + + kernel_fpu_begin(); + + /* + * trap_init() enabled FXSR and company _before_ testing for FP + * problems here. + * + * Test for the divl bug: http://en.wikipedia.org/wiki/Fdiv_bug + */ + __asm__("fninit\n\t" + "fldl %1\n\t" + "fdivl %2\n\t" + "fmull %2\n\t" + "fldl %1\n\t" + "fsubp %%st,%%st(1)\n\t" + "fistpl %0\n\t" + "fwait\n\t" + "fninit" + : "=m" (*&fdiv_bug) + : "m" (*&x), "m" (*&y)); + + kernel_fpu_end(); + + if (fdiv_bug) { + set_cpu_bug(&boot_cpu_data, X86_BUG_FDIV); + pr_warn("Hmm, FPU with FDIV bug\n"); + } +} + +void __init check_bugs(void) +{ + identify_boot_cpu(); +#ifndef CONFIG_SMP + pr_info("CPU: "); + print_cpu_info(&boot_cpu_data); +#endif + + /* + * Check whether we are able to run this kernel safely on SMP. + * + * - i386 is no longer supported. + * - In order to run on anything without a TSC, we need to be + * compiled for a i486. + */ + if (boot_cpu_data.x86 < 4) + panic("Kernel requires i486+ for 'invlpg' and other features"); + + init_utsname()->machine[1] = + '0' + (boot_cpu_data.x86 > 6 ? 
6 : boot_cpu_data.x86); + alternative_instructions(); + + /* + * kernel_fpu_begin/end() in check_fpu() relies on the patched + * alternative instructions. + */ + if (cpu_has_fpu) + check_fpu(); +} diff --git a/arch/x86/kernel/cpu/bugs_64.c b/arch/x86/kernel/cpu/bugs_64.c new file mode 100644 index 00000000000..04f0fe5af83 --- /dev/null +++ b/arch/x86/kernel/cpu/bugs_64.c @@ -0,0 +1,33 @@ +/* + * Copyright (C) 1994 Linus Torvalds + * Copyright (C) 2000 SuSE + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <asm/alternative.h> +#include <asm/bugs.h> +#include <asm/processor.h> +#include <asm/mtrr.h> +#include <asm/cacheflush.h> + +void __init check_bugs(void) +{ + identify_boot_cpu(); +#if !defined(CONFIG_SMP) + printk(KERN_INFO "CPU: "); + print_cpu_info(&boot_cpu_data); +#endif + alternative_instructions(); + + /* + * Make sure the first 2MB area is not mapped by huge pages + * There are typically fixed size MTRRs in there and overlapping + * MTRRs into large pages causes slow downs. + * + * Right now we don't do that with gbpages because there seems + * very little benefit for that case. + */ + if (!direct_gbpages) + set_memory_4k((unsigned long)__va(0), 1); +} diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c new file mode 100644 index 00000000000..d8fba5c15fb --- /dev/null +++ b/arch/x86/kernel/cpu/centaur.c @@ -0,0 +1,229 @@ +#include <linux/bitops.h> +#include <linux/kernel.h> + +#include <asm/processor.h> +#include <asm/e820.h> +#include <asm/mtrr.h> +#include <asm/msr.h> + +#include "cpu.h" + +#define ACE_PRESENT (1 << 6) +#define ACE_ENABLED (1 << 7) +#define ACE_FCR (1 << 28) /* MSR_VIA_FCR */ + +#define RNG_PRESENT (1 << 2) +#define RNG_ENABLED (1 << 3) +#define RNG_ENABLE (1 << 6) /* MSR_VIA_RNG */ + +static void init_c3(struct cpuinfo_x86 *c) +{ + u32 lo, hi; + + /* Test for Centaur Extended Feature Flags presence */ + if (cpuid_eax(0xC0000000) >= 0xC0000001) { + u32 tmp = cpuid_edx(0xC0000001); + + /* enable ACE unit, if present and disabled */ + if ((tmp & (ACE_PRESENT | ACE_ENABLED)) == ACE_PRESENT) { + rdmsr(MSR_VIA_FCR, lo, hi); + lo |= ACE_FCR; /* enable ACE unit */ + wrmsr(MSR_VIA_FCR, lo, hi); + printk(KERN_INFO "CPU: Enabled ACE h/w crypto\n"); + } + + /* enable RNG unit, if present and disabled */ + if ((tmp & (RNG_PRESENT | RNG_ENABLED)) == RNG_PRESENT) { + rdmsr(MSR_VIA_RNG, lo, hi); + lo |= RNG_ENABLE; /* enable RNG unit */ + wrmsr(MSR_VIA_RNG, lo, hi); + printk(KERN_INFO "CPU: Enabled h/w RNG\n"); + } + + /* store Centaur Extended Feature Flags as + * word 5 of the CPU capability bit array + */ + c->x86_capability[5] = cpuid_edx(0xC0000001); + } +#ifdef CONFIG_X86_32 + /* Cyrix III family needs CX8 & PGE explicitly enabled. */ + if (c->x86_model >= 6 && c->x86_model <= 13) { + rdmsr(MSR_VIA_FCR, lo, hi); + lo |= (1<<1 | 1<<7); + wrmsr(MSR_VIA_FCR, lo, hi); + set_cpu_cap(c, X86_FEATURE_CX8); + } + + /* Before Nehemiah, the C3's had 3dNOW! 
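 */

/*
 * Editorial sketch, not part of the patch: the check_fpu() FDIV test
 * earlier in this hunk, re-expressed in plain C. On a flawed Pentium FDIV
 * unit, 4195835/3145727 comes back slightly wrong, so the residue rounds
 * to a nonzero integer (famously 256). The kernel uses inline x87 asm;
 * this sketch relies on volatile to keep the compiler from folding the
 * computation away. The function name is the editor's own.
 */
static volatile double fdiv_x = 4195835.0;
static volatile double fdiv_y = 3145727.0;

static int has_fdiv_bug(void)
{
	/* x - (x/y)*y truncates to 0 on a correct FPU */
	return (int)(fdiv_x - (fdiv_x / fdiv_y) * fdiv_y) != 0;
}

/*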
*/ + if (c->x86_model >= 6 && c->x86_model < 9) + set_cpu_cap(c, X86_FEATURE_3DNOW); +#endif + if (c->x86 == 0x6 && c->x86_model >= 0xf) { + c->x86_cache_alignment = c->x86_clflush_size * 2; + set_cpu_cap(c, X86_FEATURE_REP_GOOD); + } + + cpu_detect_cache_sizes(c); +} + +enum { + ECX8 = 1<<1, + EIERRINT = 1<<2, + DPM = 1<<3, + DMCE = 1<<4, + DSTPCLK = 1<<5, + ELINEAR = 1<<6, + DSMC = 1<<7, + DTLOCK = 1<<8, + EDCTLB = 1<<8, + EMMX = 1<<9, + DPDC = 1<<11, + EBRPRED = 1<<12, + DIC = 1<<13, + DDC = 1<<14, + DNA = 1<<15, + ERETSTK = 1<<16, + E2MMX = 1<<19, + EAMD3D = 1<<20, +}; + +static void early_init_centaur(struct cpuinfo_x86 *c) +{ + switch (c->x86) { +#ifdef CONFIG_X86_32 + case 5: + /* Emulate MTRRs using Centaur's MCR. */ + set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR); + break; +#endif + case 6: + if (c->x86_model >= 0xf) + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + break; + } +#ifdef CONFIG_X86_64 + set_cpu_cap(c, X86_FEATURE_SYSENTER32); +#endif +} + +static void init_centaur(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_32 + char *name; + u32 fcr_set = 0; + u32 fcr_clr = 0; + u32 lo, hi, newlo; + u32 aa, bb, cc, dd; + + /* + * Bit 31 in normal CPUID used for nonstandard 3DNow ID; + * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway + */ + clear_cpu_cap(c, 0*32+31); +#endif + early_init_centaur(c); + switch (c->x86) { +#ifdef CONFIG_X86_32 + case 5: + switch (c->x86_model) { + case 4: + name = "C6"; + fcr_set = ECX8|DSMC|EDCTLB|EMMX|ERETSTK; + fcr_clr = DPDC; + printk(KERN_NOTICE "Disabling bugged TSC.\n"); + clear_cpu_cap(c, X86_FEATURE_TSC); + break; + case 8: + switch (c->x86_mask) { + default: + name = "2"; + break; + case 7 ... 9: + name = "2A"; + break; + case 10 ... 15: + name = "2B"; + break; + } + fcr_set = ECX8|DSMC|DTLOCK|EMMX|EBRPRED|ERETSTK| + E2MMX|EAMD3D; + fcr_clr = DPDC; + break; + case 9: + name = "3"; + fcr_set = ECX8|DSMC|DTLOCK|EMMX|EBRPRED|ERETSTK| + E2MMX|EAMD3D; + fcr_clr = DPDC; + break; + default: + name = "??"; + } + + rdmsr(MSR_IDT_FCR1, lo, hi); + newlo = (lo|fcr_set) & (~fcr_clr); + + if (newlo != lo) { + printk(KERN_INFO "Centaur FCR was 0x%X now 0x%X\n", + lo, newlo); + wrmsr(MSR_IDT_FCR1, newlo, hi); + } else { + printk(KERN_INFO "Centaur FCR is 0x%X\n", lo); + } + /* Emulate MTRRs using Centaur's MCR. */ + set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR); + /* Report CX8 */ + set_cpu_cap(c, X86_FEATURE_CX8); + /* Set 3DNow! on Winchip 2 and above. */ + if (c->x86_model >= 8) + set_cpu_cap(c, X86_FEATURE_3DNOW); + /* See if we can find out some more. */ + if (cpuid_eax(0x80000000) >= 0x80000005) { + /* Yes, we can. */ + cpuid(0x80000005, &aa, &bb, &cc, &dd); + /* Add L1 data and code cache sizes. */ + c->x86_cache_size = (cc>>24)+(dd>>24); + } + sprintf(c->x86_model_id, "WinChip %s", name); + break; +#endif + case 6: + init_c3(c); + break; + } +#ifdef CONFIG_X86_64 + set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); +#endif +} + +#ifdef CONFIG_X86_32 +static unsigned int +centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size) +{ + /* VIA C3 CPUs (670-68F) need further shifting. */ + if ((c->x86 == 6) && ((c->x86_model == 7) || (c->x86_model == 8))) + size >>= 8; + + /* + * There's also an erratum in Nehemiah stepping 1, which + * returns '65KB' instead of '64KB' + * - Note, it seems this may only be in engineering samples. 
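 */

/*
 * Editorial sketch, not part of the patch: init_centaur() above sums the
 * L1 data and instruction cache sizes reported in the top bytes of CPUID
 * 0x80000005 ECX and EDX (one KB unit each). Name is the editor's own.
 */
#include <stdint.h>

static unsigned int centaur_l1_size_kb(uint32_t ecx, uint32_t edx)
{
	return (ecx >> 24) + (edx >> 24);	/* D-cache KB + I-cache KB */
}

/*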
+ */ + if ((c->x86 == 6) && (c->x86_model == 9) && + (c->x86_mask == 1) && (size == 65)) + size -= 1; + return size; +} +#endif + +static const struct cpu_dev centaur_cpu_dev = { + .c_vendor = "Centaur", + .c_ident = { "CentaurHauls" }, + .c_early_init = early_init_centaur, + .c_init = init_centaur, +#ifdef CONFIG_X86_32 + .legacy_cache_size = centaur_size_cache, +#endif + .c_x86_vendor = X86_VENDOR_CENTAUR, +}; + +cpu_dev_register(centaur_cpu_dev); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c new file mode 100644 index 00000000000..ef1b93f18ed --- /dev/null +++ b/arch/x86/kernel/cpu/common.c @@ -0,0 +1,1431 @@ +#include <linux/bootmem.h> +#include <linux/linkage.h> +#include <linux/bitops.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/percpu.h> +#include <linux/string.h> +#include <linux/delay.h> +#include <linux/sched.h> +#include <linux/init.h> +#include <linux/kprobes.h> +#include <linux/kgdb.h> +#include <linux/smp.h> +#include <linux/io.h> + +#include <asm/stackprotector.h> +#include <asm/perf_event.h> +#include <asm/mmu_context.h> +#include <asm/archrandom.h> +#include <asm/hypervisor.h> +#include <asm/processor.h> +#include <asm/debugreg.h> +#include <asm/sections.h> +#include <asm/vsyscall.h> +#include <linux/topology.h> +#include <linux/cpumask.h> +#include <asm/pgtable.h> +#include <linux/atomic.h> +#include <asm/proto.h> +#include <asm/setup.h> +#include <asm/apic.h> +#include <asm/desc.h> +#include <asm/i387.h> +#include <asm/fpu-internal.h> +#include <asm/mtrr.h> +#include <linux/numa.h> +#include <asm/asm.h> +#include <asm/cpu.h> +#include <asm/mce.h> +#include <asm/msr.h> +#include <asm/pat.h> +#include <asm/microcode.h> +#include <asm/microcode_intel.h> + +#ifdef CONFIG_X86_LOCAL_APIC +#include <asm/uv/uv.h> +#endif + +#include "cpu.h" + +/* all of these masks are initialized in setup_cpu_local_masks() */ +cpumask_var_t cpu_initialized_mask; +cpumask_var_t cpu_callout_mask; +cpumask_var_t cpu_callin_mask; + +/* representing cpus for which sibling maps can be computed */ +cpumask_var_t cpu_sibling_setup_mask; + +/* correctly size the local cpu masks */ +void __init setup_cpu_local_masks(void) +{ + alloc_bootmem_cpumask_var(&cpu_initialized_mask); + alloc_bootmem_cpumask_var(&cpu_callin_mask); + alloc_bootmem_cpumask_var(&cpu_callout_mask); + alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask); +} + +static void default_init(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_64 + cpu_detect_cache_sizes(c); +#else + /* Not much we can do here... */ + /* Check if at least it has cpuid */ + if (c->cpuid_level == -1) { + /* No cpuid. It must be an ancient CPU */ + if (c->x86 == 4) + strcpy(c->x86_model_id, "486"); + else if (c->x86 == 3) + strcpy(c->x86_model_id, "386"); + } +#endif +} + +static const struct cpu_dev default_cpu = { + .c_init = default_init, + .c_vendor = "Unknown", + .c_x86_vendor = X86_VENDOR_UNKNOWN, +}; + +static const struct cpu_dev *this_cpu = &default_cpu; + +DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { +#ifdef CONFIG_X86_64 + /* + * We need valid kernel segments for data and code in long mode too + * IRET will check the segment types kkeil 2000/10/28 + * Also sysret mandates a special GDT layout + * + * TLS descriptors are currently at a different place compared to i386. + * Hopefully nobody expects them at a fixed place (Wine?) 
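 */

/*
 * Editorial sketch, not part of the patch: the GDT_ENTRY_INIT() flags
 * words below (0xc09b, 0xa09b, ...) follow the usual x86 segment
 * descriptor layout. The field placement here is the editor's reading of
 * that layout, worth checking against the SDM; the decoder is illustrative
 * only.
 */
#include <stdint.h>

struct gdt_flags {
	unsigned type:4;	/* segment type (code/data, R/W, accessed) */
	unsigned s:1;		/* 1 = code/data, 0 = system */
	unsigned dpl:2;		/* descriptor privilege level */
	unsigned p:1;		/* present */
	unsigned avl:1;		/* available for software use */
	unsigned l:1;		/* 1 = 64-bit code segment */
	unsigned db:1;		/* 1 = 32-bit default operand size */
	unsigned g:1;		/* 1 = 4K granularity */
};

static struct gdt_flags decode_gdt_flags(uint16_t f)
{
	struct gdt_flags d = {
		.type = f & 0xf,
		.s    = (f >> 4) & 1,
		.dpl  = (f >> 5) & 3,
		.p    = (f >> 7) & 1,
		.avl  = (f >> 12) & 1,
		.l    = (f >> 13) & 1,
		.db   = (f >> 14) & 1,
		.g    = (f >> 15) & 1,
	};
	return d;	/* e.g. 0xa09b: present, type 0xb, L=1 kernel code */
}

/*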
+ */ + [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff), + [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff), + [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc093, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER32_CS] = GDT_ENTRY_INIT(0xc0fb, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(0xc0f3, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(0xa0fb, 0, 0xfffff), +#else + [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xc09a, 0, 0xfffff), + [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_CS] = GDT_ENTRY_INIT(0xc0fa, 0, 0xfffff), + [GDT_ENTRY_DEFAULT_USER_DS] = GDT_ENTRY_INIT(0xc0f2, 0, 0xfffff), + /* + * Segments used for calling PnP BIOS have byte granularity. + * They code segments and data segments have fixed 64k limits, + * the transfer segment sizes are set at run time. + */ + /* 32-bit code */ + [GDT_ENTRY_PNPBIOS_CS32] = GDT_ENTRY_INIT(0x409a, 0, 0xffff), + /* 16-bit code */ + [GDT_ENTRY_PNPBIOS_CS16] = GDT_ENTRY_INIT(0x009a, 0, 0xffff), + /* 16-bit data */ + [GDT_ENTRY_PNPBIOS_DS] = GDT_ENTRY_INIT(0x0092, 0, 0xffff), + /* 16-bit data */ + [GDT_ENTRY_PNPBIOS_TS1] = GDT_ENTRY_INIT(0x0092, 0, 0), + /* 16-bit data */ + [GDT_ENTRY_PNPBIOS_TS2] = GDT_ENTRY_INIT(0x0092, 0, 0), + /* + * The APM segments have byte granularity and their bases + * are set at run time. All have 64k limits. + */ + /* 32-bit code */ + [GDT_ENTRY_APMBIOS_BASE] = GDT_ENTRY_INIT(0x409a, 0, 0xffff), + /* 16-bit code */ + [GDT_ENTRY_APMBIOS_BASE+1] = GDT_ENTRY_INIT(0x009a, 0, 0xffff), + /* data */ + [GDT_ENTRY_APMBIOS_BASE+2] = GDT_ENTRY_INIT(0x4092, 0, 0xffff), + + [GDT_ENTRY_ESPFIX_SS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), + [GDT_ENTRY_PERCPU] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff), + GDT_STACK_CANARY_INIT +#endif +} }; +EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); + +static int __init x86_xsave_setup(char *s) +{ + setup_clear_cpu_cap(X86_FEATURE_XSAVE); + setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); + setup_clear_cpu_cap(X86_FEATURE_AVX); + setup_clear_cpu_cap(X86_FEATURE_AVX2); + return 1; +} +__setup("noxsave", x86_xsave_setup); + +static int __init x86_xsaveopt_setup(char *s) +{ + setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); + return 1; +} +__setup("noxsaveopt", x86_xsaveopt_setup); + +#ifdef CONFIG_X86_32 +static int cachesize_override = -1; +static int disable_x86_serial_nr = 1; + +static int __init cachesize_setup(char *str) +{ + get_option(&str, &cachesize_override); + return 1; +} +__setup("cachesize=", cachesize_setup); + +static int __init x86_fxsr_setup(char *s) +{ + setup_clear_cpu_cap(X86_FEATURE_FXSR); + setup_clear_cpu_cap(X86_FEATURE_XMM); + return 1; +} +__setup("nofxsr", x86_fxsr_setup); + +static int __init x86_sep_setup(char *s) +{ + setup_clear_cpu_cap(X86_FEATURE_SEP); + return 1; +} +__setup("nosep", x86_sep_setup); + +/* Standard macro to see if a specific flag is changeable */ +static inline int flag_is_changeable_p(u32 flag) +{ + u32 f1, f2; + + /* + * Cyrix and IDT cpus allow disabling of CPUID + * so the code below may return different results + * when it is executed before and after enabling + * the CPUID. Add "volatile" to not allow gcc to + * optimize the subsequent calls to this function. 
+ */ + asm volatile ("pushfl \n\t" + "pushfl \n\t" + "popl %0 \n\t" + "movl %0, %1 \n\t" + "xorl %2, %0 \n\t" + "pushl %0 \n\t" + "popfl \n\t" + "pushfl \n\t" + "popl %0 \n\t" + "popfl \n\t" + + : "=&r" (f1), "=&r" (f2) + : "ir" (flag)); + + return ((f1^f2) & flag) != 0; +} + +/* Probe for the CPUID instruction */ +int have_cpuid_p(void) +{ + return flag_is_changeable_p(X86_EFLAGS_ID); +} + +static void squash_the_stupid_serial_number(struct cpuinfo_x86 *c) +{ + unsigned long lo, hi; + + if (!cpu_has(c, X86_FEATURE_PN) || !disable_x86_serial_nr) + return; + + /* Disable processor serial number: */ + + rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi); + lo |= 0x200000; + wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi); + + printk(KERN_NOTICE "CPU serial number disabled.\n"); + clear_cpu_cap(c, X86_FEATURE_PN); + + /* Disabling the serial number may affect the cpuid level */ + c->cpuid_level = cpuid_eax(0); +} + +static int __init x86_serial_nr_setup(char *s) +{ + disable_x86_serial_nr = 0; + return 1; +} +__setup("serialnumber", x86_serial_nr_setup); +#else +static inline int flag_is_changeable_p(u32 flag) +{ + return 1; +} +static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c) +{ +} +#endif + +static __init int setup_disable_smep(char *arg) +{ + setup_clear_cpu_cap(X86_FEATURE_SMEP); + return 1; +} +__setup("nosmep", setup_disable_smep); + +static __always_inline void setup_smep(struct cpuinfo_x86 *c) +{ + if (cpu_has(c, X86_FEATURE_SMEP)) + set_in_cr4(X86_CR4_SMEP); +} + +static __init int setup_disable_smap(char *arg) +{ + setup_clear_cpu_cap(X86_FEATURE_SMAP); + return 1; +} +__setup("nosmap", setup_disable_smap); + +static __always_inline void setup_smap(struct cpuinfo_x86 *c) +{ + unsigned long eflags; + + /* This should have been cleared long ago */ + raw_local_save_flags(eflags); + BUG_ON(eflags & X86_EFLAGS_AC); + + if (cpu_has(c, X86_FEATURE_SMAP)) { +#ifdef CONFIG_X86_SMAP + set_in_cr4(X86_CR4_SMAP); +#else + clear_in_cr4(X86_CR4_SMAP); +#endif + } +} + +/* + * Some CPU features depend on higher CPUID levels, which may not always + * be available due to CPUID level capping or broken virtualization + * software. Add those features to this table to auto-disable them. + */ +struct cpuid_dependent_feature { + u32 feature; + u32 level; +}; + +static const struct cpuid_dependent_feature +cpuid_dependent_features[] = { + { X86_FEATURE_MWAIT, 0x00000005 }, + { X86_FEATURE_DCA, 0x00000009 }, + { X86_FEATURE_XSAVE, 0x0000000d }, + { 0, 0 } +}; + +static void filter_cpuid_features(struct cpuinfo_x86 *c, bool warn) +{ + const struct cpuid_dependent_feature *df; + + for (df = cpuid_dependent_features; df->feature; df++) { + + if (!cpu_has(c, df->feature)) + continue; + /* + * Note: cpuid_level is set to -1 if unavailable, but + * extended_extended_level is set to 0 if unavailable + * and the legitimate extended levels are all negative + * when signed; hence the weird messing around with + * signs here... + */ + if (!((s32)df->level < 0 ? + (u32)df->level > (u32)c->extended_cpuid_level : + (s32)df->level > (s32)c->cpuid_level)) + continue; + + clear_cpu_cap(c, df->feature); + if (!warn) + continue; + + printk(KERN_WARNING + "CPU: CPU feature %s disabled, no CPUID level 0x%x\n", + x86_cap_flags[df->feature], df->level); + } +} + +/* + * Naming convention should be: <Name> [(<Codename>)] + * This table only is used unless init_<vendor>() below doesn't set it; + * in particular, if CPUID levels 0x80000002..4 are supported, this + * isn't used + */ + +/* Look up CPU names by table lookup. 
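 */

/*
 * Editorial sketch, not part of the patch: the level comparison inside
 * filter_cpuid_features() above, unpacked. Extended leaves (0x8000xxxx)
 * are negative when viewed as s32, which is how the two leaf namespaces
 * are told apart; basic leaves compare signed so that a cpuid_level of -1
 * (no CPUID at all) fails every requirement. Name is the editor's own.
 */
#include <stdint.h>

static int cpuid_level_ok(uint32_t required, int32_t cpuid_level,
			  uint32_t extended_level)
{
	if ((int32_t)required < 0)			/* extended leaf */
		return required <= extended_level;	/* unsigned compare */
	return (int32_t)required <= cpuid_level;	/* signed compare */
}

/*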
*/ +static const char *table_lookup_model(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_32 + const struct legacy_cpu_model_info *info; + + if (c->x86_model >= 16) + return NULL; /* Range check */ + + if (!this_cpu) + return NULL; + + info = this_cpu->legacy_models; + + while (info->family) { + if (info->family == c->x86) + return info->model_names[c->x86_model]; + info++; + } +#endif + return NULL; /* Not found */ +} + +__u32 cpu_caps_cleared[NCAPINTS]; +__u32 cpu_caps_set[NCAPINTS]; + +void load_percpu_segment(int cpu) +{ +#ifdef CONFIG_X86_32 + loadsegment(fs, __KERNEL_PERCPU); +#else + loadsegment(gs, 0); + wrmsrl(MSR_GS_BASE, (unsigned long)per_cpu(irq_stack_union.gs_base, cpu)); +#endif + load_stack_canary_segment(); +} + +/* + * Current gdt points %fs at the "master" per-cpu area: after this, + * it's on the real one. + */ +void switch_to_new_gdt(int cpu) +{ + struct desc_ptr gdt_descr; + + gdt_descr.address = (long)get_cpu_gdt_table(cpu); + gdt_descr.size = GDT_SIZE - 1; + load_gdt(&gdt_descr); + /* Reload the per-cpu base */ + + load_percpu_segment(cpu); +} + +static const struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; + +static void get_model_name(struct cpuinfo_x86 *c) +{ + unsigned int *v; + char *p, *q; + + if (c->extended_cpuid_level < 0x80000004) + return; + + v = (unsigned int *)c->x86_model_id; + cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]); + cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]); + cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]); + c->x86_model_id[48] = 0; + + /* + * Intel chips right-justify this string for some dumb reason; + * undo that brain damage: + */ + p = q = &c->x86_model_id[0]; + while (*p == ' ') + p++; + if (p != q) { + while (*p) + *q++ = *p++; + while (q <= &c->x86_model_id[48]) + *q++ = '\0'; /* Zero-pad the rest */ + } +} + +void cpu_detect_cache_sizes(struct cpuinfo_x86 *c) +{ + unsigned int n, dummy, ebx, ecx, edx, l2size; + + n = c->extended_cpuid_level; + + if (n >= 0x80000005) { + cpuid(0x80000005, &dummy, &ebx, &ecx, &edx); + c->x86_cache_size = (ecx>>24) + (edx>>24); +#ifdef CONFIG_X86_64 + /* On K8 L1 TLB is inclusive, so don't count it */ + c->x86_tlbsize = 0; +#endif + } + + if (n < 0x80000006) /* Some chips just has a large L1. */ + return; + + cpuid(0x80000006, &dummy, &ebx, &ecx, &edx); + l2size = ecx >> 16; + +#ifdef CONFIG_X86_64 + c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff); +#else + /* do processor-specific cache resizing */ + if (this_cpu->legacy_cache_size) + l2size = this_cpu->legacy_cache_size(c, l2size); + + /* Allow user to override all this if necessary. */ + if (cachesize_override != -1) + l2size = cachesize_override; + + if (l2size == 0) + return; /* Again, no L2 cache is possible */ +#endif + + c->x86_cache_size = l2size; +} + +u16 __read_mostly tlb_lli_4k[NR_INFO]; +u16 __read_mostly tlb_lli_2m[NR_INFO]; +u16 __read_mostly tlb_lli_4m[NR_INFO]; +u16 __read_mostly tlb_lld_4k[NR_INFO]; +u16 __read_mostly tlb_lld_2m[NR_INFO]; +u16 __read_mostly tlb_lld_4m[NR_INFO]; +u16 __read_mostly tlb_lld_1g[NR_INFO]; + +/* + * tlb_flushall_shift shows the balance point in replacing cr3 write + * with multiple 'invlpg'. It will do this replacement when + * flush_tlb_lines <= active_lines/2^tlb_flushall_shift. + * If tlb_flushall_shift is -1, means the replacement will be disabled. 
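 */

/*
 * Editorial sketch, not part of the patch: the balance test described in
 * the comment above, as a predicate. A shift of -1 disables per-line
 * invlpg flushing entirely, so a full cr3 reload is always used. Name is
 * the editor's own.
 */
static int use_invlpg_flush(long flush_lines, long active_lines, int shift)
{
	if (shift < 0)
		return 0;			/* replacement disabled */
	return flush_lines <= (active_lines >> shift);
}

/*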
+ */ +s8 __read_mostly tlb_flushall_shift = -1; + +void cpu_detect_tlb(struct cpuinfo_x86 *c) +{ + if (this_cpu->c_detect_tlb) + this_cpu->c_detect_tlb(c); + + printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n" + "Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n" + "tlb_flushall_shift: %d\n", + tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES], + tlb_lli_4m[ENTRIES], tlb_lld_4k[ENTRIES], + tlb_lld_2m[ENTRIES], tlb_lld_4m[ENTRIES], + tlb_lld_1g[ENTRIES], tlb_flushall_shift); +} + +void detect_ht(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_HT + u32 eax, ebx, ecx, edx; + int index_msb, core_bits; + static bool printed; + + if (!cpu_has(c, X86_FEATURE_HT)) + return; + + if (cpu_has(c, X86_FEATURE_CMP_LEGACY)) + goto out; + + if (cpu_has(c, X86_FEATURE_XTOPOLOGY)) + return; + + cpuid(1, &eax, &ebx, &ecx, &edx); + + smp_num_siblings = (ebx & 0xff0000) >> 16; + + if (smp_num_siblings == 1) { + printk_once(KERN_INFO "CPU0: Hyper-Threading is disabled\n"); + goto out; + } + + if (smp_num_siblings <= 1) + goto out; + + index_msb = get_count_order(smp_num_siblings); + c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb); + + smp_num_siblings = smp_num_siblings / c->x86_max_cores; + + index_msb = get_count_order(smp_num_siblings); + + core_bits = get_count_order(c->x86_max_cores); + + c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, index_msb) & + ((1 << core_bits) - 1); + +out: + if (!printed && (c->x86_max_cores * smp_num_siblings) > 1) { + printk(KERN_INFO "CPU: Physical Processor ID: %d\n", + c->phys_proc_id); + printk(KERN_INFO "CPU: Processor Core ID: %d\n", + c->cpu_core_id); + printed = 1; + } +#endif +} + +static void get_cpu_vendor(struct cpuinfo_x86 *c) +{ + char *v = c->x86_vendor_id; + int i; + + for (i = 0; i < X86_VENDOR_NUM; i++) { + if (!cpu_devs[i]) + break; + + if (!strcmp(v, cpu_devs[i]->c_ident[0]) || + (cpu_devs[i]->c_ident[1] && + !strcmp(v, cpu_devs[i]->c_ident[1]))) { + + this_cpu = cpu_devs[i]; + c->x86_vendor = this_cpu->c_x86_vendor; + return; + } + } + + printk_once(KERN_ERR + "CPU: vendor_id '%s' unknown, using generic init.\n" \ + "CPU: Your system may be unstable.\n", v); + + c->x86_vendor = X86_VENDOR_UNKNOWN; + this_cpu = &default_cpu; +} + +void cpu_detect(struct cpuinfo_x86 *c) +{ + /* Get vendor name */ + cpuid(0x00000000, (unsigned int *)&c->cpuid_level, + (unsigned int *)&c->x86_vendor_id[0], + (unsigned int *)&c->x86_vendor_id[8], + (unsigned int *)&c->x86_vendor_id[4]); + + c->x86 = 4; + /* Intel-defined flags: level 0x00000001 */ + if (c->cpuid_level >= 0x00000001) { + u32 junk, tfms, cap0, misc; + + cpuid(0x00000001, &tfms, &misc, &junk, &cap0); + c->x86 = (tfms >> 8) & 0xf; + c->x86_model = (tfms >> 4) & 0xf; + c->x86_mask = tfms & 0xf; + + if (c->x86 == 0xf) + c->x86 += (tfms >> 20) & 0xff; + if (c->x86 >= 0x6) + c->x86_model += ((tfms >> 16) & 0xf) << 4; + + if (cap0 & (1<<19)) { + c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; + c->x86_cache_alignment = c->x86_clflush_size; + } + } +} + +void get_cpu_cap(struct cpuinfo_x86 *c) +{ + u32 tfms, xlvl; + u32 ebx; + + /* Intel-defined flags: level 0x00000001 */ + if (c->cpuid_level >= 0x00000001) { + u32 capability, excap; + + cpuid(0x00000001, &tfms, &ebx, &excap, &capability); + c->x86_capability[0] = capability; + c->x86_capability[4] = excap; + } + + /* Additional Intel-defined flags: level 0x00000007 */ + if (c->cpuid_level >= 0x00000007) { + u32 eax, ebx, ecx, edx; + + cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx); + + c->x86_capability[9] = ebx; + } + + /* 
AMD-defined flags: level 0x80000001 */ + xlvl = cpuid_eax(0x80000000); + c->extended_cpuid_level = xlvl; + + if ((xlvl & 0xffff0000) == 0x80000000) { + if (xlvl >= 0x80000001) { + c->x86_capability[1] = cpuid_edx(0x80000001); + c->x86_capability[6] = cpuid_ecx(0x80000001); + } + } + + if (c->extended_cpuid_level >= 0x80000008) { + u32 eax = cpuid_eax(0x80000008); + + c->x86_virt_bits = (eax >> 8) & 0xff; + c->x86_phys_bits = eax & 0xff; + } +#ifdef CONFIG_X86_32 + else if (cpu_has(c, X86_FEATURE_PAE) || cpu_has(c, X86_FEATURE_PSE36)) + c->x86_phys_bits = 36; +#endif + + if (c->extended_cpuid_level >= 0x80000007) + c->x86_power = cpuid_edx(0x80000007); + + init_scattered_cpuid_features(c); +} + +static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_32 + int i; + + /* + * First of all, decide if this is a 486 or higher + * It's a 486 if we can modify the AC flag + */ + if (flag_is_changeable_p(X86_EFLAGS_AC)) + c->x86 = 4; + else + c->x86 = 3; + + for (i = 0; i < X86_VENDOR_NUM; i++) + if (cpu_devs[i] && cpu_devs[i]->c_identify) { + c->x86_vendor_id[0] = 0; + cpu_devs[i]->c_identify(c); + if (c->x86_vendor_id[0]) { + get_cpu_vendor(c); + break; + } + } +#endif +} + +/* + * Do minimum CPU detection early. + * Fields really needed: vendor, cpuid_level, family, model, mask, + * cache alignment. + * The others are not touched to avoid unwanted side effects. + * + * WARNING: this function is only called on the BP. Don't add code here + * that is supposed to run on all CPUs. + */ +static void __init early_identify_cpu(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_64 + c->x86_clflush_size = 64; + c->x86_phys_bits = 36; + c->x86_virt_bits = 48; +#else + c->x86_clflush_size = 32; + c->x86_phys_bits = 32; + c->x86_virt_bits = 32; +#endif + c->x86_cache_alignment = c->x86_clflush_size; + + memset(&c->x86_capability, 0, sizeof c->x86_capability); + c->extended_cpuid_level = 0; + + if (!have_cpuid_p()) + identify_cpu_without_cpuid(c); + + /* cyrix could have cpuid enabled via c_identify()*/ + if (!have_cpuid_p()) + return; + + cpu_detect(c); + get_cpu_vendor(c); + get_cpu_cap(c); + fpu_detect(c); + + if (this_cpu->c_early_init) + this_cpu->c_early_init(c); + + c->cpu_index = 0; + filter_cpuid_features(c, false); + + if (this_cpu->c_bsp_init) + this_cpu->c_bsp_init(c); + + setup_force_cpu_cap(X86_FEATURE_ALWAYS); +} + +void __init early_cpu_init(void) +{ + const struct cpu_dev *const *cdev; + int count = 0; + +#ifdef CONFIG_PROCESSOR_SELECT + printk(KERN_INFO "KERNEL supported cpus:\n"); +#endif + + for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) { + const struct cpu_dev *cpudev = *cdev; + + if (count >= X86_VENDOR_NUM) + break; + cpu_devs[count] = cpudev; + count++; + +#ifdef CONFIG_PROCESSOR_SELECT + { + unsigned int j; + + for (j = 0; j < 2; j++) { + if (!cpudev->c_ident[j]) + continue; + printk(KERN_INFO " %s %s\n", cpudev->c_vendor, + cpudev->c_ident[j]); + } + } +#endif + } + early_identify_cpu(&boot_cpu_data); +} + +/* + * The NOPL instruction is supposed to exist on all CPUs of family >= 6; + * unfortunately, that's not true in practice because of early VIA + * chips and (more importantly) broken virtualizers that are not easy + * to detect. In the latter case it doesn't even *fail* reliably, so + * probing for it doesn't even work. Disable it completely on 32-bit + * unless we can find a reliable way to detect all the broken cases. + * Enable it explicitly on 64-bit for non-constant inputs of cpu_has(). 
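 */

/*
 * Editorial sketch, not part of the patch: the family/model/stepping
 * decode performed by cpu_detect() earlier in this hunk, as one standalone
 * function over CPUID leaf 1 EAX. Name is the editor's own.
 */
#include <stdint.h>

static void decode_fms(uint32_t eax, unsigned int *family,
		       unsigned int *model, unsigned int *stepping)
{
	*family = (eax >> 8) & 0xf;
	*model = (eax >> 4) & 0xf;
	*stepping = eax & 0xf;

	if (*family == 0xf)
		*family += (eax >> 20) & 0xff;		/* extended family */
	if (*family >= 0x6)
		*model += ((eax >> 16) & 0xf) << 4;	/* extended model */
}

/*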
+ */ +static void detect_nopl(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_32 + clear_cpu_cap(c, X86_FEATURE_NOPL); +#else + set_cpu_cap(c, X86_FEATURE_NOPL); +#endif +} + +static void generic_identify(struct cpuinfo_x86 *c) +{ + c->extended_cpuid_level = 0; + + if (!have_cpuid_p()) + identify_cpu_without_cpuid(c); + + /* cyrix could have cpuid enabled via c_identify()*/ + if (!have_cpuid_p()) + return; + + cpu_detect(c); + + get_cpu_vendor(c); + + get_cpu_cap(c); + + if (c->cpuid_level >= 0x00000001) { + c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF; +#ifdef CONFIG_X86_32 +# ifdef CONFIG_X86_HT + c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); +# else + c->apicid = c->initial_apicid; +# endif +#endif + c->phys_proc_id = c->initial_apicid; + } + + get_model_name(c); /* Default name */ + + detect_nopl(c); +} + +/* + * This does the hard work of actually picking apart the CPU stuff... + */ +static void identify_cpu(struct cpuinfo_x86 *c) +{ + int i; + + c->loops_per_jiffy = loops_per_jiffy; + c->x86_cache_size = -1; + c->x86_vendor = X86_VENDOR_UNKNOWN; + c->x86_model = c->x86_mask = 0; /* So far unknown... */ + c->x86_vendor_id[0] = '\0'; /* Unset */ + c->x86_model_id[0] = '\0'; /* Unset */ + c->x86_max_cores = 1; + c->x86_coreid_bits = 0; +#ifdef CONFIG_X86_64 + c->x86_clflush_size = 64; + c->x86_phys_bits = 36; + c->x86_virt_bits = 48; +#else + c->cpuid_level = -1; /* CPUID not detected */ + c->x86_clflush_size = 32; + c->x86_phys_bits = 32; + c->x86_virt_bits = 32; +#endif + c->x86_cache_alignment = c->x86_clflush_size; + memset(&c->x86_capability, 0, sizeof c->x86_capability); + + generic_identify(c); + + if (this_cpu->c_identify) + this_cpu->c_identify(c); + + /* Clear/Set all flags overriden by options, after probe */ + for (i = 0; i < NCAPINTS; i++) { + c->x86_capability[i] &= ~cpu_caps_cleared[i]; + c->x86_capability[i] |= cpu_caps_set[i]; + } + +#ifdef CONFIG_X86_64 + c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); +#endif + + /* + * Vendor-specific initialization. In this section we + * canonicalize the feature flags, meaning if there are + * features a certain CPU supports which CPUID doesn't + * tell us, CPUID claiming incorrect flags, or other bugs, + * we handle them here. + * + * At the end of this section, c->x86_capability better + * indicate the features this CPU genuinely supports! + */ + if (this_cpu->c_init) + this_cpu->c_init(c); + + /* Disable the PN if appropriate */ + squash_the_stupid_serial_number(c); + + /* Set up SMEP/SMAP */ + setup_smep(c); + setup_smap(c); + + /* + * The vendor-specific functions might have changed features. + * Now we do "generic changes." + */ + + /* Filter out anything that depends on CPUID levels we don't have */ + filter_cpuid_features(c, true); + + /* If the model name is still unset, do table lookup. */ + if (!c->x86_model_id[0]) { + const char *p; + p = table_lookup_model(c); + if (p) + strcpy(c->x86_model_id, p); + else + /* Last resort... */ + sprintf(c->x86_model_id, "%02x/%02x", + c->x86, c->x86_model); + } + +#ifdef CONFIG_X86_64 + detect_ht(c); +#endif + + init_hypervisor(c); + x86_init_rdrand(c); + + /* + * Clear/Set all flags overriden by options, need do it + * before following smp all cpus cap AND. + */ + for (i = 0; i < NCAPINTS; i++) { + c->x86_capability[i] &= ~cpu_caps_cleared[i]; + c->x86_capability[i] |= cpu_caps_set[i]; + } + + /* + * On SMP, boot_cpu_data holds the common feature set between + * all CPUs; so make sure that we indicate which features are + * common between the CPUs. 
The first time this routine gets + * executed, c == &boot_cpu_data. + */ + if (c != &boot_cpu_data) { + /* AND the already accumulated flags with these */ + for (i = 0; i < NCAPINTS; i++) + boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; + + /* OR, i.e. replicate the bug flags */ + for (i = NCAPINTS; i < NCAPINTS + NBUGINTS; i++) + c->x86_capability[i] |= boot_cpu_data.x86_capability[i]; + } + + /* Init Machine Check Exception if available. */ + mcheck_cpu_init(c); + + select_idle_routine(c); + +#ifdef CONFIG_NUMA + numa_add_cpu(smp_processor_id()); +#endif +} + +#ifdef CONFIG_X86_64 +static void vgetcpu_set_mode(void) +{ + if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP)) + vgetcpu_mode = VGETCPU_RDTSCP; + else + vgetcpu_mode = VGETCPU_LSL; +} + +/* May not be __init: called during resume */ +static void syscall32_cpu_init(void) +{ + /* Load these always in case some future AMD CPU supports + SYSENTER from compat mode too. */ + wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); + wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); + wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target); + + wrmsrl(MSR_CSTAR, ia32_cstar_target); +} +#endif + +#ifdef CONFIG_X86_32 +void enable_sep_cpu(void) +{ + int cpu = get_cpu(); + struct tss_struct *tss = &per_cpu(init_tss, cpu); + + if (!boot_cpu_has(X86_FEATURE_SEP)) { + put_cpu(); + return; + } + + tss->x86_tss.ss1 = __KERNEL_CS; + tss->x86_tss.sp1 = sizeof(struct tss_struct) + (unsigned long) tss; + wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); + wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.sp1, 0); + wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) ia32_sysenter_target, 0); + put_cpu(); +} +#endif + +void __init identify_boot_cpu(void) +{ + identify_cpu(&boot_cpu_data); + init_amd_e400_c1e_mask(); +#ifdef CONFIG_X86_32 + sysenter_setup(); + enable_sep_cpu(); +#else + vgetcpu_set_mode(); +#endif + cpu_detect_tlb(&boot_cpu_data); +} + +void identify_secondary_cpu(struct cpuinfo_x86 *c) +{ + BUG_ON(c == &boot_cpu_data); + identify_cpu(c); +#ifdef CONFIG_X86_32 + enable_sep_cpu(); +#endif + mtrr_ap_init(); +} + +struct msr_range { + unsigned min; + unsigned max; +}; + +static const struct msr_range msr_range_array[] = { + { 0x00000000, 0x00000418}, + { 0xc0000000, 0xc000040b}, + { 0xc0010000, 0xc0010142}, + { 0xc0011000, 0xc001103b}, +}; + +static void __print_cpu_msr(void) +{ + unsigned index_min, index_max; + unsigned index; + u64 val; + int i; + + for (i = 0; i < ARRAY_SIZE(msr_range_array); i++) { + index_min = msr_range_array[i].min; + index_max = msr_range_array[i].max; + + for (index = index_min; index < index_max; index++) { + if (rdmsrl_safe(index, &val)) + continue; + printk(KERN_INFO " MSR%08x: %016llx\n", index, val); + } + } +} + +static int show_msr; + +static __init int setup_show_msr(char *arg) +{ + int num; + + get_option(&arg, &num); + + if (num > 0) + show_msr = num; + return 1; +} +__setup("show_msr=", setup_show_msr); + +static __init int setup_noclflush(char *arg) +{ + setup_clear_cpu_cap(X86_FEATURE_CLFLUSH); + setup_clear_cpu_cap(X86_FEATURE_CLFLUSHOPT); + return 1; +} +__setup("noclflush", setup_noclflush); + +void print_cpu_info(struct cpuinfo_x86 *c) +{ + const char *vendor = NULL; + + if (c->x86_vendor < X86_VENDOR_NUM) { + vendor = this_cpu->c_vendor; + } else { + if (c->cpuid_level >= 0) + vendor = c->x86_vendor_id; + } + + if (vendor && !strstr(c->x86_model_id, vendor)) + printk(KERN_CONT "%s ", vendor); + + if (c->x86_model_id[0]) + printk(KERN_CONT "%s", strim(c->x86_model_id)); + else + printk(KERN_CONT "%d86", 
c->x86); + + printk(KERN_CONT " (fam: %02x, model: %02x", c->x86, c->x86_model); + + if (c->x86_mask || c->cpuid_level >= 0) + printk(KERN_CONT ", stepping: %02x)\n", c->x86_mask); + else + printk(KERN_CONT ")\n"); + + print_cpu_msr(c); +} + +void print_cpu_msr(struct cpuinfo_x86 *c) +{ + if (c->cpu_index < show_msr) + __print_cpu_msr(); +} + +static __init int setup_disablecpuid(char *arg) +{ + int bit; + + if (get_option(&arg, &bit) && bit < NCAPINTS*32) + setup_clear_cpu_cap(bit); + else + return 0; + + return 1; +} +__setup("clearcpuid=", setup_disablecpuid); + +DEFINE_PER_CPU(unsigned long, kernel_stack) = + (unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE; +EXPORT_PER_CPU_SYMBOL(kernel_stack); + +#ifdef CONFIG_X86_64 +struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table }; +struct desc_ptr debug_idt_descr = { NR_VECTORS * 16 - 1, + (unsigned long) debug_idt_table }; + +DEFINE_PER_CPU_FIRST(union irq_stack_union, + irq_stack_union) __aligned(PAGE_SIZE) __visible; + +/* + * The following four percpu variables are hot. Align current_task to + * cacheline size such that all four fall in the same cacheline. + */ +DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned = + &init_task; +EXPORT_PER_CPU_SYMBOL(current_task); + +DEFINE_PER_CPU(char *, irq_stack_ptr) = + init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64; + +DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1; + +DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; +EXPORT_PER_CPU_SYMBOL(__preempt_count); + +DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); + +/* + * Special IST stacks which the CPU switches to when it calls + * an IST-marked descriptor entry. Up to 7 stacks (hardware + * limit), all of them are 4K, except the debug stack which + * is 8K. + */ +static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { + [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ, + [DEBUG_STACK - 1] = DEBUG_STKSZ +}; + +static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks + [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); + +/* May not be marked __init: used by software suspend */ +void syscall_init(void) +{ + /* + * LSTAR and STAR live in a bit strange symbiosis. + * They both write to the same internal register. STAR allows to + * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip. + */ + wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); + wrmsrl(MSR_LSTAR, system_call); + wrmsrl(MSR_CSTAR, ignore_sysret); + +#ifdef CONFIG_IA32_EMULATION + syscall32_cpu_init(); +#endif + + /* Flags to clear on syscall */ + wrmsrl(MSR_SYSCALL_MASK, + X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF| + X86_EFLAGS_IOPL|X86_EFLAGS_AC); +} + +/* + * Copies of the original ist values from the tss are only accessed during + * debugging, no special alignment required. 
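 */

/*
 * Editorial sketch, not part of the patch: the MSR_STAR layout programmed
 * by syscall_init() above. Bits 63:48 hold the selector base the CPU uses
 * on sysret, bits 47:32 the base used on syscall; the kernel packs
 * __USER32_CS and __KERNEL_CS there. Name is the editor's own.
 */
#include <stdint.h>

static uint64_t pack_star(uint16_t sysret_cs_base, uint16_t syscall_cs_base)
{
	return ((uint64_t)sysret_cs_base << 48) |
	       ((uint64_t)syscall_cs_base << 32);
}

/*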
+ */ +DEFINE_PER_CPU(struct orig_ist, orig_ist); + +static DEFINE_PER_CPU(unsigned long, debug_stack_addr); +DEFINE_PER_CPU(int, debug_stack_usage); + +int is_debug_stack(unsigned long addr) +{ + return __get_cpu_var(debug_stack_usage) || + (addr <= __get_cpu_var(debug_stack_addr) && + addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ)); +} +NOKPROBE_SYMBOL(is_debug_stack); + +DEFINE_PER_CPU(u32, debug_idt_ctr); + +void debug_stack_set_zero(void) +{ + this_cpu_inc(debug_idt_ctr); + load_current_idt(); +} +NOKPROBE_SYMBOL(debug_stack_set_zero); + +void debug_stack_reset(void) +{ + if (WARN_ON(!this_cpu_read(debug_idt_ctr))) + return; + if (this_cpu_dec_return(debug_idt_ctr) == 0) + load_current_idt(); +} +NOKPROBE_SYMBOL(debug_stack_reset); + +#else /* CONFIG_X86_64 */ + +DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; +EXPORT_PER_CPU_SYMBOL(current_task); +DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; +EXPORT_PER_CPU_SYMBOL(__preempt_count); +DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); + +#ifdef CONFIG_CC_STACKPROTECTOR +DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); +#endif + +#endif /* CONFIG_X86_64 */ + +/* + * Clear all 6 debug registers: + */ +static void clear_all_debug_regs(void) +{ + int i; + + for (i = 0; i < 8; i++) { + /* Ignore db4, db5 */ + if ((i == 4) || (i == 5)) + continue; + + set_debugreg(0, i); + } +} + +#ifdef CONFIG_KGDB +/* + * Restore debug regs if using kgdbwait and you have a kernel debugger + * connection established. + */ +static void dbg_restore_debug_regs(void) +{ + if (unlikely(kgdb_connected && arch_kgdb_ops.correct_hw_break)) + arch_kgdb_ops.correct_hw_break(); +} +#else /* ! CONFIG_KGDB */ +#define dbg_restore_debug_regs() +#endif /* ! CONFIG_KGDB */ + +/* + * cpu_init() initializes state that is per-CPU. Some data is already + * initialized (naturally) in the bootstrap process, such as the GDT + * and IDT. We reload them nevertheless, this function acts as a + * 'CPU state barrier', nothing should get across. + * A lot of state is already set up in PDA init for 64 bit + */ +#ifdef CONFIG_X86_64 + +void cpu_init(void) +{ + struct orig_ist *oist; + struct task_struct *me; + struct tss_struct *t; + unsigned long v; + int cpu; + int i; + + /* + * Load microcode on this cpu if a valid microcode is available. + * This is early microcode loading procedure. 
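+	 * (This is the path the APs take via cpu_init(); the boot CPU has
+	 * its microcode loaded earlier by the early loader.)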
+ */ + load_ucode_ap(); + + cpu = stack_smp_processor_id(); + t = &per_cpu(init_tss, cpu); + oist = &per_cpu(orig_ist, cpu); + +#ifdef CONFIG_NUMA + if (this_cpu_read(numa_node) == 0 && + early_cpu_to_node(cpu) != NUMA_NO_NODE) + set_numa_node(early_cpu_to_node(cpu)); +#endif + + me = current; + + if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) + panic("CPU#%d already initialized!\n", cpu); + + pr_debug("Initializing CPU#%d\n", cpu); + + clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); + + /* + * Initialize the per-CPU GDT with the boot GDT, + * and set up the GDT descriptor: + */ + + switch_to_new_gdt(cpu); + loadsegment(fs, 0); + + load_current_idt(); + + memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8); + syscall_init(); + + wrmsrl(MSR_FS_BASE, 0); + wrmsrl(MSR_KERNEL_GS_BASE, 0); + barrier(); + + x86_configure_nx(); + enable_x2apic(); + + /* + * set up and load the per-CPU TSS + */ + if (!oist->ist[0]) { + char *estacks = per_cpu(exception_stacks, cpu); + + for (v = 0; v < N_EXCEPTION_STACKS; v++) { + estacks += exception_stack_sizes[v]; + oist->ist[v] = t->x86_tss.ist[v] = + (unsigned long)estacks; + if (v == DEBUG_STACK-1) + per_cpu(debug_stack_addr, cpu) = (unsigned long)estacks; + } + } + + t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); + + /* + * <= is required because the CPU will access up to + * 8 bits beyond the end of the IO permission bitmap. + */ + for (i = 0; i <= IO_BITMAP_LONGS; i++) + t->io_bitmap[i] = ~0UL; + + atomic_inc(&init_mm.mm_count); + me->active_mm = &init_mm; + BUG_ON(me->mm); + enter_lazy_tlb(&init_mm, me); + + load_sp0(t, ¤t->thread); + set_tss_desc(cpu, t); + load_TR_desc(); + load_LDT(&init_mm.context); + + clear_all_debug_regs(); + dbg_restore_debug_regs(); + + fpu_init(); + + if (is_uv_system()) + uv_cpu_init(); +} + +#else + +void cpu_init(void) +{ + int cpu = smp_processor_id(); + struct task_struct *curr = current; + struct tss_struct *t = &per_cpu(init_tss, cpu); + struct thread_struct *thread = &curr->thread; + + show_ucode_info_early(); + + if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) { + printk(KERN_WARNING "CPU#%d already initialized!\n", cpu); + for (;;) + local_irq_enable(); + } + + printk(KERN_INFO "Initializing CPU#%d\n", cpu); + + if (cpu_has_vme || cpu_has_tsc || cpu_has_de) + clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); + + load_current_idt(); + switch_to_new_gdt(cpu); + + /* + * Set up and load the per-CPU TSS and LDT + */ + atomic_inc(&init_mm.mm_count); + curr->active_mm = &init_mm; + BUG_ON(curr->mm); + enter_lazy_tlb(&init_mm, curr); + + load_sp0(t, thread); + set_tss_desc(cpu, t); + load_TR_desc(); + load_LDT(&init_mm.context); + + t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); + +#ifdef CONFIG_DOUBLEFAULT + /* Set up doublefault TSS pointer in the GDT */ + __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss); +#endif + + clear_all_debug_regs(); + dbg_restore_debug_regs(); + + fpu_init(); +} +#endif + +#ifdef CONFIG_X86_DEBUG_STATIC_CPU_HAS +void warn_pre_alternatives(void) +{ + WARN(1, "You're using static_cpu_has before alternatives have run!\n"); +} +EXPORT_SYMBOL_GPL(warn_pre_alternatives); +#endif + +inline bool __static_cpu_has_safe(u16 bit) +{ + return boot_cpu_has(bit); +} +EXPORT_SYMBOL_GPL(__static_cpu_has_safe); diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h new file mode 100644 index 00000000000..c37dc37e831 --- /dev/null +++ b/arch/x86/kernel/cpu/cpu.h @@ -0,0 +1,48 @@ +#ifndef 
ARCH_X86_CPU_H +#define ARCH_X86_CPU_H + +/* attempt to consolidate cpu attributes */ +struct cpu_dev { + const char *c_vendor; + + /* some have two possibilities for cpuid string */ + const char *c_ident[2]; + + void (*c_early_init)(struct cpuinfo_x86 *); + void (*c_bsp_init)(struct cpuinfo_x86 *); + void (*c_init)(struct cpuinfo_x86 *); + void (*c_identify)(struct cpuinfo_x86 *); + void (*c_detect_tlb)(struct cpuinfo_x86 *); + int c_x86_vendor; +#ifdef CONFIG_X86_32 + /* Optional vendor specific routine to obtain the cache size. */ + unsigned int (*legacy_cache_size)(struct cpuinfo_x86 *, + unsigned int); + + /* Family/stepping-based lookup table for model names. */ + struct legacy_cpu_model_info { + int family; + const char *model_names[16]; + } legacy_models[5]; +#endif +}; + +struct _tlb_table { + unsigned char descriptor; + char tlb_type; + unsigned int entries; + /* unsigned int ways; */ + char info[128]; +}; + +#define cpu_dev_register(cpu_devX) \ + static const struct cpu_dev *const __cpu_dev_##cpu_devX __used \ + __attribute__((__section__(".x86_cpu_dev.init"))) = \ + &cpu_devX; + +extern const struct cpu_dev *const __x86_cpu_dev_start[], + *const __x86_cpu_dev_end[]; + +extern void get_cpu_cap(struct cpuinfo_x86 *c); +extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c); +#endif /* ARCH_X86_CPU_H */ diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c new file mode 100644 index 00000000000..aaf152e7963 --- /dev/null +++ b/arch/x86/kernel/cpu/cyrix.c @@ -0,0 +1,461 @@ +#include <linux/bitops.h> +#include <linux/delay.h> +#include <linux/pci.h> +#include <asm/dma.h> +#include <linux/io.h> +#include <asm/processor-cyrix.h> +#include <asm/processor-flags.h> +#include <linux/timer.h> +#include <asm/pci-direct.h> +#include <asm/tsc.h> + +#include "cpu.h" + +/* + * Read NSC/Cyrix DEVID registers (DIR) to get more detailed info. about the CPU + */ +static void __do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) +{ + unsigned char ccr2, ccr3; + + /* we test for DEVID by checking whether CCR3 is writable */ + ccr3 = getCx86(CX86_CCR3); + setCx86(CX86_CCR3, ccr3 ^ 0x80); + getCx86(0xc0); /* dummy to change bus */ + + if (getCx86(CX86_CCR3) == ccr3) { /* no DEVID regs. */ + ccr2 = getCx86(CX86_CCR2); + setCx86(CX86_CCR2, ccr2 ^ 0x04); + getCx86(0xc0); /* dummy */ + + if (getCx86(CX86_CCR2) == ccr2) /* old Cx486SLC/DLC */ + *dir0 = 0xfd; + else { /* Cx486S A step */ + setCx86(CX86_CCR2, ccr2); + *dir0 = 0xfe; + } + } else { + setCx86(CX86_CCR3, ccr3); /* restore CCR3 */ + + /* read DIR0 and DIR1 CPU registers */ + *dir0 = getCx86(CX86_DIR0); + *dir1 = getCx86(CX86_DIR1); + } +} + +static void do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) +{ + unsigned long flags; + + local_irq_save(flags); + __do_cyrix_devid(dir0, dir1); + local_irq_restore(flags); +} +/* + * Cx86_dir0_msb is a HACK needed by check_cx686_cpuid/slop in bugs.h in + * order to identify the Cyrix CPU model after we're out of setup.c + * + * Actually since bugs.h doesn't even reference this perhaps someone should + * fix the documentation ??? 
+ */ +static unsigned char Cx86_dir0_msb = 0; + +static const char Cx86_model[][9] = { + "Cx486", "Cx486", "5x86 ", "6x86", "MediaGX ", "6x86MX ", + "M II ", "Unknown" +}; +static const char Cx486_name[][5] = { + "SLC", "DLC", "SLC2", "DLC2", "SRx", "DRx", + "SRx2", "DRx2" +}; +static const char Cx486S_name[][4] = { + "S", "S2", "Se", "S2e" +}; +static const char Cx486D_name[][4] = { + "DX", "DX2", "?", "?", "?", "DX4" +}; +static char Cx86_cb[] = "?.5x Core/Bus Clock"; +static const char cyrix_model_mult1[] = "12??43"; +static const char cyrix_model_mult2[] = "12233445"; + +/* + * Reset the slow-loop (SLOP) bit on the 686(L) which is set by some old + * BIOSes for compatibility with DOS games. This makes the udelay loop + * work correctly, and improves performance. + * + * FIXME: our newer udelay uses the tsc. We don't need to frob with SLOP + */ + +static void check_cx686_slop(struct cpuinfo_x86 *c) +{ + unsigned long flags; + + if (Cx86_dir0_msb == 3) { + unsigned char ccr3, ccr5; + + local_irq_save(flags); + ccr3 = getCx86(CX86_CCR3); + setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ + ccr5 = getCx86(CX86_CCR5); + if (ccr5 & 2) + setCx86(CX86_CCR5, ccr5 & 0xfd); /* reset SLOP */ + setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ + local_irq_restore(flags); + + if (ccr5 & 2) { /* possible wrong calibration done */ + printk(KERN_INFO "Recalibrating delay loop with SLOP bit reset\n"); + calibrate_delay(); + c->loops_per_jiffy = loops_per_jiffy; + } + } +} + + +static void set_cx86_reorder(void) +{ + u8 ccr3; + + printk(KERN_INFO "Enable Memory access reorder on Cyrix/NSC processor.\n"); + ccr3 = getCx86(CX86_CCR3); + setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ + + /* Load/Store Serialize to mem access disable (=reorder it) */ + setCx86_old(CX86_PCR0, getCx86_old(CX86_PCR0) & ~0x80); + /* set load/store serialize from 1GB to 4GB */ + ccr3 |= 0xe0; + setCx86(CX86_CCR3, ccr3); +} + +static void set_cx86_memwb(void) +{ + printk(KERN_INFO "Enable Memory-Write-back mode on Cyrix/NSC processor.\n"); + + /* CCR2 bit 2: unlock NW bit */ + setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) & ~0x04); + /* set 'Not Write-through' */ + write_cr0(read_cr0() | X86_CR0_NW); + /* CCR2 bit 2: lock NW bit and set WT1 */ + setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) | 0x14); +} + +/* + * Configure later MediaGX and/or Geode processor. + */ + +static void geode_configure(void) +{ + unsigned long flags; + u8 ccr3; + local_irq_save(flags); + + /* Suspend on halt power saving and enable #SUSP pin */ + setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) | 0x88); + + ccr3 = getCx86(CX86_CCR3); + setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ + + + /* FPU fast, DTE cache, Mem bypass */ + setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x38); + setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ + + set_cx86_memwb(); + set_cx86_reorder(); + + local_irq_restore(flags); +} + +static void early_init_cyrix(struct cpuinfo_x86 *c) +{ + unsigned char dir0, dir0_msn, dir1 = 0; + + __do_cyrix_devid(&dir0, &dir1); + dir0_msn = dir0 >> 4; /* identifies CPU "family" */ + + switch (dir0_msn) { + case 3: /* 6x86/6x86L */ + /* Emulate MTRRs using Cyrix's ARRs. */ + set_cpu_cap(c, X86_FEATURE_CYRIX_ARR); + break; + case 5: /* 6x86MX/M II */ + /* Emulate MTRRs using Cyrix's ARRs. 
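+		 * (ARRs are the Cyrix Address Region Registers; the MTRR
+		 * code has a Cyrix back end that programs them.)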
*/ + set_cpu_cap(c, X86_FEATURE_CYRIX_ARR); + break; + } +} + +static void init_cyrix(struct cpuinfo_x86 *c) +{ + unsigned char dir0, dir0_msn, dir0_lsn, dir1 = 0; + char *buf = c->x86_model_id; + const char *p = NULL; + + /* + * Bit 31 in normal CPUID used for nonstandard 3DNow ID; + * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway + */ + clear_cpu_cap(c, 0*32+31); + + /* Cyrix used bit 24 in extended (AMD) CPUID for Cyrix MMX extensions */ + if (test_cpu_cap(c, 1*32+24)) { + clear_cpu_cap(c, 1*32+24); + set_cpu_cap(c, X86_FEATURE_CXMMX); + } + + do_cyrix_devid(&dir0, &dir1); + + check_cx686_slop(c); + + Cx86_dir0_msb = dir0_msn = dir0 >> 4; /* identifies CPU "family" */ + dir0_lsn = dir0 & 0xf; /* model or clock multiplier */ + + /* common case step number/rev -- exceptions handled below */ + c->x86_model = (dir1 >> 4) + 1; + c->x86_mask = dir1 & 0xf; + + /* Now cook; the original recipe is by Channing Corn, from Cyrix. + * We do the same thing for each generation: we work out + * the model, multiplier and stepping. Black magic included, + * to make the silicon step/rev numbers match the printed ones. + */ + + switch (dir0_msn) { + unsigned char tmp; + + case 0: /* Cx486SLC/DLC/SRx/DRx */ + p = Cx486_name[dir0_lsn & 7]; + break; + + case 1: /* Cx486S/DX/DX2/DX4 */ + p = (dir0_lsn & 8) ? Cx486D_name[dir0_lsn & 5] + : Cx486S_name[dir0_lsn & 3]; + break; + + case 2: /* 5x86 */ + Cx86_cb[2] = cyrix_model_mult1[dir0_lsn & 5]; + p = Cx86_cb+2; + break; + + case 3: /* 6x86/6x86L */ + Cx86_cb[1] = ' '; + Cx86_cb[2] = cyrix_model_mult1[dir0_lsn & 5]; + if (dir1 > 0x21) { /* 686L */ + Cx86_cb[0] = 'L'; + p = Cx86_cb; + (c->x86_model)++; + } else /* 686 */ + p = Cx86_cb+1; + /* Emulate MTRRs using Cyrix's ARRs. */ + set_cpu_cap(c, X86_FEATURE_CYRIX_ARR); + /* 6x86's contain this bug */ + set_cpu_bug(c, X86_BUG_COMA); + break; + + case 4: /* MediaGX/GXm or Geode GXM/GXLV/GX1 */ +#ifdef CONFIG_PCI + { + u32 vendor, device; + /* + * It isn't really a PCI quirk directly, but the cure is the + * same. The MediaGX has deep magic SMM stuff that handles the + * SB emulation. It throws away the fifo on disable_dma() which + * is wrong and ruins the audio. + * + * Bug2: VSA1 has a wrap bug so that using maximum sized DMA + * causes bad things. According to NatSemi VSA2 has another + * bug to do with 'hlt'. I've not seen any boards using VSA2 + * and X doesn't seem to support it either so who cares 8). + * VSA1 we work around however. + */ + + printk(KERN_INFO "Working around Cyrix MediaGX virtual DMA bugs.\n"); + isa_dma_bridge_buggy = 2; + + /* We do this before the PCI layer is running. However we + are safe here as we know the bridge must be a Cyrix + companion and must be present */ + vendor = read_pci_config_16(0, 0, 0x12, PCI_VENDOR_ID); + device = read_pci_config_16(0, 0, 0x12, PCI_DEVICE_ID); + + /* + * The 5510/5520 companion chips have a funky PIT. + */ + if (vendor == PCI_VENDOR_ID_CYRIX && + (device == PCI_DEVICE_ID_CYRIX_5510 || + device == PCI_DEVICE_ID_CYRIX_5520)) + mark_tsc_unstable("cyrix 5510/5520 detected"); + } +#endif + c->x86_cache_size = 16; /* Yep 16K integrated cache thats it */ + + /* GXm supports extended cpuid levels 'ala' AMD */ + if (c->cpuid_level == 2) { + /* Enable cxMMX extensions (GX1 Datasheet 54) */ + setCx86_old(CX86_CCR7, getCx86_old(CX86_CCR7) | 1); + + /* + * GXm : 0x30 ... 0x5f GXm datasheet 51 + * GXlv: 0x6x GXlv datasheet 54 + * ? 
: 0x7x
+			 * GX1 : 0x8x	GX1 datasheet 56
+			 */
+			if ((0x30 <= dir1 && dir1 <= 0x6f) ||
+			    (0x80 <= dir1 && dir1 <= 0x8f))
+				geode_configure();
+			return;
+		} else { /* MediaGX */
+			Cx86_cb[2] = (dir0_lsn & 1) ? '3' : '4';
+			p = Cx86_cb+2;
+			c->x86_model = (dir1 & 0x20) ? 1 : 2;
+		}
+		break;
+
+	case 5: /* 6x86MX/M II */
+		if (dir1 > 7) {
+			dir0_msn++;  /* M II */
+			/* Enable MMX extensions (App note 108) */
+			setCx86_old(CX86_CCR7, getCx86_old(CX86_CCR7)|1);
+		} else {
+			/* A 6x86MX - it has the bug. */
+			set_cpu_bug(c, X86_BUG_COMA);
+		}
+		tmp = (!(dir0_lsn & 7) || dir0_lsn & 1) ? 2 : 0;
+		Cx86_cb[tmp] = cyrix_model_mult2[dir0_lsn & 7];
+		p = Cx86_cb+tmp;
+		if (((dir1 & 0x0f) > 4) || ((dir1 & 0xf0) == 0x20))
+			(c->x86_model)++;
+		/* Emulate MTRRs using Cyrix's ARRs. */
+		set_cpu_cap(c, X86_FEATURE_CYRIX_ARR);
+		break;
+
+	case 0xf: /* Cyrix 486 without DEVID registers */
+		switch (dir0_lsn) {
+		case 0xd: /* either a 486SLC or DLC w/o DEVID */
+			dir0_msn = 0;
+			p = Cx486_name[(cpu_has_fpu ? 1 : 0)];
+			break;
+
+		case 0xe: /* a 486S A step */
+			dir0_msn = 0;
+			p = Cx486S_name[0];
+			break;
+		}
+		break;
+
+	default: /* unknown (shouldn't happen, we know everyone ;-) */
+		dir0_msn = 7;
+		break;
+	}
+	strcpy(buf, Cx86_model[dir0_msn & 7]);
+	if (p)
+		strcat(buf, p);
+	return;
+}
+
+/*
+ * Handle National Semiconductor branded processors
+ */
+static void init_nsc(struct cpuinfo_x86 *c)
+{
+	/*
+	 * There may be GX1 processors in the wild that are branded
+	 * NSC and not Cyrix.
+	 *
+	 * This function only handles the GX processor, and kicks everything
+	 * else to the Cyrix init function above - that should cover any
+	 * processors that might have been branded differently after NSC
+	 * acquired Cyrix.
+	 *
+	 * If this breaks your GX1 horribly, please e-mail
+	 * info-linux@ldcmail.amd.com to tell us.
+	 */
+
+	/* Handle the GX (formerly known as the GX2) */
+
+	if (c->x86 == 5 && c->x86_model == 5)
+		cpu_detect_cache_sizes(c);
+	else
+		init_cyrix(c);
+}
+
+/*
+ * Cyrix CPUs without cpuid or with cpuid not yet enabled can be detected
+ * by the fact that they preserve the flags across the division of 5/2.
+ * PII and PPro exhibit this behavior too, but they have cpuid available.
+ */
+
+/*
+ * Perform the Cyrix 5/2 test. A Cyrix won't change
+ * the flags, while other 486 chips will.
+ */
+static inline int test_cyrix_52div(void)
+{
+	unsigned int test;
+
+	__asm__ __volatile__(
+	     "sahf\n\t"		/* clear flags (%eax = 0x0005) */
+	     "div %b2\n\t"	/* divide 5 by 2 */
+	     "lahf"		/* store flags into %ah */
+	     : "=a" (test)
+	     : "0" (5), "q" (2)
+	     : "cc");
+
+	/* AH is 0x02 on Cyrix after the divide..
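+	 * (SAHF loads AH=0 into the arithmetic flags, leaving only the
+	 * always-set reserved bit, so LAHF reads back 0x02 whenever DIV
+	 * left the flags untouched.)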
*/ + return (unsigned char) (test >> 8) == 0x02; +} + +static void cyrix_identify(struct cpuinfo_x86 *c) +{ + /* Detect Cyrix with disabled CPUID */ + if (c->x86 == 4 && test_cyrix_52div()) { + unsigned char dir0, dir1; + + strcpy(c->x86_vendor_id, "CyrixInstead"); + c->x86_vendor = X86_VENDOR_CYRIX; + + /* Actually enable cpuid on the older cyrix */ + + /* Retrieve CPU revisions */ + + do_cyrix_devid(&dir0, &dir1); + + dir0 >>= 4; + + /* Check it is an affected model */ + + if (dir0 == 5 || dir0 == 3) { + unsigned char ccr3; + unsigned long flags; + printk(KERN_INFO "Enabling CPUID on Cyrix processor.\n"); + local_irq_save(flags); + ccr3 = getCx86(CX86_CCR3); + /* enable MAPEN */ + setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); + /* enable cpuid */ + setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x80); + /* disable MAPEN */ + setCx86(CX86_CCR3, ccr3); + local_irq_restore(flags); + } + } +} + +static const struct cpu_dev cyrix_cpu_dev = { + .c_vendor = "Cyrix", + .c_ident = { "CyrixInstead" }, + .c_early_init = early_init_cyrix, + .c_init = init_cyrix, + .c_identify = cyrix_identify, + .c_x86_vendor = X86_VENDOR_CYRIX, +}; + +cpu_dev_register(cyrix_cpu_dev); + +static const struct cpu_dev nsc_cpu_dev = { + .c_vendor = "NSC", + .c_ident = { "Geode by NSC" }, + .c_init = init_nsc, + .c_x86_vendor = X86_VENDOR_NSC, +}; + +cpu_dev_register(nsc_cpu_dev); diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c new file mode 100644 index 00000000000..36ce402a3fa --- /dev/null +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -0,0 +1,87 @@ +/* + * Common hypervisor code + * + * Copyright (C) 2008, VMware, Inc. + * Author : Alok N Kataria <akataria@vmware.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 
+ * + */ + +#include <linux/module.h> +#include <asm/processor.h> +#include <asm/hypervisor.h> + +static const __initconst struct hypervisor_x86 * const hypervisors[] = +{ +#ifdef CONFIG_XEN_PVHVM + &x86_hyper_xen_hvm, +#endif + &x86_hyper_vmware, + &x86_hyper_ms_hyperv, +#ifdef CONFIG_KVM_GUEST + &x86_hyper_kvm, +#endif +}; + +const struct hypervisor_x86 *x86_hyper; +EXPORT_SYMBOL(x86_hyper); + +static inline void __init +detect_hypervisor_vendor(void) +{ + const struct hypervisor_x86 *h, * const *p; + uint32_t pri, max_pri = 0; + + for (p = hypervisors; p < hypervisors + ARRAY_SIZE(hypervisors); p++) { + h = *p; + pri = h->detect(); + if (pri != 0 && pri > max_pri) { + max_pri = pri; + x86_hyper = h; + } + } + + if (max_pri) + printk(KERN_INFO "Hypervisor detected: %s\n", x86_hyper->name); +} + +void init_hypervisor(struct cpuinfo_x86 *c) +{ + if (x86_hyper && x86_hyper->set_cpu_features) + x86_hyper->set_cpu_features(c); +} + +void __init init_hypervisor_platform(void) +{ + + detect_hypervisor_vendor(); + + if (!x86_hyper) + return; + + init_hypervisor(&boot_cpu_data); + + if (x86_hyper->init_platform) + x86_hyper->init_platform(); +} + +bool __init hypervisor_x2apic_available(void) +{ + return x86_hyper && + x86_hyper->x2apic_available && + x86_hyper->x2apic_available(); +} diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c new file mode 100644 index 00000000000..f9e4fdd3b87 --- /dev/null +++ b/arch/x86/kernel/cpu/intel.c @@ -0,0 +1,751 @@ +#include <linux/kernel.h> + +#include <linux/string.h> +#include <linux/bitops.h> +#include <linux/smp.h> +#include <linux/sched.h> +#include <linux/thread_info.h> +#include <linux/module.h> +#include <linux/uaccess.h> + +#include <asm/processor.h> +#include <asm/pgtable.h> +#include <asm/msr.h> +#include <asm/bugs.h> +#include <asm/cpu.h> + +#ifdef CONFIG_X86_64 +#include <linux/topology.h> +#endif + +#include "cpu.h" + +#ifdef CONFIG_X86_LOCAL_APIC +#include <asm/mpspec.h> +#include <asm/apic.h> +#endif + +static void early_init_intel(struct cpuinfo_x86 *c) +{ + u64 misc_enable; + + /* Unmask CPUID levels if masked: */ + if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) { + if (msr_clear_bit(MSR_IA32_MISC_ENABLE, + MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT) > 0) { + c->cpuid_level = cpuid_eax(0); + get_cpu_cap(c); + } + } + + if ((c->x86 == 0xf && c->x86_model >= 0x03) || + (c->x86 == 0x6 && c->x86_model >= 0x0e)) + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + + if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64)) { + unsigned lower_word; + + wrmsr(MSR_IA32_UCODE_REV, 0, 0); + /* Required by the SDM */ + sync_core(); + rdmsr(MSR_IA32_UCODE_REV, lower_word, c->microcode); + } + + /* + * Atom erratum AAE44/AAF40/AAG38/AAH41: + * + * A race condition between speculative fetches and invalidating + * a large page. This is worked around in microcode, but we + * need the microcode to have already been loaded... so if it is + * not, recommend a BIOS update and disable large pages. 
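+	 * (Clearing X86_FEATURE_PSE below is what disables large pages.)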
+ */ + if (c->x86 == 6 && c->x86_model == 0x1c && c->x86_mask <= 2 && + c->microcode < 0x20e) { + printk(KERN_WARNING "Atom PSE erratum detected, BIOS microcode update recommended\n"); + clear_cpu_cap(c, X86_FEATURE_PSE); + } + +#ifdef CONFIG_X86_64 + set_cpu_cap(c, X86_FEATURE_SYSENTER32); +#else + /* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */ + if (c->x86 == 15 && c->x86_cache_alignment == 64) + c->x86_cache_alignment = 128; +#endif + + /* CPUID workaround for 0F33/0F34 CPU */ + if (c->x86 == 0xF && c->x86_model == 0x3 + && (c->x86_mask == 0x3 || c->x86_mask == 0x4)) + c->x86_phys_bits = 36; + + /* + * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate + * with P/T states and does not stop in deep C-states. + * + * It is also reliable across cores and sockets. (but not across + * cabinets - we turn it off in that case explicitly.) + */ + if (c->x86_power & (1 << 8)) { + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); + if (!check_tsc_unstable()) + set_sched_clock_stable(); + } + + /* Penwell and Cloverview have the TSC which doesn't sleep on S3 */ + if (c->x86 == 6) { + switch (c->x86_model) { + case 0x27: /* Penwell */ + case 0x35: /* Cloverview */ + set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC_S3); + break; + default: + break; + } + } + + /* + * There is a known erratum on Pentium III and Core Solo + * and Core Duo CPUs. + * " Page with PAT set to WC while associated MTRR is UC + * may consolidate to UC " + * Because of this erratum, it is better to stick with + * setting WC in MTRR rather than using PAT on these CPUs. + * + * Enable PAT WC only on P4, Core 2 or later CPUs. + */ + if (c->x86 == 6 && c->x86_model < 15) + clear_cpu_cap(c, X86_FEATURE_PAT); + +#ifdef CONFIG_KMEMCHECK + /* + * P4s have a "fast strings" feature which causes single- + * stepping REP instructions to only generate a #DB on + * cache-line boundaries. + * + * Ingo Molnar reported a Pentium D (model 6) and a Xeon + * (model 2) with the same problem. + */ + if (c->x86 == 15) + if (msr_clear_bit(MSR_IA32_MISC_ENABLE, + MSR_IA32_MISC_ENABLE_FAST_STRING_BIT) > 0) + pr_info("kmemcheck: Disabling fast string operations\n"); +#endif + + /* + * If fast string is not enabled in IA32_MISC_ENABLE for any reason, + * clear the fast string and enhanced fast string CPU capabilities. + */ + if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) { + rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); + if (!(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) { + printk(KERN_INFO "Disabled fast string operations\n"); + setup_clear_cpu_cap(X86_FEATURE_REP_GOOD); + setup_clear_cpu_cap(X86_FEATURE_ERMS); + } + } +} + +#ifdef CONFIG_X86_32 +/* + * Early probe support logic for ppro memory erratum #50 + * + * This is called before we do cpu ident work + */ + +int ppro_with_ram_bug(void) +{ + /* Uses data from early_cpu_detect now */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && + boot_cpu_data.x86 == 6 && + boot_cpu_data.x86_model == 1 && + boot_cpu_data.x86_mask < 8) { + printk(KERN_INFO "Pentium Pro with Errata#50 detected. Taking evasive action.\n"); + return 1; + } + return 0; +} + +static void intel_smp_check(struct cpuinfo_x86 *c) +{ + /* calling is from identify_secondary_cpu() ? 
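+	 * (Yes: cpu_index is 0 only on the boot CPU, which is skipped here.)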
 */
+	if (!c->cpu_index)
+		return;
+
+	/*
+	 * Mask B, Pentium, but not Pentium MMX
+	 */
+	if (c->x86 == 5 &&
+	    c->x86_mask >= 1 && c->x86_mask <= 4 &&
+	    c->x86_model <= 3) {
+		/*
+		 * Remember we have B step Pentia with bugs
+		 */
+		WARN_ONCE(1, "WARNING: SMP operation may be unreliable "
+			     "with B stepping processors.\n");
+	}
+}
+
+static int forcepae;
+static int __init forcepae_setup(char *__unused)
+{
+	forcepae = 1;
+	return 1;
+}
+__setup("forcepae", forcepae_setup);
+
+static void intel_workarounds(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_X86_F00F_BUG
+	/*
+	 * All current models of Pentium and Pentium with MMX technology CPUs
+	 * have the F0 0F bug, which lets nonprivileged users lock up the
+	 * system. Announce that the fault handler will be checking for it.
+	 */
+	clear_cpu_bug(c, X86_BUG_F00F);
+	if (!paravirt_enabled() && c->x86 == 5) {
+		static int f00f_workaround_enabled;
+
+		set_cpu_bug(c, X86_BUG_F00F);
+		if (!f00f_workaround_enabled) {
+			printk(KERN_NOTICE "Intel Pentium with F0 0F bug - workaround enabled.\n");
+			f00f_workaround_enabled = 1;
+		}
+	}
+#endif
+
+	/*
+	 * SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until
+	 * model 3 mask 3
+	 */
+	if ((c->x86<<8 | c->x86_model<<4 | c->x86_mask) < 0x633)
+		clear_cpu_cap(c, X86_FEATURE_SEP);
+
+	/*
+	 * PAE CPUID issue: many Pentium M report no PAE but may have a
+	 * functionally usable PAE implementation.
+	 * Forcefully enable PAE if kernel parameter "forcepae" is present.
+	 */
+	if (forcepae) {
+		printk(KERN_WARNING "PAE forced!\n");
+		set_cpu_cap(c, X86_FEATURE_PAE);
+		add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_NOW_UNRELIABLE);
+	}
+
+	/*
+	 * P4 Xeon errata 037 workaround.
+	 * Hardware prefetcher may cause stale data to be loaded into the cache.
+	 */
+	if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) {
+		if (msr_set_bit(MSR_IA32_MISC_ENABLE,
+				MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE_BIT)
+		    > 0) {
+			pr_info("CPU: C0 stepping P4 Xeon detected.\n");
+			pr_info("CPU: Disabling hardware prefetching (Errata 037)\n");
+		}
+	}
+
+	/*
+	 * See if we have a good local APIC by checking for buggy Pentia,
+	 * i.e. all B steppings and the C2 stepping of P54C when using their
+	 * integrated APIC (see 11AP erratum in "Pentium Processor
+	 * Specification Update").
+	 */
+	if (cpu_has_apic && (c->x86<<8 | c->x86_model<<4) == 0x520 &&
+	    (c->x86_mask < 0x6 || c->x86_mask == 0xb))
+		set_cpu_cap(c, X86_FEATURE_11AP);
+
+
+#ifdef CONFIG_X86_INTEL_USERCOPY
+	/*
+	 * Set up the preferred alignment for movsl bulk memory moves
+	 */
+	switch (c->x86) {
+	case 4: /* 486: untested */
+		break;
+	case 5: /* Old Pentia: untested */
+		break;
+	case 6: /* PII/PIII only like movsl with 8-byte alignment */
+		movsl_mask.mask = 7;
+		break;
+	case 15: /* P4 is OK down to 8-byte alignment */
+		movsl_mask.mask = 7;
+		break;
+	}
+#endif
+
+	intel_smp_check(c);
+}
+#else
+static void intel_workarounds(struct cpuinfo_x86 *c)
+{
+}
+#endif
+
+static void srat_detect_node(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_NUMA
+	unsigned node;
+	int cpu = smp_processor_id();
+
+	/* Don't do the funky fallback heuristics the AMD version employs
+	   for now.
*/ + node = numa_cpu_node(cpu); + if (node == NUMA_NO_NODE || !node_online(node)) { + /* reuse the value from init_cpu_to_node() */ + node = cpu_to_node(cpu); + } + numa_set_node(cpu, node); +#endif +} + +/* + * find out the number of processor cores on the die + */ +static int intel_num_cpu_cores(struct cpuinfo_x86 *c) +{ + unsigned int eax, ebx, ecx, edx; + + if (c->cpuid_level < 4) + return 1; + + /* Intel has a non-standard dependency on %ecx for this CPUID level. */ + cpuid_count(4, 0, &eax, &ebx, &ecx, &edx); + if (eax & 0x1f) + return (eax >> 26) + 1; + else + return 1; +} + +static void detect_vmx_virtcap(struct cpuinfo_x86 *c) +{ + /* Intel VMX MSR indicated features */ +#define X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW 0x00200000 +#define X86_VMX_FEATURE_PROC_CTLS_VNMI 0x00400000 +#define X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS 0x80000000 +#define X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC 0x00000001 +#define X86_VMX_FEATURE_PROC_CTLS2_EPT 0x00000002 +#define X86_VMX_FEATURE_PROC_CTLS2_VPID 0x00000020 + + u32 vmx_msr_low, vmx_msr_high, msr_ctl, msr_ctl2; + + clear_cpu_cap(c, X86_FEATURE_TPR_SHADOW); + clear_cpu_cap(c, X86_FEATURE_VNMI); + clear_cpu_cap(c, X86_FEATURE_FLEXPRIORITY); + clear_cpu_cap(c, X86_FEATURE_EPT); + clear_cpu_cap(c, X86_FEATURE_VPID); + + rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high); + msr_ctl = vmx_msr_high | vmx_msr_low; + if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW) + set_cpu_cap(c, X86_FEATURE_TPR_SHADOW); + if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_VNMI) + set_cpu_cap(c, X86_FEATURE_VNMI); + if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS) { + rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, + vmx_msr_low, vmx_msr_high); + msr_ctl2 = vmx_msr_high | vmx_msr_low; + if ((msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC) && + (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW)) + set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY); + if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_EPT) + set_cpu_cap(c, X86_FEATURE_EPT); + if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VPID) + set_cpu_cap(c, X86_FEATURE_VPID); + } +} + +static void init_intel(struct cpuinfo_x86 *c) +{ + unsigned int l2 = 0; + + early_init_intel(c); + + intel_workarounds(c); + + /* + * Detect the extended topology information if available. This + * will reinitialise the initial_apicid which will be used + * in init_intel_cacheinfo() + */ + detect_extended_topology(c); + + if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) { + /* + * let's use the legacy cpuid vector 0x1 and 0x4 for topology + * detection. + */ + c->x86_max_cores = intel_num_cpu_cores(c); +#ifdef CONFIG_X86_32 + detect_ht(c); +#endif + } + + l2 = init_intel_cacheinfo(c); + if (c->cpuid_level > 9) { + unsigned eax = cpuid_eax(10); + /* Check for version and the number of counters */ + if ((eax & 0xff) && (((eax>>8) & 0xff) > 1)) + set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); + } + + if (cpu_has_xmm2) + set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); + if (cpu_has_ds) { + unsigned int l1; + rdmsr(MSR_IA32_MISC_ENABLE, l1, l2); + if (!(l1 & (1<<11))) + set_cpu_cap(c, X86_FEATURE_BTS); + if (!(l1 & (1<<12))) + set_cpu_cap(c, X86_FEATURE_PEBS); + } + + if (c->x86 == 6 && cpu_has_clflush && + (c->x86_model == 29 || c->x86_model == 46 || c->x86_model == 47)) + set_cpu_cap(c, X86_FEATURE_CLFLUSH_MONITOR); + +#ifdef CONFIG_X86_64 + if (c->x86 == 15) + c->x86_cache_alignment = c->x86_clflush_size * 2; + if (c->x86 == 6) + set_cpu_cap(c, X86_FEATURE_REP_GOOD); +#else + /* + * Names for the Pentium II/Celeron processors + * detectable only by also checking the cache size. 
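+	 * (l2 comes from init_intel_cacheinfo() above; the switch below
+	 * keys on model plus L2 size.)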
+ * Dixon is NOT a Celeron. + */ + if (c->x86 == 6) { + char *p = NULL; + + switch (c->x86_model) { + case 5: + if (l2 == 0) + p = "Celeron (Covington)"; + else if (l2 == 256) + p = "Mobile Pentium II (Dixon)"; + break; + + case 6: + if (l2 == 128) + p = "Celeron (Mendocino)"; + else if (c->x86_mask == 0 || c->x86_mask == 5) + p = "Celeron-A"; + break; + + case 8: + if (l2 == 128) + p = "Celeron (Coppermine)"; + break; + } + + if (p) + strcpy(c->x86_model_id, p); + } + + if (c->x86 == 15) + set_cpu_cap(c, X86_FEATURE_P4); + if (c->x86 == 6) + set_cpu_cap(c, X86_FEATURE_P3); +#endif + + /* Work around errata */ + srat_detect_node(c); + + if (cpu_has(c, X86_FEATURE_VMX)) + detect_vmx_virtcap(c); + + /* + * Initialize MSR_IA32_ENERGY_PERF_BIAS if BIOS did not. + * x86_energy_perf_policy(8) is available to change it at run-time + */ + if (cpu_has(c, X86_FEATURE_EPB)) { + u64 epb; + + rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); + if ((epb & 0xF) == ENERGY_PERF_BIAS_PERFORMANCE) { + printk_once(KERN_WARNING "ENERGY_PERF_BIAS:" + " Set to 'normal', was 'performance'\n" + "ENERGY_PERF_BIAS: View and update with" + " x86_energy_perf_policy(8)\n"); + epb = (epb & ~0xF) | ENERGY_PERF_BIAS_NORMAL; + wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); + } + } +} + +#ifdef CONFIG_X86_32 +static unsigned int intel_size_cache(struct cpuinfo_x86 *c, unsigned int size) +{ + /* + * Intel PIII Tualatin. This comes in two flavours. + * One has 256kb of cache, the other 512. We have no way + * to determine which, so we use a boottime override + * for the 512kb model, and assume 256 otherwise. + */ + if ((c->x86 == 6) && (c->x86_model == 11) && (size == 0)) + size = 256; + return size; +} +#endif + +#define TLB_INST_4K 0x01 +#define TLB_INST_4M 0x02 +#define TLB_INST_2M_4M 0x03 + +#define TLB_INST_ALL 0x05 +#define TLB_INST_1G 0x06 + +#define TLB_DATA_4K 0x11 +#define TLB_DATA_4M 0x12 +#define TLB_DATA_2M_4M 0x13 +#define TLB_DATA_4K_4M 0x14 + +#define TLB_DATA_1G 0x16 + +#define TLB_DATA0_4K 0x21 +#define TLB_DATA0_4M 0x22 +#define TLB_DATA0_2M_4M 0x23 + +#define STLB_4K 0x41 +#define STLB_4K_2M 0x42 + +static const struct _tlb_table intel_tlb_table[] = { + { 0x01, TLB_INST_4K, 32, " TLB_INST 4 KByte pages, 4-way set associative" }, + { 0x02, TLB_INST_4M, 2, " TLB_INST 4 MByte pages, full associative" }, + { 0x03, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way set associative" }, + { 0x04, TLB_DATA_4M, 8, " TLB_DATA 4 MByte pages, 4-way set associative" }, + { 0x05, TLB_DATA_4M, 32, " TLB_DATA 4 MByte pages, 4-way set associative" }, + { 0x0b, TLB_INST_4M, 4, " TLB_INST 4 MByte pages, 4-way set associative" }, + { 0x4f, TLB_INST_4K, 32, " TLB_INST 4 KByte pages */" }, + { 0x50, TLB_INST_ALL, 64, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" }, + { 0x51, TLB_INST_ALL, 128, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" }, + { 0x52, TLB_INST_ALL, 256, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" }, + { 0x55, TLB_INST_2M_4M, 7, " TLB_INST 2-MByte or 4-MByte pages, fully associative" }, + { 0x56, TLB_DATA0_4M, 16, " TLB_DATA0 4 MByte pages, 4-way set associative" }, + { 0x57, TLB_DATA0_4K, 16, " TLB_DATA0 4 KByte pages, 4-way associative" }, + { 0x59, TLB_DATA0_4K, 16, " TLB_DATA0 4 KByte pages, fully associative" }, + { 0x5a, TLB_DATA0_2M_4M, 32, " TLB_DATA0 2-MByte or 4 MByte pages, 4-way set associative" }, + { 0x5b, TLB_DATA_4K_4M, 64, " TLB_DATA 4 KByte and 4 MByte pages" }, + { 0x5c, TLB_DATA_4K_4M, 128, " TLB_DATA 4 KByte and 4 MByte pages" }, + { 0x5d, TLB_DATA_4K_4M, 256, " TLB_DATA 4 KByte and 4 MByte pages" 
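+	  /* Worked example: descriptor 0x63 below makes intel_tlb_lookup()
+	   * raise tlb_lld_1g[ENTRIES] to 4; entries are only ever raised,
+	   * never lowered. */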
},
+	{ 0x61, TLB_INST_4K,		48,	" TLB_INST 4 KByte pages, fully associative" },
+	{ 0x63, TLB_DATA_1G,		4,	" TLB_DATA 1 GByte pages, 4-way set associative" },
+	{ 0x76, TLB_INST_2M_4M,		8,	" TLB_INST 2-MByte or 4-MByte pages, fully associative" },
+	{ 0xb0, TLB_INST_4K,		128,	" TLB_INST 4 KByte pages, 4-way set associative" },
+	{ 0xb1, TLB_INST_2M_4M,		4,	" TLB_INST 2M pages, 4-way, 8 entries or 4M pages, 4-way entries" },
+	{ 0xb2, TLB_INST_4K,		64,	" TLB_INST 4 KByte pages, 4-way set associative" },
+	{ 0xb3, TLB_DATA_4K,		128,	" TLB_DATA 4 KByte pages, 4-way set associative" },
+	{ 0xb4, TLB_DATA_4K,		256,	" TLB_DATA 4 KByte pages, 4-way associative" },
+	{ 0xb5, TLB_INST_4K,		64,	" TLB_INST 4 KByte pages, 8-way set associative" },
+	{ 0xb6, TLB_INST_4K,		128,	" TLB_INST 4 KByte pages, 8-way set associative" },
+	{ 0xba, TLB_DATA_4K,		64,	" TLB_DATA 4 KByte pages, 4-way associative" },
+	{ 0xc0, TLB_DATA_4K_4M,		8,	" TLB_DATA 4 KByte and 4 MByte pages, 4-way associative" },
+	{ 0xc1, STLB_4K_2M,		1024,	" STLB 4 KByte and 2 MByte pages, 8-way associative" },
+	{ 0xc2, TLB_DATA_2M_4M,		16,	" DTLB 2 MByte/4 MByte pages, 4-way associative" },
+	{ 0xca, STLB_4K,		512,	" STLB 4 KByte pages, 4-way associative" },
+	{ 0x00, 0, 0 }
+};
+
+static void intel_tlb_lookup(const unsigned char desc)
+{
+	unsigned char k;
+	if (desc == 0)
+		return;
+
+	/* look up this descriptor in the table */
+	for (k = 0; intel_tlb_table[k].descriptor != desc &&
+		    intel_tlb_table[k].descriptor != 0; k++)
+		;
+
+	if (intel_tlb_table[k].tlb_type == 0)
+		return;
+
+	switch (intel_tlb_table[k].tlb_type) {
+	case STLB_4K:
+		if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
+			tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
+		if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
+			tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
+		break;
+	case STLB_4K_2M:
+		if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
+			tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
+		if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
+			tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
+		if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries)
+			tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries;
+		if (tlb_lld_2m[ENTRIES] < intel_tlb_table[k].entries)
+			tlb_lld_2m[ENTRIES] = intel_tlb_table[k].entries;
+		if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
+			tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
+		if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
+			tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
+		break;
+	case TLB_INST_ALL:
+		if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
+			tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
+		if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries)
+			tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries;
+		if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
+			tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
+		break;
+	case TLB_INST_4K:
+		if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
+			tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
+		break;
+	case TLB_INST_4M:
+		if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
+			tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
+		break;
+	case TLB_INST_2M_4M:
+		if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries)
+			tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries;
+		if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
+			tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
+		break;
+	case TLB_DATA_4K:
+	case TLB_DATA0_4K:
+		if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
+			tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
+		break;
+	case TLB_DATA_4M:
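+	/* fall through: the plain and "data0" forms size the same dTLB */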
+ case TLB_DATA0_4M: + if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries; + break; + case TLB_DATA_2M_4M: + case TLB_DATA0_2M_4M: + if (tlb_lld_2m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lld_2m[ENTRIES] = intel_tlb_table[k].entries; + if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries; + break; + case TLB_DATA_4K_4M: + if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries) + tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries; + if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries; + break; + case TLB_DATA_1G: + if (tlb_lld_1g[ENTRIES] < intel_tlb_table[k].entries) + tlb_lld_1g[ENTRIES] = intel_tlb_table[k].entries; + break; + } +} + +static void intel_tlb_flushall_shift_set(struct cpuinfo_x86 *c) +{ + switch ((c->x86 << 8) + c->x86_model) { + case 0x60f: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ + case 0x616: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ + case 0x617: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ + case 0x61d: /* six-core 45 nm xeon "Dunnington" */ + tlb_flushall_shift = -1; + break; + case 0x63a: /* Ivybridge */ + tlb_flushall_shift = 2; + break; + case 0x61a: /* 45 nm nehalem, "Bloomfield" */ + case 0x61e: /* 45 nm nehalem, "Lynnfield" */ + case 0x625: /* 32 nm nehalem, "Clarkdale" */ + case 0x62c: /* 32 nm nehalem, "Gulftown" */ + case 0x62e: /* 45 nm nehalem-ex, "Beckton" */ + case 0x62f: /* 32 nm Xeon E7 */ + case 0x62a: /* SandyBridge */ + case 0x62d: /* SandyBridge, "Romely-EP" */ + default: + tlb_flushall_shift = 6; + } +} + +static void intel_detect_tlb(struct cpuinfo_x86 *c) +{ + int i, j, n; + unsigned int regs[4]; + unsigned char *desc = (unsigned char *)regs; + + if (c->cpuid_level < 2) + return; + + /* Number of times to iterate */ + n = cpuid_eax(2) & 0xFF; + + for (i = 0 ; i < n ; i++) { + cpuid(2, ®s[0], ®s[1], ®s[2], ®s[3]); + + /* If bit 31 is set, this is an unknown format */ + for (j = 0 ; j < 3 ; j++) + if (regs[j] & (1 << 31)) + regs[j] = 0; + + /* Byte 0 is level count, not a descriptor */ + for (j = 1 ; j < 16 ; j++) + intel_tlb_lookup(desc[j]); + } + intel_tlb_flushall_shift_set(c); +} + +static const struct cpu_dev intel_cpu_dev = { + .c_vendor = "Intel", + .c_ident = { "GenuineIntel" }, +#ifdef CONFIG_X86_32 + .legacy_models = { + { .family = 4, .model_names = + { + [0] = "486 DX-25/33", + [1] = "486 DX-50", + [2] = "486 SX", + [3] = "486 DX/2", + [4] = "486 SL", + [5] = "486 SX/2", + [7] = "486 DX/2-WB", + [8] = "486 DX/4", + [9] = "486 DX/4-WB" + } + }, + { .family = 5, .model_names = + { + [0] = "Pentium 60/66 A-step", + [1] = "Pentium 60/66", + [2] = "Pentium 75 - 200", + [3] = "OverDrive PODP5V83", + [4] = "Pentium MMX", + [7] = "Mobile Pentium 75 - 200", + [8] = "Mobile Pentium MMX" + } + }, + { .family = 6, .model_names = + { + [0] = "Pentium Pro A-step", + [1] = "Pentium Pro", + [3] = "Pentium II (Klamath)", + [4] = "Pentium II (Deschutes)", + [5] = "Pentium II (Deschutes)", + [6] = "Mobile Pentium II", + [7] = "Pentium III (Katmai)", + [8] = "Pentium III (Coppermine)", + [10] = "Pentium III (Cascades)", + [11] = "Pentium III (Tualatin)", + } + }, + { .family = 15, .model_names = + { + [0] = "Pentium 4 (Unknown)", + [1] = "Pentium 4 (Willamette)", + [2] = "Pentium 4 (Northwood)", + [4] = "Pentium 4 (Foster)", + [5] = "Pentium 4 (Foster)", + } + }, + }, + .legacy_cache_size = intel_size_cache, +#endif + .c_detect_tlb = 
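+			  /* run for the boot CPU via cpu_detect_tlb() */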
intel_detect_tlb, + .c_early_init = early_init_intel, + .c_init = init_intel, + .c_x86_vendor = X86_VENDOR_INTEL, +}; + +cpu_dev_register(intel_cpu_dev); + diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c new file mode 100644 index 00000000000..9c8f7394c61 --- /dev/null +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -0,0 +1,1262 @@ +/* + * Routines to identify caches on Intel CPU. + * + * Changes: + * Venkatesh Pallipadi : Adding cache identification through cpuid(4) + * Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure. + * Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD. + */ + +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/device.h> +#include <linux/compiler.h> +#include <linux/cpu.h> +#include <linux/sched.h> +#include <linux/pci.h> + +#include <asm/processor.h> +#include <linux/smp.h> +#include <asm/amd_nb.h> +#include <asm/smp.h> + +#define LVL_1_INST 1 +#define LVL_1_DATA 2 +#define LVL_2 3 +#define LVL_3 4 +#define LVL_TRACE 5 + +struct _cache_table { + unsigned char descriptor; + char cache_type; + short size; +}; + +#define MB(x) ((x) * 1024) + +/* All the cache descriptor types we care about (no TLB or + trace cache entries) */ + +static const struct _cache_table cache_table[] = +{ + { 0x06, LVL_1_INST, 8 }, /* 4-way set assoc, 32 byte line size */ + { 0x08, LVL_1_INST, 16 }, /* 4-way set assoc, 32 byte line size */ + { 0x09, LVL_1_INST, 32 }, /* 4-way set assoc, 64 byte line size */ + { 0x0a, LVL_1_DATA, 8 }, /* 2 way set assoc, 32 byte line size */ + { 0x0c, LVL_1_DATA, 16 }, /* 4-way set assoc, 32 byte line size */ + { 0x0d, LVL_1_DATA, 16 }, /* 4-way set assoc, 64 byte line size */ + { 0x0e, LVL_1_DATA, 24 }, /* 6-way set assoc, 64 byte line size */ + { 0x21, LVL_2, 256 }, /* 8-way set assoc, 64 byte line size */ + { 0x22, LVL_3, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */ + { 0x23, LVL_3, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */ + { 0x25, LVL_3, MB(2) }, /* 8-way set assoc, sectored cache, 64 byte line size */ + { 0x29, LVL_3, MB(4) }, /* 8-way set assoc, sectored cache, 64 byte line size */ + { 0x2c, LVL_1_DATA, 32 }, /* 8-way set assoc, 64 byte line size */ + { 0x30, LVL_1_INST, 32 }, /* 8-way set assoc, 64 byte line size */ + { 0x39, LVL_2, 128 }, /* 4-way set assoc, sectored cache, 64 byte line size */ + { 0x3a, LVL_2, 192 }, /* 6-way set assoc, sectored cache, 64 byte line size */ + { 0x3b, LVL_2, 128 }, /* 2-way set assoc, sectored cache, 64 byte line size */ + { 0x3c, LVL_2, 256 }, /* 4-way set assoc, sectored cache, 64 byte line size */ + { 0x3d, LVL_2, 384 }, /* 6-way set assoc, sectored cache, 64 byte line size */ + { 0x3e, LVL_2, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */ + { 0x3f, LVL_2, 256 }, /* 2-way set assoc, 64 byte line size */ + { 0x41, LVL_2, 128 }, /* 4-way set assoc, 32 byte line size */ + { 0x42, LVL_2, 256 }, /* 4-way set assoc, 32 byte line size */ + { 0x43, LVL_2, 512 }, /* 4-way set assoc, 32 byte line size */ + { 0x44, LVL_2, MB(1) }, /* 4-way set assoc, 32 byte line size */ + { 0x45, LVL_2, MB(2) }, /* 4-way set assoc, 32 byte line size */ + { 0x46, LVL_3, MB(4) }, /* 4-way set assoc, 64 byte line size */ + { 0x47, LVL_3, MB(8) }, /* 8-way set assoc, 64 byte line size */ + { 0x48, LVL_2, MB(3) }, /* 12-way set assoc, 64 byte line size */ + { 0x49, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */ + { 0x4a, LVL_3, MB(6) }, /* 12-way set assoc, 64 byte line size */ + { 0x4b, LVL_3, 
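+	  /* sizes in this table are in KB: MB(8) expands to 8192 */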
MB(8) }, /* 16-way set assoc, 64 byte line size */ + { 0x4c, LVL_3, MB(12) }, /* 12-way set assoc, 64 byte line size */ + { 0x4d, LVL_3, MB(16) }, /* 16-way set assoc, 64 byte line size */ + { 0x4e, LVL_2, MB(6) }, /* 24-way set assoc, 64 byte line size */ + { 0x60, LVL_1_DATA, 16 }, /* 8-way set assoc, sectored cache, 64 byte line size */ + { 0x66, LVL_1_DATA, 8 }, /* 4-way set assoc, sectored cache, 64 byte line size */ + { 0x67, LVL_1_DATA, 16 }, /* 4-way set assoc, sectored cache, 64 byte line size */ + { 0x68, LVL_1_DATA, 32 }, /* 4-way set assoc, sectored cache, 64 byte line size */ + { 0x70, LVL_TRACE, 12 }, /* 8-way set assoc */ + { 0x71, LVL_TRACE, 16 }, /* 8-way set assoc */ + { 0x72, LVL_TRACE, 32 }, /* 8-way set assoc */ + { 0x73, LVL_TRACE, 64 }, /* 8-way set assoc */ + { 0x78, LVL_2, MB(1) }, /* 4-way set assoc, 64 byte line size */ + { 0x79, LVL_2, 128 }, /* 8-way set assoc, sectored cache, 64 byte line size */ + { 0x7a, LVL_2, 256 }, /* 8-way set assoc, sectored cache, 64 byte line size */ + { 0x7b, LVL_2, 512 }, /* 8-way set assoc, sectored cache, 64 byte line size */ + { 0x7c, LVL_2, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */ + { 0x7d, LVL_2, MB(2) }, /* 8-way set assoc, 64 byte line size */ + { 0x7f, LVL_2, 512 }, /* 2-way set assoc, 64 byte line size */ + { 0x80, LVL_2, 512 }, /* 8-way set assoc, 64 byte line size */ + { 0x82, LVL_2, 256 }, /* 8-way set assoc, 32 byte line size */ + { 0x83, LVL_2, 512 }, /* 8-way set assoc, 32 byte line size */ + { 0x84, LVL_2, MB(1) }, /* 8-way set assoc, 32 byte line size */ + { 0x85, LVL_2, MB(2) }, /* 8-way set assoc, 32 byte line size */ + { 0x86, LVL_2, 512 }, /* 4-way set assoc, 64 byte line size */ + { 0x87, LVL_2, MB(1) }, /* 8-way set assoc, 64 byte line size */ + { 0xd0, LVL_3, 512 }, /* 4-way set assoc, 64 byte line size */ + { 0xd1, LVL_3, MB(1) }, /* 4-way set assoc, 64 byte line size */ + { 0xd2, LVL_3, MB(2) }, /* 4-way set assoc, 64 byte line size */ + { 0xd6, LVL_3, MB(1) }, /* 8-way set assoc, 64 byte line size */ + { 0xd7, LVL_3, MB(2) }, /* 8-way set assoc, 64 byte line size */ + { 0xd8, LVL_3, MB(4) }, /* 12-way set assoc, 64 byte line size */ + { 0xdc, LVL_3, MB(2) }, /* 12-way set assoc, 64 byte line size */ + { 0xdd, LVL_3, MB(4) }, /* 12-way set assoc, 64 byte line size */ + { 0xde, LVL_3, MB(8) }, /* 12-way set assoc, 64 byte line size */ + { 0xe2, LVL_3, MB(2) }, /* 16-way set assoc, 64 byte line size */ + { 0xe3, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */ + { 0xe4, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */ + { 0xea, LVL_3, MB(12) }, /* 24-way set assoc, 64 byte line size */ + { 0xeb, LVL_3, MB(18) }, /* 24-way set assoc, 64 byte line size */ + { 0xec, LVL_3, MB(24) }, /* 24-way set assoc, 64 byte line size */ + { 0x00, 0, 0} +}; + + +enum _cache_type { + CACHE_TYPE_NULL = 0, + CACHE_TYPE_DATA = 1, + CACHE_TYPE_INST = 2, + CACHE_TYPE_UNIFIED = 3 +}; + +union _cpuid4_leaf_eax { + struct { + enum _cache_type type:5; + unsigned int level:3; + unsigned int is_self_initializing:1; + unsigned int is_fully_associative:1; + unsigned int reserved:4; + unsigned int num_threads_sharing:12; + unsigned int num_cores_on_die:6; + } split; + u32 full; +}; + +union _cpuid4_leaf_ebx { + struct { + unsigned int coherency_line_size:12; + unsigned int physical_line_partition:10; + unsigned int ways_of_associativity:10; + } split; + u32 full; +}; + +union _cpuid4_leaf_ecx { + struct { + unsigned int number_of_sets:32; + } split; + u32 full; +}; + +struct _cpuid4_info_regs { + 
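+	/*
+	 * One CPUID(4)-style cache leaf, read natively, via leaf 0x8000001d,
+	 * or synthesized by amd_cpuid4(). size is derived later as
+	 * (sets + 1) * (line_size + 1) * (partitions + 1) * (ways + 1);
+	 * e.g. 64 sets, 64-byte lines, 1 partition and 8 ways = 32 KB.
+	 */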
union _cpuid4_leaf_eax eax; + union _cpuid4_leaf_ebx ebx; + union _cpuid4_leaf_ecx ecx; + unsigned long size; + struct amd_northbridge *nb; +}; + +struct _cpuid4_info { + struct _cpuid4_info_regs base; + DECLARE_BITMAP(shared_cpu_map, NR_CPUS); +}; + +unsigned short num_cache_leaves; + +/* AMD doesn't have CPUID4. Emulate it here to report the same + information to the user. This makes some assumptions about the machine: + L2 not shared, no SMT etc. that is currently true on AMD CPUs. + + In theory the TLBs could be reported as fake type (they are in "dummy"). + Maybe later */ +union l1_cache { + struct { + unsigned line_size:8; + unsigned lines_per_tag:8; + unsigned assoc:8; + unsigned size_in_kb:8; + }; + unsigned val; +}; + +union l2_cache { + struct { + unsigned line_size:8; + unsigned lines_per_tag:4; + unsigned assoc:4; + unsigned size_in_kb:16; + }; + unsigned val; +}; + +union l3_cache { + struct { + unsigned line_size:8; + unsigned lines_per_tag:4; + unsigned assoc:4; + unsigned res:2; + unsigned size_encoded:14; + }; + unsigned val; +}; + +static const unsigned short assocs[] = { + [1] = 1, + [2] = 2, + [4] = 4, + [6] = 8, + [8] = 16, + [0xa] = 32, + [0xb] = 48, + [0xc] = 64, + [0xd] = 96, + [0xe] = 128, + [0xf] = 0xffff /* fully associative - no way to show this currently */ +}; + +static const unsigned char levels[] = { 1, 1, 2, 3 }; +static const unsigned char types[] = { 1, 2, 3, 3 }; + +static void +amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, + union _cpuid4_leaf_ebx *ebx, + union _cpuid4_leaf_ecx *ecx) +{ + unsigned dummy; + unsigned line_size, lines_per_tag, assoc, size_in_kb; + union l1_cache l1i, l1d; + union l2_cache l2; + union l3_cache l3; + union l1_cache *l1 = &l1d; + + eax->full = 0; + ebx->full = 0; + ecx->full = 0; + + cpuid(0x80000005, &dummy, &dummy, &l1d.val, &l1i.val); + cpuid(0x80000006, &dummy, &dummy, &l2.val, &l3.val); + + switch (leaf) { + case 1: + l1 = &l1i; + case 0: + if (!l1->val) + return; + assoc = assocs[l1->assoc]; + line_size = l1->line_size; + lines_per_tag = l1->lines_per_tag; + size_in_kb = l1->size_in_kb; + break; + case 2: + if (!l2.val) + return; + assoc = assocs[l2.assoc]; + line_size = l2.line_size; + lines_per_tag = l2.lines_per_tag; + /* cpu_data has errata corrections for K7 applied */ + size_in_kb = __this_cpu_read(cpu_info.x86_cache_size); + break; + case 3: + if (!l3.val) + return; + assoc = assocs[l3.assoc]; + line_size = l3.line_size; + lines_per_tag = l3.lines_per_tag; + size_in_kb = l3.size_encoded * 512; + if (boot_cpu_has(X86_FEATURE_AMD_DCM)) { + size_in_kb = size_in_kb >> 1; + assoc = assoc >> 1; + } + break; + default: + return; + } + + eax->split.is_self_initializing = 1; + eax->split.type = types[leaf]; + eax->split.level = levels[leaf]; + eax->split.num_threads_sharing = 0; + eax->split.num_cores_on_die = __this_cpu_read(cpu_info.x86_max_cores) - 1; + + + if (assoc == 0xffff) + eax->split.is_fully_associative = 1; + ebx->split.coherency_line_size = line_size - 1; + ebx->split.ways_of_associativity = assoc - 1; + ebx->split.physical_line_partition = lines_per_tag - 1; + ecx->split.number_of_sets = (size_in_kb * 1024) / line_size / + (ebx->split.ways_of_associativity + 1) - 1; +} + +struct _cache_attr { + struct attribute attr; + ssize_t (*show)(struct _cpuid4_info *, char *, unsigned int); + ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count, + unsigned int); +}; + +#if defined(CONFIG_AMD_NB) && defined(CONFIG_SYSFS) +/* + * L3 cache descriptors + */ +static void amd_calc_l3_indices(struct 
amd_northbridge *nb) +{ + struct amd_l3_cache *l3 = &nb->l3_cache; + unsigned int sc0, sc1, sc2, sc3; + u32 val = 0; + + pci_read_config_dword(nb->misc, 0x1C4, &val); + + /* calculate subcache sizes */ + l3->subcaches[0] = sc0 = !(val & BIT(0)); + l3->subcaches[1] = sc1 = !(val & BIT(4)); + + if (boot_cpu_data.x86 == 0x15) { + l3->subcaches[0] = sc0 += !(val & BIT(1)); + l3->subcaches[1] = sc1 += !(val & BIT(5)); + } + + l3->subcaches[2] = sc2 = !(val & BIT(8)) + !(val & BIT(9)); + l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13)); + + l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1; +} + +static void amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index) +{ + int node; + + /* only for L3, and not in virtualized environments */ + if (index < 3) + return; + + node = amd_get_nb_id(smp_processor_id()); + this_leaf->nb = node_to_amd_nb(node); + if (this_leaf->nb && !this_leaf->nb->l3_cache.indices) + amd_calc_l3_indices(this_leaf->nb); +} + +/* + * check whether a slot used for disabling an L3 index is occupied. + * @l3: L3 cache descriptor + * @slot: slot number (0..1) + * + * @returns: the disabled index if used or negative value if slot free. + */ +int amd_get_l3_disable_slot(struct amd_northbridge *nb, unsigned slot) +{ + unsigned int reg = 0; + + pci_read_config_dword(nb->misc, 0x1BC + slot * 4, ®); + + /* check whether this slot is activated already */ + if (reg & (3UL << 30)) + return reg & 0xfff; + + return -1; +} + +static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, + unsigned int slot) +{ + int index; + + if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) + return -EINVAL; + + index = amd_get_l3_disable_slot(this_leaf->base.nb, slot); + if (index >= 0) + return sprintf(buf, "%d\n", index); + + return sprintf(buf, "FREE\n"); +} + +#define SHOW_CACHE_DISABLE(slot) \ +static ssize_t \ +show_cache_disable_##slot(struct _cpuid4_info *this_leaf, char *buf, \ + unsigned int cpu) \ +{ \ + return show_cache_disable(this_leaf, buf, slot); \ +} +SHOW_CACHE_DISABLE(0) +SHOW_CACHE_DISABLE(1) + +static void amd_l3_disable_index(struct amd_northbridge *nb, int cpu, + unsigned slot, unsigned long idx) +{ + int i; + + idx |= BIT(30); + + /* + * disable index in all 4 subcaches + */ + for (i = 0; i < 4; i++) { + u32 reg = idx | (i << 20); + + if (!nb->l3_cache.subcaches[i]) + continue; + + pci_write_config_dword(nb->misc, 0x1BC + slot * 4, reg); + + /* + * We need to WBINVD on a core on the node containing the L3 + * cache which indices we disable therefore a simple wbinvd() + * is not sufficient. 
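+		 * (wbinvd_on_cpu() below runs the flush on @cpu, which is a
+		 * core on the right node.)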
+ */ + wbinvd_on_cpu(cpu); + + reg |= BIT(31); + pci_write_config_dword(nb->misc, 0x1BC + slot * 4, reg); + } +} + +/* + * disable a L3 cache index by using a disable-slot + * + * @l3: L3 cache descriptor + * @cpu: A CPU on the node containing the L3 cache + * @slot: slot number (0..1) + * @index: index to disable + * + * @return: 0 on success, error status on failure + */ +int amd_set_l3_disable_slot(struct amd_northbridge *nb, int cpu, unsigned slot, + unsigned long index) +{ + int ret = 0; + + /* check if @slot is already used or the index is already disabled */ + ret = amd_get_l3_disable_slot(nb, slot); + if (ret >= 0) + return -EEXIST; + + if (index > nb->l3_cache.indices) + return -EINVAL; + + /* check whether the other slot has disabled the same index already */ + if (index == amd_get_l3_disable_slot(nb, !slot)) + return -EEXIST; + + amd_l3_disable_index(nb, cpu, slot, index); + + return 0; +} + +static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, + const char *buf, size_t count, + unsigned int slot) +{ + unsigned long val = 0; + int cpu, err = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) + return -EINVAL; + + cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); + + if (strict_strtoul(buf, 10, &val) < 0) + return -EINVAL; + + err = amd_set_l3_disable_slot(this_leaf->base.nb, cpu, slot, val); + if (err) { + if (err == -EEXIST) + pr_warning("L3 slot %d in use/index already disabled!\n", + slot); + return err; + } + return count; +} + +#define STORE_CACHE_DISABLE(slot) \ +static ssize_t \ +store_cache_disable_##slot(struct _cpuid4_info *this_leaf, \ + const char *buf, size_t count, \ + unsigned int cpu) \ +{ \ + return store_cache_disable(this_leaf, buf, count, slot); \ +} +STORE_CACHE_DISABLE(0) +STORE_CACHE_DISABLE(1) + +static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644, + show_cache_disable_0, store_cache_disable_0); +static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, + show_cache_disable_1, store_cache_disable_1); + +static ssize_t +show_subcaches(struct _cpuid4_info *this_leaf, char *buf, unsigned int cpu) +{ + if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + return -EINVAL; + + return sprintf(buf, "%x\n", amd_get_subcaches(cpu)); +} + +static ssize_t +store_subcaches(struct _cpuid4_info *this_leaf, const char *buf, size_t count, + unsigned int cpu) +{ + unsigned long val; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + return -EINVAL; + + if (strict_strtoul(buf, 16, &val) < 0) + return -EINVAL; + + if (amd_set_subcaches(cpu, val)) + return -EINVAL; + + return count; +} + +static struct _cache_attr subcaches = + __ATTR(subcaches, 0644, show_subcaches, store_subcaches); + +#else +#define amd_init_l3_cache(x, y) +#endif /* CONFIG_AMD_NB && CONFIG_SYSFS */ + +static int +cpuid4_cache_lookup_regs(int index, struct _cpuid4_info_regs *this_leaf) +{ + union _cpuid4_leaf_eax eax; + union _cpuid4_leaf_ebx ebx; + union _cpuid4_leaf_ecx ecx; + unsigned edx; + + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { + if (cpu_has_topoext) + cpuid_count(0x8000001d, index, &eax.full, + &ebx.full, &ecx.full, &edx); + else + amd_cpuid4(index, &eax, &ebx, &ecx); + amd_init_l3_cache(this_leaf, index); + } else { + cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); + } + + if (eax.split.type == CACHE_TYPE_NULL) + return -EIO; /* better 
error ? */ + + this_leaf->eax = eax; + this_leaf->ebx = ebx; + this_leaf->ecx = ecx; + this_leaf->size = (ecx.split.number_of_sets + 1) * + (ebx.split.coherency_line_size + 1) * + (ebx.split.physical_line_partition + 1) * + (ebx.split.ways_of_associativity + 1); + return 0; +} + +static int find_num_cache_leaves(struct cpuinfo_x86 *c) +{ + unsigned int eax, ebx, ecx, edx, op; + union _cpuid4_leaf_eax cache_eax; + int i = -1; + + if (c->x86_vendor == X86_VENDOR_AMD) + op = 0x8000001d; + else + op = 4; + + do { + ++i; + /* Do cpuid(op) loop to find out num_cache_leaves */ + cpuid_count(op, i, &eax, &ebx, &ecx, &edx); + cache_eax.full = eax; + } while (cache_eax.split.type != CACHE_TYPE_NULL); + return i; +} + +void init_amd_cacheinfo(struct cpuinfo_x86 *c) +{ + + if (cpu_has_topoext) { + num_cache_leaves = find_num_cache_leaves(c); + } else if (c->extended_cpuid_level >= 0x80000006) { + if (cpuid_edx(0x80000006) & 0xf000) + num_cache_leaves = 4; + else + num_cache_leaves = 3; + } +} + +unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c) +{ + /* Cache sizes */ + unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0; + unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */ + unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */ + unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb; +#ifdef CONFIG_X86_HT + unsigned int cpu = c->cpu_index; +#endif + + if (c->cpuid_level > 3) { + static int is_initialized; + + if (is_initialized == 0) { + /* Init num_cache_leaves from boot CPU */ + num_cache_leaves = find_num_cache_leaves(c); + is_initialized++; + } + + /* + * Whenever possible use cpuid(4), deterministic cache + * parameters cpuid leaf to find the cache details + */ + for (i = 0; i < num_cache_leaves; i++) { + struct _cpuid4_info_regs this_leaf = {}; + int retval; + + retval = cpuid4_cache_lookup_regs(i, &this_leaf); + if (retval < 0) + continue; + + switch (this_leaf.eax.split.level) { + case 1: + if (this_leaf.eax.split.type == CACHE_TYPE_DATA) + new_l1d = this_leaf.size/1024; + else if (this_leaf.eax.split.type == CACHE_TYPE_INST) + new_l1i = this_leaf.size/1024; + break; + case 2: + new_l2 = this_leaf.size/1024; + num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; + index_msb = get_count_order(num_threads_sharing); + l2_id = c->apicid & ~((1 << index_msb) - 1); + break; + case 3: + new_l3 = this_leaf.size/1024; + num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; + index_msb = get_count_order(num_threads_sharing); + l3_id = c->apicid & ~((1 << index_msb) - 1); + break; + default: + break; + } + } + } + /* + * Don't use cpuid2 if cpuid4 is supported. 
For P4, we use cpuid2 for + * trace cache + */ + if ((num_cache_leaves == 0 || c->x86 == 15) && c->cpuid_level > 1) { + /* supports eax=2 call */ + int j, n; + unsigned int regs[4]; + unsigned char *dp = (unsigned char *)regs; + int only_trace = 0; + + if (num_cache_leaves != 0 && c->x86 == 15) + only_trace = 1; + + /* Number of times to iterate */ + n = cpuid_eax(2) & 0xFF; + + for (i = 0 ; i < n ; i++) { + cpuid(2, ®s[0], ®s[1], ®s[2], ®s[3]); + + /* If bit 31 is set, this is an unknown format */ + for (j = 0 ; j < 3 ; j++) + if (regs[j] & (1 << 31)) + regs[j] = 0; + + /* Byte 0 is level count, not a descriptor */ + for (j = 1 ; j < 16 ; j++) { + unsigned char des = dp[j]; + unsigned char k = 0; + + /* look up this descriptor in the table */ + while (cache_table[k].descriptor != 0) { + if (cache_table[k].descriptor == des) { + if (only_trace && cache_table[k].cache_type != LVL_TRACE) + break; + switch (cache_table[k].cache_type) { + case LVL_1_INST: + l1i += cache_table[k].size; + break; + case LVL_1_DATA: + l1d += cache_table[k].size; + break; + case LVL_2: + l2 += cache_table[k].size; + break; + case LVL_3: + l3 += cache_table[k].size; + break; + case LVL_TRACE: + trace += cache_table[k].size; + break; + } + + break; + } + + k++; + } + } + } + } + + if (new_l1d) + l1d = new_l1d; + + if (new_l1i) + l1i = new_l1i; + + if (new_l2) { + l2 = new_l2; +#ifdef CONFIG_X86_HT + per_cpu(cpu_llc_id, cpu) = l2_id; +#endif + } + + if (new_l3) { + l3 = new_l3; +#ifdef CONFIG_X86_HT + per_cpu(cpu_llc_id, cpu) = l3_id; +#endif + } + +#ifdef CONFIG_X86_HT + /* + * If cpu_llc_id is not yet set, this means cpuid_level < 4 which in + * turns means that the only possibility is SMT (as indicated in + * cpuid1). Since cpuid2 doesn't specify shared caches, and we know + * that SMT shares all caches, we can unconditionally set cpu_llc_id to + * c->phys_proc_id. + */ + if (per_cpu(cpu_llc_id, cpu) == BAD_APICID) + per_cpu(cpu_llc_id, cpu) = c->phys_proc_id; +#endif + + c->x86_cache_size = l3 ? l3 : (l2 ? 
l2 : (l1i+l1d)); + + return l2; +} + +#ifdef CONFIG_SYSFS + +/* pointer to _cpuid4_info array (for each cache leaf) */ +static DEFINE_PER_CPU(struct _cpuid4_info *, ici_cpuid4_info); +#define CPUID4_INFO_IDX(x, y) (&((per_cpu(ici_cpuid4_info, x))[y])) + +#ifdef CONFIG_SMP + +static int cache_shared_amd_cpu_map_setup(unsigned int cpu, int index) +{ + struct _cpuid4_info *this_leaf; + int i, sibling; + + if (cpu_has_topoext) { + unsigned int apicid, nshared, first, last; + + if (!per_cpu(ici_cpuid4_info, cpu)) + return 0; + + this_leaf = CPUID4_INFO_IDX(cpu, index); + nshared = this_leaf->base.eax.split.num_threads_sharing + 1; + apicid = cpu_data(cpu).apicid; + first = apicid - (apicid % nshared); + last = first + nshared - 1; + + for_each_online_cpu(i) { + apicid = cpu_data(i).apicid; + if ((apicid < first) || (apicid > last)) + continue; + if (!per_cpu(ici_cpuid4_info, i)) + continue; + this_leaf = CPUID4_INFO_IDX(i, index); + + for_each_online_cpu(sibling) { + apicid = cpu_data(sibling).apicid; + if ((apicid < first) || (apicid > last)) + continue; + set_bit(sibling, this_leaf->shared_cpu_map); + } + } + } else if (index == 3) { + for_each_cpu(i, cpu_llc_shared_mask(cpu)) { + if (!per_cpu(ici_cpuid4_info, i)) + continue; + this_leaf = CPUID4_INFO_IDX(i, index); + for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) { + if (!cpu_online(sibling)) + continue; + set_bit(sibling, this_leaf->shared_cpu_map); + } + } + } else + return 0; + + return 1; +} + +static void cache_shared_cpu_map_setup(unsigned int cpu, int index) +{ + struct _cpuid4_info *this_leaf, *sibling_leaf; + unsigned long num_threads_sharing; + int index_msb, i; + struct cpuinfo_x86 *c = &cpu_data(cpu); + + if (c->x86_vendor == X86_VENDOR_AMD) { + if (cache_shared_amd_cpu_map_setup(cpu, index)) + return; + } + + this_leaf = CPUID4_INFO_IDX(cpu, index); + num_threads_sharing = 1 + this_leaf->base.eax.split.num_threads_sharing; + + if (num_threads_sharing == 1) + cpumask_set_cpu(cpu, to_cpumask(this_leaf->shared_cpu_map)); + else { + index_msb = get_count_order(num_threads_sharing); + + for_each_online_cpu(i) { + if (cpu_data(i).apicid >> index_msb == + c->apicid >> index_msb) { + cpumask_set_cpu(i, + to_cpumask(this_leaf->shared_cpu_map)); + if (i != cpu && per_cpu(ici_cpuid4_info, i)) { + sibling_leaf = + CPUID4_INFO_IDX(i, index); + cpumask_set_cpu(cpu, to_cpumask( + sibling_leaf->shared_cpu_map)); + } + } + } + } +} +static void cache_remove_shared_cpu_map(unsigned int cpu, int index) +{ + struct _cpuid4_info *this_leaf, *sibling_leaf; + int sibling; + + this_leaf = CPUID4_INFO_IDX(cpu, index); + for_each_cpu(sibling, to_cpumask(this_leaf->shared_cpu_map)) { + sibling_leaf = CPUID4_INFO_IDX(sibling, index); + cpumask_clear_cpu(cpu, + to_cpumask(sibling_leaf->shared_cpu_map)); + } +} +#else +static void cache_shared_cpu_map_setup(unsigned int cpu, int index) +{ +} + +static void cache_remove_shared_cpu_map(unsigned int cpu, int index) +{ +} +#endif + +static void free_cache_attributes(unsigned int cpu) +{ + int i; + + for (i = 0; i < num_cache_leaves; i++) + cache_remove_shared_cpu_map(cpu, i); + + kfree(per_cpu(ici_cpuid4_info, cpu)); + per_cpu(ici_cpuid4_info, cpu) = NULL; +} + +static void get_cpu_leaves(void *_retval) +{ + int j, *retval = _retval, cpu = smp_processor_id(); + + /* Do cpuid and store the results */ + for (j = 0; j < num_cache_leaves; j++) { + struct _cpuid4_info *this_leaf = CPUID4_INFO_IDX(cpu, j); + + *retval = cpuid4_cache_lookup_regs(j, &this_leaf->base); + if (unlikely(*retval < 0)) { + int i; + + for (i 
= 0; i < j; i++) + cache_remove_shared_cpu_map(cpu, i); + break; + } + cache_shared_cpu_map_setup(cpu, j); + } +} + +static int detect_cache_attributes(unsigned int cpu) +{ + int retval; + + if (num_cache_leaves == 0) + return -ENOENT; + + per_cpu(ici_cpuid4_info, cpu) = kzalloc( + sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL); + if (per_cpu(ici_cpuid4_info, cpu) == NULL) + return -ENOMEM; + + smp_call_function_single(cpu, get_cpu_leaves, &retval, true); + if (retval) { + kfree(per_cpu(ici_cpuid4_info, cpu)); + per_cpu(ici_cpuid4_info, cpu) = NULL; + } + + return retval; +} + +#include <linux/kobject.h> +#include <linux/sysfs.h> +#include <linux/cpu.h> + +/* pointer to kobject for cpuX/cache */ +static DEFINE_PER_CPU(struct kobject *, ici_cache_kobject); + +struct _index_kobject { + struct kobject kobj; + unsigned int cpu; + unsigned short index; +}; + +/* pointer to array of kobjects for cpuX/cache/indexY */ +static DEFINE_PER_CPU(struct _index_kobject *, ici_index_kobject); +#define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(ici_index_kobject, x))[y])) + +#define show_one_plus(file_name, object, val) \ +static ssize_t show_##file_name(struct _cpuid4_info *this_leaf, char *buf, \ + unsigned int cpu) \ +{ \ + return sprintf(buf, "%lu\n", (unsigned long)this_leaf->object + val); \ +} + +show_one_plus(level, base.eax.split.level, 0); +show_one_plus(coherency_line_size, base.ebx.split.coherency_line_size, 1); +show_one_plus(physical_line_partition, base.ebx.split.physical_line_partition, 1); +show_one_plus(ways_of_associativity, base.ebx.split.ways_of_associativity, 1); +show_one_plus(number_of_sets, base.ecx.split.number_of_sets, 1); + +static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf, + unsigned int cpu) +{ + return sprintf(buf, "%luK\n", this_leaf->base.size / 1024); +} + +static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf, + int type, char *buf) +{ + ptrdiff_t len = PTR_ALIGN(buf + PAGE_SIZE - 1, PAGE_SIZE) - buf; + int n = 0; + + if (len > 1) { + const struct cpumask *mask; + + mask = to_cpumask(this_leaf->shared_cpu_map); + n = type ? 
+ cpulist_scnprintf(buf, len-2, mask) : + cpumask_scnprintf(buf, len-2, mask); + buf[n++] = '\n'; + buf[n] = '\0'; + } + return n; +} + +static inline ssize_t show_shared_cpu_map(struct _cpuid4_info *leaf, char *buf, + unsigned int cpu) +{ + return show_shared_cpu_map_func(leaf, 0, buf); +} + +static inline ssize_t show_shared_cpu_list(struct _cpuid4_info *leaf, char *buf, + unsigned int cpu) +{ + return show_shared_cpu_map_func(leaf, 1, buf); +} + +static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf, + unsigned int cpu) +{ + switch (this_leaf->base.eax.split.type) { + case CACHE_TYPE_DATA: + return sprintf(buf, "Data\n"); + case CACHE_TYPE_INST: + return sprintf(buf, "Instruction\n"); + case CACHE_TYPE_UNIFIED: + return sprintf(buf, "Unified\n"); + default: + return sprintf(buf, "Unknown\n"); + } +} + +#define to_object(k) container_of(k, struct _index_kobject, kobj) +#define to_attr(a) container_of(a, struct _cache_attr, attr) + +#define define_one_ro(_name) \ +static struct _cache_attr _name = \ + __ATTR(_name, 0444, show_##_name, NULL) + +define_one_ro(level); +define_one_ro(type); +define_one_ro(coherency_line_size); +define_one_ro(physical_line_partition); +define_one_ro(ways_of_associativity); +define_one_ro(number_of_sets); +define_one_ro(size); +define_one_ro(shared_cpu_map); +define_one_ro(shared_cpu_list); + +static struct attribute *default_attrs[] = { + &type.attr, + &level.attr, + &coherency_line_size.attr, + &physical_line_partition.attr, + &ways_of_associativity.attr, + &number_of_sets.attr, + &size.attr, + &shared_cpu_map.attr, + &shared_cpu_list.attr, + NULL +}; + +#ifdef CONFIG_AMD_NB +static struct attribute **amd_l3_attrs(void) +{ + static struct attribute **attrs; + int n; + + if (attrs) + return attrs; + + n = ARRAY_SIZE(default_attrs); + + if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) + n += 2; + + if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + n += 1; + + attrs = kzalloc(n * sizeof (struct attribute *), GFP_KERNEL); + if (attrs == NULL) + return attrs = default_attrs; + + for (n = 0; default_attrs[n]; n++) + attrs[n] = default_attrs[n]; + + if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) { + attrs[n++] = &cache_disable_0.attr; + attrs[n++] = &cache_disable_1.attr; + } + + if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + attrs[n++] = &subcaches.attr; + + return attrs; +} +#endif + +static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) +{ + struct _cache_attr *fattr = to_attr(attr); + struct _index_kobject *this_leaf = to_object(kobj); + ssize_t ret; + + ret = fattr->show ? + fattr->show(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index), + buf, this_leaf->cpu) : + 0; + return ret; +} + +static ssize_t store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + struct _cache_attr *fattr = to_attr(attr); + struct _index_kobject *this_leaf = to_object(kobj); + ssize_t ret; + + ret = fattr->store ? 
+ fattr->store(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index), + buf, count, this_leaf->cpu) : + 0; + return ret; +} + +static const struct sysfs_ops sysfs_ops = { + .show = show, + .store = store, +}; + +static struct kobj_type ktype_cache = { + .sysfs_ops = &sysfs_ops, + .default_attrs = default_attrs, +}; + +static struct kobj_type ktype_percpu_entry = { + .sysfs_ops = &sysfs_ops, +}; + +static void cpuid4_cache_sysfs_exit(unsigned int cpu) +{ + kfree(per_cpu(ici_cache_kobject, cpu)); + kfree(per_cpu(ici_index_kobject, cpu)); + per_cpu(ici_cache_kobject, cpu) = NULL; + per_cpu(ici_index_kobject, cpu) = NULL; + free_cache_attributes(cpu); +} + +static int cpuid4_cache_sysfs_init(unsigned int cpu) +{ + int err; + + if (num_cache_leaves == 0) + return -ENOENT; + + err = detect_cache_attributes(cpu); + if (err) + return err; + + /* Allocate all required memory */ + per_cpu(ici_cache_kobject, cpu) = + kzalloc(sizeof(struct kobject), GFP_KERNEL); + if (unlikely(per_cpu(ici_cache_kobject, cpu) == NULL)) + goto err_out; + + per_cpu(ici_index_kobject, cpu) = kzalloc( + sizeof(struct _index_kobject) * num_cache_leaves, GFP_KERNEL); + if (unlikely(per_cpu(ici_index_kobject, cpu) == NULL)) + goto err_out; + + return 0; + +err_out: + cpuid4_cache_sysfs_exit(cpu); + return -ENOMEM; +} + +static DECLARE_BITMAP(cache_dev_map, NR_CPUS); + +/* Add/Remove cache interface for CPU device */ +static int cache_add_dev(struct device *dev) +{ + unsigned int cpu = dev->id; + unsigned long i, j; + struct _index_kobject *this_object; + struct _cpuid4_info *this_leaf; + int retval; + + retval = cpuid4_cache_sysfs_init(cpu); + if (unlikely(retval < 0)) + return retval; + + retval = kobject_init_and_add(per_cpu(ici_cache_kobject, cpu), + &ktype_percpu_entry, + &dev->kobj, "%s", "cache"); + if (retval < 0) { + cpuid4_cache_sysfs_exit(cpu); + return retval; + } + + for (i = 0; i < num_cache_leaves; i++) { + this_object = INDEX_KOBJECT_PTR(cpu, i); + this_object->cpu = cpu; + this_object->index = i; + + this_leaf = CPUID4_INFO_IDX(cpu, i); + + ktype_cache.default_attrs = default_attrs; +#ifdef CONFIG_AMD_NB + if (this_leaf->base.nb) + ktype_cache.default_attrs = amd_l3_attrs(); +#endif + retval = kobject_init_and_add(&(this_object->kobj), + &ktype_cache, + per_cpu(ici_cache_kobject, cpu), + "index%1lu", i); + if (unlikely(retval)) { + for (j = 0; j < i; j++) + kobject_put(&(INDEX_KOBJECT_PTR(cpu, j)->kobj)); + kobject_put(per_cpu(ici_cache_kobject, cpu)); + cpuid4_cache_sysfs_exit(cpu); + return retval; + } + kobject_uevent(&(this_object->kobj), KOBJ_ADD); + } + cpumask_set_cpu(cpu, to_cpumask(cache_dev_map)); + + kobject_uevent(per_cpu(ici_cache_kobject, cpu), KOBJ_ADD); + return 0; +} + +static void cache_remove_dev(struct device *dev) +{ + unsigned int cpu = dev->id; + unsigned long i; + + if (per_cpu(ici_cpuid4_info, cpu) == NULL) + return; + if (!cpumask_test_cpu(cpu, to_cpumask(cache_dev_map))) + return; + cpumask_clear_cpu(cpu, to_cpumask(cache_dev_map)); + + for (i = 0; i < num_cache_leaves; i++) + kobject_put(&(INDEX_KOBJECT_PTR(cpu, i)->kobj)); + kobject_put(per_cpu(ici_cache_kobject, cpu)); + cpuid4_cache_sysfs_exit(cpu); +} + +static int cacheinfo_cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + struct device *dev; + + dev = get_cpu_device(cpu); + switch (action) { + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + cache_add_dev(dev); + break; + case CPU_DEAD: + case CPU_DEAD_FROZEN: + cache_remove_dev(dev); + break; + } + return 
NOTIFY_OK; +} + +static struct notifier_block cacheinfo_cpu_notifier = { + .notifier_call = cacheinfo_cpu_callback, +}; + +static int __init cache_sysfs_init(void) +{ + int i, err = 0; + + if (num_cache_leaves == 0) + return 0; + + cpu_notifier_register_begin(); + for_each_online_cpu(i) { + struct device *dev = get_cpu_device(i); + + err = cache_add_dev(dev); + if (err) + goto out; + } + __register_hotcpu_notifier(&cacheinfo_cpu_notifier); + +out: + cpu_notifier_register_done(); + return err; +} + +device_initcall(cache_sysfs_init); + +#endif diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c new file mode 100644 index 00000000000..afa9f0d487e --- /dev/null +++ b/arch/x86/kernel/cpu/match.c @@ -0,0 +1,49 @@ +#include <asm/cpu_device_id.h> +#include <asm/processor.h> +#include <linux/cpu.h> +#include <linux/module.h> +#include <linux/slab.h> + +/** + * x86_match_cpu - match current CPU again an array of x86_cpu_ids + * @match: Pointer to array of x86_cpu_ids. Last entry terminated with + * {}. + * + * Return the entry if the current CPU matches the entries in the + * passed x86_cpu_id match table. Otherwise NULL. The match table + * contains vendor (X86_VENDOR_*), family, model and feature bits or + * respective wildcard entries. + * + * A typical table entry would be to match a specific CPU + * { X86_VENDOR_INTEL, 6, 0x12 } + * or to match a specific CPU feature + * { X86_FEATURE_MATCH(X86_FEATURE_FOOBAR) } + * + * Fields can be wildcarded with %X86_VENDOR_ANY, %X86_FAMILY_ANY, + * %X86_MODEL_ANY, %X86_FEATURE_ANY or 0 (except for vendor) + * + * Arrays used to match for this should also be declared using + * MODULE_DEVICE_TABLE(x86cpu, ...) + * + * This always matches against the boot cpu, assuming models and features are + * consistent over all CPUs. + */ +const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match) +{ + const struct x86_cpu_id *m; + struct cpuinfo_x86 *c = &boot_cpu_data; + + for (m = match; m->vendor | m->family | m->model | m->feature; m++) { + if (m->vendor != X86_VENDOR_ANY && c->x86_vendor != m->vendor) + continue; + if (m->family != X86_FAMILY_ANY && c->x86 != m->family) + continue; + if (m->model != X86_MODEL_ANY && c->x86_model != m->model) + continue; + if (m->feature != X86_FEATURE_ANY && !cpu_has(c, m->feature)) + continue; + return m; + } + return NULL; +} +EXPORT_SYMBOL(x86_match_cpu); diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile new file mode 100644 index 00000000000..bb34b03af25 --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -0,0 +1,11 @@ +obj-y = mce.o mce-severity.o + +obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o +obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o +obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o +obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o +obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o + +obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o + +obj-$(CONFIG_ACPI_APEI) += mce-apei.o diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c new file mode 100644 index 00000000000..a1aef953315 --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c @@ -0,0 +1,155 @@ +/* + * Bridge between MCE and APEI + * + * On some machine, corrected memory errors are reported via APEI + * generic hardware error source (GHES) instead of corrected Machine + * Check. 
These corrected memory errors can be reported to user space + * through /dev/mcelog via faking a corrected Machine Check, so that + * the error memory page can be offlined by /sbin/mcelog if the error + * count for one page is beyond the threshold. + * + * For fatal MCE, save MCE record into persistent storage via ERST, so + * that the MCE record can be logged after reboot via ERST. + * + * Copyright 2010 Intel Corp. + * Author: Huang Ying <ying.huang@intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/export.h> +#include <linux/kernel.h> +#include <linux/acpi.h> +#include <linux/cper.h> +#include <acpi/apei.h> +#include <acpi/ghes.h> +#include <asm/mce.h> + +#include "mce-internal.h" + +void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err) +{ + struct mce m; + + if (!(mem_err->validation_bits & CPER_MEM_VALID_PA)) + return; + + mce_setup(&m); + m.bank = 1; + /* Fake a memory read error with unknown channel */ + m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f; + + if (severity >= GHES_SEV_RECOVERABLE) + m.status |= MCI_STATUS_UC; + if (severity >= GHES_SEV_PANIC) + m.status |= MCI_STATUS_PCC; + + m.addr = mem_err->physical_addr; + mce_log(&m); + mce_notify_irq(); +} +EXPORT_SYMBOL_GPL(apei_mce_report_mem_error); + +#define CPER_CREATOR_MCE \ + UUID_LE(0x75a574e3, 0x5052, 0x4b29, 0x8a, 0x8e, 0xbe, 0x2c, \ + 0x64, 0x90, 0xb8, 0x9d) +#define CPER_SECTION_TYPE_MCE \ + UUID_LE(0xfe08ffbe, 0x95e4, 0x4be7, 0xbc, 0x73, 0x40, 0x96, \ + 0x04, 0x4a, 0x38, 0xfc) + +/* + * CPER specification (in UEFI specification 2.3 appendix N) requires + * byte-packed. 
+ */ +struct cper_mce_record { + struct cper_record_header hdr; + struct cper_section_descriptor sec_hdr; + struct mce mce; +} __packed; + +int apei_write_mce(struct mce *m) +{ + struct cper_mce_record rcd; + + memset(&rcd, 0, sizeof(rcd)); + memcpy(rcd.hdr.signature, CPER_SIG_RECORD, CPER_SIG_SIZE); + rcd.hdr.revision = CPER_RECORD_REV; + rcd.hdr.signature_end = CPER_SIG_END; + rcd.hdr.section_count = 1; + rcd.hdr.error_severity = CPER_SEV_FATAL; + /* timestamp, platform_id, partition_id are all invalid */ + rcd.hdr.validation_bits = 0; + rcd.hdr.record_length = sizeof(rcd); + rcd.hdr.creator_id = CPER_CREATOR_MCE; + rcd.hdr.notification_type = CPER_NOTIFY_MCE; + rcd.hdr.record_id = cper_next_record_id(); + rcd.hdr.flags = CPER_HW_ERROR_FLAGS_PREVERR; + + rcd.sec_hdr.section_offset = (void *)&rcd.mce - (void *)&rcd; + rcd.sec_hdr.section_length = sizeof(rcd.mce); + rcd.sec_hdr.revision = CPER_SEC_REV; + /* fru_id and fru_text is invalid */ + rcd.sec_hdr.validation_bits = 0; + rcd.sec_hdr.flags = CPER_SEC_PRIMARY; + rcd.sec_hdr.section_type = CPER_SECTION_TYPE_MCE; + rcd.sec_hdr.section_severity = CPER_SEV_FATAL; + + memcpy(&rcd.mce, m, sizeof(*m)); + + return erst_write(&rcd.hdr); +} + +ssize_t apei_read_mce(struct mce *m, u64 *record_id) +{ + struct cper_mce_record rcd; + int rc, pos; + + rc = erst_get_record_id_begin(&pos); + if (rc) + return rc; +retry: + rc = erst_get_record_id_next(&pos, record_id); + if (rc) + goto out; + /* no more record */ + if (*record_id == APEI_ERST_INVALID_RECORD_ID) + goto out; + rc = erst_read(*record_id, &rcd.hdr, sizeof(rcd)); + /* someone else has cleared the record, try next one */ + if (rc == -ENOENT) + goto retry; + else if (rc < 0) + goto out; + /* try to skip other type records in storage */ + else if (rc != sizeof(rcd) || + uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE)) + goto retry; + memcpy(m, &rcd.mce, sizeof(*m)); + rc = sizeof(*m); +out: + erst_get_record_id_end(); + + return rc; +} + +/* Check whether there is record in ERST */ +int apei_check_mce(void) +{ + return erst_get_record_count(); +} + +int apei_clear_mce(u64 record_id) +{ + return erst_clear(record_id); +} diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c new file mode 100644 index 00000000000..5ac2d1fb28b --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -0,0 +1,256 @@ +/* + * Machine check injection support. + * Copyright 2008 Intel Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + * + * Authors: + * Andi Kleen + * Ying Huang + */ +#include <linux/uaccess.h> +#include <linux/module.h> +#include <linux/timer.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/fs.h> +#include <linux/preempt.h> +#include <linux/smp.h> +#include <linux/notifier.h> +#include <linux/kdebug.h> +#include <linux/cpu.h> +#include <linux/sched.h> +#include <linux/gfp.h> +#include <asm/mce.h> +#include <asm/apic.h> +#include <asm/nmi.h> + +/* Update fake mce registers on current CPU. 
*/ +static void inject_mce(struct mce *m) +{ + struct mce *i = &per_cpu(injectm, m->extcpu); + + /* Make sure no one reads partially written injectm */ + i->finished = 0; + mb(); + m->finished = 0; + /* First set the fields after finished */ + i->extcpu = m->extcpu; + mb(); + /* Now write record in order, finished last (except above) */ + memcpy(i, m, sizeof(struct mce)); + /* Finally activate it */ + mb(); + i->finished = 1; +} + +static void raise_poll(struct mce *m) +{ + unsigned long flags; + mce_banks_t b; + + memset(&b, 0xff, sizeof(mce_banks_t)); + local_irq_save(flags); + machine_check_poll(0, &b); + local_irq_restore(flags); + m->finished = 0; +} + +static void raise_exception(struct mce *m, struct pt_regs *pregs) +{ + struct pt_regs regs; + unsigned long flags; + + if (!pregs) { + memset(®s, 0, sizeof(struct pt_regs)); + regs.ip = m->ip; + regs.cs = m->cs; + pregs = ®s; + } + /* in mcheck exeception handler, irq will be disabled */ + local_irq_save(flags); + do_machine_check(pregs, 0); + local_irq_restore(flags); + m->finished = 0; +} + +static cpumask_var_t mce_inject_cpumask; +static DEFINE_MUTEX(mce_inject_mutex); + +static int mce_raise_notify(unsigned int cmd, struct pt_regs *regs) +{ + int cpu = smp_processor_id(); + struct mce *m = &__get_cpu_var(injectm); + if (!cpumask_test_cpu(cpu, mce_inject_cpumask)) + return NMI_DONE; + cpumask_clear_cpu(cpu, mce_inject_cpumask); + if (m->inject_flags & MCJ_EXCEPTION) + raise_exception(m, regs); + else if (m->status) + raise_poll(m); + return NMI_HANDLED; +} + +static void mce_irq_ipi(void *info) +{ + int cpu = smp_processor_id(); + struct mce *m = &__get_cpu_var(injectm); + + if (cpumask_test_cpu(cpu, mce_inject_cpumask) && + m->inject_flags & MCJ_EXCEPTION) { + cpumask_clear_cpu(cpu, mce_inject_cpumask); + raise_exception(m, NULL); + } +} + +/* Inject mce on current CPU */ +static int raise_local(void) +{ + struct mce *m = &__get_cpu_var(injectm); + int context = MCJ_CTX(m->inject_flags); + int ret = 0; + int cpu = m->extcpu; + + if (m->inject_flags & MCJ_EXCEPTION) { + printk(KERN_INFO "Triggering MCE exception on CPU %d\n", cpu); + switch (context) { + case MCJ_CTX_IRQ: + /* + * Could do more to fake interrupts like + * calling irq_enter, but the necessary + * machinery isn't exported currently. 
+ */ + /*FALL THROUGH*/ + case MCJ_CTX_PROCESS: + raise_exception(m, NULL); + break; + default: + printk(KERN_INFO "Invalid MCE context\n"); + ret = -EINVAL; + } + printk(KERN_INFO "MCE exception done on CPU %d\n", cpu); + } else if (m->status) { + printk(KERN_INFO "Starting machine check poll CPU %d\n", cpu); + raise_poll(m); + mce_notify_irq(); + printk(KERN_INFO "Machine check poll done on CPU %d\n", cpu); + } else + m->finished = 0; + + return ret; +} + +static void raise_mce(struct mce *m) +{ + int context = MCJ_CTX(m->inject_flags); + + inject_mce(m); + + if (context == MCJ_CTX_RANDOM) + return; + +#ifdef CONFIG_X86_LOCAL_APIC + if (m->inject_flags & (MCJ_IRQ_BROADCAST | MCJ_NMI_BROADCAST)) { + unsigned long start; + int cpu; + + get_online_cpus(); + cpumask_copy(mce_inject_cpumask, cpu_online_mask); + cpumask_clear_cpu(get_cpu(), mce_inject_cpumask); + for_each_online_cpu(cpu) { + struct mce *mcpu = &per_cpu(injectm, cpu); + if (!mcpu->finished || + MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM) + cpumask_clear_cpu(cpu, mce_inject_cpumask); + } + if (!cpumask_empty(mce_inject_cpumask)) { + if (m->inject_flags & MCJ_IRQ_BROADCAST) { + /* + * don't wait because mce_irq_ipi is necessary + * to be sync with following raise_local + */ + preempt_disable(); + smp_call_function_many(mce_inject_cpumask, + mce_irq_ipi, NULL, 0); + preempt_enable(); + } else if (m->inject_flags & MCJ_NMI_BROADCAST) + apic->send_IPI_mask(mce_inject_cpumask, + NMI_VECTOR); + } + start = jiffies; + while (!cpumask_empty(mce_inject_cpumask)) { + if (!time_before(jiffies, start + 2*HZ)) { + printk(KERN_ERR + "Timeout waiting for mce inject %lx\n", + *cpumask_bits(mce_inject_cpumask)); + break; + } + cpu_relax(); + } + raise_local(); + put_cpu(); + put_online_cpus(); + } else +#endif + { + preempt_disable(); + raise_local(); + preempt_enable(); + } +} + +/* Error injection interface */ +static ssize_t mce_write(struct file *filp, const char __user *ubuf, + size_t usize, loff_t *off) +{ + struct mce m; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + /* + * There are some cases where real MSR reads could slip + * through. + */ + if (!boot_cpu_has(X86_FEATURE_MCE) || !boot_cpu_has(X86_FEATURE_MCA)) + return -EIO; + + if ((unsigned long)usize > sizeof(struct mce)) + usize = sizeof(struct mce); + if (copy_from_user(&m, ubuf, usize)) + return -EFAULT; + + if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu)) + return -EINVAL; + + /* + * Need to give user space some time to set everything up, + * so do it a jiffie or two later everywhere. + */ + schedule_timeout(2); + + mutex_lock(&mce_inject_mutex); + raise_mce(&m); + mutex_unlock(&mce_inject_mutex); + return usize; +} + +static int inject_init(void) +{ + if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL)) + return -ENOMEM; + printk(KERN_INFO "Machine check injector initialized\n"); + register_mce_write_callback(mce_write); + register_nmi_handler(NMI_LOCAL, mce_raise_notify, 0, + "mce_notify"); + return 0; +} + +module_init(inject_init); +/* + * Cannot tolerate unloading currently because we cannot + * guarantee all openers of mce_chrdev will get a reference to us. 
+ */ +MODULE_LICENSE("GPL"); diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h new file mode 100644 index 00000000000..09edd0b65fe --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -0,0 +1,66 @@ +#include <linux/device.h> +#include <asm/mce.h> + +enum severity_level { + MCE_NO_SEVERITY, + MCE_KEEP_SEVERITY, + MCE_SOME_SEVERITY, + MCE_AO_SEVERITY, + MCE_UC_SEVERITY, + MCE_AR_SEVERITY, + MCE_PANIC_SEVERITY, +}; + +#define ATTR_LEN 16 + +/* One object for each MCE bank, shared by all CPUs */ +struct mce_bank { + u64 ctl; /* subevents to enable */ + unsigned char init; /* initialise bank? */ + struct device_attribute attr; /* device attribute */ + char attrname[ATTR_LEN]; /* attribute name */ +}; + +int mce_severity(struct mce *a, int tolerant, char **msg); +struct dentry *mce_get_debugfs_dir(void); + +extern struct mce_bank *mce_banks; +extern mce_banks_t mce_banks_ce_disabled; + +#ifdef CONFIG_X86_MCE_INTEL +unsigned long mce_intel_adjust_timer(unsigned long interval); +void mce_intel_cmci_poll(void); +void mce_intel_hcpu_update(unsigned long cpu); +void cmci_disable_bank(int bank); +#else +# define mce_intel_adjust_timer mce_adjust_timer_default +static inline void mce_intel_cmci_poll(void) { } +static inline void mce_intel_hcpu_update(unsigned long cpu) { } +static inline void cmci_disable_bank(int bank) { } +#endif + +void mce_timer_kick(unsigned long interval); + +#ifdef CONFIG_ACPI_APEI +int apei_write_mce(struct mce *m); +ssize_t apei_read_mce(struct mce *m, u64 *record_id); +int apei_check_mce(void); +int apei_clear_mce(u64 record_id); +#else +static inline int apei_write_mce(struct mce *m) +{ + return -EINVAL; +} +static inline ssize_t apei_read_mce(struct mce *m, u64 *record_id) +{ + return 0; +} +static inline int apei_check_mce(void) +{ + return 0; +} +static inline int apei_clear_mce(u64 record_id) +{ + return -EINVAL; +} +#endif diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c new file mode 100644 index 00000000000..c370e1c4468 --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -0,0 +1,283 @@ +/* + * MCE grading rules. + * Copyright 2008, 2009 Intel Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + * + * Author: Andi Kleen + */ +#include <linux/kernel.h> +#include <linux/seq_file.h> +#include <linux/init.h> +#include <linux/debugfs.h> +#include <asm/mce.h> + +#include "mce-internal.h" + +/* + * Grade an mce by severity. In general the most severe ones are processed + * first. Since there are quite a lot of combinations test the bits in a + * table-driven way. The rules are simply processed in order, first + * match wins. + * + * Note this is only used for machine check exceptions, the corrected + * errors use much simpler rules. The exceptions still check for the corrected + * errors, but only to leave them alone for the CMCI handler (except for + * panic situations) + */ + +enum context { IN_KERNEL = 1, IN_USER = 2 }; +enum ser { SER_REQUIRED = 1, NO_SER = 2 }; + +static struct severity { + u64 mask; + u64 result; + unsigned char sev; + unsigned char mcgmask; + unsigned char mcgres; + unsigned char ser; + unsigned char context; + unsigned char covered; + char *msg; +} severities[] = { +#define MCESEV(s, m, c...) 
{ .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c } +#define KERNEL .context = IN_KERNEL +#define USER .context = IN_USER +#define SER .ser = SER_REQUIRED +#define NOSER .ser = NO_SER +#define BITCLR(x) .mask = x, .result = 0 +#define BITSET(x) .mask = x, .result = x +#define MCGMASK(x, y) .mcgmask = x, .mcgres = y +#define MASK(x, y) .mask = x, .result = y +#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S) +#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR) +#define MCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV) + + MCESEV( + NO, "Invalid", + BITCLR(MCI_STATUS_VAL) + ), + MCESEV( + NO, "Not enabled", + BITCLR(MCI_STATUS_EN) + ), + MCESEV( + PANIC, "Processor context corrupt", + BITSET(MCI_STATUS_PCC) + ), + /* When MCIP is not set something is very confused */ + MCESEV( + PANIC, "MCIP not set in MCA handler", + MCGMASK(MCG_STATUS_MCIP, 0) + ), + /* Neither return not error IP -- no chance to recover -> PANIC */ + MCESEV( + PANIC, "Neither restart nor error IP", + MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0) + ), + MCESEV( + PANIC, "In kernel and no restart IP", + KERNEL, MCGMASK(MCG_STATUS_RIPV, 0) + ), + MCESEV( + KEEP, "Corrected error", + NOSER, BITCLR(MCI_STATUS_UC) + ), + + /* ignore OVER for UCNA */ + MCESEV( + KEEP, "Uncorrected no action required", + SER, MASK(MCI_UC_SAR, MCI_STATUS_UC) + ), + MCESEV( + PANIC, "Illegal combination (UCNA with AR=1)", + SER, + MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR) + ), + MCESEV( + KEEP, "Non signalled machine check", + SER, BITCLR(MCI_STATUS_S) + ), + + MCESEV( + PANIC, "Action required with lost events", + SER, BITSET(MCI_STATUS_OVER|MCI_UC_SAR) + ), + + /* known AR MCACODs: */ +#ifdef CONFIG_MEMORY_FAILURE + MCESEV( + KEEP, "Action required but unaffected thread is continuable", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR, MCI_UC_SAR|MCI_ADDR), + MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, MCG_STATUS_RIPV) + ), + MCESEV( + AR, "Action required: data load error in a user process", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), + USER + ), + MCESEV( + AR, "Action required: instruction fetch error in a user process", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR), + USER + ), +#endif + MCESEV( + PANIC, "Action required: unknown MCACOD", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR) + ), + + /* known AO MCACODs: */ + MCESEV( + AO, "Action optional: memory scrubbing error", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD_SCRUBMSK, MCI_UC_S|MCACOD_SCRUB) + ), + MCESEV( + AO, "Action optional: last level cache writeback error", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_S|MCACOD_L3WB) + ), + MCESEV( + SOME, "Action optional: unknown MCACOD", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S) + ), + MCESEV( + SOME, "Action optional with lost events", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_S) + ), + + MCESEV( + PANIC, "Overflowed uncorrected", + BITSET(MCI_STATUS_OVER|MCI_STATUS_UC) + ), + MCESEV( + UC, "Uncorrected", + BITSET(MCI_STATUS_UC) + ), + MCESEV( + SOME, "No match", + BITSET(0) + ) /* always matches. keep at end */ +}; + +/* + * If mcgstatus indicated that ip/cs on the stack were + * no good, then "m->cs" will be zero and we will have + * to assume the worst case (IN_KERNEL) as we actually + * have no idea what we were executing when the machine + * check hit. 
+ * If we do have a good "m->cs" (or a faked one in the + * case we were executing in VM86 mode) we can use it to + * distinguish an exception taken in user from from one + * taken in the kernel. + */ +static int error_context(struct mce *m) +{ + return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL; +} + +int mce_severity(struct mce *m, int tolerant, char **msg) +{ + enum context ctx = error_context(m); + struct severity *s; + + for (s = severities;; s++) { + if ((m->status & s->mask) != s->result) + continue; + if ((m->mcgstatus & s->mcgmask) != s->mcgres) + continue; + if (s->ser == SER_REQUIRED && !mca_cfg.ser) + continue; + if (s->ser == NO_SER && mca_cfg.ser) + continue; + if (s->context && ctx != s->context) + continue; + if (msg) + *msg = s->msg; + s->covered = 1; + if (s->sev >= MCE_UC_SEVERITY && ctx == IN_KERNEL) { + if (panic_on_oops || tolerant < 1) + return MCE_PANIC_SEVERITY; + } + return s->sev; + } +} + +#ifdef CONFIG_DEBUG_FS +static void *s_start(struct seq_file *f, loff_t *pos) +{ + if (*pos >= ARRAY_SIZE(severities)) + return NULL; + return &severities[*pos]; +} + +static void *s_next(struct seq_file *f, void *data, loff_t *pos) +{ + if (++(*pos) >= ARRAY_SIZE(severities)) + return NULL; + return &severities[*pos]; +} + +static void s_stop(struct seq_file *f, void *data) +{ +} + +static int s_show(struct seq_file *f, void *data) +{ + struct severity *ser = data; + seq_printf(f, "%d\t%s\n", ser->covered, ser->msg); + return 0; +} + +static const struct seq_operations severities_seq_ops = { + .start = s_start, + .next = s_next, + .stop = s_stop, + .show = s_show, +}; + +static int severities_coverage_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &severities_seq_ops); +} + +static ssize_t severities_coverage_write(struct file *file, + const char __user *ubuf, + size_t count, loff_t *ppos) +{ + int i; + for (i = 0; i < ARRAY_SIZE(severities); i++) + severities[i].covered = 0; + return count; +} + +static const struct file_operations severities_coverage_fops = { + .open = severities_coverage_open, + .release = seq_release, + .read = seq_read, + .write = severities_coverage_write, + .llseek = seq_lseek, +}; + +static int __init severities_debugfs_init(void) +{ + struct dentry *dmce, *fsev; + + dmce = mce_get_debugfs_dir(); + if (!dmce) + goto err_out; + + fsev = debugfs_create_file("severities-coverage", 0444, dmce, NULL, + &severities_coverage_fops); + if (!fsev) + goto err_out; + + return 0; + +err_out: + return -ENOMEM; +} +late_initcall(severities_debugfs_init); +#endif /* CONFIG_DEBUG_FS */ diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c new file mode 100644 index 00000000000..9a79c8dbd8e --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -0,0 +1,2561 @@ +/* + * Machine check handler. + * + * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs. + * Rest from unknown author(s). + * 2004 Andi Kleen. Rewrote most of it. 
+ * Copyright 2008 Intel Corporation + * Author: Andi Kleen + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/thread_info.h> +#include <linux/capability.h> +#include <linux/miscdevice.h> +#include <linux/ratelimit.h> +#include <linux/kallsyms.h> +#include <linux/rcupdate.h> +#include <linux/kobject.h> +#include <linux/uaccess.h> +#include <linux/kdebug.h> +#include <linux/kernel.h> +#include <linux/percpu.h> +#include <linux/string.h> +#include <linux/device.h> +#include <linux/syscore_ops.h> +#include <linux/delay.h> +#include <linux/ctype.h> +#include <linux/sched.h> +#include <linux/sysfs.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/init.h> +#include <linux/kmod.h> +#include <linux/poll.h> +#include <linux/nmi.h> +#include <linux/cpu.h> +#include <linux/smp.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/debugfs.h> +#include <linux/irq_work.h> +#include <linux/export.h> + +#include <asm/processor.h> +#include <asm/mce.h> +#include <asm/msr.h> + +#include "mce-internal.h" + +static DEFINE_MUTEX(mce_chrdev_read_mutex); + +#define rcu_dereference_check_mce(p) \ + rcu_dereference_index_check((p), \ + rcu_read_lock_sched_held() || \ + lockdep_is_held(&mce_chrdev_read_mutex)) + +#define CREATE_TRACE_POINTS +#include <trace/events/mce.h> + +#define SPINUNIT 100 /* 100ns */ + +DEFINE_PER_CPU(unsigned, mce_exception_count); + +struct mce_bank *mce_banks __read_mostly; + +struct mca_config mca_cfg __read_mostly = { + .bootlog = -1, + /* + * Tolerant levels: + * 0: always panic on uncorrected errors, log corrected errors + * 1: panic or SIGBUS on uncorrected errors, log corrected errors + * 2: SIGBUS or log uncorrected errors (if possible), log corr. errors + * 3: never panic or SIGBUS, log all errors (for testing only) + */ + .tolerant = 1, + .monarch_timeout = -1 +}; + +/* User mode helper program triggered by machine check event */ +static unsigned long mce_need_notify; +static char mce_helper[128]; +static char *mce_helper_argv[2] = { mce_helper, NULL }; + +static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait); + +static DEFINE_PER_CPU(struct mce, mces_seen); +static int cpu_missing; + +/* CMCI storm detection filter */ +static DEFINE_PER_CPU(unsigned long, mce_polled_error); + +/* + * MCA banks polled by the period polling timer for corrected events. + * With Intel CMCI, this only has MCA banks which do not support CMCI (if any). + */ +DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { + [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL +}; + +/* + * MCA banks controlled through firmware first for corrected errors. + * This is a global list of banks for which we won't enable CMCI and we + * won't poll. Firmware controls these banks and is responsible for + * reporting corrected errors through GHES. Uncorrected/recoverable + * errors are still notified through a machine check. + */ +mce_banks_t mce_banks_ce_disabled; + +static DEFINE_PER_CPU(struct work_struct, mce_work); + +static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs); + +/* + * CPU/chipset specific EDAC code can register a notifier call here to print + * MCE errors in a human-readable form. 
+ */ +ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain); + +/* Do initial initialization of a struct mce */ +void mce_setup(struct mce *m) +{ + memset(m, 0, sizeof(struct mce)); + m->cpu = m->extcpu = smp_processor_id(); + rdtscll(m->tsc); + /* We hope get_seconds stays lockless */ + m->time = get_seconds(); + m->cpuvendor = boot_cpu_data.x86_vendor; + m->cpuid = cpuid_eax(1); + m->socketid = cpu_data(m->extcpu).phys_proc_id; + m->apicid = cpu_data(m->extcpu).initial_apicid; + rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap); +} + +DEFINE_PER_CPU(struct mce, injectm); +EXPORT_PER_CPU_SYMBOL_GPL(injectm); + +/* + * Lockless MCE logging infrastructure. + * This avoids deadlocks on printk locks without having to break locks. Also + * separate MCEs from kernel messages to avoid bogus bug reports. + */ + +static struct mce_log mcelog = { + .signature = MCE_LOG_SIGNATURE, + .len = MCE_LOG_LEN, + .recordlen = sizeof(struct mce), +}; + +void mce_log(struct mce *mce) +{ + unsigned next, entry; + int ret = 0; + + /* Emit the trace record: */ + trace_mce_record(mce); + + ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce); + if (ret == NOTIFY_STOP) + return; + + mce->finished = 0; + wmb(); + for (;;) { + entry = rcu_dereference_check_mce(mcelog.next); + for (;;) { + + /* + * When the buffer fills up discard new entries. + * Assume that the earlier errors are the more + * interesting ones: + */ + if (entry >= MCE_LOG_LEN) { + set_bit(MCE_OVERFLOW, + (unsigned long *)&mcelog.flags); + return; + } + /* Old left over entry. Skip: */ + if (mcelog.entry[entry].finished) { + entry++; + continue; + } + break; + } + smp_rmb(); + next = entry + 1; + if (cmpxchg(&mcelog.next, entry, next) == entry) + break; + } + memcpy(mcelog.entry + entry, mce, sizeof(struct mce)); + wmb(); + mcelog.entry[entry].finished = 1; + wmb(); + + mce->finished = 1; + set_bit(0, &mce_need_notify); +} + +static void drain_mcelog_buffer(void) +{ + unsigned int next, i, prev = 0; + + next = ACCESS_ONCE(mcelog.next); + + do { + struct mce *m; + + /* drain what was logged during boot */ + for (i = prev; i < next; i++) { + unsigned long start = jiffies; + unsigned retries = 1; + + m = &mcelog.entry[i]; + + while (!m->finished) { + if (time_after_eq(jiffies, start + 2*retries)) + retries++; + + cpu_relax(); + + if (!m->finished && retries >= 4) { + pr_err("skipping error being logged currently!\n"); + break; + } + } + smp_rmb(); + atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); + } + + memset(mcelog.entry + prev, 0, (next - prev) * sizeof(*m)); + prev = next; + next = cmpxchg(&mcelog.next, prev, 0); + } while (next != prev); +} + + +void mce_register_decode_chain(struct notifier_block *nb) +{ + atomic_notifier_chain_register(&x86_mce_decoder_chain, nb); + drain_mcelog_buffer(); +} +EXPORT_SYMBOL_GPL(mce_register_decode_chain); + +void mce_unregister_decode_chain(struct notifier_block *nb) +{ + atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb); +} +EXPORT_SYMBOL_GPL(mce_unregister_decode_chain); + +static void print_mce(struct mce *m) +{ + int ret = 0; + + pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n", + m->extcpu, m->mcgstatus, m->bank, m->status); + + if (m->ip) { + pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ", + !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" 
: "", + m->cs, m->ip); + + if (m->cs == __KERNEL_CS) + print_symbol("{%s}", m->ip); + pr_cont("\n"); + } + + pr_emerg(HW_ERR "TSC %llx ", m->tsc); + if (m->addr) + pr_cont("ADDR %llx ", m->addr); + if (m->misc) + pr_cont("MISC %llx ", m->misc); + + pr_cont("\n"); + /* + * Note this output is parsed by external tools and old fields + * should not be changed. + */ + pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n", + m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid, + cpu_data(m->extcpu).microcode); + + /* + * Print out human-readable details about the MCE error, + * (if the CPU has an implementation for that) + */ + ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); + if (ret == NOTIFY_STOP) + return; + + pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n"); +} + +#define PANIC_TIMEOUT 5 /* 5 seconds */ + +static atomic_t mce_paniced; + +static int fake_panic; +static atomic_t mce_fake_paniced; + +/* Panic in progress. Enable interrupts and wait for final IPI */ +static void wait_for_panic(void) +{ + long timeout = PANIC_TIMEOUT*USEC_PER_SEC; + + preempt_disable(); + local_irq_enable(); + while (timeout-- > 0) + udelay(1); + if (panic_timeout == 0) + panic_timeout = mca_cfg.panic_timeout; + panic("Panicing machine check CPU died"); +} + +static void mce_panic(char *msg, struct mce *final, char *exp) +{ + int i, apei_err = 0; + + if (!fake_panic) { + /* + * Make sure only one CPU runs in machine check panic + */ + if (atomic_inc_return(&mce_paniced) > 1) + wait_for_panic(); + barrier(); + + bust_spinlocks(1); + console_verbose(); + } else { + /* Don't log too much for fake panic */ + if (atomic_inc_return(&mce_fake_paniced) > 1) + return; + } + /* First print corrected ones that are still unlogged */ + for (i = 0; i < MCE_LOG_LEN; i++) { + struct mce *m = &mcelog.entry[i]; + if (!(m->status & MCI_STATUS_VAL)) + continue; + if (!(m->status & MCI_STATUS_UC)) { + print_mce(m); + if (!apei_err) + apei_err = apei_write_mce(m); + } + } + /* Now print uncorrected but with the final one last */ + for (i = 0; i < MCE_LOG_LEN; i++) { + struct mce *m = &mcelog.entry[i]; + if (!(m->status & MCI_STATUS_VAL)) + continue; + if (!(m->status & MCI_STATUS_UC)) + continue; + if (!final || memcmp(m, final, sizeof(struct mce))) { + print_mce(m); + if (!apei_err) + apei_err = apei_write_mce(m); + } + } + if (final) { + print_mce(final); + if (!apei_err) + apei_err = apei_write_mce(final); + } + if (cpu_missing) + pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n"); + if (exp) + pr_emerg(HW_ERR "Machine check: %s\n", exp); + if (!fake_panic) { + if (panic_timeout == 0) + panic_timeout = mca_cfg.panic_timeout; + panic(msg); + } else + pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg); +} + +/* Support code for software error injection */ + +static int msr_to_offset(u32 msr) +{ + unsigned bank = __this_cpu_read(injectm.bank); + + if (msr == mca_cfg.rip_msr) + return offsetof(struct mce, ip); + if (msr == MSR_IA32_MCx_STATUS(bank)) + return offsetof(struct mce, status); + if (msr == MSR_IA32_MCx_ADDR(bank)) + return offsetof(struct mce, addr); + if (msr == MSR_IA32_MCx_MISC(bank)) + return offsetof(struct mce, misc); + if (msr == MSR_IA32_MCG_STATUS) + return offsetof(struct mce, mcgstatus); + return -1; +} + +/* MSR access wrappers used for error injection */ +static u64 mce_rdmsrl(u32 msr) +{ + u64 v; + + if (__this_cpu_read(injectm.finished)) { + int offset = msr_to_offset(msr); + + if (offset < 0) + return 0; + return *(u64 *)((char 
*)&__get_cpu_var(injectm) + offset); + } + + if (rdmsrl_safe(msr, &v)) { + WARN_ONCE(1, "mce: Unable to read msr %d!\n", msr); + /* + * Return zero in case the access faulted. This should + * not happen normally but can happen if the CPU does + * something weird, or if the code is buggy. + */ + v = 0; + } + + return v; +} + +static void mce_wrmsrl(u32 msr, u64 v) +{ + if (__this_cpu_read(injectm.finished)) { + int offset = msr_to_offset(msr); + + if (offset >= 0) + *(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v; + return; + } + wrmsrl(msr, v); +} + +/* + * Collect all global (w.r.t. this processor) status about this machine + * check into our "mce" struct so that we can use it later to assess + * the severity of the problem as we read per-bank specific details. + */ +static inline void mce_gather_info(struct mce *m, struct pt_regs *regs) +{ + mce_setup(m); + + m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); + if (regs) { + /* + * Get the address of the instruction at the time of + * the machine check error. + */ + if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) { + m->ip = regs->ip; + m->cs = regs->cs; + + /* + * When in VM86 mode make the cs look like ring 3 + * always. This is a lie, but it's better than passing + * the additional vm86 bit around everywhere. + */ + if (v8086_mode(regs)) + m->cs |= 3; + } + /* Use accurate RIP reporting if available. */ + if (mca_cfg.rip_msr) + m->ip = mce_rdmsrl(mca_cfg.rip_msr); + } +} + +/* + * Simple lockless ring to communicate PFNs from the exception handler with the + * process context work function. This is vastly simplified because there's + * only a single reader and a single writer. + */ +#define MCE_RING_SIZE 16 /* we use one entry less */ + +struct mce_ring { + unsigned short start; + unsigned short end; + unsigned long ring[MCE_RING_SIZE]; +}; +static DEFINE_PER_CPU(struct mce_ring, mce_ring); + +/* Runs with CPU affinity in workqueue */ +static int mce_ring_empty(void) +{ + struct mce_ring *r = &__get_cpu_var(mce_ring); + + return r->start == r->end; +} + +static int mce_ring_get(unsigned long *pfn) +{ + struct mce_ring *r; + int ret = 0; + + *pfn = 0; + get_cpu(); + r = &__get_cpu_var(mce_ring); + if (r->start == r->end) + goto out; + *pfn = r->ring[r->start]; + r->start = (r->start + 1) % MCE_RING_SIZE; + ret = 1; +out: + put_cpu(); + return ret; +} + +/* Always runs in MCE context with preempt off */ +static int mce_ring_add(unsigned long pfn) +{ + struct mce_ring *r = &__get_cpu_var(mce_ring); + unsigned next; + + next = (r->end + 1) % MCE_RING_SIZE; + if (next == r->start) + return -1; + r->ring[r->end] = pfn; + wmb(); + r->end = next; + return 0; +} + +int mce_available(struct cpuinfo_x86 *c) +{ + if (mca_cfg.disabled) + return 0; + return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); +} + +static void mce_schedule_work(void) +{ + if (!mce_ring_empty()) + schedule_work(&__get_cpu_var(mce_work)); +} + +DEFINE_PER_CPU(struct irq_work, mce_irq_work); + +static void mce_irq_work_cb(struct irq_work *entry) +{ + mce_notify_irq(); + mce_schedule_work(); +} + +static void mce_report_event(struct pt_regs *regs) +{ + if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) { + mce_notify_irq(); + /* + * Triggering the work queue here is just an insurance + * policy in case the syscall exit notify handler + * doesn't run soon enough or ends up running on the + * wrong CPU (can happen when audit sleeps) + */ + mce_schedule_work(); + return; + } + + irq_work_queue(&__get_cpu_var(mce_irq_work)); +} + +/* + * Read ADDR and MISC 
registers.
+ */
+static void mce_read_aux(struct mce *m, int i)
+{
+        if (m->status & MCI_STATUS_MISCV)
+                m->misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
+        if (m->status & MCI_STATUS_ADDRV) {
+                m->addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
+
+                /*
+                 * Mask the reported address by the reported granularity.
+                 */
+                if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) {
+                        u8 shift = MCI_MISC_ADDR_LSB(m->misc);
+                        m->addr >>= shift;
+                        m->addr <<= shift;
+                }
+        }
+}
+
+DEFINE_PER_CPU(unsigned, mce_poll_count);
+
+/*
+ * Poll for corrected events or events that happened before reset.
+ * Those are just logged through /dev/mcelog.
+ *
+ * This is executed in standard interrupt context.
+ *
+ * Note: the spec recommends panicking for fatal unsignalled
+ * errors here. However, this would be quite problematic --
+ * we would need to reimplement the Monarch handling and
+ * it would mess up the exclusion between the exception handler
+ * and the poll handler -- so we skip this for now.
+ * These cases should not happen anyway, or only when the CPU
+ * is already totally confused. In that case it is likely that it
+ * will not fully execute the machine check handler either.
+ */
+void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
+{
+        struct mce m;
+        int i;
+
+        this_cpu_inc(mce_poll_count);
+
+        mce_gather_info(&m, NULL);
+
+        for (i = 0; i < mca_cfg.banks; i++) {
+                if (!mce_banks[i].ctl || !test_bit(i, *b))
+                        continue;
+
+                m.misc = 0;
+                m.addr = 0;
+                m.bank = i;
+                m.tsc = 0;
+
+                barrier();
+                m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
+                if (!(m.status & MCI_STATUS_VAL))
+                        continue;
+
+                this_cpu_write(mce_polled_error, 1);
+                /*
+                 * Uncorrected or signalled events are handled by the
+                 * exception handler when it is enabled, so don't process
+                 * those here.
+                 *
+                 * TBD do the same check for MCI_STATUS_EN here?
+                 */
+                if (!(flags & MCP_UC) &&
+                    (m.status & (mca_cfg.ser ? MCI_STATUS_S : MCI_STATUS_UC)))
+                        continue;
+
+                mce_read_aux(&m, i);
+
+                if (!(flags & MCP_TIMESTAMP))
+                        m.tsc = 0;
+                /*
+                 * Don't get the IP here because it's unlikely to
+                 * have anything to do with the actual error location.
+                 */
+                if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce)
+                        mce_log(&m);
+
+                /*
+                 * Clear state for this bank.
+                 */
+                mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
+        }
+
+        /*
+         * Don't clear MCG_STATUS here because it's only defined for
+         * exceptions.
+         */
+
+        sync_core();
+}
+EXPORT_SYMBOL_GPL(machine_check_poll);
+
+/*
+ * Do a quick check if any of the events requires a panic.
+ * This decides if we keep the events around or clear them.
+ */
+static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
+                          struct pt_regs *regs)
+{
+        int i, ret = 0;
+
+        for (i = 0; i < mca_cfg.banks; i++) {
+                m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
+                if (m->status & MCI_STATUS_VAL) {
+                        __set_bit(i, validp);
+                        if (quirk_no_way_out)
+                                quirk_no_way_out(i, m, regs);
+                }
+                if (mce_severity(m, mca_cfg.tolerant, msg) >= MCE_PANIC_SEVERITY)
+                        ret = 1;
+        }
+        return ret;
+}
+
+/*
+ * Variable to establish order between CPUs while scanning.
+ * Each CPU spins initially until executing equals its number.
+ */
+static atomic_t mce_executing;
+
+/*
+ * Defines the order of CPUs on entry. The first CPU becomes the Monarch.
+ */
+static atomic_t mce_callin;
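
mce_callin and mce_executing together form a ticket turnstile: atomic_inc_return() hands out entry tickets, ticket 1 becomes the Monarch, and each Subject spins until mce_executing reaches its ticket, as mce_start() below shows. A minimal user-space sketch of that protocol with C11 atomics (timeouts, SPINUNIT pacing and the no_way_out bookkeeping are omitted; these names are ours, not the kernel's):

#include <stdatomic.h>

atomic_int callin;      /* entry ticket counter: first ticket is the Monarch */
atomic_int executing;   /* highest ticket currently allowed to scan */

/* Returns this CPU's ticket; ticket 1 means "I am the Monarch". */
int rendezvous(int ncpus)
{
        int order = atomic_fetch_add(&callin, 1) + 1;   /* atomic_inc_return */

        while (atomic_load(&callin) != ncpus)
                ;                       /* wait for everyone to arrive */

        if (order == 1)
                atomic_store(&executing, 1);    /* Monarch scans first */
        else
                while (atomic_load(&executing) < order)
                        ;               /* wait for my turn, in callin order */
        return order;
}

/* Each CPU calls this when done scanning, handing the turn onward. */
void scan_done(void)
{
        atomic_fetch_add(&executing, 1);
}

The strict callin order is what keeps a shared bank's event visible to only the first scanner before it is cleared.
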
+/*
+ * Check if a timeout waiting for the other CPUs happened.
+ */
+static int mce_timed_out(u64 *t)
+{
+        /*
+         * The others already did panic for some reason.
+         * Bail out like in a timeout.
+         * rmb() to tell the compiler that system_state
+         * might have been modified by someone else.
+         */
+        rmb();
+        if (atomic_read(&mce_paniced))
+                wait_for_panic();
+        if (!mca_cfg.monarch_timeout)
+                goto out;
+        if ((s64)*t < SPINUNIT) {
+                if (mca_cfg.tolerant <= 1)
+                        mce_panic("Timeout synchronizing machine check over CPUs",
+                                  NULL, NULL);
+                cpu_missing = 1;
+                return 1;
+        }
+        *t -= SPINUNIT;
+out:
+        touch_nmi_watchdog();
+        return 0;
+}
+
+/*
+ * The Monarch's reign. The Monarch is the CPU that entered
+ * the machine check handler first. It waits for the others to
+ * raise the exception too and then grades them. If any error is
+ * fatal, it panics; only then does it let the others continue.
+ *
+ * The other CPUs entering the MCE handler will be controlled by the
+ * Monarch. They are called Subjects.
+ *
+ * This way we prevent any potential data corruption in an unrecoverable
+ * case and also make sure that all CPUs' errors are always examined.
+ *
+ * This also detects the case of a machine check event coming from outer
+ * space (not detected by any CPU). In this case some external agent wants
+ * us to shut down, so panic too.
+ *
+ * The other CPUs might still decide to panic if the handler happens
+ * in an unrecoverable place, but in that case the system is in a
+ * semi-stable state and won't corrupt anything by itself. It's OK to
+ * let the others continue for a bit first.
+ *
+ * All the spin loops have timeouts; when a timeout happens a CPU
+ * typically elects itself to be Monarch.
+ */
+static void mce_reign(void)
+{
+        int cpu;
+        struct mce *m = NULL;
+        int global_worst = 0;
+        char *msg = NULL;
+        char *nmsg = NULL;
+
+        /*
+         * This CPU is the Monarch and the other CPUs have run
+         * through their handlers.
+         * Grade the severity of the errors of all the CPUs.
+         */
+        for_each_possible_cpu(cpu) {
+                int severity = mce_severity(&per_cpu(mces_seen, cpu),
+                                            mca_cfg.tolerant,
+                                            &nmsg);
+                if (severity > global_worst) {
+                        msg = nmsg;
+                        global_worst = severity;
+                        m = &per_cpu(mces_seen, cpu);
+                }
+        }
+
+        /*
+         * Cannot recover? Panic here then.
+         * This dumps all the mces in the log buffer and stops the
+         * other CPUs.
+         */
+        if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
+                mce_panic("Fatal Machine check", m, msg);
+
+        /*
+         * For a UC error somewhere, we let the CPU that detected it
+         * handle it. We must also let the others continue, otherwise
+         * the handling CPU could deadlock on a lock.
+         */
+
+        /*
+         * No machine check event found. Must be some external
+         * source or one CPU is hung. Panic.
+         */
+        if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3)
+                mce_panic("Machine check from unknown source", NULL, NULL);
+
+        /*
+         * Now clear all the mces_seen so that they don't reappear on
+         * the next mce.
+         */
+        for_each_possible_cpu(cpu)
+                memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
+}
+
+static atomic_t global_nwo;
+
+/*
+ * Start of Monarch synchronization. This waits until all CPUs have
+ * entered the exception handler and then determines if any of them
+ * saw a fatal event that requires a panic. Then it executes them
+ * in the entry order.
+ * TBD double check parallel CPU hotunplug
+ */
+static int mce_start(int *no_way_out)
+{
+        int order;
+        int cpus = num_online_cpus();
+        u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
+
+        if (!timeout)
+                return -1;
+
+        atomic_add(*no_way_out, &global_nwo);
+        /*
+         * global_nwo should be updated before mce_callin
+         */
+        smp_wmb();
+        order = atomic_inc_return(&mce_callin);
+
+        /*
+         * Wait for everyone.
+ */ + while (atomic_read(&mce_callin) != cpus) { + if (mce_timed_out(&timeout)) { + atomic_set(&global_nwo, 0); + return -1; + } + ndelay(SPINUNIT); + } + + /* + * mce_callin should be read before global_nwo + */ + smp_rmb(); + + if (order == 1) { + /* + * Monarch: Starts executing now, the others wait. + */ + atomic_set(&mce_executing, 1); + } else { + /* + * Subject: Now start the scanning loop one by one in + * the original callin order. + * This way when there are any shared banks it will be + * only seen by one CPU before cleared, avoiding duplicates. + */ + while (atomic_read(&mce_executing) < order) { + if (mce_timed_out(&timeout)) { + atomic_set(&global_nwo, 0); + return -1; + } + ndelay(SPINUNIT); + } + } + + /* + * Cache the global no_way_out state. + */ + *no_way_out = atomic_read(&global_nwo); + + return order; +} + +/* + * Synchronize between CPUs after main scanning loop. + * This invokes the bulk of the Monarch processing. + */ +static int mce_end(int order) +{ + int ret = -1; + u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC; + + if (!timeout) + goto reset; + if (order < 0) + goto reset; + + /* + * Allow others to run. + */ + atomic_inc(&mce_executing); + + if (order == 1) { + /* CHECKME: Can this race with a parallel hotplug? */ + int cpus = num_online_cpus(); + + /* + * Monarch: Wait for everyone to go through their scanning + * loops. + */ + while (atomic_read(&mce_executing) <= cpus) { + if (mce_timed_out(&timeout)) + goto reset; + ndelay(SPINUNIT); + } + + mce_reign(); + barrier(); + ret = 0; + } else { + /* + * Subject: Wait for Monarch to finish. + */ + while (atomic_read(&mce_executing) != 0) { + if (mce_timed_out(&timeout)) + goto reset; + ndelay(SPINUNIT); + } + + /* + * Don't reset anything. That's done by the Monarch. + */ + return 0; + } + + /* + * Reset all global state. + */ +reset: + atomic_set(&global_nwo, 0); + atomic_set(&mce_callin, 0); + barrier(); + + /* + * Let others run again. + */ + atomic_set(&mce_executing, 0); + return ret; +} + +/* + * Check if the address reported by the CPU is in a format we can parse. + * It would be possible to add code for most other cases, but all would + * be somewhat complicated (e.g. segment offset would require an instruction + * parser). So only support physical addresses up to page granuality for now. 
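
The MCI_MISC LSB field drives both halves of the recoverable-address story: mce_read_aux() earlier masked the reported address down to the granularity the bank vouches for, and mce_usable_address() below rejects anything coarser than a page. A small worked check of both rules (PAGE_SHIFT of 12 assumed; the helper names are ours):

#include <assert.h>
#include <stdint.h>

#define PAGE_SHIFT 12   /* assumed: 4 KiB pages */

/* Only bits 63:lsb of ADDR are valid; zero the rest (mce_read_aux()). */
uint64_t mask_reported_addr(uint64_t addr, unsigned lsb)
{
        addr >>= lsb;
        addr <<= lsb;
        return addr;
}

/* mce_usable_address() refuses anything coarser than one page. */
int granularity_usable(unsigned lsb)
{
        return lsb <= PAGE_SHIFT;
}

int main(void)
{
        /* LSB 12: the error is known to 4 KiB, so snap to the page base */
        assert(mask_reported_addr(0x12345abcULL, 12) == 0x12345000ULL);
        /* LSB 13 would span two pages: too coarse to poison one page */
        assert(granularity_usable(12) && !granularity_usable(13));
        return 0;
}
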
+ */ +static int mce_usable_address(struct mce *m) +{ + if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV)) + return 0; + if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT) + return 0; + if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS) + return 0; + return 1; +} + +static void mce_clear_state(unsigned long *toclear) +{ + int i; + + for (i = 0; i < mca_cfg.banks; i++) { + if (test_bit(i, toclear)) + mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0); + } +} + +/* + * Need to save faulting physical address associated with a process + * in the machine check handler some place where we can grab it back + * later in mce_notify_process() + */ +#define MCE_INFO_MAX 16 + +struct mce_info { + atomic_t inuse; + struct task_struct *t; + __u64 paddr; + int restartable; +} mce_info[MCE_INFO_MAX]; + +static void mce_save_info(__u64 addr, int c) +{ + struct mce_info *mi; + + for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) { + if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) { + mi->t = current; + mi->paddr = addr; + mi->restartable = c; + return; + } + } + + mce_panic("Too many concurrent recoverable errors", NULL, NULL); +} + +static struct mce_info *mce_find_info(void) +{ + struct mce_info *mi; + + for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) + if (atomic_read(&mi->inuse) && mi->t == current) + return mi; + return NULL; +} + +static void mce_clear_info(struct mce_info *mi) +{ + atomic_set(&mi->inuse, 0); +} + +/* + * The actual machine check handler. This only handles real + * exceptions when something got corrupted coming in through int 18. + * + * This is executed in NMI context not subject to normal locking rules. This + * implies that most kernel services cannot be safely used. Don't even + * think about putting a printk in there! + * + * On Intel systems this is entered on all CPUs in parallel through + * MCE broadcast. However some CPUs might be broken beyond repair, + * so be always careful when synchronizing with others. + */ +void do_machine_check(struct pt_regs *regs, long error_code) +{ + struct mca_config *cfg = &mca_cfg; + struct mce m, *final; + int i; + int worst = 0; + int severity; + /* + * Establish sequential order between the CPUs entering the machine + * check handler. + */ + int order; + /* + * If no_way_out gets set, there is no safe way to recover from this + * MCE. If mca_cfg.tolerant is cranked up, we'll try anyway. + */ + int no_way_out = 0; + /* + * If kill_it gets set, there might be a way to recover from this + * error. + */ + int kill_it = 0; + DECLARE_BITMAP(toclear, MAX_NR_BANKS); + DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); + char *msg = "Unknown"; + + this_cpu_inc(mce_exception_count); + + if (!cfg->banks) + goto out; + + mce_gather_info(&m, regs); + + final = &__get_cpu_var(mces_seen); + *final = m; + + memset(valid_banks, 0, sizeof(valid_banks)); + no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs); + + barrier(); + + /* + * When no restart IP might need to kill or panic. + * Assume the worst for now, but if we find the + * severity is MCE_AR_SEVERITY we have other options. + */ + if (!(m.mcgstatus & MCG_STATUS_RIPV)) + kill_it = 1; + + /* + * Go through all the banks in exclusion of the other CPUs. + * This way we don't report duplicated events on shared banks + * because the first one to see it will clear it. 
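
mce_save_info() above claims its slot with a bare atomic_cmpxchg(): the first context to flip inuse from 0 to 1 owns the entry, which is why the #MC path needs no lock here. The same shape in portable C11 (the slot count and names are illustrative):

#include <stdatomic.h>
#include <stddef.h>

#define NSLOTS 16

struct slot {
        atomic_int inuse;
        unsigned long long paddr;
};

struct slot slots[NSLOTS];

/* First caller to flip inuse 0 -> 1 owns the slot; no locks involved. */
struct slot *claim_slot(unsigned long long paddr)
{
        for (int i = 0; i < NSLOTS; i++) {
                int expected = 0;

                if (atomic_compare_exchange_strong(&slots[i].inuse,
                                                   &expected, 1)) {
                        slots[i].paddr = paddr; /* safe: we own it now */
                        return &slots[i];
                }
        }
        return NULL;    /* all busy: the kernel panics at this point */
}

void release_slot(struct slot *s)
{
        atomic_store(&s->inuse, 0);     /* the mce_clear_info() equivalent */
}
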
+ */ + order = mce_start(&no_way_out); + for (i = 0; i < cfg->banks; i++) { + __clear_bit(i, toclear); + if (!test_bit(i, valid_banks)) + continue; + if (!mce_banks[i].ctl) + continue; + + m.misc = 0; + m.addr = 0; + m.bank = i; + + m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); + if ((m.status & MCI_STATUS_VAL) == 0) + continue; + + /* + * Non uncorrected or non signaled errors are handled by + * machine_check_poll. Leave them alone, unless this panics. + */ + if (!(m.status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) && + !no_way_out) + continue; + + /* + * Set taint even when machine check was not enabled. + */ + add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); + + severity = mce_severity(&m, cfg->tolerant, NULL); + + /* + * When machine check was for corrected handler don't touch, + * unless we're panicing. + */ + if (severity == MCE_KEEP_SEVERITY && !no_way_out) + continue; + __set_bit(i, toclear); + if (severity == MCE_NO_SEVERITY) { + /* + * Machine check event was not enabled. Clear, but + * ignore. + */ + continue; + } + + mce_read_aux(&m, i); + + /* + * Action optional error. Queue address for later processing. + * When the ring overflows we just ignore the AO error. + * RED-PEN add some logging mechanism when + * usable_address or mce_add_ring fails. + * RED-PEN don't ignore overflow for mca_cfg.tolerant == 0 + */ + if (severity == MCE_AO_SEVERITY && mce_usable_address(&m)) + mce_ring_add(m.addr >> PAGE_SHIFT); + + mce_log(&m); + + if (severity > worst) { + *final = m; + worst = severity; + } + } + + /* mce_clear_state will clear *final, save locally for use later */ + m = *final; + + if (!no_way_out) + mce_clear_state(toclear); + + /* + * Do most of the synchronization with other CPUs. + * When there's any problem use only local no_way_out state. + */ + if (mce_end(order) < 0) + no_way_out = worst >= MCE_PANIC_SEVERITY; + + /* + * At insane "tolerant" levels we take no action. Otherwise + * we only die if we have no other choice. For less serious + * issues we try to recover, or limit damage to the current + * process. + */ + if (cfg->tolerant < 3) { + if (no_way_out) + mce_panic("Fatal machine check on current CPU", &m, msg); + if (worst == MCE_AR_SEVERITY) { + /* schedule action before return to userland */ + mce_save_info(m.addr, m.mcgstatus & MCG_STATUS_RIPV); + set_thread_flag(TIF_MCE_NOTIFY); + } else if (kill_it) { + force_sig(SIGBUS, current); + } + } + + if (worst > 0) + mce_report_event(regs); + mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); +out: + sync_core(); +} +EXPORT_SYMBOL_GPL(do_machine_check); + +#ifndef CONFIG_MEMORY_FAILURE +int memory_failure(unsigned long pfn, int vector, int flags) +{ + /* mce_severity() should not hand us an ACTION_REQUIRED error */ + BUG_ON(flags & MF_ACTION_REQUIRED); + pr_err("Uncorrected memory error in page 0x%lx ignored\n" + "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n", + pfn); + + return 0; +} +#endif + +/* + * Called in process context that interrupted by MCE and marked with + * TIF_MCE_NOTIFY, just before returning to erroneous userland. + * This code is allowed to sleep. + * Attempt possible recovery such as calling the high level VM handler to + * process any corrupted pages, and kill/signal current process if required. + * Action required errors are handled here. 
+ */ +void mce_notify_process(void) +{ + unsigned long pfn; + struct mce_info *mi = mce_find_info(); + int flags = MF_ACTION_REQUIRED; + + if (!mi) + mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL); + pfn = mi->paddr >> PAGE_SHIFT; + + clear_thread_flag(TIF_MCE_NOTIFY); + + pr_err("Uncorrected hardware memory error in user-access at %llx", + mi->paddr); + /* + * We must call memory_failure() here even if the current process is + * doomed. We still need to mark the page as poisoned and alert any + * other users of the page. + */ + if (!mi->restartable) + flags |= MF_MUST_KILL; + if (memory_failure(pfn, MCE_VECTOR, flags) < 0) { + pr_err("Memory error not recovered"); + force_sig(SIGBUS, current); + } + mce_clear_info(mi); +} + +/* + * Action optional processing happens here (picking up + * from the list of faulting pages that do_machine_check() + * placed into the "ring"). + */ +static void mce_process_work(struct work_struct *dummy) +{ + unsigned long pfn; + + while (mce_ring_get(&pfn)) + memory_failure(pfn, MCE_VECTOR, 0); +} + +#ifdef CONFIG_X86_MCE_INTEL +/*** + * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog + * @cpu: The CPU on which the event occurred. + * @status: Event status information + * + * This function should be called by the thermal interrupt after the + * event has been processed and the decision was made to log the event + * further. + * + * The status parameter will be saved to the 'status' field of 'struct mce' + * and historically has been the register value of the + * MSR_IA32_THERMAL_STATUS (Intel) msr. + */ +void mce_log_therm_throt_event(__u64 status) +{ + struct mce m; + + mce_setup(&m); + m.bank = MCE_THERMAL_BANK; + m.status = status; + mce_log(&m); +} +#endif /* CONFIG_X86_MCE_INTEL */ + +/* + * Periodic polling timer for "silent" machine check errors. If the + * poller finds an MCE, poll 2x faster. When the poller finds no more + * errors, poll 2x slower (up to check_interval seconds). + */ +static unsigned long check_interval = 5 * 60; /* 5 minutes */ + +static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */ +static DEFINE_PER_CPU(struct timer_list, mce_timer); + +static unsigned long mce_adjust_timer_default(unsigned long interval) +{ + return interval; +} + +static unsigned long (*mce_adjust_timer)(unsigned long interval) = + mce_adjust_timer_default; + +static int cmc_error_seen(void) +{ + unsigned long *v = &__get_cpu_var(mce_polled_error); + + return test_and_clear_bit(0, v); +} + +static void mce_timer_fn(unsigned long data) +{ + struct timer_list *t = &__get_cpu_var(mce_timer); + unsigned long iv; + int notify; + + WARN_ON(smp_processor_id() != data); + + if (mce_available(__this_cpu_ptr(&cpu_info))) { + machine_check_poll(MCP_TIMESTAMP, + &__get_cpu_var(mce_poll_banks)); + mce_intel_cmci_poll(); + } + + /* + * Alert userspace if needed. If we logged an MCE, reduce the + * polling interval, otherwise increase the polling interval. + */ + iv = __this_cpu_read(mce_next_interval); + notify = mce_notify_irq(); + notify |= cmc_error_seen(); + if (notify) { + iv = max(iv / 2, (unsigned long) HZ/100); + } else { + iv = min(iv * 2, round_jiffies_relative(check_interval * HZ)); + iv = mce_adjust_timer(iv); + } + __this_cpu_write(mce_next_interval, iv); + /* Might have become 0 after CMCI storm subsided */ + if (iv) { + t->expires = jiffies + iv; + add_timer_on(t, smp_processor_id()); + } +} + +/* + * Ensure that the timer is firing in @interval from now. 
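
The cadence logic in mce_timer_fn() above is multiplicative in both directions: halve the interval when something was logged, double it when quiet, clamped between HZ/100 and check_interval seconds. The bare policy, lifted into a stand-alone helper with plain numbers in place of jiffies (a sketch, not kernel code):

/* Halve on activity, double when idle, clamped to [min_iv, max_iv]. */
unsigned long adapt_interval(unsigned long iv, int saw_event,
                             unsigned long min_iv, unsigned long max_iv)
{
        if (saw_event)
                return iv / 2 > min_iv ? iv / 2 : min_iv;  /* max(iv/2, min) */
        return iv * 2 < max_iv ? iv * 2 : max_iv;          /* min(iv*2, max) */
}
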
+ */ +void mce_timer_kick(unsigned long interval) +{ + struct timer_list *t = &__get_cpu_var(mce_timer); + unsigned long when = jiffies + interval; + unsigned long iv = __this_cpu_read(mce_next_interval); + + if (timer_pending(t)) { + if (time_before(when, t->expires)) + mod_timer_pinned(t, when); + } else { + t->expires = round_jiffies(when); + add_timer_on(t, smp_processor_id()); + } + if (interval < iv) + __this_cpu_write(mce_next_interval, interval); +} + +/* Must not be called in IRQ context where del_timer_sync() can deadlock */ +static void mce_timer_delete_all(void) +{ + int cpu; + + for_each_online_cpu(cpu) + del_timer_sync(&per_cpu(mce_timer, cpu)); +} + +static void mce_do_trigger(struct work_struct *work) +{ + call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT); +} + +static DECLARE_WORK(mce_trigger_work, mce_do_trigger); + +/* + * Notify the user(s) about new machine check events. + * Can be called from interrupt context, but not from machine check/NMI + * context. + */ +int mce_notify_irq(void) +{ + /* Not more than two messages every minute */ + static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); + + if (test_and_clear_bit(0, &mce_need_notify)) { + /* wake processes polling /dev/mcelog */ + wake_up_interruptible(&mce_chrdev_wait); + + if (mce_helper[0]) + schedule_work(&mce_trigger_work); + + if (__ratelimit(&ratelimit)) + pr_info(HW_ERR "Machine check events logged\n"); + + return 1; + } + return 0; +} +EXPORT_SYMBOL_GPL(mce_notify_irq); + +static int __mcheck_cpu_mce_banks_init(void) +{ + int i; + u8 num_banks = mca_cfg.banks; + + mce_banks = kzalloc(num_banks * sizeof(struct mce_bank), GFP_KERNEL); + if (!mce_banks) + return -ENOMEM; + + for (i = 0; i < num_banks; i++) { + struct mce_bank *b = &mce_banks[i]; + + b->ctl = -1ULL; + b->init = 1; + } + return 0; +} + +/* + * Initialize Machine Checks for a CPU. + */ +static int __mcheck_cpu_cap_init(void) +{ + unsigned b; + u64 cap; + + rdmsrl(MSR_IA32_MCG_CAP, cap); + + b = cap & MCG_BANKCNT_MASK; + if (!mca_cfg.banks) + pr_info("CPU supports %d MCE banks\n", b); + + if (b > MAX_NR_BANKS) { + pr_warn("Using only %u machine check banks out of %u\n", + MAX_NR_BANKS, b); + b = MAX_NR_BANKS; + } + + /* Don't support asymmetric configurations today */ + WARN_ON(mca_cfg.banks != 0 && b != mca_cfg.banks); + mca_cfg.banks = b; + + if (!mce_banks) { + int err = __mcheck_cpu_mce_banks_init(); + + if (err) + return err; + } + + /* Use accurate RIP reporting if available. */ + if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9) + mca_cfg.rip_msr = MSR_IA32_MCG_EIP; + + if (cap & MCG_SER_P) + mca_cfg.ser = true; + + return 0; +} + +static void __mcheck_cpu_init_generic(void) +{ + enum mcp_flags m_fl = 0; + mce_banks_t all_banks; + u64 cap; + int i; + + if (!mca_cfg.bootlog) + m_fl = MCP_DONTLOG; + + /* + * Log the machine checks left over from the previous reset. + */ + bitmap_fill(all_banks, MAX_NR_BANKS); + machine_check_poll(MCP_UC | m_fl, &all_banks); + + set_in_cr4(X86_CR4_MCE); + + rdmsrl(MSR_IA32_MCG_CAP, cap); + if (cap & MCG_CTL_P) + wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); + + for (i = 0; i < mca_cfg.banks; i++) { + struct mce_bank *b = &mce_banks[i]; + + if (!b->init) + continue; + wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl); + wrmsrl(MSR_IA32_MCx_STATUS(i), 0); + } +} + +/* + * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and + * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM + * Vol 3B Table 15-20). 
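
__mcheck_cpu_cap_init() above is bitfield surgery on MSR_IA32_MCG_CAP. As a reference for the masks it uses, here is one way to decode that register into named fields (bit positions per the Intel SDM; the struct and helper are ours, not kernel definitions):

#include <stdbool.h>
#include <stdint.h>

struct mcg_cap_fields {
        unsigned bank_count;    /* bits 7:0,   MCG_BANKCNT_MASK */
        bool     ctl_p;         /* bit 8,      MCG_CTL_P        */
        bool     ext_p;         /* bit 9,      MCG_EXT_P        */
        unsigned ext_cnt;       /* bits 23:16, MCG_EXT_CNT(cap) */
        bool     ser_p;         /* bit 24,     MCG_SER_P        */
};

struct mcg_cap_fields decode_mcg_cap(uint64_t cap)
{
        struct mcg_cap_fields c = {
                .bank_count = cap & 0xff,
                .ctl_p      = cap & (1ULL << 8),
                .ext_p      = cap & (1ULL << 9),
                .ext_cnt    = (cap >> 16) & 0xff,
                .ser_p      = cap & (1ULL << 24),
        };
        return c;
}
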
But this confuses both the code that determines + * whether the machine check occurred in kernel or user mode, and also + * the severity assessment code. Pretend that EIPV was set, and take the + * ip/cs values from the pt_regs that mce_gather_info() ignored earlier. + */ +static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs) +{ + if (bank != 0) + return; + if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0) + return; + if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC| + MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV| + MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR| + MCACOD)) != + (MCI_STATUS_UC|MCI_STATUS_EN| + MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S| + MCI_STATUS_AR|MCACOD_INSTR)) + return; + + m->mcgstatus |= MCG_STATUS_EIPV; + m->ip = regs->ip; + m->cs = regs->cs; +} + +/* Add per CPU specific workarounds here */ +static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) +{ + struct mca_config *cfg = &mca_cfg; + + if (c->x86_vendor == X86_VENDOR_UNKNOWN) { + pr_info("unknown CPU type - not enabling MCE support\n"); + return -EOPNOTSUPP; + } + + /* This should be disabled by the BIOS, but isn't always */ + if (c->x86_vendor == X86_VENDOR_AMD) { + if (c->x86 == 15 && cfg->banks > 4) { + /* + * disable GART TBL walk error reporting, which + * trips off incorrectly with the IOMMU & 3ware + * & Cerberus: + */ + clear_bit(10, (unsigned long *)&mce_banks[4].ctl); + } + if (c->x86 <= 17 && cfg->bootlog < 0) { + /* + * Lots of broken BIOS around that don't clear them + * by default and leave crap in there. Don't log: + */ + cfg->bootlog = 0; + } + /* + * Various K7s with broken bank 0 around. Always disable + * by default. + */ + if (c->x86 == 6 && cfg->banks > 0) + mce_banks[0].ctl = 0; + + /* + * Turn off MC4_MISC thresholding banks on those models since + * they're not supported there. + */ + if (c->x86 == 0x15 && + (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) { + int i; + u64 val, hwcr; + bool need_toggle; + u32 msrs[] = { + 0x00000413, /* MC4_MISC0 */ + 0xc0000408, /* MC4_MISC1 */ + }; + + rdmsrl(MSR_K7_HWCR, hwcr); + + /* McStatusWrEn has to be set */ + need_toggle = !(hwcr & BIT(18)); + + if (need_toggle) + wrmsrl(MSR_K7_HWCR, hwcr | BIT(18)); + + for (i = 0; i < ARRAY_SIZE(msrs); i++) { + rdmsrl(msrs[i], val); + + /* CntP bit set? */ + if (val & BIT_64(62)) { + val &= ~BIT_64(62); + wrmsrl(msrs[i], val); + } + } + + /* restore old settings */ + if (need_toggle) + wrmsrl(MSR_K7_HWCR, hwcr); + } + } + + if (c->x86_vendor == X86_VENDOR_INTEL) { + /* + * SDM documents that on family 6 bank 0 should not be written + * because it aliases to another special BIOS controlled + * register. + * But it's not aliased anymore on model 0x1a+ + * Don't ignore bank 0 completely because there could be a + * valid event later, merely don't write CTL0. + */ + + if (c->x86 == 6 && c->x86_model < 0x1A && cfg->banks > 0) + mce_banks[0].init = 0; + + /* + * All newer Intel systems support MCE broadcasting. Enable + * synchronization with a one second timeout. 
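
The MC4_MISC thresholding fix above follows a guard-bit pattern: McStatusWrEn (HWCR bit 18) must be set before those registers accept writes, and HWCR is put back exactly as found. The pattern reduced to a stand-alone sketch, with plain variables standing in for the rdmsrl/wrmsrl pairs:

#include <stdint.h>

void clear_cntp_guarded(uint64_t *hwcr, uint64_t *misc)
{
        const uint64_t MCSTATUS_WREN = 1ULL << 18;
        const uint64_t CNTP          = 1ULL << 62;
        uint64_t saved = *hwcr;
        int need_toggle = !(*hwcr & MCSTATUS_WREN);

        if (need_toggle)
                *hwcr |= MCSTATUS_WREN;         /* unlock status writes */

        if (*misc & CNTP)
                *misc &= ~CNTP;                 /* turn the counter off */

        if (need_toggle)
                *hwcr = saved;                  /* put HWCR back as found */
}
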
+ */ + if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) && + cfg->monarch_timeout < 0) + cfg->monarch_timeout = USEC_PER_SEC; + + /* + * There are also broken BIOSes on some Pentium M and + * earlier systems: + */ + if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0) + cfg->bootlog = 0; + + if (c->x86 == 6 && c->x86_model == 45) + quirk_no_way_out = quirk_sandybridge_ifu; + } + if (cfg->monarch_timeout < 0) + cfg->monarch_timeout = 0; + if (cfg->bootlog != 0) + cfg->panic_timeout = 30; + + return 0; +} + +static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) +{ + if (c->x86 != 5) + return 0; + + switch (c->x86_vendor) { + case X86_VENDOR_INTEL: + intel_p5_mcheck_init(c); + return 1; + break; + case X86_VENDOR_CENTAUR: + winchip_mcheck_init(c); + return 1; + break; + } + + return 0; +} + +static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) +{ + switch (c->x86_vendor) { + case X86_VENDOR_INTEL: + mce_intel_feature_init(c); + mce_adjust_timer = mce_intel_adjust_timer; + break; + case X86_VENDOR_AMD: + mce_amd_feature_init(c); + break; + default: + break; + } +} + +static void mce_start_timer(unsigned int cpu, struct timer_list *t) +{ + unsigned long iv = check_interval * HZ; + + if (mca_cfg.ignore_ce || !iv) + return; + + per_cpu(mce_next_interval, cpu) = iv; + + t->expires = round_jiffies(jiffies + iv); + add_timer_on(t, cpu); +} + +static void __mcheck_cpu_init_timer(void) +{ + struct timer_list *t = &__get_cpu_var(mce_timer); + unsigned int cpu = smp_processor_id(); + + setup_timer(t, mce_timer_fn, cpu); + mce_start_timer(cpu, t); +} + +/* Handle unconfigured int18 (should never happen) */ +static void unexpected_machine_check(struct pt_regs *regs, long error_code) +{ + pr_err("CPU#%d: Unexpected int18 (Machine Check)\n", + smp_processor_id()); +} + +/* Call the installed machine check handler for this CPU setup. */ +void (*machine_check_vector)(struct pt_regs *, long error_code) = + unexpected_machine_check; + +/* + * Called for each booted CPU to set up machine checks. + * Must be called with preempt off: + */ +void mcheck_cpu_init(struct cpuinfo_x86 *c) +{ + if (mca_cfg.disabled) + return; + + if (__mcheck_cpu_ancient_init(c)) + return; + + if (!mce_available(c)) + return; + + if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) { + mca_cfg.disabled = true; + return; + } + + machine_check_vector = do_machine_check; + + __mcheck_cpu_init_generic(); + __mcheck_cpu_init_vendor(c); + __mcheck_cpu_init_timer(); + INIT_WORK(&__get_cpu_var(mce_work), mce_process_work); + init_irq_work(&__get_cpu_var(mce_irq_work), &mce_irq_work_cb); +} + +/* + * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log. + */ + +static DEFINE_SPINLOCK(mce_chrdev_state_lock); +static int mce_chrdev_open_count; /* #times opened */ +static int mce_chrdev_open_exclu; /* already open exclusive? 
*/ + +static int mce_chrdev_open(struct inode *inode, struct file *file) +{ + spin_lock(&mce_chrdev_state_lock); + + if (mce_chrdev_open_exclu || + (mce_chrdev_open_count && (file->f_flags & O_EXCL))) { + spin_unlock(&mce_chrdev_state_lock); + + return -EBUSY; + } + + if (file->f_flags & O_EXCL) + mce_chrdev_open_exclu = 1; + mce_chrdev_open_count++; + + spin_unlock(&mce_chrdev_state_lock); + + return nonseekable_open(inode, file); +} + +static int mce_chrdev_release(struct inode *inode, struct file *file) +{ + spin_lock(&mce_chrdev_state_lock); + + mce_chrdev_open_count--; + mce_chrdev_open_exclu = 0; + + spin_unlock(&mce_chrdev_state_lock); + + return 0; +} + +static void collect_tscs(void *data) +{ + unsigned long *cpu_tsc = (unsigned long *)data; + + rdtscll(cpu_tsc[smp_processor_id()]); +} + +static int mce_apei_read_done; + +/* Collect MCE record of previous boot in persistent storage via APEI ERST. */ +static int __mce_read_apei(char __user **ubuf, size_t usize) +{ + int rc; + u64 record_id; + struct mce m; + + if (usize < sizeof(struct mce)) + return -EINVAL; + + rc = apei_read_mce(&m, &record_id); + /* Error or no more MCE record */ + if (rc <= 0) { + mce_apei_read_done = 1; + /* + * When ERST is disabled, mce_chrdev_read() should return + * "no record" instead of "no device." + */ + if (rc == -ENODEV) + return 0; + return rc; + } + rc = -EFAULT; + if (copy_to_user(*ubuf, &m, sizeof(struct mce))) + return rc; + /* + * In fact, we should have cleared the record after that has + * been flushed to the disk or sent to network in + * /sbin/mcelog, but we have no interface to support that now, + * so just clear it to avoid duplication. + */ + rc = apei_clear_mce(record_id); + if (rc) { + mce_apei_read_done = 1; + return rc; + } + *ubuf += sizeof(struct mce); + + return 0; +} + +static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf, + size_t usize, loff_t *off) +{ + char __user *buf = ubuf; + unsigned long *cpu_tsc; + unsigned prev, next; + int i, err; + + cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL); + if (!cpu_tsc) + return -ENOMEM; + + mutex_lock(&mce_chrdev_read_mutex); + + if (!mce_apei_read_done) { + err = __mce_read_apei(&buf, usize); + if (err || buf != ubuf) + goto out; + } + + next = rcu_dereference_check_mce(mcelog.next); + + /* Only supports full reads right now */ + err = -EINVAL; + if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) + goto out; + + err = 0; + prev = 0; + do { + for (i = prev; i < next; i++) { + unsigned long start = jiffies; + struct mce *m = &mcelog.entry[i]; + + while (!m->finished) { + if (time_after_eq(jiffies, start + 2)) { + memset(m, 0, sizeof(*m)); + goto timeout; + } + cpu_relax(); + } + smp_rmb(); + err |= copy_to_user(buf, m, sizeof(*m)); + buf += sizeof(*m); +timeout: + ; + } + + memset(mcelog.entry + prev, 0, + (next - prev) * sizeof(struct mce)); + prev = next; + next = cmpxchg(&mcelog.next, prev, 0); + } while (next != prev); + + synchronize_sched(); + + /* + * Collect entries that were still getting written before the + * synchronize. + */ + on_each_cpu(collect_tscs, cpu_tsc, 1); + + for (i = next; i < MCE_LOG_LEN; i++) { + struct mce *m = &mcelog.entry[i]; + + if (m->finished && m->tsc < cpu_tsc[m->cpu]) { + err |= copy_to_user(buf, m, sizeof(*m)); + smp_rmb(); + buf += sizeof(*m); + memset(m, 0, sizeof(*m)); + } + } + + if (err) + err = -EFAULT; + +out: + mutex_unlock(&mce_chrdev_read_mutex); + kfree(cpu_tsc); + + return err ? 
err : buf - ubuf; +} + +static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait) +{ + poll_wait(file, &mce_chrdev_wait, wait); + if (rcu_access_index(mcelog.next)) + return POLLIN | POLLRDNORM; + if (!mce_apei_read_done && apei_check_mce()) + return POLLIN | POLLRDNORM; + return 0; +} + +static long mce_chrdev_ioctl(struct file *f, unsigned int cmd, + unsigned long arg) +{ + int __user *p = (int __user *)arg; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + switch (cmd) { + case MCE_GET_RECORD_LEN: + return put_user(sizeof(struct mce), p); + case MCE_GET_LOG_LEN: + return put_user(MCE_LOG_LEN, p); + case MCE_GETCLEAR_FLAGS: { + unsigned flags; + + do { + flags = mcelog.flags; + } while (cmpxchg(&mcelog.flags, flags, 0) != flags); + + return put_user(flags, p); + } + default: + return -ENOTTY; + } +} + +static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf, + size_t usize, loff_t *off); + +void register_mce_write_callback(ssize_t (*fn)(struct file *filp, + const char __user *ubuf, + size_t usize, loff_t *off)) +{ + mce_write = fn; +} +EXPORT_SYMBOL_GPL(register_mce_write_callback); + +ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf, + size_t usize, loff_t *off) +{ + if (mce_write) + return mce_write(filp, ubuf, usize, off); + else + return -EINVAL; +} + +static const struct file_operations mce_chrdev_ops = { + .open = mce_chrdev_open, + .release = mce_chrdev_release, + .read = mce_chrdev_read, + .write = mce_chrdev_write, + .poll = mce_chrdev_poll, + .unlocked_ioctl = mce_chrdev_ioctl, + .llseek = no_llseek, +}; + +static struct miscdevice mce_chrdev_device = { + MISC_MCELOG_MINOR, + "mcelog", + &mce_chrdev_ops, +}; + +static void __mce_disable_bank(void *arg) +{ + int bank = *((int *)arg); + __clear_bit(bank, __get_cpu_var(mce_poll_banks)); + cmci_disable_bank(bank); +} + +void mce_disable_bank(int bank) +{ + if (bank >= mca_cfg.banks) { + pr_warn(FW_BUG + "Ignoring request to disable invalid MCA bank %d.\n", + bank); + return; + } + set_bit(bank, mce_banks_ce_disabled); + on_each_cpu(__mce_disable_bank, &bank, 1); +} + +/* + * mce=off Disables machine check + * mce=no_cmci Disables CMCI + * mce=dont_log_ce Clears corrected events silently, no log created for CEs. + * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared. + * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above) + * monarchtimeout is how long to wait for other CPUs on machine + * check, or 0 to not wait + * mce=bootlog Log MCEs from before booting. Disabled by default on AMD. + * mce=nobootlog Don't log MCEs from before booting. + * mce=bios_cmci_threshold Don't program the CMCI threshold + */ +static int __init mcheck_enable(char *str) +{ + struct mca_config *cfg = &mca_cfg; + + if (*str == 0) { + enable_p5_mce(); + return 1; + } + if (*str == '=') + str++; + if (!strcmp(str, "off")) + cfg->disabled = true; + else if (!strcmp(str, "no_cmci")) + cfg->cmci_disabled = true; + else if (!strcmp(str, "dont_log_ce")) + cfg->dont_log_ce = true; + else if (!strcmp(str, "ignore_ce")) + cfg->ignore_ce = true; + else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) + cfg->bootlog = (str[0] == 'b'); + else if (!strcmp(str, "bios_cmci_threshold")) + cfg->bios_cmci_threshold = true; + else if (isdigit(str[0])) { + get_option(&str, &(cfg->tolerant)); + if (*str == ',') { + ++str; + get_option(&str, &(cfg->monarch_timeout)); + } + } else { + pr_info("mce argument %s ignored. 
Please use /sys\n", str); + return 0; + } + return 1; +} +__setup("mce", mcheck_enable); + +int __init mcheck_init(void) +{ + mcheck_intel_therm_init(); + + return 0; +} + +/* + * mce_syscore: PM support + */ + +/* + * Disable machine checks on suspend and shutdown. We can't really handle + * them later. + */ +static int mce_disable_error_reporting(void) +{ + int i; + + for (i = 0; i < mca_cfg.banks; i++) { + struct mce_bank *b = &mce_banks[i]; + + if (b->init) + wrmsrl(MSR_IA32_MCx_CTL(i), 0); + } + return 0; +} + +static int mce_syscore_suspend(void) +{ + return mce_disable_error_reporting(); +} + +static void mce_syscore_shutdown(void) +{ + mce_disable_error_reporting(); +} + +/* + * On resume clear all MCE state. Don't want to see leftovers from the BIOS. + * Only one CPU is active at this time, the others get re-added later using + * CPU hotplug: + */ +static void mce_syscore_resume(void) +{ + __mcheck_cpu_init_generic(); + __mcheck_cpu_init_vendor(__this_cpu_ptr(&cpu_info)); +} + +static struct syscore_ops mce_syscore_ops = { + .suspend = mce_syscore_suspend, + .shutdown = mce_syscore_shutdown, + .resume = mce_syscore_resume, +}; + +/* + * mce_device: Sysfs support + */ + +static void mce_cpu_restart(void *data) +{ + if (!mce_available(__this_cpu_ptr(&cpu_info))) + return; + __mcheck_cpu_init_generic(); + __mcheck_cpu_init_timer(); +} + +/* Reinit MCEs after user configuration changes */ +static void mce_restart(void) +{ + mce_timer_delete_all(); + on_each_cpu(mce_cpu_restart, NULL, 1); +} + +/* Toggle features for corrected errors */ +static void mce_disable_cmci(void *data) +{ + if (!mce_available(__this_cpu_ptr(&cpu_info))) + return; + cmci_clear(); +} + +static void mce_enable_ce(void *all) +{ + if (!mce_available(__this_cpu_ptr(&cpu_info))) + return; + cmci_reenable(); + cmci_recheck(); + if (all) + __mcheck_cpu_init_timer(); +} + +static struct bus_type mce_subsys = { + .name = "machinecheck", + .dev_name = "machinecheck", +}; + +DEFINE_PER_CPU(struct device *, mce_device); + +void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); + +static inline struct mce_bank *attr_to_bank(struct device_attribute *attr) +{ + return container_of(attr, struct mce_bank, attr); +} + +static ssize_t show_bank(struct device *s, struct device_attribute *attr, + char *buf) +{ + return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl); +} + +static ssize_t set_bank(struct device *s, struct device_attribute *attr, + const char *buf, size_t size) +{ + u64 new; + + if (strict_strtoull(buf, 0, &new) < 0) + return -EINVAL; + + attr_to_bank(attr)->ctl = new; + mce_restart(); + + return size; +} + +static ssize_t +show_trigger(struct device *s, struct device_attribute *attr, char *buf) +{ + strcpy(buf, mce_helper); + strcat(buf, "\n"); + return strlen(mce_helper) + 1; +} + +static ssize_t set_trigger(struct device *s, struct device_attribute *attr, + const char *buf, size_t siz) +{ + char *p; + + strncpy(mce_helper, buf, sizeof(mce_helper)); + mce_helper[sizeof(mce_helper)-1] = 0; + p = strchr(mce_helper, '\n'); + + if (p) + *p = 0; + + return strlen(mce_helper) + !!p; +} + +static ssize_t set_ignore_ce(struct device *s, + struct device_attribute *attr, + const char *buf, size_t size) +{ + u64 new; + + if (strict_strtoull(buf, 0, &new) < 0) + return -EINVAL; + + if (mca_cfg.ignore_ce ^ !!new) { + if (new) { + /* disable ce features */ + mce_timer_delete_all(); + on_each_cpu(mce_disable_cmci, NULL, 1); + mca_cfg.ignore_ce = true; + } else { + /* enable ce features */ + mca_cfg.ignore_ce = 
false; + on_each_cpu(mce_enable_ce, (void *)1, 1); + } + } + return size; +} + +static ssize_t set_cmci_disabled(struct device *s, + struct device_attribute *attr, + const char *buf, size_t size) +{ + u64 new; + + if (strict_strtoull(buf, 0, &new) < 0) + return -EINVAL; + + if (mca_cfg.cmci_disabled ^ !!new) { + if (new) { + /* disable cmci */ + on_each_cpu(mce_disable_cmci, NULL, 1); + mca_cfg.cmci_disabled = true; + } else { + /* enable cmci */ + mca_cfg.cmci_disabled = false; + on_each_cpu(mce_enable_ce, NULL, 1); + } + } + return size; +} + +static ssize_t store_int_with_restart(struct device *s, + struct device_attribute *attr, + const char *buf, size_t size) +{ + ssize_t ret = device_store_int(s, attr, buf, size); + mce_restart(); + return ret; +} + +static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger); +static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant); +static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout); +static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce); + +static struct dev_ext_attribute dev_attr_check_interval = { + __ATTR(check_interval, 0644, device_show_int, store_int_with_restart), + &check_interval +}; + +static struct dev_ext_attribute dev_attr_ignore_ce = { + __ATTR(ignore_ce, 0644, device_show_bool, set_ignore_ce), + &mca_cfg.ignore_ce +}; + +static struct dev_ext_attribute dev_attr_cmci_disabled = { + __ATTR(cmci_disabled, 0644, device_show_bool, set_cmci_disabled), + &mca_cfg.cmci_disabled +}; + +static struct device_attribute *mce_device_attrs[] = { + &dev_attr_tolerant.attr, + &dev_attr_check_interval.attr, + &dev_attr_trigger, + &dev_attr_monarch_timeout.attr, + &dev_attr_dont_log_ce.attr, + &dev_attr_ignore_ce.attr, + &dev_attr_cmci_disabled.attr, + NULL +}; + +static cpumask_var_t mce_device_initialized; + +static void mce_device_release(struct device *dev) +{ + kfree(dev); +} + +/* Per cpu device init. All of the cpus still share the same ctrl bank: */ +static int mce_device_create(unsigned int cpu) +{ + struct device *dev; + int err; + int i, j; + + if (!mce_available(&boot_cpu_data)) + return -EIO; + + dev = kzalloc(sizeof *dev, GFP_KERNEL); + if (!dev) + return -ENOMEM; + dev->id = cpu; + dev->bus = &mce_subsys; + dev->release = &mce_device_release; + + err = device_register(dev); + if (err) { + put_device(dev); + return err; + } + + for (i = 0; mce_device_attrs[i]; i++) { + err = device_create_file(dev, mce_device_attrs[i]); + if (err) + goto error; + } + for (j = 0; j < mca_cfg.banks; j++) { + err = device_create_file(dev, &mce_banks[j].attr); + if (err) + goto error2; + } + cpumask_set_cpu(cpu, mce_device_initialized); + per_cpu(mce_device, cpu) = dev; + + return 0; +error2: + while (--j >= 0) + device_remove_file(dev, &mce_banks[j].attr); +error: + while (--i >= 0) + device_remove_file(dev, mce_device_attrs[i]); + + device_unregister(dev); + + return err; +} + +static void mce_device_remove(unsigned int cpu) +{ + struct device *dev = per_cpu(mce_device, cpu); + int i; + + if (!cpumask_test_cpu(cpu, mce_device_initialized)) + return; + + for (i = 0; mce_device_attrs[i]; i++) + device_remove_file(dev, mce_device_attrs[i]); + + for (i = 0; i < mca_cfg.banks; i++) + device_remove_file(dev, &mce_banks[i].attr); + + device_unregister(dev); + cpumask_clear_cpu(cpu, mce_device_initialized); + per_cpu(mce_device, cpu) = NULL; +} + +/* Make sure there are no machine checks on offlined CPUs. 
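
mce_device_create() above is the classic create-with-rollback shape: on the first failure, unwind in reverse exactly the attributes that were created, then unregister the device. A runnable toy of the skeleton (the create/remove stand-ins are ours, and the failure at step 3 is staged for the demo):

#include <stdio.h>

int create_one(int i)  { printf("create %d\n", i); return i == 3 ? -1 : 0; }
void remove_one(int i) { printf("remove %d\n", i); }

int create_all(int n)
{
        int i, err = 0;

        for (i = 0; i < n; i++) {
                err = create_one(i);
                if (err)
                        goto undo;
        }
        return 0;
undo:
        while (--i >= 0)        /* unwind, in reverse, only what succeeded */
                remove_one(i);
        return err;
}

int main(void)
{
        return create_all(5) ? 1 : 0;   /* fails at 3, removes 2, 1, 0 */
}
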
*/ +static void mce_disable_cpu(void *h) +{ + unsigned long action = *(unsigned long *)h; + int i; + + if (!mce_available(__this_cpu_ptr(&cpu_info))) + return; + + if (!(action & CPU_TASKS_FROZEN)) + cmci_clear(); + for (i = 0; i < mca_cfg.banks; i++) { + struct mce_bank *b = &mce_banks[i]; + + if (b->init) + wrmsrl(MSR_IA32_MCx_CTL(i), 0); + } +} + +static void mce_reenable_cpu(void *h) +{ + unsigned long action = *(unsigned long *)h; + int i; + + if (!mce_available(__this_cpu_ptr(&cpu_info))) + return; + + if (!(action & CPU_TASKS_FROZEN)) + cmci_reenable(); + for (i = 0; i < mca_cfg.banks; i++) { + struct mce_bank *b = &mce_banks[i]; + + if (b->init) + wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl); + } +} + +/* Get notified when a cpu comes on/off. Be hotplug friendly. */ +static int +mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + struct timer_list *t = &per_cpu(mce_timer, cpu); + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_ONLINE: + mce_device_create(cpu); + if (threshold_cpu_callback) + threshold_cpu_callback(action, cpu); + break; + case CPU_DEAD: + if (threshold_cpu_callback) + threshold_cpu_callback(action, cpu); + mce_device_remove(cpu); + mce_intel_hcpu_update(cpu); + break; + case CPU_DOWN_PREPARE: + smp_call_function_single(cpu, mce_disable_cpu, &action, 1); + del_timer_sync(t); + break; + case CPU_DOWN_FAILED: + smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); + mce_start_timer(cpu, t); + break; + } + + if (action == CPU_POST_DEAD) { + /* intentionally ignoring frozen here */ + cmci_rediscover(); + } + + return NOTIFY_OK; +} + +static struct notifier_block mce_cpu_notifier = { + .notifier_call = mce_cpu_callback, +}; + +static __init void mce_init_banks(void) +{ + int i; + + for (i = 0; i < mca_cfg.banks; i++) { + struct mce_bank *b = &mce_banks[i]; + struct device_attribute *a = &b->attr; + + sysfs_attr_init(&a->attr); + a->attr.name = b->attrname; + snprintf(b->attrname, ATTR_LEN, "bank%d", i); + + a->attr.mode = 0644; + a->show = show_bank; + a->store = set_bank; + } +} + +static __init int mcheck_init_device(void) +{ + int err; + int i = 0; + + if (!mce_available(&boot_cpu_data)) { + err = -EIO; + goto err_out; + } + + if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) { + err = -ENOMEM; + goto err_out; + } + + mce_init_banks(); + + err = subsys_system_register(&mce_subsys, NULL); + if (err) + goto err_out_mem; + + cpu_notifier_register_begin(); + for_each_online_cpu(i) { + err = mce_device_create(i); + if (err) { + /* + * Register notifier anyway (and do not unreg it) so + * that we don't leave undeleted timers, see notifier + * callback above. + */ + __register_hotcpu_notifier(&mce_cpu_notifier); + cpu_notifier_register_done(); + goto err_device_create; + } + } + + __register_hotcpu_notifier(&mce_cpu_notifier); + cpu_notifier_register_done(); + + register_syscore_ops(&mce_syscore_ops); + + /* register character device /dev/mcelog */ + err = misc_register(&mce_chrdev_device); + if (err) + goto err_register; + + return 0; + +err_register: + unregister_syscore_ops(&mce_syscore_ops); + +err_device_create: + /* + * We didn't keep track of which devices were created above, but + * even if we had, the set of online cpus might have changed. + * Play safe and remove for every possible cpu, since + * mce_device_remove() will do the right thing. 
+ */ + for_each_possible_cpu(i) + mce_device_remove(i); + +err_out_mem: + free_cpumask_var(mce_device_initialized); + +err_out: + pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err); + + return err; +} +device_initcall_sync(mcheck_init_device); + +/* + * Old style boot options parsing. Only for compatibility. + */ +static int __init mcheck_disable(char *str) +{ + mca_cfg.disabled = true; + return 1; +} +__setup("nomce", mcheck_disable); + +#ifdef CONFIG_DEBUG_FS +struct dentry *mce_get_debugfs_dir(void) +{ + static struct dentry *dmce; + + if (!dmce) + dmce = debugfs_create_dir("mce", NULL); + + return dmce; +} + +static void mce_reset(void) +{ + cpu_missing = 0; + atomic_set(&mce_fake_paniced, 0); + atomic_set(&mce_executing, 0); + atomic_set(&mce_callin, 0); + atomic_set(&global_nwo, 0); +} + +static int fake_panic_get(void *data, u64 *val) +{ + *val = fake_panic; + return 0; +} + +static int fake_panic_set(void *data, u64 val) +{ + mce_reset(); + fake_panic = val; + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get, + fake_panic_set, "%llu\n"); + +static int __init mcheck_debugfs_init(void) +{ + struct dentry *dmce, *ffake_panic; + + dmce = mce_get_debugfs_dir(); + if (!dmce) + return -ENOMEM; + ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL, + &fake_panic_fops); + if (!ffake_panic) + return -ENOMEM; + + return 0; +} +late_initcall(mcheck_debugfs_init); +#endif diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c new file mode 100644 index 00000000000..603df4f7464 --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -0,0 +1,789 @@ +/* + * (c) 2005-2012 Advanced Micro Devices, Inc. + * Your use of this code is subject to the terms and conditions of the + * GNU general public license version 2. See "COPYING" or + * http://www.gnu.org/licenses/gpl.html + * + * Written by Jacob Shin - AMD, Inc. 
+ * + * Maintained by: Borislav Petkov <bp@alien8.de> + * + * April 2006 + * - added support for AMD Family 0x10 processors + * May 2012 + * - major scrubbing + * + * All MC4_MISCi registers are shared between multi-cores + */ +#include <linux/interrupt.h> +#include <linux/notifier.h> +#include <linux/kobject.h> +#include <linux/percpu.h> +#include <linux/errno.h> +#include <linux/sched.h> +#include <linux/sysfs.h> +#include <linux/slab.h> +#include <linux/init.h> +#include <linux/cpu.h> +#include <linux/smp.h> + +#include <asm/amd_nb.h> +#include <asm/apic.h> +#include <asm/idle.h> +#include <asm/mce.h> +#include <asm/msr.h> + +#define NR_BLOCKS 9 +#define THRESHOLD_MAX 0xFFF +#define INT_TYPE_APIC 0x00020000 +#define MASK_VALID_HI 0x80000000 +#define MASK_CNTP_HI 0x40000000 +#define MASK_LOCKED_HI 0x20000000 +#define MASK_LVTOFF_HI 0x00F00000 +#define MASK_COUNT_EN_HI 0x00080000 +#define MASK_INT_TYPE_HI 0x00060000 +#define MASK_OVERFLOW_HI 0x00010000 +#define MASK_ERR_COUNT_HI 0x00000FFF +#define MASK_BLKPTR_LO 0xFF000000 +#define MCG_XBLK_ADDR 0xC0000400 + +static const char * const th_names[] = { + "load_store", + "insn_fetch", + "combined_unit", + "", + "northbridge", + "execution_unit", +}; + +static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks); +static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ + +static void amd_threshold_interrupt(void); + +/* + * CPU Initialization + */ + +struct thresh_restart { + struct threshold_block *b; + int reset; + int set_lvt_off; + int lvt_off; + u16 old_limit; +}; + +static inline bool is_shared_bank(int bank) +{ + /* Bank 4 is for northbridge reporting and is thus shared */ + return (bank == 4); +} + +static const char * const bank4_names(struct threshold_block *b) +{ + switch (b->address) { + /* MSR4_MISC0 */ + case 0x00000413: + return "dram"; + + case 0xc0000408: + return "ht_links"; + + case 0xc0000409: + return "l3_cache"; + + default: + WARN(1, "Funny MSR: 0x%08x\n", b->address); + return ""; + } +}; + + +static bool lvt_interrupt_supported(unsigned int bank, u32 msr_high_bits) +{ + /* + * bank 4 supports APIC LVT interrupts implicitly since forever. + */ + if (bank == 4) + return true; + + /* + * IntP: interrupt present; if this bit is set, the thresholding + * bank can generate APIC LVT interrupts + */ + return msr_high_bits & BIT(28); +} + +static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi) +{ + int msr = (hi & MASK_LVTOFF_HI) >> 20; + + if (apic < 0) { + pr_err(FW_BUG "cpu %d, failed to setup threshold interrupt " + "for bank %d, block %d (MSR%08X=0x%x%08x)\n", b->cpu, + b->bank, b->block, b->address, hi, lo); + return 0; + } + + if (apic != msr) { + pr_err(FW_BUG "cpu %d, invalid threshold interrupt offset %d " + "for bank %d, block %d (MSR%08X=0x%x%08x)\n", + b->cpu, apic, b->bank, b->block, b->address, hi, lo); + return 0; + } + + return 1; +}; + +/* + * Called via smp_call_function_single(), must be called with correct + * cpu affinity. 
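
All of the MASK_*_HI constants above carve fields out of the high 32 bits of a thresholding bank's MC_MISC register. Decoded into named fields for reference (layout as implied by the masks; the struct and helper are ours, not kernel code):

#include <stdbool.h>
#include <stdint.h>

struct thresh_hi {
        bool     valid;         /* MASK_VALID_HI,     bit 31      */
        bool     cntp;          /* MASK_CNTP_HI,      bit 30      */
        bool     locked;        /* MASK_LOCKED_HI,    bit 29      */
        unsigned lvt_off;       /* MASK_LVTOFF_HI,    bits 23:20  */
        bool     cnt_en;        /* MASK_COUNT_EN_HI,  bit 19      */
        bool     overflow;      /* MASK_OVERFLOW_HI,  bit 16      */
        unsigned err_count;     /* MASK_ERR_COUNT_HI, bits 11:0   */
};

struct thresh_hi decode_hi(uint32_t hi)
{
        struct thresh_hi t = {
                .valid     = hi & 0x80000000u,
                .cntp      = hi & 0x40000000u,
                .locked    = hi & 0x20000000u,
                .lvt_off   = (hi & 0x00F00000u) >> 20,
                .cnt_en    = hi & 0x00080000u,
                .overflow  = hi & 0x00010000u,
                .err_count = hi & 0x00000FFFu,
        };
        return t;
}
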
+ */ +static void threshold_restart_bank(void *_tr) +{ + struct thresh_restart *tr = _tr; + u32 hi, lo; + + rdmsr(tr->b->address, lo, hi); + + if (tr->b->threshold_limit < (hi & THRESHOLD_MAX)) + tr->reset = 1; /* limit cannot be lower than err count */ + + if (tr->reset) { /* reset err count and overflow bit */ + hi = + (hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) | + (THRESHOLD_MAX - tr->b->threshold_limit); + } else if (tr->old_limit) { /* change limit w/o reset */ + int new_count = (hi & THRESHOLD_MAX) + + (tr->old_limit - tr->b->threshold_limit); + + hi = (hi & ~MASK_ERR_COUNT_HI) | + (new_count & THRESHOLD_MAX); + } + + /* clear IntType */ + hi &= ~MASK_INT_TYPE_HI; + + if (!tr->b->interrupt_capable) + goto done; + + if (tr->set_lvt_off) { + if (lvt_off_valid(tr->b, tr->lvt_off, lo, hi)) { + /* set new lvt offset */ + hi &= ~MASK_LVTOFF_HI; + hi |= tr->lvt_off << 20; + } + } + + if (tr->b->interrupt_enable) + hi |= INT_TYPE_APIC; + + done: + + hi |= MASK_COUNT_EN_HI; + wrmsr(tr->b->address, lo, hi); +} + +static void mce_threshold_block_init(struct threshold_block *b, int offset) +{ + struct thresh_restart tr = { + .b = b, + .set_lvt_off = 1, + .lvt_off = offset, + }; + + b->threshold_limit = THRESHOLD_MAX; + threshold_restart_bank(&tr); +}; + +static int setup_APIC_mce(int reserved, int new) +{ + if (reserved < 0 && !setup_APIC_eilvt(new, THRESHOLD_APIC_VECTOR, + APIC_EILVT_MSG_FIX, 0)) + return new; + + return reserved; +} + +/* cpu init entry point, called from mce.c with preempt off */ +void mce_amd_feature_init(struct cpuinfo_x86 *c) +{ + struct threshold_block b; + unsigned int cpu = smp_processor_id(); + u32 low = 0, high = 0, address = 0; + unsigned int bank, block; + int offset = -1; + + for (bank = 0; bank < mca_cfg.banks; ++bank) { + for (block = 0; block < NR_BLOCKS; ++block) { + if (block == 0) + address = MSR_IA32_MC0_MISC + bank * 4; + else if (block == 1) { + address = (low & MASK_BLKPTR_LO) >> 21; + if (!address) + break; + + address += MCG_XBLK_ADDR; + } else + ++address; + + if (rdmsr_safe(address, &low, &high)) + break; + + if (!(high & MASK_VALID_HI)) + continue; + + if (!(high & MASK_CNTP_HI) || + (high & MASK_LOCKED_HI)) + continue; + + if (!block) + per_cpu(bank_map, cpu) |= (1 << bank); + + memset(&b, 0, sizeof(b)); + b.cpu = cpu; + b.bank = bank; + b.block = block; + b.address = address; + b.interrupt_capable = lvt_interrupt_supported(bank, high); + + if (b.interrupt_capable) { + int new = (high & MASK_LVTOFF_HI) >> 20; + offset = setup_APIC_mce(offset, new); + } + + mce_threshold_block_init(&b, offset); + mce_threshold_vector = amd_threshold_interrupt; + } + } +} + +/* + * APIC Interrupt Handler + */ + +/* + * threshold interrupt handler will service THRESHOLD_APIC_VECTOR. + * the interrupt goes off when error_count reaches threshold_limit. + * the handler will simply log mcelog w/ software defined bank number. 
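
threshold_restart_bank() above seeds the 12-bit counter with THRESHOLD_MAX - threshold_limit, so the counter reaches its ceiling, and the LVT interrupt fires, once threshold_limit errors have accumulated; show_error_count() further below undoes that bias when reporting. The arithmetic, checked in isolation:

#include <assert.h>

#define THRESHOLD_MAX 0xFFF

int main(void)
{
        unsigned limit = 10;                    /* user's threshold_limit */
        unsigned seed  = THRESHOLD_MAX - limit; /* written to the counter */
        unsigned raw   = seed + 3;              /* three errors later     */

        /* show_error_count(): visible count = raw - (MAX - limit) */
        assert(raw - (THRESHOLD_MAX - limit) == 3);

        /* the counter hits THRESHOLD_MAX after exactly 'limit' errors,
         * which is when the threshold interrupt is raised */
        assert(seed + limit == THRESHOLD_MAX);
        return 0;
}
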
+ */ +static void amd_threshold_interrupt(void) +{ + u32 low = 0, high = 0, address = 0; + unsigned int bank, block; + struct mce m; + + mce_setup(&m); + + /* assume first bank caused it */ + for (bank = 0; bank < mca_cfg.banks; ++bank) { + if (!(per_cpu(bank_map, m.cpu) & (1 << bank))) + continue; + for (block = 0; block < NR_BLOCKS; ++block) { + if (block == 0) { + address = MSR_IA32_MC0_MISC + bank * 4; + } else if (block == 1) { + address = (low & MASK_BLKPTR_LO) >> 21; + if (!address) + break; + address += MCG_XBLK_ADDR; + } else { + ++address; + } + + if (rdmsr_safe(address, &low, &high)) + break; + + if (!(high & MASK_VALID_HI)) { + if (block) + continue; + else + break; + } + + if (!(high & MASK_CNTP_HI) || + (high & MASK_LOCKED_HI)) + continue; + + /* + * Log the machine check that caused the threshold + * event. + */ + machine_check_poll(MCP_TIMESTAMP, + &__get_cpu_var(mce_poll_banks)); + + if (high & MASK_OVERFLOW_HI) { + rdmsrl(address, m.misc); + rdmsrl(MSR_IA32_MC0_STATUS + bank * 4, + m.status); + m.bank = K8_MCE_THRESHOLD_BASE + + bank * NR_BLOCKS + + block; + mce_log(&m); + return; + } + } + } +} + +/* + * Sysfs Interface + */ + +struct threshold_attr { + struct attribute attr; + ssize_t (*show) (struct threshold_block *, char *); + ssize_t (*store) (struct threshold_block *, const char *, size_t count); +}; + +#define SHOW_FIELDS(name) \ +static ssize_t show_ ## name(struct threshold_block *b, char *buf) \ +{ \ + return sprintf(buf, "%lu\n", (unsigned long) b->name); \ +} +SHOW_FIELDS(interrupt_enable) +SHOW_FIELDS(threshold_limit) + +static ssize_t +store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size) +{ + struct thresh_restart tr; + unsigned long new; + + if (!b->interrupt_capable) + return -EINVAL; + + if (strict_strtoul(buf, 0, &new) < 0) + return -EINVAL; + + b->interrupt_enable = !!new; + + memset(&tr, 0, sizeof(tr)); + tr.b = b; + + smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); + + return size; +} + +static ssize_t +store_threshold_limit(struct threshold_block *b, const char *buf, size_t size) +{ + struct thresh_restart tr; + unsigned long new; + + if (strict_strtoul(buf, 0, &new) < 0) + return -EINVAL; + + if (new > THRESHOLD_MAX) + new = THRESHOLD_MAX; + if (new < 1) + new = 1; + + memset(&tr, 0, sizeof(tr)); + tr.old_limit = b->threshold_limit; + b->threshold_limit = new; + tr.b = b; + + smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); + + return size; +} + +static ssize_t show_error_count(struct threshold_block *b, char *buf) +{ + u32 lo, hi; + + rdmsr_on_cpu(b->cpu, b->address, &lo, &hi); + + return sprintf(buf, "%u\n", ((hi & THRESHOLD_MAX) - + (THRESHOLD_MAX - b->threshold_limit))); +} + +static struct threshold_attr error_count = { + .attr = {.name = __stringify(error_count), .mode = 0444 }, + .show = show_error_count, +}; + +#define RW_ATTR(val) \ +static struct threshold_attr val = { \ + .attr = {.name = __stringify(val), .mode = 0644 }, \ + .show = show_## val, \ + .store = store_## val, \ +}; + +RW_ATTR(interrupt_enable); +RW_ATTR(threshold_limit); + +static struct attribute *default_attrs[] = { + &threshold_limit.attr, + &error_count.attr, + NULL, /* possibly interrupt_enable if supported, see below */ + NULL, +}; + +#define to_block(k) container_of(k, struct threshold_block, kobj) +#define to_attr(a) container_of(a, struct threshold_attr, attr) + +static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) +{ + struct threshold_block *b = to_block(kobj); + struct 
threshold_attr *a = to_attr(attr); + ssize_t ret; + + ret = a->show ? a->show(b, buf) : -EIO; + + return ret; +} + +static ssize_t store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + struct threshold_block *b = to_block(kobj); + struct threshold_attr *a = to_attr(attr); + ssize_t ret; + + ret = a->store ? a->store(b, buf, count) : -EIO; + + return ret; +} + +static const struct sysfs_ops threshold_ops = { + .show = show, + .store = store, +}; + +static struct kobj_type threshold_ktype = { + .sysfs_ops = &threshold_ops, + .default_attrs = default_attrs, +}; + +static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank, + unsigned int block, u32 address) +{ + struct threshold_block *b = NULL; + u32 low, high; + int err; + + if ((bank >= mca_cfg.banks) || (block >= NR_BLOCKS)) + return 0; + + if (rdmsr_safe_on_cpu(cpu, address, &low, &high)) + return 0; + + if (!(high & MASK_VALID_HI)) { + if (block) + goto recurse; + else + return 0; + } + + if (!(high & MASK_CNTP_HI) || + (high & MASK_LOCKED_HI)) + goto recurse; + + b = kzalloc(sizeof(struct threshold_block), GFP_KERNEL); + if (!b) + return -ENOMEM; + + b->block = block; + b->bank = bank; + b->cpu = cpu; + b->address = address; + b->interrupt_enable = 0; + b->interrupt_capable = lvt_interrupt_supported(bank, high); + b->threshold_limit = THRESHOLD_MAX; + + if (b->interrupt_capable) + threshold_ktype.default_attrs[2] = &interrupt_enable.attr; + else + threshold_ktype.default_attrs[2] = NULL; + + INIT_LIST_HEAD(&b->miscj); + + if (per_cpu(threshold_banks, cpu)[bank]->blocks) { + list_add(&b->miscj, + &per_cpu(threshold_banks, cpu)[bank]->blocks->miscj); + } else { + per_cpu(threshold_banks, cpu)[bank]->blocks = b; + } + + err = kobject_init_and_add(&b->kobj, &threshold_ktype, + per_cpu(threshold_banks, cpu)[bank]->kobj, + (bank == 4 ? bank4_names(b) : th_names[bank])); + if (err) + goto out_free; +recurse: + if (!block) { + address = (low & MASK_BLKPTR_LO) >> 21; + if (!address) + return 0; + address += MCG_XBLK_ADDR; + } else { + ++address; + } + + err = allocate_threshold_blocks(cpu, bank, ++block, address); + if (err) + goto out_free; + + if (b) + kobject_uevent(&b->kobj, KOBJ_ADD); + + return err; + +out_free: + if (b) { + kobject_put(&b->kobj); + list_del(&b->miscj); + kfree(b); + } + return err; +} + +static int __threshold_add_blocks(struct threshold_bank *b) +{ + struct list_head *head = &b->blocks->miscj; + struct threshold_block *pos = NULL; + struct threshold_block *tmp = NULL; + int err = 0; + + err = kobject_add(&b->blocks->kobj, b->kobj, b->blocks->kobj.name); + if (err) + return err; + + list_for_each_entry_safe(pos, tmp, head, miscj) { + + err = kobject_add(&pos->kobj, b->kobj, pos->kobj.name); + if (err) { + list_for_each_entry_safe_reverse(pos, tmp, head, miscj) + kobject_del(&pos->kobj); + + return err; + } + } + return err; +} + +static int threshold_create_bank(unsigned int cpu, unsigned int bank) +{ + struct device *dev = per_cpu(mce_device, cpu); + struct amd_northbridge *nb = NULL; + struct threshold_bank *b = NULL; + const char *name = th_names[bank]; + int err = 0; + + if (is_shared_bank(bank)) { + nb = node_to_amd_nb(amd_get_nb_id(cpu)); + + /* threshold descriptor already initialized on this node? 
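(bank 4 is shared by all cores on a node, so a sibling core may already have created it)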
*/ + if (nb && nb->bank4) { + /* yes, use it */ + b = nb->bank4; + err = kobject_add(b->kobj, &dev->kobj, name); + if (err) + goto out; + + per_cpu(threshold_banks, cpu)[bank] = b; + atomic_inc(&b->cpus); + + err = __threshold_add_blocks(b); + + goto out; + } + } + + b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL); + if (!b) { + err = -ENOMEM; + goto out; + } + + b->kobj = kobject_create_and_add(name, &dev->kobj); + if (!b->kobj) { + err = -EINVAL; + goto out_free; + } + + per_cpu(threshold_banks, cpu)[bank] = b; + + if (is_shared_bank(bank)) { + atomic_set(&b->cpus, 1); + + /* nb is already initialized, see above */ + if (nb) { + WARN_ON(nb->bank4); + nb->bank4 = b; + } + } + + err = allocate_threshold_blocks(cpu, bank, 0, + MSR_IA32_MC0_MISC + bank * 4); + if (!err) + goto out; + + out_free: + kfree(b); + + out: + return err; +} + +/* create dir/files for all valid threshold banks */ +static int threshold_create_device(unsigned int cpu) +{ + unsigned int bank; + struct threshold_bank **bp; + int err = 0; + + bp = kzalloc(sizeof(struct threshold_bank *) * mca_cfg.banks, + GFP_KERNEL); + if (!bp) + return -ENOMEM; + + per_cpu(threshold_banks, cpu) = bp; + + for (bank = 0; bank < mca_cfg.banks; ++bank) { + if (!(per_cpu(bank_map, cpu) & (1 << bank))) + continue; + err = threshold_create_bank(cpu, bank); + if (err) + return err; + } + + return err; +} + +static void deallocate_threshold_block(unsigned int cpu, + unsigned int bank) +{ + struct threshold_block *pos = NULL; + struct threshold_block *tmp = NULL; + struct threshold_bank *head = per_cpu(threshold_banks, cpu)[bank]; + + if (!head) + return; + + list_for_each_entry_safe(pos, tmp, &head->blocks->miscj, miscj) { + kobject_put(&pos->kobj); + list_del(&pos->miscj); + kfree(pos); + } + + kfree(per_cpu(threshold_banks, cpu)[bank]->blocks); + per_cpu(threshold_banks, cpu)[bank]->blocks = NULL; +} + +static void __threshold_remove_blocks(struct threshold_bank *b) +{ + struct threshold_block *pos = NULL; + struct threshold_block *tmp = NULL; + + kobject_del(b->kobj); + + list_for_each_entry_safe(pos, tmp, &b->blocks->miscj, miscj) + kobject_del(&pos->kobj); +} + +static void threshold_remove_bank(unsigned int cpu, int bank) +{ + struct amd_northbridge *nb; + struct threshold_bank *b; + + b = per_cpu(threshold_banks, cpu)[bank]; + if (!b) + return; + + if (!b->blocks) + goto free_out; + + if (is_shared_bank(bank)) { + if (!atomic_dec_and_test(&b->cpus)) { + __threshold_remove_blocks(b); + per_cpu(threshold_banks, cpu)[bank] = NULL; + return; + } else { + /* + * the last CPU on this node using the shared bank is + * going away, remove that bank now. 
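+ * (atomic_dec_and_test() returning true above is what tells us we + * are that last CPU)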
+ */ + nb = node_to_amd_nb(amd_get_nb_id(cpu)); + nb->bank4 = NULL; + } + } + + deallocate_threshold_block(cpu, bank); + +free_out: + kobject_del(b->kobj); + kobject_put(b->kobj); + kfree(b); + per_cpu(threshold_banks, cpu)[bank] = NULL; +} + +static void threshold_remove_device(unsigned int cpu) +{ + unsigned int bank; + + for (bank = 0; bank < mca_cfg.banks; ++bank) { + if (!(per_cpu(bank_map, cpu) & (1 << bank))) + continue; + threshold_remove_bank(cpu, bank); + } + kfree(per_cpu(threshold_banks, cpu)); +} + +/* get notified when a cpu comes on/off */ +static void +amd_64_threshold_cpu_callback(unsigned long action, unsigned int cpu) +{ + switch (action) { + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + threshold_create_device(cpu); + break; + case CPU_DEAD: + case CPU_DEAD_FROZEN: + threshold_remove_device(cpu); + break; + default: + break; + } +} + +static __init int threshold_init_device(void) +{ + unsigned lcpu = 0; + + /* to hit CPUs online before the notifier is up */ + for_each_online_cpu(lcpu) { + int err = threshold_create_device(lcpu); + + if (err) + return err; + } + threshold_cpu_callback = amd_64_threshold_cpu_callback; + + return 0; +} +/* + * There are three functions which need to be _initcalled in a logical + * sequence: + * 1. xen_late_init_mcelog + * 2. mcheck_init_device + * 3. threshold_init_device + * + * xen_late_init_mcelog must register xen_mce_chrdev_device before the + * native mce_chrdev_device registration when running under Xen; + * + * mcheck_init_device must run before threshold_init_device so that + * mce_device is initialized; otherwise a NULL pointer dereference + * would cause a panic. + * + * Hence the following _initcalls: + * 1. device_initcall(xen_late_init_mcelog); + * 2. device_initcall_sync(mcheck_init_device); + * 3. late_initcall(threshold_init_device); + * + * When running under Xen the initcall order is 1, 2, 3; + * on bare metal we skip 1 and do only 2 and 3. + */ +late_initcall(threshold_init_device); diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c new file mode 100644 index 00000000000..9a316b21df8 --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -0,0 +1,391 @@ +/* + * Intel specific MCE features. + * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca> + * Copyright (C) 2008, 2009 Intel Corporation + * Author: Andi Kleen + */ + +#include <linux/gfp.h> +#include <linux/interrupt.h> +#include <linux/percpu.h> +#include <linux/sched.h> +#include <linux/cpumask.h> +#include <asm/apic.h> +#include <asm/processor.h> +#include <asm/msr.h> +#include <asm/mce.h> + +#include "mce-internal.h" + +/* + * Support for Intel Corrected Machine Check Interrupts (CMCI). This allows + * the CPU to raise an interrupt when a corrected machine check occurs. + * Normally we pick those up using a regular polling timer. + * Also supports reliable discovery of shared banks. + */ + +/* + * CMCI can be delivered to multiple cpus that share a machine check bank + * so we need to designate a single cpu to process errors logged in each bank + * in the interrupt handler (otherwise we would have many races and potential + * double reporting of the same error). + * Note that this can change when a cpu is offlined or brought online since + * some MCA banks are shared across cpus. When a cpu is offlined, cmci_clear() + * disables CMCI on all banks owned by the cpu and clears this bitfield.
At + * this point, cmci_rediscover() kicks in and a different cpu may end up + * taking ownership of some of the shared MCA banks that were previously + * owned by the offlined cpu. + */ +static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned); + +/* + * cmci_discover_lock protects against parallel discovery attempts + * which could race against each other. + */ +static DEFINE_SPINLOCK(cmci_discover_lock); + +#define CMCI_THRESHOLD 1 +#define CMCI_POLL_INTERVAL (30 * HZ) +#define CMCI_STORM_INTERVAL (1 * HZ) +#define CMCI_STORM_THRESHOLD 15 + +static DEFINE_PER_CPU(unsigned long, cmci_time_stamp); +static DEFINE_PER_CPU(unsigned int, cmci_storm_cnt); +static DEFINE_PER_CPU(unsigned int, cmci_storm_state); + +enum { + CMCI_STORM_NONE, + CMCI_STORM_ACTIVE, + CMCI_STORM_SUBSIDED, +}; + +static atomic_t cmci_storm_on_cpus; + +static int cmci_supported(int *banks) +{ + u64 cap; + + if (mca_cfg.cmci_disabled || mca_cfg.ignore_ce) + return 0; + + /* + * Vendor check is not strictly needed, but the initial + * initialization is vendor keyed and this + * makes sure none of the backdoors are entered otherwise. + */ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return 0; + if (!cpu_has_apic || lapic_get_maxlvt() < 6) + return 0; + rdmsrl(MSR_IA32_MCG_CAP, cap); + *banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff); + return !!(cap & MCG_CMCI_P); +} + +void mce_intel_cmci_poll(void) +{ + if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE) + return; + machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); +} + +void mce_intel_hcpu_update(unsigned long cpu) +{ + if (per_cpu(cmci_storm_state, cpu) == CMCI_STORM_ACTIVE) + atomic_dec(&cmci_storm_on_cpus); + + per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE; +} + +unsigned long mce_intel_adjust_timer(unsigned long interval) +{ + int r; + + if (interval < CMCI_POLL_INTERVAL) + return interval; + + switch (__this_cpu_read(cmci_storm_state)) { + case CMCI_STORM_ACTIVE: + /* + * We switch back to interrupt mode once the poll timer has + * silenced itself. That means no events recorded and the + * timer interval is back to our poll interval. + */ + __this_cpu_write(cmci_storm_state, CMCI_STORM_SUBSIDED); + r = atomic_sub_return(1, &cmci_storm_on_cpus); + if (r == 0) + pr_notice("CMCI storm subsided: switching to interrupt mode\n"); + /* FALLTHROUGH */ + + case CMCI_STORM_SUBSIDED: + /* + * We wait for all cpus to go back to SUBSIDED + * state. When that happens we switch back to + * interrupt mode. + */ + if (!atomic_read(&cmci_storm_on_cpus)) { + __this_cpu_write(cmci_storm_state, CMCI_STORM_NONE); + cmci_reenable(); + cmci_recheck(); + } + return CMCI_POLL_INTERVAL; + default: + /* + * We have shiny weather. Let the poll do whatever it + * thinks. 
+ */ + return interval; + } +} + +static void cmci_storm_disable_banks(void) +{ + unsigned long flags, *owned; + int bank; + u64 val; + + spin_lock_irqsave(&cmci_discover_lock, flags); + owned = __get_cpu_var(mce_banks_owned); + for_each_set_bit(bank, owned, MAX_NR_BANKS) { + rdmsrl(MSR_IA32_MCx_CTL2(bank), val); + val &= ~MCI_CTL2_CMCI_EN; + wrmsrl(MSR_IA32_MCx_CTL2(bank), val); + } + spin_unlock_irqrestore(&cmci_discover_lock, flags); +} + +static bool cmci_storm_detect(void) +{ + unsigned int cnt = __this_cpu_read(cmci_storm_cnt); + unsigned long ts = __this_cpu_read(cmci_time_stamp); + unsigned long now = jiffies; + int r; + + if (__this_cpu_read(cmci_storm_state) != CMCI_STORM_NONE) + return true; + + if (time_before_eq(now, ts + CMCI_STORM_INTERVAL)) { + cnt++; + } else { + cnt = 1; + __this_cpu_write(cmci_time_stamp, now); + } + __this_cpu_write(cmci_storm_cnt, cnt); + + if (cnt <= CMCI_STORM_THRESHOLD) + return false; + + cmci_storm_disable_banks(); + __this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE); + r = atomic_add_return(1, &cmci_storm_on_cpus); + mce_timer_kick(CMCI_POLL_INTERVAL); + + if (r == 1) + pr_notice("CMCI storm detected: switching to poll mode\n"); + return true; +} + +/* + * The interrupt handler. This is called on every event. + * Just call the poller directly to log any events. + * This could in theory increase the threshold under high load, + * but doesn't for now. + */ +static void intel_threshold_interrupt(void) +{ + if (cmci_storm_detect()) + return; + machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); + mce_notify_irq(); +} + +/* + * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks + * on this CPU. Use the algorithm recommended in the SDM to discover shared + * banks. + */ +static void cmci_discover(int banks) +{ + unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned); + unsigned long flags; + int i; + int bios_wrong_thresh = 0; + + spin_lock_irqsave(&cmci_discover_lock, flags); + for (i = 0; i < banks; i++) { + u64 val; + int bios_zero_thresh = 0; + + if (test_bit(i, owned)) + continue; + + /* Skip banks in firmware first mode */ + if (test_bit(i, mce_banks_ce_disabled)) + continue; + + rdmsrl(MSR_IA32_MCx_CTL2(i), val); + + /* Already owned by someone else? */ + if (val & MCI_CTL2_CMCI_EN) { + clear_bit(i, owned); + __clear_bit(i, __get_cpu_var(mce_poll_banks)); + continue; + } + + if (!mca_cfg.bios_cmci_threshold) { + val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK; + val |= CMCI_THRESHOLD; + } else if (!(val & MCI_CTL2_CMCI_THRESHOLD_MASK)) { + /* + * If bios_cmci_threshold boot option was specified + * but the threshold is zero, we'll try to initialize + * it to 1. + */ + bios_zero_thresh = 1; + val |= CMCI_THRESHOLD; + } + + val |= MCI_CTL2_CMCI_EN; + wrmsrl(MSR_IA32_MCx_CTL2(i), val); + rdmsrl(MSR_IA32_MCx_CTL2(i), val); + + /* Did the enable bit stick? -- the bank supports CMCI */ + if (val & MCI_CTL2_CMCI_EN) { + set_bit(i, owned); + __clear_bit(i, __get_cpu_var(mce_poll_banks)); + /* + * We are able to set thresholds for some banks that + * had a threshold of 0. This means the BIOS has not + * set the thresholds properly or does not work with + * this boot option. Note down now and report later. 
+ */ + if (mca_cfg.bios_cmci_threshold && bios_zero_thresh && + (val & MCI_CTL2_CMCI_THRESHOLD_MASK)) + bios_wrong_thresh = 1; + } else { + WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks))); + } + } + spin_unlock_irqrestore(&cmci_discover_lock, flags); + if (mca_cfg.bios_cmci_threshold && bios_wrong_thresh) { + pr_info_once( + "bios_cmci_threshold: Some banks do not have valid thresholds set\n"); + pr_info_once( + "bios_cmci_threshold: Make sure your BIOS supports this boot option\n"); + } +} + +/* + * Just in case we missed an event during initialization check + * all the CMCI owned banks. + */ +void cmci_recheck(void) +{ + unsigned long flags; + int banks; + + if (!mce_available(__this_cpu_ptr(&cpu_info)) || !cmci_supported(&banks)) + return; + local_irq_save(flags); + machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); + local_irq_restore(flags); +} + +/* Caller must hold the lock on cmci_discover_lock */ +static void __cmci_disable_bank(int bank) +{ + u64 val; + + if (!test_bit(bank, __get_cpu_var(mce_banks_owned))) + return; + rdmsrl(MSR_IA32_MCx_CTL2(bank), val); + val &= ~MCI_CTL2_CMCI_EN; + wrmsrl(MSR_IA32_MCx_CTL2(bank), val); + __clear_bit(bank, __get_cpu_var(mce_banks_owned)); +} + +/* + * Disable CMCI on this CPU for all banks it owns when it goes down. + * This allows other CPUs to claim the banks on rediscovery. + */ +void cmci_clear(void) +{ + unsigned long flags; + int i; + int banks; + + if (!cmci_supported(&banks)) + return; + spin_lock_irqsave(&cmci_discover_lock, flags); + for (i = 0; i < banks; i++) + __cmci_disable_bank(i); + spin_unlock_irqrestore(&cmci_discover_lock, flags); +} + +static void cmci_rediscover_work_func(void *arg) +{ + int banks; + + /* Recheck banks in case CPUs don't all have the same */ + if (cmci_supported(&banks)) + cmci_discover(banks); +} + +/* After a CPU went down cycle through all the others and rediscover */ +void cmci_rediscover(void) +{ + int banks; + + if (!cmci_supported(&banks)) + return; + + on_each_cpu(cmci_rediscover_work_func, NULL, 1); +} + +/* + * Reenable CMCI on this CPU in case a CPU down failed. + */ +void cmci_reenable(void) +{ + int banks; + if (cmci_supported(&banks)) + cmci_discover(banks); +} + +void cmci_disable_bank(int bank) +{ + int banks; + unsigned long flags; + + if (!cmci_supported(&banks)) + return; + + spin_lock_irqsave(&cmci_discover_lock, flags); + __cmci_disable_bank(bank); + spin_unlock_irqrestore(&cmci_discover_lock, flags); +} + +static void intel_init_cmci(void) +{ + int banks; + + if (!cmci_supported(&banks)) + return; + + mce_threshold_vector = intel_threshold_interrupt; + cmci_discover(banks); + /* + * For CPU #0 this runs with still disabled APIC, but that's + * ok because only the vector is set up. We still do another + * check for the banks later for CPU #0 just to make sure + * to not miss any events. 
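+ * (that later check is the cmci_recheck() call right below)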
+ */ + apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR|APIC_DM_FIXED); + cmci_recheck(); +} + +void mce_intel_feature_init(struct cpuinfo_x86 *c) +{ + intel_init_thermal(c); + intel_init_cmci(); +} diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c new file mode 100644 index 00000000000..a3042989398 --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/p5.c @@ -0,0 +1,66 @@ +/* + * P5 specific Machine Check Exception Reporting + * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk> + */ +#include <linux/interrupt.h> +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/smp.h> + +#include <asm/processor.h> +#include <asm/mce.h> +#include <asm/msr.h> + +/* By default disabled */ +int mce_p5_enabled __read_mostly; + +/* Machine check handler for Pentium class Intel CPUs: */ +static void pentium_machine_check(struct pt_regs *regs, long error_code) +{ + u32 loaddr, hi, lotype; + + rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi); + rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi); + + printk(KERN_EMERG + "CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n", + smp_processor_id(), loaddr, lotype); + + if (lotype & (1<<5)) { + printk(KERN_EMERG + "CPU#%d: Possible thermal failure (CPU on fire ?).\n", + smp_processor_id()); + } + + add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); +} + +/* Set up machine check reporting for processors with Intel style MCE: */ +void intel_p5_mcheck_init(struct cpuinfo_x86 *c) +{ + u32 l, h; + + /* Default P5 to off as its often misconnected: */ + if (!mce_p5_enabled) + return; + + /* Check for MCE support: */ + if (!cpu_has(c, X86_FEATURE_MCE)) + return; + + machine_check_vector = pentium_machine_check; + /* Make sure the vector pointer is visible before we enable MCEs: */ + wmb(); + + /* Read registers before enabling: */ + rdmsr(MSR_IA32_P5_MC_ADDR, l, h); + rdmsr(MSR_IA32_P5_MC_TYPE, l, h); + printk(KERN_INFO + "Intel old style machine check architecture supported.\n"); + + /* Enable MCE: */ + set_in_cr4(X86_CR4_MCE); + printk(KERN_INFO + "Intel old style machine check reporting enabled on CPU#%d.\n", + smp_processor_id()); +} diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c new file mode 100644 index 00000000000..36a1bb6d1ee --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -0,0 +1,573 @@ +/* + * Thermal throttle event support code (such as syslog messaging and rate + * limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c). + * + * This allows consistent reporting of CPU thermal throttle events. + * + * Maintains a counter in /sys that keeps track of the number of thermal + * events, such that the user knows how bad the thermal problem might be + * (since the logging to syslog and mcelog is rate limited). + * + * Author: Dmitriy Zavin (dmitriyz@google.com) + * + * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c. + * Inspired by Ross Biro's and Al Borchers' counter code. 
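+ * + * The counters are exposed per CPU via sysfs, typically under + * /sys/devices/system/cpu/cpuN/thermal_throttle/ (e.g. + * core_throttle_count, defined below).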
+ */ +#include <linux/interrupt.h> +#include <linux/notifier.h> +#include <linux/jiffies.h> +#include <linux/kernel.h> +#include <linux/percpu.h> +#include <linux/export.h> +#include <linux/types.h> +#include <linux/init.h> +#include <linux/smp.h> +#include <linux/cpu.h> + +#include <asm/processor.h> +#include <asm/apic.h> +#include <asm/idle.h> +#include <asm/mce.h> +#include <asm/msr.h> +#include <asm/trace/irq_vectors.h> + +/* How long to wait between reporting thermal events */ +#define CHECK_INTERVAL (300 * HZ) + +#define THERMAL_THROTTLING_EVENT 0 +#define POWER_LIMIT_EVENT 1 + +/* + * Current thermal event state: + */ +struct _thermal_state { + bool new_event; + int event; + u64 next_check; + unsigned long count; + unsigned long last_count; +}; + +struct thermal_state { + struct _thermal_state core_throttle; + struct _thermal_state core_power_limit; + struct _thermal_state package_throttle; + struct _thermal_state package_power_limit; + struct _thermal_state core_thresh0; + struct _thermal_state core_thresh1; + struct _thermal_state pkg_thresh0; + struct _thermal_state pkg_thresh1; +}; + +/* Callback to handle core threshold interrupts */ +int (*platform_thermal_notify)(__u64 msr_val); +EXPORT_SYMBOL(platform_thermal_notify); + +/* Callback to handle core package threshold interrupts */ +int (*platform_thermal_package_notify)(__u64 msr_val); +EXPORT_SYMBOL_GPL(platform_thermal_package_notify); + +/* Callback support for rate control: returns true if the + * callback itself implements rate control */ +bool (*platform_thermal_package_rate_control)(void); +EXPORT_SYMBOL_GPL(platform_thermal_package_rate_control); + + +static DEFINE_PER_CPU(struct thermal_state, thermal_state); + +static atomic_t therm_throt_en = ATOMIC_INIT(0); + +static u32 lvtthmr_init __read_mostly; + +#ifdef CONFIG_SYSFS +#define define_therm_throt_device_one_ro(_name) \ + static DEVICE_ATTR(_name, 0444, \ + therm_throt_device_show_##_name, \ + NULL) \ + +#define define_therm_throt_device_show_func(event, name) \ + \ +static ssize_t therm_throt_device_show_##event##_##name( \ + struct device *dev, \ + struct device_attribute *attr, \ + char *buf) \ +{ \ + unsigned int cpu = dev->id; \ + ssize_t ret; \ + \ + preempt_disable(); /* CPU hotplug */ \ + if (cpu_online(cpu)) { \ + ret = sprintf(buf, "%lu\n", \ + per_cpu(thermal_state, cpu).event.name); \ + } else \ + ret = 0; \ + preempt_enable(); \ + \ + return ret; \ +} + +define_therm_throt_device_show_func(core_throttle, count); +define_therm_throt_device_one_ro(core_throttle_count); + +define_therm_throt_device_show_func(core_power_limit, count); +define_therm_throt_device_one_ro(core_power_limit_count); + +define_therm_throt_device_show_func(package_throttle, count); +define_therm_throt_device_one_ro(package_throttle_count); + +define_therm_throt_device_show_func(package_power_limit, count); +define_therm_throt_device_one_ro(package_power_limit_count); + +static struct attribute *thermal_throttle_attrs[] = { + &dev_attr_core_throttle_count.attr, + NULL +}; + +static struct attribute_group thermal_attr_group = { + .attrs = thermal_throttle_attrs, + .name = "thermal_throttle" +}; +#endif /* CONFIG_SYSFS */ + +#define CORE_LEVEL 0 +#define PACKAGE_LEVEL 1 + +/** + * therm_throt_process - Process thermal throttling event from interrupt + * @new_event: Whether the condition is currently active (boolean), since the + * thermal interrupt normally gets called both when the thermal + * event begins and once the event has ended.
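+ * @event: THERMAL_THROTTLING_EVENT or POWER_LIMIT_EVENT + * @level: CORE_LEVEL or PACKAGE_LEVEL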
+ * + * This function is called by the thermal interrupt after the + * IRQ has been acknowledged. + * + * It will take care of rate limiting and printing messages to the syslog. + * + * Returns: 0 : Event should NOT be further logged, i.e. still in + * "timeout" from previous log message. + * 1 : Event should be logged further, and a message has been + * printed to the syslog. + */ +static int therm_throt_process(bool new_event, int event, int level) +{ + struct _thermal_state *state; + unsigned int this_cpu = smp_processor_id(); + bool old_event; + u64 now; + struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu); + + now = get_jiffies_64(); + if (level == CORE_LEVEL) { + if (event == THERMAL_THROTTLING_EVENT) + state = &pstate->core_throttle; + else if (event == POWER_LIMIT_EVENT) + state = &pstate->core_power_limit; + else + return 0; + } else if (level == PACKAGE_LEVEL) { + if (event == THERMAL_THROTTLING_EVENT) + state = &pstate->package_throttle; + else if (event == POWER_LIMIT_EVENT) + state = &pstate->package_power_limit; + else + return 0; + } else + return 0; + + old_event = state->new_event; + state->new_event = new_event; + + if (new_event) + state->count++; + + if (time_before64(now, state->next_check) && + state->count != state->last_count) + return 0; + + state->next_check = now + CHECK_INTERVAL; + state->last_count = state->count; + + /* if we just entered the thermal event */ + if (new_event) { + if (event == THERMAL_THROTTLING_EVENT) + printk(KERN_CRIT "CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n", + this_cpu, + level == CORE_LEVEL ? "Core" : "Package", + state->count); + return 1; + } + if (old_event) { + if (event == THERMAL_THROTTLING_EVENT) + printk(KERN_INFO "CPU%d: %s temperature/speed normal\n", + this_cpu, + level == CORE_LEVEL ? "Core" : "Package"); + return 1; + } + + return 0; +} + +static int thresh_event_valid(int level, int event) +{ + struct _thermal_state *state; + unsigned int this_cpu = smp_processor_id(); + struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu); + u64 now = get_jiffies_64(); + + if (level == PACKAGE_LEVEL) + state = (event == 0) ? &pstate->pkg_thresh0 : + &pstate->pkg_thresh1; + else + state = (event == 0) ? 
&pstate->core_thresh0 : + &pstate->core_thresh1; + + if (time_before64(now, state->next_check)) + return 0; + + state->next_check = now + CHECK_INTERVAL; + + return 1; +} + +static bool int_pln_enable; +static int __init int_pln_enable_setup(char *s) +{ + int_pln_enable = true; + + return 1; +} +__setup("int_pln_enable", int_pln_enable_setup); + +#ifdef CONFIG_SYSFS +/* Add/Remove thermal_throttle interface for CPU device: */ +static int thermal_throttle_add_dev(struct device *dev, unsigned int cpu) +{ + int err; + struct cpuinfo_x86 *c = &cpu_data(cpu); + + err = sysfs_create_group(&dev->kobj, &thermal_attr_group); + if (err) + return err; + + if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) + err = sysfs_add_file_to_group(&dev->kobj, + &dev_attr_core_power_limit_count.attr, + thermal_attr_group.name); + if (cpu_has(c, X86_FEATURE_PTS)) { + err = sysfs_add_file_to_group(&dev->kobj, + &dev_attr_package_throttle_count.attr, + thermal_attr_group.name); + if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) + err = sysfs_add_file_to_group(&dev->kobj, + &dev_attr_package_power_limit_count.attr, + thermal_attr_group.name); + } + + return err; +} + +static void thermal_throttle_remove_dev(struct device *dev) +{ + sysfs_remove_group(&dev->kobj, &thermal_attr_group); +} + +/* Get notified when a cpu comes on/off. Be hotplug friendly. */ +static int +thermal_throttle_cpu_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + struct device *dev; + int err = 0; + + dev = get_cpu_device(cpu); + + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + err = thermal_throttle_add_dev(dev, cpu); + WARN_ON(err); + break; + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + case CPU_DEAD: + case CPU_DEAD_FROZEN: + thermal_throttle_remove_dev(dev); + break; + } + return notifier_from_errno(err); +} + +static struct notifier_block thermal_throttle_cpu_notifier = +{ + .notifier_call = thermal_throttle_cpu_callback, +}; + +static __init int thermal_throttle_init_device(void) +{ + unsigned int cpu = 0; + int err; + + if (!atomic_read(&therm_throt_en)) + return 0; + + cpu_notifier_register_begin(); + + /* connect live CPUs to sysfs */ + for_each_online_cpu(cpu) { + err = thermal_throttle_add_dev(get_cpu_device(cpu), cpu); + WARN_ON(err); + } + + __register_hotcpu_notifier(&thermal_throttle_cpu_notifier); + cpu_notifier_register_done(); + + return 0; +} +device_initcall(thermal_throttle_init_device); + +#endif /* CONFIG_SYSFS */ + +static void notify_package_thresholds(__u64 msr_val) +{ + bool notify_thres_0 = false; + bool notify_thres_1 = false; + + if (!platform_thermal_package_notify) + return; + + /* lower threshold check */ + if (msr_val & THERM_LOG_THRESHOLD0) + notify_thres_0 = true; + /* higher threshold check */ + if (msr_val & THERM_LOG_THRESHOLD1) + notify_thres_1 = true; + + if (!notify_thres_0 && !notify_thres_1) + return; + + if (platform_thermal_package_rate_control && + platform_thermal_package_rate_control()) { + /* Rate control is implemented in callback */ + platform_thermal_package_notify(msr_val); + return; + } + + /* lower threshold reached */ + if (notify_thres_0 && thresh_event_valid(PACKAGE_LEVEL, 0)) + platform_thermal_package_notify(msr_val); + /* higher threshold reached */ + if (notify_thres_1 && thresh_event_valid(PACKAGE_LEVEL, 1)) + platform_thermal_package_notify(msr_val); +} + +static void notify_thresholds(__u64 msr_val) +{ + /* check whether the interrupt handler is defined; + * otherwise 
simply return + */ + if (!platform_thermal_notify) + return; + + /* lower threshold reached */ + if ((msr_val & THERM_LOG_THRESHOLD0) && + thresh_event_valid(CORE_LEVEL, 0)) + platform_thermal_notify(msr_val); + /* higher threshold reached */ + if ((msr_val & THERM_LOG_THRESHOLD1) && + thresh_event_valid(CORE_LEVEL, 1)) + platform_thermal_notify(msr_val); +} + +/* Thermal transition interrupt handler */ +static void intel_thermal_interrupt(void) +{ + __u64 msr_val; + + rdmsrl(MSR_IA32_THERM_STATUS, msr_val); + + /* Check for violation of core thermal thresholds*/ + notify_thresholds(msr_val); + + if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT, + THERMAL_THROTTLING_EVENT, + CORE_LEVEL) != 0) + mce_log_therm_throt_event(msr_val); + + if (this_cpu_has(X86_FEATURE_PLN) && int_pln_enable) + therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT, + POWER_LIMIT_EVENT, + CORE_LEVEL); + + if (this_cpu_has(X86_FEATURE_PTS)) { + rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val); + /* check violations of package thermal thresholds */ + notify_package_thresholds(msr_val); + therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT, + THERMAL_THROTTLING_EVENT, + PACKAGE_LEVEL); + if (this_cpu_has(X86_FEATURE_PLN) && int_pln_enable) + therm_throt_process(msr_val & + PACKAGE_THERM_STATUS_POWER_LIMIT, + POWER_LIMIT_EVENT, + PACKAGE_LEVEL); + } +} + +static void unexpected_thermal_interrupt(void) +{ + printk(KERN_ERR "CPU%d: Unexpected LVT thermal interrupt!\n", + smp_processor_id()); +} + +static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt; + +static inline void __smp_thermal_interrupt(void) +{ + inc_irq_stat(irq_thermal_count); + smp_thermal_vector(); +} + +asmlinkage __visible void smp_thermal_interrupt(struct pt_regs *regs) +{ + entering_irq(); + __smp_thermal_interrupt(); + exiting_ack_irq(); +} + +asmlinkage __visible void smp_trace_thermal_interrupt(struct pt_regs *regs) +{ + entering_irq(); + trace_thermal_apic_entry(THERMAL_APIC_VECTOR); + __smp_thermal_interrupt(); + trace_thermal_apic_exit(THERMAL_APIC_VECTOR); + exiting_ack_irq(); +} + +/* Thermal monitoring depends on APIC, ACPI and clock modulation */ +static int intel_thermal_supported(struct cpuinfo_x86 *c) +{ + if (!cpu_has_apic) + return 0; + if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC)) + return 0; + return 1; +} + +void __init mcheck_intel_therm_init(void) +{ + /* + * This function is only called on boot CPU. Save the init thermal + * LVT value on BSP and use that value to restore APs' thermal LVT + * entry BIOS programmed later + */ + if (intel_thermal_supported(&boot_cpu_data)) + lvtthmr_init = apic_read(APIC_LVTTHMR); +} + +void intel_init_thermal(struct cpuinfo_x86 *c) +{ + unsigned int cpu = smp_processor_id(); + int tm2 = 0; + u32 l, h; + + if (!intel_thermal_supported(c)) + return; + + /* + * First check if its enabled already, in which case there might + * be some SMM goo which handles it, so we can't even put a handler + * since it might be delivered via SMI already: + */ + rdmsr(MSR_IA32_MISC_ENABLE, l, h); + + h = lvtthmr_init; + /* + * The initial value of thermal LVT entries on all APs always reads + * 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI + * sequence to them and LVT registers are reset to 0s except for + * the mask bits which are set to 1s when APs receive INIT IPI. 
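+ * (0x10000 is simply APIC_LVT_MASKED: all fields cleared except the + * mask bit.)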
+ * If BIOS takes over the thermal interrupt and sets its interrupt + * delivery mode to SMI (not fixed), it restores the value that the + * BIOS has programmed on AP based on BSP's info we saved since BIOS + * is always setting the same value for all threads/cores. + */ + if ((h & APIC_DM_FIXED_MASK) != APIC_DM_FIXED) + apic_write(APIC_LVTTHMR, lvtthmr_init); + + + if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { + printk(KERN_DEBUG + "CPU%d: Thermal monitoring handled by SMI\n", cpu); + return; + } + + /* Check whether a vector already exists */ + if (h & APIC_VECTOR_MASK) { + printk(KERN_DEBUG + "CPU%d: Thermal LVT vector (%#x) already installed\n", + cpu, (h & APIC_VECTOR_MASK)); + return; + } + + /* early Pentium M models use different method for enabling TM2 */ + if (cpu_has(c, X86_FEATURE_TM2)) { + if (c->x86 == 6 && (c->x86_model == 9 || c->x86_model == 13)) { + rdmsr(MSR_THERM2_CTL, l, h); + if (l & MSR_THERM2_CTL_TM_SELECT) + tm2 = 1; + } else if (l & MSR_IA32_MISC_ENABLE_TM2) + tm2 = 1; + } + + /* We'll mask the thermal vector in the lapic till we're ready: */ + h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED; + apic_write(APIC_LVTTHMR, h); + + rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); + if (cpu_has(c, X86_FEATURE_PLN) && !int_pln_enable) + wrmsr(MSR_IA32_THERM_INTERRUPT, + (l | (THERM_INT_LOW_ENABLE + | THERM_INT_HIGH_ENABLE)) & ~THERM_INT_PLN_ENABLE, h); + else if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) + wrmsr(MSR_IA32_THERM_INTERRUPT, + l | (THERM_INT_LOW_ENABLE + | THERM_INT_HIGH_ENABLE | THERM_INT_PLN_ENABLE), h); + else + wrmsr(MSR_IA32_THERM_INTERRUPT, + l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h); + + if (cpu_has(c, X86_FEATURE_PTS)) { + rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); + if (cpu_has(c, X86_FEATURE_PLN) && !int_pln_enable) + wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, + (l | (PACKAGE_THERM_INT_LOW_ENABLE + | PACKAGE_THERM_INT_HIGH_ENABLE)) + & ~PACKAGE_THERM_INT_PLN_ENABLE, h); + else if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) + wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, + l | (PACKAGE_THERM_INT_LOW_ENABLE + | PACKAGE_THERM_INT_HIGH_ENABLE + | PACKAGE_THERM_INT_PLN_ENABLE), h); + else + wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, + l | (PACKAGE_THERM_INT_LOW_ENABLE + | PACKAGE_THERM_INT_HIGH_ENABLE), h); + } + + smp_thermal_vector = intel_thermal_interrupt; + + rdmsr(MSR_IA32_MISC_ENABLE, l, h); + wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h); + + /* Unmask the thermal vector: */ + l = apic_read(APIC_LVTTHMR); + apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); + + printk_once(KERN_INFO "CPU0: Thermal monitoring enabled (%s)\n", + tm2 ? 
"TM2" : "TM1"); + + /* enable thermal throttle processing */ + atomic_set(&therm_throt_en, 1); +} diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c new file mode 100644 index 00000000000..7245980186e --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/threshold.c @@ -0,0 +1,41 @@ +/* + * Common corrected MCE threshold handler code: + */ +#include <linux/interrupt.h> +#include <linux/kernel.h> + +#include <asm/irq_vectors.h> +#include <asm/apic.h> +#include <asm/idle.h> +#include <asm/mce.h> +#include <asm/trace/irq_vectors.h> + +static void default_threshold_interrupt(void) +{ + printk(KERN_ERR "Unexpected threshold interrupt at vector %x\n", + THRESHOLD_APIC_VECTOR); +} + +void (*mce_threshold_vector)(void) = default_threshold_interrupt; + +static inline void __smp_threshold_interrupt(void) +{ + inc_irq_stat(irq_threshold_count); + mce_threshold_vector(); +} + +asmlinkage __visible void smp_threshold_interrupt(void) +{ + entering_irq(); + __smp_threshold_interrupt(); + exiting_ack_irq(); +} + +asmlinkage __visible void smp_trace_threshold_interrupt(void) +{ + entering_irq(); + trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR); + __smp_threshold_interrupt(); + trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR); + exiting_ack_irq(); +} diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c new file mode 100644 index 00000000000..7dc5564d0cd --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/winchip.c @@ -0,0 +1,38 @@ +/* + * IDT Winchip specific Machine Check Exception Reporting + * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk> + */ +#include <linux/interrupt.h> +#include <linux/kernel.h> +#include <linux/types.h> + +#include <asm/processor.h> +#include <asm/mce.h> +#include <asm/msr.h> + +/* Machine check handler for WinChip C6: */ +static void winchip_machine_check(struct pt_regs *regs, long error_code) +{ + printk(KERN_EMERG "CPU0: Machine Check Exception.\n"); + add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); +} + +/* Set up machine check reporting on the Winchip C6 series */ +void winchip_mcheck_init(struct cpuinfo_x86 *c) +{ + u32 lo, hi; + + machine_check_vector = winchip_machine_check; + /* Make sure the vector pointer is visible before we enable MCEs: */ + wmb(); + + rdmsr(MSR_IDT_FCR1, lo, hi); + lo |= (1<<2); /* Enable EIERRINT (int 18 MCE) */ + lo &= ~(1<<4); /* Enable MCE */ + wrmsr(MSR_IDT_FCR1, lo, hi); + + set_in_cr4(X86_CR4_MCE); + + printk(KERN_INFO + "Winchip machine check reporting enabled on CPU#0.\n"); +} diff --git a/arch/x86/kernel/cpu/microcode/Makefile b/arch/x86/kernel/cpu/microcode/Makefile new file mode 100644 index 00000000000..285c85427c3 --- /dev/null +++ b/arch/x86/kernel/cpu/microcode/Makefile @@ -0,0 +1,7 @@ +microcode-y := core.o +obj-$(CONFIG_MICROCODE) += microcode.o +microcode-$(CONFIG_MICROCODE_INTEL) += intel.o intel_lib.o +microcode-$(CONFIG_MICROCODE_AMD) += amd.o +obj-$(CONFIG_MICROCODE_EARLY) += core_early.o +obj-$(CONFIG_MICROCODE_INTEL_EARLY) += intel_early.o +obj-$(CONFIG_MICROCODE_AMD_EARLY) += amd_early.o diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c new file mode 100644 index 00000000000..8fffd845e22 --- /dev/null +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -0,0 +1,492 @@ +/* + * AMD CPU Microcode Update Driver for Linux + * Copyright (C) 2008-2011 Advanced Micro Devices Inc. 
+ * + * Author: Peter Oruba <peter.oruba@amd.com> + * + * Based on work by: + * Tigran Aivazian <tigran@aivazian.fsnet.co.uk> + * + * Maintainers: + * Andreas Herrmann <herrmann.der.user@googlemail.com> + * Borislav Petkov <bp@alien8.de> + * + * This driver allows to upgrade microcode on F10h AMD + * CPUs and later. + * + * Licensed under the terms of the GNU General Public + * License version 2. See file COPYING for details. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/firmware.h> +#include <linux/pci_ids.h> +#include <linux/uaccess.h> +#include <linux/vmalloc.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/pci.h> + +#include <asm/microcode.h> +#include <asm/processor.h> +#include <asm/msr.h> +#include <asm/microcode_amd.h> + +MODULE_DESCRIPTION("AMD Microcode Update Driver"); +MODULE_AUTHOR("Peter Oruba"); +MODULE_LICENSE("GPL v2"); + +static struct equiv_cpu_entry *equiv_cpu_table; + +struct ucode_patch { + struct list_head plist; + void *data; + u32 patch_id; + u16 equiv_cpu; +}; + +static LIST_HEAD(pcache); + +static u16 __find_equiv_id(unsigned int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + return find_equiv_id(equiv_cpu_table, uci->cpu_sig.sig); +} + +static u32 find_cpu_family_by_equiv_cpu(u16 equiv_cpu) +{ + int i = 0; + + BUG_ON(!equiv_cpu_table); + + while (equiv_cpu_table[i].equiv_cpu != 0) { + if (equiv_cpu == equiv_cpu_table[i].equiv_cpu) + return equiv_cpu_table[i].installed_cpu; + i++; + } + return 0; +} + +/* + * a small, trivial cache of per-family ucode patches + */ +static struct ucode_patch *cache_find_patch(u16 equiv_cpu) +{ + struct ucode_patch *p; + + list_for_each_entry(p, &pcache, plist) + if (p->equiv_cpu == equiv_cpu) + return p; + return NULL; +} + +static void update_cache(struct ucode_patch *new_patch) +{ + struct ucode_patch *p; + + list_for_each_entry(p, &pcache, plist) { + if (p->equiv_cpu == new_patch->equiv_cpu) { + if (p->patch_id >= new_patch->patch_id) + /* we already have the latest patch */ + return; + + list_replace(&p->plist, &new_patch->plist); + kfree(p->data); + kfree(p); + return; + } + } + /* no patch found, add it */ + list_add_tail(&new_patch->plist, &pcache); +} + +static void free_cache(void) +{ + struct ucode_patch *p, *tmp; + + list_for_each_entry_safe(p, tmp, &pcache, plist) { + __list_del(p->plist.prev, p->plist.next); + kfree(p->data); + kfree(p); + } +} + +static struct ucode_patch *find_patch(unsigned int cpu) +{ + u16 equiv_id; + + equiv_id = __find_equiv_id(cpu); + if (!equiv_id) + return NULL; + + return cache_find_patch(equiv_id); +} + +static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) +{ + struct cpuinfo_x86 *c = &cpu_data(cpu); + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + struct ucode_patch *p; + + csig->sig = cpuid_eax(0x00000001); + csig->rev = c->microcode; + + /* + * a patch could have been loaded early, set uci->mc so that + * mc_bp_resume() can call apply_microcode() + */ + p = find_patch(cpu); + if (p && (p->patch_id == csig->rev)) + uci->mc = p->data; + + pr_info("CPU%d: patch_level=0x%08x\n", cpu, csig->rev); + + return 0; +} + +static unsigned int verify_patch_size(u8 family, u32 patch_size, + unsigned int size) +{ + u32 max_size; + +#define F1XH_MPB_MAX_SIZE 2048 +#define F14H_MPB_MAX_SIZE 1824 +#define F15H_MPB_MAX_SIZE 4096 +#define F16H_MPB_MAX_SIZE 3458 + + switch (family) { + case 0x14: + max_size = F14H_MPB_MAX_SIZE; + break; + case 0x15: + max_size = F15H_MPB_MAX_SIZE; + break; + case 0x16: + max_size = 
F16H_MPB_MAX_SIZE; + break; + default: + max_size = F1XH_MPB_MAX_SIZE; + break; + } + + if (patch_size > min_t(u32, size, max_size)) { + pr_err("patch size mismatch\n"); + return 0; + } + + return patch_size; +} + +int __apply_microcode_amd(struct microcode_amd *mc_amd) +{ + u32 rev, dummy; + + native_wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code); + + /* verify patch application was successful */ + native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); + if (rev != mc_amd->hdr.patch_id) + return -1; + + return 0; +} + +int apply_microcode_amd(int cpu) +{ + struct cpuinfo_x86 *c = &cpu_data(cpu); + struct microcode_amd *mc_amd; + struct ucode_cpu_info *uci; + struct ucode_patch *p; + u32 rev, dummy; + + BUG_ON(raw_smp_processor_id() != cpu); + + uci = ucode_cpu_info + cpu; + + p = find_patch(cpu); + if (!p) + return 0; + + mc_amd = p->data; + uci->mc = p->data; + + rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); + + /* need to apply patch? */ + if (rev >= mc_amd->hdr.patch_id) { + c->microcode = rev; + uci->cpu_sig.rev = rev; + return 0; + } + + if (__apply_microcode_amd(mc_amd)) { + pr_err("CPU%d: update failed for patch_level=0x%08x\n", + cpu, mc_amd->hdr.patch_id); + return -1; + } + pr_info("CPU%d: new patch_level=0x%08x\n", cpu, + mc_amd->hdr.patch_id); + + uci->cpu_sig.rev = mc_amd->hdr.patch_id; + c->microcode = mc_amd->hdr.patch_id; + + return 0; +} + +static int install_equiv_cpu_table(const u8 *buf) +{ + unsigned int *ibuf = (unsigned int *)buf; + unsigned int type = ibuf[1]; + unsigned int size = ibuf[2]; + + if (type != UCODE_EQUIV_CPU_TABLE_TYPE || !size) { + pr_err("empty section/" + "invalid type field in container file section header\n"); + return -EINVAL; + } + + equiv_cpu_table = vmalloc(size); + if (!equiv_cpu_table) { + pr_err("failed to allocate equivalent CPU table\n"); + return -ENOMEM; + } + + memcpy(equiv_cpu_table, buf + CONTAINER_HDR_SZ, size); + + /* add header length */ + return size + CONTAINER_HDR_SZ; +} + +static void free_equiv_cpu_table(void) +{ + vfree(equiv_cpu_table); + equiv_cpu_table = NULL; +} + +static void cleanup(void) +{ + free_equiv_cpu_table(); + free_cache(); +} + +/* + * We return the current size even if some of the checks failed so that + * we can skip over the next patch. If we return a negative value, we + * signal a grave error like a memory allocation has failed and the + * driver cannot continue functioning normally. In such cases, we tear + * down everything we've used up so far and exit. 
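+ * (e.g. a patch for a different CPU family is not an error: we still + * return its crnt_size so the caller can advance to the next section.)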
+ */ +static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover) +{ + struct microcode_header_amd *mc_hdr; + struct ucode_patch *patch; + unsigned int patch_size, crnt_size, ret; + u32 proc_fam; + u16 proc_id; + + patch_size = *(u32 *)(fw + 4); + crnt_size = patch_size + SECTION_HDR_SIZE; + mc_hdr = (struct microcode_header_amd *)(fw + SECTION_HDR_SIZE); + proc_id = mc_hdr->processor_rev_id; + + proc_fam = find_cpu_family_by_equiv_cpu(proc_id); + if (!proc_fam) { + pr_err("No patch family for equiv ID: 0x%04x\n", proc_id); + return crnt_size; + } + + /* check if patch is for the current family */ + proc_fam = ((proc_fam >> 8) & 0xf) + ((proc_fam >> 20) & 0xff); + if (proc_fam != family) + return crnt_size; + + if (mc_hdr->nb_dev_id || mc_hdr->sb_dev_id) { + pr_err("Patch-ID 0x%08x: chipset-specific code unsupported.\n", + mc_hdr->patch_id); + return crnt_size; + } + + ret = verify_patch_size(family, patch_size, leftover); + if (!ret) { + pr_err("Patch-ID 0x%08x: size mismatch.\n", mc_hdr->patch_id); + return crnt_size; + } + + patch = kzalloc(sizeof(*patch), GFP_KERNEL); + if (!patch) { + pr_err("Patch allocation failure.\n"); + return -EINVAL; + } + + patch->data = kzalloc(patch_size, GFP_KERNEL); + if (!patch->data) { + pr_err("Patch data allocation failure.\n"); + kfree(patch); + return -EINVAL; + } + + /* All looks ok, copy patch... */ + memcpy(patch->data, fw + SECTION_HDR_SIZE, patch_size); + INIT_LIST_HEAD(&patch->plist); + patch->patch_id = mc_hdr->patch_id; + patch->equiv_cpu = proc_id; + + pr_debug("%s: Added patch_id: 0x%08x, proc_id: 0x%04x\n", + __func__, patch->patch_id, proc_id); + + /* ... and add to cache. */ + update_cache(patch); + + return crnt_size; +} + +static enum ucode_state __load_microcode_amd(u8 family, const u8 *data, + size_t size) +{ + enum ucode_state ret = UCODE_ERROR; + unsigned int leftover; + u8 *fw = (u8 *)data; + int crnt_size = 0; + int offset; + + offset = install_equiv_cpu_table(data); + if (offset < 0) { + pr_err("failed to create equivalent cpu table\n"); + return ret; + } + fw += offset; + leftover = size - offset; + + if (*(u32 *)fw != UCODE_UCODE_TYPE) { + pr_err("invalid type field in container file section header\n"); + free_equiv_cpu_table(); + return ret; + } + + while (leftover) { + crnt_size = verify_and_add_patch(family, fw, leftover); + if (crnt_size < 0) + return ret; + + fw += crnt_size; + leftover -= crnt_size; + } + + return UCODE_OK; +} + +enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size) +{ + enum ucode_state ret; + + /* free old equiv table */ + free_equiv_cpu_table(); + + ret = __load_microcode_amd(family, data, size); + + if (ret != UCODE_OK) + cleanup(); + +#if defined(CONFIG_MICROCODE_AMD_EARLY) && defined(CONFIG_X86_32) + /* save BSP's matching patch for early load */ + if (cpu_data(smp_processor_id()).cpu_index == boot_cpu_data.cpu_index) { + struct ucode_patch *p = find_patch(smp_processor_id()); + if (p) { + memset(amd_ucode_patch, 0, PATCH_MAX_SIZE); + memcpy(amd_ucode_patch, p->data, min_t(u32, ksize(p->data), + PATCH_MAX_SIZE)); + } + } +#endif + return ret; +} + +/* + * AMD microcode firmware naming convention, up to family 15h they are in + * the legacy file: + * + * amd-ucode/microcode_amd.bin + * + * This legacy file is always smaller than 2K in size. + * + * Beginning with family 15h, they are in family-specific firmware files: + * + * amd-ucode/microcode_amd_fam15h.bin + * amd-ucode/microcode_amd_fam16h.bin + * ... + * + * These might be larger than 2K. 
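+ * + * The family-specific name is built at runtime from the format string + * "amd-ucode/microcode_amd_fam%.2xh.bin" and c->x86, see + * request_microcode_amd() below.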
+ */ +static enum ucode_state request_microcode_amd(int cpu, struct device *device, + bool refresh_fw) +{ + char fw_name[36] = "amd-ucode/microcode_amd.bin"; + struct cpuinfo_x86 *c = &cpu_data(cpu); + enum ucode_state ret = UCODE_NFOUND; + const struct firmware *fw; + + /* reload ucode container only on the boot cpu */ + if (!refresh_fw || c->cpu_index != boot_cpu_data.cpu_index) + return UCODE_OK; + + if (c->x86 >= 0x15) + snprintf(fw_name, sizeof(fw_name), "amd-ucode/microcode_amd_fam%.2xh.bin", c->x86); + + if (request_firmware_direct(&fw, (const char *)fw_name, device)) { + pr_debug("failed to load file %s\n", fw_name); + goto out; + } + + ret = UCODE_ERROR; + if (*(u32 *)fw->data != UCODE_MAGIC) { + pr_err("invalid magic value (0x%08x)\n", *(u32 *)fw->data); + goto fw_release; + } + + ret = load_microcode_amd(c->x86, fw->data, fw->size); + + fw_release: + release_firmware(fw); + + out: + return ret; +} + +static enum ucode_state +request_microcode_user(int cpu, const void __user *buf, size_t size) +{ + return UCODE_ERROR; +} + +static void microcode_fini_cpu_amd(int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + uci->mc = NULL; +} + +static struct microcode_ops microcode_amd_ops = { + .request_microcode_user = request_microcode_user, + .request_microcode_fw = request_microcode_amd, + .collect_cpu_info = collect_cpu_info_amd, + .apply_microcode = apply_microcode_amd, + .microcode_fini_cpu = microcode_fini_cpu_amd, +}; + +struct microcode_ops * __init init_amd_microcode(void) +{ + struct cpuinfo_x86 *c = &cpu_data(0); + + if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) { + pr_warning("AMD CPU family 0x%x not supported\n", c->x86); + return NULL; + } + + return &microcode_amd_ops; +} + +void __exit exit_amd_microcode(void) +{ + cleanup(); +} diff --git a/arch/x86/kernel/cpu/microcode/amd_early.c b/arch/x86/kernel/cpu/microcode/amd_early.c new file mode 100644 index 00000000000..617a9e28424 --- /dev/null +++ b/arch/x86/kernel/cpu/microcode/amd_early.c @@ -0,0 +1,395 @@ +/* + * Copyright (C) 2013 Advanced Micro Devices, Inc. + * + * Author: Jacob Shin <jacob.shin@amd.com> + * Fixes: Borislav Petkov <bp@suse.de> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/earlycpio.h> +#include <linux/initrd.h> + +#include <asm/cpu.h> +#include <asm/setup.h> +#include <asm/microcode_amd.h> + +/* + * This points to the current valid container of microcode patches which we will + * save from the initrd before jettisoning its contents. + */ +static u8 *container; +static size_t container_size; + +static u32 ucode_new_rev; +u8 amd_ucode_patch[PATCH_MAX_SIZE]; +static u16 this_equiv_id; + +struct cpio_data ucode_cpio; + +/* + * Microcode patch container file is prepended to the initrd in cpio format. + * See Documentation/x86/early-microcode.txt + */ +static __initdata char ucode_path[] = "kernel/x86/microcode/AuthenticAMD.bin"; + +static struct cpio_data __init find_ucode_in_initrd(void) +{ + long offset = 0; + char *path; + void *start; + size_t size; + +#ifdef CONFIG_X86_32 + struct boot_params *p; + + /* + * On 32-bit, early load occurs before paging is turned on so we need + * to use physical addresses.
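+ * (hence the __pa_nodebug() conversions below: even the address of + * boot_params itself must be translated before it is dereferenced)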
+ */ + p = (struct boot_params *)__pa_nodebug(&boot_params); + path = (char *)__pa_nodebug(ucode_path); + start = (void *)p->hdr.ramdisk_image; + size = p->hdr.ramdisk_size; +#else + path = ucode_path; + start = (void *)(boot_params.hdr.ramdisk_image + PAGE_OFFSET); + size = boot_params.hdr.ramdisk_size; +#endif + + return find_cpio_data(path, start, size, &offset); +} + +static size_t compute_container_size(u8 *data, u32 total_size) +{ + size_t size = 0; + u32 *header = (u32 *)data; + + if (header[0] != UCODE_MAGIC || + header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */ + header[2] == 0) /* size */ + return size; + + size = header[2] + CONTAINER_HDR_SZ; + total_size -= size; + data += size; + + while (total_size) { + u16 patch_size; + + header = (u32 *)data; + + if (header[0] != UCODE_UCODE_TYPE) + break; + + /* + * Sanity-check patch size. + */ + patch_size = header[1]; + if (patch_size > PATCH_MAX_SIZE) + break; + + size += patch_size + SECTION_HDR_SIZE; + data += patch_size + SECTION_HDR_SIZE; + total_size -= patch_size + SECTION_HDR_SIZE; + } + + return size; +} + +/* + * Early load occurs before we can vmalloc(). So we look for the microcode + * patch container file in initrd, traverse equivalent cpu table, look for a + * matching microcode patch, and update, all in initrd memory in place. + * When vmalloc() is available for use later -- on 64-bit during first AP load, + * and on 32-bit during save_microcode_in_initrd_amd() -- we can call + * load_microcode_amd() to save equivalent cpu table and microcode patches in + * kernel heap memory. + */ +static void apply_ucode_in_initrd(void *ucode, size_t size) +{ + struct equiv_cpu_entry *eq; + size_t *cont_sz; + u32 *header; + u8 *data, **cont; + u16 eq_id = 0; + int offset, left; + u32 rev, eax, ebx, ecx, edx; + u32 *new_rev; + +#ifdef CONFIG_X86_32 + new_rev = (u32 *)__pa_nodebug(&ucode_new_rev); + cont_sz = (size_t *)__pa_nodebug(&container_size); + cont = (u8 **)__pa_nodebug(&container); +#else + new_rev = &ucode_new_rev; + cont_sz = &container_size; + cont = &container; +#endif + + data = ucode; + left = size; + header = (u32 *)data; + + /* find equiv cpu table */ + if (header[0] != UCODE_MAGIC || + header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */ + header[2] == 0) /* size */ + return; + + eax = 0x00000001; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + + while (left > 0) { + eq = (struct equiv_cpu_entry *)(data + CONTAINER_HDR_SZ); + + *cont = data; + + /* Advance past the container header */ + offset = header[2] + CONTAINER_HDR_SZ; + data += offset; + left -= offset; + + eq_id = find_equiv_id(eq, eax); + if (eq_id) { + this_equiv_id = eq_id; + *cont_sz = compute_container_size(*cont, left + offset); + + /* + * truncate how much we need to iterate over in the + * ucode update loop below + */ + left = *cont_sz - offset; + break; + } + + /* + * support multiple container files appended together. if this + * one does not have a matching equivalent cpu entry, we fast + * forward to the next container file. 
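+ * (the skipping below walks the patch sections one at a time using each + * section header's size field until the next UCODE_MAGIC is found)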
+ */ + while (left > 0) { + header = (u32 *)data; + if (header[0] == UCODE_MAGIC && + header[1] == UCODE_EQUIV_CPU_TABLE_TYPE) + break; + + offset = header[1] + SECTION_HDR_SIZE; + data += offset; + left -= offset; + } + + /* mark where the next microcode container file starts */ + offset = data - (u8 *)ucode; + ucode = data; + } + + if (!eq_id) { + *cont = NULL; + *cont_sz = 0; + return; + } + + /* find ucode and update if needed */ + + native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax); + + while (left > 0) { + struct microcode_amd *mc; + + header = (u32 *)data; + if (header[0] != UCODE_UCODE_TYPE || /* type */ + header[1] == 0) /* size */ + break; + + mc = (struct microcode_amd *)(data + SECTION_HDR_SIZE); + + if (eq_id == mc->hdr.processor_rev_id && rev < mc->hdr.patch_id) { + + if (!__apply_microcode_amd(mc)) { + rev = mc->hdr.patch_id; + *new_rev = rev; + + /* save ucode patch */ + memcpy(amd_ucode_patch, mc, + min_t(u32, header[1], PATCH_MAX_SIZE)); + } + } + + offset = header[1] + SECTION_HDR_SIZE; + data += offset; + left -= offset; + } +} + +void __init load_ucode_amd_bsp(void) +{ + struct cpio_data cp; + void **data; + size_t *size; + +#ifdef CONFIG_X86_32 + data = (void **)__pa_nodebug(&ucode_cpio.data); + size = (size_t *)__pa_nodebug(&ucode_cpio.size); +#else + data = &ucode_cpio.data; + size = &ucode_cpio.size; +#endif + + cp = find_ucode_in_initrd(); + if (!cp.data) + return; + + *data = cp.data; + *size = cp.size; + + apply_ucode_in_initrd(cp.data, cp.size); +} + +#ifdef CONFIG_X86_32 +/* + * On 32-bit, since AP's early load occurs before paging is turned on, we + * cannot traverse cpu_equiv_table and pcache in kernel heap memory. So during + * cold boot, AP will apply_ucode_in_initrd() just like the BSP. During + * save_microcode_in_initrd_amd() BSP's patch is copied to amd_ucode_patch, + * which is used upon resume from suspend. + */ +void load_ucode_amd_ap(void) +{ + struct microcode_amd *mc; + size_t *usize; + void **ucode; + + mc = (struct microcode_amd *)__pa(amd_ucode_patch); + if (mc->hdr.patch_id && mc->hdr.processor_rev_id) { + __apply_microcode_amd(mc); + return; + } + + ucode = (void *)__pa_nodebug(&container); + usize = (size_t *)__pa_nodebug(&container_size); + + if (!*ucode || !*usize) + return; + + apply_ucode_in_initrd(*ucode, *usize); +} + +static void __init collect_cpu_sig_on_bsp(void *arg) +{ + unsigned int cpu = smp_processor_id(); + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + uci->cpu_sig.sig = cpuid_eax(0x00000001); +} + +static void __init get_bsp_sig(void) +{ + unsigned int bsp = boot_cpu_data.cpu_index; + struct ucode_cpu_info *uci = ucode_cpu_info + bsp; + + if (!uci->cpu_sig.sig) + smp_call_function_single(bsp, collect_cpu_sig_on_bsp, NULL, 1); +} +#else +void load_ucode_amd_ap(void) +{ + unsigned int cpu = smp_processor_id(); + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + struct equiv_cpu_entry *eq; + struct microcode_amd *mc; + u32 rev, eax; + u16 eq_id; + + /* Exit if called on the BSP. 
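The BSP (cpu 0) was already handled by load_ucode_amd_bsp().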
*/ + if (!cpu) + return; + + if (!container) + return; + + rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax); + + uci->cpu_sig.rev = rev; + uci->cpu_sig.sig = eax; + + eax = cpuid_eax(0x00000001); + eq = (struct equiv_cpu_entry *)(container + CONTAINER_HDR_SZ); + + eq_id = find_equiv_id(eq, eax); + if (!eq_id) + return; + + if (eq_id == this_equiv_id) { + mc = (struct microcode_amd *)amd_ucode_patch; + + if (mc && rev < mc->hdr.patch_id) { + if (!__apply_microcode_amd(mc)) + ucode_new_rev = mc->hdr.patch_id; + } + + } else { + if (!ucode_cpio.data) + return; + + /* + * AP has a different equivalence ID than BSP, looks like + * mixed-steppings silicon so go through the ucode blob anew. + */ + apply_ucode_in_initrd(ucode_cpio.data, ucode_cpio.size); + } +} +#endif + +int __init save_microcode_in_initrd_amd(void) +{ + unsigned long cont; + enum ucode_state ret; + u32 eax; + + if (!container) + return -EINVAL; + +#ifdef CONFIG_X86_32 + get_bsp_sig(); + cont = (unsigned long)container; +#else + /* + * We need the physical address of the container for both bitness since + * boot_params.hdr.ramdisk_image is a physical address. + */ + cont = __pa(container); +#endif + + /* + * Take into account the fact that the ramdisk might get relocated and + * therefore we need to recompute the container's position in virtual + * memory space. + */ + if (relocated_ramdisk) + container = (u8 *)(__va(relocated_ramdisk) + + (cont - boot_params.hdr.ramdisk_image)); + + if (ucode_new_rev) + pr_info("microcode: updated early to new patch_level=0x%08x\n", + ucode_new_rev); + + eax = cpuid_eax(0x00000001); + eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); + + ret = load_microcode_amd(eax, container, container_size); + if (ret != UCODE_OK) + return -EINVAL; + + /* + * This will be freed any msec now, stash patches for the current + * family and switch to patch cache for cpu hotplug, etc later. + */ + container = NULL; + container_size = 0; + + return 0; +} diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c new file mode 100644 index 00000000000..dd9d6190b08 --- /dev/null +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -0,0 +1,651 @@ +/* + * Intel CPU Microcode Update Driver for Linux + * + * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk> + * 2006 Shaohua Li <shaohua.li@intel.com> + * + * This driver allows to upgrade microcode on Intel processors + * belonging to IA-32 family - PentiumPro, Pentium II, + * Pentium III, Xeon, Pentium 4, etc. + * + * Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture + * Software Developer's Manual + * Order Number 253668 or free download from: + * + * http://developer.intel.com/Assets/PDF/manual/253668.pdf + * + * For more information, go to http://www.urbanmyth.org/microcode + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * 1.0 16 Feb 2000, Tigran Aivazian <tigran@sco.com> + * Initial release. + * 1.01 18 Feb 2000, Tigran Aivazian <tigran@sco.com> + * Added read() support + cleanups. + * 1.02 21 Feb 2000, Tigran Aivazian <tigran@sco.com> + * Added 'device trimming' support. open(O_WRONLY) zeroes + * and frees the saved copy of applied microcode. + * 1.03 29 Feb 2000, Tigran Aivazian <tigran@sco.com> + * Made to use devfs (/dev/cpu/microcode) + cleanups. 
+ * 1.04 06 Jun 2000, Simon Trimmer <simon@veritas.com> + * Added misc device support (now uses both devfs and misc). + * Added MICROCODE_IOCFREE ioctl to clear memory. + * 1.05 09 Jun 2000, Simon Trimmer <simon@veritas.com> + * Messages for error cases (non Intel & no suitable microcode). + * 1.06 03 Aug 2000, Tigran Aivazian <tigran@veritas.com> + * Removed ->release(). Removed exclusive open and status bitmap. + * Added microcode_rwsem to serialize read()/write()/ioctl(). + * Removed global kernel lock usage. + * 1.07 07 Sep 2000, Tigran Aivazian <tigran@veritas.com> + * Write 0 to 0x8B msr and then cpuid before reading revision, + * so that it works even if there were no update done by the + * BIOS. Otherwise, reading from 0x8B gives junk (which happened + * to be 0 on my machine which is why it worked even when I + * disabled update by the BIOS) + * Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix. + * 1.08 11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and + * Tigran Aivazian <tigran@veritas.com> + * Intel Pentium 4 processor support and bugfixes. + * 1.09 30 Oct 2001, Tigran Aivazian <tigran@veritas.com> + * Bugfix for HT (Hyper-Threading) enabled processors + * whereby processor resources are shared by all logical processors + * in a single CPU package. + * 1.10 28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and + * Tigran Aivazian <tigran@veritas.com>, + * Serialize updates as required on HT processors due to + * speculative nature of implementation. + * 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com> + * Fix the panic when writing zero-length microcode chunk. + * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>, + * Jun Nakajima <jun.nakajima@intel.com> + * Support for the microcode updates in the new format. + * 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com> + * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl + * because we no longer hold a copy of applied microcode + * in kernel memory. + * 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com> + * Fix sigmatch() macro to handle old CPUs with pf == 0. + * Thanks to Stuart Swales for pointing out this bug. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/platform_device.h> +#include <linux/miscdevice.h> +#include <linux/capability.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/cpu.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/syscore_ops.h> + +#include <asm/microcode.h> +#include <asm/processor.h> +#include <asm/cpu_device_id.h> +#include <asm/perf_event.h> + +MODULE_DESCRIPTION("Microcode Update Driver"); +MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>"); +MODULE_LICENSE("GPL"); + +#define MICROCODE_VERSION "2.00" + +static struct microcode_ops *microcode_ops; + +bool dis_ucode_ldr; +module_param(dis_ucode_ldr, bool, 0); + +/* + * Synchronization. + * + * All non cpu-hotplug-callback call sites use: + * + * - microcode_mutex to synchronize with each other; + * - get/put_online_cpus() to synchronize with + * the cpu-hotplug-callback call sites. + * + * We guarantee that only a single cpu is being + * updated at any particular moment of time. 
+ */ +static DEFINE_MUTEX(microcode_mutex); + +struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; +EXPORT_SYMBOL_GPL(ucode_cpu_info); + +/* + * Operations that are run on a target cpu: + */ + +struct cpu_info_ctx { + struct cpu_signature *cpu_sig; + int err; +}; + +static void collect_cpu_info_local(void *arg) +{ + struct cpu_info_ctx *ctx = arg; + + ctx->err = microcode_ops->collect_cpu_info(smp_processor_id(), + ctx->cpu_sig); +} + +static int collect_cpu_info_on_target(int cpu, struct cpu_signature *cpu_sig) +{ + struct cpu_info_ctx ctx = { .cpu_sig = cpu_sig, .err = 0 }; + int ret; + + ret = smp_call_function_single(cpu, collect_cpu_info_local, &ctx, 1); + if (!ret) + ret = ctx.err; + + return ret; +} + +static int collect_cpu_info(int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + int ret; + + memset(uci, 0, sizeof(*uci)); + + ret = collect_cpu_info_on_target(cpu, &uci->cpu_sig); + if (!ret) + uci->valid = 1; + + return ret; +} + +struct apply_microcode_ctx { + int err; +}; + +static void apply_microcode_local(void *arg) +{ + struct apply_microcode_ctx *ctx = arg; + + ctx->err = microcode_ops->apply_microcode(smp_processor_id()); +} + +static int apply_microcode_on_target(int cpu) +{ + struct apply_microcode_ctx ctx = { .err = 0 }; + int ret; + + ret = smp_call_function_single(cpu, apply_microcode_local, &ctx, 1); + if (!ret) + ret = ctx.err; + + return ret; +} + +#ifdef CONFIG_MICROCODE_OLD_INTERFACE +static int do_microcode_update(const void __user *buf, size_t size) +{ + int error = 0; + int cpu; + + for_each_online_cpu(cpu) { + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + enum ucode_state ustate; + + if (!uci->valid) + continue; + + ustate = microcode_ops->request_microcode_user(cpu, buf, size); + if (ustate == UCODE_ERROR) { + error = -1; + break; + } else if (ustate == UCODE_OK) + apply_microcode_on_target(cpu); + } + + return error; +} + +static int microcode_open(struct inode *inode, struct file *file) +{ + return capable(CAP_SYS_RAWIO) ? 
nonseekable_open(inode, file) : -EPERM; +} + +static ssize_t microcode_write(struct file *file, const char __user *buf, + size_t len, loff_t *ppos) +{ + ssize_t ret = -EINVAL; + + if ((len >> PAGE_SHIFT) > totalram_pages) { + pr_err("too much data (max %ld pages)\n", totalram_pages); + return ret; + } + + get_online_cpus(); + mutex_lock(µcode_mutex); + + if (do_microcode_update(buf, len) == 0) + ret = (ssize_t)len; + + if (ret > 0) + perf_check_microcode(); + + mutex_unlock(µcode_mutex); + put_online_cpus(); + + return ret; +} + +static const struct file_operations microcode_fops = { + .owner = THIS_MODULE, + .write = microcode_write, + .open = microcode_open, + .llseek = no_llseek, +}; + +static struct miscdevice microcode_dev = { + .minor = MICROCODE_MINOR, + .name = "microcode", + .nodename = "cpu/microcode", + .fops = µcode_fops, +}; + +static int __init microcode_dev_init(void) +{ + int error; + + error = misc_register(µcode_dev); + if (error) { + pr_err("can't misc_register on minor=%d\n", MICROCODE_MINOR); + return error; + } + + return 0; +} + +static void __exit microcode_dev_exit(void) +{ + misc_deregister(µcode_dev); +} + +MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); +MODULE_ALIAS("devname:cpu/microcode"); +#else +#define microcode_dev_init() 0 +#define microcode_dev_exit() do { } while (0) +#endif + +/* fake device for request_firmware */ +static struct platform_device *microcode_pdev; + +static int reload_for_cpu(int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + enum ucode_state ustate; + int err = 0; + + if (!uci->valid) + return err; + + ustate = microcode_ops->request_microcode_fw(cpu, µcode_pdev->dev, true); + if (ustate == UCODE_OK) + apply_microcode_on_target(cpu); + else + if (ustate == UCODE_ERROR) + err = -EINVAL; + return err; +} + +static ssize_t reload_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t size) +{ + unsigned long val; + int cpu; + ssize_t ret = 0, tmp_ret; + + ret = kstrtoul(buf, 0, &val); + if (ret) + return ret; + + if (val != 1) + return size; + + get_online_cpus(); + mutex_lock(µcode_mutex); + for_each_online_cpu(cpu) { + tmp_ret = reload_for_cpu(cpu); + if (tmp_ret != 0) + pr_warn("Error reloading microcode on CPU %d\n", cpu); + + /* save retval of the first encountered reload error */ + if (!ret) + ret = tmp_ret; + } + if (!ret) + perf_check_microcode(); + mutex_unlock(µcode_mutex); + put_online_cpus(); + + if (!ret) + ret = size; + + return ret; +} + +static ssize_t version_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; + + return sprintf(buf, "0x%x\n", uci->cpu_sig.rev); +} + +static ssize_t pf_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; + + return sprintf(buf, "0x%x\n", uci->cpu_sig.pf); +} + +static DEVICE_ATTR(reload, 0200, NULL, reload_store); +static DEVICE_ATTR(version, 0400, version_show, NULL); +static DEVICE_ATTR(processor_flags, 0400, pf_show, NULL); + +static struct attribute *mc_default_attrs[] = { + &dev_attr_version.attr, + &dev_attr_processor_flags.attr, + NULL +}; + +static struct attribute_group mc_attr_group = { + .attrs = mc_default_attrs, + .name = "microcode", +}; + +static void microcode_fini_cpu(int cpu) +{ + microcode_ops->microcode_fini_cpu(cpu); +} + +static enum ucode_state microcode_resume_cpu(int cpu) +{ + pr_debug("CPU%d updated upon resume\n", cpu); + + if (apply_microcode_on_target(cpu)) + return 
UCODE_ERROR; + + return UCODE_OK; +} + +static enum ucode_state microcode_init_cpu(int cpu, bool refresh_fw) +{ + enum ucode_state ustate; + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + if (uci && uci->valid) + return UCODE_OK; + + if (collect_cpu_info(cpu)) + return UCODE_ERROR; + + /* --dimm. Trigger a delayed update? */ + if (system_state != SYSTEM_RUNNING) + return UCODE_NFOUND; + + ustate = microcode_ops->request_microcode_fw(cpu, µcode_pdev->dev, + refresh_fw); + + if (ustate == UCODE_OK) { + pr_debug("CPU%d updated upon init\n", cpu); + apply_microcode_on_target(cpu); + } + + return ustate; +} + +static enum ucode_state microcode_update_cpu(int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + if (uci->valid) + return microcode_resume_cpu(cpu); + + return microcode_init_cpu(cpu, false); +} + +static int mc_device_add(struct device *dev, struct subsys_interface *sif) +{ + int err, cpu = dev->id; + + if (!cpu_online(cpu)) + return 0; + + pr_debug("CPU%d added\n", cpu); + + err = sysfs_create_group(&dev->kobj, &mc_attr_group); + if (err) + return err; + + if (microcode_init_cpu(cpu, true) == UCODE_ERROR) + return -EINVAL; + + return err; +} + +static int mc_device_remove(struct device *dev, struct subsys_interface *sif) +{ + int cpu = dev->id; + + if (!cpu_online(cpu)) + return 0; + + pr_debug("CPU%d removed\n", cpu); + microcode_fini_cpu(cpu); + sysfs_remove_group(&dev->kobj, &mc_attr_group); + return 0; +} + +static struct subsys_interface mc_cpu_interface = { + .name = "microcode", + .subsys = &cpu_subsys, + .add_dev = mc_device_add, + .remove_dev = mc_device_remove, +}; + +/** + * mc_bp_resume - Update boot CPU microcode during resume. + */ +static void mc_bp_resume(void) +{ + int cpu = smp_processor_id(); + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + if (uci->valid && uci->mc) + microcode_ops->apply_microcode(cpu); +} + +static struct syscore_ops mc_syscore_ops = { + .resume = mc_bp_resume, +}; + +static int +mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + struct device *dev; + + dev = get_cpu_device(cpu); + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_ONLINE: + microcode_update_cpu(cpu); + pr_debug("CPU%d added\n", cpu); + /* + * "break" is missing on purpose here because we want to fall + * through in order to create the sysfs group. + */ + + case CPU_DOWN_FAILED: + if (sysfs_create_group(&dev->kobj, &mc_attr_group)) + pr_err("Failed to create group for CPU%d\n", cpu); + break; + + case CPU_DOWN_PREPARE: + /* Suspend is in progress, only remove the interface */ + sysfs_remove_group(&dev->kobj, &mc_attr_group); + pr_debug("CPU%d removed\n", cpu); + break; + + /* + * case CPU_DEAD: + * + * When a CPU goes offline, don't free up or invalidate the copy of + * the microcode in kernel memory, so that we can reuse it when the + * CPU comes back online without unnecessarily requesting the userspace + * for it again. 
+ */ + } + + /* The CPU refused to come up during a system resume */ + if (action == CPU_UP_CANCELED_FROZEN) + microcode_fini_cpu(cpu); + + return NOTIFY_OK; +} + +static struct notifier_block __refdata mc_cpu_notifier = { + .notifier_call = mc_cpu_callback, +}; + +#ifdef MODULE +/* Autoload on Intel and AMD systems */ +static const struct x86_cpu_id __initconst microcode_id[] = { +#ifdef CONFIG_MICROCODE_INTEL + { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, }, +#endif +#ifdef CONFIG_MICROCODE_AMD + { X86_VENDOR_AMD, X86_FAMILY_ANY, X86_MODEL_ANY, }, +#endif + {} +}; +MODULE_DEVICE_TABLE(x86cpu, microcode_id); +#endif + +static struct attribute *cpu_root_microcode_attrs[] = { + &dev_attr_reload.attr, + NULL +}; + +static struct attribute_group cpu_root_microcode_group = { + .name = "microcode", + .attrs = cpu_root_microcode_attrs, +}; + +static int __init microcode_init(void) +{ + struct cpuinfo_x86 *c = &cpu_data(0); + int error; + + if (dis_ucode_ldr) + return 0; + + if (c->x86_vendor == X86_VENDOR_INTEL) + microcode_ops = init_intel_microcode(); + else if (c->x86_vendor == X86_VENDOR_AMD) + microcode_ops = init_amd_microcode(); + else + pr_err("no support for this CPU vendor\n"); + + if (!microcode_ops) + return -ENODEV; + + microcode_pdev = platform_device_register_simple("microcode", -1, + NULL, 0); + if (IS_ERR(microcode_pdev)) + return PTR_ERR(microcode_pdev); + + get_online_cpus(); + mutex_lock(µcode_mutex); + + error = subsys_interface_register(&mc_cpu_interface); + if (!error) + perf_check_microcode(); + mutex_unlock(µcode_mutex); + put_online_cpus(); + + if (error) + goto out_pdev; + + error = sysfs_create_group(&cpu_subsys.dev_root->kobj, + &cpu_root_microcode_group); + + if (error) { + pr_err("Error creating microcode group!\n"); + goto out_driver; + } + + error = microcode_dev_init(); + if (error) + goto out_ucode_group; + + register_syscore_ops(&mc_syscore_ops); + register_hotcpu_notifier(&mc_cpu_notifier); + + pr_info("Microcode Update Driver: v" MICROCODE_VERSION + " <tigran@aivazian.fsnet.co.uk>, Peter Oruba\n"); + + return 0; + + out_ucode_group: + sysfs_remove_group(&cpu_subsys.dev_root->kobj, + &cpu_root_microcode_group); + + out_driver: + get_online_cpus(); + mutex_lock(µcode_mutex); + + subsys_interface_unregister(&mc_cpu_interface); + + mutex_unlock(µcode_mutex); + put_online_cpus(); + + out_pdev: + platform_device_unregister(microcode_pdev); + return error; + +} +module_init(microcode_init); + +static void __exit microcode_exit(void) +{ + struct cpuinfo_x86 *c = &cpu_data(0); + + microcode_dev_exit(); + + unregister_hotcpu_notifier(&mc_cpu_notifier); + unregister_syscore_ops(&mc_syscore_ops); + + sysfs_remove_group(&cpu_subsys.dev_root->kobj, + &cpu_root_microcode_group); + + get_online_cpus(); + mutex_lock(µcode_mutex); + + subsys_interface_unregister(&mc_cpu_interface); + + mutex_unlock(µcode_mutex); + put_online_cpus(); + + platform_device_unregister(microcode_pdev); + + microcode_ops = NULL; + + if (c->x86_vendor == X86_VENDOR_AMD) + exit_amd_microcode(); + + pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n"); +} +module_exit(microcode_exit); diff --git a/arch/x86/kernel/cpu/microcode/core_early.c b/arch/x86/kernel/cpu/microcode/core_early.c new file mode 100644 index 00000000000..5f28a64e71e --- /dev/null +++ b/arch/x86/kernel/cpu/microcode/core_early.c @@ -0,0 +1,178 @@ +/* + * X86 CPU microcode early update for Linux + * + * Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com> + * H Peter Anvin" <hpa@zytor.com> + * + * This driver 
allows to early upgrade microcode on Intel processors + * belonging to IA-32 family - PentiumPro, Pentium II, + * Pentium III, Xeon, Pentium 4, etc. + * + * Reference: Section 9.11 of Volume 3, IA-32 Intel Architecture + * Software Developer's Manual. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <linux/module.h> +#include <asm/microcode.h> +#include <asm/microcode_intel.h> +#include <asm/microcode_amd.h> +#include <asm/processor.h> +#include <asm/cmdline.h> + +#define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24)) +#define CPUID_INTEL1 QCHAR('G', 'e', 'n', 'u') +#define CPUID_INTEL2 QCHAR('i', 'n', 'e', 'I') +#define CPUID_INTEL3 QCHAR('n', 't', 'e', 'l') +#define CPUID_AMD1 QCHAR('A', 'u', 't', 'h') +#define CPUID_AMD2 QCHAR('e', 'n', 't', 'i') +#define CPUID_AMD3 QCHAR('c', 'A', 'M', 'D') + +#define CPUID_IS(a, b, c, ebx, ecx, edx) \ + (!((ebx ^ (a))|(edx ^ (b))|(ecx ^ (c)))) + +/* + * In early loading microcode phase on BSP, boot_cpu_data is not set up yet. + * x86_vendor() gets vendor id for BSP. + * + * In 32 bit AP case, accessing boot_cpu_data needs linear address. To simplify + * coding, we still use x86_vendor() to get vendor id for AP. + * + * x86_vendor() gets vendor information directly through cpuid. + */ +static int x86_vendor(void) +{ + u32 eax = 0x00000000; + u32 ebx, ecx = 0, edx; + + native_cpuid(&eax, &ebx, &ecx, &edx); + + if (CPUID_IS(CPUID_INTEL1, CPUID_INTEL2, CPUID_INTEL3, ebx, ecx, edx)) + return X86_VENDOR_INTEL; + + if (CPUID_IS(CPUID_AMD1, CPUID_AMD2, CPUID_AMD3, ebx, ecx, edx)) + return X86_VENDOR_AMD; + + return X86_VENDOR_UNKNOWN; +} + +static int x86_family(void) +{ + u32 eax = 0x00000001; + u32 ebx, ecx = 0, edx; + int x86; + + native_cpuid(&eax, &ebx, &ecx, &edx); + + x86 = (eax >> 8) & 0xf; + if (x86 == 15) + x86 += (eax >> 20) & 0xff; + + return x86; +} + +static bool __init check_loader_disabled_bsp(void) +{ +#ifdef CONFIG_X86_32 + const char *cmdline = (const char *)__pa_nodebug(boot_command_line); + const char *opt = "dis_ucode_ldr"; + const char *option = (const char *)__pa_nodebug(opt); + bool *res = (bool *)__pa_nodebug(&dis_ucode_ldr); + +#else /* CONFIG_X86_64 */ + const char *cmdline = boot_command_line; + const char *option = "dis_ucode_ldr"; + bool *res = &dis_ucode_ldr; +#endif + + if (cmdline_find_option_bool(cmdline, option)) + *res = true; + + return *res; +} + +void __init load_ucode_bsp(void) +{ + int vendor, x86; + + if (check_loader_disabled_bsp()) + return; + + if (!have_cpuid_p()) + return; + + vendor = x86_vendor(); + x86 = x86_family(); + + switch (vendor) { + case X86_VENDOR_INTEL: + if (x86 >= 6) + load_ucode_intel_bsp(); + break; + case X86_VENDOR_AMD: + if (x86 >= 0x10) + load_ucode_amd_bsp(); + break; + default: + break; + } +} + +static bool check_loader_disabled_ap(void) +{ +#ifdef CONFIG_X86_32 + return __pa_nodebug(dis_ucode_ldr); +#else + return dis_ucode_ldr; +#endif +} + +void load_ucode_ap(void) +{ + int vendor, x86; + + if (check_loader_disabled_ap()) + return; + + if (!have_cpuid_p()) + return; + + vendor = x86_vendor(); + x86 = x86_family(); + + switch (vendor) { + case X86_VENDOR_INTEL: + if (x86 >= 6) + load_ucode_intel_ap(); + break; + case X86_VENDOR_AMD: + if (x86 >= 0x10) + load_ucode_amd_ap(); + break; + default: + break; + } +} + +int __init 
save_microcode_in_initrd(void) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + + switch (c->x86_vendor) { + case X86_VENDOR_INTEL: + if (c->x86 >= 6) + save_microcode_in_initrd_intel(); + break; + case X86_VENDOR_AMD: + if (c->x86 >= 0x10) + save_microcode_in_initrd_amd(); + break; + default: + break; + } + + return 0; +} diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c new file mode 100644 index 00000000000..a276fa75d9b --- /dev/null +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -0,0 +1,333 @@ +/* + * Intel CPU Microcode Update Driver for Linux + * + * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk> + * 2006 Shaohua Li <shaohua.li@intel.com> + * + * This driver allows to upgrade microcode on Intel processors + * belonging to IA-32 family - PentiumPro, Pentium II, + * Pentium III, Xeon, Pentium 4, etc. + * + * Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture + * Software Developer's Manual + * Order Number 253668 or free download from: + * + * http://developer.intel.com/Assets/PDF/manual/253668.pdf + * + * For more information, go to http://www.urbanmyth.org/microcode + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * 1.0 16 Feb 2000, Tigran Aivazian <tigran@sco.com> + * Initial release. + * 1.01 18 Feb 2000, Tigran Aivazian <tigran@sco.com> + * Added read() support + cleanups. + * 1.02 21 Feb 2000, Tigran Aivazian <tigran@sco.com> + * Added 'device trimming' support. open(O_WRONLY) zeroes + * and frees the saved copy of applied microcode. + * 1.03 29 Feb 2000, Tigran Aivazian <tigran@sco.com> + * Made to use devfs (/dev/cpu/microcode) + cleanups. + * 1.04 06 Jun 2000, Simon Trimmer <simon@veritas.com> + * Added misc device support (now uses both devfs and misc). + * Added MICROCODE_IOCFREE ioctl to clear memory. + * 1.05 09 Jun 2000, Simon Trimmer <simon@veritas.com> + * Messages for error cases (non Intel & no suitable microcode). + * 1.06 03 Aug 2000, Tigran Aivazian <tigran@veritas.com> + * Removed ->release(). Removed exclusive open and status bitmap. + * Added microcode_rwsem to serialize read()/write()/ioctl(). + * Removed global kernel lock usage. + * 1.07 07 Sep 2000, Tigran Aivazian <tigran@veritas.com> + * Write 0 to 0x8B msr and then cpuid before reading revision, + * so that it works even if there were no update done by the + * BIOS. Otherwise, reading from 0x8B gives junk (which happened + * to be 0 on my machine which is why it worked even when I + * disabled update by the BIOS) + * Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix. + * 1.08 11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and + * Tigran Aivazian <tigran@veritas.com> + * Intel Pentium 4 processor support and bugfixes. + * 1.09 30 Oct 2001, Tigran Aivazian <tigran@veritas.com> + * Bugfix for HT (Hyper-Threading) enabled processors + * whereby processor resources are shared by all logical processors + * in a single CPU package. + * 1.10 28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and + * Tigran Aivazian <tigran@veritas.com>, + * Serialize updates as required on HT processors due to + * speculative nature of implementation. + * 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com> + * Fix the panic when writing zero-length microcode chunk. 
+ * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>, + * Jun Nakajima <jun.nakajima@intel.com> + * Support for the microcode updates in the new format. + * 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com> + * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl + * because we no longer hold a copy of applied microcode + * in kernel memory. + * 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com> + * Fix sigmatch() macro to handle old CPUs with pf == 0. + * Thanks to Stuart Swales for pointing out this bug. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/firmware.h> +#include <linux/uaccess.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/vmalloc.h> + +#include <asm/microcode_intel.h> +#include <asm/processor.h> +#include <asm/msr.h> + +MODULE_DESCRIPTION("Microcode Update Driver"); +MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>"); +MODULE_LICENSE("GPL"); + +static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) +{ + struct cpuinfo_x86 *c = &cpu_data(cpu_num); + unsigned int val[2]; + + memset(csig, 0, sizeof(*csig)); + + csig->sig = cpuid_eax(0x00000001); + + if ((c->x86_model >= 5) || (c->x86 > 6)) { + /* get processor flags from MSR 0x17 */ + rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); + csig->pf = 1 << ((val[1] >> 18) & 7); + } + + csig->rev = c->microcode; + pr_info("CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n", + cpu_num, csig->sig, csig->pf, csig->rev); + + return 0; +} + +/* + * return 0 - no update found + * return 1 - found update + */ +static int get_matching_mc(struct microcode_intel *mc_intel, int cpu) +{ + struct cpu_signature cpu_sig; + unsigned int csig, cpf, crev; + + collect_cpu_info(cpu, &cpu_sig); + + csig = cpu_sig.sig; + cpf = cpu_sig.pf; + crev = cpu_sig.rev; + + return get_matching_microcode(csig, cpf, mc_intel, crev); +} + +int apply_microcode(int cpu) +{ + struct microcode_intel *mc_intel; + struct ucode_cpu_info *uci; + unsigned int val[2]; + int cpu_num = raw_smp_processor_id(); + struct cpuinfo_x86 *c = &cpu_data(cpu_num); + + uci = ucode_cpu_info + cpu; + mc_intel = uci->mc; + + /* We should bind the task to the CPU */ + BUG_ON(cpu_num != cpu); + + if (mc_intel == NULL) + return 0; + + /* + * Microcode on this CPU could be updated earlier. Only apply the + * microcode patch in mc_intel when it is newer than the one on this + * CPU. 
+ */ + if (get_matching_mc(mc_intel, cpu) == 0) + return 0; + + /* write microcode via MSR 0x79 */ + wrmsr(MSR_IA32_UCODE_WRITE, + (unsigned long) mc_intel->bits, + (unsigned long) mc_intel->bits >> 16 >> 16); + wrmsr(MSR_IA32_UCODE_REV, 0, 0); + + /* As documented in the SDM: Do a CPUID 1 here */ + sync_core(); + + /* get the current revision from MSR 0x8B */ + rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); + + if (val[1] != mc_intel->hdr.rev) { + pr_err("CPU%d update to revision 0x%x failed\n", + cpu_num, mc_intel->hdr.rev); + return -1; + } + pr_info("CPU%d updated to revision 0x%x, date = %04x-%02x-%02x\n", + cpu_num, val[1], + mc_intel->hdr.date & 0xffff, + mc_intel->hdr.date >> 24, + (mc_intel->hdr.date >> 16) & 0xff); + + uci->cpu_sig.rev = val[1]; + c->microcode = val[1]; + + return 0; +} + +static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, + int (*get_ucode_data)(void *, const void *, size_t)) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + u8 *ucode_ptr = data, *new_mc = NULL, *mc = NULL; + int new_rev = uci->cpu_sig.rev; + unsigned int leftover = size; + enum ucode_state state = UCODE_OK; + unsigned int curr_mc_size = 0; + unsigned int csig, cpf; + + while (leftover) { + struct microcode_header_intel mc_header; + unsigned int mc_size; + + if (get_ucode_data(&mc_header, ucode_ptr, sizeof(mc_header))) + break; + + mc_size = get_totalsize(&mc_header); + if (!mc_size || mc_size > leftover) { + pr_err("error! Bad data in microcode data file\n"); + break; + } + + /* For performance reasons, reuse mc area when possible */ + if (!mc || mc_size > curr_mc_size) { + vfree(mc); + mc = vmalloc(mc_size); + if (!mc) + break; + curr_mc_size = mc_size; + } + + if (get_ucode_data(mc, ucode_ptr, mc_size) || + microcode_sanity_check(mc, 1) < 0) { + break; + } + + csig = uci->cpu_sig.sig; + cpf = uci->cpu_sig.pf; + if (get_matching_microcode(csig, cpf, mc, new_rev)) { + vfree(new_mc); + new_rev = mc_header.rev; + new_mc = mc; + mc = NULL; /* trigger new vmalloc */ + } + + ucode_ptr += mc_size; + leftover -= mc_size; + } + + vfree(mc); + + if (leftover) { + vfree(new_mc); + state = UCODE_ERROR; + goto out; + } + + if (!new_mc) { + state = UCODE_NFOUND; + goto out; + } + + vfree(uci->mc); + uci->mc = (struct microcode_intel *)new_mc; + + /* + * If early loading microcode is supported, save this mc into + * permanent memory. So it will be loaded early when a CPU is hot added + * or resumes. 
+ */ + save_mc_for_early(new_mc); + + pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n", + cpu, new_rev, uci->cpu_sig.rev); +out: + return state; +} + +static int get_ucode_fw(void *to, const void *from, size_t n) +{ + memcpy(to, from, n); + return 0; +} + +static enum ucode_state request_microcode_fw(int cpu, struct device *device, + bool refresh_fw) +{ + char name[30]; + struct cpuinfo_x86 *c = &cpu_data(cpu); + const struct firmware *firmware; + enum ucode_state ret; + + sprintf(name, "intel-ucode/%02x-%02x-%02x", + c->x86, c->x86_model, c->x86_mask); + + if (request_firmware_direct(&firmware, name, device)) { + pr_debug("data file %s load failed\n", name); + return UCODE_NFOUND; + } + + ret = generic_load_microcode(cpu, (void *)firmware->data, + firmware->size, &get_ucode_fw); + + release_firmware(firmware); + + return ret; +} + +static int get_ucode_user(void *to, const void *from, size_t n) +{ + return copy_from_user(to, from, n); +} + +static enum ucode_state +request_microcode_user(int cpu, const void __user *buf, size_t size) +{ + return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user); +} + +static void microcode_fini_cpu(int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + vfree(uci->mc); + uci->mc = NULL; +} + +static struct microcode_ops microcode_intel_ops = { + .request_microcode_user = request_microcode_user, + .request_microcode_fw = request_microcode_fw, + .collect_cpu_info = collect_cpu_info, + .apply_microcode = apply_microcode, + .microcode_fini_cpu = microcode_fini_cpu, +}; + +struct microcode_ops * __init init_intel_microcode(void) +{ + struct cpuinfo_x86 *c = &cpu_data(0); + + if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || + cpu_has(c, X86_FEATURE_IA64)) { + pr_err("Intel CPU family 0x%x not supported\n", c->x86); + return NULL; + } + + return µcode_intel_ops; +} + diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c new file mode 100644 index 00000000000..18f739129e7 --- /dev/null +++ b/arch/x86/kernel/cpu/microcode/intel_early.c @@ -0,0 +1,787 @@ +/* + * Intel CPU microcode early update for Linux + * + * Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com> + * H Peter Anvin" <hpa@zytor.com> + * + * This allows to early upgrade microcode on Intel processors + * belonging to IA-32 family - PentiumPro, Pentium II, + * Pentium III, Xeon, Pentium 4, etc. + * + * Reference: Section 9.11 of Volume 3, IA-32 Intel Architecture + * Software Developer's Manual. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/earlycpio.h> +#include <linux/initrd.h> +#include <linux/cpu.h> +#include <asm/msr.h> +#include <asm/microcode_intel.h> +#include <asm/processor.h> +#include <asm/tlbflush.h> +#include <asm/setup.h> + +unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT]; +struct mc_saved_data { + unsigned int mc_saved_count; + struct microcode_intel **mc_saved; +} mc_saved_data; + +static enum ucode_state +generic_load_microcode_early(struct microcode_intel **mc_saved_p, + unsigned int mc_saved_count, + struct ucode_cpu_info *uci) +{ + struct microcode_intel *ucode_ptr, *new_mc = NULL; + int new_rev = uci->cpu_sig.rev; + enum ucode_state state = UCODE_OK; + unsigned int mc_size; + struct microcode_header_intel *mc_header; + unsigned int csig = uci->cpu_sig.sig; + unsigned int cpf = uci->cpu_sig.pf; + int i; + + for (i = 0; i < mc_saved_count; i++) { + ucode_ptr = mc_saved_p[i]; + + mc_header = (struct microcode_header_intel *)ucode_ptr; + mc_size = get_totalsize(mc_header); + if (get_matching_microcode(csig, cpf, ucode_ptr, new_rev)) { + new_rev = mc_header->rev; + new_mc = ucode_ptr; + } + } + + if (!new_mc) { + state = UCODE_NFOUND; + goto out; + } + + uci->mc = (struct microcode_intel *)new_mc; +out: + return state; +} + +static void +microcode_pointer(struct microcode_intel **mc_saved, + unsigned long *mc_saved_in_initrd, + unsigned long initrd_start, int mc_saved_count) +{ + int i; + + for (i = 0; i < mc_saved_count; i++) + mc_saved[i] = (struct microcode_intel *) + (mc_saved_in_initrd[i] + initrd_start); +} + +#ifdef CONFIG_X86_32 +static void +microcode_phys(struct microcode_intel **mc_saved_tmp, + struct mc_saved_data *mc_saved_data) +{ + int i; + struct microcode_intel ***mc_saved; + + mc_saved = (struct microcode_intel ***) + __pa_nodebug(&mc_saved_data->mc_saved); + for (i = 0; i < mc_saved_data->mc_saved_count; i++) { + struct microcode_intel *p; + + p = *(struct microcode_intel **) + __pa_nodebug(mc_saved_data->mc_saved + i); + mc_saved_tmp[i] = (struct microcode_intel *)__pa_nodebug(p); + } +} +#endif + +static enum ucode_state +load_microcode(struct mc_saved_data *mc_saved_data, + unsigned long *mc_saved_in_initrd, + unsigned long initrd_start, + struct ucode_cpu_info *uci) +{ + struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; + unsigned int count = mc_saved_data->mc_saved_count; + + if (!mc_saved_data->mc_saved) { + microcode_pointer(mc_saved_tmp, mc_saved_in_initrd, + initrd_start, count); + + return generic_load_microcode_early(mc_saved_tmp, count, uci); + } else { +#ifdef CONFIG_X86_32 + microcode_phys(mc_saved_tmp, mc_saved_data); + return generic_load_microcode_early(mc_saved_tmp, count, uci); +#else + return generic_load_microcode_early(mc_saved_data->mc_saved, + count, uci); +#endif + } +} + +static u8 get_x86_family(unsigned long sig) +{ + u8 x86; + + x86 = (sig >> 8) & 0xf; + + if (x86 == 0xf) + x86 += (sig >> 20) & 0xff; + + return x86; +} + +static u8 get_x86_model(unsigned long sig) +{ + u8 x86, x86_model; + + x86 = get_x86_family(sig); + x86_model = (sig >> 4) & 0xf; + + if (x86 == 0x6 || x86 == 0xf) + x86_model += ((sig >> 16) & 0xf) << 4; + + return x86_model; +} + +/* + * Given CPU signature and a microcode patch, this function finds if the + * microcode patch has matching family and model with the CPU. 
+ */ +static enum ucode_state +matching_model_microcode(struct microcode_header_intel *mc_header, + unsigned long sig) +{ + u8 x86, x86_model; + u8 x86_ucode, x86_model_ucode; + struct extended_sigtable *ext_header; + unsigned long total_size = get_totalsize(mc_header); + unsigned long data_size = get_datasize(mc_header); + int ext_sigcount, i; + struct extended_signature *ext_sig; + + x86 = get_x86_family(sig); + x86_model = get_x86_model(sig); + + x86_ucode = get_x86_family(mc_header->sig); + x86_model_ucode = get_x86_model(mc_header->sig); + + if (x86 == x86_ucode && x86_model == x86_model_ucode) + return UCODE_OK; + + /* Look for ext. headers: */ + if (total_size <= data_size + MC_HEADER_SIZE) + return UCODE_NFOUND; + + ext_header = (struct extended_sigtable *) + mc_header + data_size + MC_HEADER_SIZE; + ext_sigcount = ext_header->count; + ext_sig = (void *)ext_header + EXT_HEADER_SIZE; + + for (i = 0; i < ext_sigcount; i++) { + x86_ucode = get_x86_family(ext_sig->sig); + x86_model_ucode = get_x86_model(ext_sig->sig); + + if (x86 == x86_ucode && x86_model == x86_model_ucode) + return UCODE_OK; + + ext_sig++; + } + + return UCODE_NFOUND; +} + +static int +save_microcode(struct mc_saved_data *mc_saved_data, + struct microcode_intel **mc_saved_src, + unsigned int mc_saved_count) +{ + int i, j; + struct microcode_intel **mc_saved_p; + int ret; + + if (!mc_saved_count) + return -EINVAL; + + /* + * Copy new microcode data. + */ + mc_saved_p = kmalloc(mc_saved_count*sizeof(struct microcode_intel *), + GFP_KERNEL); + if (!mc_saved_p) + return -ENOMEM; + + for (i = 0; i < mc_saved_count; i++) { + struct microcode_intel *mc = mc_saved_src[i]; + struct microcode_header_intel *mc_header = &mc->hdr; + unsigned long mc_size = get_totalsize(mc_header); + mc_saved_p[i] = kmalloc(mc_size, GFP_KERNEL); + if (!mc_saved_p[i]) { + ret = -ENOMEM; + goto err; + } + if (!mc_saved_src[i]) { + ret = -EINVAL; + goto err; + } + memcpy(mc_saved_p[i], mc, mc_size); + } + + /* + * Point to newly saved microcode. + */ + mc_saved_data->mc_saved = mc_saved_p; + mc_saved_data->mc_saved_count = mc_saved_count; + + return 0; + +err: + for (j = 0; j <= i; j++) + kfree(mc_saved_p[j]); + kfree(mc_saved_p); + + return ret; +} + +/* + * A microcode patch in ucode_ptr is saved into mc_saved + * - if it has matching signature and newer revision compared to an existing + * patch mc_saved. + * - or if it is a newly discovered microcode patch. + * + * The microcode patch should have matching model with CPU. + */ +static void _save_mc(struct microcode_intel **mc_saved, u8 *ucode_ptr, + unsigned int *mc_saved_count_p) +{ + int i; + int found = 0; + unsigned int mc_saved_count = *mc_saved_count_p; + struct microcode_header_intel *mc_header; + + mc_header = (struct microcode_header_intel *)ucode_ptr; + for (i = 0; i < mc_saved_count; i++) { + unsigned int sig, pf; + unsigned int new_rev; + struct microcode_header_intel *mc_saved_header = + (struct microcode_header_intel *)mc_saved[i]; + sig = mc_saved_header->sig; + pf = mc_saved_header->pf; + new_rev = mc_header->rev; + + if (get_matching_sig(sig, pf, ucode_ptr, new_rev)) { + found = 1; + if (update_match_revision(mc_header, new_rev)) { + /* + * Found an older ucode saved before. + * Replace the older one with this newer + * one. + */ + mc_saved[i] = + (struct microcode_intel *)ucode_ptr; + break; + } + } + } + if (i >= mc_saved_count && !found) + /* + * This ucode is first time discovered in ucode file. + * Save it to memory. 
+ */ + mc_saved[mc_saved_count++] = + (struct microcode_intel *)ucode_ptr; + + *mc_saved_count_p = mc_saved_count; +} + +/* + * Get microcode matching with BSP's model. Only CPUs with the same model as + * BSP can stay in the platform. + */ +static enum ucode_state __init +get_matching_model_microcode(int cpu, unsigned long start, + void *data, size_t size, + struct mc_saved_data *mc_saved_data, + unsigned long *mc_saved_in_initrd, + struct ucode_cpu_info *uci) +{ + u8 *ucode_ptr = data; + unsigned int leftover = size; + enum ucode_state state = UCODE_OK; + unsigned int mc_size; + struct microcode_header_intel *mc_header; + struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; + unsigned int mc_saved_count = mc_saved_data->mc_saved_count; + int i; + + while (leftover) { + mc_header = (struct microcode_header_intel *)ucode_ptr; + + mc_size = get_totalsize(mc_header); + if (!mc_size || mc_size > leftover || + microcode_sanity_check(ucode_ptr, 0) < 0) + break; + + leftover -= mc_size; + + /* + * Since APs with same family and model as the BSP may boot in + * the platform, we need to find and save microcode patches + * with the same family and model as the BSP. + */ + if (matching_model_microcode(mc_header, uci->cpu_sig.sig) != + UCODE_OK) { + ucode_ptr += mc_size; + continue; + } + + _save_mc(mc_saved_tmp, ucode_ptr, &mc_saved_count); + + ucode_ptr += mc_size; + } + + if (leftover) { + state = UCODE_ERROR; + goto out; + } + + if (mc_saved_count == 0) { + state = UCODE_NFOUND; + goto out; + } + + for (i = 0; i < mc_saved_count; i++) + mc_saved_in_initrd[i] = (unsigned long)mc_saved_tmp[i] - start; + + mc_saved_data->mc_saved_count = mc_saved_count; +out: + return state; +} + +static int collect_cpu_info_early(struct ucode_cpu_info *uci) +{ + unsigned int val[2]; + u8 x86, x86_model; + struct cpu_signature csig; + unsigned int eax, ebx, ecx, edx; + + csig.sig = 0; + csig.pf = 0; + csig.rev = 0; + + memset(uci, 0, sizeof(*uci)); + + eax = 0x00000001; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + csig.sig = eax; + + x86 = get_x86_family(csig.sig); + x86_model = get_x86_model(csig.sig); + + if ((x86_model >= 5) || (x86 > 6)) { + /* get processor flags from MSR 0x17 */ + native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); + csig.pf = 1 << ((val[1] >> 18) & 7); + } + native_wrmsr(MSR_IA32_UCODE_REV, 0, 0); + + /* As documented in the SDM: Do a CPUID 1 here */ + sync_core(); + + /* get the current revision from MSR 0x8B */ + native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); + + csig.rev = val[1]; + + uci->cpu_sig = csig; + uci->valid = 1; + + return 0; +} + +#ifdef DEBUG +static void __ref show_saved_mc(void) +{ + int i, j; + unsigned int sig, pf, rev, total_size, data_size, date; + struct ucode_cpu_info uci; + + if (mc_saved_data.mc_saved_count == 0) { + pr_debug("no micorcode data saved.\n"); + return; + } + pr_debug("Total microcode saved: %d\n", mc_saved_data.mc_saved_count); + + collect_cpu_info_early(&uci); + + sig = uci.cpu_sig.sig; + pf = uci.cpu_sig.pf; + rev = uci.cpu_sig.rev; + pr_debug("CPU%d: sig=0x%x, pf=0x%x, rev=0x%x\n", + smp_processor_id(), sig, pf, rev); + + for (i = 0; i < mc_saved_data.mc_saved_count; i++) { + struct microcode_header_intel *mc_saved_header; + struct extended_sigtable *ext_header; + int ext_sigcount; + struct extended_signature *ext_sig; + + mc_saved_header = (struct microcode_header_intel *) + mc_saved_data.mc_saved[i]; + sig = mc_saved_header->sig; + pf = mc_saved_header->pf; + rev = mc_saved_header->rev; + total_size = get_totalsize(mc_saved_header); + 
data_size = get_datasize(mc_saved_header); + date = mc_saved_header->date; + + pr_debug("mc_saved[%d]: sig=0x%x, pf=0x%x, rev=0x%x, toal size=0x%x, date = %04x-%02x-%02x\n", + i, sig, pf, rev, total_size, + date & 0xffff, + date >> 24, + (date >> 16) & 0xff); + + /* Look for ext. headers: */ + if (total_size <= data_size + MC_HEADER_SIZE) + continue; + + ext_header = (struct extended_sigtable *) + mc_saved_header + data_size + MC_HEADER_SIZE; + ext_sigcount = ext_header->count; + ext_sig = (void *)ext_header + EXT_HEADER_SIZE; + + for (j = 0; j < ext_sigcount; j++) { + sig = ext_sig->sig; + pf = ext_sig->pf; + + pr_debug("\tExtended[%d]: sig=0x%x, pf=0x%x\n", + j, sig, pf); + + ext_sig++; + } + + } +} +#else +static inline void show_saved_mc(void) +{ +} +#endif + +#if defined(CONFIG_MICROCODE_INTEL_EARLY) && defined(CONFIG_HOTPLUG_CPU) +static DEFINE_MUTEX(x86_cpu_microcode_mutex); +/* + * Save this mc into mc_saved_data. So it will be loaded early when a CPU is + * hot added or resumes. + * + * Please make sure this mc should be a valid microcode patch before calling + * this function. + */ +int save_mc_for_early(u8 *mc) +{ + struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; + unsigned int mc_saved_count_init; + unsigned int mc_saved_count; + struct microcode_intel **mc_saved; + int ret = 0; + int i; + + /* + * Hold hotplug lock so mc_saved_data is not accessed by a CPU in + * hotplug. + */ + mutex_lock(&x86_cpu_microcode_mutex); + + mc_saved_count_init = mc_saved_data.mc_saved_count; + mc_saved_count = mc_saved_data.mc_saved_count; + mc_saved = mc_saved_data.mc_saved; + + if (mc_saved && mc_saved_count) + memcpy(mc_saved_tmp, mc_saved, + mc_saved_count * sizeof(struct mirocode_intel *)); + /* + * Save the microcode patch mc in mc_save_tmp structure if it's a newer + * version. + */ + + _save_mc(mc_saved_tmp, mc, &mc_saved_count); + + /* + * Save the mc_save_tmp in global mc_saved_data. + */ + ret = save_microcode(&mc_saved_data, mc_saved_tmp, mc_saved_count); + if (ret) { + pr_err("Cannot save microcode patch.\n"); + goto out; + } + + show_saved_mc(); + + /* + * Free old saved microcod data. + */ + if (mc_saved) { + for (i = 0; i < mc_saved_count_init; i++) + kfree(mc_saved[i]); + kfree(mc_saved); + } + +out: + mutex_unlock(&x86_cpu_microcode_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(save_mc_for_early); +#endif + +static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin"; +static __init enum ucode_state +scan_microcode(unsigned long start, unsigned long end, + struct mc_saved_data *mc_saved_data, + unsigned long *mc_saved_in_initrd, + struct ucode_cpu_info *uci) +{ + unsigned int size = end - start + 1; + struct cpio_data cd; + long offset = 0; +#ifdef CONFIG_X86_32 + char *p = (char *)__pa_nodebug(ucode_name); +#else + char *p = ucode_name; +#endif + + cd.data = NULL; + cd.size = 0; + + cd = find_cpio_data(p, (void *)start, size, &offset); + if (!cd.data) + return UCODE_ERROR; + + + return get_matching_model_microcode(0, start, cd.data, cd.size, + mc_saved_data, mc_saved_in_initrd, + uci); +} + +/* + * Print ucode update info. + */ +static void +print_ucode_info(struct ucode_cpu_info *uci, unsigned int date) +{ + int cpu = smp_processor_id(); + + pr_info("CPU%d microcode updated early to revision 0x%x, date = %04x-%02x-%02x\n", + cpu, + uci->cpu_sig.rev, + date & 0xffff, + date >> 24, + (date >> 16) & 0xff); +} + +#ifdef CONFIG_X86_32 + +static int delay_ucode_info; +static int current_mc_date; + +/* + * Print early updated ucode info after printk works. 
This is delayed info dump. + */ +void show_ucode_info_early(void) +{ + struct ucode_cpu_info uci; + + if (delay_ucode_info) { + collect_cpu_info_early(&uci); + print_ucode_info(&uci, current_mc_date); + delay_ucode_info = 0; + } +} + +/* + * At this point, we can not call printk() yet. Keep microcode patch number in + * mc_saved_data.mc_saved and delay printing microcode info in + * show_ucode_info_early() until printk() works. + */ +static void print_ucode(struct ucode_cpu_info *uci) +{ + struct microcode_intel *mc_intel; + int *delay_ucode_info_p; + int *current_mc_date_p; + + mc_intel = uci->mc; + if (mc_intel == NULL) + return; + + delay_ucode_info_p = (int *)__pa_nodebug(&delay_ucode_info); + current_mc_date_p = (int *)__pa_nodebug(¤t_mc_date); + + *delay_ucode_info_p = 1; + *current_mc_date_p = mc_intel->hdr.date; +} +#else + +/* + * Flush global tlb. We only do this in x86_64 where paging has been enabled + * already and PGE should be enabled as well. + */ +static inline void flush_tlb_early(void) +{ + __native_flush_tlb_global_irq_disabled(); +} + +static inline void print_ucode(struct ucode_cpu_info *uci) +{ + struct microcode_intel *mc_intel; + + mc_intel = uci->mc; + if (mc_intel == NULL) + return; + + print_ucode_info(uci, mc_intel->hdr.date); +} +#endif + +static int apply_microcode_early(struct mc_saved_data *mc_saved_data, + struct ucode_cpu_info *uci) +{ + struct microcode_intel *mc_intel; + unsigned int val[2]; + + mc_intel = uci->mc; + if (mc_intel == NULL) + return 0; + + /* write microcode via MSR 0x79 */ + native_wrmsr(MSR_IA32_UCODE_WRITE, + (unsigned long) mc_intel->bits, + (unsigned long) mc_intel->bits >> 16 >> 16); + native_wrmsr(MSR_IA32_UCODE_REV, 0, 0); + + /* As documented in the SDM: Do a CPUID 1 here */ + sync_core(); + + /* get the current revision from MSR 0x8B */ + native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); + if (val[1] != mc_intel->hdr.rev) + return -1; + +#ifdef CONFIG_X86_64 + /* Flush global tlb. This is precaution. */ + flush_tlb_early(); +#endif + uci->cpu_sig.rev = val[1]; + + print_ucode(uci); + + return 0; +} + +/* + * This function converts microcode patch offsets previously stored in + * mc_saved_in_initrd to pointers and stores the pointers in mc_saved_data. 
+ */ +int __init save_microcode_in_initrd_intel(void) +{ + unsigned int count = mc_saved_data.mc_saved_count; + struct microcode_intel *mc_saved[MAX_UCODE_COUNT]; + int ret = 0; + + if (count == 0) + return ret; + + microcode_pointer(mc_saved, mc_saved_in_initrd, initrd_start, count); + ret = save_microcode(&mc_saved_data, mc_saved, count); + if (ret) + pr_err("Cannot save microcode patches from initrd.\n"); + + show_saved_mc(); + + return ret; +} + +static void __init +_load_ucode_intel_bsp(struct mc_saved_data *mc_saved_data, + unsigned long *mc_saved_in_initrd, + unsigned long initrd_start_early, + unsigned long initrd_end_early, + struct ucode_cpu_info *uci) +{ + collect_cpu_info_early(uci); + scan_microcode(initrd_start_early, initrd_end_early, mc_saved_data, + mc_saved_in_initrd, uci); + load_microcode(mc_saved_data, mc_saved_in_initrd, + initrd_start_early, uci); + apply_microcode_early(mc_saved_data, uci); +} + +void __init +load_ucode_intel_bsp(void) +{ + u64 ramdisk_image, ramdisk_size; + unsigned long initrd_start_early, initrd_end_early; + struct ucode_cpu_info uci; +#ifdef CONFIG_X86_32 + struct boot_params *boot_params_p; + + boot_params_p = (struct boot_params *)__pa_nodebug(&boot_params); + ramdisk_image = boot_params_p->hdr.ramdisk_image; + ramdisk_size = boot_params_p->hdr.ramdisk_size; + initrd_start_early = ramdisk_image; + initrd_end_early = initrd_start_early + ramdisk_size; + + _load_ucode_intel_bsp( + (struct mc_saved_data *)__pa_nodebug(&mc_saved_data), + (unsigned long *)__pa_nodebug(&mc_saved_in_initrd), + initrd_start_early, initrd_end_early, &uci); +#else + ramdisk_image = boot_params.hdr.ramdisk_image; + ramdisk_size = boot_params.hdr.ramdisk_size; + initrd_start_early = ramdisk_image + PAGE_OFFSET; + initrd_end_early = initrd_start_early + ramdisk_size; + + _load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd, + initrd_start_early, initrd_end_early, &uci); +#endif +} + +void load_ucode_intel_ap(void) +{ + struct mc_saved_data *mc_saved_data_p; + struct ucode_cpu_info uci; + unsigned long *mc_saved_in_initrd_p; + unsigned long initrd_start_addr; +#ifdef CONFIG_X86_32 + unsigned long *initrd_start_p; + + mc_saved_in_initrd_p = + (unsigned long *)__pa_nodebug(mc_saved_in_initrd); + mc_saved_data_p = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data); + initrd_start_p = (unsigned long *)__pa_nodebug(&initrd_start); + initrd_start_addr = (unsigned long)__pa_nodebug(*initrd_start_p); +#else + mc_saved_data_p = &mc_saved_data; + mc_saved_in_initrd_p = mc_saved_in_initrd; + initrd_start_addr = initrd_start; +#endif + + /* + * If there is no valid ucode previously saved in memory, no need to + * update ucode on this AP. + */ + if (mc_saved_data_p->mc_saved_count == 0) + return; + + collect_cpu_info_early(&uci); + load_microcode(mc_saved_data_p, mc_saved_in_initrd_p, + initrd_start_addr, &uci); + apply_microcode_early(mc_saved_data_p, &uci); +} diff --git a/arch/x86/kernel/cpu/microcode/intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c new file mode 100644 index 00000000000..ce69320d017 --- /dev/null +++ b/arch/x86/kernel/cpu/microcode/intel_lib.c @@ -0,0 +1,174 @@ +/* + * Intel CPU Microcode Update Driver for Linux + * + * Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com> + * H Peter Anvin" <hpa@zytor.com> + * + * This driver allows to upgrade microcode on Intel processors + * belonging to IA-32 family - PentiumPro, Pentium II, + * Pentium III, Xeon, Pentium 4, etc. + * + * Reference: Section 8.11 of Volume 3a, IA-32 Intel? 
Architecture + * Software Developer's Manual + * Order Number 253668 or free download from: + * + * http://developer.intel.com/Assets/PDF/manual/253668.pdf + * + * For more information, go to http://www.urbanmyth.org/microcode + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ +#include <linux/firmware.h> +#include <linux/uaccess.h> +#include <linux/kernel.h> +#include <linux/module.h> + +#include <asm/microcode_intel.h> +#include <asm/processor.h> +#include <asm/msr.h> + +static inline int +update_match_cpu(unsigned int csig, unsigned int cpf, + unsigned int sig, unsigned int pf) +{ + return (!sigmatch(sig, csig, pf, cpf)) ? 0 : 1; +} + +int +update_match_revision(struct microcode_header_intel *mc_header, int rev) +{ + return (mc_header->rev <= rev) ? 0 : 1; +} + +int microcode_sanity_check(void *mc, int print_err) +{ + unsigned long total_size, data_size, ext_table_size; + struct microcode_header_intel *mc_header = mc; + struct extended_sigtable *ext_header = NULL; + int sum, orig_sum, ext_sigcount = 0, i; + struct extended_signature *ext_sig; + + total_size = get_totalsize(mc_header); + data_size = get_datasize(mc_header); + + if (data_size + MC_HEADER_SIZE > total_size) { + if (print_err) + pr_err("error! Bad data size in microcode data file\n"); + return -EINVAL; + } + + if (mc_header->ldrver != 1 || mc_header->hdrver != 1) { + if (print_err) + pr_err("error! Unknown microcode update format\n"); + return -EINVAL; + } + ext_table_size = total_size - (MC_HEADER_SIZE + data_size); + if (ext_table_size) { + if ((ext_table_size < EXT_HEADER_SIZE) + || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) { + if (print_err) + pr_err("error! Small exttable size in microcode data file\n"); + return -EINVAL; + } + ext_header = mc + MC_HEADER_SIZE + data_size; + if (ext_table_size != exttable_size(ext_header)) { + if (print_err) + pr_err("error! 
Bad exttable size in microcode data file\n"); + return -EFAULT; + } + ext_sigcount = ext_header->count; + } + + /* check extended table checksum */ + if (ext_table_size) { + int ext_table_sum = 0; + int *ext_tablep = (int *)ext_header; + + i = ext_table_size / DWSIZE; + while (i--) + ext_table_sum += ext_tablep[i]; + if (ext_table_sum) { + if (print_err) + pr_warn("aborting, bad extended signature table checksum\n"); + return -EINVAL; + } + } + + /* calculate the checksum */ + orig_sum = 0; + i = (MC_HEADER_SIZE + data_size) / DWSIZE; + while (i--) + orig_sum += ((int *)mc)[i]; + if (orig_sum) { + if (print_err) + pr_err("aborting, bad checksum\n"); + return -EINVAL; + } + if (!ext_table_size) + return 0; + /* check extended signature checksum */ + for (i = 0; i < ext_sigcount; i++) { + ext_sig = (void *)ext_header + EXT_HEADER_SIZE + + EXT_SIGNATURE_SIZE * i; + sum = orig_sum + - (mc_header->sig + mc_header->pf + mc_header->cksum) + + (ext_sig->sig + ext_sig->pf + ext_sig->cksum); + if (sum) { + if (print_err) + pr_err("aborting, bad checksum\n"); + return -EINVAL; + } + } + return 0; +} +EXPORT_SYMBOL_GPL(microcode_sanity_check); + +/* + * return 0 - no update found + * return 1 - found update + */ +int get_matching_sig(unsigned int csig, int cpf, void *mc, int rev) +{ + struct microcode_header_intel *mc_header = mc; + struct extended_sigtable *ext_header; + unsigned long total_size = get_totalsize(mc_header); + int ext_sigcount, i; + struct extended_signature *ext_sig; + + if (update_match_cpu(csig, cpf, mc_header->sig, mc_header->pf)) + return 1; + + /* Look for ext. headers: */ + if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE) + return 0; + + ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE; + ext_sigcount = ext_header->count; + ext_sig = (void *)ext_header + EXT_HEADER_SIZE; + + for (i = 0; i < ext_sigcount; i++) { + if (update_match_cpu(csig, cpf, ext_sig->sig, ext_sig->pf)) + return 1; + ext_sig++; + } + return 0; +} + +/* + * return 0 - no update found + * return 1 - found update + */ +int get_matching_microcode(unsigned int csig, int cpf, void *mc, int rev) +{ + struct microcode_header_intel *mc_header = mc; + + if (!update_match_revision(mc_header, rev)) + return 0; + + return get_matching_sig(csig, cpf, mc, rev); +} +EXPORT_SYMBOL_GPL(get_matching_microcode); diff --git a/arch/x86/kernel/cpu/mkcapflags.sh b/arch/x86/kernel/cpu/mkcapflags.sh new file mode 100644 index 00000000000..2bf61650549 --- /dev/null +++ b/arch/x86/kernel/cpu/mkcapflags.sh @@ -0,0 +1,41 @@ +#!/bin/sh +# +# Generate the x86_cap_flags[] array from include/asm/cpufeature.h +# + +IN=$1 +OUT=$2 + +TABS="$(printf '\t\t\t\t\t')" +trap 'rm "$OUT"' EXIT + +( + echo "#ifndef _ASM_X86_CPUFEATURE_H" + echo "#include <asm/cpufeature.h>" + echo "#endif" + echo "" + echo "const char * const x86_cap_flags[NCAPINTS*32] = {" + + # Iterate through any input lines starting with #define X86_FEATURE_ + sed -n -e 's/\t/ /g' -e 's/^ *# *define *X86_FEATURE_//p' $IN | + while read i + do + # Name is everything up to the first whitespace + NAME="$(echo "$i" | sed 's/ .*//')" + + # If the /* comment */ starts with a quote string, grab that. 
+ VALUE="$(echo "$i" | sed -n 's@.*/\* *\("[^"]*"\).*\*/@\1@p')" + [ -z "$VALUE" ] && VALUE="\"$NAME\"" + [ "$VALUE" == '""' ] && continue + + # Name is uppercase, VALUE is all lowercase + VALUE="$(echo "$VALUE" | tr A-Z a-z)" + + TABCOUNT=$(( ( 5*8 - 14 - $(echo "$NAME" | wc -c) ) / 8 )) + printf "\t[%s]%.*s = %s,\n" \ + "X86_FEATURE_$NAME" "$TABCOUNT" "$TABS" "$VALUE" + done + echo "};" +) > $OUT + +trap - EXIT diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c new file mode 100644 index 00000000000..a450373e8e9 --- /dev/null +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -0,0 +1,152 @@ +/* + * HyperV Detection code. + * + * Copyright (C) 2010, Novell, Inc. + * Author : K. Y. Srinivasan <ksrinivasan@novell.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + */ + +#include <linux/types.h> +#include <linux/time.h> +#include <linux/clocksource.h> +#include <linux/module.h> +#include <linux/hardirq.h> +#include <linux/efi.h> +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <asm/processor.h> +#include <asm/hypervisor.h> +#include <asm/hyperv.h> +#include <asm/mshyperv.h> +#include <asm/desc.h> +#include <asm/idle.h> +#include <asm/irq_regs.h> +#include <asm/i8259.h> +#include <asm/apic.h> +#include <asm/timer.h> + +struct ms_hyperv_info ms_hyperv; +EXPORT_SYMBOL_GPL(ms_hyperv); + +#if IS_ENABLED(CONFIG_HYPERV) +static void (*vmbus_handler)(void); + +void hyperv_vector_handler(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + irq_enter(); + exit_idle(); + + inc_irq_stat(irq_hv_callback_count); + if (vmbus_handler) + vmbus_handler(); + + irq_exit(); + set_irq_regs(old_regs); +} + +void hv_setup_vmbus_irq(void (*handler)(void)) +{ + vmbus_handler = handler; + /* + * Setup the IDT for hypervisor callback. Prevent reallocation + * at module reload. + */ + if (!test_bit(HYPERVISOR_CALLBACK_VECTOR, used_vectors)) + alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, + hyperv_callback_vector); +} + +void hv_remove_vmbus_irq(void) +{ + /* We have no way to deallocate the interrupt gate */ + vmbus_handler = NULL; +} +EXPORT_SYMBOL_GPL(hv_setup_vmbus_irq); +EXPORT_SYMBOL_GPL(hv_remove_vmbus_irq); +#endif + +static uint32_t __init ms_hyperv_platform(void) +{ + u32 eax; + u32 hyp_signature[3]; + + if (!boot_cpu_has(X86_FEATURE_HYPERVISOR)) + return 0; + + cpuid(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS, + &eax, &hyp_signature[0], &hyp_signature[1], &hyp_signature[2]); + + if (eax >= HYPERV_CPUID_MIN && + eax <= HYPERV_CPUID_MAX && + !memcmp("Microsoft Hv", hyp_signature, 12)) + return HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS; + + return 0; +} + +static cycle_t read_hv_clock(struct clocksource *arg) +{ + cycle_t current_tick; + /* + * Read the partition counter to get the current tick count. This count + * is set to 0 when the partition is created and is incremented in + * 100 nanosecond units. 
+ */ + rdmsrl(HV_X64_MSR_TIME_REF_COUNT, current_tick); + return current_tick; +} + +static struct clocksource hyperv_cs = { + .name = "hyperv_clocksource", + .rating = 400, /* use this when running on Hyperv*/ + .read = read_hv_clock, + .mask = CLOCKSOURCE_MASK(64), +}; + +static void __init ms_hyperv_init_platform(void) +{ + /* + * Extract the features and hints + */ + ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES); + ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO); + + printk(KERN_INFO "HyperV: features 0x%x, hints 0x%x\n", + ms_hyperv.features, ms_hyperv.hints); + +#ifdef CONFIG_X86_LOCAL_APIC + if (ms_hyperv.features & HV_X64_MSR_APIC_FREQUENCY_AVAILABLE) { + /* + * Get the APIC frequency. + */ + u64 hv_lapic_frequency; + + rdmsrl(HV_X64_MSR_APIC_FREQUENCY, hv_lapic_frequency); + hv_lapic_frequency = div_u64(hv_lapic_frequency, HZ); + lapic_timer_frequency = hv_lapic_frequency; + printk(KERN_INFO "HyperV: LAPIC Timer Frequency: %#x\n", + lapic_timer_frequency); + } +#endif + + if (ms_hyperv.features & HV_X64_MSR_TIME_REF_COUNT_AVAILABLE) + clocksource_register_hz(&hyperv_cs, NSEC_PER_SEC/100); + +#ifdef CONFIG_X86_IO_APIC + no_timer_check = 1; +#endif + +} + +const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = { + .name = "Microsoft HyperV", + .detect = ms_hyperv_platform, + .init_platform = ms_hyperv_init_platform, +}; +EXPORT_SYMBOL(x86_hyper_ms_hyperv); diff --git a/arch/x86/kernel/cpu/mtrr/Makefile b/arch/x86/kernel/cpu/mtrr/Makefile new file mode 100644 index 00000000000..ad9e5ed8118 --- /dev/null +++ b/arch/x86/kernel/cpu/mtrr/Makefile @@ -0,0 +1,3 @@ +obj-y := main.o if.o generic.o cleanup.o +obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o + diff --git a/arch/x86/kernel/cpu/mtrr/amd.c b/arch/x86/kernel/cpu/mtrr/amd.c new file mode 100644 index 00000000000..92ba9cd31c9 --- /dev/null +++ b/arch/x86/kernel/cpu/mtrr/amd.c @@ -0,0 +1,124 @@ +#include <linux/init.h> +#include <linux/mm.h> +#include <asm/mtrr.h> +#include <asm/msr.h> + +#include "mtrr.h" + +static void +amd_get_mtrr(unsigned int reg, unsigned long *base, + unsigned long *size, mtrr_type *type) +{ + unsigned long low, high; + + rdmsr(MSR_K6_UWCCR, low, high); + /* Upper dword is region 1, lower is region 0 */ + if (reg == 1) + low = high; + /* The base masks off on the right alignment */ + *base = (low & 0xFFFE0000) >> PAGE_SHIFT; + *type = 0; + if (low & 1) + *type = MTRR_TYPE_UNCACHABLE; + if (low & 2) + *type = MTRR_TYPE_WRCOMB; + if (!(low & 3)) { + *size = 0; + return; + } + /* + * This needs a little explaining. The size is stored as an + * inverted mask of bits of 128K granularity 15 bits long offset + * 2 bits. + * + * So to get a size we do invert the mask and add 1 to the lowest + * mask bit (4 as its 2 bits in). This gives us a size we then shift + * to turn into 128K blocks. + * + * eg 111 1111 1111 1100 is 512K + * + * invert 000 0000 0000 0011 + * +1 000 0000 0000 0100 + * *128K ... + */ + low = (~low) & 0x1FFFC; + *size = (low + 4) << (15 - PAGE_SHIFT); +} + +/** + * amd_set_mtrr - Set variable MTRR register on the local CPU. + * + * @reg The register to set. + * @base The base address of the region. + * @size The size of the region. If this is 0 the region is disabled. + * @type The type of the region. + * + * Returns nothing. 
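+ *
+ * A worked encoding with hypothetical values, not taken from this patch:
+ * a 512 KiB write-combining region at physical 0x0c000000 lands in one
+ * UWCCR half as
+ *
+ *	(0x0c000000 & 0xFFFE0000) | 0x1FFF0 | 0x2 = 0x0C01FFF2
+ *
+ * where 0x1FFF0 = ~((512K/128K - 1) << 2) & 0x1FFFC is the inverted
+ * 128 KiB-block mask and the low two bits hold type + 1, here
+ * MTRR_TYPE_WRCOMB + 1 = 0x2.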
+ */ +static void +amd_set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type) +{ + u32 regs[2]; + + /* + * Low is MTRR0, High MTRR 1 + */ + rdmsr(MSR_K6_UWCCR, regs[0], regs[1]); + /* + * Blank to disable + */ + if (size == 0) { + regs[reg] = 0; + } else { + /* + * Set the register to the base, the type (off by one) and an + * inverted bitmask of the size The size is the only odd + * bit. We are fed say 512K We invert this and we get 111 1111 + * 1111 1011 but if you subtract one and invert you get the + * desired 111 1111 1111 1100 mask + * + * But ~(x - 1) == ~x + 1 == -x. Two's complement rocks! + */ + regs[reg] = (-size >> (15 - PAGE_SHIFT) & 0x0001FFFC) + | (base << PAGE_SHIFT) | (type + 1); + } + + /* + * The writeback rule is quite specific. See the manual. Its + * disable local interrupts, write back the cache, set the mtrr + */ + wbinvd(); + wrmsr(MSR_K6_UWCCR, regs[0], regs[1]); +} + +static int +amd_validate_add_page(unsigned long base, unsigned long size, unsigned int type) +{ + /* + * Apply the K6 block alignment and size rules + * In order + * o Uncached or gathering only + * o 128K or bigger block + * o Power of 2 block + * o base suitably aligned to the power + */ + if (type > MTRR_TYPE_WRCOMB || size < (1 << (17 - PAGE_SHIFT)) + || (size & ~(size - 1)) - size || (base & (size - 1))) + return -EINVAL; + return 0; +} + +static const struct mtrr_ops amd_mtrr_ops = { + .vendor = X86_VENDOR_AMD, + .set = amd_set_mtrr, + .get = amd_get_mtrr, + .get_free_region = generic_get_free_region, + .validate_add_page = amd_validate_add_page, + .have_wrcomb = positive_have_wrcomb, +}; + +int __init amd_init_mtrr(void) +{ + set_mtrr_ops(&amd_mtrr_ops); + return 0; +} diff --git a/arch/x86/kernel/cpu/mtrr/centaur.c b/arch/x86/kernel/cpu/mtrr/centaur.c new file mode 100644 index 00000000000..316fe3e60a9 --- /dev/null +++ b/arch/x86/kernel/cpu/mtrr/centaur.c @@ -0,0 +1,126 @@ +#include <linux/init.h> +#include <linux/mm.h> + +#include <asm/mtrr.h> +#include <asm/msr.h> + +#include "mtrr.h" + +static struct { + unsigned long high; + unsigned long low; +} centaur_mcr[8]; + +static u8 centaur_mcr_reserved; +static u8 centaur_mcr_type; /* 0 for winchip, 1 for winchip2 */ + +/** + * centaur_get_free_region - Get a free MTRR. + * + * @base: The starting (base) address of the region. + * @size: The size (in bytes) of the region. + * + * Returns: the index of the region on success, else -1 on error. 
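+ *
+ * (In this implementation the error value is actually -ENOSPC rather
+ * than a literal -1; a register counts as free when it is not flagged
+ * in centaur_mcr_reserved and its size reads back as 0.)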
+ */ +static int +centaur_get_free_region(unsigned long base, unsigned long size, int replace_reg) +{ + unsigned long lbase, lsize; + mtrr_type ltype; + int i, max; + + max = num_var_ranges; + if (replace_reg >= 0 && replace_reg < max) + return replace_reg; + + for (i = 0; i < max; ++i) { + if (centaur_mcr_reserved & (1 << i)) + continue; + mtrr_if->get(i, &lbase, &lsize, <ype); + if (lsize == 0) + return i; + } + + return -ENOSPC; +} + +/* + * Report boot time MCR setups + */ +void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi) +{ + centaur_mcr[mcr].low = lo; + centaur_mcr[mcr].high = hi; +} + +static void +centaur_get_mcr(unsigned int reg, unsigned long *base, + unsigned long *size, mtrr_type * type) +{ + *base = centaur_mcr[reg].high >> PAGE_SHIFT; + *size = -(centaur_mcr[reg].low & 0xfffff000) >> PAGE_SHIFT; + *type = MTRR_TYPE_WRCOMB; /* write-combining */ + + if (centaur_mcr_type == 1 && ((centaur_mcr[reg].low & 31) & 2)) + *type = MTRR_TYPE_UNCACHABLE; + if (centaur_mcr_type == 1 && (centaur_mcr[reg].low & 31) == 25) + *type = MTRR_TYPE_WRBACK; + if (centaur_mcr_type == 0 && (centaur_mcr[reg].low & 31) == 31) + *type = MTRR_TYPE_WRBACK; +} + +static void +centaur_set_mcr(unsigned int reg, unsigned long base, + unsigned long size, mtrr_type type) +{ + unsigned long low, high; + + if (size == 0) { + /* Disable */ + high = low = 0; + } else { + high = base << PAGE_SHIFT; + if (centaur_mcr_type == 0) { + /* Only support write-combining... */ + low = -size << PAGE_SHIFT | 0x1f; + } else { + if (type == MTRR_TYPE_UNCACHABLE) + low = -size << PAGE_SHIFT | 0x02; /* NC */ + else + low = -size << PAGE_SHIFT | 0x09; /* WWO, WC */ + } + } + centaur_mcr[reg].high = high; + centaur_mcr[reg].low = low; + wrmsr(MSR_IDT_MCR0 + reg, low, high); +} + +static int +centaur_validate_add_page(unsigned long base, unsigned long size, unsigned int type) +{ + /* + * FIXME: Winchip2 supports uncached + */ + if (type != MTRR_TYPE_WRCOMB && + (centaur_mcr_type == 0 || type != MTRR_TYPE_UNCACHABLE)) { + pr_warning("mtrr: only write-combining%s supported\n", + centaur_mcr_type ? " and uncacheable are" : " is"); + return -EINVAL; + } + return 0; +} + +static const struct mtrr_ops centaur_mtrr_ops = { + .vendor = X86_VENDOR_CENTAUR, + .set = centaur_set_mcr, + .get = centaur_get_mcr, + .get_free_region = centaur_get_free_region, + .validate_add_page = centaur_validate_add_page, + .have_wrcomb = positive_have_wrcomb, +}; + +int __init centaur_init_mtrr(void) +{ + set_mtrr_ops(¢aur_mtrr_ops); + return 0; +} diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c new file mode 100644 index 00000000000..5f90b85ff22 --- /dev/null +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -0,0 +1,980 @@ +/* + * MTRR (Memory Type Range Register) cleanup + * + * Copyright (C) 2009 Yinghai Lu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. 
+ * + * You should have received a copy of the GNU Library General Public + * License along with this library; if not, write to the Free + * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ +#include <linux/module.h> +#include <linux/init.h> +#include <linux/pci.h> +#include <linux/smp.h> +#include <linux/cpu.h> +#include <linux/mutex.h> +#include <linux/uaccess.h> +#include <linux/kvm_para.h> +#include <linux/range.h> + +#include <asm/processor.h> +#include <asm/e820.h> +#include <asm/mtrr.h> +#include <asm/msr.h> + +#include "mtrr.h" + +struct var_mtrr_range_state { + unsigned long base_pfn; + unsigned long size_pfn; + mtrr_type type; +}; + +struct var_mtrr_state { + unsigned long range_startk; + unsigned long range_sizek; + unsigned long chunk_sizek; + unsigned long gran_sizek; + unsigned int reg; +}; + +/* Should be related to MTRR_VAR_RANGES nums */ +#define RANGE_NUM 256 + +static struct range __initdata range[RANGE_NUM]; +static int __initdata nr_range; + +static struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; + +static int __initdata debug_print; +#define Dprintk(x...) do { if (debug_print) printk(KERN_DEBUG x); } while (0) + +#define BIOS_BUG_MSG KERN_WARNING \ + "WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n" + +static int __init +x86_get_mtrr_mem_range(struct range *range, int nr_range, + unsigned long extra_remove_base, + unsigned long extra_remove_size) +{ + unsigned long base, size; + mtrr_type type; + int i; + + for (i = 0; i < num_var_ranges; i++) { + type = range_state[i].type; + if (type != MTRR_TYPE_WRBACK) + continue; + base = range_state[i].base_pfn; + size = range_state[i].size_pfn; + nr_range = add_range_with_merge(range, RANGE_NUM, nr_range, + base, base + size); + } + if (debug_print) { + printk(KERN_DEBUG "After WB checking\n"); + for (i = 0; i < nr_range; i++) + printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n", + range[i].start, range[i].end); + } + + /* Take out UC ranges: */ + for (i = 0; i < num_var_ranges; i++) { + type = range_state[i].type; + if (type != MTRR_TYPE_UNCACHABLE && + type != MTRR_TYPE_WRPROT) + continue; + size = range_state[i].size_pfn; + if (!size) + continue; + base = range_state[i].base_pfn; + if (base < (1<<(20-PAGE_SHIFT)) && mtrr_state.have_fixed && + (mtrr_state.enabled & 1)) { + /* Var MTRR contains UC entry below 1M? 
Skip it: */ + printk(BIOS_BUG_MSG, i); + if (base + size <= (1<<(20-PAGE_SHIFT))) + continue; + size -= (1<<(20-PAGE_SHIFT)) - base; + base = 1<<(20-PAGE_SHIFT); + } + subtract_range(range, RANGE_NUM, base, base + size); + } + if (extra_remove_size) + subtract_range(range, RANGE_NUM, extra_remove_base, + extra_remove_base + extra_remove_size); + + if (debug_print) { + printk(KERN_DEBUG "After UC checking\n"); + for (i = 0; i < RANGE_NUM; i++) { + if (!range[i].end) + continue; + printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n", + range[i].start, range[i].end); + } + } + + /* sort the ranges */ + nr_range = clean_sort_range(range, RANGE_NUM); + if (debug_print) { + printk(KERN_DEBUG "After sorting\n"); + for (i = 0; i < nr_range; i++) + printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n", + range[i].start, range[i].end); + } + + return nr_range; +} + +#ifdef CONFIG_MTRR_SANITIZER + +static unsigned long __init sum_ranges(struct range *range, int nr_range) +{ + unsigned long sum = 0; + int i; + + for (i = 0; i < nr_range; i++) + sum += range[i].end - range[i].start; + + return sum; +} + +static int enable_mtrr_cleanup __initdata = + CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT; + +static int __init disable_mtrr_cleanup_setup(char *str) +{ + enable_mtrr_cleanup = 0; + return 0; +} +early_param("disable_mtrr_cleanup", disable_mtrr_cleanup_setup); + +static int __init enable_mtrr_cleanup_setup(char *str) +{ + enable_mtrr_cleanup = 1; + return 0; +} +early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup); + +static int __init mtrr_cleanup_debug_setup(char *str) +{ + debug_print = 1; + return 0; +} +early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup); + +static void __init +set_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, + unsigned char type, unsigned int address_bits) +{ + u32 base_lo, base_hi, mask_lo, mask_hi; + u64 base, mask; + + if (!sizek) { + fill_mtrr_var_range(reg, 0, 0, 0, 0); + return; + } + + mask = (1ULL << address_bits) - 1; + mask &= ~((((u64)sizek) << 10) - 1); + + base = ((u64)basek) << 10; + + base |= type; + mask |= 0x800; + + base_lo = base & ((1ULL<<32) - 1); + base_hi = base >> 32; + + mask_lo = mask & ((1ULL<<32) - 1); + mask_hi = mask >> 32; + + fill_mtrr_var_range(reg, base_lo, base_hi, mask_lo, mask_hi); +} + +static void __init +save_var_mtrr(unsigned int reg, unsigned long basek, unsigned long sizek, + unsigned char type) +{ + range_state[reg].base_pfn = basek >> (PAGE_SHIFT - 10); + range_state[reg].size_pfn = sizek >> (PAGE_SHIFT - 10); + range_state[reg].type = type; +} + +static void __init set_var_mtrr_all(unsigned int address_bits) +{ + unsigned long basek, sizek; + unsigned char type; + unsigned int reg; + + for (reg = 0; reg < num_var_ranges; reg++) { + basek = range_state[reg].base_pfn << (PAGE_SHIFT - 10); + sizek = range_state[reg].size_pfn << (PAGE_SHIFT - 10); + type = range_state[reg].type; + + set_var_mtrr(reg, basek, sizek, type, address_bits); + } +} + +static unsigned long to_size_factor(unsigned long sizek, char *factorp) +{ + unsigned long base = sizek; + char factor; + + if (base & ((1<<10) - 1)) { + /* Not MB-aligned: */ + factor = 'K'; + } else if (base & ((1<<20) - 1)) { + factor = 'M'; + base >>= 10; + } else { + factor = 'G'; + base >>= 20; + } + + *factorp = factor; + + return base; +} + +static unsigned int __init +range_to_mtrr(unsigned int reg, unsigned long range_startk, + unsigned long range_sizek, unsigned char type) +{ + if (!range_sizek || (reg >= num_var_ranges)) + return reg; + + while 
(range_sizek) { + unsigned long max_align, align; + unsigned long sizek; + + /* Compute the maximum size with which we can make a range: */ + if (range_startk) + max_align = __ffs(range_startk); + else + max_align = BITS_PER_LONG - 1; + + align = __fls(range_sizek); + if (align > max_align) + align = max_align; + + sizek = 1UL << align; + if (debug_print) { + char start_factor = 'K', size_factor = 'K'; + unsigned long start_base, size_base; + + start_base = to_size_factor(range_startk, &start_factor); + size_base = to_size_factor(sizek, &size_factor); + + Dprintk("Setting variable MTRR %d, " + "base: %ld%cB, range: %ld%cB, type %s\n", + reg, start_base, start_factor, + size_base, size_factor, + (type == MTRR_TYPE_UNCACHABLE) ? "UC" : + ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other") + ); + } + save_var_mtrr(reg++, range_startk, sizek, type); + range_startk += sizek; + range_sizek -= sizek; + if (reg >= num_var_ranges) + break; + } + return reg; +} + +static unsigned __init +range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek, + unsigned long sizek) +{ + unsigned long hole_basek, hole_sizek; + unsigned long second_basek, second_sizek; + unsigned long range0_basek, range0_sizek; + unsigned long range_basek, range_sizek; + unsigned long chunk_sizek; + unsigned long gran_sizek; + + hole_basek = 0; + hole_sizek = 0; + second_basek = 0; + second_sizek = 0; + chunk_sizek = state->chunk_sizek; + gran_sizek = state->gran_sizek; + + /* Align with gran size, prevent small block used up MTRRs: */ + range_basek = ALIGN(state->range_startk, gran_sizek); + if ((range_basek > basek) && basek) + return second_sizek; + + state->range_sizek -= (range_basek - state->range_startk); + range_sizek = ALIGN(state->range_sizek, gran_sizek); + + while (range_sizek > state->range_sizek) { + range_sizek -= gran_sizek; + if (!range_sizek) + return 0; + } + state->range_sizek = range_sizek; + + /* Try to append some small hole: */ + range0_basek = state->range_startk; + range0_sizek = ALIGN(state->range_sizek, chunk_sizek); + + /* No increase: */ + if (range0_sizek == state->range_sizek) { + Dprintk("rangeX: %016lx - %016lx\n", + range0_basek<<10, + (range0_basek + state->range_sizek)<<10); + state->reg = range_to_mtrr(state->reg, range0_basek, + state->range_sizek, MTRR_TYPE_WRBACK); + return 0; + } + + /* Only cut back when it is not the last: */ + if (sizek) { + while (range0_basek + range0_sizek > (basek + sizek)) { + if (range0_sizek >= chunk_sizek) + range0_sizek -= chunk_sizek; + else + range0_sizek = 0; + + if (!range0_sizek) + break; + } + } + +second_try: + range_basek = range0_basek + range0_sizek; + + /* One hole in the middle: */ + if (range_basek > basek && range_basek <= (basek + sizek)) + second_sizek = range_basek - basek; + + if (range0_sizek > state->range_sizek) { + + /* One hole in middle or at the end: */ + hole_sizek = range0_sizek - state->range_sizek - second_sizek; + + /* Hole size should be less than half of range0 size: */ + if (hole_sizek >= (range0_sizek >> 1) && + range0_sizek >= chunk_sizek) { + range0_sizek -= chunk_sizek; + second_sizek = 0; + hole_sizek = 0; + + goto second_try; + } + } + + if (range0_sizek) { + Dprintk("range0: %016lx - %016lx\n", + range0_basek<<10, + (range0_basek + range0_sizek)<<10); + state->reg = range_to_mtrr(state->reg, range0_basek, + range0_sizek, MTRR_TYPE_WRBACK); + } + + if (range0_sizek < state->range_sizek) { + /* Need to handle left over range: */ + range_sizek = state->range_sizek - range0_sizek; + + Dprintk("range: %016lx - 
%016lx\n", + range_basek<<10, + (range_basek + range_sizek)<<10); + + state->reg = range_to_mtrr(state->reg, range_basek, + range_sizek, MTRR_TYPE_WRBACK); + } + + if (hole_sizek) { + hole_basek = range_basek - hole_sizek - second_sizek; + Dprintk("hole: %016lx - %016lx\n", + hole_basek<<10, + (hole_basek + hole_sizek)<<10); + state->reg = range_to_mtrr(state->reg, hole_basek, + hole_sizek, MTRR_TYPE_UNCACHABLE); + } + + return second_sizek; +} + +static void __init +set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn, + unsigned long size_pfn) +{ + unsigned long basek, sizek; + unsigned long second_sizek = 0; + + if (state->reg >= num_var_ranges) + return; + + basek = base_pfn << (PAGE_SHIFT - 10); + sizek = size_pfn << (PAGE_SHIFT - 10); + + /* See if I can merge with the last range: */ + if ((basek <= 1024) || + (state->range_startk + state->range_sizek == basek)) { + unsigned long endk = basek + sizek; + state->range_sizek = endk - state->range_startk; + return; + } + /* Write the range mtrrs: */ + if (state->range_sizek != 0) + second_sizek = range_to_mtrr_with_hole(state, basek, sizek); + + /* Allocate an msr: */ + state->range_startk = basek + second_sizek; + state->range_sizek = sizek - second_sizek; +} + +/* Mininum size of mtrr block that can take hole: */ +static u64 mtrr_chunk_size __initdata = (256ULL<<20); + +static int __init parse_mtrr_chunk_size_opt(char *p) +{ + if (!p) + return -EINVAL; + mtrr_chunk_size = memparse(p, &p); + return 0; +} +early_param("mtrr_chunk_size", parse_mtrr_chunk_size_opt); + +/* Granularity of mtrr of block: */ +static u64 mtrr_gran_size __initdata; + +static int __init parse_mtrr_gran_size_opt(char *p) +{ + if (!p) + return -EINVAL; + mtrr_gran_size = memparse(p, &p); + return 0; +} +early_param("mtrr_gran_size", parse_mtrr_gran_size_opt); + +static unsigned long nr_mtrr_spare_reg __initdata = + CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT; + +static int __init parse_mtrr_spare_reg(char *arg) +{ + if (arg) + nr_mtrr_spare_reg = simple_strtoul(arg, NULL, 0); + return 0; +} +early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg); + +static int __init +x86_setup_var_mtrrs(struct range *range, int nr_range, + u64 chunk_size, u64 gran_size) +{ + struct var_mtrr_state var_state; + int num_reg; + int i; + + var_state.range_startk = 0; + var_state.range_sizek = 0; + var_state.reg = 0; + var_state.chunk_sizek = chunk_size >> 10; + var_state.gran_sizek = gran_size >> 10; + + memset(range_state, 0, sizeof(range_state)); + + /* Write the range: */ + for (i = 0; i < nr_range; i++) { + set_var_mtrr_range(&var_state, range[i].start, + range[i].end - range[i].start); + } + + /* Write the last range: */ + if (var_state.range_sizek != 0) + range_to_mtrr_with_hole(&var_state, 0, 0); + + num_reg = var_state.reg; + /* Clear out the extra MTRR's: */ + while (var_state.reg < num_var_ranges) { + save_var_mtrr(var_state.reg, 0, 0, 0); + var_state.reg++; + } + + return num_reg; +} + +struct mtrr_cleanup_result { + unsigned long gran_sizek; + unsigned long chunk_sizek; + unsigned long lose_cover_sizek; + unsigned int num_reg; + int bad; +}; + +/* + * gran_size: 64K, 128K, 256K, 512K, 1M, 2M, ..., 2G + * chunk size: gran_size, ..., 2G + * so we need (1+16)*8 + */ +#define NUM_RESULT 136 +#define PSHIFT (PAGE_SHIFT - 10) + +static struct mtrr_cleanup_result __initdata result[NUM_RESULT]; +static unsigned long __initdata min_loss_pfn[RANGE_NUM]; + +static void __init print_out_mtrr_range_state(void) +{ + char start_factor = 'K', size_factor = 'K'; + unsigned 
long start_base, size_base; + mtrr_type type; + int i; + + for (i = 0; i < num_var_ranges; i++) { + + size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10); + if (!size_base) + continue; + + size_base = to_size_factor(size_base, &size_factor), + start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10); + start_base = to_size_factor(start_base, &start_factor), + type = range_state[i].type; + + printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n", + i, start_base, start_factor, + size_base, size_factor, + (type == MTRR_TYPE_UNCACHABLE) ? "UC" : + ((type == MTRR_TYPE_WRPROT) ? "WP" : + ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other")) + ); + } +} + +static int __init mtrr_need_cleanup(void) +{ + int i; + mtrr_type type; + unsigned long size; + /* Extra one for all 0: */ + int num[MTRR_NUM_TYPES + 1]; + + /* Check entries number: */ + memset(num, 0, sizeof(num)); + for (i = 0; i < num_var_ranges; i++) { + type = range_state[i].type; + size = range_state[i].size_pfn; + if (type >= MTRR_NUM_TYPES) + continue; + if (!size) + type = MTRR_NUM_TYPES; + num[type]++; + } + + /* Check if we got UC entries: */ + if (!num[MTRR_TYPE_UNCACHABLE]) + return 0; + + /* Check if we only had WB and UC */ + if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] != + num_var_ranges - num[MTRR_NUM_TYPES]) + return 0; + + return 1; +} + +static unsigned long __initdata range_sums; + +static void __init +mtrr_calc_range_state(u64 chunk_size, u64 gran_size, + unsigned long x_remove_base, + unsigned long x_remove_size, int i) +{ + static struct range range_new[RANGE_NUM]; + unsigned long range_sums_new; + static int nr_range_new; + int num_reg; + + /* Convert ranges to var ranges state: */ + num_reg = x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size); + + /* We got new setting in range_state, check it: */ + memset(range_new, 0, sizeof(range_new)); + nr_range_new = x86_get_mtrr_mem_range(range_new, 0, + x_remove_base, x_remove_size); + range_sums_new = sum_ranges(range_new, nr_range_new); + + result[i].chunk_sizek = chunk_size >> 10; + result[i].gran_sizek = gran_size >> 10; + result[i].num_reg = num_reg; + + if (range_sums < range_sums_new) { + result[i].lose_cover_sizek = (range_sums_new - range_sums) << PSHIFT; + result[i].bad = 1; + } else { + result[i].lose_cover_sizek = (range_sums - range_sums_new) << PSHIFT; + } + + /* Double check it: */ + if (!result[i].bad && !result[i].lose_cover_sizek) { + if (nr_range_new != nr_range || memcmp(range, range_new, sizeof(range))) + result[i].bad = 1; + } + + if (!result[i].bad && (range_sums - range_sums_new < min_loss_pfn[num_reg])) + min_loss_pfn[num_reg] = range_sums - range_sums_new; +} + +static void __init mtrr_print_out_one_result(int i) +{ + unsigned long gran_base, chunk_base, lose_base; + char gran_factor, chunk_factor, lose_factor; + + gran_base = to_size_factor(result[i].gran_sizek, &gran_factor); + chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor); + lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor); + + pr_info("%sgran_size: %ld%c \tchunk_size: %ld%c \t", + result[i].bad ? "*BAD*" : " ", + gran_base, gran_factor, chunk_base, chunk_factor); + pr_cont("num_reg: %d \tlose cover RAM: %s%ld%c\n", + result[i].num_reg, result[i].bad ? 
"-" : "", + lose_base, lose_factor); +} + +static int __init mtrr_search_optimal_index(void) +{ + int num_reg_good; + int index_good; + int i; + + if (nr_mtrr_spare_reg >= num_var_ranges) + nr_mtrr_spare_reg = num_var_ranges - 1; + + num_reg_good = -1; + for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) { + if (!min_loss_pfn[i]) + num_reg_good = i; + } + + index_good = -1; + if (num_reg_good != -1) { + for (i = 0; i < NUM_RESULT; i++) { + if (!result[i].bad && + result[i].num_reg == num_reg_good && + !result[i].lose_cover_sizek) { + index_good = i; + break; + } + } + } + + return index_good; +} + +int __init mtrr_cleanup(unsigned address_bits) +{ + unsigned long x_remove_base, x_remove_size; + unsigned long base, size, def, dummy; + u64 chunk_size, gran_size; + mtrr_type type; + int index_good; + int i; + + if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1) + return 0; + + rdmsr(MSR_MTRRdefType, def, dummy); + def &= 0xff; + if (def != MTRR_TYPE_UNCACHABLE) + return 0; + + /* Get it and store it aside: */ + memset(range_state, 0, sizeof(range_state)); + for (i = 0; i < num_var_ranges; i++) { + mtrr_if->get(i, &base, &size, &type); + range_state[i].base_pfn = base; + range_state[i].size_pfn = size; + range_state[i].type = type; + } + + /* Check if we need handle it and can handle it: */ + if (!mtrr_need_cleanup()) + return 0; + + /* Print original var MTRRs at first, for debugging: */ + printk(KERN_DEBUG "original variable MTRRs\n"); + print_out_mtrr_range_state(); + + memset(range, 0, sizeof(range)); + x_remove_size = 0; + x_remove_base = 1 << (32 - PAGE_SHIFT); + if (mtrr_tom2) + x_remove_size = (mtrr_tom2 >> PAGE_SHIFT) - x_remove_base; + + /* + * [0, 1M) should always be covered by var mtrr with WB + * and fixed mtrrs should take effect before var mtrr for it: + */ + nr_range = add_range_with_merge(range, RANGE_NUM, 0, 0, + 1ULL<<(20 - PAGE_SHIFT)); + /* add from var mtrr at last */ + nr_range = x86_get_mtrr_mem_range(range, nr_range, + x_remove_base, x_remove_size); + + range_sums = sum_ranges(range, nr_range); + printk(KERN_INFO "total RAM covered: %ldM\n", + range_sums >> (20 - PAGE_SHIFT)); + + if (mtrr_chunk_size && mtrr_gran_size) { + i = 0; + mtrr_calc_range_state(mtrr_chunk_size, mtrr_gran_size, + x_remove_base, x_remove_size, i); + + mtrr_print_out_one_result(i); + + if (!result[i].bad) { + set_var_mtrr_all(address_bits); + printk(KERN_DEBUG "New variable MTRRs\n"); + print_out_mtrr_range_state(); + return 1; + } + printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, " + "will find optimal one\n"); + } + + i = 0; + memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn)); + memset(result, 0, sizeof(result)); + for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) { + + for (chunk_size = gran_size; chunk_size < (1ULL<<32); + chunk_size <<= 1) { + + if (i >= NUM_RESULT) + continue; + + mtrr_calc_range_state(chunk_size, gran_size, + x_remove_base, x_remove_size, i); + if (debug_print) { + mtrr_print_out_one_result(i); + printk(KERN_INFO "\n"); + } + + i++; + } + } + + /* Try to find the optimal index: */ + index_good = mtrr_search_optimal_index(); + + if (index_good != -1) { + printk(KERN_INFO "Found optimal setting for mtrr clean up\n"); + i = index_good; + mtrr_print_out_one_result(i); + + /* Convert ranges to var ranges state: */ + chunk_size = result[i].chunk_sizek; + chunk_size <<= 10; + gran_size = result[i].gran_sizek; + gran_size <<= 10; + x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size); + set_var_mtrr_all(address_bits); + printk(KERN_DEBUG 
"New variable MTRRs\n"); + print_out_mtrr_range_state(); + return 1; + } else { + /* print out all */ + for (i = 0; i < NUM_RESULT; i++) + mtrr_print_out_one_result(i); + } + + printk(KERN_INFO "mtrr_cleanup: can not find optimal value\n"); + printk(KERN_INFO "please specify mtrr_gran_size/mtrr_chunk_size\n"); + + return 0; +} +#else +int __init mtrr_cleanup(unsigned address_bits) +{ + return 0; +} +#endif + +static int disable_mtrr_trim; + +static int __init disable_mtrr_trim_setup(char *str) +{ + disable_mtrr_trim = 1; + return 0; +} +early_param("disable_mtrr_trim", disable_mtrr_trim_setup); + +/* + * Newer AMD K8s and later CPUs have a special magic MSR way to force WB + * for memory >4GB. Check for that here. + * Note this won't check if the MTRRs < 4GB where the magic bit doesn't + * apply to are wrong, but so far we don't know of any such case in the wild. + */ +#define Tom2Enabled (1U << 21) +#define Tom2ForceMemTypeWB (1U << 22) + +int __init amd_special_default_mtrr(void) +{ + u32 l, h; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) + return 0; + if (boot_cpu_data.x86 < 0xf) + return 0; + /* In case some hypervisor doesn't pass SYSCFG through: */ + if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0) + return 0; + /* + * Memory between 4GB and top of mem is forced WB by this magic bit. + * Reserved before K8RevF, but should be zero there. + */ + if ((l & (Tom2Enabled | Tom2ForceMemTypeWB)) == + (Tom2Enabled | Tom2ForceMemTypeWB)) + return 1; + return 0; +} + +static u64 __init +real_trim_memory(unsigned long start_pfn, unsigned long limit_pfn) +{ + u64 trim_start, trim_size; + + trim_start = start_pfn; + trim_start <<= PAGE_SHIFT; + + trim_size = limit_pfn; + trim_size <<= PAGE_SHIFT; + trim_size -= trim_start; + + return e820_update_range(trim_start, trim_size, E820_RAM, E820_RESERVED); +} + +/** + * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs + * @end_pfn: ending page frame number + * + * Some buggy BIOSes don't setup the MTRRs properly for systems with certain + * memory configurations. This routine checks that the highest MTRR matches + * the end of memory, to make sure the MTRRs having a write back type cover + * all of the memory the kernel is intending to use. If not, it'll trim any + * memory off the end by adjusting end_pfn, removing it from the kernel's + * allocation pools, warning the user with an obnoxious message. 
+ */ +int __init mtrr_trim_uncached_memory(unsigned long end_pfn) +{ + unsigned long i, base, size, highest_pfn = 0, def, dummy; + mtrr_type type; + u64 total_trim_size; + /* extra one for all 0 */ + int num[MTRR_NUM_TYPES + 1]; + + /* + * Make sure we only trim uncachable memory on machines that + * support the Intel MTRR architecture: + */ + if (!is_cpu(INTEL) || disable_mtrr_trim) + return 0; + + rdmsr(MSR_MTRRdefType, def, dummy); + def &= 0xff; + if (def != MTRR_TYPE_UNCACHABLE) + return 0; + + /* Get it and store it aside: */ + memset(range_state, 0, sizeof(range_state)); + for (i = 0; i < num_var_ranges; i++) { + mtrr_if->get(i, &base, &size, &type); + range_state[i].base_pfn = base; + range_state[i].size_pfn = size; + range_state[i].type = type; + } + + /* Find highest cached pfn: */ + for (i = 0; i < num_var_ranges; i++) { + type = range_state[i].type; + if (type != MTRR_TYPE_WRBACK) + continue; + base = range_state[i].base_pfn; + size = range_state[i].size_pfn; + if (highest_pfn < base + size) + highest_pfn = base + size; + } + + /* kvm/qemu doesn't have mtrr set right, don't trim them all: */ + if (!highest_pfn) { + printk(KERN_INFO "CPU MTRRs all blank - virtualized system.\n"); + return 0; + } + + /* Check entries number: */ + memset(num, 0, sizeof(num)); + for (i = 0; i < num_var_ranges; i++) { + type = range_state[i].type; + if (type >= MTRR_NUM_TYPES) + continue; + size = range_state[i].size_pfn; + if (!size) + type = MTRR_NUM_TYPES; + num[type]++; + } + + /* No entry for WB? */ + if (!num[MTRR_TYPE_WRBACK]) + return 0; + + /* Check if we only had WB and UC: */ + if (num[MTRR_TYPE_WRBACK] + num[MTRR_TYPE_UNCACHABLE] != + num_var_ranges - num[MTRR_NUM_TYPES]) + return 0; + + memset(range, 0, sizeof(range)); + nr_range = 0; + if (mtrr_tom2) { + range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT)); + range[nr_range].end = mtrr_tom2 >> PAGE_SHIFT; + if (highest_pfn < range[nr_range].end) + highest_pfn = range[nr_range].end; + nr_range++; + } + nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0); + + /* Check the head: */ + total_trim_size = 0; + if (range[0].start) + total_trim_size += real_trim_memory(0, range[0].start); + + /* Check the holes: */ + for (i = 0; i < nr_range - 1; i++) { + if (range[i].end < range[i+1].start) + total_trim_size += real_trim_memory(range[i].end, + range[i+1].start); + } + + /* Check the top: */ + i = nr_range - 1; + if (range[i].end < end_pfn) + total_trim_size += real_trim_memory(range[i].end, + end_pfn); + + if (total_trim_size) { + pr_warning("WARNING: BIOS bug: CPU MTRRs don't cover all of memory, losing %lluMB of RAM.\n", total_trim_size >> 20); + + if (!changed_by_mtrr_cleanup) + WARN_ON(1); + + pr_info("update e820 for mtrr\n"); + update_e820(); + + return 1; + } + + return 0; +} diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c new file mode 100644 index 00000000000..9e451b0876b --- /dev/null +++ b/arch/x86/kernel/cpu/mtrr/cyrix.c @@ -0,0 +1,282 @@ +#include <linux/init.h> +#include <linux/io.h> +#include <linux/mm.h> + +#include <asm/processor-cyrix.h> +#include <asm/processor-flags.h> +#include <asm/mtrr.h> +#include <asm/msr.h> + +#include "mtrr.h" + +static void +cyrix_get_arr(unsigned int reg, unsigned long *base, + unsigned long *size, mtrr_type * type) +{ + unsigned char arr, ccr3, rcr, shift; + unsigned long flags; + + arr = CX86_ARR_BASE + (reg << 1) + reg; /* avoid multiplication by 3 */ + + local_irq_save(flags); + + ccr3 = getCx86(CX86_CCR3); + setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable 
MAPEN */ + ((unsigned char *)base)[3] = getCx86(arr); + ((unsigned char *)base)[2] = getCx86(arr + 1); + ((unsigned char *)base)[1] = getCx86(arr + 2); + rcr = getCx86(CX86_RCR_BASE + reg); + setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ + + local_irq_restore(flags); + + shift = ((unsigned char *) base)[1] & 0x0f; + *base >>= PAGE_SHIFT; + + /* + * Power of two, at least 4K on ARR0-ARR6, 256K on ARR7 + * Note: shift==0xf means 4G, this is unsupported. + */ + if (shift) + *size = (reg < 7 ? 0x1UL : 0x40UL) << (shift - 1); + else + *size = 0; + + /* Bit 0 is Cache Enable on ARR7, Cache Disable on ARR0-ARR6 */ + if (reg < 7) { + switch (rcr) { + case 1: + *type = MTRR_TYPE_UNCACHABLE; + break; + case 8: + *type = MTRR_TYPE_WRBACK; + break; + case 9: + *type = MTRR_TYPE_WRCOMB; + break; + case 24: + default: + *type = MTRR_TYPE_WRTHROUGH; + break; + } + } else { + switch (rcr) { + case 0: + *type = MTRR_TYPE_UNCACHABLE; + break; + case 8: + *type = MTRR_TYPE_WRCOMB; + break; + case 9: + *type = MTRR_TYPE_WRBACK; + break; + case 25: + default: + *type = MTRR_TYPE_WRTHROUGH; + break; + } + } +} + +/* + * cyrix_get_free_region - get a free ARR. + * + * @base: the starting (base) address of the region. + * @size: the size (in bytes) of the region. + * + * Returns: the index of the region on success, else -1 on error. +*/ +static int +cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg) +{ + unsigned long lbase, lsize; + mtrr_type ltype; + int i; + + switch (replace_reg) { + case 7: + if (size < 0x40) + break; + case 6: + case 5: + case 4: + return replace_reg; + case 3: + case 2: + case 1: + case 0: + return replace_reg; + } + /* If we are to set up a region >32M then look at ARR7 immediately */ + if (size > 0x2000) { + cyrix_get_arr(7, &lbase, &lsize, <ype); + if (lsize == 0) + return 7; + /* Else try ARR0-ARR6 first */ + } else { + for (i = 0; i < 7; i++) { + cyrix_get_arr(i, &lbase, &lsize, <ype); + if (lsize == 0) + return i; + } + /* + * ARR0-ARR6 isn't free + * try ARR7 but its size must be at least 256K + */ + cyrix_get_arr(i, &lbase, &lsize, <ype); + if ((lsize == 0) && (size >= 0x40)) + return i; + } + return -ENOSPC; +} + +static u32 cr4, ccr3; + +static void prepare_set(void) +{ + u32 cr0; + + /* Save value of CR4 and clear Page Global Enable (bit 7) */ + if (cpu_has_pge) { + cr4 = read_cr4(); + write_cr4(cr4 & ~X86_CR4_PGE); + } + + /* + * Disable and flush caches. 
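+ * (the classic no-fill sequence: set CR0.CD with CR0.NW left clear, and
+ * issue wbinvd on both sides of the CR0 write so no dirty line survives).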
+ * Note that wbinvd flushes the TLBs as a side-effect + */ + cr0 = read_cr0() | X86_CR0_CD; + wbinvd(); + write_cr0(cr0); + wbinvd(); + + /* Cyrix ARRs - everything else was excluded at the top */ + ccr3 = getCx86(CX86_CCR3); + + /* Cyrix ARRs - everything else was excluded at the top */ + setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); +} + +static void post_set(void) +{ + /* Flush caches and TLBs */ + wbinvd(); + + /* Cyrix ARRs - everything else was excluded at the top */ + setCx86(CX86_CCR3, ccr3); + + /* Enable caches */ + write_cr0(read_cr0() & ~X86_CR0_CD); + + /* Restore value of CR4 */ + if (cpu_has_pge) + write_cr4(cr4); +} + +static void cyrix_set_arr(unsigned int reg, unsigned long base, + unsigned long size, mtrr_type type) +{ + unsigned char arr, arr_type, arr_size; + + arr = CX86_ARR_BASE + (reg << 1) + reg; /* avoid multiplication by 3 */ + + /* count down from 32M (ARR0-ARR6) or from 2G (ARR7) */ + if (reg >= 7) + size >>= 6; + + size &= 0x7fff; /* make sure arr_size <= 14 */ + for (arr_size = 0; size; arr_size++, size >>= 1) + ; + + if (reg < 7) { + switch (type) { + case MTRR_TYPE_UNCACHABLE: + arr_type = 1; + break; + case MTRR_TYPE_WRCOMB: + arr_type = 9; + break; + case MTRR_TYPE_WRTHROUGH: + arr_type = 24; + break; + default: + arr_type = 8; + break; + } + } else { + switch (type) { + case MTRR_TYPE_UNCACHABLE: + arr_type = 0; + break; + case MTRR_TYPE_WRCOMB: + arr_type = 8; + break; + case MTRR_TYPE_WRTHROUGH: + arr_type = 25; + break; + default: + arr_type = 9; + break; + } + } + + prepare_set(); + + base <<= PAGE_SHIFT; + setCx86(arr + 0, ((unsigned char *)&base)[3]); + setCx86(arr + 1, ((unsigned char *)&base)[2]); + setCx86(arr + 2, (((unsigned char *)&base)[1]) | arr_size); + setCx86(CX86_RCR_BASE + reg, arr_type); + + post_set(); +} + +typedef struct { + unsigned long base; + unsigned long size; + mtrr_type type; +} arr_state_t; + +static arr_state_t arr_state[8] = { + {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, + {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL} +}; + +static unsigned char ccr_state[7] = { 0, 0, 0, 0, 0, 0, 0 }; + +static void cyrix_set_all(void) +{ + int i; + + prepare_set(); + + /* the CCRs are not contiguous */ + for (i = 0; i < 4; i++) + setCx86(CX86_CCR0 + i, ccr_state[i]); + for (; i < 7; i++) + setCx86(CX86_CCR4 + i, ccr_state[i]); + + for (i = 0; i < 8; i++) { + cyrix_set_arr(i, arr_state[i].base, + arr_state[i].size, arr_state[i].type); + } + + post_set(); +} + +static const struct mtrr_ops cyrix_mtrr_ops = { + .vendor = X86_VENDOR_CYRIX, + .set_all = cyrix_set_all, + .set = cyrix_set_arr, + .get = cyrix_get_arr, + .get_free_region = cyrix_get_free_region, + .validate_add_page = generic_validate_add_page, + .have_wrcomb = positive_have_wrcomb, +}; + +int __init cyrix_init_mtrr(void) +{ + set_mtrr_ops(&cyrix_mtrr_ops); + return 0; +} diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c new file mode 100644 index 00000000000..0e25a1bc5ab --- /dev/null +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -0,0 +1,845 @@ +/* + * This only handles 32bit MTRR on 32bit hosts. 
This is strictly wrong + * because MTRRs can span up to 40 bits (36bits on most modern x86) + */ +#define DEBUG + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/io.h> +#include <linux/mm.h> + +#include <asm/processor-flags.h> +#include <asm/cpufeature.h> +#include <asm/tlbflush.h> +#include <asm/mtrr.h> +#include <asm/msr.h> +#include <asm/pat.h> + +#include "mtrr.h" + +struct fixed_range_block { + int base_msr; /* start address of an MTRR block */ + int ranges; /* number of MTRRs in this block */ +}; + +static struct fixed_range_block fixed_range_blocks[] = { + { MSR_MTRRfix64K_00000, 1 }, /* one 64k MTRR */ + { MSR_MTRRfix16K_80000, 2 }, /* two 16k MTRRs */ + { MSR_MTRRfix4K_C0000, 8 }, /* eight 4k MTRRs */ + {} +}; + +static unsigned long smp_changes_mask; +static int mtrr_state_set; +u64 mtrr_tom2; + +struct mtrr_state_type mtrr_state; +EXPORT_SYMBOL_GPL(mtrr_state); + +/* + * BIOS is expected to clear MtrrFixDramModEn bit, see for example + * "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD + * Opteron Processors" (26094 Rev. 3.30 February 2006), section + * "13.2.1.2 SYSCFG Register": "The MtrrFixDramModEn bit should be set + * to 1 during BIOS initalization of the fixed MTRRs, then cleared to + * 0 for operation." + */ +static inline void k8_check_syscfg_dram_mod_en(void) +{ + u32 lo, hi; + + if (!((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && + (boot_cpu_data.x86 >= 0x0f))) + return; + + rdmsr(MSR_K8_SYSCFG, lo, hi); + if (lo & K8_MTRRFIXRANGE_DRAM_MODIFY) { + printk(KERN_ERR FW_WARN "MTRR: CPU %u: SYSCFG[MtrrFixDramModEn]" + " not cleared by BIOS, clearing this bit\n", + smp_processor_id()); + lo &= ~K8_MTRRFIXRANGE_DRAM_MODIFY; + mtrr_wrmsr(MSR_K8_SYSCFG, lo, hi); + } +} + +/* Get the size of contiguous MTRR range */ +static u64 get_mtrr_size(u64 mask) +{ + u64 size; + + mask >>= PAGE_SHIFT; + mask |= size_or_mask; + size = -mask; + size <<= PAGE_SHIFT; + return size; +} + +/* + * Check and return the effective type for MTRR-MTRR type overlap. + * Returns 1 if the effective type is UNCACHEABLE, else returns 0 + */ +static int check_type_overlap(u8 *prev, u8 *curr) +{ + if (*prev == MTRR_TYPE_UNCACHABLE || *curr == MTRR_TYPE_UNCACHABLE) { + *prev = MTRR_TYPE_UNCACHABLE; + *curr = MTRR_TYPE_UNCACHABLE; + return 1; + } + + if ((*prev == MTRR_TYPE_WRBACK && *curr == MTRR_TYPE_WRTHROUGH) || + (*prev == MTRR_TYPE_WRTHROUGH && *curr == MTRR_TYPE_WRBACK)) { + *prev = MTRR_TYPE_WRTHROUGH; + *curr = MTRR_TYPE_WRTHROUGH; + } + + if (*prev != *curr) { + *prev = MTRR_TYPE_UNCACHABLE; + *curr = MTRR_TYPE_UNCACHABLE; + return 1; + } + + return 0; +} + +/* + * Error/Semi-error returns: + * 0xFF - when MTRR is not enabled + * *repeat == 1 implies [start:end] spanned across MTRR range and type returned + * corresponds only to [start:*partial_end]. + * Caller has to lookup again for [*partial_end:end]. + */ +static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat) +{ + int i; + u64 base, mask; + u8 prev_match, curr_match; + + *repeat = 0; + if (!mtrr_state_set) + return 0xFF; + + if (!mtrr_state.enabled) + return 0xFF; + + /* Make end inclusive end, instead of exclusive */ + end--; + + /* Look in fixed ranges. 
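(A worked index with a hypothetical address: start = 0xB8000 falls in the 16 KiB-granular block, so idx = 8 + ((0xB8000 - 0x80000) >> 14) = 22.)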
Just return the type as per start */ + if (mtrr_state.have_fixed && (start < 0x100000)) { + int idx; + + if (start < 0x80000) { + idx = 0; + idx += (start >> 16); + return mtrr_state.fixed_ranges[idx]; + } else if (start < 0xC0000) { + idx = 1 * 8; + idx += ((start - 0x80000) >> 14); + return mtrr_state.fixed_ranges[idx]; + } else if (start < 0x1000000) { + idx = 3 * 8; + idx += ((start - 0xC0000) >> 12); + return mtrr_state.fixed_ranges[idx]; + } + } + + /* + * Look in variable ranges + * Look of multiple ranges matching this address and pick type + * as per MTRR precedence + */ + if (!(mtrr_state.enabled & 2)) + return mtrr_state.def_type; + + prev_match = 0xFF; + for (i = 0; i < num_var_ranges; ++i) { + unsigned short start_state, end_state; + + if (!(mtrr_state.var_ranges[i].mask_lo & (1 << 11))) + continue; + + base = (((u64)mtrr_state.var_ranges[i].base_hi) << 32) + + (mtrr_state.var_ranges[i].base_lo & PAGE_MASK); + mask = (((u64)mtrr_state.var_ranges[i].mask_hi) << 32) + + (mtrr_state.var_ranges[i].mask_lo & PAGE_MASK); + + start_state = ((start & mask) == (base & mask)); + end_state = ((end & mask) == (base & mask)); + + if (start_state != end_state) { + /* + * We have start:end spanning across an MTRR. + * We split the region into + * either + * (start:mtrr_end) (mtrr_end:end) + * or + * (start:mtrr_start) (mtrr_start:end) + * depending on kind of overlap. + * Return the type for first region and a pointer to + * the start of second region so that caller will + * lookup again on the second region. + * Note: This way we handle multiple overlaps as well. + */ + if (start_state) + *partial_end = base + get_mtrr_size(mask); + else + *partial_end = base; + + if (unlikely(*partial_end <= start)) { + WARN_ON(1); + *partial_end = start + PAGE_SIZE; + } + + end = *partial_end - 1; /* end is inclusive */ + *repeat = 1; + } + + if ((start & mask) != (base & mask)) + continue; + + curr_match = mtrr_state.var_ranges[i].base_lo & 0xff; + if (prev_match == 0xFF) { + prev_match = curr_match; + continue; + } + + if (check_type_overlap(&prev_match, &curr_match)) + return curr_match; + } + + if (mtrr_tom2) { + if (start >= (1ULL<<32) && (end < mtrr_tom2)) + return MTRR_TYPE_WRBACK; + } + + if (prev_match != 0xFF) + return prev_match; + + return mtrr_state.def_type; +} + +/* + * Returns the effective MTRR type for the region + * Error return: + * 0xFF - when MTRR is not enabled + */ +u8 mtrr_type_lookup(u64 start, u64 end) +{ + u8 type, prev_type; + int repeat; + u64 partial_end; + + type = __mtrr_type_lookup(start, end, &partial_end, &repeat); + + /* + * Common path is with repeat = 0. + * However, we can have cases where [start:end] spans across some + * MTRR range. Do repeated lookups for that case here. 
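+ *
+ * Example with a hypothetical span: for [4G - 4K, 4G + 8K) crossing a
+ * variable MTRR that ends exactly at 4G, the first pass returns the type
+ * of [start, 4G) and sets partial_end = 4G; the loop below then resolves
+ * the remaining [4G, end) slice, merging the two results through
+ * check_type_overlap().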
+ */ + while (repeat) { + prev_type = type; + start = partial_end; + type = __mtrr_type_lookup(start, end, &partial_end, &repeat); + + if (check_type_overlap(&prev_type, &type)) + return type; + } + + return type; +} + +/* Get the MSR pair relating to a var range */ +static void +get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr) +{ + rdmsr(MTRRphysBase_MSR(index), vr->base_lo, vr->base_hi); + rdmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi); +} + +/* Fill the MSR pair relating to a var range */ +void fill_mtrr_var_range(unsigned int index, + u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi) +{ + struct mtrr_var_range *vr; + + vr = mtrr_state.var_ranges; + + vr[index].base_lo = base_lo; + vr[index].base_hi = base_hi; + vr[index].mask_lo = mask_lo; + vr[index].mask_hi = mask_hi; +} + +static void get_fixed_ranges(mtrr_type *frs) +{ + unsigned int *p = (unsigned int *)frs; + int i; + + k8_check_syscfg_dram_mod_en(); + + rdmsr(MSR_MTRRfix64K_00000, p[0], p[1]); + + for (i = 0; i < 2; i++) + rdmsr(MSR_MTRRfix16K_80000 + i, p[2 + i * 2], p[3 + i * 2]); + for (i = 0; i < 8; i++) + rdmsr(MSR_MTRRfix4K_C0000 + i, p[6 + i * 2], p[7 + i * 2]); +} + +void mtrr_save_fixed_ranges(void *info) +{ + if (cpu_has_mtrr) + get_fixed_ranges(mtrr_state.fixed_ranges); +} + +static unsigned __initdata last_fixed_start; +static unsigned __initdata last_fixed_end; +static mtrr_type __initdata last_fixed_type; + +static void __init print_fixed_last(void) +{ + if (!last_fixed_end) + return; + + pr_debug(" %05X-%05X %s\n", last_fixed_start, + last_fixed_end - 1, mtrr_attrib_to_str(last_fixed_type)); + + last_fixed_end = 0; +} + +static void __init update_fixed_last(unsigned base, unsigned end, + mtrr_type type) +{ + last_fixed_start = base; + last_fixed_end = end; + last_fixed_type = type; +} + +static void __init +print_fixed(unsigned base, unsigned step, const mtrr_type *types) +{ + unsigned i; + + for (i = 0; i < 8; ++i, ++types, base += step) { + if (last_fixed_end == 0) { + update_fixed_last(base, base + step, *types); + continue; + } + if (last_fixed_end == base && last_fixed_type == *types) { + last_fixed_end = base + step; + continue; + } + /* new segments: gap or different type */ + print_fixed_last(); + update_fixed_last(base, base + step, *types); + } +} + +static void prepare_set(void); +static void post_set(void); + +static void __init print_mtrr_state(void) +{ + unsigned int i; + int high_width; + + pr_debug("MTRR default type: %s\n", + mtrr_attrib_to_str(mtrr_state.def_type)); + if (mtrr_state.have_fixed) { + pr_debug("MTRR fixed ranges %sabled:\n", + mtrr_state.enabled & 1 ? "en" : "dis"); + print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0); + for (i = 0; i < 2; ++i) + print_fixed(0x80000 + i * 0x20000, 0x04000, + mtrr_state.fixed_ranges + (i + 1) * 8); + for (i = 0; i < 8; ++i) + print_fixed(0xC0000 + i * 0x08000, 0x01000, + mtrr_state.fixed_ranges + (i + 3) * 8); + + /* tail */ + print_fixed_last(); + } + pr_debug("MTRR variable ranges %sabled:\n", + mtrr_state.enabled & 2 ? 
"en" : "dis"); + high_width = (__ffs64(size_or_mask) - (32 - PAGE_SHIFT) + 3) / 4; + + for (i = 0; i < num_var_ranges; ++i) { + if (mtrr_state.var_ranges[i].mask_lo & (1 << 11)) + pr_debug(" %u base %0*X%05X000 mask %0*X%05X000 %s\n", + i, + high_width, + mtrr_state.var_ranges[i].base_hi, + mtrr_state.var_ranges[i].base_lo >> 12, + high_width, + mtrr_state.var_ranges[i].mask_hi, + mtrr_state.var_ranges[i].mask_lo >> 12, + mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff)); + else + pr_debug(" %u disabled\n", i); + } + if (mtrr_tom2) + pr_debug("TOM2: %016llx aka %lldM\n", mtrr_tom2, mtrr_tom2>>20); +} + +/* Grab all of the MTRR state for this CPU into *state */ +void __init get_mtrr_state(void) +{ + struct mtrr_var_range *vrs; + unsigned long flags; + unsigned lo, dummy; + unsigned int i; + + vrs = mtrr_state.var_ranges; + + rdmsr(MSR_MTRRcap, lo, dummy); + mtrr_state.have_fixed = (lo >> 8) & 1; + + for (i = 0; i < num_var_ranges; i++) + get_mtrr_var_range(i, &vrs[i]); + if (mtrr_state.have_fixed) + get_fixed_ranges(mtrr_state.fixed_ranges); + + rdmsr(MSR_MTRRdefType, lo, dummy); + mtrr_state.def_type = (lo & 0xff); + mtrr_state.enabled = (lo & 0xc00) >> 10; + + if (amd_special_default_mtrr()) { + unsigned low, high; + + /* TOP_MEM2 */ + rdmsr(MSR_K8_TOP_MEM2, low, high); + mtrr_tom2 = high; + mtrr_tom2 <<= 32; + mtrr_tom2 |= low; + mtrr_tom2 &= 0xffffff800000ULL; + } + + print_mtrr_state(); + + mtrr_state_set = 1; + + /* PAT setup for BP. We need to go through sync steps here */ + local_irq_save(flags); + prepare_set(); + + pat_init(); + + post_set(); + local_irq_restore(flags); +} + +/* Some BIOS's are messed up and don't set all MTRRs the same! */ +void __init mtrr_state_warn(void) +{ + unsigned long mask = smp_changes_mask; + + if (!mask) + return; + if (mask & MTRR_CHANGE_MASK_FIXED) + pr_warning("mtrr: your CPUs had inconsistent fixed MTRR settings\n"); + if (mask & MTRR_CHANGE_MASK_VARIABLE) + pr_warning("mtrr: your CPUs had inconsistent variable MTRR settings\n"); + if (mask & MTRR_CHANGE_MASK_DEFTYPE) + pr_warning("mtrr: your CPUs had inconsistent MTRRdefType settings\n"); + + printk(KERN_INFO "mtrr: probably your BIOS does not setup all CPUs.\n"); + printk(KERN_INFO "mtrr: corrected configuration.\n"); +} + +/* + * Doesn't attempt to pass an error out to MTRR users + * because it's quite complicated in some cases and probably not + * worth it because the best error handling is to ignore it. + */ +void mtrr_wrmsr(unsigned msr, unsigned a, unsigned b) +{ + if (wrmsr_safe(msr, a, b) < 0) { + printk(KERN_ERR + "MTRR: CPU %u: Writing MSR %x to %x:%x failed\n", + smp_processor_id(), msr, a, b); + } +} + +/** + * set_fixed_range - checks & updates a fixed-range MTRR if it + * differs from the value it should have + * @msr: MSR address of the MTTR which should be checked and updated + * @changed: pointer which indicates whether the MTRR needed to be changed + * @msrwords: pointer to the MSR values which the MSR should have + */ +static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords) +{ + unsigned lo, hi; + + rdmsr(msr, lo, hi); + + if (lo != msrwords[0] || hi != msrwords[1]) { + mtrr_wrmsr(msr, msrwords[0], msrwords[1]); + *changed = true; + } +} + +/** + * generic_get_free_region - Get a free MTRR. + * @base: The starting (base) address of the region. + * @size: The size (in bytes) of the region. + * @replace_reg: mtrr index to be replaced; set to invalid value if none. + * + * Returns: The index of the region on success, else negative on error. 
+ */ +int +generic_get_free_region(unsigned long base, unsigned long size, int replace_reg) +{ + unsigned long lbase, lsize; + mtrr_type ltype; + int i, max; + + max = num_var_ranges; + if (replace_reg >= 0 && replace_reg < max) + return replace_reg; + + for (i = 0; i < max; ++i) { + mtrr_if->get(i, &lbase, &lsize, <ype); + if (lsize == 0) + return i; + } + + return -ENOSPC; +} + +static void generic_get_mtrr(unsigned int reg, unsigned long *base, + unsigned long *size, mtrr_type *type) +{ + u32 mask_lo, mask_hi, base_lo, base_hi; + unsigned int hi; + u64 tmp, mask; + + /* + * get_mtrr doesn't need to update mtrr_state, also it could be called + * from any cpu, so try to print it out directly. + */ + get_cpu(); + + rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi); + + if ((mask_lo & 0x800) == 0) { + /* Invalid (i.e. free) range */ + *base = 0; + *size = 0; + *type = 0; + goto out_put_cpu; + } + + rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi); + + /* Work out the shifted address mask: */ + tmp = (u64)mask_hi << (32 - PAGE_SHIFT) | mask_lo >> PAGE_SHIFT; + mask = size_or_mask | tmp; + + /* Expand tmp with high bits to all 1s: */ + hi = fls64(tmp); + if (hi > 0) { + tmp |= ~((1ULL<<(hi - 1)) - 1); + + if (tmp != mask) { + printk(KERN_WARNING "mtrr: your BIOS has configured an incorrect mask, fixing it.\n"); + add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); + mask = tmp; + } + } + + /* + * This works correctly if size is a power of two, i.e. a + * contiguous range: + */ + *size = -mask; + *base = (u64)base_hi << (32 - PAGE_SHIFT) | base_lo >> PAGE_SHIFT; + *type = base_lo & 0xff; + +out_put_cpu: + put_cpu(); +} + +/** + * set_fixed_ranges - checks & updates the fixed-range MTRRs if they + * differ from the saved set + * @frs: pointer to fixed-range MTRR values, saved by get_fixed_ranges() + */ +static int set_fixed_ranges(mtrr_type *frs) +{ + unsigned long long *saved = (unsigned long long *)frs; + bool changed = false; + int block = -1, range; + + k8_check_syscfg_dram_mod_en(); + + while (fixed_range_blocks[++block].ranges) { + for (range = 0; range < fixed_range_blocks[block].ranges; range++) + set_fixed_range(fixed_range_blocks[block].base_msr + range, + &changed, (unsigned int *)saved++); + } + + return changed; +} + +/* + * Set the MSR pair relating to a var range. + * Returns true if changes are made. + */ +static bool set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr) +{ + unsigned int lo, hi; + bool changed = false; + + rdmsr(MTRRphysBase_MSR(index), lo, hi); + if ((vr->base_lo & 0xfffff0ffUL) != (lo & 0xfffff0ffUL) + || (vr->base_hi & (size_and_mask >> (32 - PAGE_SHIFT))) != + (hi & (size_and_mask >> (32 - PAGE_SHIFT)))) { + + mtrr_wrmsr(MTRRphysBase_MSR(index), vr->base_lo, vr->base_hi); + changed = true; + } + + rdmsr(MTRRphysMask_MSR(index), lo, hi); + + if ((vr->mask_lo & 0xfffff800UL) != (lo & 0xfffff800UL) + || (vr->mask_hi & (size_and_mask >> (32 - PAGE_SHIFT))) != + (hi & (size_and_mask >> (32 - PAGE_SHIFT)))) { + mtrr_wrmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi); + changed = true; + } + return changed; +} + +static u32 deftype_lo, deftype_hi; + +/** + * set_mtrr_state - Set the MTRR state for this CPU. + * + * NOTE: The CPU must already be in a safe state for MTRR changes. + * RETURNS: 0 if no changes made, else a mask indicating what was changed. 
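+ *
+ * The mask is assembled from MTRR_CHANGE_MASK_VARIABLE,
+ * MTRR_CHANGE_MASK_FIXED and MTRR_CHANGE_MASK_DEFTYPE, so the caller can
+ * tell which class of register disagreed with the saved state.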
+ */ +static unsigned long set_mtrr_state(void) +{ + unsigned long change_mask = 0; + unsigned int i; + + for (i = 0; i < num_var_ranges; i++) { + if (set_mtrr_var_ranges(i, &mtrr_state.var_ranges[i])) + change_mask |= MTRR_CHANGE_MASK_VARIABLE; + } + + if (mtrr_state.have_fixed && set_fixed_ranges(mtrr_state.fixed_ranges)) + change_mask |= MTRR_CHANGE_MASK_FIXED; + + /* + * Set_mtrr_restore restores the old value of MTRRdefType, + * so to set it we fiddle with the saved value: + */ + if ((deftype_lo & 0xff) != mtrr_state.def_type + || ((deftype_lo & 0xc00) >> 10) != mtrr_state.enabled) { + + deftype_lo = (deftype_lo & ~0xcff) | mtrr_state.def_type | + (mtrr_state.enabled << 10); + change_mask |= MTRR_CHANGE_MASK_DEFTYPE; + } + + return change_mask; +} + + +static unsigned long cr4; +static DEFINE_RAW_SPINLOCK(set_atomicity_lock); + +/* + * Since we are disabling the cache don't allow any interrupts, + * they would run extremely slow and would only increase the pain. + * + * The caller must ensure that local interrupts are disabled and + * are reenabled after post_set() has been called. + */ +static void prepare_set(void) __acquires(set_atomicity_lock) +{ + unsigned long cr0; + + /* + * Note that this is not ideal + * since the cache is only flushed/disabled for this CPU while the + * MTRRs are changed, but changing this requires more invasive + * changes to the way the kernel boots + */ + + raw_spin_lock(&set_atomicity_lock); + + /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */ + cr0 = read_cr0() | X86_CR0_CD; + write_cr0(cr0); + wbinvd(); + + /* Save value of CR4 and clear Page Global Enable (bit 7) */ + if (cpu_has_pge) { + cr4 = read_cr4(); + write_cr4(cr4 & ~X86_CR4_PGE); + } + + /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */ + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); + __flush_tlb(); + + /* Save MTRR state */ + rdmsr(MSR_MTRRdefType, deftype_lo, deftype_hi); + + /* Disable MTRRs, and set the default type to uncached */ + mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi); + wbinvd(); +} + +static void post_set(void) __releases(set_atomicity_lock) +{ + /* Flush TLBs (no need to flush caches - they are disabled) */ + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); + __flush_tlb(); + + /* Intel (P6) standard MTRRs */ + mtrr_wrmsr(MSR_MTRRdefType, deftype_lo, deftype_hi); + + /* Enable caches */ + write_cr0(read_cr0() & ~X86_CR0_CD); + + /* Restore value of CR4 */ + if (cpu_has_pge) + write_cr4(cr4); + raw_spin_unlock(&set_atomicity_lock); +} + +static void generic_set_all(void) +{ + unsigned long mask, count; + unsigned long flags; + + local_irq_save(flags); + prepare_set(); + + /* Actually set the state */ + mask = set_mtrr_state(); + + /* also set PAT */ + pat_init(); + + post_set(); + local_irq_restore(flags); + + /* Use the atomic bitops to update the global mask */ + for (count = 0; count < sizeof mask * 8; ++count) { + if (mask & 0x01) + set_bit(count, &smp_changes_mask); + mask >>= 1; + } + +} + +/** + * generic_set_mtrr - set variable MTRR register on the local CPU. + * + * @reg: The register to set. + * @base: The base address of the region. + * @size: The size of the region. If this is 0 the region is disabled. + * @type: The type of the region. + * + * Returns nothing. 
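+ * + * Worked example (illustrative values only): a 64 MiB write-combining + * region at 0xf8000000 arrives here as base = 0xf8000 and size = 0x4000 + * in 4 kB pages with type = 1, and the code below encodes it as + * base_lo = 0xf8000001 and mask_lo = 0xfc000800 (valid bit 11 set).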
+ */ +static void generic_set_mtrr(unsigned int reg, unsigned long base, + unsigned long size, mtrr_type type) +{ + unsigned long flags; + struct mtrr_var_range *vr; + + vr = &mtrr_state.var_ranges[reg]; + + local_irq_save(flags); + prepare_set(); + + if (size == 0) { + /* + * The invalid bit is kept in the mask, so we simply + * clear the relevant mask register to disable a range. + */ + mtrr_wrmsr(MTRRphysMask_MSR(reg), 0, 0); + memset(vr, 0, sizeof(struct mtrr_var_range)); + } else { + vr->base_lo = base << PAGE_SHIFT | type; + vr->base_hi = (base & size_and_mask) >> (32 - PAGE_SHIFT); + vr->mask_lo = -size << PAGE_SHIFT | 0x800; + vr->mask_hi = (-size & size_and_mask) >> (32 - PAGE_SHIFT); + + mtrr_wrmsr(MTRRphysBase_MSR(reg), vr->base_lo, vr->base_hi); + mtrr_wrmsr(MTRRphysMask_MSR(reg), vr->mask_lo, vr->mask_hi); + } + + post_set(); + local_irq_restore(flags); +} + +int generic_validate_add_page(unsigned long base, unsigned long size, + unsigned int type) +{ + unsigned long lbase, last; + + /* + * For Intel PPro stepping <= 7 + * must be 4 MiB aligned and not touch 0x70000000 -> 0x7003FFFF + */ + if (is_cpu(INTEL) && boot_cpu_data.x86 == 6 && + boot_cpu_data.x86_model == 1 && + boot_cpu_data.x86_mask <= 7) { + if (base & ((1 << (22 - PAGE_SHIFT)) - 1)) { + pr_warning("mtrr: base(0x%lx000) is not 4 MiB aligned\n", base); + return -EINVAL; + } + if (!(base + size < 0x70000 || base > 0x7003F) && + (type == MTRR_TYPE_WRCOMB + || type == MTRR_TYPE_WRBACK)) { + pr_warning("mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n"); + return -EINVAL; + } + } + + /* + * Check upper bits of base and last are equal and lower bits are 0 + * for base and 1 for last + */ + last = base + size - 1; + for (lbase = base; !(lbase & 1) && (last & 1); + lbase = lbase >> 1, last = last >> 1) + ; + if (lbase != last) { + pr_warning("mtrr: base(0x%lx000) is not aligned on a size(0x%lx000) boundary\n", base, size); + return -EINVAL; + } + return 0; +} + +static int generic_have_wrcomb(void) +{ + unsigned long config, dummy; + rdmsr(MSR_MTRRcap, config, dummy); + return config & (1 << 10); +} + +int positive_have_wrcomb(void) +{ + return 1; +} + +/* + * Generic structure... + */ +const struct mtrr_ops generic_mtrr_ops = { + .use_intel_if = 1, + .set_all = generic_set_all, + .get = generic_get_mtrr, + .get_free_region = generic_get_free_region, + .set = generic_set_mtrr, + .validate_add_page = generic_validate_add_page, + .have_wrcomb = generic_have_wrcomb, +}; diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c new file mode 100644 index 00000000000..a041e094b8b --- /dev/null +++ b/arch/x86/kernel/cpu/mtrr/if.c @@ -0,0 +1,451 @@ +#include <linux/capability.h> +#include <linux/seq_file.h> +#include <linux/uaccess.h> +#include <linux/proc_fs.h> +#include <linux/module.h> +#include <linux/ctype.h> +#include <linux/string.h> +#include <linux/slab.h> +#include <linux/init.h> + +#define LINE_SIZE 80 + +#include <asm/mtrr.h> + +#include "mtrr.h" + +#define FILE_FCOUNT(f) (((struct seq_file *)((f)->private_data))->private) + +static const char *const mtrr_strings[MTRR_NUM_TYPES] = +{ + "uncachable", /* 0 */ + "write-combining", /* 1 */ + "?", /* 2 */ + "?", /* 3 */ + "write-through", /* 4 */ + "write-protect", /* 5 */ + "write-back", /* 6 */ +}; + +const char *mtrr_attrib_to_str(int x) +{ + return (x <= 6) ? 
mtrr_strings[x] : "?"; +} + +#ifdef CONFIG_PROC_FS + +static int +mtrr_file_add(unsigned long base, unsigned long size, + unsigned int type, bool increment, struct file *file, int page) +{ + unsigned int *fcount = FILE_FCOUNT(file); + int reg, max; + + max = num_var_ranges; + if (fcount == NULL) { + fcount = kzalloc(max * sizeof *fcount, GFP_KERNEL); + if (!fcount) + return -ENOMEM; + FILE_FCOUNT(file) = fcount; + } + if (!page) { + if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) + return -EINVAL; + base >>= PAGE_SHIFT; + size >>= PAGE_SHIFT; + } + reg = mtrr_add_page(base, size, type, true); + if (reg >= 0) + ++fcount[reg]; + return reg; +} + +static int +mtrr_file_del(unsigned long base, unsigned long size, + struct file *file, int page) +{ + unsigned int *fcount = FILE_FCOUNT(file); + int reg; + + if (!page) { + if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) + return -EINVAL; + base >>= PAGE_SHIFT; + size >>= PAGE_SHIFT; + } + reg = mtrr_del_page(-1, base, size); + if (reg < 0) + return reg; + if (fcount == NULL) + return reg; + if (fcount[reg] < 1) + return -EINVAL; + --fcount[reg]; + return reg; +} + +/* + * seq_file can seek but we ignore it. + * + * Format of control line: + * "base=%Lx size=%Lx type=%s" or "disable=%d" + */ +static ssize_t +mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos) +{ + int i, err; + unsigned long reg; + unsigned long long base, size; + char *ptr; + char line[LINE_SIZE]; + int length; + size_t linelen; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + memset(line, 0, LINE_SIZE); + + length = len; + length--; + + if (length > LINE_SIZE - 1) + length = LINE_SIZE - 1; + + if (length < 0) + return -EINVAL; + + if (copy_from_user(line, buf, length)) + return -EFAULT; + + linelen = strlen(line); + ptr = line + linelen - 1; + if (linelen && *ptr == '\n') + *ptr = '\0'; + + if (!strncmp(line, "disable=", 8)) { + reg = simple_strtoul(line + 8, &ptr, 0); + err = mtrr_del_page(reg, 0, 0); + if (err < 0) + return err; + return len; + } + + if (strncmp(line, "base=", 5)) + return -EINVAL; + + base = simple_strtoull(line + 5, &ptr, 0); + ptr = skip_spaces(ptr); + + if (strncmp(ptr, "size=", 5)) + return -EINVAL; + + size = simple_strtoull(ptr + 5, &ptr, 0); + if ((base & 0xfff) || (size & 0xfff)) + return -EINVAL; + ptr = skip_spaces(ptr); + + if (strncmp(ptr, "type=", 5)) + return -EINVAL; + ptr = skip_spaces(ptr + 5); + + for (i = 0; i < MTRR_NUM_TYPES; ++i) { + if (strcmp(ptr, mtrr_strings[i])) + continue; + base >>= PAGE_SHIFT; + size >>= PAGE_SHIFT; + err = mtrr_add_page((unsigned long)base, (unsigned long)size, i, true); + if (err < 0) + return err; + return len; + } + return -EINVAL; +} + +static long +mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) +{ + int err = 0; + mtrr_type type; + unsigned long base; + unsigned long size; + struct mtrr_sentry sentry; + struct mtrr_gentry gentry; + void __user *arg = (void __user *) __arg; + + switch (cmd) { + case MTRRIOC_ADD_ENTRY: + case MTRRIOC_SET_ENTRY: + case MTRRIOC_DEL_ENTRY: + case MTRRIOC_KILL_ENTRY: + case MTRRIOC_ADD_PAGE_ENTRY: + case MTRRIOC_SET_PAGE_ENTRY: + case MTRRIOC_DEL_PAGE_ENTRY: + case MTRRIOC_KILL_PAGE_ENTRY: + if (copy_from_user(&sentry, arg, sizeof sentry)) + return -EFAULT; + break; + case MTRRIOC_GET_ENTRY: + case MTRRIOC_GET_PAGE_ENTRY: + if (copy_from_user(&gentry, arg, sizeof gentry)) + return -EFAULT; + break; +#ifdef CONFIG_COMPAT + case MTRRIOC32_ADD_ENTRY: + case MTRRIOC32_SET_ENTRY: + case MTRRIOC32_DEL_ENTRY: + 
case MTRRIOC32_KILL_ENTRY: + case MTRRIOC32_ADD_PAGE_ENTRY: + case MTRRIOC32_SET_PAGE_ENTRY: + case MTRRIOC32_DEL_PAGE_ENTRY: + case MTRRIOC32_KILL_PAGE_ENTRY: { + struct mtrr_sentry32 __user *s32; + + s32 = (struct mtrr_sentry32 __user *)__arg; + err = get_user(sentry.base, &s32->base); + err |= get_user(sentry.size, &s32->size); + err |= get_user(sentry.type, &s32->type); + if (err) + return err; + break; + } + case MTRRIOC32_GET_ENTRY: + case MTRRIOC32_GET_PAGE_ENTRY: { + struct mtrr_gentry32 __user *g32; + + g32 = (struct mtrr_gentry32 __user *)__arg; + err = get_user(gentry.regnum, &g32->regnum); + err |= get_user(gentry.base, &g32->base); + err |= get_user(gentry.size, &g32->size); + err |= get_user(gentry.type, &g32->type); + if (err) + return err; + break; + } +#endif + } + + switch (cmd) { + default: + return -ENOTTY; + case MTRRIOC_ADD_ENTRY: +#ifdef CONFIG_COMPAT + case MTRRIOC32_ADD_ENTRY: +#endif + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + err = + mtrr_file_add(sentry.base, sentry.size, sentry.type, true, + file, 0); + break; + case MTRRIOC_SET_ENTRY: +#ifdef CONFIG_COMPAT + case MTRRIOC32_SET_ENTRY: +#endif + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + err = mtrr_add(sentry.base, sentry.size, sentry.type, false); + break; + case MTRRIOC_DEL_ENTRY: +#ifdef CONFIG_COMPAT + case MTRRIOC32_DEL_ENTRY: +#endif + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + err = mtrr_file_del(sentry.base, sentry.size, file, 0); + break; + case MTRRIOC_KILL_ENTRY: +#ifdef CONFIG_COMPAT + case MTRRIOC32_KILL_ENTRY: +#endif + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + err = mtrr_del(-1, sentry.base, sentry.size); + break; + case MTRRIOC_GET_ENTRY: +#ifdef CONFIG_COMPAT + case MTRRIOC32_GET_ENTRY: +#endif + if (gentry.regnum >= num_var_ranges) + return -EINVAL; + mtrr_if->get(gentry.regnum, &base, &size, &type); + + /* Hide entries that go above 4GB */ + if (base + size - 1 >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT)) + || size >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT))) + gentry.base = gentry.size = gentry.type = 0; + else { + gentry.base = base << PAGE_SHIFT; + gentry.size = size << PAGE_SHIFT; + gentry.type = type; + } + + break; + case MTRRIOC_ADD_PAGE_ENTRY: +#ifdef CONFIG_COMPAT + case MTRRIOC32_ADD_PAGE_ENTRY: +#endif + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + err = + mtrr_file_add(sentry.base, sentry.size, sentry.type, true, + file, 1); + break; + case MTRRIOC_SET_PAGE_ENTRY: +#ifdef CONFIG_COMPAT + case MTRRIOC32_SET_PAGE_ENTRY: +#endif + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + err = + mtrr_add_page(sentry.base, sentry.size, sentry.type, false); + break; + case MTRRIOC_DEL_PAGE_ENTRY: +#ifdef CONFIG_COMPAT + case MTRRIOC32_DEL_PAGE_ENTRY: +#endif + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + err = mtrr_file_del(sentry.base, sentry.size, file, 1); + break; + case MTRRIOC_KILL_PAGE_ENTRY: +#ifdef CONFIG_COMPAT + case MTRRIOC32_KILL_PAGE_ENTRY: +#endif + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + err = mtrr_del_page(-1, sentry.base, sentry.size); + break; + case MTRRIOC_GET_PAGE_ENTRY: +#ifdef CONFIG_COMPAT + case MTRRIOC32_GET_PAGE_ENTRY: +#endif + if (gentry.regnum >= num_var_ranges) + return -EINVAL; + mtrr_if->get(gentry.regnum, &base, &size, &type); + /* Hide entries that would overflow */ + if (size != (__typeof__(gentry.size))size) + gentry.base = gentry.size = gentry.type = 0; + else { + gentry.base = base; + gentry.size = size; + gentry.type = type; + } + break; + } + + if (err) + return err; + + switch (cmd) { + case 
MTRRIOC_GET_ENTRY: + case MTRRIOC_GET_PAGE_ENTRY: + if (copy_to_user(arg, &gentry, sizeof gentry)) + err = -EFAULT; + break; +#ifdef CONFIG_COMPAT + case MTRRIOC32_GET_ENTRY: + case MTRRIOC32_GET_PAGE_ENTRY: { + struct mtrr_gentry32 __user *g32; + + g32 = (struct mtrr_gentry32 __user *)__arg; + err = put_user(gentry.base, &g32->base); + err |= put_user(gentry.size, &g32->size); + err |= put_user(gentry.regnum, &g32->regnum); + err |= put_user(gentry.type, &g32->type); + break; + } +#endif + } + return err; +} + +static int mtrr_close(struct inode *ino, struct file *file) +{ + unsigned int *fcount = FILE_FCOUNT(file); + int i, max; + + if (fcount != NULL) { + max = num_var_ranges; + for (i = 0; i < max; ++i) { + while (fcount[i] > 0) { + mtrr_del(i, 0, 0); + --fcount[i]; + } + } + kfree(fcount); + FILE_FCOUNT(file) = NULL; + } + return single_release(ino, file); +} + +static int mtrr_seq_show(struct seq_file *seq, void *offset); + +static int mtrr_open(struct inode *inode, struct file *file) +{ + if (!mtrr_if) + return -EIO; + if (!mtrr_if->get) + return -ENXIO; + return single_open(file, mtrr_seq_show, NULL); +} + +static const struct file_operations mtrr_fops = { + .owner = THIS_MODULE, + .open = mtrr_open, + .read = seq_read, + .llseek = seq_lseek, + .write = mtrr_write, + .unlocked_ioctl = mtrr_ioctl, + .compat_ioctl = mtrr_ioctl, + .release = mtrr_close, +}; + +static int mtrr_seq_show(struct seq_file *seq, void *offset) +{ + char factor; + int i, max, len; + mtrr_type type; + unsigned long base, size; + + len = 0; + max = num_var_ranges; + for (i = 0; i < max; i++) { + mtrr_if->get(i, &base, &size, &type); + if (size == 0) { + mtrr_usage_table[i] = 0; + continue; + } + if (size < (0x100000 >> PAGE_SHIFT)) { + /* less than 1MB */ + factor = 'K'; + size <<= PAGE_SHIFT - 10; + } else { + factor = 'M'; + size >>= 20 - PAGE_SHIFT; + } + /* Base can be > 32bit */ + len += seq_printf(seq, "reg%02i: base=0x%06lx000 " + "(%5luMB), size=%5lu%cB, count=%d: %s\n", + i, base, base >> (20 - PAGE_SHIFT), size, + factor, mtrr_usage_table[i], + mtrr_attrib_to_str(type)); + } + return 0; +} + +static int __init mtrr_if_init(void) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + + if ((!cpu_has(c, X86_FEATURE_MTRR)) && + (!cpu_has(c, X86_FEATURE_K6_MTRR)) && + (!cpu_has(c, X86_FEATURE_CYRIX_ARR)) && + (!cpu_has(c, X86_FEATURE_CENTAUR_MCR))) + return -ENODEV; + + proc_create("mtrr", S_IWUSR | S_IRUGO, NULL, &mtrr_fops); + return 0; +} +arch_initcall(mtrr_if_init); +#endif /* CONFIG_PROC_FS */ diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c new file mode 100644 index 00000000000..f961de9964c --- /dev/null +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -0,0 +1,842 @@ +/* Generic MTRR (Memory Type Range Register) driver. + + Copyright (C) 1997-2000 Richard Gooch + Copyright (c) 2002 Patrick Mochel + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. 
+ + You should have received a copy of the GNU Library General Public + License along with this library; if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + Richard Gooch may be reached by email at rgooch@atnf.csiro.au + The postal address is: + Richard Gooch, c/o ATNF, P. O. Box 76, Epping, N.S.W., 2121, Australia. + + Source: "Pentium Pro Family Developer's Manual, Volume 3: + Operating System Writer's Guide" (Intel document number 242692), + section 11.11.7 + + This was cleaned and made readable by Patrick Mochel <mochel@osdl.org> + on 6-7 March 2002. + Source: Intel Architecture Software Developers Manual, Volume 3: + System Programming Guide; Section 9.11. (1997 edition - PPro). +*/ + +#define DEBUG + +#include <linux/types.h> /* FIXME: kvm_para.h needs this */ + +#include <linux/stop_machine.h> +#include <linux/kvm_para.h> +#include <linux/uaccess.h> +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/init.h> +#include <linux/sort.h> +#include <linux/cpu.h> +#include <linux/pci.h> +#include <linux/smp.h> +#include <linux/syscore_ops.h> + +#include <asm/processor.h> +#include <asm/e820.h> +#include <asm/mtrr.h> +#include <asm/msr.h> +#include <asm/pat.h> + +#include "mtrr.h" + +/* arch_phys_wc_add returns an MTRR register index plus this offset. */ +#define MTRR_TO_PHYS_WC_OFFSET 1000 + +u32 num_var_ranges; + +unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; +static DEFINE_MUTEX(mtrr_mutex); + +u64 size_or_mask, size_and_mask; +static bool mtrr_aps_delayed_init; + +static const struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM]; + +const struct mtrr_ops *mtrr_if; + +static void set_mtrr(unsigned int reg, unsigned long base, + unsigned long size, mtrr_type type); + +void set_mtrr_ops(const struct mtrr_ops *ops) +{ + if (ops->vendor && ops->vendor < X86_VENDOR_NUM) + mtrr_ops[ops->vendor] = ops; +} + +/* Returns non-zero if we have the write-combining memory type */ +static int have_wrcomb(void) +{ + struct pci_dev *dev; + + dev = pci_get_class(PCI_CLASS_BRIDGE_HOST << 8, NULL); + if (dev != NULL) { + /* + * ServerWorks LE chipsets < rev 6 have problems with + * write-combining. Don't allow it and leave room for other + * chipsets to be tagged. + */ + if (dev->vendor == PCI_VENDOR_ID_SERVERWORKS && + dev->device == PCI_DEVICE_ID_SERVERWORKS_LE && + dev->revision <= 5) { + pr_info("mtrr: Serverworks LE rev < 6 detected. Write-combining disabled.\n"); + pci_dev_put(dev); + return 0; + } + /* + * Intel 450NX erratum #23: non-ascending cacheline evictions to + * write-combining memory may result in data corruption. + */ + if (dev->vendor == PCI_VENDOR_ID_INTEL && + dev->device == PCI_DEVICE_ID_INTEL_82451NX) { + pr_info("mtrr: Intel 450NX MMC detected. Write-combining disabled.\n"); + pci_dev_put(dev); + return 0; + } + pci_dev_put(dev); + } + return mtrr_if->have_wrcomb ?
mtrr_if->have_wrcomb() : 0; +} + +/* Determine the number of variable MTRRs and record it in num_var_ranges */ +static void __init set_num_var_ranges(void) +{ + unsigned long config = 0, dummy; + + if (use_intel()) + rdmsr(MSR_MTRRcap, config, dummy); + else if (is_cpu(AMD)) + config = 2; + else if (is_cpu(CYRIX) || is_cpu(CENTAUR)) + config = 8; + + num_var_ranges = config & 0xff; +} + +static void __init init_table(void) +{ + int i, max; + + max = num_var_ranges; + for (i = 0; i < max; i++) + mtrr_usage_table[i] = 1; +} + +struct set_mtrr_data { + unsigned long smp_base; + unsigned long smp_size; + unsigned int smp_reg; + mtrr_type smp_type; +}; + +/** + * mtrr_rendezvous_handler - Work done in the synchronization handler. Executed + * by all the CPUs. + * @info: pointer to mtrr configuration data + * + * Returns nothing. + */ +static int mtrr_rendezvous_handler(void *info) +{ + struct set_mtrr_data *data = info; + + /* + * We use this same function to initialize the mtrrs during boot, + * resume, runtime cpu online and on an explicit request to set a + * specific MTRR. + * + * During boot or suspend, the state of the boot cpu's mtrrs has been + * saved, and we want to replicate that across all the cpus that come + * online (either at the end of boot or resume or during a runtime cpu + * online). If we're doing that, @reg is set to something special and on + * all the cpu's we do mtrr_if->set_all() (On the logical cpu that + * started the boot/resume sequence, this might be a duplicate + * set_all()). + */ + if (data->smp_reg != ~0U) { + mtrr_if->set(data->smp_reg, data->smp_base, + data->smp_size, data->smp_type); + } else if (mtrr_aps_delayed_init || !cpu_online(smp_processor_id())) { + mtrr_if->set_all(); + } + return 0; +} + +static inline int types_compatible(mtrr_type type1, mtrr_type type2) +{ + return type1 == MTRR_TYPE_UNCACHABLE || + type2 == MTRR_TYPE_UNCACHABLE || + (type1 == MTRR_TYPE_WRTHROUGH && type2 == MTRR_TYPE_WRBACK) || + (type1 == MTRR_TYPE_WRBACK && type2 == MTRR_TYPE_WRTHROUGH); +} + +/** + * set_mtrr - update mtrrs on all processors + * @reg: mtrr in question + * @base: mtrr base + * @size: mtrr size + * @type: mtrr type + * + * This is kinda tricky, but fortunately, Intel spelled it out for us cleanly: + * + * 1. Queue work to do the following on all processors: + * 2. Disable Interrupts + * 3. Wait for all procs to do so + * 4. Enter no-fill cache mode + * 5. Flush caches + * 6. Clear PGE bit + * 7. Flush all TLBs + * 8. Disable all range registers + * 9. Update the MTRRs + * 10. Enable all range registers + * 11. Flush all TLBs and caches again + * 12. Enter normal cache mode and reenable caching + * 13. Set PGE + * 14. Wait for buddies to catch up + * 15. Enable interrupts. + * + * What does that mean for us? Well, stop_machine() will ensure that + * the rendezvous handler is started on each CPU. And in lockstep they + * do the state transition of disabling interrupts, updating MTRR's + * (the CPU vendors may each do it differently, so we call mtrr_if->set() + * callback and let them take care of it.) and enabling interrupts. + * + * Note that the mechanism is the same for UP systems, too; all the SMP stuff + * becomes nops.
+ */ +static void +set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type) +{ + struct set_mtrr_data data = { .smp_reg = reg, + .smp_base = base, + .smp_size = size, + .smp_type = type + }; + + stop_machine(mtrr_rendezvous_handler, &data, cpu_online_mask); +} + +static void set_mtrr_from_inactive_cpu(unsigned int reg, unsigned long base, + unsigned long size, mtrr_type type) +{ + struct set_mtrr_data data = { .smp_reg = reg, + .smp_base = base, + .smp_size = size, + .smp_type = type + }; + + stop_machine_from_inactive_cpu(mtrr_rendezvous_handler, &data, + cpu_callout_mask); +} + +/** + * mtrr_add_page - Add a memory type region + * @base: Physical base address of region in pages (in units of 4 kB!) + * @size: Physical size of region in pages (4 kB) + * @type: Type of MTRR desired + * @increment: If this is true do usage counting on the region + * + * Memory type region registers control the caching on newer Intel and + * non Intel processors. This function allows drivers to request an + * MTRR is added. The details and hardware specifics of each processor's + * implementation are hidden from the caller, but nevertheless the + * caller should expect to need to provide a power of two size on an + * equivalent power of two boundary. + * + * If the region cannot be added either because all regions are in use + * or the CPU cannot support it a negative value is returned. On success + * the register number for this entry is returned, but should be treated + * as a cookie only. + * + * On a multiprocessor machine the changes are made to all processors. + * This is required on x86 by the Intel processors. + * + * The available types are + * + * %MTRR_TYPE_UNCACHABLE - No caching + * + * %MTRR_TYPE_WRBACK - Write data back in bursts whenever + * + * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts + * + * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes + * + * BUGS: Needs a quiet flag for the cases where drivers do not mind + * failures and do not wish system log messages to be sent. 
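+ * + * Illustrative call (hypothetical values): to map 4 MiB at 2 GiB as + * write-combining, a driver passes page units: + * + *	reg = mtrr_add_page(0x80000000 >> PAGE_SHIFT, + *			    0x400000 >> PAGE_SHIFT, + *			    MTRR_TYPE_WRCOMB, true); + * + * and later releases the range with mtrr_del_page(reg, base, size), + * where base and size are ignored because the register is supplied.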
+ */ +int mtrr_add_page(unsigned long base, unsigned long size, + unsigned int type, bool increment) +{ + unsigned long lbase, lsize; + int i, replace, error; + mtrr_type ltype; + + if (!mtrr_if) + return -ENXIO; + + error = mtrr_if->validate_add_page(base, size, type); + if (error) + return error; + + if (type >= MTRR_NUM_TYPES) { + pr_warning("mtrr: type: %u invalid\n", type); + return -EINVAL; + } + + /* If the type is WC, check that this processor supports it */ + if ((type == MTRR_TYPE_WRCOMB) && !have_wrcomb()) { + pr_warning("mtrr: your processor doesn't support write-combining\n"); + return -ENOSYS; + } + + if (!size) { + pr_warning("mtrr: zero sized request\n"); + return -EINVAL; + } + + if ((base | (base + size - 1)) >> + (boot_cpu_data.x86_phys_bits - PAGE_SHIFT)) { + pr_warning("mtrr: base or size exceeds the MTRR width\n"); + return -EINVAL; + } + + error = -EINVAL; + replace = -1; + + /* No CPU hotplug when we change MTRR entries */ + get_online_cpus(); + + /* Search for existing MTRR */ + mutex_lock(&mtrr_mutex); + for (i = 0; i < num_var_ranges; ++i) { + mtrr_if->get(i, &lbase, &lsize, &ltype); + if (!lsize || base > lbase + lsize - 1 || + base + size - 1 < lbase) + continue; + /* + * At this point we know there is some kind of + * overlap/enclosure + */ + if (base < lbase || base + size - 1 > lbase + lsize - 1) { + if (base <= lbase && + base + size - 1 >= lbase + lsize - 1) { + /* New region encloses an existing region */ + if (type == ltype) { + replace = replace == -1 ? i : -2; + continue; + } else if (types_compatible(type, ltype)) + continue; + } + pr_warning("mtrr: 0x%lx000,0x%lx000 overlaps existing" + " 0x%lx000,0x%lx000\n", base, size, lbase, + lsize); + goto out; + } + /* New region is enclosed by an existing region */ + if (ltype != type) { + if (types_compatible(type, ltype)) + continue; + pr_warning("mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n", + base, size, mtrr_attrib_to_str(ltype), + mtrr_attrib_to_str(type)); + goto out; + } + if (increment) + ++mtrr_usage_table[i]; + error = i; + goto out; + } + /* Search for an empty MTRR */ + i = mtrr_if->get_free_region(base, size, replace); + if (i >= 0) { + set_mtrr(i, base, size, type); + if (likely(replace < 0)) { + mtrr_usage_table[i] = 1; + } else { + mtrr_usage_table[i] = mtrr_usage_table[replace]; + if (increment) + mtrr_usage_table[i]++; + if (unlikely(replace != i)) { + set_mtrr(replace, 0, 0, 0); + mtrr_usage_table[replace] = 0; + } + } + } else { + pr_info("mtrr: no more MTRRs available\n"); + } + error = i; + out: + mutex_unlock(&mtrr_mutex); + put_online_cpus(); + return error; +} + +static int mtrr_check(unsigned long base, unsigned long size) +{ + if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) { + pr_warning("mtrr: size and base must be multiples of 4 kiB\n"); + pr_debug("mtrr: size: 0x%lx base: 0x%lx\n", size, base); + dump_stack(); + return -1; + } + return 0; +} + +/** + * mtrr_add - Add a memory type region + * @base: Physical base address of region + * @size: Physical size of region + * @type: Type of MTRR desired + * @increment: If this is true do usage counting on the region + * + * Memory type region registers control the caching on newer Intel and + * non Intel processors. This function allows drivers to request an + * MTRR is added. The details and hardware specifics of each processor's + * implementation are hidden from the caller, but nevertheless the + * caller should expect to need to provide a power of two size on an + * equivalent power of two boundary.
+ * + * If the region cannot be added either because all regions are in use + * or the CPU cannot support it a negative value is returned. On success + * the register number for this entry is returned, but should be treated + * as a cookie only. + * + * On a multiprocessor machine the changes are made to all processors. + * This is required on x86 by the Intel processors. + * + * The available types are + * + * %MTRR_TYPE_UNCACHABLE - No caching + * + * %MTRR_TYPE_WRBACK - Write data back in bursts whenever + * + * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts + * + * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes + * + * BUGS: Needs a quiet flag for the cases where drivers do not mind + * failures and do not wish system log messages to be sent. + */ +int mtrr_add(unsigned long base, unsigned long size, unsigned int type, + bool increment) +{ + if (mtrr_check(base, size)) + return -EINVAL; + return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type, + increment); +} +EXPORT_SYMBOL(mtrr_add); + +/** + * mtrr_del_page - delete a memory type region + * @reg: Register returned by mtrr_add + * @base: Physical base address + * @size: Size of region + * + * If register is supplied then base and size are ignored. This is + * how drivers should call it. + * + * Releases an MTRR region. If the usage count drops to zero the + * register is freed and the region returns to default state. + * On success the register is returned, on failure a negative error + * code. + */ +int mtrr_del_page(int reg, unsigned long base, unsigned long size) +{ + int i, max; + mtrr_type ltype; + unsigned long lbase, lsize; + int error = -EINVAL; + + if (!mtrr_if) + return -ENXIO; + + max = num_var_ranges; + /* No CPU hotplug when we change MTRR entries */ + get_online_cpus(); + mutex_lock(&mtrr_mutex); + if (reg < 0) { + /* Search for existing MTRR */ + for (i = 0; i < max; ++i) { + mtrr_if->get(i, &lbase, &lsize, &ltype); + if (lbase == base && lsize == size) { + reg = i; + break; + } + } + if (reg < 0) { + pr_debug("mtrr: no MTRR for %lx000,%lx000 found\n", + base, size); + goto out; + } + } + if (reg >= max) { + pr_warning("mtrr: register: %d too big\n", reg); + goto out; + } + mtrr_if->get(reg, &lbase, &lsize, &ltype); + if (lsize < 1) { + pr_warning("mtrr: MTRR %d not used\n", reg); + goto out; + } + if (mtrr_usage_table[reg] < 1) { + pr_warning("mtrr: reg: %d has count=0\n", reg); + goto out; + } + if (--mtrr_usage_table[reg] < 1) + set_mtrr(reg, 0, 0, 0); + error = reg; + out: + mutex_unlock(&mtrr_mutex); + put_online_cpus(); + return error; +} + +/** + * mtrr_del - delete a memory type region + * @reg: Register returned by mtrr_add + * @base: Physical base address + * @size: Size of region + * + * If register is supplied then base and size are ignored. This is + * how drivers should call it. + * + * Releases an MTRR region. If the usage count drops to zero the + * register is freed and the region returns to default state. + * On success the register is returned, on failure a negative error + * code. + */ +int mtrr_del(int reg, unsigned long base, unsigned long size) +{ + if (mtrr_check(base, size)) + return -EINVAL; + return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT); +} +EXPORT_SYMBOL(mtrr_del); + +/** + * arch_phys_wc_add - add a WC MTRR and handle errors if PAT is unavailable + * @base: Physical base address + * @size: Size of region + * + * If PAT is available, this does nothing.
If PAT is unavailable, it + * attempts to add a WC MTRR covering size bytes starting at base and + * logs an error if this fails. + * + * Drivers must store the return value to pass to mtrr_del_wc_if_needed, + * but drivers should not try to interpret that return value. + */ +int arch_phys_wc_add(unsigned long base, unsigned long size) +{ + int ret; + + if (pat_enabled) + return 0; /* Success! (We don't need to do anything.) */ + + ret = mtrr_add(base, size, MTRR_TYPE_WRCOMB, true); + if (ret < 0) { + pr_warn("Failed to add WC MTRR for [%p-%p]; performance may suffer.", + (void *)base, (void *)(base + size - 1)); + return ret; + } + return ret + MTRR_TO_PHYS_WC_OFFSET; +} +EXPORT_SYMBOL(arch_phys_wc_add); + +/* + * arch_phys_wc_del - undoes arch_phys_wc_add + * @handle: Return value from arch_phys_wc_add + * + * This cleans up after mtrr_add_wc_if_needed. + * + * The API guarantees that mtrr_del_wc_if_needed(error code) and + * mtrr_del_wc_if_needed(0) do nothing. + */ +void arch_phys_wc_del(int handle) +{ + if (handle >= 1) { + WARN_ON(handle < MTRR_TO_PHYS_WC_OFFSET); + mtrr_del(handle - MTRR_TO_PHYS_WC_OFFSET, 0, 0); + } +} +EXPORT_SYMBOL(arch_phys_wc_del); + +/* + * phys_wc_to_mtrr_index - translates arch_phys_wc_add's return value + * @handle: Return value from arch_phys_wc_add + * + * This will turn the return value from arch_phys_wc_add into an mtrr + * index suitable for debugging. + * + * Note: There is no legitimate use for this function, except possibly + * in printk line. Alas there is an illegitimate use in some ancient + * drm ioctls. + */ +int phys_wc_to_mtrr_index(int handle) +{ + if (handle < MTRR_TO_PHYS_WC_OFFSET) + return -1; + else + return handle - MTRR_TO_PHYS_WC_OFFSET; +} +EXPORT_SYMBOL_GPL(phys_wc_to_mtrr_index); + +/* + * HACK ALERT! + * These should be called implicitly, but we can't yet until all the initcall + * stuff is done... + */ +static void __init init_ifs(void) +{ +#ifndef CONFIG_X86_64 + amd_init_mtrr(); + cyrix_init_mtrr(); + centaur_init_mtrr(); +#endif +} + +/* The suspend/resume methods are only for CPU without MTRR. CPU using generic + * MTRR driver doesn't require this + */ +struct mtrr_value { + mtrr_type ltype; + unsigned long lbase; + unsigned long lsize; +}; + +static struct mtrr_value mtrr_value[MTRR_MAX_VAR_RANGES]; + +static int mtrr_save(void) +{ + int i; + + for (i = 0; i < num_var_ranges; i++) { + mtrr_if->get(i, &mtrr_value[i].lbase, + &mtrr_value[i].lsize, + &mtrr_value[i].ltype); + } + return 0; +} + +static void mtrr_restore(void) +{ + int i; + + for (i = 0; i < num_var_ranges; i++) { + if (mtrr_value[i].lsize) { + set_mtrr(i, mtrr_value[i].lbase, + mtrr_value[i].lsize, + mtrr_value[i].ltype); + } + } +} + + + +static struct syscore_ops mtrr_syscore_ops = { + .suspend = mtrr_save, + .resume = mtrr_restore, +}; + +int __initdata changed_by_mtrr_cleanup; + +#define SIZE_OR_MASK_BITS(n) (~((1ULL << ((n) - PAGE_SHIFT)) - 1)) +/** + * mtrr_bp_init - initialize mtrrs on the boot CPU + * + * This needs to be called early; before any of the other CPUs are + * initialized (i.e. before smp_init()). + * + */ +void __init mtrr_bp_init(void) +{ + u32 phys_addr; + + init_ifs(); + + phys_addr = 32; + + if (cpu_has_mtrr) { + mtrr_if = &generic_mtrr_ops; + size_or_mask = SIZE_OR_MASK_BITS(36); + size_and_mask = 0x00f00000; + phys_addr = 36; + + /* + * This is an AMD specific MSR, but we assume(hope?) that + * Intel will implement it too when they extend the address + * bus of the Xeon. 
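+ * + * CPUID leaf 0x80000008 reports the physical address width in + * EAX[7:0]; the code below uses it to size size_or_mask and + * size_and_mask.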
+ */ + if (cpuid_eax(0x80000000) >= 0x80000008) { + phys_addr = cpuid_eax(0x80000008) & 0xff; + /* CPUID workaround for Intel 0F33/0F34 CPU */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && + boot_cpu_data.x86 == 0xF && + boot_cpu_data.x86_model == 0x3 && + (boot_cpu_data.x86_mask == 0x3 || + boot_cpu_data.x86_mask == 0x4)) + phys_addr = 36; + + size_or_mask = SIZE_OR_MASK_BITS(phys_addr); + size_and_mask = ~size_or_mask & 0xfffff00000ULL; + } else if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR && + boot_cpu_data.x86 == 6) { + /* + * VIA C* family have Intel style MTRRs, + * but don't support PAE + */ + size_or_mask = SIZE_OR_MASK_BITS(32); + size_and_mask = 0; + phys_addr = 32; + } + } else { + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + if (cpu_has_k6_mtrr) { + /* Pre-Athlon (K6) AMD CPU MTRRs */ + mtrr_if = mtrr_ops[X86_VENDOR_AMD]; + size_or_mask = SIZE_OR_MASK_BITS(32); + size_and_mask = 0; + } + break; + case X86_VENDOR_CENTAUR: + if (cpu_has_centaur_mcr) { + mtrr_if = mtrr_ops[X86_VENDOR_CENTAUR]; + size_or_mask = SIZE_OR_MASK_BITS(32); + size_and_mask = 0; + } + break; + case X86_VENDOR_CYRIX: + if (cpu_has_cyrix_arr) { + mtrr_if = mtrr_ops[X86_VENDOR_CYRIX]; + size_or_mask = SIZE_OR_MASK_BITS(32); + size_and_mask = 0; + } + break; + default: + break; + } + } + + if (mtrr_if) { + set_num_var_ranges(); + init_table(); + if (use_intel()) { + get_mtrr_state(); + + if (mtrr_cleanup(phys_addr)) { + changed_by_mtrr_cleanup = 1; + mtrr_if->set_all(); + } + } + } +} + +void mtrr_ap_init(void) +{ + if (!use_intel() || mtrr_aps_delayed_init) + return; + /* + * Ideally we should hold mtrr_mutex here to avoid MTRR entries + * being changed, but this routine is called at CPU boot time, + * and holding the lock there breaks it. + * + * This routine is called in two cases: + * + * 1. very early in software resume, when there are absolutely + * no MTRR entry changes; + * + * 2. CPU hotadd time. We let mtrr_add/del_page hold the cpuhotplug + * lock to prevent MTRR entry changes + */ + set_mtrr_from_inactive_cpu(~0U, 0, 0, 0); +} + +/** + * Save current fixed-range MTRR state of the first CPU in cpu_online_mask. + */ +void mtrr_save_state(void) +{ + int first_cpu; + + get_online_cpus(); + first_cpu = cpumask_first(cpu_online_mask); + smp_call_function_single(first_cpu, mtrr_save_fixed_ranges, NULL, 1); + put_online_cpus(); +} + +void set_mtrr_aps_delayed_init(void) +{ + if (!use_intel()) + return; + + mtrr_aps_delayed_init = true; +} + +/* + * Delayed MTRR initialization for all APs + */ +void mtrr_aps_init(void) +{ + if (!use_intel()) + return; + + /* + * Check if someone has requested the delay of AP MTRR initialization, + * by doing set_mtrr_aps_delayed_init(), prior to this point. If not, + * then we are done. + */ + if (!mtrr_aps_delayed_init) + return; + + set_mtrr(~0U, 0, 0, 0); + mtrr_aps_delayed_init = false; +} + +void mtrr_bp_restore(void) +{ + if (!use_intel()) + return; + + mtrr_if->set_all(); +} + +static int __init mtrr_init_finialize(void) +{ + if (!mtrr_if) + return 0; + + if (use_intel()) { + if (!changed_by_mtrr_cleanup) + mtrr_state_warn(); + return 0; + } + + /* + * The CPU has no MTRR and seems not to support SMP. Such CPUs + * have specific drivers; we use a tricky method to support + * suspend/resume for them. + * + * TBD: is there any system with such CPU which supports + * suspend/resume? If no, we should remove the code.
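+ * + * The syscore hooks registered below are mtrr_save()/mtrr_restore() + * from above, which snapshot every variable range at suspend and + * reprogram the non-empty ones on resume.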
+ */ + register_syscore_ops(&mtrr_syscore_ops); + + return 0; +} +subsys_initcall(mtrr_init_finialize); diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h new file mode 100644 index 00000000000..df5e41f31a2 --- /dev/null +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -0,0 +1,78 @@ +/* + * local MTRR defines. + */ + +#include <linux/types.h> +#include <linux/stddef.h> + +#define MTRR_CHANGE_MASK_FIXED 0x01 +#define MTRR_CHANGE_MASK_VARIABLE 0x02 +#define MTRR_CHANGE_MASK_DEFTYPE 0x04 + +extern unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; + +struct mtrr_ops { + u32 vendor; + u32 use_intel_if; + void (*set)(unsigned int reg, unsigned long base, + unsigned long size, mtrr_type type); + void (*set_all)(void); + + void (*get)(unsigned int reg, unsigned long *base, + unsigned long *size, mtrr_type *type); + int (*get_free_region)(unsigned long base, unsigned long size, + int replace_reg); + int (*validate_add_page)(unsigned long base, unsigned long size, + unsigned int type); + int (*have_wrcomb)(void); +}; + +extern int generic_get_free_region(unsigned long base, unsigned long size, + int replace_reg); +extern int generic_validate_add_page(unsigned long base, unsigned long size, + unsigned int type); + +extern const struct mtrr_ops generic_mtrr_ops; + +extern int positive_have_wrcomb(void); + +/* library functions for processor-specific routines */ +struct set_mtrr_context { + unsigned long flags; + unsigned long cr4val; + u32 deftype_lo; + u32 deftype_hi; + u32 ccr3; +}; + +void set_mtrr_done(struct set_mtrr_context *ctxt); +void set_mtrr_cache_disable(struct set_mtrr_context *ctxt); +void set_mtrr_prepare_save(struct set_mtrr_context *ctxt); + +void fill_mtrr_var_range(unsigned int index, + u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi); +void get_mtrr_state(void); + +extern void set_mtrr_ops(const struct mtrr_ops *ops); + +extern u64 size_or_mask, size_and_mask; +extern const struct mtrr_ops *mtrr_if; + +#define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd) +#define use_intel() (mtrr_if && mtrr_if->use_intel_if == 1) + +extern unsigned int num_var_ranges; +extern u64 mtrr_tom2; +extern struct mtrr_state_type mtrr_state; + +void mtrr_state_warn(void); +const char *mtrr_attrib_to_str(int x); +void mtrr_wrmsr(unsigned, unsigned, unsigned); + +/* CPU specific mtrr init functions */ +int amd_init_mtrr(void); +int cyrix_init_mtrr(void); +int centaur_init_mtrr(void); + +extern int changed_by_mtrr_cleanup; +extern int mtrr_cleanup(unsigned address_bits); diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c new file mode 100644 index 00000000000..2879ecdaac4 --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event.c @@ -0,0 +1,2170 @@ +/* + * Performance events x86 architecture code + * + * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> + * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2009 Jaswinder Singh Rajput + * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com> + * Copyright (C) 2009 Google, Inc., Stephane Eranian + * + * For licencing details see kernel-base/COPYING + */ + +#include <linux/perf_event.h> +#include <linux/capability.h> +#include <linux/notifier.h> +#include <linux/hardirq.h> +#include <linux/kprobes.h> +#include <linux/module.h> +#include <linux/kdebug.h> +#include <linux/sched.h> +#include <linux/uaccess.h> +#include 
<linux/slab.h> +#include <linux/cpu.h> +#include <linux/bitops.h> +#include <linux/device.h> + +#include <asm/apic.h> +#include <asm/stacktrace.h> +#include <asm/nmi.h> +#include <asm/smp.h> +#include <asm/alternative.h> +#include <asm/timer.h> +#include <asm/desc.h> +#include <asm/ldt.h> + +#include "perf_event.h" + +struct x86_pmu x86_pmu __read_mostly; + +DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { + .enabled = 1, +}; + +u64 __read_mostly hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX]; +u64 __read_mostly hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX]; + +/* + * Propagate event elapsed time into the generic event. + * Can only be executed on the CPU where the event is active. + * Returns the delta events processed. + */ +u64 x86_perf_event_update(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + int shift = 64 - x86_pmu.cntval_bits; + u64 prev_raw_count, new_raw_count; + int idx = hwc->idx; + s64 delta; + + if (idx == INTEL_PMC_IDX_FIXED_BTS) + return 0; + + /* + * Careful: an NMI might modify the previous event value. + * + * Our tactic to handle this is to first atomically read and + * exchange a new raw count - then add that new-prev delta + * count to the generic event atomically: + */ +again: + prev_raw_count = local64_read(&hwc->prev_count); + rdpmcl(hwc->event_base_rdpmc, new_raw_count); + + if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, + new_raw_count) != prev_raw_count) + goto again; + + /* + * Now we have the new raw value and have updated the prev + * timestamp already. We can now calculate the elapsed delta + * (event-)time and add that to the generic event. + * + * Careful, not all hw sign-extends above the physical width + * of the count. + */ + delta = (new_raw_count << shift) - (prev_raw_count << shift); + delta >>= shift; + + local64_add(delta, &event->count); + local64_sub(delta, &hwc->period_left); + + return new_raw_count; +} + +/* + * Find and validate any extra registers to set up. 
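+ * + * An extra_reg entry matches when the event bits selected by its + * config_mask equal er->event; attr->config1 then supplies the extra + * MSR value and is rejected if it strays outside er->valid_mask.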
+ */ +static int x86_pmu_extra_regs(u64 config, struct perf_event *event) +{ + struct hw_perf_event_extra *reg; + struct extra_reg *er; + + reg = &event->hw.extra_reg; + + if (!x86_pmu.extra_regs) + return 0; + + for (er = x86_pmu.extra_regs; er->msr; er++) { + if (er->event != (config & er->config_mask)) + continue; + if (event->attr.config1 & ~er->valid_mask) + return -EINVAL; + /* Check if the extra msrs can be safely accessed*/ + if (!er->extra_msr_access) + return -ENXIO; + + reg->idx = er->idx; + reg->config = event->attr.config1; + reg->reg = er->msr; + break; + } + return 0; +} + +static atomic_t active_events; +static DEFINE_MUTEX(pmc_reserve_mutex); + +#ifdef CONFIG_X86_LOCAL_APIC + +static bool reserve_pmc_hardware(void) +{ + int i; + + for (i = 0; i < x86_pmu.num_counters; i++) { + if (!reserve_perfctr_nmi(x86_pmu_event_addr(i))) + goto perfctr_fail; + } + + for (i = 0; i < x86_pmu.num_counters; i++) { + if (!reserve_evntsel_nmi(x86_pmu_config_addr(i))) + goto eventsel_fail; + } + + return true; + +eventsel_fail: + for (i--; i >= 0; i--) + release_evntsel_nmi(x86_pmu_config_addr(i)); + + i = x86_pmu.num_counters; + +perfctr_fail: + for (i--; i >= 0; i--) + release_perfctr_nmi(x86_pmu_event_addr(i)); + + return false; +} + +static void release_pmc_hardware(void) +{ + int i; + + for (i = 0; i < x86_pmu.num_counters; i++) { + release_perfctr_nmi(x86_pmu_event_addr(i)); + release_evntsel_nmi(x86_pmu_config_addr(i)); + } +} + +#else + +static bool reserve_pmc_hardware(void) { return true; } +static void release_pmc_hardware(void) {} + +#endif + +static bool check_hw_exists(void) +{ + u64 val, val_fail, val_new= ~0; + int i, reg, reg_fail, ret = 0; + int bios_fail = 0; + + /* + * Check to see if the BIOS enabled any of the counters, if so + * complain and bail. + */ + for (i = 0; i < x86_pmu.num_counters; i++) { + reg = x86_pmu_config_addr(i); + ret = rdmsrl_safe(reg, &val); + if (ret) + goto msr_fail; + if (val & ARCH_PERFMON_EVENTSEL_ENABLE) { + bios_fail = 1; + val_fail = val; + reg_fail = reg; + } + } + + if (x86_pmu.num_counters_fixed) { + reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; + ret = rdmsrl_safe(reg, &val); + if (ret) + goto msr_fail; + for (i = 0; i < x86_pmu.num_counters_fixed; i++) { + if (val & (0x03 << i*4)) { + bios_fail = 1; + val_fail = val; + reg_fail = reg; + } + } + } + + /* + * Read the current value, change it and read it back to see if it + * matches, this is needed to detect certain hardware emulators + * (qemu/kvm) that don't trap on the MSR access and always return 0s. 
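+ * + * The probe below flips the low 16 bits of counter 0, writes the new + * value and reads it back; an emulated PMU that swallowed the write + * fails the comparison and we fall back to software events.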
+ */ + reg = x86_pmu_event_addr(0); + if (rdmsrl_safe(reg, &val)) + goto msr_fail; + val ^= 0xffffUL; + ret = wrmsrl_safe(reg, val); + ret |= rdmsrl_safe(reg, &val_new); + if (ret || val != val_new) + goto msr_fail; + + /* + * We still allow the PMU driver to operate: + */ + if (bios_fail) { + printk(KERN_CONT "Broken BIOS detected, complain to your hardware vendor.\n"); + printk(KERN_ERR FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", reg_fail, val_fail); + } + + return true; + +msr_fail: + printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n"); + printk(KERN_ERR "Failed to access perfctr msr (MSR %x is %Lx)\n", reg, val_new); + + return false; +} + +static void hw_perf_event_destroy(struct perf_event *event) +{ + if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) { + release_pmc_hardware(); + release_ds_buffers(); + mutex_unlock(&pmc_reserve_mutex); + } +} + +static inline int x86_pmu_initialized(void) +{ + return x86_pmu.handle_irq != NULL; +} + +static inline int +set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event) +{ + struct perf_event_attr *attr = &event->attr; + unsigned int cache_type, cache_op, cache_result; + u64 config, val; + + config = attr->config; + + cache_type = (config >> 0) & 0xff; + if (cache_type >= PERF_COUNT_HW_CACHE_MAX) + return -EINVAL; + + cache_op = (config >> 8) & 0xff; + if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX) + return -EINVAL; + + cache_result = (config >> 16) & 0xff; + if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX) + return -EINVAL; + + val = hw_cache_event_ids[cache_type][cache_op][cache_result]; + + if (val == 0) + return -ENOENT; + + if (val == -1) + return -EINVAL; + + hwc->config |= val; + attr->config1 = hw_cache_extra_regs[cache_type][cache_op][cache_result]; + return x86_pmu_extra_regs(val, event); +} + +int x86_setup_perfctr(struct perf_event *event) +{ + struct perf_event_attr *attr = &event->attr; + struct hw_perf_event *hwc = &event->hw; + u64 config; + + if (!is_sampling_event(event)) { + hwc->sample_period = x86_pmu.max_period; + hwc->last_period = hwc->sample_period; + local64_set(&hwc->period_left, hwc->sample_period); + } + + if (attr->type == PERF_TYPE_RAW) + return x86_pmu_extra_regs(event->attr.config, event); + + if (attr->type == PERF_TYPE_HW_CACHE) + return set_ext_hw_attr(hwc, event); + + if (attr->config >= x86_pmu.max_events) + return -EINVAL; + + /* + * The generic map: + */ + config = x86_pmu.event_map(attr->config); + + if (config == 0) + return -ENOENT; + + if (config == -1LL) + return -EINVAL; + + /* + * Branch tracing: + */ + if (attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS && + !attr->freq && hwc->sample_period == 1) { + /* BTS is not supported by this architecture. */ + if (!x86_pmu.bts_active) + return -EOPNOTSUPP; + + /* BTS is currently only allowed for user-mode. 
*/ + if (!attr->exclude_kernel) + return -EOPNOTSUPP; + } + + hwc->config |= config; + + return 0; +} + +/* + * check that branch_sample_type is compatible with + * settings needed for precise_ip > 1 which implies + * using the LBR to capture ALL taken branches at the + * priv levels of the measurement + */ +static inline int precise_br_compat(struct perf_event *event) +{ + u64 m = event->attr.branch_sample_type; + u64 b = 0; + + /* must capture all branches */ + if (!(m & PERF_SAMPLE_BRANCH_ANY)) + return 0; + + m &= PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_USER; + + if (!event->attr.exclude_user) + b |= PERF_SAMPLE_BRANCH_USER; + + if (!event->attr.exclude_kernel) + b |= PERF_SAMPLE_BRANCH_KERNEL; + + /* + * ignore PERF_SAMPLE_BRANCH_HV, not supported on x86 + */ + + return m == b; +} + +int x86_pmu_hw_config(struct perf_event *event) +{ + if (event->attr.precise_ip) { + int precise = 0; + + /* Support for constant skid */ + if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) { + precise++; + + /* Support for IP fixup */ + if (x86_pmu.lbr_nr) + precise++; + } + + if (event->attr.precise_ip > precise) + return -EOPNOTSUPP; + /* + * check that PEBS LBR correction does not conflict with + * whatever the user is asking with attr->branch_sample_type + */ + if (event->attr.precise_ip > 1 && + x86_pmu.intel_cap.pebs_format < 2) { + u64 *br_type = &event->attr.branch_sample_type; + + if (has_branch_stack(event)) { + if (!precise_br_compat(event)) + return -EOPNOTSUPP; + + /* branch_sample_type is compatible */ + + } else { + /* + * user did not specify branch_sample_type + * + * For PEBS fixups, we capture all + * the branches at the priv level of the + * event. + */ + *br_type = PERF_SAMPLE_BRANCH_ANY; + + if (!event->attr.exclude_user) + *br_type |= PERF_SAMPLE_BRANCH_USER; + + if (!event->attr.exclude_kernel) + *br_type |= PERF_SAMPLE_BRANCH_KERNEL; + } + } + } + + /* + * Generate PMC IRQs: + * (keep 'enabled' bit clear for now) + */ + event->hw.config = ARCH_PERFMON_EVENTSEL_INT; + + /* + * Count user and OS events unless requested not to + */ + if (!event->attr.exclude_user) + event->hw.config |= ARCH_PERFMON_EVENTSEL_USR; + if (!event->attr.exclude_kernel) + event->hw.config |= ARCH_PERFMON_EVENTSEL_OS; + + if (event->attr.type == PERF_TYPE_RAW) + event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK; + + return x86_setup_perfctr(event); +} + +/* + * Setup the hardware configuration for a given attr_type + */ +static int __x86_pmu_event_init(struct perf_event *event) +{ + int err; + + if (!x86_pmu_initialized()) + return -ENODEV; + + err = 0; + if (!atomic_inc_not_zero(&active_events)) { + mutex_lock(&pmc_reserve_mutex); + if (atomic_read(&active_events) == 0) { + if (!reserve_pmc_hardware()) + err = -EBUSY; + else + reserve_ds_buffers(); + } + if (!err) + atomic_inc(&active_events); + mutex_unlock(&pmc_reserve_mutex); + } + if (err) + return err; + + event->destroy = hw_perf_event_destroy; + + event->hw.idx = -1; + event->hw.last_cpu = -1; + event->hw.last_tag = ~0ULL; + + /* mark unused */ + event->hw.extra_reg.idx = EXTRA_REG_NONE; + event->hw.branch_reg.idx = EXTRA_REG_NONE; + + return x86_pmu.hw_config(event); +} + +void x86_pmu_disable_all(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + int idx; + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + u64 val; + + if (!test_bit(idx, cpuc->active_mask)) + continue; + rdmsrl(x86_pmu_config_addr(idx), val); + if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE)) + continue; + val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; 
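+ /* write the eventsel back with the enable bit cleared */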
+ wrmsrl(x86_pmu_config_addr(idx), val); + } +} + +static void x86_pmu_disable(struct pmu *pmu) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + + if (!x86_pmu_initialized()) + return; + + if (!cpuc->enabled) + return; + + cpuc->n_added = 0; + cpuc->enabled = 0; + barrier(); + + x86_pmu.disable_all(); +} + +void x86_pmu_enable_all(int added) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + int idx; + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + struct hw_perf_event *hwc = &cpuc->events[idx]->hw; + + if (!test_bit(idx, cpuc->active_mask)) + continue; + + __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); + } +} + +static struct pmu pmu; + +static inline int is_x86_event(struct perf_event *event) +{ + return event->pmu == &pmu; +} + +/* + * Event scheduler state: + * + * Assign events by iterating over all events and counters, beginning + * with the events with the least weights. Keep the current iterator + * state in struct sched_state. + */ +struct sched_state { + int weight; + int event; /* event index */ + int counter; /* counter index */ + int unassigned; /* number of events left to assign */ + unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; +}; + +/* Total max is X86_PMC_IDX_MAX, but we are O(n!) limited */ +#define SCHED_STATES_MAX 2 + +struct perf_sched { + int max_weight; + int max_events; + struct perf_event **events; + struct sched_state state; + int saved_states; + struct sched_state saved[SCHED_STATES_MAX]; +}; + +/* + * Initialize the iterator that runs through all events and counters. + */ +static void perf_sched_init(struct perf_sched *sched, struct perf_event **events, + int num, int wmin, int wmax) +{ + int idx; + + memset(sched, 0, sizeof(*sched)); + sched->max_events = num; + sched->max_weight = wmax; + sched->events = events; + + for (idx = 0; idx < num; idx++) { + if (events[idx]->hw.constraint->weight == wmin) + break; + } + + sched->state.event = idx; /* start with min weight */ + sched->state.weight = wmin; + sched->state.unassigned = num; +} + +static void perf_sched_save_state(struct perf_sched *sched) +{ + if (WARN_ON_ONCE(sched->saved_states >= SCHED_STATES_MAX)) + return; + + sched->saved[sched->saved_states] = sched->state; + sched->saved_states++; +} + +static bool perf_sched_restore_state(struct perf_sched *sched) +{ + if (!sched->saved_states) + return false; + + sched->saved_states--; + sched->state = sched->saved[sched->saved_states]; + + /* continue with next counter: */ + clear_bit(sched->state.counter++, sched->state.used); + + return true; +} + +/* + * Select a counter for the current event to schedule. Return true on + * success.
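+ * + * Fixed-purpose counters are tried first when the constraint allows + * them; otherwise the scan resumes at state.counter among the + * general-purpose counters.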
+ */ +static bool __perf_sched_find_counter(struct perf_sched *sched) +{ + struct event_constraint *c; + int idx; + + if (!sched->state.unassigned) + return false; + + if (sched->state.event >= sched->max_events) + return false; + + c = sched->events[sched->state.event]->hw.constraint; + /* Prefer fixed purpose counters */ + if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) { + idx = INTEL_PMC_IDX_FIXED; + for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) { + if (!__test_and_set_bit(idx, sched->state.used)) + goto done; + } + } + /* Grab the first unused counter starting with idx */ + idx = sched->state.counter; + for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) { + if (!__test_and_set_bit(idx, sched->state.used)) + goto done; + } + + return false; + +done: + sched->state.counter = idx; + + if (c->overlap) + perf_sched_save_state(sched); + + return true; +} + +static bool perf_sched_find_counter(struct perf_sched *sched) +{ + while (!__perf_sched_find_counter(sched)) { + if (!perf_sched_restore_state(sched)) + return false; + } + + return true; +} + +/* + * Go through all unassigned events and find the next one to schedule. + * Take events with the least weight first. Return true on success. + */ +static bool perf_sched_next_event(struct perf_sched *sched) +{ + struct event_constraint *c; + + if (!sched->state.unassigned || !--sched->state.unassigned) + return false; + + do { + /* next event */ + sched->state.event++; + if (sched->state.event >= sched->max_events) { + /* next weight */ + sched->state.event = 0; + sched->state.weight++; + if (sched->state.weight > sched->max_weight) + return false; + } + c = sched->events[sched->state.event]->hw.constraint; + } while (c->weight != sched->state.weight); + + sched->state.counter = 0; /* start with first counter */ + + return true; +} + +/* + * Assign a counter for each event. + */ +int perf_assign_events(struct perf_event **events, int n, + int wmin, int wmax, int *assign) +{ + struct perf_sched sched; + + perf_sched_init(&sched, events, n, wmin, wmax); + + do { + if (!perf_sched_find_counter(&sched)) + break; /* failed */ + if (assign) + assign[sched.state.event] = sched.state.counter; + } while (perf_sched_next_event(&sched)); + + return sched.state.unassigned; +} +EXPORT_SYMBOL_GPL(perf_assign_events); + +int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) +{ + struct event_constraint *c; + unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + struct perf_event *e; + int i, wmin, wmax, num = 0; + struct hw_perf_event *hwc; + + bitmap_zero(used_mask, X86_PMC_IDX_MAX); + + for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) { + hwc = &cpuc->event_list[i]->hw; + c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]); + hwc->constraint = c; + + wmin = min(wmin, c->weight); + wmax = max(wmax, c->weight); + } + + /* + * fastpath, try to reuse previous register + */ + for (i = 0; i < n; i++) { + hwc = &cpuc->event_list[i]->hw; + c = hwc->constraint; + + /* never assigned */ + if (hwc->idx == -1) + break; + + /* constraint still honored */ + if (!test_bit(hwc->idx, c->idxmsk)) + break; + + /* not already used */ + if (test_bit(hwc->idx, used_mask)) + break; + + __set_bit(hwc->idx, used_mask); + if (assign) + assign[i] = hwc->idx; + } + + /* slow path */ + if (i != n) + num = perf_assign_events(cpuc->event_list, n, wmin, + wmax, assign); + + /* + * Mark the event as committed, so we do not put_constraint() + * in case new events are added and fail scheduling. 
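+ * + * Committed events carry PERF_X86_EVENT_COMMITTED, so the cleanup + * path below leaves their constraints untouched.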
+ */
+ if (!num && assign) {
+ for (i = 0; i < n; i++) {
+ e = cpuc->event_list[i];
+ e->hw.flags |= PERF_X86_EVENT_COMMITTED;
+ }
+ }
+ /*
+ * scheduling failed or is just a simulation,
+ * free resources if necessary
+ */
+ if (!assign || num) {
+ for (i = 0; i < n; i++) {
+ e = cpuc->event_list[i];
+ /*
+ * do not put_constraint() on committed events,
+ * because they are good to go
+ */
+ if ((e->hw.flags & PERF_X86_EVENT_COMMITTED))
+ continue;
+
+ if (x86_pmu.put_event_constraints)
+ x86_pmu.put_event_constraints(cpuc, e);
+ }
+ }
+ return num ? -EINVAL : 0;
+}
+
+/*
+ * dogrp: true if we must collect sibling events (group)
+ * returns total number of events and error code
+ */
+static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp)
+{
+ struct perf_event *event;
+ int n, max_count;
+
+ max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed;
+
+ /* current number of events already accepted */
+ n = cpuc->n_events;
+
+ if (is_x86_event(leader)) {
+ if (n >= max_count)
+ return -EINVAL;
+ cpuc->event_list[n] = leader;
+ n++;
+ }
+ if (!dogrp)
+ return n;
+
+ list_for_each_entry(event, &leader->sibling_list, group_entry) {
+ if (!is_x86_event(event) ||
+ event->state <= PERF_EVENT_STATE_OFF)
+ continue;
+
+ if (n >= max_count)
+ return -EINVAL;
+
+ cpuc->event_list[n] = event;
+ n++;
+ }
+ return n;
+}
+
+static inline void x86_assign_hw_event(struct perf_event *event,
+ struct cpu_hw_events *cpuc, int i)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ hwc->idx = cpuc->assign[i];
+ hwc->last_cpu = smp_processor_id();
+ hwc->last_tag = ++cpuc->tags[i];
+
+ if (hwc->idx == INTEL_PMC_IDX_FIXED_BTS) {
+ hwc->config_base = 0;
+ hwc->event_base = 0;
+ } else if (hwc->idx >= INTEL_PMC_IDX_FIXED) {
+ hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
+ hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - INTEL_PMC_IDX_FIXED);
+ hwc->event_base_rdpmc = (hwc->idx - INTEL_PMC_IDX_FIXED) | 1<<30;
+ } else {
+ hwc->config_base = x86_pmu_config_addr(hwc->idx);
+ hwc->event_base = x86_pmu_event_addr(hwc->idx);
+ hwc->event_base_rdpmc = x86_pmu_rdpmc_index(hwc->idx);
+ }
+}
+
+static inline int match_prev_assignment(struct hw_perf_event *hwc,
+ struct cpu_hw_events *cpuc,
+ int i)
+{
+ return hwc->idx == cpuc->assign[i] &&
+ hwc->last_cpu == smp_processor_id() &&
+ hwc->last_tag == cpuc->tags[i];
+}
+
+static void x86_pmu_start(struct perf_event *event, int flags);
+
+static void x86_pmu_enable(struct pmu *pmu)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct perf_event *event;
+ struct hw_perf_event *hwc;
+ int i, added = cpuc->n_added;
+
+ if (!x86_pmu_initialized())
+ return;
+
+ if (cpuc->enabled)
+ return;
+
+ if (cpuc->n_added) {
+ int n_running = cpuc->n_events - cpuc->n_added;
+ /*
+ * apply assignment obtained either from
+ * hw_perf_group_sched_in() or x86_pmu_enable()
+ *
+ * step1: save events moving to new counters
+ */
+ for (i = 0; i < n_running; i++) {
+ event = cpuc->event_list[i];
+ hwc = &event->hw;
+
+ /*
+ * we can avoid reprogramming counter if:
+ * - assigned same counter as last time
+ * - running on same CPU as last time
+ * - no other event has used the counter since
+ */
+ if (hwc->idx == -1 ||
+ match_prev_assignment(hwc, cpuc, i))
+ continue;
+
+ /*
+ * Ensure we don't accidentally enable a stopped
+ * counter simply because we rescheduled.
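+ *
+ * (PERF_HES_ARCH marks events that were already stopped;
+ * step2 below skips them so they remain stopped in their
+ * new counter.)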
+ */
+ if (hwc->state & PERF_HES_STOPPED)
+ hwc->state |= PERF_HES_ARCH;
+
+ x86_pmu_stop(event, PERF_EF_UPDATE);
+ }
+
+ /*
+ * step2: reprogram moved events into new counters
+ */
+ for (i = 0; i < cpuc->n_events; i++) {
+ event = cpuc->event_list[i];
+ hwc = &event->hw;
+
+ if (!match_prev_assignment(hwc, cpuc, i))
+ x86_assign_hw_event(event, cpuc, i);
+ else if (i < n_running)
+ continue;
+
+ if (hwc->state & PERF_HES_ARCH)
+ continue;
+
+ x86_pmu_start(event, PERF_EF_RELOAD);
+ }
+ cpuc->n_added = 0;
+ perf_events_lapic_init();
+ }
+
+ cpuc->enabled = 1;
+ barrier();
+
+ x86_pmu.enable_all(added);
+}
+
+static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
+
+/*
+ * Set the next IRQ period, based on the hwc->period_left value.
+ * To be called with the event disabled in hw:
+ */
+int x86_perf_event_set_period(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ s64 left = local64_read(&hwc->period_left);
+ s64 period = hwc->sample_period;
+ int ret = 0, idx = hwc->idx;
+
+ if (idx == INTEL_PMC_IDX_FIXED_BTS)
+ return 0;
+
+ /*
+ * If we are way outside a reasonable range then just skip forward:
+ */
+ if (unlikely(left <= -period)) {
+ left = period;
+ local64_set(&hwc->period_left, left);
+ hwc->last_period = period;
+ ret = 1;
+ }
+
+ if (unlikely(left <= 0)) {
+ left += period;
+ local64_set(&hwc->period_left, left);
+ hwc->last_period = period;
+ ret = 1;
+ }
+ /*
+ * Quirk: certain CPUs don't like it if just 1 hw_event is left:
+ */
+ if (unlikely(left < 2))
+ left = 2;
+
+ if (left > x86_pmu.max_period)
+ left = x86_pmu.max_period;
+
+ per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
+
+ /*
+ * The hw event starts counting from this event offset,
+ * mark it to be able to extract future deltas:
+ */
+ local64_set(&hwc->prev_count, (u64)-left);
+
+ wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);
+
+ /*
+ * Due to an erratum on certain CPUs we need
+ * a second write to be sure the register
+ * is updated properly
+ */
+ if (x86_pmu.perfctr_second_write) {
+ wrmsrl(hwc->event_base,
+ (u64)(-left) & x86_pmu.cntval_mask);
+ }
+
+ perf_event_update_userpage(event);
+
+ return ret;
+}
+
+void x86_pmu_enable_event(struct perf_event *event)
+{
+ if (__this_cpu_read(cpu_hw_events.enabled))
+ __x86_pmu_enable_event(&event->hw,
+ ARCH_PERFMON_EVENTSEL_ENABLE);
+}
+
+/*
+ * Add a single event to the PMU.
+ *
+ * The event is added to the group of enabled events
+ * but only if it can be scheduled with existing events.
+ */
+static int x86_pmu_add(struct perf_event *event, int flags)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct hw_perf_event *hwc;
+ int assign[X86_PMC_IDX_MAX];
+ int n, n0, ret;
+
+ hwc = &event->hw;
+
+ perf_pmu_disable(event->pmu);
+ n0 = cpuc->n_events;
+ ret = n = collect_events(cpuc, event, false);
+ if (ret < 0)
+ goto out;
+
+ hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+ if (!(flags & PERF_EF_START))
+ hwc->state |= PERF_HES_ARCH;
+
+ /*
+ * If group events scheduling transaction was started,
+ * skip the schedulability test here, it will be performed
+ * at commit time (->commit_txn) as a whole.
+ */
+ if (cpuc->group_flag & PERF_EVENT_TXN)
+ goto done_collect;
+
+ ret = x86_pmu.schedule_events(cpuc, n, assign);
+ if (ret)
+ goto out;
+ /*
+ * copy the new assignment, now that we know it is possible;
+ * it will be used by hw_perf_enable()
+ */
+ memcpy(cpuc->assign, assign, n*sizeof(int));
+
+done_collect:
+ /*
+ * Commit the collect_events() state. See x86_pmu_del() and
+ * x86_pmu_*_txn().
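+ *
+ * Sketch of the transactional flow (illustrative):
+ *
+ * x86_pmu_start_txn() (disables the pmu, sets PERF_EVENT_TXN)
+ * x86_pmu_add(A) -> done_collect, no schedulability test
+ * x86_pmu_add(B) -> done_collect
+ * x86_pmu_commit_txn() -> schedule_events() for the whole group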
+ */
+ cpuc->n_events = n;
+ cpuc->n_added += n - n0;
+ cpuc->n_txn += n - n0;
+
+ ret = 0;
+out:
+ perf_pmu_enable(event->pmu);
+ return ret;
+}
+
+static void x86_pmu_start(struct perf_event *event, int flags)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ int idx = event->hw.idx;
+
+ if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
+ return;
+
+ if (WARN_ON_ONCE(idx == -1))
+ return;
+
+ if (flags & PERF_EF_RELOAD) {
+ WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
+ x86_perf_event_set_period(event);
+ }
+
+ event->hw.state = 0;
+
+ cpuc->events[idx] = event;
+ __set_bit(idx, cpuc->active_mask);
+ __set_bit(idx, cpuc->running);
+ x86_pmu.enable(event);
+ perf_event_update_userpage(event);
+}
+
+void perf_event_print_debug(void)
+{
+ u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
+ u64 pebs;
+ struct cpu_hw_events *cpuc;
+ unsigned long flags;
+ int cpu, idx;
+
+ if (!x86_pmu.num_counters)
+ return;
+
+ local_irq_save(flags);
+
+ cpu = smp_processor_id();
+ cpuc = &per_cpu(cpu_hw_events, cpu);
+
+ if (x86_pmu.version >= 2) {
+ rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
+ rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
+ rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
+ rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
+ rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);
+
+ pr_info("\n");
+ pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl);
+ pr_info("CPU#%d: status: %016llx\n", cpu, status);
+ pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow);
+ pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed);
+ pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs);
+ }
+ pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask);
+
+ for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ rdmsrl(x86_pmu_config_addr(idx), pmc_ctrl);
+ rdmsrl(x86_pmu_event_addr(idx), pmc_count);
+
+ prev_left = per_cpu(pmc_prev_left[idx], cpu);
+
+ pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n",
+ cpu, idx, pmc_ctrl);
+ pr_info("CPU#%d: gen-PMC%d count: %016llx\n",
+ cpu, idx, pmc_count);
+ pr_info("CPU#%d: gen-PMC%d left: %016llx\n",
+ cpu, idx, prev_left);
+ }
+ for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
+ rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
+
+ pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
+ cpu, idx, pmc_count);
+ }
+ local_irq_restore(flags);
+}
+
+void x86_pmu_stop(struct perf_event *event, int flags)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (__test_and_clear_bit(hwc->idx, cpuc->active_mask)) {
+ x86_pmu.disable(event);
+ cpuc->events[hwc->idx] = NULL;
+ WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+ hwc->state |= PERF_HES_STOPPED;
+ }
+
+ if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
+ /*
+ * Drain the remaining delta count out of an event
+ * that we are disabling:
+ */
+ x86_perf_event_update(event);
+ hwc->state |= PERF_HES_UPTODATE;
+ }
+}
+
+static void x86_pmu_del(struct perf_event *event, int flags)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ int i;
+
+ /*
+ * event is descheduled
+ */
+ event->hw.flags &= ~PERF_X86_EVENT_COMMITTED;
+
+ /*
+ * If we're called during a txn, we don't need to do anything.
+ * The events never got scheduled and ->cancel_txn will truncate
+ * the event_list.
+ *
+ * XXX assumes any ->del() called during a TXN will only be on
+ * an event added during that same TXN.
+ */
+ if (cpuc->group_flag & PERF_EVENT_TXN)
+ return;
+
+ /*
+ * Not a TXN, therefore cleanup properly.
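+ *
+ * (that is: stop the event, drop its constraint, and compact
+ * event_list[] by shifting the tail entries down one slot)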
+ */
+ x86_pmu_stop(event, PERF_EF_UPDATE);
+
+ for (i = 0; i < cpuc->n_events; i++) {
+ if (event == cpuc->event_list[i])
+ break;
+ }
+
+ if (WARN_ON_ONCE(i == cpuc->n_events)) /* called ->del() without ->add() ? */
+ return;
+
+ /* If we have a newly added event, make sure to decrease n_added. */
+ if (i >= cpuc->n_events - cpuc->n_added)
+ --cpuc->n_added;
+
+ if (x86_pmu.put_event_constraints)
+ x86_pmu.put_event_constraints(cpuc, event);
+
+ /* Delete the array entry. */
+ while (++i < cpuc->n_events)
+ cpuc->event_list[i-1] = cpuc->event_list[i];
+ --cpuc->n_events;
+
+ perf_event_update_userpage(event);
+}
+
+int x86_pmu_handle_irq(struct pt_regs *regs)
+{
+ struct perf_sample_data data;
+ struct cpu_hw_events *cpuc;
+ struct perf_event *event;
+ int idx, handled = 0;
+ u64 val;
+
+ cpuc = &__get_cpu_var(cpu_hw_events);
+
+ /*
+ * Some chipsets need to unmask the LVTPC in a particular spot
+ * inside the nmi handler. As a result, the unmasking was pushed
+ * into all the nmi handlers.
+ *
+ * This generic handler doesn't seem to have any issues where the
+ * unmasking occurs so it was left at the top.
+ */
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+
+ for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ if (!test_bit(idx, cpuc->active_mask)) {
+ /*
+ * Though we deactivated the counter, some CPUs
+ * might still deliver spurious interrupts that
+ * are in flight. Catch them:
+ */
+ if (__test_and_clear_bit(idx, cpuc->running))
+ handled++;
+ continue;
+ }
+
+ event = cpuc->events[idx];
+
+ val = x86_perf_event_update(event);
+ if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
+ continue;
+
+ /*
+ * event overflow
+ */
+ handled++;
+ perf_sample_data_init(&data, 0, event->hw.last_period);
+
+ if (!x86_perf_event_set_period(event))
+ continue;
+
+ if (perf_event_overflow(event, &data, regs))
+ x86_pmu_stop(event, 0);
+ }
+
+ if (handled)
+ inc_irq_stat(apic_perf_irqs);
+
+ return handled;
+}
+
+void perf_events_lapic_init(void)
+{
+ if (!x86_pmu.apic || !x86_pmu_initialized())
+ return;
+
+ /*
+ * Always use NMI for PMU
+ */
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+}
+
+static int
+perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs)
+{
+ u64 start_clock;
+ u64 finish_clock;
+ int ret;
+
+ if (!atomic_read(&active_events))
+ return NMI_DONE;
+
+ start_clock = sched_clock();
+ ret = x86_pmu.handle_irq(regs);
+ finish_clock = sched_clock();
+
+ perf_sample_event_took(finish_clock - start_clock);
+
+ return ret;
+}
+NOKPROBE_SYMBOL(perf_event_nmi_handler);
+
+struct event_constraint emptyconstraint;
+struct event_constraint unconstrained;
+
+static int
+x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
+{
+ unsigned int cpu = (long)hcpu;
+ struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+ int ret = NOTIFY_OK;
+
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_UP_PREPARE:
+ cpuc->kfree_on_online = NULL;
+ if (x86_pmu.cpu_prepare)
+ ret = x86_pmu.cpu_prepare(cpu);
+ break;
+
+ case CPU_STARTING:
+ if (x86_pmu.attr_rdpmc)
+ set_in_cr4(X86_CR4_PCE);
+ if (x86_pmu.cpu_starting)
+ x86_pmu.cpu_starting(cpu);
+ break;
+
+ case CPU_ONLINE:
+ kfree(cpuc->kfree_on_online);
+ break;
+
+ case CPU_DYING:
+ if (x86_pmu.cpu_dying)
+ x86_pmu.cpu_dying(cpu);
+ break;
+
+ case CPU_UP_CANCELED:
+ case CPU_DEAD:
+ if (x86_pmu.cpu_dead)
+ x86_pmu.cpu_dead(cpu);
+ break;
+
+ default:
+ break;
+ }
+
+ return ret;
+}
+
+static void __init pmu_check_apic(void)
+{
+ if (cpu_has_apic)
+ return;
+
+ x86_pmu.apic = 0;
+ pr_info("no APIC, boot with the \"lapic\" boot
parameter to force-enable it.\n"); + pr_info("no hardware sampling interrupt available.\n"); + + /* + * If we have a PMU initialized but no APIC + * interrupts, we cannot sample hardware + * events (user-space has to fall back and + * sample via a hrtimer based software event): + */ + pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT; + +} + +static struct attribute_group x86_pmu_format_group = { + .name = "format", + .attrs = NULL, +}; + +/* + * Remove all undefined events (x86_pmu.event_map(id) == 0) + * out of events_attr attributes. + */ +static void __init filter_events(struct attribute **attrs) +{ + struct device_attribute *d; + struct perf_pmu_events_attr *pmu_attr; + int i, j; + + for (i = 0; attrs[i]; i++) { + d = (struct device_attribute *)attrs[i]; + pmu_attr = container_of(d, struct perf_pmu_events_attr, attr); + /* str trumps id */ + if (pmu_attr->event_str) + continue; + if (x86_pmu.event_map(i)) + continue; + + for (j = i; attrs[j]; j++) + attrs[j] = attrs[j + 1]; + + /* Check the shifted attr. */ + i--; + } +} + +/* Merge two pointer arrays */ +static __init struct attribute **merge_attr(struct attribute **a, struct attribute **b) +{ + struct attribute **new; + int j, i; + + for (j = 0; a[j]; j++) + ; + for (i = 0; b[i]; i++) + j++; + j++; + + new = kmalloc(sizeof(struct attribute *) * j, GFP_KERNEL); + if (!new) + return NULL; + + j = 0; + for (i = 0; a[i]; i++) + new[j++] = a[i]; + for (i = 0; b[i]; i++) + new[j++] = b[i]; + new[j] = NULL; + + return new; +} + +ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, + char *page) +{ + struct perf_pmu_events_attr *pmu_attr = \ + container_of(attr, struct perf_pmu_events_attr, attr); + u64 config = x86_pmu.event_map(pmu_attr->id); + + /* string trumps id */ + if (pmu_attr->event_str) + return sprintf(page, "%s", pmu_attr->event_str); + + return x86_pmu.events_sysfs_show(page, config); +} + +EVENT_ATTR(cpu-cycles, CPU_CYCLES ); +EVENT_ATTR(instructions, INSTRUCTIONS ); +EVENT_ATTR(cache-references, CACHE_REFERENCES ); +EVENT_ATTR(cache-misses, CACHE_MISSES ); +EVENT_ATTR(branch-instructions, BRANCH_INSTRUCTIONS ); +EVENT_ATTR(branch-misses, BRANCH_MISSES ); +EVENT_ATTR(bus-cycles, BUS_CYCLES ); +EVENT_ATTR(stalled-cycles-frontend, STALLED_CYCLES_FRONTEND ); +EVENT_ATTR(stalled-cycles-backend, STALLED_CYCLES_BACKEND ); +EVENT_ATTR(ref-cycles, REF_CPU_CYCLES ); + +static struct attribute *empty_attrs; + +static struct attribute *events_attr[] = { + EVENT_PTR(CPU_CYCLES), + EVENT_PTR(INSTRUCTIONS), + EVENT_PTR(CACHE_REFERENCES), + EVENT_PTR(CACHE_MISSES), + EVENT_PTR(BRANCH_INSTRUCTIONS), + EVENT_PTR(BRANCH_MISSES), + EVENT_PTR(BUS_CYCLES), + EVENT_PTR(STALLED_CYCLES_FRONTEND), + EVENT_PTR(STALLED_CYCLES_BACKEND), + EVENT_PTR(REF_CPU_CYCLES), + NULL, +}; + +static struct attribute_group x86_pmu_events_group = { + .name = "events", + .attrs = events_attr, +}; + +ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event) +{ + u64 umask = (config & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; + u64 cmask = (config & ARCH_PERFMON_EVENTSEL_CMASK) >> 24; + bool edge = (config & ARCH_PERFMON_EVENTSEL_EDGE); + bool pc = (config & ARCH_PERFMON_EVENTSEL_PIN_CONTROL); + bool any = (config & ARCH_PERFMON_EVENTSEL_ANY); + bool inv = (config & ARCH_PERFMON_EVENTSEL_INV); + ssize_t ret; + + /* + * We have whole page size to spend and just little data + * to write, so we can safely use sprintf. 
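+ *
+ * Example output (for a hypothetical config): event=0x2e with
+ * umask=0x4f and all other fields clear is rendered as
+ * "event=0x2e,umask=0x4f\n".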
+ */ + ret = sprintf(page, "event=0x%02llx", event); + + if (umask) + ret += sprintf(page + ret, ",umask=0x%02llx", umask); + + if (edge) + ret += sprintf(page + ret, ",edge"); + + if (pc) + ret += sprintf(page + ret, ",pc"); + + if (any) + ret += sprintf(page + ret, ",any"); + + if (inv) + ret += sprintf(page + ret, ",inv"); + + if (cmask) + ret += sprintf(page + ret, ",cmask=0x%02llx", cmask); + + ret += sprintf(page + ret, "\n"); + + return ret; +} + +static int __init init_hw_perf_events(void) +{ + struct x86_pmu_quirk *quirk; + int err; + + pr_info("Performance Events: "); + + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_INTEL: + err = intel_pmu_init(); + break; + case X86_VENDOR_AMD: + err = amd_pmu_init(); + break; + default: + err = -ENOTSUPP; + } + if (err != 0) { + pr_cont("no PMU driver, software events only.\n"); + return 0; + } + + pmu_check_apic(); + + /* sanity check that the hardware exists or is emulated */ + if (!check_hw_exists()) + return 0; + + pr_cont("%s PMU driver.\n", x86_pmu.name); + + x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */ + + for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next) + quirk->func(); + + if (!x86_pmu.intel_ctrl) + x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1; + + perf_events_lapic_init(); + register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI"); + + unconstrained = (struct event_constraint) + __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1, + 0, x86_pmu.num_counters, 0, 0); + + x86_pmu_format_group.attrs = x86_pmu.format_attrs; + + if (x86_pmu.event_attrs) + x86_pmu_events_group.attrs = x86_pmu.event_attrs; + + if (!x86_pmu.events_sysfs_show) + x86_pmu_events_group.attrs = &empty_attrs; + else + filter_events(x86_pmu_events_group.attrs); + + if (x86_pmu.cpu_events) { + struct attribute **tmp; + + tmp = merge_attr(x86_pmu_events_group.attrs, x86_pmu.cpu_events); + if (!WARN_ON(!tmp)) + x86_pmu_events_group.attrs = tmp; + } + + pr_info("... version: %d\n", x86_pmu.version); + pr_info("... bit width: %d\n", x86_pmu.cntval_bits); + pr_info("... generic registers: %d\n", x86_pmu.num_counters); + pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask); + pr_info("... max period: %016Lx\n", x86_pmu.max_period); + pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed); + pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl); + + perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW); + perf_cpu_notifier(x86_pmu_notifier); + + return 0; +} +early_initcall(init_hw_perf_events); + +static inline void x86_pmu_read(struct perf_event *event) +{ + x86_perf_event_update(event); +} + +/* + * Start group events scheduling transaction + * Set the flag to make pmu::enable() not perform the + * schedulability test, it will be performed at commit time + */ +static void x86_pmu_start_txn(struct pmu *pmu) +{ + perf_pmu_disable(pmu); + __this_cpu_or(cpu_hw_events.group_flag, PERF_EVENT_TXN); + __this_cpu_write(cpu_hw_events.n_txn, 0); +} + +/* + * Stop group events scheduling transaction + * Clear the flag and pmu::enable() will perform the + * schedulability test. + */ +static void x86_pmu_cancel_txn(struct pmu *pmu) +{ + __this_cpu_and(cpu_hw_events.group_flag, ~PERF_EVENT_TXN); + /* + * Truncate collected array by the number of events added in this + * transaction. See x86_pmu_add() and x86_pmu_*_txn(). 
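+ *
+ * e.g. if the aborted transaction had added two events,
+ * n_txn == 2 and both n_added and n_events shrink by 2 below.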
+ */
+ __this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn));
+ __this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn));
+ perf_pmu_enable(pmu);
+}
+
+/*
+ * Commit group events scheduling transaction
+ * Perform the group schedulability test as a whole
+ * Return 0 on success
+ *
+ * Does not cancel the transaction on failure; expects the caller to do this.
+ */
+static int x86_pmu_commit_txn(struct pmu *pmu)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ int assign[X86_PMC_IDX_MAX];
+ int n, ret;
+
+ n = cpuc->n_events;
+
+ if (!x86_pmu_initialized())
+ return -EAGAIN;
+
+ ret = x86_pmu.schedule_events(cpuc, n, assign);
+ if (ret)
+ return ret;
+
+ /*
+ * copy the new assignment, now that we know it is possible;
+ * it will be used by hw_perf_enable()
+ */
+ memcpy(cpuc->assign, assign, n*sizeof(int));
+
+ cpuc->group_flag &= ~PERF_EVENT_TXN;
+ perf_pmu_enable(pmu);
+ return 0;
+}
+/*
+ * a fake_cpuc is used to validate event groups. Due to
+ * the extra reg logic, we need to also allocate a fake
+ * per_core and per_cpu structure. Otherwise, group events
+ * using extra reg may conflict without the kernel being
+ * able to catch this when the last event gets added to
+ * the group.
+ */
+static void free_fake_cpuc(struct cpu_hw_events *cpuc)
+{
+ kfree(cpuc->shared_regs);
+ kfree(cpuc);
+}
+
+static struct cpu_hw_events *allocate_fake_cpuc(void)
+{
+ struct cpu_hw_events *cpuc;
+ int cpu = raw_smp_processor_id();
+
+ cpuc = kzalloc(sizeof(*cpuc), GFP_KERNEL);
+ if (!cpuc)
+ return ERR_PTR(-ENOMEM);
+
+ /* only needed if we have extra_regs */
+ if (x86_pmu.extra_regs) {
+ cpuc->shared_regs = allocate_shared_regs(cpu);
+ if (!cpuc->shared_regs)
+ goto error;
+ }
+ cpuc->is_fake = 1;
+ return cpuc;
+error:
+ free_fake_cpuc(cpuc);
+ return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * validate that we can schedule this event
+ */
+static int validate_event(struct perf_event *event)
+{
+ struct cpu_hw_events *fake_cpuc;
+ struct event_constraint *c;
+ int ret = 0;
+
+ fake_cpuc = allocate_fake_cpuc();
+ if (IS_ERR(fake_cpuc))
+ return PTR_ERR(fake_cpuc);
+
+ c = x86_pmu.get_event_constraints(fake_cpuc, event);
+
+ if (!c || !c->weight)
+ ret = -EINVAL;
+
+ if (x86_pmu.put_event_constraints)
+ x86_pmu.put_event_constraints(fake_cpuc, event);
+
+ free_fake_cpuc(fake_cpuc);
+
+ return ret;
+}
+
+/*
+ * validate a single event group
+ *
+ * validation includes:
+ * - check events are compatible with each other
+ * - events do not compete for the same counter
+ * - number of events <= number of counters
+ *
+ * validation ensures the group can be loaded onto the
+ * PMU if it was the only group available.
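+ *
+ * e.g. a group of five generic-counter events on a PMU with only
+ * four generic counters fails the simulated schedule below with
+ * -EINVAL, before any real counter is touched.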
+ */ +static int validate_group(struct perf_event *event) +{ + struct perf_event *leader = event->group_leader; + struct cpu_hw_events *fake_cpuc; + int ret = -EINVAL, n; + + fake_cpuc = allocate_fake_cpuc(); + if (IS_ERR(fake_cpuc)) + return PTR_ERR(fake_cpuc); + /* + * the event is not yet connected with its + * siblings therefore we must first collect + * existing siblings, then add the new event + * before we can simulate the scheduling + */ + n = collect_events(fake_cpuc, leader, true); + if (n < 0) + goto out; + + fake_cpuc->n_events = n; + n = collect_events(fake_cpuc, event, false); + if (n < 0) + goto out; + + fake_cpuc->n_events = n; + + ret = x86_pmu.schedule_events(fake_cpuc, n, NULL); + +out: + free_fake_cpuc(fake_cpuc); + return ret; +} + +static int x86_pmu_event_init(struct perf_event *event) +{ + struct pmu *tmp; + int err; + + switch (event->attr.type) { + case PERF_TYPE_RAW: + case PERF_TYPE_HARDWARE: + case PERF_TYPE_HW_CACHE: + break; + + default: + return -ENOENT; + } + + err = __x86_pmu_event_init(event); + if (!err) { + /* + * we temporarily connect event to its pmu + * such that validate_group() can classify + * it as an x86 event using is_x86_event() + */ + tmp = event->pmu; + event->pmu = &pmu; + + if (event->group_leader != event) + err = validate_group(event); + else + err = validate_event(event); + + event->pmu = tmp; + } + if (err) { + if (event->destroy) + event->destroy(event); + } + + return err; +} + +static int x86_pmu_event_idx(struct perf_event *event) +{ + int idx = event->hw.idx; + + if (!x86_pmu.attr_rdpmc) + return 0; + + if (x86_pmu.num_counters_fixed && idx >= INTEL_PMC_IDX_FIXED) { + idx -= INTEL_PMC_IDX_FIXED; + idx |= 1 << 30; + } + + return idx + 1; +} + +static ssize_t get_attr_rdpmc(struct device *cdev, + struct device_attribute *attr, + char *buf) +{ + return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc); +} + +static void change_rdpmc(void *info) +{ + bool enable = !!(unsigned long)info; + + if (enable) + set_in_cr4(X86_CR4_PCE); + else + clear_in_cr4(X86_CR4_PCE); +} + +static ssize_t set_attr_rdpmc(struct device *cdev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + unsigned long val; + ssize_t ret; + + ret = kstrtoul(buf, 0, &val); + if (ret) + return ret; + + if (x86_pmu.attr_rdpmc_broken) + return -ENOTSUPP; + + if (!!val != !!x86_pmu.attr_rdpmc) { + x86_pmu.attr_rdpmc = !!val; + on_each_cpu(change_rdpmc, (void *)val, 1); + } + + return count; +} + +static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc); + +static struct attribute *x86_pmu_attrs[] = { + &dev_attr_rdpmc.attr, + NULL, +}; + +static struct attribute_group x86_pmu_attr_group = { + .attrs = x86_pmu_attrs, +}; + +static const struct attribute_group *x86_pmu_attr_groups[] = { + &x86_pmu_attr_group, + &x86_pmu_format_group, + &x86_pmu_events_group, + NULL, +}; + +static void x86_pmu_flush_branch_stack(void) +{ + if (x86_pmu.flush_branch_stack) + x86_pmu.flush_branch_stack(); +} + +void perf_check_microcode(void) +{ + if (x86_pmu.check_microcode) + x86_pmu.check_microcode(); +} +EXPORT_SYMBOL_GPL(perf_check_microcode); + +static struct pmu pmu = { + .pmu_enable = x86_pmu_enable, + .pmu_disable = x86_pmu_disable, + + .attr_groups = x86_pmu_attr_groups, + + .event_init = x86_pmu_event_init, + + .add = x86_pmu_add, + .del = x86_pmu_del, + .start = x86_pmu_start, + .stop = x86_pmu_stop, + .read = x86_pmu_read, + + .start_txn = x86_pmu_start_txn, + .cancel_txn = x86_pmu_cancel_txn, + .commit_txn = x86_pmu_commit_txn, + + .event_idx = 
x86_pmu_event_idx, + .flush_branch_stack = x86_pmu_flush_branch_stack, +}; + +void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) +{ + struct cyc2ns_data *data; + + userpg->cap_user_time = 0; + userpg->cap_user_time_zero = 0; + userpg->cap_user_rdpmc = x86_pmu.attr_rdpmc; + userpg->pmc_width = x86_pmu.cntval_bits; + + if (!sched_clock_stable()) + return; + + data = cyc2ns_read_begin(); + + userpg->cap_user_time = 1; + userpg->time_mult = data->cyc2ns_mul; + userpg->time_shift = data->cyc2ns_shift; + userpg->time_offset = data->cyc2ns_offset - now; + + userpg->cap_user_time_zero = 1; + userpg->time_zero = data->cyc2ns_offset; + + cyc2ns_read_end(data); +} + +/* + * callchain support + */ + +static int backtrace_stack(void *data, char *name) +{ + return 0; +} + +static void backtrace_address(void *data, unsigned long addr, int reliable) +{ + struct perf_callchain_entry *entry = data; + + perf_callchain_store(entry, addr); +} + +static const struct stacktrace_ops backtrace_ops = { + .stack = backtrace_stack, + .address = backtrace_address, + .walk_stack = print_context_stack_bp, +}; + +void +perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) +{ + if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { + /* TODO: We don't support guest os callchain now */ + return; + } + + perf_callchain_store(entry, regs->ip); + + dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry); +} + +static inline int +valid_user_frame(const void __user *fp, unsigned long size) +{ + return (__range_not_ok(fp, size, TASK_SIZE) == 0); +} + +static unsigned long get_segment_base(unsigned int segment) +{ + struct desc_struct *desc; + int idx = segment >> 3; + + if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) { + if (idx > LDT_ENTRIES) + return 0; + + if (idx > current->active_mm->context.size) + return 0; + + desc = current->active_mm->context.ldt; + } else { + if (idx > GDT_ENTRIES) + return 0; + + desc = __this_cpu_ptr(&gdt_page.gdt[0]); + } + + return get_desc_base(desc + idx); +} + +#ifdef CONFIG_COMPAT + +#include <asm/compat.h> + +static inline int +perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) +{ + /* 32-bit process in 64-bit kernel. */ + unsigned long ss_base, cs_base; + struct stack_frame_ia32 frame; + const void __user *fp; + + if (!test_thread_flag(TIF_IA32)) + return 0; + + cs_base = get_segment_base(regs->cs); + ss_base = get_segment_base(regs->ss); + + fp = compat_ptr(ss_base + regs->bp); + while (entry->nr < PERF_MAX_STACK_DEPTH) { + unsigned long bytes; + frame.next_frame = 0; + frame.return_address = 0; + + bytes = copy_from_user_nmi(&frame, fp, sizeof(frame)); + if (bytes != 0) + break; + + if (!valid_user_frame(fp, sizeof(frame))) + break; + + perf_callchain_store(entry, cs_base + frame.return_address); + fp = compat_ptr(ss_base + frame.next_frame); + } + return 1; +} +#else +static inline int +perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) +{ + return 0; +} +#endif + +void +perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) +{ + struct stack_frame frame; + const void __user *fp; + + if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { + /* TODO: We don't support guest os callchain now */ + return; + } + + /* + * We don't know what to do with VM86 stacks.. ignore them for now. 
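+ *
+ * (the loop below walks the classic frame-pointer chain, one
+ * struct stack_frame { next_frame, return_address } at a time,
+ * fetched with copy_from_user_nmi())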
+ */ + if (regs->flags & (X86_VM_MASK | PERF_EFLAGS_VM)) + return; + + fp = (void __user *)regs->bp; + + perf_callchain_store(entry, regs->ip); + + if (!current->mm) + return; + + if (perf_callchain_user32(regs, entry)) + return; + + while (entry->nr < PERF_MAX_STACK_DEPTH) { + unsigned long bytes; + frame.next_frame = NULL; + frame.return_address = 0; + + bytes = copy_from_user_nmi(&frame, fp, sizeof(frame)); + if (bytes != 0) + break; + + if (!valid_user_frame(fp, sizeof(frame))) + break; + + perf_callchain_store(entry, frame.return_address); + fp = frame.next_frame; + } +} + +/* + * Deal with code segment offsets for the various execution modes: + * + * VM86 - the good olde 16 bit days, where the linear address is + * 20 bits and we use regs->ip + 0x10 * regs->cs. + * + * IA32 - Where we need to look at GDT/LDT segment descriptor tables + * to figure out what the 32bit base address is. + * + * X32 - has TIF_X32 set, but is running in x86_64 + * + * X86_64 - CS,DS,SS,ES are all zero based. + */ +static unsigned long code_segment_base(struct pt_regs *regs) +{ + /* + * If we are in VM86 mode, add the segment offset to convert to a + * linear address. + */ + if (regs->flags & X86_VM_MASK) + return 0x10 * regs->cs; + + /* + * For IA32 we look at the GDT/LDT segment base to convert the + * effective IP to a linear address. + */ +#ifdef CONFIG_X86_32 + if (user_mode(regs) && regs->cs != __USER_CS) + return get_segment_base(regs->cs); +#else + if (test_thread_flag(TIF_IA32)) { + if (user_mode(regs) && regs->cs != __USER32_CS) + return get_segment_base(regs->cs); + } +#endif + return 0; +} + +unsigned long perf_instruction_pointer(struct pt_regs *regs) +{ + if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) + return perf_guest_cbs->get_guest_ip(); + + return regs->ip + code_segment_base(regs); +} + +unsigned long perf_misc_flags(struct pt_regs *regs) +{ + int misc = 0; + + if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { + if (perf_guest_cbs->is_user_mode()) + misc |= PERF_RECORD_MISC_GUEST_USER; + else + misc |= PERF_RECORD_MISC_GUEST_KERNEL; + } else { + if (user_mode(regs)) + misc |= PERF_RECORD_MISC_USER; + else + misc |= PERF_RECORD_MISC_KERNEL; + } + + if (regs->flags & PERF_EFLAGS_EXACT) + misc |= PERF_RECORD_MISC_EXACT_IP; + + return misc; +} + +void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) +{ + cap->version = x86_pmu.version; + cap->num_counters_gp = x86_pmu.num_counters; + cap->num_counters_fixed = x86_pmu.num_counters_fixed; + cap->bit_width_gp = x86_pmu.cntval_bits; + cap->bit_width_fixed = x86_pmu.cntval_bits; + cap->events_mask = (unsigned int)x86_pmu.events_maskl; + cap->events_mask_len = x86_pmu.events_mask_len; +} +EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability); diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h new file mode 100644 index 00000000000..8ade93111e0 --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event.h @@ -0,0 +1,739 @@ +/* + * Performance events x86 architecture header + * + * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> + * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2009 Jaswinder Singh Rajput + * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com> + * Copyright (C) 2009 Google, Inc., Stephane Eranian + * + * For licencing details see kernel-base/COPYING + */ + +#include <linux/perf_event.h> + +#if 0 +#undef 
wrmsrl +#define wrmsrl(msr, val) \ +do { \ + unsigned int _msr = (msr); \ + u64 _val = (val); \ + trace_printk("wrmsrl(%x, %Lx)\n", (unsigned int)(_msr), \ + (unsigned long long)(_val)); \ + native_write_msr((_msr), (u32)(_val), (u32)(_val >> 32)); \ +} while (0) +#endif + +/* + * | NHM/WSM | SNB | + * register ------------------------------- + * | HT | no HT | HT | no HT | + *----------------------------------------- + * offcore | core | core | cpu | core | + * lbr_sel | core | core | cpu | core | + * ld_lat | cpu | core | cpu | core | + *----------------------------------------- + * + * Given that there is a small number of shared regs, + * we can pre-allocate their slot in the per-cpu + * per-core reg tables. + */ +enum extra_reg_type { + EXTRA_REG_NONE = -1, /* not used */ + + EXTRA_REG_RSP_0 = 0, /* offcore_response_0 */ + EXTRA_REG_RSP_1 = 1, /* offcore_response_1 */ + EXTRA_REG_LBR = 2, /* lbr_select */ + EXTRA_REG_LDLAT = 3, /* ld_lat_threshold */ + + EXTRA_REG_MAX /* number of entries needed */ +}; + +struct event_constraint { + union { + unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + u64 idxmsk64; + }; + u64 code; + u64 cmask; + int weight; + int overlap; + int flags; +}; +/* + * struct hw_perf_event.flags flags + */ +#define PERF_X86_EVENT_PEBS_LDLAT 0x1 /* ld+ldlat data address sampling */ +#define PERF_X86_EVENT_PEBS_ST 0x2 /* st data address sampling */ +#define PERF_X86_EVENT_PEBS_ST_HSW 0x4 /* haswell style st data sampling */ +#define PERF_X86_EVENT_COMMITTED 0x8 /* event passed commit_txn */ + +struct amd_nb { + int nb_id; /* NorthBridge id */ + int refcnt; /* reference count */ + struct perf_event *owners[X86_PMC_IDX_MAX]; + struct event_constraint event_constraints[X86_PMC_IDX_MAX]; +}; + +/* The maximal number of PEBS events: */ +#define MAX_PEBS_EVENTS 8 + +/* + * A debug store configuration. + * + * We only support architectures that use 64bit fields. + */ +struct debug_store { + u64 bts_buffer_base; + u64 bts_index; + u64 bts_absolute_maximum; + u64 bts_interrupt_threshold; + u64 pebs_buffer_base; + u64 pebs_index; + u64 pebs_absolute_maximum; + u64 pebs_interrupt_threshold; + u64 pebs_event_reset[MAX_PEBS_EVENTS]; +}; + +/* + * Per register state. + */ +struct er_account { + raw_spinlock_t lock; /* per-core: protect structure */ + u64 config; /* extra MSR config */ + u64 reg; /* extra MSR number */ + atomic_t ref; /* reference count */ +}; + +/* + * Per core/cpu state + * + * Used to coordinate shared registers between HT threads or + * among events on a single PMU. 
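+ *
+ * Example (illustrative): on NHM with HT, the offcore_response_0
+ * MSR is shared by both siblings; each user takes a reference on
+ * the er_account for EXTRA_REG_RSP_0, and a second event can only
+ * attach if it programs the same extra MSR value.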
+ */
+struct intel_shared_regs {
+ struct er_account regs[EXTRA_REG_MAX];
+ int refcnt; /* per-core: #HT threads */
+ unsigned core_id; /* per-core: core id */
+};
+
+#define MAX_LBR_ENTRIES 16
+
+struct cpu_hw_events {
+ /*
+ * Generic x86 PMC bits
+ */
+ struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */
+ unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+ unsigned long running[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+ int enabled;
+
+ int n_events; /* the # of events in the below arrays */
+ int n_added; /* the # last events in the below arrays;
+ they've never been enabled yet */
+ int n_txn; /* the # last events in the below arrays;
+ added in the current transaction */
+ int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
+ u64 tags[X86_PMC_IDX_MAX];
+ struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
+
+ unsigned int group_flag;
+ int is_fake;
+
+ /*
+ * Intel DebugStore bits
+ */
+ struct debug_store *ds;
+ u64 pebs_enabled;
+
+ /*
+ * Intel LBR bits
+ */
+ int lbr_users;
+ void *lbr_context;
+ struct perf_branch_stack lbr_stack;
+ struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES];
+ struct er_account *lbr_sel;
+ u64 br_sel;
+
+ /*
+ * Intel host/guest exclude bits
+ */
+ u64 intel_ctrl_guest_mask;
+ u64 intel_ctrl_host_mask;
+ struct perf_guest_switch_msr guest_switch_msrs[X86_PMC_IDX_MAX];
+
+ /*
+ * Intel checkpoint mask
+ */
+ u64 intel_cp_status;
+
+ /*
+ * manage shared (per-core, per-cpu) registers
+ * used on Intel NHM/WSM/SNB
+ */
+ struct intel_shared_regs *shared_regs;
+
+ /*
+ * AMD specific bits
+ */
+ struct amd_nb *amd_nb;
+ /* Inverted mask of bits to clear in the perf_ctr ctrl registers */
+ u64 perf_ctr_virt_mask;
+
+ void *kfree_on_online;
+};
+
+#define __EVENT_CONSTRAINT(c, n, m, w, o, f) {\
+ { .idxmsk64 = (n) }, \
+ .code = (c), \
+ .cmask = (m), \
+ .weight = (w), \
+ .overlap = (o), \
+ .flags = f, \
+}
+
+#define EVENT_CONSTRAINT(c, n, m) \
+ __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 0, 0)
+
+/*
+ * The overlap flag marks event constraints with overlapping counter
+ * masks. This is the case if the counter mask of such an event is not
+ * a subset of any other counter mask of a constraint with an equal or
+ * higher weight, e.g.:
+ *
+ * c_overlaps = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0);
+ * c_another1 = EVENT_CONSTRAINT(0, 0x07, 0);
+ * c_another2 = EVENT_CONSTRAINT(0, 0x38, 0);
+ *
+ * The event scheduler may not select the correct counter in the first
+ * cycle because it needs to know which subsequent events will be
+ * scheduled. It may fail to schedule the events then. So we set the
+ * overlap flag for such constraints to give the scheduler a hint which
+ * events to select for counter rescheduling.
+ *
+ * Care must be taken as the rescheduling algorithm is O(n!) which
+ * will increase scheduling cycles for an over-committed system
+ * dramatically. The number of such EVENT_CONSTRAINT_OVERLAP() macros
+ * and their counter masks must be kept at a minimum.
+ */
+#define EVENT_CONSTRAINT_OVERLAP(c, n, m) \
+ __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 1, 0)
+
+/*
+ * Constraint on the Event code.
+ */
+#define INTEL_EVENT_CONSTRAINT(c, n) \
+ EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT)
+
+/*
+ * Constraint on the Event code + UMask + fixed-mask
+ *
+ * filter mask to validate fixed counter events.
+ * the following filters disqualify for fixed counters:
+ * - inv
+ * - edge
+ * - cnt-mask
+ * - in_tx
+ * - in_tx_checkpointed
+ * The other filters are supported by fixed counters.
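+ *
+ * e.g. FIXED_EVENT_CONSTRAINT(0x00c0, 0) (defined below) allows the
+ * inst_retired.any event only on fixed counter 0, i.e. bit 32 + 0
+ * in the counter bitmask.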
+ * The any-thread option is supported starting with v3. + */ +#define FIXED_EVENT_FLAGS (X86_RAW_EVENT_MASK|HSW_IN_TX|HSW_IN_TX_CHECKPOINTED) +#define FIXED_EVENT_CONSTRAINT(c, n) \ + EVENT_CONSTRAINT(c, (1ULL << (32+n)), FIXED_EVENT_FLAGS) + +/* + * Constraint on the Event code + UMask + */ +#define INTEL_UEVENT_CONSTRAINT(c, n) \ + EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK) + +#define INTEL_PLD_CONSTRAINT(c, n) \ + __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \ + HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LDLAT) + +#define INTEL_PST_CONSTRAINT(c, n) \ + __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \ + HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST) + +/* DataLA version of store sampling without extra enable bit. */ +#define INTEL_PST_HSW_CONSTRAINT(c, n) \ + __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \ + HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW) + +/* + * We define the end marker as having a weight of -1 + * to enable blacklisting of events using a counter bitmask + * of zero and thus a weight of zero. + * The end marker has a weight that cannot possibly be + * obtained from counting the bits in the bitmask. + */ +#define EVENT_CONSTRAINT_END { .weight = -1 } + +/* + * Check for end marker with weight == -1 + */ +#define for_each_event_constraint(e, c) \ + for ((e) = (c); (e)->weight != -1; (e)++) + +/* + * Extra registers for specific events. + * + * Some events need large masks and require external MSRs. + * Those extra MSRs end up being shared for all events on + * a PMU and sometimes between PMU of sibling HT threads. + * In either case, the kernel needs to handle conflicting + * accesses to those extra, shared, regs. The data structure + * to manage those registers is stored in cpu_hw_event. + */ +struct extra_reg { + unsigned int event; + unsigned int msr; + u64 config_mask; + u64 valid_mask; + int idx; /* per_xxx->regs[] reg index */ + bool extra_msr_access; +}; + +#define EVENT_EXTRA_REG(e, ms, m, vm, i) { \ + .event = (e), \ + .msr = (ms), \ + .config_mask = (m), \ + .valid_mask = (vm), \ + .idx = EXTRA_REG_##i, \ + .extra_msr_access = true, \ + } + +#define INTEL_EVENT_EXTRA_REG(event, msr, vm, idx) \ + EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm, idx) + +#define INTEL_UEVENT_EXTRA_REG(event, msr, vm, idx) \ + EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT | \ + ARCH_PERFMON_EVENTSEL_UMASK, vm, idx) + +#define INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(c) \ + INTEL_UEVENT_EXTRA_REG(c, \ + MSR_PEBS_LD_LAT_THRESHOLD, \ + 0xffff, \ + LDLAT) + +#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0, RSP_0) + +union perf_capabilities { + struct { + u64 lbr_format:6; + u64 pebs_trap:1; + u64 pebs_arch_reg:1; + u64 pebs_format:4; + u64 smm_freeze:1; + /* + * PMU supports separate counter range for writing + * values > 32bit. + */ + u64 full_width_write:1; + }; + u64 capabilities; +}; + +struct x86_pmu_quirk { + struct x86_pmu_quirk *next; + void (*func)(void); +}; + +union x86_pmu_config { + struct { + u64 event:8, + umask:8, + usr:1, + os:1, + edge:1, + pc:1, + interrupt:1, + __reserved1:1, + en:1, + inv:1, + cmask:8, + event2:4, + __reserved2:4, + go:1, + ho:1; + } bits; + u64 value; +}; + +#define X86_CONFIG(args...) 
((union x86_pmu_config){.bits = {args}}).value + +/* + * struct x86_pmu - generic x86 pmu + */ +struct x86_pmu { + /* + * Generic x86 PMC bits + */ + const char *name; + int version; + int (*handle_irq)(struct pt_regs *); + void (*disable_all)(void); + void (*enable_all)(int added); + void (*enable)(struct perf_event *); + void (*disable)(struct perf_event *); + int (*hw_config)(struct perf_event *event); + int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign); + unsigned eventsel; + unsigned perfctr; + int (*addr_offset)(int index, bool eventsel); + int (*rdpmc_index)(int index); + u64 (*event_map)(int); + int max_events; + int num_counters; + int num_counters_fixed; + int cntval_bits; + u64 cntval_mask; + union { + unsigned long events_maskl; + unsigned long events_mask[BITS_TO_LONGS(ARCH_PERFMON_EVENTS_COUNT)]; + }; + int events_mask_len; + int apic; + u64 max_period; + struct event_constraint * + (*get_event_constraints)(struct cpu_hw_events *cpuc, + struct perf_event *event); + + void (*put_event_constraints)(struct cpu_hw_events *cpuc, + struct perf_event *event); + struct event_constraint *event_constraints; + struct x86_pmu_quirk *quirks; + int perfctr_second_write; + bool late_ack; + + /* + * sysfs attrs + */ + int attr_rdpmc_broken; + int attr_rdpmc; + struct attribute **format_attrs; + struct attribute **event_attrs; + + ssize_t (*events_sysfs_show)(char *page, u64 config); + struct attribute **cpu_events; + + /* + * CPU Hotplug hooks + */ + int (*cpu_prepare)(int cpu); + void (*cpu_starting)(int cpu); + void (*cpu_dying)(int cpu); + void (*cpu_dead)(int cpu); + + void (*check_microcode)(void); + void (*flush_branch_stack)(void); + + /* + * Intel Arch Perfmon v2+ + */ + u64 intel_ctrl; + union perf_capabilities intel_cap; + + /* + * Intel DebugStore bits + */ + unsigned int bts :1, + bts_active :1, + pebs :1, + pebs_active :1, + pebs_broken :1; + int pebs_record_size; + void (*drain_pebs)(struct pt_regs *regs); + struct event_constraint *pebs_constraints; + void (*pebs_aliases)(struct perf_event *event); + int max_pebs_events; + + /* + * Intel LBR + */ + unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */ + int lbr_nr; /* hardware stack size */ + u64 lbr_sel_mask; /* LBR_SELECT valid bits */ + const int *lbr_sel_map; /* lbr_select mappings */ + bool lbr_double_abort; /* duplicated lbr aborts */ + + /* + * Extra registers for events + */ + struct extra_reg *extra_regs; + unsigned int er_flags; + + /* + * Intel host/guest support (KVM) + */ + struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr); +}; + +#define x86_add_quirk(func_) \ +do { \ + static struct x86_pmu_quirk __quirk __initdata = { \ + .func = func_, \ + }; \ + __quirk.next = x86_pmu.quirks; \ + x86_pmu.quirks = &__quirk; \ +} while (0) + +#define ERF_NO_HT_SHARING 1 +#define ERF_HAS_RSP_1 2 + +#define EVENT_VAR(_id) event_attr_##_id +#define EVENT_PTR(_id) &event_attr_##_id.attr.attr + +#define EVENT_ATTR(_name, _id) \ +static struct perf_pmu_events_attr EVENT_VAR(_id) = { \ + .attr = __ATTR(_name, 0444, events_sysfs_show, NULL), \ + .id = PERF_COUNT_HW_##_id, \ + .event_str = NULL, \ +}; + +#define EVENT_ATTR_STR(_name, v, str) \ +static struct perf_pmu_events_attr event_attr_##v = { \ + .attr = __ATTR(_name, 0444, events_sysfs_show, NULL), \ + .id = 0, \ + .event_str = str, \ +}; + +extern struct x86_pmu x86_pmu __read_mostly; + +DECLARE_PER_CPU(struct cpu_hw_events, cpu_hw_events); + +int x86_perf_event_set_period(struct perf_event *event); + +/* + * Generalized hw caching related 
hw_event table, filled + * in on a per model basis. A value of 0 means + * 'not supported', -1 means 'hw_event makes no sense on + * this CPU', any other value means the raw hw_event + * ID. + */ + +#define C(x) PERF_COUNT_HW_CACHE_##x + +extern u64 __read_mostly hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX]; +extern u64 __read_mostly hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX]; + +u64 x86_perf_event_update(struct perf_event *event); + +static inline unsigned int x86_pmu_config_addr(int index) +{ + return x86_pmu.eventsel + (x86_pmu.addr_offset ? + x86_pmu.addr_offset(index, true) : index); +} + +static inline unsigned int x86_pmu_event_addr(int index) +{ + return x86_pmu.perfctr + (x86_pmu.addr_offset ? + x86_pmu.addr_offset(index, false) : index); +} + +static inline int x86_pmu_rdpmc_index(int index) +{ + return x86_pmu.rdpmc_index ? x86_pmu.rdpmc_index(index) : index; +} + +int x86_setup_perfctr(struct perf_event *event); + +int x86_pmu_hw_config(struct perf_event *event); + +void x86_pmu_disable_all(void); + +static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, + u64 enable_mask) +{ + u64 disable_mask = __this_cpu_read(cpu_hw_events.perf_ctr_virt_mask); + + if (hwc->extra_reg.reg) + wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config); + wrmsrl(hwc->config_base, (hwc->config | enable_mask) & ~disable_mask); +} + +void x86_pmu_enable_all(int added); + +int perf_assign_events(struct perf_event **events, int n, + int wmin, int wmax, int *assign); +int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign); + +void x86_pmu_stop(struct perf_event *event, int flags); + +static inline void x86_pmu_disable_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + wrmsrl(hwc->config_base, hwc->config); +} + +void x86_pmu_enable_event(struct perf_event *event); + +int x86_pmu_handle_irq(struct pt_regs *regs); + +extern struct event_constraint emptyconstraint; + +extern struct event_constraint unconstrained; + +static inline bool kernel_ip(unsigned long ip) +{ +#ifdef CONFIG_X86_32 + return ip > PAGE_OFFSET; +#else + return (long)ip < 0; +#endif +} + +/* + * Not all PMUs provide the right context information to place the reported IP + * into full context. Specifically segment registers are typically not + * supplied. + * + * Assuming the address is a linear address (it is for IBS), we fake the CS and + * vm86 mode using the known zero-based code segment and 'fix up' the registers + * to reflect this. + * + * Intel PEBS/LBR appear to typically provide the effective address, nothing + * much we can do about that but pray and treat it like a linear address. + */ +static inline void set_linear_ip(struct pt_regs *regs, unsigned long ip) +{ + regs->cs = kernel_ip(ip) ? 
__KERNEL_CS : __USER_CS; + if (regs->flags & X86_VM_MASK) + regs->flags ^= (PERF_EFLAGS_VM | X86_VM_MASK); + regs->ip = ip; +} + +ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event); +ssize_t intel_event_sysfs_show(char *page, u64 config); + +#ifdef CONFIG_CPU_SUP_AMD + +int amd_pmu_init(void); + +#else /* CONFIG_CPU_SUP_AMD */ + +static inline int amd_pmu_init(void) +{ + return 0; +} + +#endif /* CONFIG_CPU_SUP_AMD */ + +#ifdef CONFIG_CPU_SUP_INTEL + +int intel_pmu_save_and_restart(struct perf_event *event); + +struct event_constraint * +x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event); + +struct intel_shared_regs *allocate_shared_regs(int cpu); + +int intel_pmu_init(void); + +void init_debug_store_on_cpu(int cpu); + +void fini_debug_store_on_cpu(int cpu); + +void release_ds_buffers(void); + +void reserve_ds_buffers(void); + +extern struct event_constraint bts_constraint; + +void intel_pmu_enable_bts(u64 config); + +void intel_pmu_disable_bts(void); + +int intel_pmu_drain_bts_buffer(void); + +extern struct event_constraint intel_core2_pebs_event_constraints[]; + +extern struct event_constraint intel_atom_pebs_event_constraints[]; + +extern struct event_constraint intel_slm_pebs_event_constraints[]; + +extern struct event_constraint intel_nehalem_pebs_event_constraints[]; + +extern struct event_constraint intel_westmere_pebs_event_constraints[]; + +extern struct event_constraint intel_snb_pebs_event_constraints[]; + +extern struct event_constraint intel_ivb_pebs_event_constraints[]; + +extern struct event_constraint intel_hsw_pebs_event_constraints[]; + +struct event_constraint *intel_pebs_constraints(struct perf_event *event); + +void intel_pmu_pebs_enable(struct perf_event *event); + +void intel_pmu_pebs_disable(struct perf_event *event); + +void intel_pmu_pebs_enable_all(void); + +void intel_pmu_pebs_disable_all(void); + +void intel_ds_init(void); + +void intel_pmu_lbr_reset(void); + +void intel_pmu_lbr_enable(struct perf_event *event); + +void intel_pmu_lbr_disable(struct perf_event *event); + +void intel_pmu_lbr_enable_all(void); + +void intel_pmu_lbr_disable_all(void); + +void intel_pmu_lbr_read(void); + +void intel_pmu_lbr_init_core(void); + +void intel_pmu_lbr_init_nhm(void); + +void intel_pmu_lbr_init_atom(void); + +void intel_pmu_lbr_init_snb(void); + +int intel_pmu_setup_lbr_filter(struct perf_event *event); + +int p4_pmu_init(void); + +int p6_pmu_init(void); + +int knc_pmu_init(void); + +ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, + char *page); + +#else /* CONFIG_CPU_SUP_INTEL */ + +static inline void reserve_ds_buffers(void) +{ +} + +static inline void release_ds_buffers(void) +{ +} + +static inline int intel_pmu_init(void) +{ + return 0; +} + +static inline struct intel_shared_regs *allocate_shared_regs(int cpu) +{ + return NULL; +} + +#endif /* CONFIG_CPU_SUP_INTEL */ diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c new file mode 100644 index 00000000000..beeb7cc0704 --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -0,0 +1,728 @@ +#include <linux/perf_event.h> +#include <linux/export.h> +#include <linux/types.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <asm/apicdef.h> + +#include "perf_event.h" + +static __initconst const u64 amd_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache 
Accesses */
+ [ C(RESULT_MISS) ] = 0x0141, /* Data Cache Misses */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts */
+ [ C(RESULT_MISS) ] = 0x0167, /* Data Prefetcher :cancelled */
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */
+ [ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */
+ [ C(RESULT_MISS) ] = 0x037E, /* L2 Cache Misses : IC+DC */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback */
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
+ [ C(RESULT_MISS) ] = 0x0746, /* L1_DTLB_AND_L2_DTLB_MISS.ALL */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fetches */
+ [ C(RESULT_MISS) ] = 0x0385, /* L1_ITLB_AND_L2_ITLB_MISS.ALL */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. */
+ [ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0xb8e9, /* CPU Request to Memory, l+r */
+ [ C(RESULT_MISS) ] = 0x98e9, /* CPU Request to Memory, r */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+};
+
+/*
+ * AMD Performance Monitor K7 and later.
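+ *
+ * e.g. amd_pmu_event_map(PERF_COUNT_HW_CPU_CYCLES) below yields
+ * 0x0076, the CPU-clocks-not-halted event code on these CPUs.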
+ */ +static const u64 amd_perfmon_event_map[] = +{ + [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080, + [PERF_COUNT_HW_CACHE_MISSES] = 0x0081, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3, + [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00d0, /* "Decoder empty" event */ + [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x00d1, /* "Dispatch stalls" event */ +}; + +static u64 amd_pmu_event_map(int hw_event) +{ + return amd_perfmon_event_map[hw_event]; +} + +/* + * Previously calculated offsets + */ +static unsigned int event_offsets[X86_PMC_IDX_MAX] __read_mostly; +static unsigned int count_offsets[X86_PMC_IDX_MAX] __read_mostly; + +/* + * Legacy CPUs: + * 4 counters starting at 0xc0010000 each offset by 1 + * + * CPUs with core performance counter extensions: + * 6 counters starting at 0xc0010200 each offset by 2 + */ +static inline int amd_pmu_addr_offset(int index, bool eventsel) +{ + int offset; + + if (!index) + return index; + + if (eventsel) + offset = event_offsets[index]; + else + offset = count_offsets[index]; + + if (offset) + return offset; + + if (!cpu_has_perfctr_core) + offset = index; + else + offset = index << 1; + + if (eventsel) + event_offsets[index] = offset; + else + count_offsets[index] = offset; + + return offset; +} + +static int amd_core_hw_config(struct perf_event *event) +{ + if (event->attr.exclude_host && event->attr.exclude_guest) + /* + * When HO == GO == 1 the hardware treats that as GO == HO == 0 + * and will count in both modes. We don't want to count in that + * case so we emulate no-counting by setting US = OS = 0. + */ + event->hw.config &= ~(ARCH_PERFMON_EVENTSEL_USR | + ARCH_PERFMON_EVENTSEL_OS); + else if (event->attr.exclude_host) + event->hw.config |= AMD64_EVENTSEL_GUESTONLY; + else if (event->attr.exclude_guest) + event->hw.config |= AMD64_EVENTSEL_HOSTONLY; + + return 0; +} + +/* + * AMD64 events are detected based on their event codes. + */ +static inline unsigned int amd_get_event_code(struct hw_perf_event *hwc) +{ + return ((hwc->config >> 24) & 0x0f00) | (hwc->config & 0x00ff); +} + +static inline int amd_is_nb_event(struct hw_perf_event *hwc) +{ + return (hwc->config & 0xe0) == 0xe0; +} + +static inline int amd_has_nb(struct cpu_hw_events *cpuc) +{ + struct amd_nb *nb = cpuc->amd_nb; + + return nb && nb->nb_id != -1; +} + +static int amd_pmu_hw_config(struct perf_event *event) +{ + int ret; + + /* pass precise event sampling to ibs: */ + if (event->attr.precise_ip && get_ibs_caps()) + return -ENOENT; + + if (has_branch_stack(event)) + return -EOPNOTSUPP; + + ret = x86_pmu_hw_config(event); + if (ret) + return ret; + + if (event->attr.type == PERF_TYPE_RAW) + event->hw.config |= event->attr.config & AMD64_RAW_EVENT_MASK; + + return amd_core_hw_config(event); +} + +static void __amd_put_nb_event_constraints(struct cpu_hw_events *cpuc, + struct perf_event *event) +{ + struct amd_nb *nb = cpuc->amd_nb; + int i; + + /* + * need to scan whole list because event may not have + * been assigned during scheduling + * + * no race condition possible because event can only + * be removed on one CPU at a time AND PMU is disabled + * when we come here + */ + for (i = 0; i < x86_pmu.num_counters; i++) { + if (cmpxchg(nb->owners + i, event, NULL) == event) + break; + } +} + + /* + * AMD64 NorthBridge events need special treatment because + * counter access needs to be synchronized across all cores + * of a package. 
Refer to BKDG section 3.12
+ *
+ * NB events are events measuring L3 cache, Hypertransport
+ * traffic. They are identified by an event code >= 0xe00.
+ * They measure events on the NorthBridge which is shared
+ * by all cores on a package. NB events are counted on a
+ * shared set of counters. When a NB event is programmed
+ * in a counter, the data actually comes from a shared
+ * counter. Thus, access to those counters needs to be
+ * synchronized.
+ *
+ * We implement the synchronization such that no two cores
+ * can be measuring NB events using the same counters. Thus,
+ * we maintain a per-NB allocation table. The available slot
+ * is propagated using the event_constraint structure.
+ *
+ * We provide only one choice for each NB event based on
+ * the fact that only NB events have restrictions. Consequently,
+ * if a counter is available, there is a guarantee the NB event
+ * will be assigned to it. If no slot is available, an empty
+ * constraint is returned and scheduling will eventually fail
+ * for this event.
+ *
+ * Note that all cores attached to the same NB compete for the same
+ * counters to host NB events; this is why we use atomic ops. Some
+ * multi-chip CPUs may have more than one NB.
+ *
+ * Given that resources are allocated (cmpxchg), they must be
+ * eventually freed for others to use. This is accomplished by
+ * calling __amd_put_nb_event_constraints().
+ *
+ * Non-NB events are not impacted by this restriction.
+ */
+static struct event_constraint *
+__amd_get_nb_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
+ struct event_constraint *c)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct amd_nb *nb = cpuc->amd_nb;
+ struct perf_event *old;
+ int idx, new = -1;
+
+ if (!c)
+ c = &unconstrained;
+
+ if (cpuc->is_fake)
+ return c;
+
+ /*
+ * detect if already present, if so reuse
+ *
+ * cannot merge with actual allocation
+ * because of possible holes
+ *
+ * event can already be present yet not assigned (in hwc->idx)
+ * because of successive calls to x86_schedule_events() from
+ * hw_perf_group_sched_in() without hw_perf_enable()
+ */
+ for_each_set_bit(idx, c->idxmsk, x86_pmu.num_counters) {
+ if (new == -1 || hwc->idx == idx)
+ /* assign free slot, prefer hwc->idx */
+ old = cmpxchg(nb->owners + idx, NULL, event);
+ else if (nb->owners[idx] == event)
+ /* event already present */
+ old = event;
+ else
+ continue;
+
+ if (old && old != event)
+ continue;
+
+ /* reassign to this slot */
+ if (new != -1)
+ cmpxchg(nb->owners + new, event, NULL);
+ new = idx;
+
+ /* already present, reuse */
+ if (old == event)
+ break;
+ }
+
+ if (new == -1)
+ return &emptyconstraint;
+
+ return &nb->event_constraints[new];
+}
+
+static struct amd_nb *amd_alloc_nb(int cpu)
+{
+ struct amd_nb *nb;
+ int i;
+
+ nb = kzalloc_node(sizeof(struct amd_nb), GFP_KERNEL, cpu_to_node(cpu));
+ if (!nb)
+ return NULL;
+
+ nb->nb_id = -1;
+
+ /*
+ * initialize all possible NB constraints
+ */
+ for (i = 0; i < x86_pmu.num_counters; i++) {
+ __set_bit(i, nb->event_constraints[i].idxmsk);
+ nb->event_constraints[i].weight = 1;
+ }
+ return nb;
+}
+
+static int amd_pmu_cpu_prepare(int cpu)
+{
+ struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+
+ WARN_ON_ONCE(cpuc->amd_nb);
+
+ if (boot_cpu_data.x86_max_cores < 2)
+ return NOTIFY_OK;
+
+ cpuc->amd_nb = amd_alloc_nb(cpu);
+ if (!cpuc->amd_nb)
+ return NOTIFY_BAD;
+
+ return NOTIFY_OK;
+}
+
+static void amd_pmu_cpu_starting(int cpu)
+{
+ struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+ struct amd_nb
*nb; + int i, nb_id; + + cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY; + + if (boot_cpu_data.x86_max_cores < 2) + return; + + nb_id = amd_get_nb_id(cpu); + WARN_ON_ONCE(nb_id == BAD_APICID); + + for_each_online_cpu(i) { + nb = per_cpu(cpu_hw_events, i).amd_nb; + if (WARN_ON_ONCE(!nb)) + continue; + + if (nb->nb_id == nb_id) { + cpuc->kfree_on_online = cpuc->amd_nb; + cpuc->amd_nb = nb; + break; + } + } + + cpuc->amd_nb->nb_id = nb_id; + cpuc->amd_nb->refcnt++; +} + +static void amd_pmu_cpu_dead(int cpu) +{ + struct cpu_hw_events *cpuhw; + + if (boot_cpu_data.x86_max_cores < 2) + return; + + cpuhw = &per_cpu(cpu_hw_events, cpu); + + if (cpuhw->amd_nb) { + struct amd_nb *nb = cpuhw->amd_nb; + + if (nb->nb_id == -1 || --nb->refcnt == 0) + kfree(nb); + + cpuhw->amd_nb = NULL; + } +} + +static struct event_constraint * +amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) +{ + /* + * if not NB event or no NB, then no constraints + */ + if (!(amd_has_nb(cpuc) && amd_is_nb_event(&event->hw))) + return &unconstrained; + + return __amd_get_nb_event_constraints(cpuc, event, NULL); +} + +static void amd_put_event_constraints(struct cpu_hw_events *cpuc, + struct perf_event *event) +{ + if (amd_has_nb(cpuc) && amd_is_nb_event(&event->hw)) + __amd_put_nb_event_constraints(cpuc, event); +} + +PMU_FORMAT_ATTR(event, "config:0-7,32-35"); +PMU_FORMAT_ATTR(umask, "config:8-15" ); +PMU_FORMAT_ATTR(edge, "config:18" ); +PMU_FORMAT_ATTR(inv, "config:23" ); +PMU_FORMAT_ATTR(cmask, "config:24-31" ); + +static struct attribute *amd_format_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_cmask.attr, + NULL, +}; + +/* AMD Family 15h */ + +#define AMD_EVENT_TYPE_MASK 0x000000F0ULL + +#define AMD_EVENT_FP 0x00000000ULL ... 0x00000010ULL +#define AMD_EVENT_LS 0x00000020ULL ... 0x00000030ULL +#define AMD_EVENT_DC 0x00000040ULL ... 0x00000050ULL +#define AMD_EVENT_CU 0x00000060ULL ... 0x00000070ULL +#define AMD_EVENT_IC_DE 0x00000080ULL ... 0x00000090ULL +#define AMD_EVENT_EX_LS 0x000000C0ULL +#define AMD_EVENT_DE 0x000000D0ULL +#define AMD_EVENT_NB 0x000000E0ULL ... 
0x000000F0ULL + +/* + * AMD family 15h event code/PMC mappings: + * + * type = event_code & 0x0F0: + * + * 0x000 FP PERF_CTL[5:3] + * 0x010 FP PERF_CTL[5:3] + * 0x020 LS PERF_CTL[5:0] + * 0x030 LS PERF_CTL[5:0] + * 0x040 DC PERF_CTL[5:0] + * 0x050 DC PERF_CTL[5:0] + * 0x060 CU PERF_CTL[2:0] + * 0x070 CU PERF_CTL[2:0] + * 0x080 IC/DE PERF_CTL[2:0] + * 0x090 IC/DE PERF_CTL[2:0] + * 0x0A0 --- + * 0x0B0 --- + * 0x0C0 EX/LS PERF_CTL[5:0] + * 0x0D0 DE PERF_CTL[2:0] + * 0x0E0 NB NB_PERF_CTL[3:0] + * 0x0F0 NB NB_PERF_CTL[3:0] + * + * Exceptions: + * + * 0x000 FP PERF_CTL[3], PERF_CTL[5:3] (*) + * 0x003 FP PERF_CTL[3] + * 0x004 FP PERF_CTL[3], PERF_CTL[5:3] (*) + * 0x00B FP PERF_CTL[3] + * 0x00D FP PERF_CTL[3] + * 0x023 DE PERF_CTL[2:0] + * 0x02D LS PERF_CTL[3] + * 0x02E LS PERF_CTL[3,0] + * 0x031 LS PERF_CTL[2:0] (**) + * 0x043 CU PERF_CTL[2:0] + * 0x045 CU PERF_CTL[2:0] + * 0x046 CU PERF_CTL[2:0] + * 0x054 CU PERF_CTL[2:0] + * 0x055 CU PERF_CTL[2:0] + * 0x08F IC PERF_CTL[0] + * 0x187 DE PERF_CTL[0] + * 0x188 DE PERF_CTL[0] + * 0x0DB EX PERF_CTL[5:0] + * 0x0DC LS PERF_CTL[5:0] + * 0x0DD LS PERF_CTL[5:0] + * 0x0DE LS PERF_CTL[5:0] + * 0x0DF LS PERF_CTL[5:0] + * 0x1C0 EX PERF_CTL[5:3] + * 0x1D6 EX PERF_CTL[5:0] + * 0x1D8 EX PERF_CTL[5:0] + * + * (*) depending on the umask all FPU counters may be used + * (**) only one unitmask enabled at a time + */ + +static struct event_constraint amd_f15_PMC0 = EVENT_CONSTRAINT(0, 0x01, 0); +static struct event_constraint amd_f15_PMC20 = EVENT_CONSTRAINT(0, 0x07, 0); +static struct event_constraint amd_f15_PMC3 = EVENT_CONSTRAINT(0, 0x08, 0); +static struct event_constraint amd_f15_PMC30 = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0); +static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0); +static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0); + +static struct event_constraint * +amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + unsigned int event_code = amd_get_event_code(hwc); + + switch (event_code & AMD_EVENT_TYPE_MASK) { + case AMD_EVENT_FP: + switch (event_code) { + case 0x000: + if (!(hwc->config & 0x0000F000ULL)) + break; + if (!(hwc->config & 0x00000F00ULL)) + break; + return &amd_f15_PMC3; + case 0x004: + if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1) + break; + return &amd_f15_PMC3; + case 0x003: + case 0x00B: + case 0x00D: + return &amd_f15_PMC3; + } + return &amd_f15_PMC53; + case AMD_EVENT_LS: + case AMD_EVENT_DC: + case AMD_EVENT_EX_LS: + switch (event_code) { + case 0x023: + case 0x043: + case 0x045: + case 0x046: + case 0x054: + case 0x055: + return &amd_f15_PMC20; + case 0x02D: + return &amd_f15_PMC3; + case 0x02E: + return &amd_f15_PMC30; + case 0x031: + if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1) + return &amd_f15_PMC20; + return &emptyconstraint; + case 0x1C0: + return &amd_f15_PMC53; + default: + return &amd_f15_PMC50; + } + case AMD_EVENT_CU: + case AMD_EVENT_IC_DE: + case AMD_EVENT_DE: + switch (event_code) { + case 0x08F: + case 0x187: + case 0x188: + return &amd_f15_PMC0; + case 0x0DB ... 
0x0DF:
+ case 0x1D6:
+ case 0x1D8:
+ return &amd_f15_PMC50;
+ default:
+ return &amd_f15_PMC20;
+ }
+ case AMD_EVENT_NB:
+ /* moved to perf_event_amd_uncore.c */
+ return &emptyconstraint;
+ default:
+ return &emptyconstraint;
+ }
+}
+
+static ssize_t amd_event_sysfs_show(char *page, u64 config)
+{
+ u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT) |
+ (config & AMD64_EVENTSEL_EVENT) >> 24;
+
+ return x86_event_sysfs_show(page, config, event);
+}
+
+static __initconst const struct x86_pmu amd_pmu = {
+ .name = "AMD",
+ .handle_irq = x86_pmu_handle_irq,
+ .disable_all = x86_pmu_disable_all,
+ .enable_all = x86_pmu_enable_all,
+ .enable = x86_pmu_enable_event,
+ .disable = x86_pmu_disable_event,
+ .hw_config = amd_pmu_hw_config,
+ .schedule_events = x86_schedule_events,
+ .eventsel = MSR_K7_EVNTSEL0,
+ .perfctr = MSR_K7_PERFCTR0,
+ .addr_offset = amd_pmu_addr_offset,
+ .event_map = amd_pmu_event_map,
+ .max_events = ARRAY_SIZE(amd_perfmon_event_map),
+ .num_counters = AMD64_NUM_COUNTERS,
+ .cntval_bits = 48,
+ .cntval_mask = (1ULL << 48) - 1,
+ .apic = 1,
+ /* use highest bit to detect overflow */
+ .max_period = (1ULL << 47) - 1,
+ .get_event_constraints = amd_get_event_constraints,
+ .put_event_constraints = amd_put_event_constraints,
+
+ .format_attrs = amd_format_attr,
+ .events_sysfs_show = amd_event_sysfs_show,
+
+ .cpu_prepare = amd_pmu_cpu_prepare,
+ .cpu_starting = amd_pmu_cpu_starting,
+ .cpu_dead = amd_pmu_cpu_dead,
+};
+
+static int __init amd_core_pmu_init(void)
+{
+ if (!cpu_has_perfctr_core)
+ return 0;
+
+ switch (boot_cpu_data.x86) {
+ case 0x15:
+ pr_cont("Fam15h ");
+ x86_pmu.get_event_constraints = amd_get_event_constraints_f15h;
+ break;
+
+ default:
+ pr_err("core perfctr but no constraints; unknown hardware!\n");
+ return -ENODEV;
+ }
+
+ /*
+ * If core performance counter extensions exist, we must use
+ * MSR_F15H_PERF_CTL/MSR_F15H_PERF_CTR msrs. See also
+ * amd_pmu_addr_offset().
+ */
+ x86_pmu.eventsel = MSR_F15H_PERF_CTL;
+ x86_pmu.perfctr = MSR_F15H_PERF_CTR;
+ x86_pmu.num_counters = AMD64_NUM_COUNTERS_CORE;
+
+ pr_cont("core perfctr, ");
+ return 0;
+}
+
+__init int amd_pmu_init(void)
+{
+ int ret;
+
+ /* Performance-monitoring supported from K7 and later: */
+ if (boot_cpu_data.x86 < 6)
+ return -ENODEV;
+
+ x86_pmu = amd_pmu;
+
+ ret = amd_core_pmu_init();
+ if (ret)
+ return ret;
+
+ /* Events are common for all AMDs */
+ memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+
+ return 0;
+}
+
+void amd_pmu_enable_virt(void)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+ cpuc->perf_ctr_virt_mask = 0;
+
+ /* Reload all events */
+ x86_pmu_disable_all();
+ x86_pmu_enable_all(0);
+}
+EXPORT_SYMBOL_GPL(amd_pmu_enable_virt);
+
+void amd_pmu_disable_virt(void)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+ /*
+ * We only mask out the Host-only bit so that host-only counting works
+ * when SVM is disabled. If someone sets up a guest-only counter when
+ * SVM is disabled the Guest-only bit still gets set and the counter
+ * will not count anything.
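+ *
+ * Sketch of the mechanism (based on how __x86_pmu_enable_event()
+ * applies the mask): the event select MSR is written roughly as
+ *
+ *   wrmsrl(hwc->config_base,
+ *          (hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE)
+ *                 & ~cpuc->perf_ctr_virt_mask);
+ *
+ * so with the mask set to AMD64_EVENTSEL_HOSTONLY a host-only event
+ * still counts on bare metal, while a guest-only event keeps GO set
+ * and counts nothing.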
+ */ + cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY; + + /* Reload all events */ + x86_pmu_disable_all(); + x86_pmu_enable_all(0); +} +EXPORT_SYMBOL_GPL(amd_pmu_disable_virt); diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c new file mode 100644 index 00000000000..cbb1be3ed9e --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -0,0 +1,946 @@ +/* + * Performance events - AMD IBS + * + * Copyright (C) 2011 Advanced Micro Devices, Inc., Robert Richter + * + * For licencing details see kernel-base/COPYING + */ + +#include <linux/perf_event.h> +#include <linux/module.h> +#include <linux/pci.h> +#include <linux/ptrace.h> +#include <linux/syscore_ops.h> + +#include <asm/apic.h> + +#include "perf_event.h" + +static u32 ibs_caps; + +#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) + +#include <linux/kprobes.h> +#include <linux/hardirq.h> + +#include <asm/nmi.h> + +#define IBS_FETCH_CONFIG_MASK (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT) +#define IBS_OP_CONFIG_MASK IBS_OP_MAX_CNT + +enum ibs_states { + IBS_ENABLED = 0, + IBS_STARTED = 1, + IBS_STOPPING = 2, + + IBS_MAX_STATES, +}; + +struct cpu_perf_ibs { + struct perf_event *event; + unsigned long state[BITS_TO_LONGS(IBS_MAX_STATES)]; +}; + +struct perf_ibs { + struct pmu pmu; + unsigned int msr; + u64 config_mask; + u64 cnt_mask; + u64 enable_mask; + u64 valid_mask; + u64 max_period; + unsigned long offset_mask[1]; + int offset_max; + struct cpu_perf_ibs __percpu *pcpu; + + struct attribute **format_attrs; + struct attribute_group format_group; + const struct attribute_group *attr_groups[2]; + + u64 (*get_count)(u64 config); +}; + +struct perf_ibs_data { + u32 size; + union { + u32 data[0]; /* data buffer starts here */ + u32 caps; + }; + u64 regs[MSR_AMD64_IBS_REG_COUNT_MAX]; +}; + +static int +perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *hw_period) +{ + s64 left = local64_read(&hwc->period_left); + s64 period = hwc->sample_period; + int overflow = 0; + + /* + * If we are way outside a reasonable range then just skip forward: + */ + if (unlikely(left <= -period)) { + left = period; + local64_set(&hwc->period_left, left); + hwc->last_period = period; + overflow = 1; + } + + if (unlikely(left < (s64)min)) { + left += period; + local64_set(&hwc->period_left, left); + hwc->last_period = period; + overflow = 1; + } + + /* + * If the hw period that triggers the sw overflow is too short + * we might hit the irq handler. This biases the results. + * Thus we shorten the next-to-last period and set the last + * period to the max period. + */ + if (left > max) { + left -= max; + if (left > max) + left = max; + else if (left < min) + left = min; + } + + *hw_period = (u64)left; + + return overflow; +} + +static int +perf_event_try_update(struct perf_event *event, u64 new_raw_count, int width) +{ + struct hw_perf_event *hwc = &event->hw; + int shift = 64 - width; + u64 prev_raw_count; + u64 delta; + + /* + * Careful: an NMI might modify the previous event value. + * + * Our tactic to handle this is to first atomically read and + * exchange a new raw count - then add that new-prev delta + * count to the generic event atomically: + */ + prev_raw_count = local64_read(&hwc->prev_count); + if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, + new_raw_count) != prev_raw_count) + return 0; + + /* + * Now we have the new raw value and have updated the prev + * timestamp already. 
We can now calculate the elapsed delta + * (event-)time and add that to the generic event. + * + * Careful, not all hw sign-extends above the physical width + * of the count. + */ + delta = (new_raw_count << shift) - (prev_raw_count << shift); + delta >>= shift; + + local64_add(delta, &event->count); + local64_sub(delta, &hwc->period_left); + + return 1; +} + +static struct perf_ibs perf_ibs_fetch; +static struct perf_ibs perf_ibs_op; + +static struct perf_ibs *get_ibs_pmu(int type) +{ + if (perf_ibs_fetch.pmu.type == type) + return &perf_ibs_fetch; + if (perf_ibs_op.pmu.type == type) + return &perf_ibs_op; + return NULL; +} + +/* + * Use IBS for precise event sampling: + * + * perf record -a -e cpu-cycles:p ... # use ibs op counting cycle count + * perf record -a -e r076:p ... # same as -e cpu-cycles:p + * perf record -a -e r0C1:p ... # use ibs op counting micro-ops + * + * IbsOpCntCtl (bit 19) of IBS Execution Control Register (IbsOpCtl, + * MSRC001_1033) is used to select either cycle or micro-ops counting + * mode. + * + * The rip of IBS samples has skid 0. Thus, IBS supports precise + * levels 1 and 2 and the PERF_EFLAGS_EXACT is set. In rare cases the + * rip is invalid when IBS was not able to record the rip correctly. + * We clear PERF_EFLAGS_EXACT and take the rip from pt_regs then. + * + */ +static int perf_ibs_precise_event(struct perf_event *event, u64 *config) +{ + switch (event->attr.precise_ip) { + case 0: + return -ENOENT; + case 1: + case 2: + break; + default: + return -EOPNOTSUPP; + } + + switch (event->attr.type) { + case PERF_TYPE_HARDWARE: + switch (event->attr.config) { + case PERF_COUNT_HW_CPU_CYCLES: + *config = 0; + return 0; + } + break; + case PERF_TYPE_RAW: + switch (event->attr.config) { + case 0x0076: + *config = 0; + return 0; + case 0x00C1: + *config = IBS_OP_CNT_CTL; + return 0; + } + break; + default: + return -ENOENT; + } + + return -EOPNOTSUPP; +} + +static const struct perf_event_attr ibs_notsupp = { + .exclude_user = 1, + .exclude_kernel = 1, + .exclude_hv = 1, + .exclude_idle = 1, + .exclude_host = 1, + .exclude_guest = 1, +}; + +static int perf_ibs_init(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct perf_ibs *perf_ibs; + u64 max_cnt, config; + int ret; + + perf_ibs = get_ibs_pmu(event->attr.type); + if (perf_ibs) { + config = event->attr.config; + } else { + perf_ibs = &perf_ibs_op; + ret = perf_ibs_precise_event(event, &config); + if (ret) + return ret; + } + + if (event->pmu != &perf_ibs->pmu) + return -ENOENT; + + if (perf_flags(&event->attr) & perf_flags(&ibs_notsupp)) + return -EINVAL; + + if (config & ~perf_ibs->config_mask) + return -EINVAL; + + if (hwc->sample_period) { + if (config & perf_ibs->cnt_mask) + /* raw max_cnt may not be set */ + return -EINVAL; + if (!event->attr.sample_freq && hwc->sample_period & 0x0f) + /* + * lower 4 bits can not be set in ibs max cnt, + * but allowing it in case we adjust the + * sample period to set a frequency. + */ + return -EINVAL; + hwc->sample_period &= ~0x0FULL; + if (!hwc->sample_period) + hwc->sample_period = 0x10; + } else { + max_cnt = config & perf_ibs->cnt_mask; + config &= ~perf_ibs->cnt_mask; + event->attr.sample_period = max_cnt << 4; + hwc->sample_period = event->attr.sample_period; + } + + if (!hwc->sample_period) + return -EINVAL; + + /* + * If we modify hwc->sample_period, we also need to update + * hwc->last_period and hwc->period_left. 
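+ *
+ * For example (illustrative numbers): a requested sample_period of
+ * 0x1003 was trimmed to 0x1000 above because the low 4 bits of the
+ * IBS max-cnt field do not exist in hardware, and a period that
+ * masks down to zero is raised to the 0x10 minimum.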
+ */
+ hwc->last_period = hwc->sample_period;
+ local64_set(&hwc->period_left, hwc->sample_period);
+
+ hwc->config_base = perf_ibs->msr;
+ hwc->config = config;
+
+ return 0;
+}
+
+static int perf_ibs_set_period(struct perf_ibs *perf_ibs,
+ struct hw_perf_event *hwc, u64 *period)
+{
+ int overflow;
+
+ /* ignore lower 4 bits in min count: */
+ overflow = perf_event_set_period(hwc, 1<<4, perf_ibs->max_period, period);
+ local64_set(&hwc->prev_count, 0);
+
+ return overflow;
+}
+
+static u64 get_ibs_fetch_count(u64 config)
+{
+ return (config & IBS_FETCH_CNT) >> 12;
+}
+
+static u64 get_ibs_op_count(u64 config)
+{
+ u64 count = 0;
+
+ if (config & IBS_OP_VAL)
+ count += (config & IBS_OP_MAX_CNT) << 4; /* cnt rolled over */
+
+ if (ibs_caps & IBS_CAPS_RDWROPCNT)
+ count += (config & IBS_OP_CUR_CNT) >> 32;
+
+ return count;
+}
+
+static void
+perf_ibs_event_update(struct perf_ibs *perf_ibs, struct perf_event *event,
+ u64 *config)
+{
+ u64 count = perf_ibs->get_count(*config);
+
+ /*
+ * Set width to 64 since we do not overflow on max width but
+ * instead on max count. In perf_ibs_set_period() we clear
+ * prev count manually on overflow.
+ */
+ while (!perf_event_try_update(event, count, 64)) {
+ rdmsrl(event->hw.config_base, *config);
+ count = perf_ibs->get_count(*config);
+ }
+}
+
+static inline void perf_ibs_enable_event(struct perf_ibs *perf_ibs,
+ struct hw_perf_event *hwc, u64 config)
+{
+ wrmsrl(hwc->config_base, hwc->config | config | perf_ibs->enable_mask);
+}
+
+/*
+ * Erratum #420 Instruction-Based Sampling Engine May Generate
+ * Interrupt that Cannot Be Cleared:
+ *
+ * Must clear counter mask first, then clear the enable bit. See
+ * Revision Guide for AMD Family 10h Processors, Publication #41322.
+ */
+static inline void perf_ibs_disable_event(struct perf_ibs *perf_ibs,
+ struct hw_perf_event *hwc, u64 config)
+{
+ config &= ~perf_ibs->cnt_mask;
+ wrmsrl(hwc->config_base, config);
+ config &= ~perf_ibs->enable_mask;
+ wrmsrl(hwc->config_base, config);
+}
+
+/*
+ * We cannot restore the ibs pmu state, so we always need to update
+ * the event while stopping it and then reset the state when starting
+ * again. Thus, we ignore the PERF_EF_RELOAD and PERF_EF_UPDATE flags
+ * in perf_ibs_start()/perf_ibs_stop() and instead always do it.
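+ *
+ * Rough start sequence as implemented below: perf_ibs_start() picks
+ * the next period, e.g. period = 0x10000, and arms the hardware via
+ * perf_ibs_enable_event(perf_ibs, hwc, period >> 4), i.e. the
+ * hardware max-cnt field is programmed as period / 16.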
+ */ +static void perf_ibs_start(struct perf_event *event, int flags) +{ + struct hw_perf_event *hwc = &event->hw; + struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); + struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); + u64 period; + + if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED))) + return; + + WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE)); + hwc->state = 0; + + perf_ibs_set_period(perf_ibs, hwc, &period); + set_bit(IBS_STARTED, pcpu->state); + perf_ibs_enable_event(perf_ibs, hwc, period >> 4); + + perf_event_update_userpage(event); +} + +static void perf_ibs_stop(struct perf_event *event, int flags) +{ + struct hw_perf_event *hwc = &event->hw; + struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); + struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); + u64 config; + int stopping; + + stopping = test_and_clear_bit(IBS_STARTED, pcpu->state); + + if (!stopping && (hwc->state & PERF_HES_UPTODATE)) + return; + + rdmsrl(hwc->config_base, config); + + if (stopping) { + set_bit(IBS_STOPPING, pcpu->state); + perf_ibs_disable_event(perf_ibs, hwc, config); + WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); + hwc->state |= PERF_HES_STOPPED; + } + + if (hwc->state & PERF_HES_UPTODATE) + return; + + /* + * Clear valid bit to not count rollovers on update, rollovers + * are only updated in the irq handler. + */ + config &= ~perf_ibs->valid_mask; + + perf_ibs_event_update(perf_ibs, event, &config); + hwc->state |= PERF_HES_UPTODATE; +} + +static int perf_ibs_add(struct perf_event *event, int flags) +{ + struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); + struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); + + if (test_and_set_bit(IBS_ENABLED, pcpu->state)) + return -ENOSPC; + + event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + + pcpu->event = event; + + if (flags & PERF_EF_START) + perf_ibs_start(event, PERF_EF_RELOAD); + + return 0; +} + +static void perf_ibs_del(struct perf_event *event, int flags) +{ + struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); + struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); + + if (!test_and_clear_bit(IBS_ENABLED, pcpu->state)) + return; + + perf_ibs_stop(event, PERF_EF_UPDATE); + + pcpu->event = NULL; + + perf_event_update_userpage(event); +} + +static void perf_ibs_read(struct perf_event *event) { } + +PMU_FORMAT_ATTR(rand_en, "config:57"); +PMU_FORMAT_ATTR(cnt_ctl, "config:19"); + +static struct attribute *ibs_fetch_format_attrs[] = { + &format_attr_rand_en.attr, + NULL, +}; + +static struct attribute *ibs_op_format_attrs[] = { + NULL, /* &format_attr_cnt_ctl.attr if IBS_CAPS_OPCNT */ + NULL, +}; + +static struct perf_ibs perf_ibs_fetch = { + .pmu = { + .task_ctx_nr = perf_invalid_context, + + .event_init = perf_ibs_init, + .add = perf_ibs_add, + .del = perf_ibs_del, + .start = perf_ibs_start, + .stop = perf_ibs_stop, + .read = perf_ibs_read, + }, + .msr = MSR_AMD64_IBSFETCHCTL, + .config_mask = IBS_FETCH_CONFIG_MASK, + .cnt_mask = IBS_FETCH_MAX_CNT, + .enable_mask = IBS_FETCH_ENABLE, + .valid_mask = IBS_FETCH_VAL, + .max_period = IBS_FETCH_MAX_CNT << 4, + .offset_mask = { MSR_AMD64_IBSFETCH_REG_MASK }, + .offset_max = MSR_AMD64_IBSFETCH_REG_COUNT, + .format_attrs = ibs_fetch_format_attrs, + + .get_count = get_ibs_fetch_count, +}; + +static struct perf_ibs perf_ibs_op = { + .pmu = { + .task_ctx_nr = perf_invalid_context, + + .event_init = perf_ibs_init, + .add = perf_ibs_add, + .del = perf_ibs_del, + .start = perf_ibs_start, + 
.stop = perf_ibs_stop,
+ .read = perf_ibs_read,
+ },
+ .msr = MSR_AMD64_IBSOPCTL,
+ .config_mask = IBS_OP_CONFIG_MASK,
+ .cnt_mask = IBS_OP_MAX_CNT,
+ .enable_mask = IBS_OP_ENABLE,
+ .valid_mask = IBS_OP_VAL,
+ .max_period = IBS_OP_MAX_CNT << 4,
+ .offset_mask = { MSR_AMD64_IBSOP_REG_MASK },
+ .offset_max = MSR_AMD64_IBSOP_REG_COUNT,
+ .format_attrs = ibs_op_format_attrs,
+
+ .get_count = get_ibs_op_count,
+};
+
+static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
+{
+ struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
+ struct perf_event *event = pcpu->event;
+ struct hw_perf_event *hwc = &event->hw;
+ struct perf_sample_data data;
+ struct perf_raw_record raw;
+ struct pt_regs regs;
+ struct perf_ibs_data ibs_data;
+ int offset, size, check_rip, offset_max, throttle = 0;
+ unsigned int msr;
+ u64 *buf, *config, period;
+
+ if (!test_bit(IBS_STARTED, pcpu->state)) {
+ /*
+ * Catch spurious interrupts after stopping IBS: After
+ * disabling IBS there could still be incoming NMIs
+ * with samples that even have the valid bit cleared.
+ * Mark all these NMIs as handled.
+ */
+ return test_and_clear_bit(IBS_STOPPING, pcpu->state) ? 1 : 0;
+ }
+
+ msr = hwc->config_base;
+ buf = ibs_data.regs;
+ rdmsrl(msr, *buf);
+ if (!(*buf++ & perf_ibs->valid_mask))
+ return 0;
+
+ config = &ibs_data.regs[0];
+ perf_ibs_event_update(perf_ibs, event, config);
+ perf_sample_data_init(&data, 0, hwc->last_period);
+ if (!perf_ibs_set_period(perf_ibs, hwc, &period))
+ goto out; /* no sw counter overflow */
+
+ ibs_data.caps = ibs_caps;
+ size = 1;
+ offset = 1;
+ check_rip = (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_RIPINVALIDCHK));
+ if (event->attr.sample_type & PERF_SAMPLE_RAW)
+ offset_max = perf_ibs->offset_max;
+ else if (check_rip)
+ offset_max = 2;
+ else
+ offset_max = 1;
+ do {
+ rdmsrl(msr + offset, *buf++);
+ size++;
+ offset = find_next_bit(perf_ibs->offset_mask,
+ perf_ibs->offset_max,
+ offset + 1);
+ } while (offset < offset_max);
+ ibs_data.size = sizeof(u64) * size;
+
+ regs = *iregs;
+ if (check_rip && (ibs_data.regs[2] & IBS_RIP_INVALID)) {
+ regs.flags &= ~PERF_EFLAGS_EXACT;
+ } else {
+ set_linear_ip(&regs, ibs_data.regs[1]);
+ regs.flags |= PERF_EFLAGS_EXACT;
+ }
+
+ if (event->attr.sample_type & PERF_SAMPLE_RAW) {
+ raw.size = sizeof(u32) + ibs_data.size;
+ raw.data = ibs_data.data;
+ data.raw = &raw;
+ }
+
+ throttle = perf_event_overflow(event, &data, &regs);
+out:
+ if (throttle)
+ perf_ibs_disable_event(perf_ibs, hwc, *config);
+ else
+ perf_ibs_enable_event(perf_ibs, hwc, period >> 4);
+
+ perf_event_update_userpage(event);
+
+ return 1;
+}
+
+static int
+perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs)
+{
+ int handled = 0;
+
+ handled += perf_ibs_handle_irq(&perf_ibs_fetch, regs);
+ handled += perf_ibs_handle_irq(&perf_ibs_op, regs);
+
+ if (handled)
+ inc_irq_stat(apic_perf_irqs);
+
+ return handled;
+}
+NOKPROBE_SYMBOL(perf_ibs_nmi_handler);
+
+static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name)
+{
+ struct cpu_perf_ibs __percpu *pcpu;
+ int ret;
+
+ pcpu = alloc_percpu(struct cpu_perf_ibs);
+ if (!pcpu)
+ return -ENOMEM;
+
+ perf_ibs->pcpu = pcpu;
+
+ /* register attributes */
+ if (perf_ibs->format_attrs[0]) {
+ memset(&perf_ibs->format_group, 0, sizeof(perf_ibs->format_group));
+ perf_ibs->format_group.name = "format";
+ perf_ibs->format_group.attrs = perf_ibs->format_attrs;
+
+ memset(&perf_ibs->attr_groups, 0, sizeof(perf_ibs->attr_groups));
+ perf_ibs->attr_groups[0] = &perf_ibs->format_group;
+ perf_ibs->pmu.attr_groups = perf_ibs->attr_groups;
+ }
+
+ ret = perf_pmu_register(&perf_ibs->pmu, name, -1);
+ if (ret) {
+ perf_ibs->pcpu = NULL;
+ free_percpu(pcpu);
+ }
+
+ return ret;
+}
+
+static __init int perf_event_ibs_init(void)
+{
+ struct attribute **attr = ibs_op_format_attrs;
+
+ if (!ibs_caps)
+ return -ENODEV; /* ibs not supported by the cpu */
+
+ perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch");
+
+ if (ibs_caps & IBS_CAPS_OPCNT) {
+ perf_ibs_op.config_mask |= IBS_OP_CNT_CTL;
+ *attr++ = &format_attr_cnt_ctl.attr;
+ }
+ perf_ibs_pmu_init(&perf_ibs_op, "ibs_op");
+
+ register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs");
+ printk(KERN_INFO "perf: AMD IBS detected (0x%08x)\n", ibs_caps);
+
+ return 0;
+}
+
+#else /* defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) */
+
+static __init int perf_event_ibs_init(void) { return 0; }
+
+#endif
+
+/* IBS - apic initialization, for perf and oprofile */
+
+static __init u32 __get_ibs_caps(void)
+{
+ u32 caps;
+ unsigned int max_level;
+
+ if (!boot_cpu_has(X86_FEATURE_IBS))
+ return 0;
+
+ /* check IBS cpuid feature flags */
+ max_level = cpuid_eax(0x80000000);
+ if (max_level < IBS_CPUID_FEATURES)
+ return IBS_CAPS_DEFAULT;
+
+ caps = cpuid_eax(IBS_CPUID_FEATURES);
+ if (!(caps & IBS_CAPS_AVAIL))
+ /* cpuid flags not valid */
+ return IBS_CAPS_DEFAULT;
+
+ return caps;
+}
+
+u32 get_ibs_caps(void)
+{
+ return ibs_caps;
+}
+
+EXPORT_SYMBOL(get_ibs_caps);
+
+static inline int get_eilvt(int offset)
+{
+ return !setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 1);
+}
+
+static inline int put_eilvt(int offset)
+{
+ return !setup_APIC_eilvt(offset, 0, 0, 1);
+}
+
+/*
+ * Check and reserve APIC extended interrupt LVT offset for IBS if available.
+ */
+static inline int ibs_eilvt_valid(void)
+{
+ int offset;
+ u64 val;
+ int valid = 0;
+
+ preempt_disable();
+
+ rdmsrl(MSR_AMD64_IBSCTL, val);
+ offset = val & IBSCTL_LVT_OFFSET_MASK;
+
+ if (!(val & IBSCTL_LVT_OFFSET_VALID)) {
+ pr_err(FW_BUG "cpu %d, invalid IBS interrupt offset %d (MSR%08X=0x%016llx)\n",
+ smp_processor_id(), offset, MSR_AMD64_IBSCTL, val);
+ goto out;
+ }
+
+ if (!get_eilvt(offset)) {
+ pr_err(FW_BUG "cpu %d, IBS interrupt offset %d not available (MSR%08X=0x%016llx)\n",
+ smp_processor_id(), offset, MSR_AMD64_IBSCTL, val);
+ goto out;
+ }
+
+ valid = 1;
+out:
+ preempt_enable();
+
+ return valid;
+}
+
+static int setup_ibs_ctl(int ibs_eilvt_off)
+{
+ struct pci_dev *cpu_cfg;
+ int nodes;
+ u32 value = 0;
+
+ nodes = 0;
+ cpu_cfg = NULL;
+ do {
+ cpu_cfg = pci_get_device(PCI_VENDOR_ID_AMD,
+ PCI_DEVICE_ID_AMD_10H_NB_MISC,
+ cpu_cfg);
+ if (!cpu_cfg)
+ break;
+ ++nodes;
+ pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off
+ | IBSCTL_LVT_OFFSET_VALID);
+ pci_read_config_dword(cpu_cfg, IBSCTL, &value);
+ if (value != (ibs_eilvt_off | IBSCTL_LVT_OFFSET_VALID)) {
+ pci_dev_put(cpu_cfg);
+ printk(KERN_DEBUG "Failed to setup IBS LVT offset, "
+ "IBSCTL = 0x%08x\n", value);
+ return -EINVAL;
+ }
+ } while (1);
+
+ if (!nodes) {
+ printk(KERN_DEBUG "No CPU node configured for IBS\n");
+ return -ENODEV;
+ }
+
+ return 0;
+}
+
+/*
+ * This runs only on the current cpu. We try to find an LVT offset and
+ * setup the local APIC. For this we must disable preemption. On
+ * success we initialize all nodes with this offset. This then updates
+ * the offset in the per-node IBS_CTL msr. The per-core APIC setup of
+ * the IBS interrupt vector is handled by perf_ibs_cpu_notifier, which
+ * uses the new offset.
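+ *
+ * Rough sequence, as implemented below (assuming EILVT offset 1 is
+ * the first free one): get_eilvt(1) reserves the APIC EILVT entry,
+ * then setup_ibs_ctl(1) writes (1 | IBSCTL_LVT_OFFSET_VALID) into
+ * the IBSCTL register in every node's NB PCI config space.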
+ */ +static int force_ibs_eilvt_setup(void) +{ + int offset; + int ret; + + preempt_disable(); + /* find the next free available EILVT entry, skip offset 0 */ + for (offset = 1; offset < APIC_EILVT_NR_MAX; offset++) { + if (get_eilvt(offset)) + break; + } + preempt_enable(); + + if (offset == APIC_EILVT_NR_MAX) { + printk(KERN_DEBUG "No EILVT entry available\n"); + return -EBUSY; + } + + ret = setup_ibs_ctl(offset); + if (ret) + goto out; + + if (!ibs_eilvt_valid()) { + ret = -EFAULT; + goto out; + } + + pr_info("IBS: LVT offset %d assigned\n", offset); + + return 0; +out: + preempt_disable(); + put_eilvt(offset); + preempt_enable(); + return ret; +} + +static void ibs_eilvt_setup(void) +{ + /* + * Force LVT offset assignment for family 10h: The offsets are + * not assigned by the BIOS for this family, so the OS is + * responsible for doing it. If the OS assignment fails, fall + * back to BIOS settings and try to setup this. + */ + if (boot_cpu_data.x86 == 0x10) + force_ibs_eilvt_setup(); +} + +static inline int get_ibs_lvt_offset(void) +{ + u64 val; + + rdmsrl(MSR_AMD64_IBSCTL, val); + if (!(val & IBSCTL_LVT_OFFSET_VALID)) + return -EINVAL; + + return val & IBSCTL_LVT_OFFSET_MASK; +} + +static void setup_APIC_ibs(void *dummy) +{ + int offset; + + offset = get_ibs_lvt_offset(); + if (offset < 0) + goto failed; + + if (!setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 0)) + return; +failed: + pr_warn("perf: IBS APIC setup failed on cpu #%d\n", + smp_processor_id()); +} + +static void clear_APIC_ibs(void *dummy) +{ + int offset; + + offset = get_ibs_lvt_offset(); + if (offset >= 0) + setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1); +} + +#ifdef CONFIG_PM + +static int perf_ibs_suspend(void) +{ + clear_APIC_ibs(NULL); + return 0; +} + +static void perf_ibs_resume(void) +{ + ibs_eilvt_setup(); + setup_APIC_ibs(NULL); +} + +static struct syscore_ops perf_ibs_syscore_ops = { + .resume = perf_ibs_resume, + .suspend = perf_ibs_suspend, +}; + +static void perf_ibs_pm_init(void) +{ + register_syscore_ops(&perf_ibs_syscore_ops); +} + +#else + +static inline void perf_ibs_pm_init(void) { } + +#endif + +static int +perf_ibs_cpu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) +{ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_STARTING: + setup_APIC_ibs(NULL); + break; + case CPU_DYING: + clear_APIC_ibs(NULL); + break; + default: + break; + } + + return NOTIFY_OK; +} + +static __init int amd_ibs_init(void) +{ + u32 caps; + int ret = -EINVAL; + + caps = __get_ibs_caps(); + if (!caps) + return -ENODEV; /* ibs not supported by the cpu */ + + ibs_eilvt_setup(); + + if (!ibs_eilvt_valid()) + goto out; + + perf_ibs_pm_init(); + cpu_notifier_register_begin(); + ibs_caps = caps; + /* make ibs_caps visible to other cpus: */ + smp_mb(); + smp_call_function(setup_APIC_ibs, NULL, 1); + __perf_cpu_notifier(perf_ibs_cpu_notifier); + cpu_notifier_register_done(); + + ret = perf_event_ibs_init(); +out: + if (ret) + pr_err("Failed to setup IBS, %d\n", ret); + return ret; +} + +/* Since we need the pci subsystem to init ibs we can't do this earlier: */ +device_initcall(amd_ibs_init); diff --git a/arch/x86/kernel/cpu/perf_event_amd_iommu.c b/arch/x86/kernel/cpu/perf_event_amd_iommu.c new file mode 100644 index 00000000000..639d1289b1b --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_amd_iommu.c @@ -0,0 +1,502 @@ +/* + * Copyright (C) 2013 Advanced Micro Devices, Inc. 
+ * + * Author: Steven Kinney <Steven.Kinney@amd.com> + * Author: Suravee Suthikulpanit <Suraveee.Suthikulpanit@amd.com> + * + * Perf: amd_iommu - AMD IOMMU Performance Counter PMU implementation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/perf_event.h> +#include <linux/module.h> +#include <linux/cpumask.h> +#include <linux/slab.h> + +#include "perf_event.h" +#include "perf_event_amd_iommu.h" + +#define COUNTER_SHIFT 16 + +#define _GET_BANK(ev) ((u8)(ev->hw.extra_reg.reg >> 8)) +#define _GET_CNTR(ev) ((u8)(ev->hw.extra_reg.reg)) + +/* iommu pmu config masks */ +#define _GET_CSOURCE(ev) ((ev->hw.config & 0xFFULL)) +#define _GET_DEVID(ev) ((ev->hw.config >> 8) & 0xFFFFULL) +#define _GET_PASID(ev) ((ev->hw.config >> 24) & 0xFFFFULL) +#define _GET_DOMID(ev) ((ev->hw.config >> 40) & 0xFFFFULL) +#define _GET_DEVID_MASK(ev) ((ev->hw.extra_reg.config) & 0xFFFFULL) +#define _GET_PASID_MASK(ev) ((ev->hw.extra_reg.config >> 16) & 0xFFFFULL) +#define _GET_DOMID_MASK(ev) ((ev->hw.extra_reg.config >> 32) & 0xFFFFULL) + +static struct perf_amd_iommu __perf_iommu; + +struct perf_amd_iommu { + struct pmu pmu; + u8 max_banks; + u8 max_counters; + u64 cntr_assign_mask; + raw_spinlock_t lock; + const struct attribute_group *attr_groups[4]; +}; + +#define format_group attr_groups[0] +#define cpumask_group attr_groups[1] +#define events_group attr_groups[2] +#define null_group attr_groups[3] + +/*--------------------------------------------- + * sysfs format attributes + *---------------------------------------------*/ +PMU_FORMAT_ATTR(csource, "config:0-7"); +PMU_FORMAT_ATTR(devid, "config:8-23"); +PMU_FORMAT_ATTR(pasid, "config:24-39"); +PMU_FORMAT_ATTR(domid, "config:40-55"); +PMU_FORMAT_ATTR(devid_mask, "config1:0-15"); +PMU_FORMAT_ATTR(pasid_mask, "config1:16-31"); +PMU_FORMAT_ATTR(domid_mask, "config1:32-47"); + +static struct attribute *iommu_format_attrs[] = { + &format_attr_csource.attr, + &format_attr_devid.attr, + &format_attr_pasid.attr, + &format_attr_domid.attr, + &format_attr_devid_mask.attr, + &format_attr_pasid_mask.attr, + &format_attr_domid_mask.attr, + NULL, +}; + +static struct attribute_group amd_iommu_format_group = { + .name = "format", + .attrs = iommu_format_attrs, +}; + +/*--------------------------------------------- + * sysfs events attributes + *---------------------------------------------*/ +struct amd_iommu_event_desc { + struct kobj_attribute attr; + const char *event; +}; + +static ssize_t _iommu_event_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct amd_iommu_event_desc *event = + container_of(attr, struct amd_iommu_event_desc, attr); + return sprintf(buf, "%s\n", event->event); +} + +#define AMD_IOMMU_EVENT_DESC(_name, _event) \ +{ \ + .attr = __ATTR(_name, 0444, _iommu_event_show, NULL), \ + .event = _event, \ +} + +static struct amd_iommu_event_desc amd_iommu_v2_event_descs[] = { + AMD_IOMMU_EVENT_DESC(mem_pass_untrans, "csource=0x01"), + AMD_IOMMU_EVENT_DESC(mem_pass_pretrans, "csource=0x02"), + AMD_IOMMU_EVENT_DESC(mem_pass_excl, "csource=0x03"), + AMD_IOMMU_EVENT_DESC(mem_target_abort, "csource=0x04"), + AMD_IOMMU_EVENT_DESC(mem_trans_total, "csource=0x05"), + AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pte_hit, "csource=0x06"), + AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pte_mis, "csource=0x07"), + AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pde_hit, "csource=0x08"), + 
AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pde_mis, "csource=0x09"), + AMD_IOMMU_EVENT_DESC(mem_dte_hit, "csource=0x0a"), + AMD_IOMMU_EVENT_DESC(mem_dte_mis, "csource=0x0b"), + AMD_IOMMU_EVENT_DESC(page_tbl_read_tot, "csource=0x0c"), + AMD_IOMMU_EVENT_DESC(page_tbl_read_nst, "csource=0x0d"), + AMD_IOMMU_EVENT_DESC(page_tbl_read_gst, "csource=0x0e"), + AMD_IOMMU_EVENT_DESC(int_dte_hit, "csource=0x0f"), + AMD_IOMMU_EVENT_DESC(int_dte_mis, "csource=0x10"), + AMD_IOMMU_EVENT_DESC(cmd_processed, "csource=0x11"), + AMD_IOMMU_EVENT_DESC(cmd_processed_inv, "csource=0x12"), + AMD_IOMMU_EVENT_DESC(tlb_inv, "csource=0x13"), + { /* end: all zeroes */ }, +}; + +/*--------------------------------------------- + * sysfs cpumask attributes + *---------------------------------------------*/ +static cpumask_t iommu_cpumask; + +static ssize_t _iommu_cpumask_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + int n = cpulist_scnprintf(buf, PAGE_SIZE - 2, &iommu_cpumask); + buf[n++] = '\n'; + buf[n] = '\0'; + return n; +} +static DEVICE_ATTR(cpumask, S_IRUGO, _iommu_cpumask_show, NULL); + +static struct attribute *iommu_cpumask_attrs[] = { + &dev_attr_cpumask.attr, + NULL, +}; + +static struct attribute_group amd_iommu_cpumask_group = { + .attrs = iommu_cpumask_attrs, +}; + +/*---------------------------------------------*/ + +static int get_next_avail_iommu_bnk_cntr(struct perf_amd_iommu *perf_iommu) +{ + unsigned long flags; + int shift, bank, cntr, retval; + int max_banks = perf_iommu->max_banks; + int max_cntrs = perf_iommu->max_counters; + + raw_spin_lock_irqsave(&perf_iommu->lock, flags); + + for (bank = 0, shift = 0; bank < max_banks; bank++) { + for (cntr = 0; cntr < max_cntrs; cntr++) { + shift = bank + (bank*3) + cntr; + if (perf_iommu->cntr_assign_mask & (1ULL<<shift)) { + continue; + } else { + perf_iommu->cntr_assign_mask |= (1ULL<<shift); + retval = ((u16)((u16)bank<<8) | (u8)(cntr)); + goto out; + } + } + } + retval = -ENOSPC; +out: + raw_spin_unlock_irqrestore(&perf_iommu->lock, flags); + return retval; +} + +static int clear_avail_iommu_bnk_cntr(struct perf_amd_iommu *perf_iommu, + u8 bank, u8 cntr) +{ + unsigned long flags; + int max_banks, max_cntrs; + int shift = 0; + + max_banks = perf_iommu->max_banks; + max_cntrs = perf_iommu->max_counters; + + if ((bank > max_banks) || (cntr > max_cntrs)) + return -EINVAL; + + shift = bank + cntr + (bank*3); + + raw_spin_lock_irqsave(&perf_iommu->lock, flags); + perf_iommu->cntr_assign_mask &= ~(1ULL<<shift); + raw_spin_unlock_irqrestore(&perf_iommu->lock, flags); + + return 0; +} + +static int perf_iommu_event_init(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct perf_amd_iommu *perf_iommu; + u64 config, config1; + + /* test the event attr type check for PMU enumeration */ + if (event->attr.type != event->pmu->type) + return -ENOENT; + + /* + * IOMMU counters are shared across all cores. + * Therefore, it does not support per-process mode. + * Also, it does not support event sampling mode. 
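+ *
+ * The expected usage is therefore a counting, system-wide session;
+ * with the sysfs event aliases defined above, something like
+ *
+ *   perf stat -e amd_iommu/cmd_processed/ -a -- sleep 1
+ *
+ * should work (illustrative perf invocation, not taken from this
+ * file).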
+ */
+ if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK)
+ return -EINVAL;
+
+ /* IOMMU counters do not have usr/os/guest/host bits */
+ if (event->attr.exclude_user || event->attr.exclude_kernel ||
+ event->attr.exclude_host || event->attr.exclude_guest)
+ return -EINVAL;
+
+ if (event->cpu < 0)
+ return -EINVAL;
+
+ perf_iommu = &__perf_iommu;
+
+ if (event->pmu != &perf_iommu->pmu)
+ return -ENOENT;
+
+ if (perf_iommu) {
+ config = event->attr.config;
+ config1 = event->attr.config1;
+ } else {
+ return -EINVAL;
+ }
+
+ /* integrate with iommu base devid (0000), assume one iommu */
+ perf_iommu->max_banks =
+ amd_iommu_pc_get_max_banks(IOMMU_BASE_DEVID);
+ perf_iommu->max_counters =
+ amd_iommu_pc_get_max_counters(IOMMU_BASE_DEVID);
+ if ((perf_iommu->max_banks == 0) || (perf_iommu->max_counters == 0))
+ return -EINVAL;
+
+ /* update the hw_perf_event struct with the iommu config data */
+ hwc->config = config;
+ hwc->extra_reg.config = config1;
+
+ return 0;
+}
+
+static void perf_iommu_enable_event(struct perf_event *ev)
+{
+ u8 csource = _GET_CSOURCE(ev);
+ u16 devid = _GET_DEVID(ev);
+ u64 reg = 0ULL;
+
+ reg = csource;
+ amd_iommu_pc_get_set_reg_val(devid,
+ _GET_BANK(ev), _GET_CNTR(ev) ,
+ IOMMU_PC_COUNTER_SRC_REG, &reg, true);
+
+ reg = 0ULL | devid | (_GET_DEVID_MASK(ev) << 32);
+ if (reg)
+ reg |= (1UL << 31);
+ amd_iommu_pc_get_set_reg_val(devid,
+ _GET_BANK(ev), _GET_CNTR(ev) ,
+ IOMMU_PC_DEVID_MATCH_REG, &reg, true);
+
+ reg = 0ULL | _GET_PASID(ev) | (_GET_PASID_MASK(ev) << 32);
+ if (reg)
+ reg |= (1UL << 31);
+ amd_iommu_pc_get_set_reg_val(devid,
+ _GET_BANK(ev), _GET_CNTR(ev) ,
+ IOMMU_PC_PASID_MATCH_REG, &reg, true);
+
+ reg = 0ULL | _GET_DOMID(ev) | (_GET_DOMID_MASK(ev) << 32);
+ if (reg)
+ reg |= (1UL << 31);
+ amd_iommu_pc_get_set_reg_val(devid,
+ _GET_BANK(ev), _GET_CNTR(ev) ,
+ IOMMU_PC_DOMID_MATCH_REG, &reg, true);
+}
+
+static void perf_iommu_disable_event(struct perf_event *event)
+{
+ u64 reg = 0ULL;
+
+ amd_iommu_pc_get_set_reg_val(_GET_DEVID(event),
+ _GET_BANK(event), _GET_CNTR(event),
+ IOMMU_PC_COUNTER_SRC_REG, &reg, true);
+}
+
+static void perf_iommu_start(struct perf_event *event, int flags)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ pr_debug("perf: amd_iommu:perf_iommu_start\n");
+ if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
+ return;
+
+ WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
+ hwc->state = 0;
+
+ if (flags & PERF_EF_RELOAD) {
+ u64 prev_raw_count = local64_read(&hwc->prev_count);
+ amd_iommu_pc_get_set_reg_val(_GET_DEVID(event),
+ _GET_BANK(event), _GET_CNTR(event),
+ IOMMU_PC_COUNTER_REG, &prev_raw_count, true);
+ }
+
+ perf_iommu_enable_event(event);
+ perf_event_update_userpage(event);
+
+}
+
+static void perf_iommu_read(struct perf_event *event)
+{
+ u64 count = 0ULL;
+ u64 prev_raw_count = 0ULL;
+ u64 delta = 0ULL;
+ struct hw_perf_event *hwc = &event->hw;
+ pr_debug("perf: amd_iommu:perf_iommu_read\n");
+
+ amd_iommu_pc_get_set_reg_val(_GET_DEVID(event),
+ _GET_BANK(event), _GET_CNTR(event),
+ IOMMU_PC_COUNTER_REG, &count, false);
+
+ /* IOMMU pc counter register is only 48 bits */
+ count &= 0xFFFFFFFFFFFFULL;
+
+ prev_raw_count = local64_read(&hwc->prev_count);
+ if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
+ count) != prev_raw_count)
+ return;
+
+ /* Handling 48-bit counter overflowing */
+ delta = (count << COUNTER_SHIFT) - (prev_raw_count << COUNTER_SHIFT);
+ delta >>= COUNTER_SHIFT;
+ local64_add(delta, &event->count);
+
+}
+
+static void perf_iommu_stop(struct perf_event *event, int flags)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ u64 config;
+
+ pr_debug("perf: amd_iommu:perf_iommu_stop\n");
+
+ if (hwc->state & PERF_HES_UPTODATE)
+ return;
+
+ perf_iommu_disable_event(event);
+ WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+ hwc->state |= PERF_HES_STOPPED;
+
+ if (hwc->state & PERF_HES_UPTODATE)
+ return;
+
+ config = hwc->config;
+ perf_iommu_read(event);
+ hwc->state |= PERF_HES_UPTODATE;
+}
+
+static int perf_iommu_add(struct perf_event *event, int flags)
+{
+ int retval;
+ struct perf_amd_iommu *perf_iommu =
+ container_of(event->pmu, struct perf_amd_iommu, pmu);
+
+ pr_debug("perf: amd_iommu:perf_iommu_add\n");
+ event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+
+ /* request an iommu bank/counter */
+ retval = get_next_avail_iommu_bnk_cntr(perf_iommu);
+ if (retval != -ENOSPC)
+ event->hw.extra_reg.reg = (u16)retval;
+ else
+ return retval;
+
+ if (flags & PERF_EF_START)
+ perf_iommu_start(event, PERF_EF_RELOAD);
+
+ return 0;
+}
+
+static void perf_iommu_del(struct perf_event *event, int flags)
+{
+ struct perf_amd_iommu *perf_iommu =
+ container_of(event->pmu, struct perf_amd_iommu, pmu);
+
+ pr_debug("perf: amd_iommu:perf_iommu_del\n");
+ perf_iommu_stop(event, PERF_EF_UPDATE);
+
+ /* clear the assigned iommu bank/counter */
+ clear_avail_iommu_bnk_cntr(perf_iommu,
+ _GET_BANK(event),
+ _GET_CNTR(event));
+
+ perf_event_update_userpage(event);
+}
+
+static __init int _init_events_attrs(struct perf_amd_iommu *perf_iommu)
+{
+ struct attribute **attrs;
+ struct attribute_group *attr_group;
+ int i = 0, j;
+
+ while (amd_iommu_v2_event_descs[i].attr.attr.name)
+ i++;
+
+ attr_group = kzalloc(sizeof(struct attribute *)
+ * (i + 1) + sizeof(*attr_group), GFP_KERNEL);
+ if (!attr_group)
+ return -ENOMEM;
+
+ attrs = (struct attribute **)(attr_group + 1);
+ for (j = 0; j < i; j++)
+ attrs[j] = &amd_iommu_v2_event_descs[j].attr.attr;
+
+ attr_group->name = "events";
+ attr_group->attrs = attrs;
+ perf_iommu->events_group = attr_group;
+
+ return 0;
+}
+
+static __init void amd_iommu_pc_exit(void)
+{
+ if (__perf_iommu.events_group != NULL) {
+ kfree(__perf_iommu.events_group);
+ __perf_iommu.events_group = NULL;
+ }
+}
+
+static __init int _init_perf_amd_iommu(
+ struct perf_amd_iommu *perf_iommu, char *name)
+{
+ int ret;
+
+ raw_spin_lock_init(&perf_iommu->lock);
+
+ /* Init format attributes */
+ perf_iommu->format_group = &amd_iommu_format_group;
+
+ /* Init cpumask attributes to only core 0 */
+ cpumask_set_cpu(0, &iommu_cpumask);
+ perf_iommu->cpumask_group = &amd_iommu_cpumask_group;
+
+ /* Init events attributes */
+ if (_init_events_attrs(perf_iommu) != 0)
+ pr_err("perf: amd_iommu: Only support raw events.\n");
+
+ /* Init null attributes */
+ perf_iommu->null_group = NULL;
+ perf_iommu->pmu.attr_groups = perf_iommu->attr_groups;
+
+ ret = perf_pmu_register(&perf_iommu->pmu, name, -1);
+ if (ret) {
+ pr_err("perf: amd_iommu: Failed to initialize.\n");
+ amd_iommu_pc_exit();
+ } else {
+ pr_info("perf: amd_iommu: Detected. (%d banks, %d counters/bank)\n",
+ amd_iommu_pc_get_max_banks(IOMMU_BASE_DEVID),
+ amd_iommu_pc_get_max_counters(IOMMU_BASE_DEVID));
+ }
+
+ return ret;
+}
+
+static struct perf_amd_iommu __perf_iommu = {
+ .pmu = {
+ .event_init = perf_iommu_event_init,
+ .add = perf_iommu_add,
+ .del = perf_iommu_del,
+ .start = perf_iommu_start,
+ .stop = perf_iommu_stop,
+ .read = perf_iommu_read,
+ },
+ .max_banks = 0x00,
+ .max_counters = 0x00,
+ .cntr_assign_mask = 0ULL,
+ .format_group = NULL,
+ .cpumask_group = NULL,
+ .events_group = NULL,
+ .null_group = NULL,
+};
+
+static __init int amd_iommu_pc_init(void)
+{
+ /* Make sure the IOMMU PC resource is available */
+ if (!amd_iommu_pc_supported())
+ return -ENODEV;
+
+ _init_perf_amd_iommu(&__perf_iommu, "amd_iommu");
+
+ return 0;
+}
+
+device_initcall(amd_iommu_pc_init);
diff --git a/arch/x86/kernel/cpu/perf_event_amd_iommu.h b/arch/x86/kernel/cpu/perf_event_amd_iommu.h
new file mode 100644
index 00000000000..845d173278e
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_amd_iommu.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (C) 2013 Advanced Micro Devices, Inc.
+ *
+ * Author: Steven Kinney <Steven.Kinney@amd.com>
+ * Author: Suravee Suthikulpanit <Suraveee.Suthikulpanit@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _PERF_EVENT_AMD_IOMMU_H_
+#define _PERF_EVENT_AMD_IOMMU_H_
+
+/* iommu pc mmio region register indexes */
+#define IOMMU_PC_COUNTER_REG 0x00
+#define IOMMU_PC_COUNTER_SRC_REG 0x08
+#define IOMMU_PC_PASID_MATCH_REG 0x10
+#define IOMMU_PC_DOMID_MATCH_REG 0x18
+#define IOMMU_PC_DEVID_MATCH_REG 0x20
+#define IOMMU_PC_COUNTER_REPORT_REG 0x28
+
+/* maximum specified bank/counters */
+#define PC_MAX_SPEC_BNKS 64
+#define PC_MAX_SPEC_CNTRS 16
+
+/* iommu pc reg masks */
+#define IOMMU_BASE_DEVID 0x0000
+
+/* amd_iommu_init.c external support functions */
+extern bool amd_iommu_pc_supported(void);
+
+extern u8 amd_iommu_pc_get_max_banks(u16 devid);
+
+extern u8 amd_iommu_pc_get_max_counters(u16 devid);
+
+extern int amd_iommu_pc_get_set_reg_val(u16 devid, u8 bank, u8 cntr,
+ u8 fxn, u64 *value, bool is_write);
+
+#endif /*_PERF_EVENT_AMD_IOMMU_H_*/
diff --git a/arch/x86/kernel/cpu/perf_event_amd_uncore.c b/arch/x86/kernel/cpu/perf_event_amd_uncore.c
new file mode 100644
index 00000000000..3bbdf4cd38b
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_amd_uncore.c
@@ -0,0 +1,547 @@
+/*
+ * Copyright (C) 2013 Advanced Micro Devices, Inc.
+ *
+ * Author: Jacob Shin <jacob.shin@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */ + +#include <linux/perf_event.h> +#include <linux/percpu.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/init.h> +#include <linux/cpu.h> +#include <linux/cpumask.h> + +#include <asm/cpufeature.h> +#include <asm/perf_event.h> +#include <asm/msr.h> + +#define NUM_COUNTERS_NB 4 +#define NUM_COUNTERS_L2 4 +#define MAX_COUNTERS NUM_COUNTERS_NB + +#define RDPMC_BASE_NB 6 +#define RDPMC_BASE_L2 10 + +#define COUNTER_SHIFT 16 + +struct amd_uncore { + int id; + int refcnt; + int cpu; + int num_counters; + int rdpmc_base; + u32 msr_base; + cpumask_t *active_mask; + struct pmu *pmu; + struct perf_event *events[MAX_COUNTERS]; + struct amd_uncore *free_when_cpu_online; +}; + +static struct amd_uncore * __percpu *amd_uncore_nb; +static struct amd_uncore * __percpu *amd_uncore_l2; + +static struct pmu amd_nb_pmu; +static struct pmu amd_l2_pmu; + +static cpumask_t amd_nb_active_mask; +static cpumask_t amd_l2_active_mask; + +static bool is_nb_event(struct perf_event *event) +{ + return event->pmu->type == amd_nb_pmu.type; +} + +static bool is_l2_event(struct perf_event *event) +{ + return event->pmu->type == amd_l2_pmu.type; +} + +static struct amd_uncore *event_to_amd_uncore(struct perf_event *event) +{ + if (is_nb_event(event) && amd_uncore_nb) + return *per_cpu_ptr(amd_uncore_nb, event->cpu); + else if (is_l2_event(event) && amd_uncore_l2) + return *per_cpu_ptr(amd_uncore_l2, event->cpu); + + return NULL; +} + +static void amd_uncore_read(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 prev, new; + s64 delta; + + /* + * since we do not enable counter overflow interrupts, + * we do not have to worry about prev_count changing on us + */ + + prev = local64_read(&hwc->prev_count); + rdpmcl(hwc->event_base_rdpmc, new); + local64_set(&hwc->prev_count, new); + delta = (new << COUNTER_SHIFT) - (prev << COUNTER_SHIFT); + delta >>= COUNTER_SHIFT; + local64_add(delta, &event->count); +} + +static void amd_uncore_start(struct perf_event *event, int flags) +{ + struct hw_perf_event *hwc = &event->hw; + + if (flags & PERF_EF_RELOAD) + wrmsrl(hwc->event_base, (u64)local64_read(&hwc->prev_count)); + + hwc->state = 0; + wrmsrl(hwc->config_base, (hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE)); + perf_event_update_userpage(event); +} + +static void amd_uncore_stop(struct perf_event *event, int flags) +{ + struct hw_perf_event *hwc = &event->hw; + + wrmsrl(hwc->config_base, hwc->config); + hwc->state |= PERF_HES_STOPPED; + + if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { + amd_uncore_read(event); + hwc->state |= PERF_HES_UPTODATE; + } +} + +static int amd_uncore_add(struct perf_event *event, int flags) +{ + int i; + struct amd_uncore *uncore = event_to_amd_uncore(event); + struct hw_perf_event *hwc = &event->hw; + + /* are we already assigned? 
*/ + if (hwc->idx != -1 && uncore->events[hwc->idx] == event) + goto out; + + for (i = 0; i < uncore->num_counters; i++) { + if (uncore->events[i] == event) { + hwc->idx = i; + goto out; + } + } + + /* if not, take the first available counter */ + hwc->idx = -1; + for (i = 0; i < uncore->num_counters; i++) { + if (cmpxchg(&uncore->events[i], NULL, event) == NULL) { + hwc->idx = i; + break; + } + } + +out: + if (hwc->idx == -1) + return -EBUSY; + + hwc->config_base = uncore->msr_base + (2 * hwc->idx); + hwc->event_base = uncore->msr_base + 1 + (2 * hwc->idx); + hwc->event_base_rdpmc = uncore->rdpmc_base + hwc->idx; + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + + if (flags & PERF_EF_START) + amd_uncore_start(event, PERF_EF_RELOAD); + + return 0; +} + +static void amd_uncore_del(struct perf_event *event, int flags) +{ + int i; + struct amd_uncore *uncore = event_to_amd_uncore(event); + struct hw_perf_event *hwc = &event->hw; + + amd_uncore_stop(event, PERF_EF_UPDATE); + + for (i = 0; i < uncore->num_counters; i++) { + if (cmpxchg(&uncore->events[i], event, NULL) == event) + break; + } + + hwc->idx = -1; +} + +static int amd_uncore_event_init(struct perf_event *event) +{ + struct amd_uncore *uncore; + struct hw_perf_event *hwc = &event->hw; + + if (event->attr.type != event->pmu->type) + return -ENOENT; + + /* + * NB and L2 counters (MSRs) are shared across all cores that share the + * same NB / L2 cache. Interrupts can be directed to a single target + * core, however, event counts generated by processes running on other + * cores cannot be masked out. So we do not support sampling and + * per-thread events. + */ + if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK) + return -EINVAL; + + /* NB and L2 counters do not have usr/os/guest/host bits */ + if (event->attr.exclude_user || event->attr.exclude_kernel || + event->attr.exclude_host || event->attr.exclude_guest) + return -EINVAL; + + /* and we do not enable counter overflow interrupts */ + hwc->config = event->attr.config & AMD64_RAW_EVENT_MASK_NB; + hwc->idx = -1; + + if (event->cpu < 0) + return -EINVAL; + + uncore = event_to_amd_uncore(event); + if (!uncore) + return -ENODEV; + + /* + * since request can come in to any of the shared cores, we will remap + * to a single common cpu. 
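+ *
+ * E.g. (illustrative core numbering): if cores 4-7 share one NB and
+ * core 4 is the current reader, an amd_nb event opened on core 5 is
+ * silently moved to core 4 here; the reader is also what the PMU's
+ * sysfs cpumask attribute reports.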
+ */ + event->cpu = uncore->cpu; + + return 0; +} + +static ssize_t amd_uncore_attr_show_cpumask(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + int n; + cpumask_t *active_mask; + struct pmu *pmu = dev_get_drvdata(dev); + + if (pmu->type == amd_nb_pmu.type) + active_mask = &amd_nb_active_mask; + else if (pmu->type == amd_l2_pmu.type) + active_mask = &amd_l2_active_mask; + else + return 0; + + n = cpulist_scnprintf(buf, PAGE_SIZE - 2, active_mask); + buf[n++] = '\n'; + buf[n] = '\0'; + return n; +} +static DEVICE_ATTR(cpumask, S_IRUGO, amd_uncore_attr_show_cpumask, NULL); + +static struct attribute *amd_uncore_attrs[] = { + &dev_attr_cpumask.attr, + NULL, +}; + +static struct attribute_group amd_uncore_attr_group = { + .attrs = amd_uncore_attrs, +}; + +PMU_FORMAT_ATTR(event, "config:0-7,32-35"); +PMU_FORMAT_ATTR(umask, "config:8-15"); + +static struct attribute *amd_uncore_format_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + NULL, +}; + +static struct attribute_group amd_uncore_format_group = { + .name = "format", + .attrs = amd_uncore_format_attr, +}; + +static const struct attribute_group *amd_uncore_attr_groups[] = { + &amd_uncore_attr_group, + &amd_uncore_format_group, + NULL, +}; + +static struct pmu amd_nb_pmu = { + .attr_groups = amd_uncore_attr_groups, + .name = "amd_nb", + .event_init = amd_uncore_event_init, + .add = amd_uncore_add, + .del = amd_uncore_del, + .start = amd_uncore_start, + .stop = amd_uncore_stop, + .read = amd_uncore_read, +}; + +static struct pmu amd_l2_pmu = { + .attr_groups = amd_uncore_attr_groups, + .name = "amd_l2", + .event_init = amd_uncore_event_init, + .add = amd_uncore_add, + .del = amd_uncore_del, + .start = amd_uncore_start, + .stop = amd_uncore_stop, + .read = amd_uncore_read, +}; + +static struct amd_uncore *amd_uncore_alloc(unsigned int cpu) +{ + return kzalloc_node(sizeof(struct amd_uncore), GFP_KERNEL, + cpu_to_node(cpu)); +} + +static void amd_uncore_cpu_up_prepare(unsigned int cpu) +{ + struct amd_uncore *uncore; + + if (amd_uncore_nb) { + uncore = amd_uncore_alloc(cpu); + uncore->cpu = cpu; + uncore->num_counters = NUM_COUNTERS_NB; + uncore->rdpmc_base = RDPMC_BASE_NB; + uncore->msr_base = MSR_F15H_NB_PERF_CTL; + uncore->active_mask = &amd_nb_active_mask; + uncore->pmu = &amd_nb_pmu; + *per_cpu_ptr(amd_uncore_nb, cpu) = uncore; + } + + if (amd_uncore_l2) { + uncore = amd_uncore_alloc(cpu); + uncore->cpu = cpu; + uncore->num_counters = NUM_COUNTERS_L2; + uncore->rdpmc_base = RDPMC_BASE_L2; + uncore->msr_base = MSR_F16H_L2I_PERF_CTL; + uncore->active_mask = &amd_l2_active_mask; + uncore->pmu = &amd_l2_pmu; + *per_cpu_ptr(amd_uncore_l2, cpu) = uncore; + } +} + +static struct amd_uncore * +amd_uncore_find_online_sibling(struct amd_uncore *this, + struct amd_uncore * __percpu *uncores) +{ + unsigned int cpu; + struct amd_uncore *that; + + for_each_online_cpu(cpu) { + that = *per_cpu_ptr(uncores, cpu); + + if (!that) + continue; + + if (this == that) + continue; + + if (this->id == that->id) { + that->free_when_cpu_online = this; + this = that; + break; + } + } + + this->refcnt++; + return this; +} + +static void amd_uncore_cpu_starting(unsigned int cpu) +{ + unsigned int eax, ebx, ecx, edx; + struct amd_uncore *uncore; + + if (amd_uncore_nb) { + uncore = *per_cpu_ptr(amd_uncore_nb, cpu); + cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); + uncore->id = ecx & 0xff; + + uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_nb); + *per_cpu_ptr(amd_uncore_nb, cpu) = uncore; + } + + if (amd_uncore_l2) { + 
unsigned int apicid = cpu_data(cpu).apicid;
+		unsigned int nshared;
+
+		uncore = *per_cpu_ptr(amd_uncore_l2, cpu);
+		cpuid_count(0x8000001d, 2, &eax, &ebx, &ecx, &edx);
+		nshared = ((eax >> 14) & 0xfff) + 1;
+		uncore->id = apicid - (apicid % nshared);
+
+		uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_l2);
+		*per_cpu_ptr(amd_uncore_l2, cpu) = uncore;
+	}
+}
+
+static void uncore_online(unsigned int cpu,
+			  struct amd_uncore * __percpu *uncores)
+{
+	struct amd_uncore *uncore = *per_cpu_ptr(uncores, cpu);
+
+	kfree(uncore->free_when_cpu_online);
+	uncore->free_when_cpu_online = NULL;
+
+	if (cpu == uncore->cpu)
+		cpumask_set_cpu(cpu, uncore->active_mask);
+}
+
+static void amd_uncore_cpu_online(unsigned int cpu)
+{
+	if (amd_uncore_nb)
+		uncore_online(cpu, amd_uncore_nb);
+
+	if (amd_uncore_l2)
+		uncore_online(cpu, amd_uncore_l2);
+}
+
+static void uncore_down_prepare(unsigned int cpu,
+				struct amd_uncore * __percpu *uncores)
+{
+	unsigned int i;
+	struct amd_uncore *this = *per_cpu_ptr(uncores, cpu);
+
+	if (this->cpu != cpu)
+		return;
+
+	/* this cpu is going down, migrate to a shared sibling if possible */
+	for_each_online_cpu(i) {
+		struct amd_uncore *that = *per_cpu_ptr(uncores, i);
+
+		if (cpu == i)
+			continue;
+
+		if (this == that) {
+			perf_pmu_migrate_context(this->pmu, cpu, i);
+			cpumask_clear_cpu(cpu, that->active_mask);
+			cpumask_set_cpu(i, that->active_mask);
+			that->cpu = i;
+			break;
+		}
+	}
+}
+
+static void amd_uncore_cpu_down_prepare(unsigned int cpu)
+{
+	if (amd_uncore_nb)
+		uncore_down_prepare(cpu, amd_uncore_nb);
+
+	if (amd_uncore_l2)
+		uncore_down_prepare(cpu, amd_uncore_l2);
+}
+
+static void uncore_dead(unsigned int cpu, struct amd_uncore * __percpu *uncores)
+{
+	struct amd_uncore *uncore = *per_cpu_ptr(uncores, cpu);
+
+	if (cpu == uncore->cpu)
+		cpumask_clear_cpu(cpu, uncore->active_mask);
+
+	if (!--uncore->refcnt)
+		kfree(uncore);
+	*per_cpu_ptr(uncores, cpu) = NULL;
+}
+
+static void amd_uncore_cpu_dead(unsigned int cpu)
+{
+	if (amd_uncore_nb)
+		uncore_dead(cpu, amd_uncore_nb);
+
+	if (amd_uncore_l2)
+		uncore_dead(cpu, amd_uncore_l2);
+}
+
+static int
+amd_uncore_cpu_notifier(struct notifier_block *self, unsigned long action,
+			void *hcpu)
+{
+	unsigned int cpu = (long)hcpu;
+
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_UP_PREPARE:
+		amd_uncore_cpu_up_prepare(cpu);
+		break;
+
+	case CPU_STARTING:
+		amd_uncore_cpu_starting(cpu);
+		break;
+
+	case CPU_ONLINE:
+		amd_uncore_cpu_online(cpu);
+		break;
+
+	case CPU_DOWN_PREPARE:
+		amd_uncore_cpu_down_prepare(cpu);
+		break;
+
+	case CPU_UP_CANCELED:
+	case CPU_DEAD:
+		amd_uncore_cpu_dead(cpu);
+		break;
+
+	default:
+		break;
+	}
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block amd_uncore_cpu_notifier_block = {
+	.notifier_call	= amd_uncore_cpu_notifier,
+	.priority	= CPU_PRI_PERF + 1,
+};
+
+static void __init init_cpu_already_online(void *dummy)
+{
+	unsigned int cpu = smp_processor_id();
+
+	amd_uncore_cpu_starting(cpu);
+	amd_uncore_cpu_online(cpu);
+}
+
+static int __init amd_uncore_init(void)
+{
+	unsigned int cpu;
+	int ret = -ENODEV;
+
+	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
+		return -ENODEV;
+
+	if (!cpu_has_topoext)
+		return -ENODEV;
+
+	if (cpu_has_perfctr_nb) {
+		amd_uncore_nb = alloc_percpu(struct amd_uncore *);
+		perf_pmu_register(&amd_nb_pmu, amd_nb_pmu.name, -1);
+
+		printk(KERN_INFO "perf: AMD NB counters detected\n");
+		ret = 0;
+	}
+
+	if (cpu_has_perfctr_l2) {
+		amd_uncore_l2 = alloc_percpu(struct amd_uncore *);
+		perf_pmu_register(&amd_l2_pmu,
amd_l2_pmu.name, -1); + + printk(KERN_INFO "perf: AMD L2I counters detected\n"); + ret = 0; + } + + if (ret) + return -ENODEV; + + cpu_notifier_register_begin(); + + /* init cpus already online before registering for hotplug notifier */ + for_each_online_cpu(cpu) { + amd_uncore_cpu_up_prepare(cpu); + smp_call_function_single(cpu, init_cpu_already_online, NULL, 1); + } + + __register_cpu_notifier(&amd_uncore_cpu_notifier_block); + cpu_notifier_register_done(); + + return 0; +} +device_initcall(amd_uncore_init); diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c new file mode 100644 index 00000000000..2502d0d9d24 --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -0,0 +1,2652 @@ +/* + * Per core/cpu state + * + * Used to coordinate shared registers between HT threads or + * among events on a single PMU. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/stddef.h> +#include <linux/types.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/export.h> + +#include <asm/cpufeature.h> +#include <asm/hardirq.h> +#include <asm/apic.h> + +#include "perf_event.h" + +/* + * Intel PerfMon, used on Core and later. + */ +static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly = +{ + [PERF_COUNT_HW_CPU_CYCLES] = 0x003c, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e, + [PERF_COUNT_HW_CACHE_MISSES] = 0x412e, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, + [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, + [PERF_COUNT_HW_REF_CPU_CYCLES] = 0x0300, /* pseudo-encoding */ +}; + +static struct event_constraint intel_core_event_constraints[] __read_mostly = +{ + INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ + INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ + INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ + INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ + INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */ + INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FP_COMP_INSTR_RET */ + EVENT_CONSTRAINT_END +}; + +static struct event_constraint intel_core2_event_constraints[] __read_mostly = +{ + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ + INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ + INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ + INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ + INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ + INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ + INTEL_EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */ + INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */ + INTEL_EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */ + INTEL_EVENT_CONSTRAINT(0xc9, 0x1), /* ITLB_MISS_RETIRED (T30-9) */ + INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */ + EVENT_CONSTRAINT_END +}; + +static struct event_constraint intel_nehalem_event_constraints[] __read_mostly = +{ + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ + INTEL_EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */ + INTEL_EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */ + INTEL_EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */ + INTEL_EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */ + INTEL_EVENT_CONSTRAINT(0x48, 0x3), /* L1D_PEND_MISS */ + INTEL_EVENT_CONSTRAINT(0x4e, 
0x3), /* L1D_PREFETCH */
+	INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
+	INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */
+	EVENT_CONSTRAINT_END
+};
+
+static struct extra_reg intel_nehalem_extra_regs[] __read_mostly =
+{
+	/* must define OFFCORE_RSP_X first, see intel_fixup_er() */
+	INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0),
+	INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x100b),
+	EVENT_EXTRA_END
+};
+
+static struct event_constraint intel_westmere_event_constraints[] __read_mostly =
+{
+	FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+	FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+	FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+	INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
+	INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */
+	INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */
+	INTEL_EVENT_CONSTRAINT(0xb3, 0x1), /* SNOOPQ_REQUEST_OUTSTANDING */
+	EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint intel_snb_event_constraints[] __read_mostly =
+{
+	FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+	FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+	FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+	INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_DISPATCH */
+	INTEL_UEVENT_CONSTRAINT(0x05a3, 0xf), /* CYCLE_ACTIVITY.STALLS_L2_PENDING */
+	INTEL_UEVENT_CONSTRAINT(0x02a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
+	INTEL_UEVENT_CONSTRAINT(0x06a3, 0x4), /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */
+	INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */
+	INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
+	INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
+	EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint intel_ivb_event_constraints[] __read_mostly =
+{
+	FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+	FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+	FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+	INTEL_UEVENT_CONSTRAINT(0x0148, 0x4), /* L1D_PEND_MISS.PENDING */
+	INTEL_UEVENT_CONSTRAINT(0x0279, 0xf), /* IDQ.EMPTY */
+	INTEL_UEVENT_CONSTRAINT(0x019c, 0xf), /* IDQ_UOPS_NOT_DELIVERED.CORE */
+	INTEL_UEVENT_CONSTRAINT(0x02a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_LDM_PENDING */
+	INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */
+	INTEL_UEVENT_CONSTRAINT(0x05a3, 0xf), /* CYCLE_ACTIVITY.STALLS_L2_PENDING */
+	INTEL_UEVENT_CONSTRAINT(0x06a3, 0xf), /* CYCLE_ACTIVITY.STALLS_LDM_PENDING */
+	INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
+	INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4), /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */
+	INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
+	/*
+	 * Errata BV98 -- MEM_*_RETIRED events can leak between counters of SMT
+	 * siblings; disable these events because they can corrupt unrelated
+	 * counters.
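+	 * An empty counter mask (0x0) below means the constraint can never
+	 * be satisfied, so these events are effectively disabled.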
+ */ + INTEL_EVENT_CONSTRAINT(0xd0, 0x0), /* MEM_UOPS_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xd1, 0x0), /* MEM_LOAD_UOPS_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xd2, 0x0), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xd3, 0x0), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */ + EVENT_CONSTRAINT_END +}; + +static struct extra_reg intel_westmere_extra_regs[] __read_mostly = +{ + /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0xffff, RSP_1), + INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x100b), + EVENT_EXTRA_END +}; + +static struct event_constraint intel_v1_event_constraints[] __read_mostly = +{ + EVENT_CONSTRAINT_END +}; + +static struct event_constraint intel_gen_event_constraints[] __read_mostly = +{ + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ + EVENT_CONSTRAINT_END +}; + +static struct event_constraint intel_slm_event_constraints[] __read_mostly = +{ + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* pseudo CPU_CLK_UNHALTED.REF */ + EVENT_CONSTRAINT_END +}; + +static struct extra_reg intel_snb_extra_regs[] __read_mostly = { + /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3f807f8fffull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3f807f8fffull, RSP_1), + INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), + EVENT_EXTRA_END +}; + +static struct extra_reg intel_snbep_extra_regs[] __read_mostly = { + /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1), + INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), + EVENT_EXTRA_END +}; + +EVENT_ATTR_STR(mem-loads, mem_ld_nhm, "event=0x0b,umask=0x10,ldlat=3"); +EVENT_ATTR_STR(mem-loads, mem_ld_snb, "event=0xcd,umask=0x1,ldlat=3"); +EVENT_ATTR_STR(mem-stores, mem_st_snb, "event=0xcd,umask=0x2"); + +struct attribute *nhm_events_attrs[] = { + EVENT_PTR(mem_ld_nhm), + NULL, +}; + +struct attribute *snb_events_attrs[] = { + EVENT_PTR(mem_ld_snb), + EVENT_PTR(mem_st_snb), + NULL, +}; + +static struct event_constraint intel_hsw_event_constraints[] = { + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ + INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.* */ + INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */ + INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */ + /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */ + INTEL_EVENT_CONSTRAINT(0x08a3, 0x4), + /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */ + INTEL_EVENT_CONSTRAINT(0x0ca3, 0x4), + /* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */ + INTEL_EVENT_CONSTRAINT(0x04a3, 0xf), + EVENT_CONSTRAINT_END +}; + +static u64 intel_pmu_event_map(int hw_event) +{ + return intel_perfmon_event_map[hw_event]; +} + +#define SNB_DMND_DATA_RD (1ULL << 0) +#define SNB_DMND_RFO (1ULL << 1) +#define SNB_DMND_IFETCH (1ULL << 2) +#define SNB_DMND_WB (1ULL << 3) +#define SNB_PF_DATA_RD (1ULL << 4) +#define SNB_PF_RFO (1ULL << 5) +#define SNB_PF_IFETCH (1ULL << 6) +#define 
SNB_LLC_DATA_RD (1ULL << 7) +#define SNB_LLC_RFO (1ULL << 8) +#define SNB_LLC_IFETCH (1ULL << 9) +#define SNB_BUS_LOCKS (1ULL << 10) +#define SNB_STRM_ST (1ULL << 11) +#define SNB_OTHER (1ULL << 15) +#define SNB_RESP_ANY (1ULL << 16) +#define SNB_NO_SUPP (1ULL << 17) +#define SNB_LLC_HITM (1ULL << 18) +#define SNB_LLC_HITE (1ULL << 19) +#define SNB_LLC_HITS (1ULL << 20) +#define SNB_LLC_HITF (1ULL << 21) +#define SNB_LOCAL (1ULL << 22) +#define SNB_REMOTE (0xffULL << 23) +#define SNB_SNP_NONE (1ULL << 31) +#define SNB_SNP_NOT_NEEDED (1ULL << 32) +#define SNB_SNP_MISS (1ULL << 33) +#define SNB_NO_FWD (1ULL << 34) +#define SNB_SNP_FWD (1ULL << 35) +#define SNB_HITM (1ULL << 36) +#define SNB_NON_DRAM (1ULL << 37) + +#define SNB_DMND_READ (SNB_DMND_DATA_RD|SNB_LLC_DATA_RD) +#define SNB_DMND_WRITE (SNB_DMND_RFO|SNB_LLC_RFO) +#define SNB_DMND_PREFETCH (SNB_PF_DATA_RD|SNB_PF_RFO) + +#define SNB_SNP_ANY (SNB_SNP_NONE|SNB_SNP_NOT_NEEDED| \ + SNB_SNP_MISS|SNB_NO_FWD|SNB_SNP_FWD| \ + SNB_HITM) + +#define SNB_DRAM_ANY (SNB_LOCAL|SNB_REMOTE|SNB_SNP_ANY) +#define SNB_DRAM_REMOTE (SNB_REMOTE|SNB_SNP_ANY) + +#define SNB_L3_ACCESS SNB_RESP_ANY +#define SNB_L3_MISS (SNB_DRAM_ANY|SNB_NON_DRAM) + +static __initconst const u64 snb_hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = SNB_DMND_READ|SNB_L3_ACCESS, + [ C(RESULT_MISS) ] = SNB_DMND_READ|SNB_L3_MISS, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = SNB_DMND_WRITE|SNB_L3_ACCESS, + [ C(RESULT_MISS) ] = SNB_DMND_WRITE|SNB_L3_MISS, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = SNB_DMND_PREFETCH|SNB_L3_ACCESS, + [ C(RESULT_MISS) ] = SNB_DMND_PREFETCH|SNB_L3_MISS, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = SNB_DMND_READ|SNB_DRAM_ANY, + [ C(RESULT_MISS) ] = SNB_DMND_READ|SNB_DRAM_REMOTE, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = SNB_DMND_WRITE|SNB_DRAM_ANY, + [ C(RESULT_MISS) ] = SNB_DMND_WRITE|SNB_DRAM_REMOTE, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = SNB_DMND_PREFETCH|SNB_DRAM_ANY, + [ C(RESULT_MISS) ] = SNB_DMND_PREFETCH|SNB_DRAM_REMOTE, + }, + }, +}; + +static __initconst const u64 snb_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0xf1d0, /* MEM_UOP_RETIRED.LOADS */ + [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPLACEMENT */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0xf2d0, /* MEM_UOP_RETIRED.STORES */ + [ C(RESULT_MISS) ] = 0x0851, /* L1D.ALL_M_REPLACEMENT */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x024e, /* HW_PRE_REQ.DL1_MISS */ + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0280, /* ICACHE.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_WRITE) ] = { + /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_PREFETCH) ] = { + /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */ + 
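+		/*
+		 * 0x01b7 selects OFFCORE_RESPONSE_0; the request/response
+		 * type named above is programmed into the extra MSR, see
+		 * snb_hw_cache_extra_regs.
+		 */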
[ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOP_RETIRED.ALL_LOADS */ + [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.CAUSES_A_WALK */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOP_RETIRED.ALL_STORES */ + [ C(RESULT_MISS) ] = 0x0149, /* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x1085, /* ITLB_MISSES.STLB_HIT */ + [ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISSES.CAUSES_A_WALK */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ + [ C(RESULT_MISS) ] = 0x00c5, /* BR_MISP_RETIRED.ALL_BRANCHES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + }, + +}; + +static __initconst const u64 westmere_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */ + [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPL */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */ + [ C(RESULT_MISS) ] = 0x0251, /* L1D.M_REPL */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */ + [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */ + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + /* + * Use RFO, not WRITEBACK, because a write miss would typically occur + * on RFO. 
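+	 * A store that misses first issues an RFO to gain ownership, so
+	 * RFO counts are the closer proxy for write accesses and misses
+	 * at this level.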
+ */ + [ C(OP_WRITE) ] = { + /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_PREFETCH) ] = { + /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */ + [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */ + [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISSES.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ + [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + }, +}; + +/* + * Nehalem/Westmere MSR_OFFCORE_RESPONSE bits; + * See IA32 SDM Vol 3B 30.6.1.3 + */ + +#define NHM_DMND_DATA_RD (1 << 0) +#define NHM_DMND_RFO (1 << 1) +#define NHM_DMND_IFETCH (1 << 2) +#define NHM_DMND_WB (1 << 3) +#define NHM_PF_DATA_RD (1 << 4) +#define NHM_PF_DATA_RFO (1 << 5) +#define NHM_PF_IFETCH (1 << 6) +#define NHM_OFFCORE_OTHER (1 << 7) +#define NHM_UNCORE_HIT (1 << 8) +#define NHM_OTHER_CORE_HIT_SNP (1 << 9) +#define NHM_OTHER_CORE_HITM (1 << 10) + /* reserved */ +#define NHM_REMOTE_CACHE_FWD (1 << 12) +#define NHM_REMOTE_DRAM (1 << 13) +#define NHM_LOCAL_DRAM (1 << 14) +#define NHM_NON_DRAM (1 << 15) + +#define NHM_LOCAL (NHM_LOCAL_DRAM|NHM_REMOTE_CACHE_FWD) +#define NHM_REMOTE (NHM_REMOTE_DRAM) + +#define NHM_DMND_READ (NHM_DMND_DATA_RD) +#define NHM_DMND_WRITE (NHM_DMND_RFO|NHM_DMND_WB) +#define NHM_DMND_PREFETCH (NHM_PF_DATA_RD|NHM_PF_DATA_RFO) + +#define NHM_L3_HIT (NHM_UNCORE_HIT|NHM_OTHER_CORE_HIT_SNP|NHM_OTHER_CORE_HITM) +#define NHM_L3_MISS (NHM_NON_DRAM|NHM_LOCAL_DRAM|NHM_REMOTE_DRAM|NHM_REMOTE_CACHE_FWD) +#define NHM_L3_ACCESS (NHM_L3_HIT|NHM_L3_MISS) + +static __initconst const u64 nehalem_hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_L3_ACCESS, + [ C(RESULT_MISS) ] = NHM_DMND_READ|NHM_L3_MISS, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_L3_ACCESS, + [ C(RESULT_MISS) ] = NHM_DMND_WRITE|NHM_L3_MISS, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_L3_ACCESS, + [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_L3_MISS, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_LOCAL|NHM_REMOTE, + [ 
C(RESULT_MISS) ] = NHM_DMND_READ|NHM_REMOTE, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_LOCAL|NHM_REMOTE, + [ C(RESULT_MISS) ] = NHM_DMND_WRITE|NHM_REMOTE, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_LOCAL|NHM_REMOTE, + [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_REMOTE, + }, + }, +}; + +static __initconst const u64 nehalem_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */ + [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPL */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */ + [ C(RESULT_MISS) ] = 0x0251, /* L1D.M_REPL */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */ + [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */ + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + /* + * Use RFO, not WRITEBACK, because a write miss would typically occur + * on RFO. + */ + [ C(OP_WRITE) ] = { + /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_PREFETCH) ] = { + /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ + [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + }, +}; + +static __initconst const u64 core2_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* 
L1D_CACHE_LD.MESI */ + [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ + [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */ + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ + [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */ + [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ + [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +static __initconst const u64 atom_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ + [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */ + [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */ + }, + [ 
C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ + [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +static struct extra_reg intel_slm_extra_regs[] __read_mostly = +{ + /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x768005ffffull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x768005ffffull, RSP_1), + EVENT_EXTRA_END +}; + +#define SLM_DMND_READ SNB_DMND_DATA_RD +#define SLM_DMND_WRITE SNB_DMND_RFO +#define SLM_DMND_PREFETCH (SNB_PF_DATA_RD|SNB_PF_RFO) + +#define SLM_SNP_ANY (SNB_SNP_NONE|SNB_SNP_MISS|SNB_NO_FWD|SNB_HITM) +#define SLM_LLC_ACCESS SNB_RESP_ANY +#define SLM_LLC_MISS (SLM_SNP_ANY|SNB_NON_DRAM) + +static __initconst const u64 slm_hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = SLM_DMND_READ|SLM_LLC_ACCESS, + [ C(RESULT_MISS) ] = SLM_DMND_READ|SLM_LLC_MISS, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = SLM_DMND_WRITE|SLM_LLC_ACCESS, + [ C(RESULT_MISS) ] = SLM_DMND_WRITE|SLM_LLC_MISS, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = SLM_DMND_PREFETCH|SLM_LLC_ACCESS, + [ C(RESULT_MISS) ] = SLM_DMND_PREFETCH|SLM_LLC_MISS, + }, + }, +}; + +static __initconst const u64 slm_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0x0104, /* LD_DCU_MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0380, /* ICACHE.ACCESSES */ + [ C(RESULT_MISS) ] = 0x0280, /* ICACGE.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_WRITE) ] = { + /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_PREFETCH) ] = { + /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0x0804, /* LD_DTLB_MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ 
C(RESULT_MISS) ] = 0,
+		},
+	},
+	[ C(ITLB) ] = {
+		[ C(OP_READ) ] = {
+			[ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
+			[ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */
+		},
+		[ C(OP_WRITE) ] = {
+			[ C(RESULT_ACCESS) ] = -1,
+			[ C(RESULT_MISS) ] = -1,
+		},
+		[ C(OP_PREFETCH) ] = {
+			[ C(RESULT_ACCESS) ] = -1,
+			[ C(RESULT_MISS) ] = -1,
+		},
+	},
+	[ C(BPU ) ] = {
+		[ C(OP_READ) ] = {
+			[ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
+			[ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
+		},
+		[ C(OP_WRITE) ] = {
+			[ C(RESULT_ACCESS) ] = -1,
+			[ C(RESULT_MISS) ] = -1,
+		},
+		[ C(OP_PREFETCH) ] = {
+			[ C(RESULT_ACCESS) ] = -1,
+			[ C(RESULT_MISS) ] = -1,
+		},
+	},
+};
+
+static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event)
+{
+	/* user explicitly requested branch sampling */
+	if (has_branch_stack(event))
+		return true;
+
+	/* implicit branch sampling to correct PEBS skid */
+	if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1 &&
+	    x86_pmu.intel_cap.pebs_format < 2)
+		return true;
+
+	return false;
+}
+
+static void intel_pmu_disable_all(void)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
+
+	if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask))
+		intel_pmu_disable_bts();
+
+	intel_pmu_pebs_disable_all();
+	intel_pmu_lbr_disable_all();
+}
+
+static void intel_pmu_enable_all(int added)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+	intel_pmu_pebs_enable_all();
+	intel_pmu_lbr_enable_all();
+	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL,
+			x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask);
+
+	if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
+		struct perf_event *event =
+			cpuc->events[INTEL_PMC_IDX_FIXED_BTS];
+
+		if (WARN_ON_ONCE(!event))
+			return;
+
+		intel_pmu_enable_bts(event->hw.config);
+	}
+}
+
+/*
+ * Workaround for:
+ *   Intel Errata AAK100 (model 26)
+ *   Intel Errata AAP53  (model 30)
+ *   Intel Errata BD53   (model 44)
+ *
+ * The official story:
+ *   These chips need to be 'reset' when adding counters by programming the
+ *   magic three (non-counting) events 0x4300B5, 0x4300D2, and 0x4300B1 either
+ *   in sequence on the same PMC or on different PMCs.
+ *
+ * In practice it appears some of these events do in fact count, and
+ * we need to program all 4 events.
+ */
+static void intel_pmu_nhm_workaround(void)
+{
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	static const unsigned long nhm_magic[4] = {
+		0x4300B5,
+		0x4300D2,
+		0x4300B1,
+		0x4300B1
+	};
+	struct perf_event *event;
+	int i;
+
+	/*
+	 * The errata requires the following steps:
+	 * 1) Clear MSR_IA32_PEBS_ENABLE and MSR_CORE_PERF_GLOBAL_CTRL;
+	 * 2) Configure 4 PERFEVTSELx with the magic events and clear
+	 *    the corresponding PMCx;
+	 * 3) Set bit0~bit3 of MSR_CORE_PERF_GLOBAL_CTRL;
+	 * 4) Clear MSR_CORE_PERF_GLOBAL_CTRL;
+	 * 5) Clear 4 pairs of PERFEVTSELx and PMCx;
+	 */
+
+	/*
+	 * The real steps we choose are a little different from above.
+	 * A) To reduce MSR operations, we don't run step 1) as they
+	 *    are already cleared before this function is called;
+	 * B) Call x86_perf_event_update to save PMCx before configuring
+	 *    PERFEVTSELx with magic number;
+	 * C) With step 5), we do clear only when the PERFEVTSELx is
+	 *    not used currently.
+	 * D) Call x86_perf_event_set_period to restore PMCx;
+	 */
+
+	/* We always operate on 4 pairs of PERF counters */
+	for (i = 0; i < 4; i++) {
+		event = cpuc->events[i];
+		if (event)
+			x86_perf_event_update(event);
+	}
+
+	for (i = 0; i < 4; i++) {
+		wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, nhm_magic[i]);
+		wrmsrl(MSR_ARCH_PERFMON_PERFCTR0 + i, 0x0);
+	}
+
+	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0xf);
+	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x0);
+
+	for (i = 0; i < 4; i++) {
+		event = cpuc->events[i];
+
+		if (event) {
+			x86_perf_event_set_period(event);
+			__x86_pmu_enable_event(&event->hw,
+					ARCH_PERFMON_EVENTSEL_ENABLE);
+		} else
+			wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, 0x0);
+	}
+}
+
+static void intel_pmu_nhm_enable_all(int added)
+{
+	if (added)
+		intel_pmu_nhm_workaround();
+	intel_pmu_enable_all(added);
+}
+
+static inline u64 intel_pmu_get_status(void)
+{
+	u64 status;
+
+	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
+
+	return status;
+}
+
+static inline void intel_pmu_ack_status(u64 ack)
+{
+	wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
+}
+
+static void intel_pmu_disable_fixed(struct hw_perf_event *hwc)
+{
+	int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
+	u64 ctrl_val, mask;
+
+	mask = 0xfULL << (idx * 4);
+
+	rdmsrl(hwc->config_base, ctrl_val);
+	ctrl_val &= ~mask;
+	wrmsrl(hwc->config_base, ctrl_val);
+}
+
+static inline bool event_is_checkpointed(struct perf_event *event)
+{
+	return (event->hw.config & HSW_IN_TX_CHECKPOINTED) != 0;
+}
+
+static void intel_pmu_disable_event(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+	if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) {
+		intel_pmu_disable_bts();
+		intel_pmu_drain_bts_buffer();
+		return;
+	}
+
+	cpuc->intel_ctrl_guest_mask &= ~(1ull << hwc->idx);
+	cpuc->intel_ctrl_host_mask &= ~(1ull << hwc->idx);
+	cpuc->intel_cp_status &= ~(1ull << hwc->idx);
+
+	/*
+	 * LBR must be disabled before the actual event,
+	 * because any event may be combined with LBR.
+	 */
+	if (intel_pmu_needs_lbr_smpl(event))
+		intel_pmu_lbr_disable(event);
+
+	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
+		intel_pmu_disable_fixed(hwc);
+		return;
+	}
+
+	x86_pmu_disable_event(event);
+
+	if (unlikely(event->attr.precise_ip))
+		intel_pmu_pebs_disable(event);
+}
+
+static void intel_pmu_enable_fixed(struct hw_perf_event *hwc)
+{
+	int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
+	u64 ctrl_val, bits, mask;
+
+	/*
+	 * Enable IRQ generation (0x8),
+	 * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
+	 * if requested:
+	 */
+	bits = 0x8ULL;
+	if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
+		bits |= 0x2;
+	if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
+		bits |= 0x1;
+
+	/*
+	 * ANY bit is supported in v3 and up
+	 */
+	if (x86_pmu.version > 2 && hwc->config & ARCH_PERFMON_EVENTSEL_ANY)
+		bits |= 0x4;
+
+	bits <<= (idx * 4);
+	mask = 0xfULL << (idx * 4);
+
+	rdmsrl(hwc->config_base, ctrl_val);
+	ctrl_val &= ~mask;
+	ctrl_val |= bits;
+	wrmsrl(hwc->config_base, ctrl_val);
+}
+
+static void intel_pmu_enable_event(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+	if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) {
+		if (!__this_cpu_read(cpu_hw_events.enabled))
+			return;
+
+		intel_pmu_enable_bts(hwc->config);
+		return;
+	}
+	/*
+	 * LBR must be enabled before the actual event,
+	 * because any event may be combined with LBR.
+	 */
+	if (intel_pmu_needs_lbr_smpl(event))
+		intel_pmu_lbr_enable(event);
+
+	if
(event->attr.exclude_host) + cpuc->intel_ctrl_guest_mask |= (1ull << hwc->idx); + if (event->attr.exclude_guest) + cpuc->intel_ctrl_host_mask |= (1ull << hwc->idx); + + if (unlikely(event_is_checkpointed(event))) + cpuc->intel_cp_status |= (1ull << hwc->idx); + + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { + intel_pmu_enable_fixed(hwc); + return; + } + + if (unlikely(event->attr.precise_ip)) + intel_pmu_pebs_enable(event); + + __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); +} + +/* + * Save and restart an expired event. Called by NMI contexts, + * so it has to be careful about preempting normal event ops: + */ +int intel_pmu_save_and_restart(struct perf_event *event) +{ + x86_perf_event_update(event); + /* + * For a checkpointed counter always reset back to 0. This + * avoids a situation where the counter overflows, aborts the + * transaction and is then set back to shortly before the + * overflow, and overflows and aborts again. + */ + if (unlikely(event_is_checkpointed(event))) { + /* No race with NMIs because the counter should not be armed */ + wrmsrl(event->hw.event_base, 0); + local64_set(&event->hw.prev_count, 0); + } + return x86_perf_event_set_period(event); +} + +static void intel_pmu_reset(void) +{ + struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds); + unsigned long flags; + int idx; + + if (!x86_pmu.num_counters) + return; + + local_irq_save(flags); + + pr_info("clearing PMU state on CPU#%d\n", smp_processor_id()); + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + wrmsrl_safe(x86_pmu_config_addr(idx), 0ull); + wrmsrl_safe(x86_pmu_event_addr(idx), 0ull); + } + for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) + wrmsrl_safe(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); + + if (ds) + ds->bts_index = ds->bts_buffer_base; + + local_irq_restore(flags); +} + +/* + * This handler is triggered by the local APIC, so the APIC IRQ handling + * rules apply: + */ +static int intel_pmu_handle_irq(struct pt_regs *regs) +{ + struct perf_sample_data data; + struct cpu_hw_events *cpuc; + int bit, loops; + u64 status; + int handled; + + cpuc = &__get_cpu_var(cpu_hw_events); + + /* + * No known reason to not always do late ACK, + * but just in case do it opt-in. + */ + if (!x86_pmu.late_ack) + apic_write(APIC_LVTPC, APIC_DM_NMI); + intel_pmu_disable_all(); + handled = intel_pmu_drain_bts_buffer(); + status = intel_pmu_get_status(); + if (!status) + goto done; + + loops = 0; +again: + intel_pmu_ack_status(status); + if (++loops > 100) { + static bool warned = false; + if (!warned) { + WARN(1, "perfevents: irq loop stuck!\n"); + perf_event_print_debug(); + warned = true; + } + intel_pmu_reset(); + goto done; + } + + inc_irq_stat(apic_perf_irqs); + + intel_pmu_lbr_read(); + + /* + * CondChgd bit 63 doesn't mean any overflow status. Ignore + * and clear the bit. + */ + if (__test_and_clear_bit(63, (unsigned long *)&status)) { + if (!status) + goto done; + } + + /* + * PEBS overflow sets bit 62 in the global status register + */ + if (__test_and_clear_bit(62, (unsigned long *)&status)) { + handled++; + x86_pmu.drain_pebs(regs); + } + + /* + * Checkpointed counters can lead to 'spurious' PMIs because the + * rollback caused by the PMI will have cleared the overflow status + * bit. Therefore always force probe these counters. 
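+	 * ("force probe" here means: run the overflow handler for these
+	 * counters below even though their global status bit is clear)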
+ */ + status |= cpuc->intel_cp_status; + + for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { + struct perf_event *event = cpuc->events[bit]; + + handled++; + + if (!test_bit(bit, cpuc->active_mask)) + continue; + + if (!intel_pmu_save_and_restart(event)) + continue; + + perf_sample_data_init(&data, 0, event->hw.last_period); + + if (has_branch_stack(event)) + data.br_stack = &cpuc->lbr_stack; + + if (perf_event_overflow(event, &data, regs)) + x86_pmu_stop(event, 0); + } + + /* + * Repeat if there is more work to be done: + */ + status = intel_pmu_get_status(); + if (status) + goto again; + +done: + intel_pmu_enable_all(0); + /* + * Only unmask the NMI after the overflow counters + * have been reset. This avoids spurious NMIs on + * Haswell CPUs. + */ + if (x86_pmu.late_ack) + apic_write(APIC_LVTPC, APIC_DM_NMI); + return handled; +} + +static struct event_constraint * +intel_bts_constraints(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + unsigned int hw_event, bts_event; + + if (event->attr.freq) + return NULL; + + hw_event = hwc->config & INTEL_ARCH_EVENT_MASK; + bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS); + + if (unlikely(hw_event == bts_event && hwc->sample_period == 1)) + return &bts_constraint; + + return NULL; +} + +static int intel_alt_er(int idx) +{ + if (!(x86_pmu.er_flags & ERF_HAS_RSP_1)) + return idx; + + if (idx == EXTRA_REG_RSP_0) + return EXTRA_REG_RSP_1; + + if (idx == EXTRA_REG_RSP_1) + return EXTRA_REG_RSP_0; + + return idx; +} + +static void intel_fixup_er(struct perf_event *event, int idx) +{ + event->hw.extra_reg.idx = idx; + + if (idx == EXTRA_REG_RSP_0) { + event->hw.config &= ~INTEL_ARCH_EVENT_MASK; + event->hw.config |= x86_pmu.extra_regs[EXTRA_REG_RSP_0].event; + event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0; + } else if (idx == EXTRA_REG_RSP_1) { + event->hw.config &= ~INTEL_ARCH_EVENT_MASK; + event->hw.config |= x86_pmu.extra_regs[EXTRA_REG_RSP_1].event; + event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1; + } +} + +/* + * manage allocation of shared extra msr for certain events + * + * sharing can be: + * per-cpu: to be shared between the various events on a single PMU + * per-core: per-cpu + shared by HT threads + */ +static struct event_constraint * +__intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc, + struct perf_event *event, + struct hw_perf_event_extra *reg) +{ + struct event_constraint *c = &emptyconstraint; + struct er_account *era; + unsigned long flags; + int idx = reg->idx; + + /* + * reg->alloc can be set due to existing state, so for fake cpuc we + * need to ignore this, otherwise we might fail to allocate proper fake + * state for this extra reg constraint. Also see the comment below. + */ + if (reg->alloc && !cpuc->is_fake) + return NULL; /* call x86_get_event_constraint() */ + +again: + era = &cpuc->shared_regs->regs[idx]; + /* + * we use spin_lock_irqsave() to avoid lockdep issues when + * passing a fake cpuc + */ + raw_spin_lock_irqsave(&era->lock, flags); + + if (!atomic_read(&era->ref) || era->config == reg->config) { + + /* + * If its a fake cpuc -- as per validate_{group,event}() we + * shouldn't touch event state and we can avoid doing so + * since both will only call get_event_constraints() once + * on each event, this avoids the need for reg->alloc. + * + * Not doing the ER fixup will only result in era->reg being + * wrong, but since we won't actually try and program hardware + * this isn't a problem either. 
+ */ + if (!cpuc->is_fake) { + if (idx != reg->idx) + intel_fixup_er(event, idx); + + /* + * x86_schedule_events() can call get_event_constraints() + * multiple times on events in the case of incremental + * scheduling(). reg->alloc ensures we only do the ER + * allocation once. + */ + reg->alloc = 1; + } + + /* lock in msr value */ + era->config = reg->config; + era->reg = reg->reg; + + /* one more user */ + atomic_inc(&era->ref); + + /* + * need to call x86_get_event_constraint() + * to check if associated event has constraints + */ + c = NULL; + } else { + idx = intel_alt_er(idx); + if (idx != reg->idx) { + raw_spin_unlock_irqrestore(&era->lock, flags); + goto again; + } + } + raw_spin_unlock_irqrestore(&era->lock, flags); + + return c; +} + +static void +__intel_shared_reg_put_constraints(struct cpu_hw_events *cpuc, + struct hw_perf_event_extra *reg) +{ + struct er_account *era; + + /* + * Only put constraint if extra reg was actually allocated. Also takes + * care of event which do not use an extra shared reg. + * + * Also, if this is a fake cpuc we shouldn't touch any event state + * (reg->alloc) and we don't care about leaving inconsistent cpuc state + * either since it'll be thrown out. + */ + if (!reg->alloc || cpuc->is_fake) + return; + + era = &cpuc->shared_regs->regs[reg->idx]; + + /* one fewer user */ + atomic_dec(&era->ref); + + /* allocate again next time */ + reg->alloc = 0; +} + +static struct event_constraint * +intel_shared_regs_constraints(struct cpu_hw_events *cpuc, + struct perf_event *event) +{ + struct event_constraint *c = NULL, *d; + struct hw_perf_event_extra *xreg, *breg; + + xreg = &event->hw.extra_reg; + if (xreg->idx != EXTRA_REG_NONE) { + c = __intel_shared_reg_get_constraints(cpuc, event, xreg); + if (c == &emptyconstraint) + return c; + } + breg = &event->hw.branch_reg; + if (breg->idx != EXTRA_REG_NONE) { + d = __intel_shared_reg_get_constraints(cpuc, event, breg); + if (d == &emptyconstraint) { + __intel_shared_reg_put_constraints(cpuc, xreg); + c = d; + } + } + return c; +} + +struct event_constraint * +x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) +{ + struct event_constraint *c; + + if (x86_pmu.event_constraints) { + for_each_event_constraint(c, x86_pmu.event_constraints) { + if ((event->hw.config & c->cmask) == c->code) { + event->hw.flags |= c->flags; + return c; + } + } + } + + return &unconstrained; +} + +static struct event_constraint * +intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) +{ + struct event_constraint *c; + + c = intel_bts_constraints(event); + if (c) + return c; + + c = intel_pebs_constraints(event); + if (c) + return c; + + c = intel_shared_regs_constraints(cpuc, event); + if (c) + return c; + + return x86_get_event_constraints(cpuc, event); +} + +static void +intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc, + struct perf_event *event) +{ + struct hw_perf_event_extra *reg; + + reg = &event->hw.extra_reg; + if (reg->idx != EXTRA_REG_NONE) + __intel_shared_reg_put_constraints(cpuc, reg); + + reg = &event->hw.branch_reg; + if (reg->idx != EXTRA_REG_NONE) + __intel_shared_reg_put_constraints(cpuc, reg); +} + +static void intel_put_event_constraints(struct cpu_hw_events *cpuc, + struct perf_event *event) +{ + intel_put_shared_regs_event_constraints(cpuc, event); +} + +static void intel_pebs_aliases_core2(struct perf_event *event) +{ + if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) { + /* + * Use an alternative encoding for 
CPU_CLK_UNHALTED.THREAD_P + * (0x003c) so that we can use it with PEBS. + * + * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't + * PEBS capable. However we can use INST_RETIRED.ANY_P + * (0x00c0), which is a PEBS capable event, to get the same + * count. + * + * INST_RETIRED.ANY_P counts the number of cycles that retires + * CNTMASK instructions. By setting CNTMASK to a value (16) + * larger than the maximum number of instructions that can be + * retired per cycle (4) and then inverting the condition, we + * count all cycles that retire 16 or less instructions, which + * is every cycle. + * + * Thereby we gain a PEBS capable cycle counter. + */ + u64 alt_config = X86_CONFIG(.event=0xc0, .inv=1, .cmask=16); + + alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK); + event->hw.config = alt_config; + } +} + +static void intel_pebs_aliases_snb(struct perf_event *event) +{ + if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) { + /* + * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P + * (0x003c) so that we can use it with PEBS. + * + * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't + * PEBS capable. However we can use UOPS_RETIRED.ALL + * (0x01c2), which is a PEBS capable event, to get the same + * count. + * + * UOPS_RETIRED.ALL counts the number of cycles that retires + * CNTMASK micro-ops. By setting CNTMASK to a value (16) + * larger than the maximum number of micro-ops that can be + * retired per cycle (4) and then inverting the condition, we + * count all cycles that retire 16 or less micro-ops, which + * is every cycle. + * + * Thereby we gain a PEBS capable cycle counter. + */ + u64 alt_config = X86_CONFIG(.event=0xc2, .umask=0x01, .inv=1, .cmask=16); + + alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK); + event->hw.config = alt_config; + } +} + +static int intel_pmu_hw_config(struct perf_event *event) +{ + int ret = x86_pmu_hw_config(event); + + if (ret) + return ret; + + if (event->attr.precise_ip && x86_pmu.pebs_aliases) + x86_pmu.pebs_aliases(event); + + if (intel_pmu_needs_lbr_smpl(event)) { + ret = intel_pmu_setup_lbr_filter(event); + if (ret) + return ret; + } + + if (event->attr.type != PERF_TYPE_RAW) + return 0; + + if (!(event->attr.config & ARCH_PERFMON_EVENTSEL_ANY)) + return 0; + + if (x86_pmu.version < 3) + return -EINVAL; + + if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) + return -EACCES; + + event->hw.config |= ARCH_PERFMON_EVENTSEL_ANY; + + return 0; +} + +struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr) +{ + if (x86_pmu.guest_get_msrs) + return x86_pmu.guest_get_msrs(nr); + *nr = 0; + return NULL; +} +EXPORT_SYMBOL_GPL(perf_guest_get_msrs); + +static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs; + + arr[0].msr = MSR_CORE_PERF_GLOBAL_CTRL; + arr[0].host = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask; + arr[0].guest = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_host_mask; + /* + * If PMU counter has PEBS enabled it is not enough to disable counter + * on a guest entry since PEBS memory write can overshoot guest entry + * and corrupt guest memory. Disabling PEBS solves the problem. 
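+	 * The guest thus runs with PEBS fully disabled (guest value 0),
+	 * and the host's MSR_IA32_PEBS_ENABLE contents are restored on
+	 * the way back out; see arr[1] below.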
+ */ + arr[1].msr = MSR_IA32_PEBS_ENABLE; + arr[1].host = cpuc->pebs_enabled; + arr[1].guest = 0; + + *nr = 2; + return arr; +} + +static struct perf_guest_switch_msr *core_guest_get_msrs(int *nr) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs; + int idx; + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + struct perf_event *event = cpuc->events[idx]; + + arr[idx].msr = x86_pmu_config_addr(idx); + arr[idx].host = arr[idx].guest = 0; + + if (!test_bit(idx, cpuc->active_mask)) + continue; + + arr[idx].host = arr[idx].guest = + event->hw.config | ARCH_PERFMON_EVENTSEL_ENABLE; + + if (event->attr.exclude_host) + arr[idx].host &= ~ARCH_PERFMON_EVENTSEL_ENABLE; + else if (event->attr.exclude_guest) + arr[idx].guest &= ~ARCH_PERFMON_EVENTSEL_ENABLE; + } + + *nr = x86_pmu.num_counters; + return arr; +} + +static void core_pmu_enable_event(struct perf_event *event) +{ + if (!event->attr.exclude_host) + x86_pmu_enable_event(event); +} + +static void core_pmu_enable_all(int added) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + int idx; + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + struct hw_perf_event *hwc = &cpuc->events[idx]->hw; + + if (!test_bit(idx, cpuc->active_mask) || + cpuc->events[idx]->attr.exclude_host) + continue; + + __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); + } +} + +static int hsw_hw_config(struct perf_event *event) +{ + int ret = intel_pmu_hw_config(event); + + if (ret) + return ret; + if (!boot_cpu_has(X86_FEATURE_RTM) && !boot_cpu_has(X86_FEATURE_HLE)) + return 0; + event->hw.config |= event->attr.config & (HSW_IN_TX|HSW_IN_TX_CHECKPOINTED); + + /* + * IN_TX/IN_TX-CP filters are not supported by the Haswell PMU with + * PEBS or in ANY thread mode. Since the results are non-sensical forbid + * this combination. + */ + if ((event->hw.config & (HSW_IN_TX|HSW_IN_TX_CHECKPOINTED)) && + ((event->hw.config & ARCH_PERFMON_EVENTSEL_ANY) || + event->attr.precise_ip > 0)) + return -EOPNOTSUPP; + + if (event_is_checkpointed(event)) { + /* + * Sampling of checkpointed events can cause situations where + * the CPU constantly aborts because of a overflow, which is + * then checkpointed back and ignored. Forbid checkpointing + * for sampling. + * + * But still allow a long sampling period, so that perf stat + * from KVM works. 
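+		 * (the check below therefore rejects only periods in
+		 * (0, 0x7fffffff); anything larger is treated as
+		 * effectively non-sampling)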
+ */ + if (event->attr.sample_period > 0 && + event->attr.sample_period < 0x7fffffff) + return -EOPNOTSUPP; + } + return 0; +} + +static struct event_constraint counter2_constraint = + EVENT_CONSTRAINT(0, 0x4, 0); + +static struct event_constraint * +hsw_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) +{ + struct event_constraint *c = intel_get_event_constraints(cpuc, event); + + /* Handle special quirk on in_tx_checkpointed only in counter 2 */ + if (event->hw.config & HSW_IN_TX_CHECKPOINTED) { + if (c->idxmsk64 & (1U << 2)) + return &counter2_constraint; + return &emptyconstraint; + } + + return c; +} + +PMU_FORMAT_ATTR(event, "config:0-7" ); +PMU_FORMAT_ATTR(umask, "config:8-15" ); +PMU_FORMAT_ATTR(edge, "config:18" ); +PMU_FORMAT_ATTR(pc, "config:19" ); +PMU_FORMAT_ATTR(any, "config:21" ); /* v3 + */ +PMU_FORMAT_ATTR(inv, "config:23" ); +PMU_FORMAT_ATTR(cmask, "config:24-31" ); +PMU_FORMAT_ATTR(in_tx, "config:32"); +PMU_FORMAT_ATTR(in_tx_cp, "config:33"); + +static struct attribute *intel_arch_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_pc.attr, + &format_attr_inv.attr, + &format_attr_cmask.attr, + NULL, +}; + +ssize_t intel_event_sysfs_show(char *page, u64 config) +{ + u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT); + + return x86_event_sysfs_show(page, config, event); +} + +static __initconst const struct x86_pmu core_pmu = { + .name = "core", + .handle_irq = x86_pmu_handle_irq, + .disable_all = x86_pmu_disable_all, + .enable_all = core_pmu_enable_all, + .enable = core_pmu_enable_event, + .disable = x86_pmu_disable_event, + .hw_config = x86_pmu_hw_config, + .schedule_events = x86_schedule_events, + .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, + .perfctr = MSR_ARCH_PERFMON_PERFCTR0, + .event_map = intel_pmu_event_map, + .max_events = ARRAY_SIZE(intel_perfmon_event_map), + .apic = 1, + /* + * Intel PMCs cannot be accessed sanely above 32 bit width, + * so we install an artificial 1<<31 period regardless of + * the generic event period: + */ + .max_period = (1ULL << 31) - 1, + .get_event_constraints = intel_get_event_constraints, + .put_event_constraints = intel_put_event_constraints, + .event_constraints = intel_core_event_constraints, + .guest_get_msrs = core_guest_get_msrs, + .format_attrs = intel_arch_formats_attr, + .events_sysfs_show = intel_event_sysfs_show, +}; + +struct intel_shared_regs *allocate_shared_regs(int cpu) +{ + struct intel_shared_regs *regs; + int i; + + regs = kzalloc_node(sizeof(struct intel_shared_regs), + GFP_KERNEL, cpu_to_node(cpu)); + if (regs) { + /* + * initialize the locks to keep lockdep happy + */ + for (i = 0; i < EXTRA_REG_MAX; i++) + raw_spin_lock_init(®s->regs[i].lock); + + regs->core_id = -1; + } + return regs; +} + +static int intel_pmu_cpu_prepare(int cpu) +{ + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + + if (!(x86_pmu.extra_regs || x86_pmu.lbr_sel_map)) + return NOTIFY_OK; + + cpuc->shared_regs = allocate_shared_regs(cpu); + if (!cpuc->shared_regs) + return NOTIFY_BAD; + + return NOTIFY_OK; +} + +static void intel_pmu_cpu_starting(int cpu) +{ + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + int core_id = topology_core_id(cpu); + int i; + + init_debug_store_on_cpu(cpu); + /* + * Deal with CPUs that don't clear their LBRs on power-up. 
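+ * The reset below simply zeroes every LBR_FROM/LBR_TO pair
+ * (e.g. MSRs 0x680+i and 0x6c0+i on Nehalem-class cores).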
+ */ + intel_pmu_lbr_reset(); + + cpuc->lbr_sel = NULL; + + if (!cpuc->shared_regs) + return; + + if (!(x86_pmu.er_flags & ERF_NO_HT_SHARING)) { + for_each_cpu(i, topology_thread_cpumask(cpu)) { + struct intel_shared_regs *pc; + + pc = per_cpu(cpu_hw_events, i).shared_regs; + if (pc && pc->core_id == core_id) { + cpuc->kfree_on_online = cpuc->shared_regs; + cpuc->shared_regs = pc; + break; + } + } + cpuc->shared_regs->core_id = core_id; + cpuc->shared_regs->refcnt++; + } + + if (x86_pmu.lbr_sel_map) + cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR]; +} + +static void intel_pmu_cpu_dying(int cpu) +{ + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + struct intel_shared_regs *pc; + + pc = cpuc->shared_regs; + if (pc) { + if (pc->core_id == -1 || --pc->refcnt == 0) + kfree(pc); + cpuc->shared_regs = NULL; + } + + fini_debug_store_on_cpu(cpu); +} + +static void intel_pmu_flush_branch_stack(void) +{ + /* + * Intel LBR does not tag entries with the + * PID of the current task, then we need to + * flush it on ctxsw + * For now, we simply reset it + */ + if (x86_pmu.lbr_nr) + intel_pmu_lbr_reset(); +} + +PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63"); + +PMU_FORMAT_ATTR(ldlat, "config1:0-15"); + +static struct attribute *intel_arch3_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_pc.attr, + &format_attr_any.attr, + &format_attr_inv.attr, + &format_attr_cmask.attr, + &format_attr_in_tx.attr, + &format_attr_in_tx_cp.attr, + + &format_attr_offcore_rsp.attr, /* XXX do NHM/WSM + SNB breakout */ + &format_attr_ldlat.attr, /* PEBS load latency */ + NULL, +}; + +static __initconst const struct x86_pmu intel_pmu = { + .name = "Intel", + .handle_irq = intel_pmu_handle_irq, + .disable_all = intel_pmu_disable_all, + .enable_all = intel_pmu_enable_all, + .enable = intel_pmu_enable_event, + .disable = intel_pmu_disable_event, + .hw_config = intel_pmu_hw_config, + .schedule_events = x86_schedule_events, + .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, + .perfctr = MSR_ARCH_PERFMON_PERFCTR0, + .event_map = intel_pmu_event_map, + .max_events = ARRAY_SIZE(intel_perfmon_event_map), + .apic = 1, + /* + * Intel PMCs cannot be accessed sanely above 32 bit width, + * so we install an artificial 1<<31 period regardless of + * the generic event period: + */ + .max_period = (1ULL << 31) - 1, + .get_event_constraints = intel_get_event_constraints, + .put_event_constraints = intel_put_event_constraints, + .pebs_aliases = intel_pebs_aliases_core2, + + .format_attrs = intel_arch3_formats_attr, + .events_sysfs_show = intel_event_sysfs_show, + + .cpu_prepare = intel_pmu_cpu_prepare, + .cpu_starting = intel_pmu_cpu_starting, + .cpu_dying = intel_pmu_cpu_dying, + .guest_get_msrs = intel_guest_get_msrs, + .flush_branch_stack = intel_pmu_flush_branch_stack, +}; + +static __init void intel_clovertown_quirk(void) +{ + /* + * PEBS is unreliable due to: + * + * AJ67 - PEBS may experience CPL leaks + * AJ68 - PEBS PMI may be delayed by one event + * AJ69 - GLOBAL_STATUS[62] will only be set when DEBUGCTL[12] + * AJ106 - FREEZE_LBRS_ON_PMI doesn't work in combination with PEBS + * + * AJ67 could be worked around by restricting the OS/USR flags. + * AJ69 could be worked around by setting PMU_FREEZE_ON_PMI. + * + * AJ106 could possibly be worked around by not allowing LBR + * usage from PEBS, including the fixup. + * AJ68 could possibly be worked around by always programming + * a pebs_event_reset[0] value and coping with the lost events. 
+ *
+ * But taken together it might just make sense to not enable PEBS on
+ * these chips.
+ */
+ pr_warn("PEBS disabled due to CPU errata\n");
+ x86_pmu.pebs = 0;
+ x86_pmu.pebs_constraints = NULL;
+}
+
+static int intel_snb_pebs_broken(int cpu)
+{
+ u32 rev = UINT_MAX; /* default to broken for unknown models */
+
+ switch (cpu_data(cpu).x86_model) {
+ case 42: /* SNB */
+ rev = 0x28;
+ break;
+
+ case 45: /* SNB-EP */
+ switch (cpu_data(cpu).x86_mask) {
+ case 6: rev = 0x618; break;
+ case 7: rev = 0x70c; break;
+ }
+ }
+
+ return (cpu_data(cpu).microcode < rev);
+}
+
+static void intel_snb_check_microcode(void)
+{
+ int pebs_broken = 0;
+ int cpu;
+
+ get_online_cpus();
+ for_each_online_cpu(cpu) {
+ if ((pebs_broken = intel_snb_pebs_broken(cpu)))
+ break;
+ }
+ put_online_cpus();
+
+ if (pebs_broken == x86_pmu.pebs_broken)
+ return;
+
+ /*
+ * Serialized by the microcode lock.
+ */
+ if (x86_pmu.pebs_broken) {
+ pr_info("PEBS enabled due to microcode update\n");
+ x86_pmu.pebs_broken = 0;
+ } else {
+ pr_info("PEBS disabled due to CPU errata, please upgrade microcode\n");
+ x86_pmu.pebs_broken = 1;
+ }
+}
+
+/*
+ * Under certain circumstances, accessing certain MSRs may cause a #GP fault.
+ * This function tests whether the given MSR can be safely accessed.
+ */
+static bool check_msr(unsigned long msr, u64 mask)
+{
+ u64 val_old, val_new, val_tmp;
+
+ /*
+ * Read the current value, change it and read it back to see if it
+ * matches, this is needed to detect certain hardware emulators
+ * (qemu/kvm) that don't trap on the MSR access and always return 0s.
+ */
+ if (rdmsrl_safe(msr, &val_old))
+ return false;
+
+ /*
+ * Only change the bits which can be updated by wrmsrl.
+ */
+ val_tmp = val_old ^ mask;
+ if (wrmsrl_safe(msr, val_tmp) ||
+ rdmsrl_safe(msr, &val_new))
+ return false;
+
+ if (val_new != val_tmp)
+ return false;
+
+ /*
+ * The MSR is known to be safely accessible at this point;
+ * restore the old value and return.
+ */
+ wrmsrl(msr, val_old);
+
+ return true;
+}
+
+static __init void intel_sandybridge_quirk(void)
+{
+ x86_pmu.check_microcode = intel_snb_check_microcode;
+ intel_snb_check_microcode();
+}
+
+static const struct { int id; char *name; } intel_arch_events_map[] __initconst = {
+ { PERF_COUNT_HW_CPU_CYCLES, "cpu cycles" },
+ { PERF_COUNT_HW_INSTRUCTIONS, "instructions" },
+ { PERF_COUNT_HW_BUS_CYCLES, "bus cycles" },
+ { PERF_COUNT_HW_CACHE_REFERENCES, "cache references" },
+ { PERF_COUNT_HW_CACHE_MISSES, "cache misses" },
+ { PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "branch instructions" },
+ { PERF_COUNT_HW_BRANCH_MISSES, "branch misses" },
+};
+
+static __init void intel_arch_events_quirk(void)
+{
+ int bit;
+
+ /* disable events reported as not present by CPUID */
+ for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(intel_arch_events_map)) {
+ intel_perfmon_event_map[intel_arch_events_map[bit].id] = 0;
+ pr_warn("CPUID marked event: \'%s\' unavailable\n",
+ intel_arch_events_map[bit].name);
+ }
+}
+
+static __init void intel_nehalem_quirk(void)
+{
+ union cpuid10_ebx ebx;
+
+ ebx.full = x86_pmu.events_maskl;
+ if (ebx.split.no_branch_misses_retired) {
+ /*
+ * Erratum AAJ80 detected; we work around it by using
+ * the BR_MISP_EXEC.ANY event.
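+ * (Encoded below as raw 0x7f89: event select 0x89, umask 0x7f.)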
+ * This will over-count branch-misses, but it's still much
+ * better than the architectural event, which is often
+ * completely bogus:
+ */
+ intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89;
+ ebx.split.no_branch_misses_retired = 0;
+ x86_pmu.events_maskl = ebx.full;
+ pr_info("CPU erratum AAJ80 worked around\n");
+ }
+}
+
+EVENT_ATTR_STR(mem-loads, mem_ld_hsw, "event=0xcd,umask=0x1,ldlat=3");
+EVENT_ATTR_STR(mem-stores, mem_st_hsw, "event=0xd0,umask=0x82");
+
+/* Haswell special events */
+EVENT_ATTR_STR(tx-start, tx_start, "event=0xc9,umask=0x1");
+EVENT_ATTR_STR(tx-commit, tx_commit, "event=0xc9,umask=0x2");
+EVENT_ATTR_STR(tx-abort, tx_abort, "event=0xc9,umask=0x4");
+EVENT_ATTR_STR(tx-capacity, tx_capacity, "event=0x54,umask=0x2");
+EVENT_ATTR_STR(tx-conflict, tx_conflict, "event=0x54,umask=0x1");
+EVENT_ATTR_STR(el-start, el_start, "event=0xc8,umask=0x1");
+EVENT_ATTR_STR(el-commit, el_commit, "event=0xc8,umask=0x2");
+EVENT_ATTR_STR(el-abort, el_abort, "event=0xc8,umask=0x4");
+EVENT_ATTR_STR(el-capacity, el_capacity, "event=0x54,umask=0x2");
+EVENT_ATTR_STR(el-conflict, el_conflict, "event=0x54,umask=0x1");
+EVENT_ATTR_STR(cycles-t, cycles_t, "event=0x3c,in_tx=1");
+EVENT_ATTR_STR(cycles-ct, cycles_ct, "event=0x3c,in_tx=1,in_tx_cp=1");
+
+static struct attribute *hsw_events_attrs[] = {
+ EVENT_PTR(tx_start),
+ EVENT_PTR(tx_commit),
+ EVENT_PTR(tx_abort),
+ EVENT_PTR(tx_capacity),
+ EVENT_PTR(tx_conflict),
+ EVENT_PTR(el_start),
+ EVENT_PTR(el_commit),
+ EVENT_PTR(el_abort),
+ EVENT_PTR(el_capacity),
+ EVENT_PTR(el_conflict),
+ EVENT_PTR(cycles_t),
+ EVENT_PTR(cycles_ct),
+ EVENT_PTR(mem_ld_hsw),
+ EVENT_PTR(mem_st_hsw),
+ NULL
+};
+
+__init int intel_pmu_init(void)
+{
+ union cpuid10_edx edx;
+ union cpuid10_eax eax;
+ union cpuid10_ebx ebx;
+ struct event_constraint *c;
+ unsigned int unused;
+ struct extra_reg *er;
+ int version, i;
+
+ if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
+ switch (boot_cpu_data.x86) {
+ case 0x6:
+ return p6_pmu_init();
+ case 0xb:
+ return knc_pmu_init();
+ case 0xf:
+ return p4_pmu_init();
+ }
+ return -ENODEV;
+ }
+
+ /*
+ * Check whether the Architectural PerfMon supports
+ * the Branch Misses Retired hw_event or not.
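+ *
+ * CPUID leaf 0xa packs this into EAX/EBX, roughly:
+ *   EAX: version_id [7:0], num_counters [15:8],
+ *        bit_width [23:16], mask_length [31:24]
+ *   EBX: one "event not available" bit per architectural
+ *        event, consumed below via events_mask.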
+ */ + cpuid(10, &eax.full, &ebx.full, &unused, &edx.full); + if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT) + return -ENODEV; + + version = eax.split.version_id; + if (version < 2) + x86_pmu = core_pmu; + else + x86_pmu = intel_pmu; + + x86_pmu.version = version; + x86_pmu.num_counters = eax.split.num_counters; + x86_pmu.cntval_bits = eax.split.bit_width; + x86_pmu.cntval_mask = (1ULL << eax.split.bit_width) - 1; + + x86_pmu.events_maskl = ebx.full; + x86_pmu.events_mask_len = eax.split.mask_length; + + x86_pmu.max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, x86_pmu.num_counters); + + /* + * Quirk: v2 perfmon does not report fixed-purpose events, so + * assume at least 3 events: + */ + if (version > 1) + x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3); + + if (boot_cpu_has(X86_FEATURE_PDCM)) { + u64 capabilities; + + rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities); + x86_pmu.intel_cap.capabilities = capabilities; + } + + intel_ds_init(); + + x86_add_quirk(intel_arch_events_quirk); /* Install first, so it runs last */ + + /* + * Install the hw-cache-events table: + */ + switch (boot_cpu_data.x86_model) { + case 14: /* 65 nm core solo/duo, "Yonah" */ + pr_cont("Core events, "); + break; + + case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ + x86_add_quirk(intel_clovertown_quirk); + case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ + case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ + case 29: /* six-core 45 nm xeon "Dunnington" */ + memcpy(hw_cache_event_ids, core2_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + intel_pmu_lbr_init_core(); + + x86_pmu.event_constraints = intel_core2_event_constraints; + x86_pmu.pebs_constraints = intel_core2_pebs_event_constraints; + pr_cont("Core2 events, "); + break; + + case 26: /* 45 nm nehalem, "Bloomfield" */ + case 30: /* 45 nm nehalem, "Lynnfield" */ + case 46: /* 45 nm nehalem-ex, "Beckton" */ + memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs, + sizeof(hw_cache_extra_regs)); + + intel_pmu_lbr_init_nhm(); + + x86_pmu.event_constraints = intel_nehalem_event_constraints; + x86_pmu.pebs_constraints = intel_nehalem_pebs_event_constraints; + x86_pmu.enable_all = intel_pmu_nhm_enable_all; + x86_pmu.extra_regs = intel_nehalem_extra_regs; + + x86_pmu.cpu_events = nhm_events_attrs; + + /* UOPS_ISSUED.STALLED_CYCLES */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = + X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1); + /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = + X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1); + + x86_add_quirk(intel_nehalem_quirk); + + pr_cont("Nehalem events, "); + break; + + case 28: /* Atom */ + case 38: /* Lincroft */ + case 39: /* Penwell */ + case 53: /* Cloverview */ + case 54: /* Cedarview */ + memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + intel_pmu_lbr_init_atom(); + + x86_pmu.event_constraints = intel_gen_event_constraints; + x86_pmu.pebs_constraints = intel_atom_pebs_event_constraints; + pr_cont("Atom events, "); + break; + + case 55: /* Atom 22nm "Silvermont" */ + case 77: /* Avoton "Silvermont" */ + memcpy(hw_cache_event_ids, slm_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, slm_hw_cache_extra_regs, + sizeof(hw_cache_extra_regs)); + + 
intel_pmu_lbr_init_atom(); + + x86_pmu.event_constraints = intel_slm_event_constraints; + x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints; + x86_pmu.extra_regs = intel_slm_extra_regs; + x86_pmu.er_flags |= ERF_HAS_RSP_1; + pr_cont("Silvermont events, "); + break; + + case 37: /* 32 nm nehalem, "Clarkdale" */ + case 44: /* 32 nm nehalem, "Gulftown" */ + case 47: /* 32 nm Xeon E7 */ + memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs, + sizeof(hw_cache_extra_regs)); + + intel_pmu_lbr_init_nhm(); + + x86_pmu.event_constraints = intel_westmere_event_constraints; + x86_pmu.enable_all = intel_pmu_nhm_enable_all; + x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints; + x86_pmu.extra_regs = intel_westmere_extra_regs; + x86_pmu.er_flags |= ERF_HAS_RSP_1; + + x86_pmu.cpu_events = nhm_events_attrs; + + /* UOPS_ISSUED.STALLED_CYCLES */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = + X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1); + /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = + X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1); + + pr_cont("Westmere events, "); + break; + + case 42: /* SandyBridge */ + case 45: /* SandyBridge, "Romely-EP" */ + x86_add_quirk(intel_sandybridge_quirk); + memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, + sizeof(hw_cache_extra_regs)); + + intel_pmu_lbr_init_snb(); + + x86_pmu.event_constraints = intel_snb_event_constraints; + x86_pmu.pebs_constraints = intel_snb_pebs_event_constraints; + x86_pmu.pebs_aliases = intel_pebs_aliases_snb; + if (boot_cpu_data.x86_model == 45) + x86_pmu.extra_regs = intel_snbep_extra_regs; + else + x86_pmu.extra_regs = intel_snb_extra_regs; + /* all extra regs are per-cpu when HT is on */ + x86_pmu.er_flags |= ERF_HAS_RSP_1; + x86_pmu.er_flags |= ERF_NO_HT_SHARING; + + x86_pmu.cpu_events = snb_events_attrs; + + /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = + X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1); + /* UOPS_DISPATCHED.THREAD,c=1,i=1 to count stall cycles*/ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = + X86_CONFIG(.event=0xb1, .umask=0x01, .inv=1, .cmask=1); + + pr_cont("SandyBridge events, "); + break; + case 58: /* IvyBridge */ + case 62: /* IvyBridge EP */ + memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + /* dTLB-load-misses on IVB is different than SNB */ + hw_cache_event_ids[C(DTLB)][C(OP_READ)][C(RESULT_MISS)] = 0x8108; /* DTLB_LOAD_MISSES.DEMAND_LD_MISS_CAUSES_A_WALK */ + + memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, + sizeof(hw_cache_extra_regs)); + + intel_pmu_lbr_init_snb(); + + x86_pmu.event_constraints = intel_ivb_event_constraints; + x86_pmu.pebs_constraints = intel_ivb_pebs_event_constraints; + x86_pmu.pebs_aliases = intel_pebs_aliases_snb; + if (boot_cpu_data.x86_model == 62) + x86_pmu.extra_regs = intel_snbep_extra_regs; + else + x86_pmu.extra_regs = intel_snb_extra_regs; + /* all extra regs are per-cpu when HT is on */ + x86_pmu.er_flags |= ERF_HAS_RSP_1; + x86_pmu.er_flags |= ERF_NO_HT_SHARING; + + x86_pmu.cpu_events = snb_events_attrs; + + /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = + 
X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
+
+ pr_cont("IvyBridge events, ");
+ break;
+
+ case 60: /* Haswell Client */
+ case 70:
+ case 71:
+ case 63:
+ case 69:
+ x86_pmu.late_ack = true;
+ memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+ memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+
+ intel_pmu_lbr_init_snb();
+
+ x86_pmu.event_constraints = intel_hsw_event_constraints;
+ x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
+ x86_pmu.extra_regs = intel_snb_extra_regs;
+ x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
+ /* all extra regs are per-cpu when HT is on */
+ x86_pmu.er_flags |= ERF_HAS_RSP_1;
+ x86_pmu.er_flags |= ERF_NO_HT_SHARING;
+
+ x86_pmu.hw_config = hsw_hw_config;
+ x86_pmu.get_event_constraints = hsw_get_event_constraints;
+ x86_pmu.cpu_events = hsw_events_attrs;
+ x86_pmu.lbr_double_abort = true;
+ pr_cont("Haswell events, ");
+ break;
+
+ default:
+ switch (x86_pmu.version) {
+ case 1:
+ x86_pmu.event_constraints = intel_v1_event_constraints;
+ pr_cont("generic architected perfmon v1, ");
+ break;
+ default:
+ /*
+ * default constraints for v2 and up
+ */
+ x86_pmu.event_constraints = intel_gen_event_constraints;
+ pr_cont("generic architected perfmon, ");
+ break;
+ }
+ }
+
+ if (x86_pmu.num_counters > INTEL_PMC_MAX_GENERIC) {
+ WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
+ x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC);
+ x86_pmu.num_counters = INTEL_PMC_MAX_GENERIC;
+ }
+ x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
+
+ if (x86_pmu.num_counters_fixed > INTEL_PMC_MAX_FIXED) {
+ WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
+ x86_pmu.num_counters_fixed, INTEL_PMC_MAX_FIXED);
+ x86_pmu.num_counters_fixed = INTEL_PMC_MAX_FIXED;
+ }
+
+ x86_pmu.intel_ctrl |=
+ ((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED;
+
+ if (x86_pmu.event_constraints) {
+ /*
+ * event on fixed counter2 (REF_CYCLES) only works on this
+ * counter, so do not extend mask to generic counters
+ */
+ for_each_event_constraint(c, x86_pmu.event_constraints) {
+ if (c->cmask != FIXED_EVENT_FLAGS
+ || c->idxmsk64 == INTEL_PMC_MSK_FIXED_REF_CYCLES) {
+ continue;
+ }
+
+ c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
+ c->weight += x86_pmu.num_counters;
+ }
+ }
+
+ /*
+ * Accessing the LBR MSRs may cause a #GP under certain circumstances,
+ * e.g. KVM doesn't support the LBR MSRs.
+ * Check all LBR MSRs here.
+ * Disable LBR access if any LBR MSR cannot be accessed.
+ */
+ if (x86_pmu.lbr_nr && !check_msr(x86_pmu.lbr_tos, 0x3UL))
+ x86_pmu.lbr_nr = 0;
+ for (i = 0; i < x86_pmu.lbr_nr; i++) {
+ if (!(check_msr(x86_pmu.lbr_from + i, 0xffffUL) &&
+ check_msr(x86_pmu.lbr_to + i, 0xffffUL)))
+ x86_pmu.lbr_nr = 0;
+ }
+
+ /*
+ * Accessing the extra MSRs may cause a #GP under certain circumstances,
+ * e.g. KVM doesn't support the offcore event MSRs.
+ * Check all extra_regs here.
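+ *
+ * check_msr() XORs the mask bits into the register and reads the
+ * value back; e.g. under an emulator that ignores the write and
+ * always returns 0s, the readback mismatch leaves extra_msr_access
+ * false for that register.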
+ */ + if (x86_pmu.extra_regs) { + for (er = x86_pmu.extra_regs; er->msr; er++) { + er->extra_msr_access = check_msr(er->msr, 0x1ffUL); + /* Disable LBR select mapping */ + if ((er->idx == EXTRA_REG_LBR) && !er->extra_msr_access) + x86_pmu.lbr_sel_map = NULL; + } + } + + /* Support full width counters using alternative MSR range */ + if (x86_pmu.intel_cap.full_width_write) { + x86_pmu.max_period = x86_pmu.cntval_mask; + x86_pmu.perfctr = MSR_IA32_PMC0; + pr_cont("full-width counters, "); + } + + return 0; +} diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c new file mode 100644 index 00000000000..696ade311de --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -0,0 +1,1106 @@ +#include <linux/bitops.h> +#include <linux/types.h> +#include <linux/slab.h> + +#include <asm/perf_event.h> +#include <asm/insn.h> + +#include "perf_event.h" + +/* The size of a BTS record in bytes: */ +#define BTS_RECORD_SIZE 24 + +#define BTS_BUFFER_SIZE (PAGE_SIZE << 4) +#define PEBS_BUFFER_SIZE PAGE_SIZE +#define PEBS_FIXUP_SIZE PAGE_SIZE + +/* + * pebs_record_32 for p4 and core not supported + +struct pebs_record_32 { + u32 flags, ip; + u32 ax, bc, cx, dx; + u32 si, di, bp, sp; +}; + + */ + +union intel_x86_pebs_dse { + u64 val; + struct { + unsigned int ld_dse:4; + unsigned int ld_stlb_miss:1; + unsigned int ld_locked:1; + unsigned int ld_reserved:26; + }; + struct { + unsigned int st_l1d_hit:1; + unsigned int st_reserved1:3; + unsigned int st_stlb_miss:1; + unsigned int st_locked:1; + unsigned int st_reserved2:26; + }; +}; + + +/* + * Map PEBS Load Latency Data Source encodings to generic + * memory data source information + */ +#define P(a, b) PERF_MEM_S(a, b) +#define OP_LH (P(OP, LOAD) | P(LVL, HIT)) +#define SNOOP_NONE_MISS (P(SNOOP, NONE) | P(SNOOP, MISS)) + +static const u64 pebs_data_source[] = { + P(OP, LOAD) | P(LVL, MISS) | P(LVL, L3) | P(SNOOP, NA),/* 0x00:ukn L3 */ + OP_LH | P(LVL, L1) | P(SNOOP, NONE), /* 0x01: L1 local */ + OP_LH | P(LVL, LFB) | P(SNOOP, NONE), /* 0x02: LFB hit */ + OP_LH | P(LVL, L2) | P(SNOOP, NONE), /* 0x03: L2 hit */ + OP_LH | P(LVL, L3) | P(SNOOP, NONE), /* 0x04: L3 hit */ + OP_LH | P(LVL, L3) | P(SNOOP, MISS), /* 0x05: L3 hit, snoop miss */ + OP_LH | P(LVL, L3) | P(SNOOP, HIT), /* 0x06: L3 hit, snoop hit */ + OP_LH | P(LVL, L3) | P(SNOOP, HITM), /* 0x07: L3 hit, snoop hitm */ + OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HIT), /* 0x08: L3 miss snoop hit */ + OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HITM), /* 0x09: L3 miss snoop hitm*/ + OP_LH | P(LVL, LOC_RAM) | P(SNOOP, HIT), /* 0x0a: L3 miss, shared */ + OP_LH | P(LVL, REM_RAM1) | P(SNOOP, HIT), /* 0x0b: L3 miss, shared */ + OP_LH | P(LVL, LOC_RAM) | SNOOP_NONE_MISS,/* 0x0c: L3 miss, excl */ + OP_LH | P(LVL, REM_RAM1) | SNOOP_NONE_MISS,/* 0x0d: L3 miss, excl */ + OP_LH | P(LVL, IO) | P(SNOOP, NONE), /* 0x0e: I/O */ + OP_LH | P(LVL, UNC) | P(SNOOP, NONE), /* 0x0f: uncached */ +}; + +static u64 precise_store_data(u64 status) +{ + union intel_x86_pebs_dse dse; + u64 val = P(OP, STORE) | P(SNOOP, NA) | P(LVL, L1) | P(TLB, L2); + + dse.val = status; + + /* + * bit 4: TLB access + * 1 = stored missed 2nd level TLB + * + * so it either hit the walker or the OS + * otherwise hit 2nd level TLB + */ + if (dse.st_stlb_miss) + val |= P(TLB, MISS); + else + val |= P(TLB, HIT); + + /* + * bit 0: hit L1 data cache + * if not set, then all we know is that + * it missed L1D + */ + if (dse.st_l1d_hit) + val |= P(LVL, HIT); + else + val |= P(LVL, MISS); + + /* + * bit 5: Locked prefix + */ + 
if (dse.st_locked) + val |= P(LOCK, LOCKED); + + return val; +} + +static u64 precise_store_data_hsw(struct perf_event *event, u64 status) +{ + union perf_mem_data_src dse; + u64 cfg = event->hw.config & INTEL_ARCH_EVENT_MASK; + + dse.val = 0; + dse.mem_op = PERF_MEM_OP_STORE; + dse.mem_lvl = PERF_MEM_LVL_NA; + + /* + * L1 info only valid for following events: + * + * MEM_UOPS_RETIRED.STLB_MISS_STORES + * MEM_UOPS_RETIRED.LOCK_STORES + * MEM_UOPS_RETIRED.SPLIT_STORES + * MEM_UOPS_RETIRED.ALL_STORES + */ + if (cfg != 0x12d0 && cfg != 0x22d0 && cfg != 0x42d0 && cfg != 0x82d0) + return dse.mem_lvl; + + if (status & 1) + dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT; + else + dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_MISS; + + /* Nothing else supported. Sorry. */ + return dse.val; +} + +static u64 load_latency_data(u64 status) +{ + union intel_x86_pebs_dse dse; + u64 val; + int model = boot_cpu_data.x86_model; + int fam = boot_cpu_data.x86; + + dse.val = status; + + /* + * use the mapping table for bit 0-3 + */ + val = pebs_data_source[dse.ld_dse]; + + /* + * Nehalem models do not support TLB, Lock infos + */ + if (fam == 0x6 && (model == 26 || model == 30 + || model == 31 || model == 46)) { + val |= P(TLB, NA) | P(LOCK, NA); + return val; + } + /* + * bit 4: TLB access + * 0 = did not miss 2nd level TLB + * 1 = missed 2nd level TLB + */ + if (dse.ld_stlb_miss) + val |= P(TLB, MISS) | P(TLB, L2); + else + val |= P(TLB, HIT) | P(TLB, L1) | P(TLB, L2); + + /* + * bit 5: locked prefix + */ + if (dse.ld_locked) + val |= P(LOCK, LOCKED); + + return val; +} + +struct pebs_record_core { + u64 flags, ip; + u64 ax, bx, cx, dx; + u64 si, di, bp, sp; + u64 r8, r9, r10, r11; + u64 r12, r13, r14, r15; +}; + +struct pebs_record_nhm { + u64 flags, ip; + u64 ax, bx, cx, dx; + u64 si, di, bp, sp; + u64 r8, r9, r10, r11; + u64 r12, r13, r14, r15; + u64 status, dla, dse, lat; +}; + +/* + * Same as pebs_record_nhm, with two additional fields. + */ +struct pebs_record_hsw { + u64 flags, ip; + u64 ax, bx, cx, dx; + u64 si, di, bp, sp; + u64 r8, r9, r10, r11; + u64 r12, r13, r14, r15; + u64 status, dla, dse, lat; + u64 real_ip, tsx_tuning; +}; + +union hsw_tsx_tuning { + struct { + u32 cycles_last_block : 32, + hle_abort : 1, + rtm_abort : 1, + instruction_abort : 1, + non_instruction_abort : 1, + retry : 1, + data_conflict : 1, + capacity_writes : 1, + capacity_reads : 1; + }; + u64 value; +}; + +#define PEBS_HSW_TSX_FLAGS 0xff00000000ULL + +void init_debug_store_on_cpu(int cpu) +{ + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + + if (!ds) + return; + + wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, + (u32)((u64)(unsigned long)ds), + (u32)((u64)(unsigned long)ds >> 32)); +} + +void fini_debug_store_on_cpu(int cpu) +{ + if (!per_cpu(cpu_hw_events, cpu).ds) + return; + + wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0); +} + +static DEFINE_PER_CPU(void *, insn_buffer); + +static int alloc_pebs_buffer(int cpu) +{ + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + int node = cpu_to_node(cpu); + int max, thresh = 1; /* always use a single PEBS record */ + void *buffer, *ibuffer; + + if (!x86_pmu.pebs) + return 0; + + buffer = kzalloc_node(PEBS_BUFFER_SIZE, GFP_KERNEL, node); + if (unlikely(!buffer)) + return -ENOMEM; + + /* + * HSW+ already provides us the eventing ip; no need to allocate this + * buffer then. 
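+ *
+ * (The PEBS buffer itself then holds PEBS_BUFFER_SIZE /
+ * pebs_record_size records, e.g. 4096 / 192 = 21 records for
+ * the HSW record format; illustrative arithmetic.)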
+ */ + if (x86_pmu.intel_cap.pebs_format < 2) { + ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node); + if (!ibuffer) { + kfree(buffer); + return -ENOMEM; + } + per_cpu(insn_buffer, cpu) = ibuffer; + } + + max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size; + + ds->pebs_buffer_base = (u64)(unsigned long)buffer; + ds->pebs_index = ds->pebs_buffer_base; + ds->pebs_absolute_maximum = ds->pebs_buffer_base + + max * x86_pmu.pebs_record_size; + + ds->pebs_interrupt_threshold = ds->pebs_buffer_base + + thresh * x86_pmu.pebs_record_size; + + return 0; +} + +static void release_pebs_buffer(int cpu) +{ + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + + if (!ds || !x86_pmu.pebs) + return; + + kfree(per_cpu(insn_buffer, cpu)); + per_cpu(insn_buffer, cpu) = NULL; + + kfree((void *)(unsigned long)ds->pebs_buffer_base); + ds->pebs_buffer_base = 0; +} + +static int alloc_bts_buffer(int cpu) +{ + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + int node = cpu_to_node(cpu); + int max, thresh; + void *buffer; + + if (!x86_pmu.bts) + return 0; + + buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node); + if (unlikely(!buffer)) { + WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__); + return -ENOMEM; + } + + max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE; + thresh = max / 16; + + ds->bts_buffer_base = (u64)(unsigned long)buffer; + ds->bts_index = ds->bts_buffer_base; + ds->bts_absolute_maximum = ds->bts_buffer_base + + max * BTS_RECORD_SIZE; + ds->bts_interrupt_threshold = ds->bts_absolute_maximum - + thresh * BTS_RECORD_SIZE; + + return 0; +} + +static void release_bts_buffer(int cpu) +{ + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + + if (!ds || !x86_pmu.bts) + return; + + kfree((void *)(unsigned long)ds->bts_buffer_base); + ds->bts_buffer_base = 0; +} + +static int alloc_ds_buffer(int cpu) +{ + int node = cpu_to_node(cpu); + struct debug_store *ds; + + ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node); + if (unlikely(!ds)) + return -ENOMEM; + + per_cpu(cpu_hw_events, cpu).ds = ds; + + return 0; +} + +static void release_ds_buffer(int cpu) +{ + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + + if (!ds) + return; + + per_cpu(cpu_hw_events, cpu).ds = NULL; + kfree(ds); +} + +void release_ds_buffers(void) +{ + int cpu; + + if (!x86_pmu.bts && !x86_pmu.pebs) + return; + + get_online_cpus(); + for_each_online_cpu(cpu) + fini_debug_store_on_cpu(cpu); + + for_each_possible_cpu(cpu) { + release_pebs_buffer(cpu); + release_bts_buffer(cpu); + release_ds_buffer(cpu); + } + put_online_cpus(); +} + +void reserve_ds_buffers(void) +{ + int bts_err = 0, pebs_err = 0; + int cpu; + + x86_pmu.bts_active = 0; + x86_pmu.pebs_active = 0; + + if (!x86_pmu.bts && !x86_pmu.pebs) + return; + + if (!x86_pmu.bts) + bts_err = 1; + + if (!x86_pmu.pebs) + pebs_err = 1; + + get_online_cpus(); + + for_each_possible_cpu(cpu) { + if (alloc_ds_buffer(cpu)) { + bts_err = 1; + pebs_err = 1; + } + + if (!bts_err && alloc_bts_buffer(cpu)) + bts_err = 1; + + if (!pebs_err && alloc_pebs_buffer(cpu)) + pebs_err = 1; + + if (bts_err && pebs_err) + break; + } + + if (bts_err) { + for_each_possible_cpu(cpu) + release_bts_buffer(cpu); + } + + if (pebs_err) { + for_each_possible_cpu(cpu) + release_pebs_buffer(cpu); + } + + if (bts_err && pebs_err) { + for_each_possible_cpu(cpu) + release_ds_buffer(cpu); + } else { + if (x86_pmu.bts && !bts_err) + x86_pmu.bts_active = 1; + + if (x86_pmu.pebs && !pebs_err) + x86_pmu.pebs_active = 1; + + for_each_online_cpu(cpu) + 
init_debug_store_on_cpu(cpu); + } + + put_online_cpus(); +} + +/* + * BTS + */ + +struct event_constraint bts_constraint = + EVENT_CONSTRAINT(0, 1ULL << INTEL_PMC_IDX_FIXED_BTS, 0); + +void intel_pmu_enable_bts(u64 config) +{ + unsigned long debugctlmsr; + + debugctlmsr = get_debugctlmsr(); + + debugctlmsr |= DEBUGCTLMSR_TR; + debugctlmsr |= DEBUGCTLMSR_BTS; + debugctlmsr |= DEBUGCTLMSR_BTINT; + + if (!(config & ARCH_PERFMON_EVENTSEL_OS)) + debugctlmsr |= DEBUGCTLMSR_BTS_OFF_OS; + + if (!(config & ARCH_PERFMON_EVENTSEL_USR)) + debugctlmsr |= DEBUGCTLMSR_BTS_OFF_USR; + + update_debugctlmsr(debugctlmsr); +} + +void intel_pmu_disable_bts(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + unsigned long debugctlmsr; + + if (!cpuc->ds) + return; + + debugctlmsr = get_debugctlmsr(); + + debugctlmsr &= + ~(DEBUGCTLMSR_TR | DEBUGCTLMSR_BTS | DEBUGCTLMSR_BTINT | + DEBUGCTLMSR_BTS_OFF_OS | DEBUGCTLMSR_BTS_OFF_USR); + + update_debugctlmsr(debugctlmsr); +} + +int intel_pmu_drain_bts_buffer(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct debug_store *ds = cpuc->ds; + struct bts_record { + u64 from; + u64 to; + u64 flags; + }; + struct perf_event *event = cpuc->events[INTEL_PMC_IDX_FIXED_BTS]; + struct bts_record *at, *top; + struct perf_output_handle handle; + struct perf_event_header header; + struct perf_sample_data data; + struct pt_regs regs; + + if (!event) + return 0; + + if (!x86_pmu.bts_active) + return 0; + + at = (struct bts_record *)(unsigned long)ds->bts_buffer_base; + top = (struct bts_record *)(unsigned long)ds->bts_index; + + if (top <= at) + return 0; + + memset(®s, 0, sizeof(regs)); + + ds->bts_index = ds->bts_buffer_base; + + perf_sample_data_init(&data, 0, event->hw.last_period); + + /* + * Prepare a generic sample, i.e. fill in the invariant fields. + * We will overwrite the from and to address before we output + * the sample. + */ + perf_prepare_sample(&header, &data, event, ®s); + + if (perf_output_begin(&handle, event, header.size * (top - at))) + return 1; + + for (; at < top; at++) { + data.ip = at->from; + data.addr = at->to; + + perf_output_sample(&handle, &header, &data, event); + } + + perf_output_end(&handle); + + /* There's new data available. 
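+ * Each record is BTS_RECORD_SIZE (24) bytes -- a from/to
+ * address pair plus flags -- so (top - at) above is the
+ * number of records just drained.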
*/
+ event->hw.interrupts++;
+ event->pending_kill = POLL_IN;
+ return 1;
+}
+
+/*
+ * PEBS
+ */
+struct event_constraint intel_core2_pebs_event_constraints[] = {
+ INTEL_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */
+ INTEL_UEVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */
+ INTEL_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */
+ INTEL_UEVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETIRED.ANY */
+ INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */
+ EVENT_CONSTRAINT_END
+};
+
+struct event_constraint intel_atom_pebs_event_constraints[] = {
+ INTEL_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */
+ INTEL_UEVENT_CONSTRAINT(0x00c5, 0x1), /* MISPREDICTED_BRANCH_RETIRED */
+ INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */
+ EVENT_CONSTRAINT_END
+};
+
+struct event_constraint intel_slm_pebs_event_constraints[] = {
+ INTEL_UEVENT_CONSTRAINT(0x0103, 0x1), /* REHABQ.LD_BLOCK_ST_FORWARD_PS */
+ INTEL_UEVENT_CONSTRAINT(0x0803, 0x1), /* REHABQ.LD_SPLITS_PS */
+ INTEL_UEVENT_CONSTRAINT(0x0204, 0x1), /* MEM_UOPS_RETIRED.L2_HIT_LOADS_PS */
+ INTEL_UEVENT_CONSTRAINT(0x0404, 0x1), /* MEM_UOPS_RETIRED.L2_MISS_LOADS_PS */
+ INTEL_UEVENT_CONSTRAINT(0x0804, 0x1), /* MEM_UOPS_RETIRED.DTLB_MISS_LOADS_PS */
+ INTEL_UEVENT_CONSTRAINT(0x2004, 0x1), /* MEM_UOPS_RETIRED.HITM_PS */
+ INTEL_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY_PS */
+ INTEL_UEVENT_CONSTRAINT(0x00c4, 0x1), /* BR_INST_RETIRED.ALL_BRANCHES_PS */
+ INTEL_UEVENT_CONSTRAINT(0x7ec4, 0x1), /* BR_INST_RETIRED.JCC_PS */
+ INTEL_UEVENT_CONSTRAINT(0xbfc4, 0x1), /* BR_INST_RETIRED.FAR_BRANCH_PS */
+ INTEL_UEVENT_CONSTRAINT(0xebc4, 0x1), /* BR_INST_RETIRED.NON_RETURN_IND_PS */
+ INTEL_UEVENT_CONSTRAINT(0xf7c4, 0x1), /* BR_INST_RETIRED.RETURN_PS */
+ INTEL_UEVENT_CONSTRAINT(0xf9c4, 0x1), /* BR_INST_RETIRED.CALL_PS */
+ INTEL_UEVENT_CONSTRAINT(0xfbc4, 0x1), /* BR_INST_RETIRED.IND_CALL_PS */
+ INTEL_UEVENT_CONSTRAINT(0xfdc4, 0x1), /* BR_INST_RETIRED.REL_CALL_PS */
+ INTEL_UEVENT_CONSTRAINT(0xfec4, 0x1), /* BR_INST_RETIRED.TAKEN_JCC_PS */
+ INTEL_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_MISP_RETIRED.ALL_BRANCHES_PS */
+ INTEL_UEVENT_CONSTRAINT(0x7ec5, 0x1), /* BR_INST_MISP_RETIRED.JCC_PS */
+ INTEL_UEVENT_CONSTRAINT(0xebc5, 0x1), /* BR_INST_MISP_RETIRED.NON_RETURN_IND_PS */
+ INTEL_UEVENT_CONSTRAINT(0xf7c5, 0x1), /* BR_INST_MISP_RETIRED.RETURN_PS */
+ INTEL_UEVENT_CONSTRAINT(0xfbc5, 0x1), /* BR_INST_MISP_RETIRED.IND_CALL_PS */
+ INTEL_UEVENT_CONSTRAINT(0xfec5, 0x1), /* BR_INST_MISP_RETIRED.TAKEN_JCC_PS */
+ EVENT_CONSTRAINT_END
+};
+
+struct event_constraint intel_nehalem_pebs_event_constraints[] = {
+ INTEL_PLD_CONSTRAINT(0x100b, 0xf), /* MEM_INST_RETIRED.* */
+ INTEL_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */
+ INTEL_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */
+ INTEL_EVENT_CONSTRAINT(0xc0, 0xf), /* INST_RETIRED.ANY */
+ INTEL_EVENT_CONSTRAINT(0xc2, 0xf), /* UOPS_RETIRED.* */
+ INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */
+ INTEL_UEVENT_CONSTRAINT(0x02c5, 0xf), /* BR_MISP_RETIRED.NEAR_CALL */
+ INTEL_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */
+ INTEL_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */
+ INTEL_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */
+ INTEL_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */
+ EVENT_CONSTRAINT_END
+};
+
+struct event_constraint intel_westmere_pebs_event_constraints[] = {
+ INTEL_PLD_CONSTRAINT(0x100b, 0xf), /* MEM_INST_RETIRED.* */
+ INTEL_EVENT_CONSTRAINT(0x0f, 0xf), /*
MEM_UNCORE_RETIRED.* */ + INTEL_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */ + INTEL_EVENT_CONSTRAINT(0xc0, 0xf), /* INSTR_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xc2, 0xf), /* UOPS_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */ + INTEL_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */ + INTEL_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */ + EVENT_CONSTRAINT_END +}; + +struct event_constraint intel_snb_pebs_event_constraints[] = { + INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ + INTEL_UEVENT_CONSTRAINT(0x01c2, 0xf), /* UOPS_RETIRED.ALL */ + INTEL_UEVENT_CONSTRAINT(0x02c2, 0xf), /* UOPS_RETIRED.RETIRE_SLOTS */ + INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */ + INTEL_PLD_CONSTRAINT(0x01cd, 0x8), /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */ + INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */ + INTEL_EVENT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */ + INTEL_UEVENT_CONSTRAINT(0x02d4, 0xf), /* MEM_LOAD_UOPS_MISC_RETIRED.LLC_MISS */ + EVENT_CONSTRAINT_END +}; + +struct event_constraint intel_ivb_pebs_event_constraints[] = { + INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ + INTEL_UEVENT_CONSTRAINT(0x01c2, 0xf), /* UOPS_RETIRED.ALL */ + INTEL_UEVENT_CONSTRAINT(0x02c2, 0xf), /* UOPS_RETIRED.RETIRE_SLOTS */ + INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */ + INTEL_PLD_CONSTRAINT(0x01cd, 0x8), /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */ + INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */ + INTEL_EVENT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */ + EVENT_CONSTRAINT_END +}; + +struct event_constraint intel_hsw_pebs_event_constraints[] = { + INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ + INTEL_PST_HSW_CONSTRAINT(0x01c2, 0xf), /* UOPS_RETIRED.ALL */ + INTEL_UEVENT_CONSTRAINT(0x02c2, 0xf), /* UOPS_RETIRED.RETIRE_SLOTS */ + INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ + INTEL_UEVENT_CONSTRAINT(0x01c5, 0xf), /* BR_MISP_RETIRED.CONDITIONAL */ + INTEL_UEVENT_CONSTRAINT(0x04c5, 0xf), /* BR_MISP_RETIRED.ALL_BRANCHES */ + INTEL_UEVENT_CONSTRAINT(0x20c5, 0xf), /* BR_MISP_RETIRED.NEAR_TAKEN */ + INTEL_PLD_CONSTRAINT(0x01cd, 0x8), /* MEM_TRANS_RETIRED.* */ + /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */ + INTEL_UEVENT_CONSTRAINT(0x11d0, 0xf), + /* MEM_UOPS_RETIRED.STLB_MISS_STORES */ + INTEL_UEVENT_CONSTRAINT(0x12d0, 0xf), + INTEL_UEVENT_CONSTRAINT(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */ + INTEL_UEVENT_CONSTRAINT(0x41d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_LOADS */ + /* MEM_UOPS_RETIRED.SPLIT_STORES */ + INTEL_UEVENT_CONSTRAINT(0x42d0, 0xf), + INTEL_UEVENT_CONSTRAINT(0x81d0, 0xf), /* MEM_UOPS_RETIRED.ALL_LOADS */ + INTEL_PST_HSW_CONSTRAINT(0x82d0, 0xf), /* MEM_UOPS_RETIRED.ALL_STORES */ + 
INTEL_UEVENT_CONSTRAINT(0x01d1, 0xf), /* MEM_LOAD_UOPS_RETIRED.L1_HIT */ + INTEL_UEVENT_CONSTRAINT(0x02d1, 0xf), /* MEM_LOAD_UOPS_RETIRED.L2_HIT */ + INTEL_UEVENT_CONSTRAINT(0x04d1, 0xf), /* MEM_LOAD_UOPS_RETIRED.L3_HIT */ + /* MEM_LOAD_UOPS_RETIRED.HIT_LFB */ + INTEL_UEVENT_CONSTRAINT(0x40d1, 0xf), + /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS */ + INTEL_UEVENT_CONSTRAINT(0x01d2, 0xf), + /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT */ + INTEL_UEVENT_CONSTRAINT(0x02d2, 0xf), + /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM */ + INTEL_UEVENT_CONSTRAINT(0x01d3, 0xf), + INTEL_UEVENT_CONSTRAINT(0x04c8, 0xf), /* HLE_RETIRED.Abort */ + INTEL_UEVENT_CONSTRAINT(0x04c9, 0xf), /* RTM_RETIRED.Abort */ + + EVENT_CONSTRAINT_END +}; + +struct event_constraint *intel_pebs_constraints(struct perf_event *event) +{ + struct event_constraint *c; + + if (!event->attr.precise_ip) + return NULL; + + if (x86_pmu.pebs_constraints) { + for_each_event_constraint(c, x86_pmu.pebs_constraints) { + if ((event->hw.config & c->cmask) == c->code) { + event->hw.flags |= c->flags; + return c; + } + } + } + + return &emptyconstraint; +} + +void intel_pmu_pebs_enable(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + + hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT; + + cpuc->pebs_enabled |= 1ULL << hwc->idx; + + if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT) + cpuc->pebs_enabled |= 1ULL << (hwc->idx + 32); + else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST) + cpuc->pebs_enabled |= 1ULL << 63; +} + +void intel_pmu_pebs_disable(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + + cpuc->pebs_enabled &= ~(1ULL << hwc->idx); + + if (event->hw.constraint->flags & PERF_X86_EVENT_PEBS_LDLAT) + cpuc->pebs_enabled &= ~(1ULL << (hwc->idx + 32)); + else if (event->hw.constraint->flags & PERF_X86_EVENT_PEBS_ST) + cpuc->pebs_enabled &= ~(1ULL << 63); + + if (cpuc->enabled) + wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled); + + hwc->config |= ARCH_PERFMON_EVENTSEL_INT; +} + +void intel_pmu_pebs_enable_all(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + + if (cpuc->pebs_enabled) + wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled); +} + +void intel_pmu_pebs_disable_all(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + + if (cpuc->pebs_enabled) + wrmsrl(MSR_IA32_PEBS_ENABLE, 0); +} + +static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + unsigned long from = cpuc->lbr_entries[0].from; + unsigned long old_to, to = cpuc->lbr_entries[0].to; + unsigned long ip = regs->ip; + int is_64bit = 0; + void *kaddr; + + /* + * We don't need to fixup if the PEBS assist is fault like + */ + if (!x86_pmu.intel_cap.pebs_trap) + return 1; + + /* + * No LBR entry, no basic block, no rewinding + */ + if (!cpuc->lbr_stack.nr || !from || !to) + return 0; + + /* + * Basic blocks should never cross user/kernel boundaries + */ + if (kernel_ip(ip) != kernel_ip(to)) + return 0; + + /* + * unsigned math, either ip is before the start (impossible) or + * the basic block is larger than 1 page (sanity) + */ + if ((ip - to) > PEBS_FIXUP_SIZE) + return 0; + + /* + * We sampled a branch insn, rewind using the LBR stack + */ + if (ip == to) { + set_linear_ip(regs, from); + return 1; + } + + if (!kernel_ip(ip)) { + int size, bytes; + u8 *buf = this_cpu_read(insn_buffer); + + size = ip - to; /* Must fit our 
buffer, see above */ + bytes = copy_from_user_nmi(buf, (void __user *)to, size); + if (bytes != 0) + return 0; + + kaddr = buf; + } else { + kaddr = (void *)to; + } + + do { + struct insn insn; + + old_to = to; + +#ifdef CONFIG_X86_64 + is_64bit = kernel_ip(to) || !test_thread_flag(TIF_IA32); +#endif + insn_init(&insn, kaddr, is_64bit); + insn_get_length(&insn); + + to += insn.length; + kaddr += insn.length; + } while (to < ip); + + if (to == ip) { + set_linear_ip(regs, old_to); + return 1; + } + + /* + * Even though we decoded the basic block, the instruction stream + * never matched the given IP, either the TO or the IP got corrupted. + */ + return 0; +} + +static inline u64 intel_hsw_weight(struct pebs_record_hsw *pebs) +{ + if (pebs->tsx_tuning) { + union hsw_tsx_tuning tsx = { .value = pebs->tsx_tuning }; + return tsx.cycles_last_block; + } + return 0; +} + +static inline u64 intel_hsw_transaction(struct pebs_record_hsw *pebs) +{ + u64 txn = (pebs->tsx_tuning & PEBS_HSW_TSX_FLAGS) >> 32; + + /* For RTM XABORTs also log the abort code from AX */ + if ((txn & PERF_TXN_TRANSACTION) && (pebs->ax & 1)) + txn |= ((pebs->ax >> 24) & 0xff) << PERF_TXN_ABORT_SHIFT; + return txn; +} + +static void __intel_pmu_pebs_event(struct perf_event *event, + struct pt_regs *iregs, void *__pebs) +{ + /* + * We cast to the biggest pebs_record but are careful not to + * unconditionally access the 'extra' entries. + */ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct pebs_record_hsw *pebs = __pebs; + struct perf_sample_data data; + struct pt_regs regs; + u64 sample_type; + int fll, fst; + + if (!intel_pmu_save_and_restart(event)) + return; + + fll = event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT; + fst = event->hw.flags & (PERF_X86_EVENT_PEBS_ST | + PERF_X86_EVENT_PEBS_ST_HSW); + + perf_sample_data_init(&data, 0, event->hw.last_period); + + data.period = event->hw.last_period; + sample_type = event->attr.sample_type; + + /* + * if PEBS-LL or PreciseStore + */ + if (fll || fst) { + /* + * Use latency for weight (only avail with PEBS-LL) + */ + if (fll && (sample_type & PERF_SAMPLE_WEIGHT)) + data.weight = pebs->lat; + + /* + * data.data_src encodes the data source + */ + if (sample_type & PERF_SAMPLE_DATA_SRC) { + if (fll) + data.data_src.val = load_latency_data(pebs->dse); + else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW) + data.data_src.val = + precise_store_data_hsw(event, pebs->dse); + else + data.data_src.val = precise_store_data(pebs->dse); + } + } + + /* + * We use the interrupt regs as a base because the PEBS record + * does not contain a full regs set, specifically it seems to + * lack segment descriptors, which get used by things like + * user_mode(). + * + * In the simple case fix up only the IP and BP,SP regs, for + * PERF_SAMPLE_IP and PERF_SAMPLE_CALLCHAIN to function properly. + * A possible PERF_SAMPLE_REGS will have to transfer all regs. + */ + regs = *iregs; + regs.flags = pebs->flags; + set_linear_ip(®s, pebs->ip); + regs.bp = pebs->bp; + regs.sp = pebs->sp; + + if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format >= 2) { + regs.ip = pebs->real_ip; + regs.flags |= PERF_EFLAGS_EXACT; + } else if (event->attr.precise_ip > 1 && intel_pmu_pebs_fixup_ip(®s)) + regs.flags |= PERF_EFLAGS_EXACT; + else + regs.flags &= ~PERF_EFLAGS_EXACT; + + if ((event->attr.sample_type & PERF_SAMPLE_ADDR) && + x86_pmu.intel_cap.pebs_format >= 1) + data.addr = pebs->dla; + + if (x86_pmu.intel_cap.pebs_format >= 2) { + /* Only set the TSX weight when no memory weight. 
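+ * (The HSW record's tsx_tuning field carries the cycle count
+ * of the last transactional block; an explicit memory weight
+ * from pebs->lat takes precedence above.)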
*/ + if ((event->attr.sample_type & PERF_SAMPLE_WEIGHT) && !fll) + data.weight = intel_hsw_weight(pebs); + + if (event->attr.sample_type & PERF_SAMPLE_TRANSACTION) + data.txn = intel_hsw_transaction(pebs); + } + + if (has_branch_stack(event)) + data.br_stack = &cpuc->lbr_stack; + + if (perf_event_overflow(event, &data, ®s)) + x86_pmu_stop(event, 0); +} + +static void intel_pmu_drain_pebs_core(struct pt_regs *iregs) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct debug_store *ds = cpuc->ds; + struct perf_event *event = cpuc->events[0]; /* PMC0 only */ + struct pebs_record_core *at, *top; + int n; + + if (!x86_pmu.pebs_active) + return; + + at = (struct pebs_record_core *)(unsigned long)ds->pebs_buffer_base; + top = (struct pebs_record_core *)(unsigned long)ds->pebs_index; + + /* + * Whatever else happens, drain the thing + */ + ds->pebs_index = ds->pebs_buffer_base; + + if (!test_bit(0, cpuc->active_mask)) + return; + + WARN_ON_ONCE(!event); + + if (!event->attr.precise_ip) + return; + + n = top - at; + if (n <= 0) + return; + + /* + * Should not happen, we program the threshold at 1 and do not + * set a reset value. + */ + WARN_ONCE(n > 1, "bad leftover pebs %d\n", n); + at += n - 1; + + __intel_pmu_pebs_event(event, iregs, at); +} + +static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct debug_store *ds = cpuc->ds; + struct perf_event *event = NULL; + void *at, *top; + u64 status = 0; + int bit; + + if (!x86_pmu.pebs_active) + return; + + at = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base; + top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index; + + ds->pebs_index = ds->pebs_buffer_base; + + if (unlikely(at > top)) + return; + + /* + * Should not happen, we program the threshold at 1 and do not + * set a reset value. + */ + WARN_ONCE(top - at > x86_pmu.max_pebs_events * x86_pmu.pebs_record_size, + "Unexpected number of pebs records %ld\n", + (long)(top - at) / x86_pmu.pebs_record_size); + + for (; at < top; at += x86_pmu.pebs_record_size) { + struct pebs_record_nhm *p = at; + + for_each_set_bit(bit, (unsigned long *)&p->status, + x86_pmu.max_pebs_events) { + event = cpuc->events[bit]; + if (!test_bit(bit, cpuc->active_mask)) + continue; + + WARN_ON_ONCE(!event); + + if (!event->attr.precise_ip) + continue; + + if (__test_and_set_bit(bit, (unsigned long *)&status)) + continue; + + break; + } + + if (!event || bit >= x86_pmu.max_pebs_events) + continue; + + __intel_pmu_pebs_event(event, iregs, at); + } +} + +/* + * BTS, PEBS probe and setup + */ + +void intel_ds_init(void) +{ + /* + * No support for 32bit formats + */ + if (!boot_cpu_has(X86_FEATURE_DTES64)) + return; + + x86_pmu.bts = boot_cpu_has(X86_FEATURE_BTS); + x86_pmu.pebs = boot_cpu_has(X86_FEATURE_PEBS); + if (x86_pmu.pebs) { + char pebs_type = x86_pmu.intel_cap.pebs_trap ? 
'+' : '-'; + int format = x86_pmu.intel_cap.pebs_format; + + switch (format) { + case 0: + printk(KERN_CONT "PEBS fmt0%c, ", pebs_type); + x86_pmu.pebs_record_size = sizeof(struct pebs_record_core); + x86_pmu.drain_pebs = intel_pmu_drain_pebs_core; + break; + + case 1: + printk(KERN_CONT "PEBS fmt1%c, ", pebs_type); + x86_pmu.pebs_record_size = sizeof(struct pebs_record_nhm); + x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm; + break; + + case 2: + pr_cont("PEBS fmt2%c, ", pebs_type); + x86_pmu.pebs_record_size = sizeof(struct pebs_record_hsw); + x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm; + break; + + default: + printk(KERN_CONT "no PEBS fmt%d%c, ", format, pebs_type); + x86_pmu.pebs = 0; + } + } +} + +void perf_restore_debug_store(void) +{ + struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds); + + if (!x86_pmu.bts && !x86_pmu.pebs) + return; + + wrmsrl(MSR_IA32_DS_AREA, (unsigned long)ds); +} diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c new file mode 100644 index 00000000000..9dd2459a4c7 --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -0,0 +1,779 @@ +#include <linux/perf_event.h> +#include <linux/types.h> + +#include <asm/perf_event.h> +#include <asm/msr.h> +#include <asm/insn.h> + +#include "perf_event.h" + +enum { + LBR_FORMAT_32 = 0x00, + LBR_FORMAT_LIP = 0x01, + LBR_FORMAT_EIP = 0x02, + LBR_FORMAT_EIP_FLAGS = 0x03, + LBR_FORMAT_EIP_FLAGS2 = 0x04, + LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_EIP_FLAGS2, +}; + +static enum { + LBR_EIP_FLAGS = 1, + LBR_TSX = 2, +} lbr_desc[LBR_FORMAT_MAX_KNOWN + 1] = { + [LBR_FORMAT_EIP_FLAGS] = LBR_EIP_FLAGS, + [LBR_FORMAT_EIP_FLAGS2] = LBR_EIP_FLAGS | LBR_TSX, +}; + +/* + * Intel LBR_SELECT bits + * Intel Vol3a, April 2011, Section 16.7 Table 16-10 + * + * Hardware branch filter (not available on all CPUs) + */ +#define LBR_KERNEL_BIT 0 /* do not capture at ring0 */ +#define LBR_USER_BIT 1 /* do not capture at ring > 0 */ +#define LBR_JCC_BIT 2 /* do not capture conditional branches */ +#define LBR_REL_CALL_BIT 3 /* do not capture relative calls */ +#define LBR_IND_CALL_BIT 4 /* do not capture indirect calls */ +#define LBR_RETURN_BIT 5 /* do not capture near returns */ +#define LBR_IND_JMP_BIT 6 /* do not capture indirect jumps */ +#define LBR_REL_JMP_BIT 7 /* do not capture relative jumps */ +#define LBR_FAR_BIT 8 /* do not capture far branches */ + +#define LBR_KERNEL (1 << LBR_KERNEL_BIT) +#define LBR_USER (1 << LBR_USER_BIT) +#define LBR_JCC (1 << LBR_JCC_BIT) +#define LBR_REL_CALL (1 << LBR_REL_CALL_BIT) +#define LBR_IND_CALL (1 << LBR_IND_CALL_BIT) +#define LBR_RETURN (1 << LBR_RETURN_BIT) +#define LBR_REL_JMP (1 << LBR_REL_JMP_BIT) +#define LBR_IND_JMP (1 << LBR_IND_JMP_BIT) +#define LBR_FAR (1 << LBR_FAR_BIT) + +#define LBR_PLM (LBR_KERNEL | LBR_USER) + +#define LBR_SEL_MASK 0x1ff /* valid bits in LBR_SELECT */ +#define LBR_NOT_SUPP -1 /* LBR filter not supported */ +#define LBR_IGN 0 /* ignored */ + +#define LBR_ANY \ + (LBR_JCC |\ + LBR_REL_CALL |\ + LBR_IND_CALL |\ + LBR_RETURN |\ + LBR_REL_JMP |\ + LBR_IND_JMP |\ + LBR_FAR) + +#define LBR_FROM_FLAG_MISPRED (1ULL << 63) +#define LBR_FROM_FLAG_IN_TX (1ULL << 62) +#define LBR_FROM_FLAG_ABORT (1ULL << 61) + +#define for_each_branch_sample_type(x) \ + for ((x) = PERF_SAMPLE_BRANCH_USER; \ + (x) < PERF_SAMPLE_BRANCH_MAX; (x) <<= 1) + +/* + * x86control flow change classification + * x86control flow changes include branches, interrupts, traps, faults + */ +enum { + X86_BR_NONE = 0, /* unknown */ + + X86_BR_USER = 1 << 0, /* 
branch target is user */ + X86_BR_KERNEL = 1 << 1, /* branch target is kernel */ + + X86_BR_CALL = 1 << 2, /* call */ + X86_BR_RET = 1 << 3, /* return */ + X86_BR_SYSCALL = 1 << 4, /* syscall */ + X86_BR_SYSRET = 1 << 5, /* syscall return */ + X86_BR_INT = 1 << 6, /* sw interrupt */ + X86_BR_IRET = 1 << 7, /* return from interrupt */ + X86_BR_JCC = 1 << 8, /* conditional */ + X86_BR_JMP = 1 << 9, /* jump */ + X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */ + X86_BR_IND_CALL = 1 << 11,/* indirect calls */ + X86_BR_ABORT = 1 << 12,/* transaction abort */ + X86_BR_IN_TX = 1 << 13,/* in transaction */ + X86_BR_NO_TX = 1 << 14,/* not in transaction */ +}; + +#define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL) +#define X86_BR_ANYTX (X86_BR_NO_TX | X86_BR_IN_TX) + +#define X86_BR_ANY \ + (X86_BR_CALL |\ + X86_BR_RET |\ + X86_BR_SYSCALL |\ + X86_BR_SYSRET |\ + X86_BR_INT |\ + X86_BR_IRET |\ + X86_BR_JCC |\ + X86_BR_JMP |\ + X86_BR_IRQ |\ + X86_BR_ABORT |\ + X86_BR_IND_CALL) + +#define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY) + +#define X86_BR_ANY_CALL \ + (X86_BR_CALL |\ + X86_BR_IND_CALL |\ + X86_BR_SYSCALL |\ + X86_BR_IRQ |\ + X86_BR_INT) + +static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc); + +/* + * We only support LBR implementations that have FREEZE_LBRS_ON_PMI + * otherwise it becomes near impossible to get a reliable stack. + */ + +static void __intel_pmu_lbr_enable(void) +{ + u64 debugctl; + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + + if (cpuc->lbr_sel) + wrmsrl(MSR_LBR_SELECT, cpuc->lbr_sel->config); + + rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + debugctl |= (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); + wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); +} + +static void __intel_pmu_lbr_disable(void) +{ + u64 debugctl; + + rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + debugctl &= ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); + wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); +} + +static void intel_pmu_lbr_reset_32(void) +{ + int i; + + for (i = 0; i < x86_pmu.lbr_nr; i++) + wrmsrl(x86_pmu.lbr_from + i, 0); +} + +static void intel_pmu_lbr_reset_64(void) +{ + int i; + + for (i = 0; i < x86_pmu.lbr_nr; i++) { + wrmsrl(x86_pmu.lbr_from + i, 0); + wrmsrl(x86_pmu.lbr_to + i, 0); + } +} + +void intel_pmu_lbr_reset(void) +{ + if (!x86_pmu.lbr_nr) + return; + + if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32) + intel_pmu_lbr_reset_32(); + else + intel_pmu_lbr_reset_64(); +} + +void intel_pmu_lbr_enable(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + + if (!x86_pmu.lbr_nr) + return; + + /* + * Reset the LBR stack if we changed task context to + * avoid data leaks. 
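+ *
+ * Otherwise a newly scheduled task could sample branch records
+ * left over from whatever ran on this CPU before it.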
+ */ + if (event->ctx->task && cpuc->lbr_context != event->ctx) { + intel_pmu_lbr_reset(); + cpuc->lbr_context = event->ctx; + } + cpuc->br_sel = event->hw.branch_reg.reg; + + cpuc->lbr_users++; +} + +void intel_pmu_lbr_disable(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + + if (!x86_pmu.lbr_nr) + return; + + cpuc->lbr_users--; + WARN_ON_ONCE(cpuc->lbr_users < 0); + + if (cpuc->enabled && !cpuc->lbr_users) { + __intel_pmu_lbr_disable(); + /* avoid stale pointer */ + cpuc->lbr_context = NULL; + } +} + +void intel_pmu_lbr_enable_all(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + + if (cpuc->lbr_users) + __intel_pmu_lbr_enable(); +} + +void intel_pmu_lbr_disable_all(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + + if (cpuc->lbr_users) + __intel_pmu_lbr_disable(); +} + +/* + * TOS = most recently recorded branch + */ +static inline u64 intel_pmu_lbr_tos(void) +{ + u64 tos; + + rdmsrl(x86_pmu.lbr_tos, tos); + + return tos; +} + +static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) +{ + unsigned long mask = x86_pmu.lbr_nr - 1; + u64 tos = intel_pmu_lbr_tos(); + int i; + + for (i = 0; i < x86_pmu.lbr_nr; i++) { + unsigned long lbr_idx = (tos - i) & mask; + union { + struct { + u32 from; + u32 to; + }; + u64 lbr; + } msr_lastbranch; + + rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr); + + cpuc->lbr_entries[i].from = msr_lastbranch.from; + cpuc->lbr_entries[i].to = msr_lastbranch.to; + cpuc->lbr_entries[i].mispred = 0; + cpuc->lbr_entries[i].predicted = 0; + cpuc->lbr_entries[i].reserved = 0; + } + cpuc->lbr_stack.nr = i; +} + +/* + * Due to lack of segmentation in Linux the effective address (offset) + * is the same as the linear address, allowing us to merge the LIP and EIP + * LBR formats. + */ +static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) +{ + unsigned long mask = x86_pmu.lbr_nr - 1; + int lbr_format = x86_pmu.intel_cap.lbr_format; + u64 tos = intel_pmu_lbr_tos(); + int i; + int out = 0; + + for (i = 0; i < x86_pmu.lbr_nr; i++) { + unsigned long lbr_idx = (tos - i) & mask; + u64 from, to, mis = 0, pred = 0, in_tx = 0, abort = 0; + int skip = 0; + int lbr_flags = lbr_desc[lbr_format]; + + rdmsrl(x86_pmu.lbr_from + lbr_idx, from); + rdmsrl(x86_pmu.lbr_to + lbr_idx, to); + + if (lbr_flags & LBR_EIP_FLAGS) { + mis = !!(from & LBR_FROM_FLAG_MISPRED); + pred = !mis; + skip = 1; + } + if (lbr_flags & LBR_TSX) { + in_tx = !!(from & LBR_FROM_FLAG_IN_TX); + abort = !!(from & LBR_FROM_FLAG_ABORT); + skip = 3; + } + from = (u64)((((s64)from) << skip) >> skip); + + /* + * Some CPUs report duplicated abort records, + * with the second entry not having an abort bit set. + * Skip them here. This loop runs backwards, + * so we need to undo the previous record. + * If the abort just happened outside the window + * the extra entry cannot be removed. 
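+ * E.g. if the stack holds the same record twice, the newer copy + * lacking the abort flag, the out-- below drops the copy that was + * already emitted so the abort-flagged record takes its slot.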
+ */ + if (abort && x86_pmu.lbr_double_abort && out > 0) + out--; + + cpuc->lbr_entries[out].from = from; + cpuc->lbr_entries[out].to = to; + cpuc->lbr_entries[out].mispred = mis; + cpuc->lbr_entries[out].predicted = pred; + cpuc->lbr_entries[out].in_tx = in_tx; + cpuc->lbr_entries[out].abort = abort; + cpuc->lbr_entries[out].reserved = 0; + out++; + } + cpuc->lbr_stack.nr = out; +} + +void intel_pmu_lbr_read(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + + if (!cpuc->lbr_users) + return; + + if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32) + intel_pmu_lbr_read_32(cpuc); + else + intel_pmu_lbr_read_64(cpuc); + + intel_pmu_lbr_filter(cpuc); +} + +/* + * SW filter is used: + * - in case there is no HW filter + * - in case the HW filter has errata or limitations + */ +static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event) +{ + u64 br_type = event->attr.branch_sample_type; + int mask = 0; + + if (br_type & PERF_SAMPLE_BRANCH_USER) + mask |= X86_BR_USER; + + if (br_type & PERF_SAMPLE_BRANCH_KERNEL) + mask |= X86_BR_KERNEL; + + /* we ignore BRANCH_HV here */ + + if (br_type & PERF_SAMPLE_BRANCH_ANY) + mask |= X86_BR_ANY; + + if (br_type & PERF_SAMPLE_BRANCH_ANY_CALL) + mask |= X86_BR_ANY_CALL; + + if (br_type & PERF_SAMPLE_BRANCH_ANY_RETURN) + mask |= X86_BR_RET | X86_BR_IRET | X86_BR_SYSRET; + + if (br_type & PERF_SAMPLE_BRANCH_IND_CALL) + mask |= X86_BR_IND_CALL; + + if (br_type & PERF_SAMPLE_BRANCH_ABORT_TX) + mask |= X86_BR_ABORT; + + if (br_type & PERF_SAMPLE_BRANCH_IN_TX) + mask |= X86_BR_IN_TX; + + if (br_type & PERF_SAMPLE_BRANCH_NO_TX) + mask |= X86_BR_NO_TX; + + if (br_type & PERF_SAMPLE_BRANCH_COND) + mask |= X86_BR_JCC; + + /* + * stash actual user request into reg, it may + * be used by fixup code for some CPU + */ + event->hw.branch_reg.reg = mask; +} + +/* + * setup the HW LBR filter + * Used only when available, may not be enough to disambiguate + * all branches, may need the help of the SW filter + */ +static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event) +{ + struct hw_perf_event_extra *reg; + u64 br_type = event->attr.branch_sample_type; + u64 mask = 0, m; + u64 v; + + for_each_branch_sample_type(m) { + if (!(br_type & m)) + continue; + + v = x86_pmu.lbr_sel_map[m]; + if (v == LBR_NOT_SUPP) + return -EOPNOTSUPP; + + if (v != LBR_IGN) + mask |= v; + } + reg = &event->hw.branch_reg; + reg->idx = EXTRA_REG_LBR; + + /* LBR_SELECT operates in suppress mode so invert mask */ + reg->config = ~mask & x86_pmu.lbr_sel_mask; + + return 0; +} + +int intel_pmu_setup_lbr_filter(struct perf_event *event) +{ + int ret = 0; + + /* + * no LBR on this PMU + */ + if (!x86_pmu.lbr_nr) + return -EOPNOTSUPP; + + /* + * setup SW LBR filter + */ + intel_pmu_setup_sw_lbr_filter(event); + + /* + * setup HW LBR filter, if any + */ + if (x86_pmu.lbr_sel_map) + ret = intel_pmu_setup_hw_lbr_filter(event); + + return ret; +} + +/* + * return the type of control flow change at address "from". + * The instruction is not necessarily a branch (in case of interrupt). + * + * The branch type returned also includes the priv level of the + * target of the control flow change (X86_BR_USER, X86_BR_KERNEL). + * + * If a branch type is unknown OR the instruction cannot be + * decoded (e.g., text page not present), then X86_BR_NONE is + * returned. 
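+ * + * E.g. a near return whose target is in the kernel is reported as + * X86_BR_RET | X86_BR_KERNEL.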
+ */ +static int branch_type(unsigned long from, unsigned long to, int abort) +{ + struct insn insn; + void *addr; + int bytes, size = MAX_INSN_SIZE; + int ret = X86_BR_NONE; + int ext, to_plm, from_plm; + u8 buf[MAX_INSN_SIZE]; + int is64 = 0; + + to_plm = kernel_ip(to) ? X86_BR_KERNEL : X86_BR_USER; + from_plm = kernel_ip(from) ? X86_BR_KERNEL : X86_BR_USER; + + /* + * may be zero if the LBR did not fill up after a reset by the time + * we get a PMU interrupt + */ + if (from == 0 || to == 0) + return X86_BR_NONE; + + if (abort) + return X86_BR_ABORT | to_plm; + + if (from_plm == X86_BR_USER) { + /* + * can happen if measuring at the user level only + * and we interrupt in a kernel thread, e.g., idle. + */ + if (!current->mm) + return X86_BR_NONE; + + /* may fail if text not present */ + bytes = copy_from_user_nmi(buf, (void __user *)from, size); + if (bytes != 0) + return X86_BR_NONE; + + addr = buf; + } else { + /* + * The LBR logs any address in the IP, even if the IP just + * faulted. This means userspace can control the from address. + * Ensure we don't blindly read any address by validating it is + * a known text address. + */ + if (kernel_text_address(from)) + addr = (void *)from; + else + return X86_BR_NONE; + } + + /* + * decoder needs to know the ABI especially + * on 64-bit systems running 32-bit apps + */ +#ifdef CONFIG_X86_64 + is64 = kernel_ip((unsigned long)addr) || !test_thread_flag(TIF_IA32); +#endif + insn_init(&insn, addr, is64); + insn_get_opcode(&insn); + + switch (insn.opcode.bytes[0]) { + case 0xf: + switch (insn.opcode.bytes[1]) { + case 0x05: /* syscall */ + case 0x34: /* sysenter */ + ret = X86_BR_SYSCALL; + break; + case 0x07: /* sysret */ + case 0x35: /* sysexit */ + ret = X86_BR_SYSRET; + break; + case 0x80 ... 0x8f: /* conditional */ + ret = X86_BR_JCC; + break; + default: + ret = X86_BR_NONE; + } + break; + case 0x70 ... 0x7f: /* conditional */ + ret = X86_BR_JCC; + break; + case 0xc2: /* near ret */ + case 0xc3: /* near ret */ + case 0xca: /* far ret */ + case 0xcb: /* far ret */ + ret = X86_BR_RET; + break; + case 0xcf: /* iret */ + ret = X86_BR_IRET; + break; + case 0xcc ... 0xce: /* int */ + ret = X86_BR_INT; + break; + case 0xe8: /* call near rel */ + case 0x9a: /* call far absolute */ + ret = X86_BR_CALL; + break; + case 0xe0 ... 0xe3: /* loop jmp */ + ret = X86_BR_JCC; + break; + case 0xe9 ... 0xeb: /* jmp */ + ret = X86_BR_JMP; + break; + case 0xff: /* call near absolute, call far absolute ind */ + insn_get_modrm(&insn); + ext = (insn.modrm.bytes[0] >> 3) & 0x7; + switch (ext) { + case 2: /* near ind call */ + case 3: /* far ind call */ + ret = X86_BR_IND_CALL; + break; + case 4: + case 5: + ret = X86_BR_JMP; + break; + } + break; + default: + ret = X86_BR_NONE; + } + /* + * interrupts, traps, faults (and thus ring transitions) may + * occur on any instruction. Thus, to classify them correctly, + * we need to first look at the from and to priv levels. If they + * are different and to is in the kernel, then it indicates + * a ring transition. If the from instruction is not a ring + * transition instr (syscall, sysenter, int), then it means + * it was an irq, trap or fault. + * + * we have no way of detecting kernel to kernel faults. 
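+ * + * E.g. a timer interrupt taken while running in user mode leaves an + * ordinary (non ring-transition) instruction at "from" with from_plm + * == X86_BR_USER and to_plm == X86_BR_KERNEL, so it is reclassified + * as X86_BR_IRQ below.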
+ */ + if (from_plm == X86_BR_USER && to_plm == X86_BR_KERNEL + && ret != X86_BR_SYSCALL && ret != X86_BR_INT) + ret = X86_BR_IRQ; + + /* + * branch priv level determined by target as + * is done by HW when LBR_SELECT is implemented + */ + if (ret != X86_BR_NONE) + ret |= to_plm; + + return ret; +} + +/* + * implement actual branch filter based on user demand. + * Hardware may not exactly satisfy that request, thus + * we need to inspect opcodes. Mismatched branches are + * discarded. Therefore, the number of branches returned + * in PERF_SAMPLE_BRANCH_STACK sample may vary. + */ +static void +intel_pmu_lbr_filter(struct cpu_hw_events *cpuc) +{ + u64 from, to; + int br_sel = cpuc->br_sel; + int i, j, type; + bool compress = false; + + /* if sampling all branches, then nothing to filter */ + if ((br_sel & X86_BR_ALL) == X86_BR_ALL) + return; + + for (i = 0; i < cpuc->lbr_stack.nr; i++) { + + from = cpuc->lbr_entries[i].from; + to = cpuc->lbr_entries[i].to; + + type = branch_type(from, to, cpuc->lbr_entries[i].abort); + if (type != X86_BR_NONE && (br_sel & X86_BR_ANYTX)) { + if (cpuc->lbr_entries[i].in_tx) + type |= X86_BR_IN_TX; + else + type |= X86_BR_NO_TX; + } + + /* if type does not correspond, then discard */ + if (type == X86_BR_NONE || (br_sel & type) != type) { + cpuc->lbr_entries[i].from = 0; + compress = true; + } + } + + if (!compress) + return; + + /* remove all entries with from=0 */ + for (i = 0; i < cpuc->lbr_stack.nr; ) { + if (!cpuc->lbr_entries[i].from) { + j = i; + while (++j < cpuc->lbr_stack.nr) + cpuc->lbr_entries[j-1] = cpuc->lbr_entries[j]; + cpuc->lbr_stack.nr--; + if (!cpuc->lbr_entries[i].from) + continue; + } + i++; + } +} + +/* + * Map interface branch filters onto LBR filters + */ +static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = { + [PERF_SAMPLE_BRANCH_ANY] = LBR_ANY, + [PERF_SAMPLE_BRANCH_USER] = LBR_USER, + [PERF_SAMPLE_BRANCH_KERNEL] = LBR_KERNEL, + [PERF_SAMPLE_BRANCH_HV] = LBR_IGN, + [PERF_SAMPLE_BRANCH_ANY_RETURN] = LBR_RETURN | LBR_REL_JMP + | LBR_IND_JMP | LBR_FAR, + /* + * NHM/WSM erratum: must include REL_JMP+IND_JMP to get CALL branches + */ + [PERF_SAMPLE_BRANCH_ANY_CALL] = + LBR_REL_CALL | LBR_IND_CALL | LBR_REL_JMP | LBR_IND_JMP | LBR_FAR, + /* + * NHM/WSM erratum: must include IND_JMP to capture IND_CALL + */ + [PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL | LBR_IND_JMP, + [PERF_SAMPLE_BRANCH_COND] = LBR_JCC, +}; + +static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = { + [PERF_SAMPLE_BRANCH_ANY] = LBR_ANY, + [PERF_SAMPLE_BRANCH_USER] = LBR_USER, + [PERF_SAMPLE_BRANCH_KERNEL] = LBR_KERNEL, + [PERF_SAMPLE_BRANCH_HV] = LBR_IGN, + [PERF_SAMPLE_BRANCH_ANY_RETURN] = LBR_RETURN | LBR_FAR, + [PERF_SAMPLE_BRANCH_ANY_CALL] = LBR_REL_CALL | LBR_IND_CALL + | LBR_FAR, + [PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL, + [PERF_SAMPLE_BRANCH_COND] = LBR_JCC, +}; + +/* core */ +void intel_pmu_lbr_init_core(void) +{ + x86_pmu.lbr_nr = 4; + x86_pmu.lbr_tos = MSR_LBR_TOS; + x86_pmu.lbr_from = MSR_LBR_CORE_FROM; + x86_pmu.lbr_to = MSR_LBR_CORE_TO; + + /* + * SW branch filter usage: + * - compensate for lack of HW filter + */ + pr_cont("4-deep LBR, "); +} + +/* nehalem/westmere */ +void intel_pmu_lbr_init_nhm(void) +{ + x86_pmu.lbr_nr = 16; + x86_pmu.lbr_tos = MSR_LBR_TOS; + x86_pmu.lbr_from = MSR_LBR_NHM_FROM; + x86_pmu.lbr_to = MSR_LBR_NHM_TO; + + x86_pmu.lbr_sel_mask = LBR_SEL_MASK; + x86_pmu.lbr_sel_map = nhm_lbr_sel_map; + + /* + * SW branch filter usage: + * - workaround LBR_SEL errata (see above) + * - support syscall, sysret capture. 
+ * That requires LBR_FAR but that means far + * jmps need to be filtered out + */ + pr_cont("16-deep LBR, "); +} + +/* sandy bridge */ +void intel_pmu_lbr_init_snb(void) +{ + x86_pmu.lbr_nr = 16; + x86_pmu.lbr_tos = MSR_LBR_TOS; + x86_pmu.lbr_from = MSR_LBR_NHM_FROM; + x86_pmu.lbr_to = MSR_LBR_NHM_TO; + + x86_pmu.lbr_sel_mask = LBR_SEL_MASK; + x86_pmu.lbr_sel_map = snb_lbr_sel_map; + + /* + * SW branch filter usage: + * - support syscall, sysret capture. + * That requires LBR_FAR but that means far + * jmps need to be filtered out + */ + pr_cont("16-deep LBR, "); +} + +/* atom */ +void intel_pmu_lbr_init_atom(void) +{ + /* + * only models starting at stepping 10 seem + * to have an operational LBR which can freeze + * on a PMU interrupt + */ + if (boot_cpu_data.x86_model == 28 + && boot_cpu_data.x86_mask < 10) { + pr_cont("LBR disabled due to erratum"); + return; + } + + x86_pmu.lbr_nr = 8; + x86_pmu.lbr_tos = MSR_LBR_TOS; + x86_pmu.lbr_from = MSR_LBR_CORE_FROM; + x86_pmu.lbr_to = MSR_LBR_CORE_TO; + + /* + * SW branch filter usage: + * - compensate for lack of HW filter + */ + pr_cont("8-deep LBR, "); +} diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c new file mode 100644 index 00000000000..619f7699487 --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c @@ -0,0 +1,714 @@ +/* + * perf_event_intel_rapl.c: support Intel RAPL energy consumption counters + * Copyright (C) 2013 Google, Inc., Stephane Eranian + * + * Intel RAPL interface is specified in the IA-32 Manual Vol3b + * section 14.7.1 (September 2013) + * + * RAPL provides more controls than just reporting energy consumption; + * however, here we only expose the 3 energy consumption free running + * counters (pp0, pkg, dram). + * + * Each of those counters increments in an energy unit defined by the + * RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules + * but it can vary. + * + * Counter to rapl events mappings: + * + * pp0 counter: consumption of all physical cores (power plane 0) + * event: rapl_energy_cores + * perf code: 0x1 + * + * pkg counter: consumption of the whole processor package + * event: rapl_energy_pkg + * perf code: 0x2 + * + * dram counter: consumption of the dram domain (servers only) + * event: rapl_energy_dram + * perf code: 0x3 + * + * gpu counter: consumption of the builtin-gpu domain (client only) + * event: rapl_energy_gpu + * perf code: 0x4 + * + * We manage those counters as free running (read-only). They may be + * used simultaneously by other tools, such as turbostat. + * + * The events only support system-wide mode counting. There is no + * sampling support because it does not make sense and is not + * supported by the RAPL hardware. + * + * Because we want to avoid floating-point operations in the kernel, + * the events are all reported in fixed point arithmetic (32.32). + * Tools must scale the counts back to Joules and divide by the + * duration of the measurement to obtain Watts. 
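+ * E.g. (illustrative) a raw 32.32 count of 0x180000000 corresponds to + * 1.5 Joules; over a 3 s measurement that averages 0.5 Watts.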
Tools may use a function such as + * ldexp(raw_count, -32); + */ +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/perf_event.h> +#include <asm/cpu_device_id.h> +#include "perf_event.h" + +/* + * RAPL energy status counters + */ +#define RAPL_IDX_PP0_NRG_STAT 0 /* all cores */ +#define INTEL_RAPL_PP0 0x1 /* pseudo-encoding */ +#define RAPL_IDX_PKG_NRG_STAT 1 /* entire package */ +#define INTEL_RAPL_PKG 0x2 /* pseudo-encoding */ +#define RAPL_IDX_RAM_NRG_STAT 2 /* DRAM */ +#define INTEL_RAPL_RAM 0x3 /* pseudo-encoding */ +#define RAPL_IDX_PP1_NRG_STAT 3 /* gpu */ +#define INTEL_RAPL_PP1 0x4 /* pseudo-encoding */ + +/* Clients have PP0, PKG, PP1 */ +#define RAPL_IDX_CLN (1<<RAPL_IDX_PP0_NRG_STAT|\ + 1<<RAPL_IDX_PKG_NRG_STAT|\ + 1<<RAPL_IDX_PP1_NRG_STAT) + +/* Servers have PP0, PKG, RAM */ +#define RAPL_IDX_SRV (1<<RAPL_IDX_PP0_NRG_STAT|\ + 1<<RAPL_IDX_PKG_NRG_STAT|\ + 1<<RAPL_IDX_RAM_NRG_STAT) + +/* Haswell has PP0, PKG, RAM, PP1 */ +#define RAPL_IDX_HSW (1<<RAPL_IDX_PP0_NRG_STAT|\ + 1<<RAPL_IDX_PKG_NRG_STAT|\ + 1<<RAPL_IDX_RAM_NRG_STAT|\ + 1<<RAPL_IDX_PP1_NRG_STAT) + +/* + * event code: LSB 8 bits, passed in attr->config + * any other bit is reserved + */ +#define RAPL_EVENT_MASK 0xFFULL + +#define DEFINE_RAPL_FORMAT_ATTR(_var, _name, _format) \ +static ssize_t __rapl_##_var##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + char *page) \ +{ \ + BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ + return sprintf(page, _format "\n"); \ +} \ +static struct kobj_attribute format_attr_##_var = \ + __ATTR(_name, 0444, __rapl_##_var##_show, NULL) + +#define RAPL_EVENT_DESC(_name, _config) \ +{ \ + .attr = __ATTR(_name, 0444, rapl_event_show, NULL), \ + .config = _config, \ +} + +#define RAPL_CNTR_WIDTH 32 /* 32-bit rapl counters */ + +struct rapl_pmu { + spinlock_t lock; + int hw_unit; /* 1/2^hw_unit Joule */ + int n_active; /* number of active events */ + struct list_head active_list; + struct pmu *pmu; /* pointer to rapl_pmu_class */ + ktime_t timer_interval; /* in ktime_t unit */ + struct hrtimer hrtimer; +}; + +static struct pmu rapl_pmu_class; +static cpumask_t rapl_cpu_mask; +static int rapl_cntr_mask; + +static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu); +static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu_to_free); + +static inline u64 rapl_read_counter(struct perf_event *event) +{ + u64 raw; + rdmsrl(event->hw.event_base, raw); + return raw; +} + +static inline u64 rapl_scale(u64 v) +{ + /* + * scale delta to smallest unit (1/2^32) + * users must then scale back: count * 1/2^32 to get Joules + * or use ldexp(count, -32). + * Watts = Joules/Time delta + */ + return v << (32 - __get_cpu_var(rapl_pmu)->hw_unit); +} + +static u64 rapl_event_update(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 prev_raw_count, new_raw_count; + s64 delta, sdelta; + int shift = RAPL_CNTR_WIDTH; + +again: + prev_raw_count = local64_read(&hwc->prev_count); + rdmsrl(event->hw.event_base, new_raw_count); + + if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, + new_raw_count) != prev_raw_count) { + cpu_relax(); + goto again; + } + + /* + * Now we have the new raw value and have updated the prev + * timestamp already. We can now calculate the elapsed delta + * (event-)time and add that to the generic event. + * + * Careful, not all hw sign-extends above the physical width + * of the count. 
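+ * + * E.g. (illustrative) with the 32-bit RAPL counter, prev_raw_count = + * 0xfffffff0 and new_raw_count = 0x10 yield ((0x10 << 32) - + * (0xfffffff0 << 32)) >> 32 = 0x20: the shift pair discards any junk + * above bit 31 and handles the counter wrap correctly.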
+ */ + delta = (new_raw_count << shift) - (prev_raw_count << shift); + delta >>= shift; + + sdelta = rapl_scale(delta); + + local64_add(sdelta, &event->count); + + return new_raw_count; +} + +static void rapl_start_hrtimer(struct rapl_pmu *pmu) +{ + __hrtimer_start_range_ns(&pmu->hrtimer, + pmu->timer_interval, 0, + HRTIMER_MODE_REL_PINNED, 0); +} + +static void rapl_stop_hrtimer(struct rapl_pmu *pmu) +{ + hrtimer_cancel(&pmu->hrtimer); +} + +static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer) +{ + struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu); + struct perf_event *event; + unsigned long flags; + + if (!pmu->n_active) + return HRTIMER_NORESTART; + + spin_lock_irqsave(&pmu->lock, flags); + + list_for_each_entry(event, &pmu->active_list, active_entry) { + rapl_event_update(event); + } + + spin_unlock_irqrestore(&pmu->lock, flags); + + hrtimer_forward_now(hrtimer, pmu->timer_interval); + + return HRTIMER_RESTART; +} + +static void rapl_hrtimer_init(struct rapl_pmu *pmu) +{ + struct hrtimer *hr = &pmu->hrtimer; + + hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hr->function = rapl_hrtimer_handle; +} + +static void __rapl_pmu_event_start(struct rapl_pmu *pmu, + struct perf_event *event) +{ + if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) + return; + + event->hw.state = 0; + + list_add_tail(&event->active_entry, &pmu->active_list); + + local64_set(&event->hw.prev_count, rapl_read_counter(event)); + + pmu->n_active++; + if (pmu->n_active == 1) + rapl_start_hrtimer(pmu); +} + +static void rapl_pmu_event_start(struct perf_event *event, int mode) +{ + struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu); + unsigned long flags; + + spin_lock_irqsave(&pmu->lock, flags); + __rapl_pmu_event_start(pmu, event); + spin_unlock_irqrestore(&pmu->lock, flags); +} + +static void rapl_pmu_event_stop(struct perf_event *event, int mode) +{ + struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu); + struct hw_perf_event *hwc = &event->hw; + unsigned long flags; + + spin_lock_irqsave(&pmu->lock, flags); + + /* mark event as deactivated and stopped */ + if (!(hwc->state & PERF_HES_STOPPED)) { + WARN_ON_ONCE(pmu->n_active <= 0); + pmu->n_active--; + if (pmu->n_active == 0) + rapl_stop_hrtimer(pmu); + + list_del(&event->active_entry); + + WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); + hwc->state |= PERF_HES_STOPPED; + } + + /* check if update of sw counter is necessary */ + if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { + /* + * Drain the remaining delta count out of an event + * that we are disabling: + */ + rapl_event_update(event); + hwc->state |= PERF_HES_UPTODATE; + } + + spin_unlock_irqrestore(&pmu->lock, flags); +} + +static int rapl_pmu_event_add(struct perf_event *event, int mode) +{ + struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu); + struct hw_perf_event *hwc = &event->hw; + unsigned long flags; + + spin_lock_irqsave(&pmu->lock, flags); + + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + + if (mode & PERF_EF_START) + __rapl_pmu_event_start(pmu, event); + + spin_unlock_irqrestore(&pmu->lock, flags); + + return 0; +} + +static void rapl_pmu_event_del(struct perf_event *event, int flags) +{ + rapl_pmu_event_stop(event, PERF_EF_UPDATE); +} + +static int rapl_pmu_event_init(struct perf_event *event) +{ + u64 cfg = event->attr.config & RAPL_EVENT_MASK; + int bit, msr, ret = 0; + + /* only look at RAPL events */ + if (event->attr.type != rapl_pmu_class.type) + return -ENOENT; + + /* check only supported bits are set */ + if (event->attr.config & ~RAPL_EVENT_MASK) + 
return -EINVAL; + + /* + * check event is known (determines counter) + */ + switch (cfg) { + case INTEL_RAPL_PP0: + bit = RAPL_IDX_PP0_NRG_STAT; + msr = MSR_PP0_ENERGY_STATUS; + break; + case INTEL_RAPL_PKG: + bit = RAPL_IDX_PKG_NRG_STAT; + msr = MSR_PKG_ENERGY_STATUS; + break; + case INTEL_RAPL_RAM: + bit = RAPL_IDX_RAM_NRG_STAT; + msr = MSR_DRAM_ENERGY_STATUS; + break; + case INTEL_RAPL_PP1: + bit = RAPL_IDX_PP1_NRG_STAT; + msr = MSR_PP1_ENERGY_STATUS; + break; + default: + return -EINVAL; + } + /* check event supported */ + if (!(rapl_cntr_mask & (1 << bit))) + return -EINVAL; + + /* unsupported modes and filters */ + if (event->attr.exclude_user || + event->attr.exclude_kernel || + event->attr.exclude_hv || + event->attr.exclude_idle || + event->attr.exclude_host || + event->attr.exclude_guest || + event->attr.sample_period) /* no sampling */ + return -EINVAL; + + /* must be done before validate_group */ + event->hw.event_base = msr; + event->hw.config = cfg; + event->hw.idx = bit; + + return ret; +} + +static void rapl_pmu_event_read(struct perf_event *event) +{ + rapl_event_update(event); +} + +static ssize_t rapl_get_attr_cpumask(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int n = cpulist_scnprintf(buf, PAGE_SIZE - 2, &rapl_cpu_mask); + + buf[n++] = '\n'; + buf[n] = '\0'; + return n; +} + +static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL); + +static struct attribute *rapl_pmu_attrs[] = { + &dev_attr_cpumask.attr, + NULL, +}; + +static struct attribute_group rapl_pmu_attr_group = { + .attrs = rapl_pmu_attrs, +}; + +EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01"); +EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02"); +EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03"); +EVENT_ATTR_STR(energy-gpu , rapl_gpu, "event=0x04"); + +EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules"); +EVENT_ATTR_STR(energy-pkg.unit , rapl_pkg_unit, "Joules"); +EVENT_ATTR_STR(energy-ram.unit , rapl_ram_unit, "Joules"); +EVENT_ATTR_STR(energy-gpu.unit , rapl_gpu_unit, "Joules"); + +/* + * we compute in 0.23 nJ increments regardless of MSR + */ +EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10"); +EVENT_ATTR_STR(energy-pkg.scale, rapl_pkg_scale, "2.3283064365386962890625e-10"); +EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10"); +EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10"); + +static struct attribute *rapl_events_srv_attr[] = { + EVENT_PTR(rapl_cores), + EVENT_PTR(rapl_pkg), + EVENT_PTR(rapl_ram), + + EVENT_PTR(rapl_cores_unit), + EVENT_PTR(rapl_pkg_unit), + EVENT_PTR(rapl_ram_unit), + + EVENT_PTR(rapl_cores_scale), + EVENT_PTR(rapl_pkg_scale), + EVENT_PTR(rapl_ram_scale), + NULL, +}; + +static struct attribute *rapl_events_cln_attr[] = { + EVENT_PTR(rapl_cores), + EVENT_PTR(rapl_pkg), + EVENT_PTR(rapl_gpu), + + EVENT_PTR(rapl_cores_unit), + EVENT_PTR(rapl_pkg_unit), + EVENT_PTR(rapl_gpu_unit), + + EVENT_PTR(rapl_cores_scale), + EVENT_PTR(rapl_pkg_scale), + EVENT_PTR(rapl_gpu_scale), + NULL, +}; + +static struct attribute *rapl_events_hsw_attr[] = { + EVENT_PTR(rapl_cores), + EVENT_PTR(rapl_pkg), + EVENT_PTR(rapl_gpu), + EVENT_PTR(rapl_ram), + + EVENT_PTR(rapl_cores_unit), + EVENT_PTR(rapl_pkg_unit), + EVENT_PTR(rapl_gpu_unit), + EVENT_PTR(rapl_ram_unit), + + EVENT_PTR(rapl_cores_scale), + EVENT_PTR(rapl_pkg_scale), + EVENT_PTR(rapl_gpu_scale), + EVENT_PTR(rapl_ram_scale), + NULL, +}; + +static struct attribute_group rapl_pmu_events_group = { + .name = 
"events", + .attrs = NULL, /* patched at runtime */ +}; + +DEFINE_RAPL_FORMAT_ATTR(event, event, "config:0-7"); +static struct attribute *rapl_formats_attr[] = { + &format_attr_event.attr, + NULL, +}; + +static struct attribute_group rapl_pmu_format_group = { + .name = "format", + .attrs = rapl_formats_attr, +}; + +const struct attribute_group *rapl_attr_groups[] = { + &rapl_pmu_attr_group, + &rapl_pmu_format_group, + &rapl_pmu_events_group, + NULL, +}; + +static struct pmu rapl_pmu_class = { + .attr_groups = rapl_attr_groups, + .task_ctx_nr = perf_invalid_context, /* system-wide only */ + .event_init = rapl_pmu_event_init, + .add = rapl_pmu_event_add, /* must have */ + .del = rapl_pmu_event_del, /* must have */ + .start = rapl_pmu_event_start, + .stop = rapl_pmu_event_stop, + .read = rapl_pmu_event_read, +}; + +static void rapl_cpu_exit(int cpu) +{ + struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); + int i, phys_id = topology_physical_package_id(cpu); + int target = -1; + + /* find a new cpu on same package */ + for_each_online_cpu(i) { + if (i == cpu) + continue; + if (phys_id == topology_physical_package_id(i)) { + target = i; + break; + } + } + /* + * clear cpu from cpumask + * if was set in cpumask and still some cpu on package, + * then move to new cpu + */ + if (cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask) && target >= 0) + cpumask_set_cpu(target, &rapl_cpu_mask); + + WARN_ON(cpumask_empty(&rapl_cpu_mask)); + /* + * migrate events and context to new cpu + */ + if (target >= 0) + perf_pmu_migrate_context(pmu->pmu, cpu, target); + + /* cancel overflow polling timer for CPU */ + rapl_stop_hrtimer(pmu); +} + +static void rapl_cpu_init(int cpu) +{ + int i, phys_id = topology_physical_package_id(cpu); + + /* check if phys_is is already covered */ + for_each_cpu(i, &rapl_cpu_mask) { + if (phys_id == topology_physical_package_id(i)) + return; + } + /* was not found, so add it */ + cpumask_set_cpu(cpu, &rapl_cpu_mask); +} + +static int rapl_cpu_prepare(int cpu) +{ + struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); + int phys_id = topology_physical_package_id(cpu); + u64 ms; + u64 msr_rapl_power_unit_bits; + + if (pmu) + return 0; + + if (phys_id < 0) + return -1; + + /* protect rdmsrl() to handle virtualization */ + if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits)) + return -1; + + pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu)); + if (!pmu) + return -1; + + spin_lock_init(&pmu->lock); + + INIT_LIST_HEAD(&pmu->active_list); + + /* + * grab power unit as: 1/2^unit Joules + * + * we cache in local PMU instance + */ + pmu->hw_unit = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; + pmu->pmu = &rapl_pmu_class; + + /* + * use reference of 200W for scaling the timeout + * to avoid missing counter overflows. 
+ * 200W = 200 Joules/sec + * divide the interval by 2 to avoid lockstep (hence the 2 * 100) + * if the hw unit is 32, then we use 2 ms (~1/200/2 s) + */ + if (pmu->hw_unit < 32) + ms = (1000 / (2 * 100)) * (1ULL << (32 - pmu->hw_unit - 1)); + else + ms = 2; + + pmu->timer_interval = ms_to_ktime(ms); + + rapl_hrtimer_init(pmu); + + /* set RAPL pmu for this cpu for now */ + per_cpu(rapl_pmu, cpu) = pmu; + per_cpu(rapl_pmu_to_free, cpu) = NULL; + + return 0; +} + +static void rapl_cpu_kfree(int cpu) +{ + struct rapl_pmu *pmu = per_cpu(rapl_pmu_to_free, cpu); + + kfree(pmu); + + per_cpu(rapl_pmu_to_free, cpu) = NULL; +} + +static int rapl_cpu_dying(int cpu) +{ + struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); + + if (!pmu) + return 0; + + per_cpu(rapl_pmu, cpu) = NULL; + + per_cpu(rapl_pmu_to_free, cpu) = pmu; + + return 0; +} + +static int rapl_cpu_notifier(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (long)hcpu; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_UP_PREPARE: + rapl_cpu_prepare(cpu); + break; + case CPU_STARTING: + rapl_cpu_init(cpu); + break; + case CPU_UP_CANCELED: + case CPU_DYING: + rapl_cpu_dying(cpu); + break; + case CPU_ONLINE: + case CPU_DEAD: + rapl_cpu_kfree(cpu); + break; + case CPU_DOWN_PREPARE: + rapl_cpu_exit(cpu); + break; + default: + break; + } + + return NOTIFY_OK; +} + +static const struct x86_cpu_id rapl_cpu_match[] = { + [0] = { .vendor = X86_VENDOR_INTEL, .family = 6 }, + [1] = {}, +}; + +static int __init rapl_pmu_init(void) +{ + struct rapl_pmu *pmu; + int cpu, ret; + + /* + * check for Intel processor family 6 + */ + if (!x86_match_cpu(rapl_cpu_match)) + return 0; + + /* check supported CPU */ + switch (boot_cpu_data.x86_model) { + case 42: /* Sandy Bridge */ + case 58: /* Ivy Bridge */ + rapl_cntr_mask = RAPL_IDX_CLN; + rapl_pmu_events_group.attrs = rapl_events_cln_attr; + break; + case 60: /* Haswell */ + case 69: /* Haswell-Celeron */ + rapl_cntr_mask = RAPL_IDX_HSW; + rapl_pmu_events_group.attrs = rapl_events_hsw_attr; + break; + case 45: /* Sandy Bridge-EP */ + case 62: /* IvyTown */ + rapl_cntr_mask = RAPL_IDX_SRV; + rapl_pmu_events_group.attrs = rapl_events_srv_attr; + break; + + default: + /* unsupported */ + return 0; + } + + cpu_notifier_register_begin(); + + for_each_online_cpu(cpu) { + ret = rapl_cpu_prepare(cpu); + if (ret) + goto out; + rapl_cpu_init(cpu); + } + + __perf_cpu_notifier(rapl_cpu_notifier); + + ret = perf_pmu_register(&rapl_pmu_class, "power", -1); + if (WARN_ON(ret)) { + pr_info("RAPL PMU detected, registration failed (%d), RAPL PMU disabled\n", ret); + cpu_notifier_register_done(); + return -1; + } + + pmu = __get_cpu_var(rapl_pmu); + + pr_info("RAPL PMU detected, hw unit 2^-%d Joules," + " API unit is 2^-32 Joules," + " %d fixed counters" + " %llu ms ovfl timer\n", + pmu->hw_unit, + hweight32(rapl_cntr_mask), + ktime_to_ms(pmu->timer_interval)); + +out: + cpu_notifier_register_done(); + + return 0; +} +device_initcall(rapl_pmu_init); diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c new file mode 100644 index 00000000000..ae6552a0701 --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -0,0 +1,4298 @@ +#include "perf_event_intel_uncore.h" + +static struct intel_uncore_type *empty_uncore[] = { NULL, }; +static struct intel_uncore_type **msr_uncores = empty_uncore; +static struct intel_uncore_type **pci_uncores = empty_uncore; +/* pci bus to socket mapping */ +static int pcibus_to_physid[256] = { [0 ... 
255] = -1, }; + +static struct pci_dev *extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX]; + +static DEFINE_RAW_SPINLOCK(uncore_box_lock); + +/* mask of cpus that collect uncore events */ +static cpumask_t uncore_cpu_mask; + +/* constraint for the fixed counter */ +static struct event_constraint constraint_fixed = + EVENT_CONSTRAINT(~0ULL, 1 << UNCORE_PMC_IDX_FIXED, ~0ULL); +static struct event_constraint constraint_empty = + EVENT_CONSTRAINT(0, 0, 0); + +#define __BITS_VALUE(x, i, n) ((typeof(x))(((x) >> ((i) * (n))) & \ + ((1ULL << (n)) - 1))) + +DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7"); +DEFINE_UNCORE_FORMAT_ATTR(event_ext, event, "config:0-7,21"); +DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); +DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18"); +DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19"); +DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23"); +DEFINE_UNCORE_FORMAT_ATTR(cmask5, cmask, "config:24-28"); +DEFINE_UNCORE_FORMAT_ATTR(cmask8, cmask, "config:24-31"); +DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31"); +DEFINE_UNCORE_FORMAT_ATTR(thresh5, thresh, "config:24-28"); +DEFINE_UNCORE_FORMAT_ATTR(occ_sel, occ_sel, "config:14-15"); +DEFINE_UNCORE_FORMAT_ATTR(occ_invert, occ_invert, "config:30"); +DEFINE_UNCORE_FORMAT_ATTR(occ_edge, occ_edge, "config:14-51"); +DEFINE_UNCORE_FORMAT_ATTR(filter_tid, filter_tid, "config1:0-4"); +DEFINE_UNCORE_FORMAT_ATTR(filter_link, filter_link, "config1:5-8"); +DEFINE_UNCORE_FORMAT_ATTR(filter_nid, filter_nid, "config1:10-17"); +DEFINE_UNCORE_FORMAT_ATTR(filter_nid2, filter_nid, "config1:32-47"); +DEFINE_UNCORE_FORMAT_ATTR(filter_state, filter_state, "config1:18-22"); +DEFINE_UNCORE_FORMAT_ATTR(filter_state2, filter_state, "config1:17-22"); +DEFINE_UNCORE_FORMAT_ATTR(filter_opc, filter_opc, "config1:23-31"); +DEFINE_UNCORE_FORMAT_ATTR(filter_opc2, filter_opc, "config1:52-60"); +DEFINE_UNCORE_FORMAT_ATTR(filter_band0, filter_band0, "config1:0-7"); +DEFINE_UNCORE_FORMAT_ATTR(filter_band1, filter_band1, "config1:8-15"); +DEFINE_UNCORE_FORMAT_ATTR(filter_band2, filter_band2, "config1:16-23"); +DEFINE_UNCORE_FORMAT_ATTR(filter_band3, filter_band3, "config1:24-31"); +DEFINE_UNCORE_FORMAT_ATTR(match_rds, match_rds, "config1:48-51"); +DEFINE_UNCORE_FORMAT_ATTR(match_rnid30, match_rnid30, "config1:32-35"); +DEFINE_UNCORE_FORMAT_ATTR(match_rnid4, match_rnid4, "config1:31"); +DEFINE_UNCORE_FORMAT_ATTR(match_dnid, match_dnid, "config1:13-17"); +DEFINE_UNCORE_FORMAT_ATTR(match_mc, match_mc, "config1:9-12"); +DEFINE_UNCORE_FORMAT_ATTR(match_opc, match_opc, "config1:5-8"); +DEFINE_UNCORE_FORMAT_ATTR(match_vnw, match_vnw, "config1:3-4"); +DEFINE_UNCORE_FORMAT_ATTR(match0, match0, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(match1, match1, "config1:32-63"); +DEFINE_UNCORE_FORMAT_ATTR(mask_rds, mask_rds, "config2:48-51"); +DEFINE_UNCORE_FORMAT_ATTR(mask_rnid30, mask_rnid30, "config2:32-35"); +DEFINE_UNCORE_FORMAT_ATTR(mask_rnid4, mask_rnid4, "config2:31"); +DEFINE_UNCORE_FORMAT_ATTR(mask_dnid, mask_dnid, "config2:13-17"); +DEFINE_UNCORE_FORMAT_ATTR(mask_mc, mask_mc, "config2:9-12"); +DEFINE_UNCORE_FORMAT_ATTR(mask_opc, mask_opc, "config2:5-8"); +DEFINE_UNCORE_FORMAT_ATTR(mask_vnw, mask_vnw, "config2:3-4"); +DEFINE_UNCORE_FORMAT_ATTR(mask0, mask0, "config2:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(mask1, mask1, "config2:32-63"); + +static void uncore_pmu_start_hrtimer(struct intel_uncore_box *box); +static void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box); +static void uncore_perf_event_update(struct intel_uncore_box 
*box, struct perf_event *event); +static void uncore_pmu_event_read(struct perf_event *event); + +static struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event) +{ + return container_of(event->pmu, struct intel_uncore_pmu, pmu); +} + +static struct intel_uncore_box * +uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu) +{ + struct intel_uncore_box *box; + + box = *per_cpu_ptr(pmu->box, cpu); + if (box) + return box; + + raw_spin_lock(&uncore_box_lock); + list_for_each_entry(box, &pmu->box_list, list) { + if (box->phys_id == topology_physical_package_id(cpu)) { + atomic_inc(&box->refcnt); + *per_cpu_ptr(pmu->box, cpu) = box; + break; + } + } + raw_spin_unlock(&uncore_box_lock); + + return *per_cpu_ptr(pmu->box, cpu); +} + +static struct intel_uncore_box *uncore_event_to_box(struct perf_event *event) +{ + /* + * perf core schedules events on the basis of cpu; uncore events are + * collected by one of the cpus inside a physical package. + */ + return uncore_pmu_to_box(uncore_event_to_pmu(event), smp_processor_id()); +} + +static u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event) +{ + u64 count; + + rdmsrl(event->hw.event_base, count); + + return count; +} + +/* + * generic get constraint function for shared match/mask registers. + */ +static struct event_constraint * +uncore_get_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + struct intel_uncore_extra_reg *er; + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct hw_perf_event_extra *reg2 = &event->hw.branch_reg; + unsigned long flags; + bool ok = false; + + /* + * reg->alloc can be set due to existing state, so for fake box we + * need to ignore this, otherwise we might fail to allocate proper + * fake state for this extra reg constraint. + */ + if (reg1->idx == EXTRA_REG_NONE || + (!uncore_box_is_fake(box) && reg1->alloc)) + return NULL; + + er = &box->shared_regs[reg1->idx]; + raw_spin_lock_irqsave(&er->lock, flags); + if (!atomic_read(&er->ref) || + (er->config1 == reg1->config && er->config2 == reg2->config)) { + atomic_inc(&er->ref); + er->config1 = reg1->config; + er->config2 = reg2->config; + ok = true; + } + raw_spin_unlock_irqrestore(&er->lock, flags); + + if (ok) { + if (!uncore_box_is_fake(box)) + reg1->alloc = 1; + return NULL; + } + + return &constraint_empty; +} + +static void uncore_put_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + struct intel_uncore_extra_reg *er; + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + + /* + * Only put constraint if extra reg was actually allocated. Also + * takes care of events which do not use an extra shared reg. + * + * Also, if this is a fake box we shouldn't touch any event state + * (reg->alloc) and we don't care about leaving inconsistent box + * state either since it will be thrown out. 
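+ * + * E.g. this is the release path matching a successful + * uncore_get_constraint() on a real box, where er->ref was + * incremented and reg1->alloc was set.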
+ */ + if (uncore_box_is_fake(box) || !reg1->alloc) + return; + + er = &box->shared_regs[reg1->idx]; + atomic_dec(&er->ref); + reg1->alloc = 0; +} + +static u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx) +{ + struct intel_uncore_extra_reg *er; + unsigned long flags; + u64 config; + + er = &box->shared_regs[idx]; + + raw_spin_lock_irqsave(&er->lock, flags); + config = er->config; + raw_spin_unlock_irqrestore(&er->lock, flags); + + return config; +} + +/* Sandy Bridge-EP uncore support */ +static struct intel_uncore_type snbep_uncore_cbox; +static struct intel_uncore_type snbep_uncore_pcu; + +static void snbep_uncore_pci_disable_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + int box_ctl = uncore_pci_box_ctl(box); + u32 config = 0; + + if (!pci_read_config_dword(pdev, box_ctl, &config)) { + config |= SNBEP_PMON_BOX_CTL_FRZ; + pci_write_config_dword(pdev, box_ctl, config); + } +} + +static void snbep_uncore_pci_enable_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + int box_ctl = uncore_pci_box_ctl(box); + u32 config = 0; + + if (!pci_read_config_dword(pdev, box_ctl, &config)) { + config &= ~SNBEP_PMON_BOX_CTL_FRZ; + pci_write_config_dword(pdev, box_ctl, config); + } +} + +static void snbep_uncore_pci_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + + pci_write_config_dword(pdev, hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); +} + +static void snbep_uncore_pci_disable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + + pci_write_config_dword(pdev, hwc->config_base, hwc->config); +} + +static u64 snbep_uncore_pci_read_counter(struct intel_uncore_box *box, struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + u64 count = 0; + + pci_read_config_dword(pdev, hwc->event_base, (u32 *)&count); + pci_read_config_dword(pdev, hwc->event_base + 4, (u32 *)&count + 1); + + return count; +} + +static void snbep_uncore_pci_init_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + + pci_write_config_dword(pdev, SNBEP_PCI_PMON_BOX_CTL, SNBEP_PMON_BOX_CTL_INT); +} + +static void snbep_uncore_msr_disable_box(struct intel_uncore_box *box) +{ + u64 config; + unsigned msr; + + msr = uncore_msr_box_ctl(box); + if (msr) { + rdmsrl(msr, config); + config |= SNBEP_PMON_BOX_CTL_FRZ; + wrmsrl(msr, config); + } +} + +static void snbep_uncore_msr_enable_box(struct intel_uncore_box *box) +{ + u64 config; + unsigned msr; + + msr = uncore_msr_box_ctl(box); + if (msr) { + rdmsrl(msr, config); + config &= ~SNBEP_PMON_BOX_CTL_FRZ; + wrmsrl(msr, config); + } +} + +static void snbep_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + + if (reg1->idx != EXTRA_REG_NONE) + wrmsrl(reg1->reg, uncore_shared_reg_config(box, 0)); + + wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); +} + +static void snbep_uncore_msr_disable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + wrmsrl(hwc->config_base, hwc->config); +} + +static void snbep_uncore_msr_init_box(struct intel_uncore_box *box) +{ + unsigned msr = uncore_msr_box_ctl(box); + + if (msr) + wrmsrl(msr, 
SNBEP_PMON_BOX_CTL_INT); +} + +static struct attribute *snbep_uncore_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + NULL, +}; + +static struct attribute *snbep_uncore_ubox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh5.attr, + NULL, +}; + +static struct attribute *snbep_uncore_cbox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_tid_en.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + &format_attr_filter_tid.attr, + &format_attr_filter_nid.attr, + &format_attr_filter_state.attr, + &format_attr_filter_opc.attr, + NULL, +}; + +static struct attribute *snbep_uncore_pcu_formats_attr[] = { + &format_attr_event_ext.attr, + &format_attr_occ_sel.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh5.attr, + &format_attr_occ_invert.attr, + &format_attr_occ_edge.attr, + &format_attr_filter_band0.attr, + &format_attr_filter_band1.attr, + &format_attr_filter_band2.attr, + &format_attr_filter_band3.attr, + NULL, +}; + +static struct attribute *snbep_uncore_qpi_formats_attr[] = { + &format_attr_event_ext.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + &format_attr_match_rds.attr, + &format_attr_match_rnid30.attr, + &format_attr_match_rnid4.attr, + &format_attr_match_dnid.attr, + &format_attr_match_mc.attr, + &format_attr_match_opc.attr, + &format_attr_match_vnw.attr, + &format_attr_match0.attr, + &format_attr_match1.attr, + &format_attr_mask_rds.attr, + &format_attr_mask_rnid30.attr, + &format_attr_mask_rnid4.attr, + &format_attr_mask_dnid.attr, + &format_attr_mask_mc.attr, + &format_attr_mask_opc.attr, + &format_attr_mask_vnw.attr, + &format_attr_mask0.attr, + &format_attr_mask1.attr, + NULL, +}; + +static struct uncore_event_desc snbep_uncore_imc_events[] = { + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"), + INTEL_UNCORE_EVENT_DESC(cas_count_read, "event=0x04,umask=0x03"), + INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c"), + { /* end: all zeroes */ }, +}; + +static struct uncore_event_desc snbep_uncore_qpi_events[] = { + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0x14"), + INTEL_UNCORE_EVENT_DESC(txl_flits_active, "event=0x00,umask=0x06"), + INTEL_UNCORE_EVENT_DESC(drs_data, "event=0x102,umask=0x08"), + INTEL_UNCORE_EVENT_DESC(ncb_data, "event=0x103,umask=0x04"), + { /* end: all zeroes */ }, +}; + +static struct attribute_group snbep_uncore_format_group = { + .name = "format", + .attrs = snbep_uncore_formats_attr, +}; + +static struct attribute_group snbep_uncore_ubox_format_group = { + .name = "format", + .attrs = snbep_uncore_ubox_formats_attr, +}; + +static struct attribute_group snbep_uncore_cbox_format_group = { + .name = "format", + .attrs = snbep_uncore_cbox_formats_attr, +}; + +static struct attribute_group snbep_uncore_pcu_format_group = { + .name = "format", + .attrs = snbep_uncore_pcu_formats_attr, +}; + +static struct attribute_group snbep_uncore_qpi_format_group = { + .name = "format", + .attrs = snbep_uncore_qpi_formats_attr, +}; + +#define SNBEP_UNCORE_MSR_OPS_COMMON_INIT() \ + .init_box = snbep_uncore_msr_init_box, \ + .disable_box = snbep_uncore_msr_disable_box, \ + .enable_box = snbep_uncore_msr_enable_box, \ + .disable_event = snbep_uncore_msr_disable_event, \ + .enable_event = 
snbep_uncore_msr_enable_event, \ + .read_counter = uncore_msr_read_counter + +static struct intel_uncore_ops snbep_uncore_msr_ops = { + SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), +}; + +#define SNBEP_UNCORE_PCI_OPS_COMMON_INIT() \ + .init_box = snbep_uncore_pci_init_box, \ + .disable_box = snbep_uncore_pci_disable_box, \ + .enable_box = snbep_uncore_pci_enable_box, \ + .disable_event = snbep_uncore_pci_disable_event, \ + .read_counter = snbep_uncore_pci_read_counter + +static struct intel_uncore_ops snbep_uncore_pci_ops = { + SNBEP_UNCORE_PCI_OPS_COMMON_INIT(), + .enable_event = snbep_uncore_pci_enable_event, +}; + +static struct event_constraint snbep_uncore_cbox_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x01, 0x1), + UNCORE_EVENT_CONSTRAINT(0x02, 0x3), + UNCORE_EVENT_CONSTRAINT(0x04, 0x3), + UNCORE_EVENT_CONSTRAINT(0x05, 0x3), + UNCORE_EVENT_CONSTRAINT(0x07, 0x3), + UNCORE_EVENT_CONSTRAINT(0x09, 0x3), + UNCORE_EVENT_CONSTRAINT(0x11, 0x1), + UNCORE_EVENT_CONSTRAINT(0x12, 0x3), + UNCORE_EVENT_CONSTRAINT(0x13, 0x3), + UNCORE_EVENT_CONSTRAINT(0x1b, 0xc), + UNCORE_EVENT_CONSTRAINT(0x1c, 0xc), + UNCORE_EVENT_CONSTRAINT(0x1d, 0xc), + UNCORE_EVENT_CONSTRAINT(0x1e, 0xc), + EVENT_CONSTRAINT_OVERLAP(0x1f, 0xe, 0xff), + UNCORE_EVENT_CONSTRAINT(0x21, 0x3), + UNCORE_EVENT_CONSTRAINT(0x23, 0x3), + UNCORE_EVENT_CONSTRAINT(0x31, 0x3), + UNCORE_EVENT_CONSTRAINT(0x32, 0x3), + UNCORE_EVENT_CONSTRAINT(0x33, 0x3), + UNCORE_EVENT_CONSTRAINT(0x34, 0x3), + UNCORE_EVENT_CONSTRAINT(0x35, 0x3), + UNCORE_EVENT_CONSTRAINT(0x36, 0x1), + UNCORE_EVENT_CONSTRAINT(0x37, 0x3), + UNCORE_EVENT_CONSTRAINT(0x38, 0x3), + UNCORE_EVENT_CONSTRAINT(0x39, 0x3), + UNCORE_EVENT_CONSTRAINT(0x3b, 0x1), + EVENT_CONSTRAINT_END +}; + +static struct event_constraint snbep_uncore_r2pcie_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x10, 0x3), + UNCORE_EVENT_CONSTRAINT(0x11, 0x3), + UNCORE_EVENT_CONSTRAINT(0x12, 0x1), + UNCORE_EVENT_CONSTRAINT(0x23, 0x3), + UNCORE_EVENT_CONSTRAINT(0x24, 0x3), + UNCORE_EVENT_CONSTRAINT(0x25, 0x3), + UNCORE_EVENT_CONSTRAINT(0x26, 0x3), + UNCORE_EVENT_CONSTRAINT(0x32, 0x3), + UNCORE_EVENT_CONSTRAINT(0x33, 0x3), + UNCORE_EVENT_CONSTRAINT(0x34, 0x3), + EVENT_CONSTRAINT_END +}; + +static struct event_constraint snbep_uncore_r3qpi_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x10, 0x3), + UNCORE_EVENT_CONSTRAINT(0x11, 0x3), + UNCORE_EVENT_CONSTRAINT(0x12, 0x3), + UNCORE_EVENT_CONSTRAINT(0x13, 0x1), + UNCORE_EVENT_CONSTRAINT(0x20, 0x3), + UNCORE_EVENT_CONSTRAINT(0x21, 0x3), + UNCORE_EVENT_CONSTRAINT(0x22, 0x3), + UNCORE_EVENT_CONSTRAINT(0x23, 0x3), + UNCORE_EVENT_CONSTRAINT(0x24, 0x3), + UNCORE_EVENT_CONSTRAINT(0x25, 0x3), + UNCORE_EVENT_CONSTRAINT(0x26, 0x3), + UNCORE_EVENT_CONSTRAINT(0x28, 0x3), + UNCORE_EVENT_CONSTRAINT(0x29, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2a, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2b, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2c, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2d, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2e, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2f, 0x3), + UNCORE_EVENT_CONSTRAINT(0x30, 0x3), + UNCORE_EVENT_CONSTRAINT(0x31, 0x3), + UNCORE_EVENT_CONSTRAINT(0x32, 0x3), + UNCORE_EVENT_CONSTRAINT(0x33, 0x3), + UNCORE_EVENT_CONSTRAINT(0x34, 0x3), + UNCORE_EVENT_CONSTRAINT(0x36, 0x3), + UNCORE_EVENT_CONSTRAINT(0x37, 0x3), + UNCORE_EVENT_CONSTRAINT(0x38, 0x3), + UNCORE_EVENT_CONSTRAINT(0x39, 0x3), + EVENT_CONSTRAINT_END +}; + +static struct intel_uncore_type snbep_uncore_ubox = { + .name = "ubox", + .num_counters = 2, + .num_boxes = 1, + .perf_ctr_bits = 44, + .fixed_ctr_bits = 48, + .perf_ctr = SNBEP_U_MSR_PMON_CTR0, + 
.event_ctl = SNBEP_U_MSR_PMON_CTL0, + .event_mask = SNBEP_U_MSR_PMON_RAW_EVENT_MASK, + .fixed_ctr = SNBEP_U_MSR_PMON_UCLK_FIXED_CTR, + .fixed_ctl = SNBEP_U_MSR_PMON_UCLK_FIXED_CTL, + .ops = &snbep_uncore_msr_ops, + .format_group = &snbep_uncore_ubox_format_group, +}; + +static struct extra_reg snbep_uncore_cbox_extra_regs[] = { + SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN, + SNBEP_CBO_PMON_CTL_TID_EN, 0x1), + SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x4334, 0xffff, 0x6), + SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x4534, 0xffff, 0x6), + SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x4934, 0xffff, 0x6), + SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0x6), + SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0xa), + SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0xa), + SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x2), + SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x2), + SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x2), + SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x2), + SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0xa), + SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0xa), + SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x2), + SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x2), + SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x2), + SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x2), + EVENT_EXTRA_END +}; + +static void snbep_cbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct intel_uncore_extra_reg *er = &box->shared_regs[0]; + int i; + + if (uncore_box_is_fake(box)) + return; + + for (i = 0; i < 5; i++) { + if (reg1->alloc & (0x1 << i)) + atomic_sub(1 << (i * 6), &er->ref); + } + reg1->alloc = 0; +} + +static struct event_constraint * +__snbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event, + u64 (*cbox_filter_mask)(int fields)) +{ + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct intel_uncore_extra_reg *er = &box->shared_regs[0]; + int i, alloc = 0; + unsigned long flags; + u64 mask; + + if (reg1->idx == EXTRA_REG_NONE) + return NULL; + + raw_spin_lock_irqsave(&er->lock, flags); + for (i = 0; i < 5; i++) { + if (!(reg1->idx & (0x1 << i))) + continue; + if (!uncore_box_is_fake(box) && (reg1->alloc & (0x1 << i))) + continue; + + mask = cbox_filter_mask(0x1 << i); + if (!__BITS_VALUE(atomic_read(&er->ref), i, 6) || + !((reg1->config ^ er->config) & mask)) { + atomic_add(1 << (i * 6), &er->ref); + er->config &= ~mask; + er->config |= reg1->config & mask; + alloc |= (0x1 << i); + } else { + break; + } + } + raw_spin_unlock_irqrestore(&er->lock, flags); + if (i < 5) + goto fail; + + if (!uncore_box_is_fake(box)) + reg1->alloc |= alloc; + + return NULL; +fail: + for (; i >= 0; i--) { + if (alloc & (0x1 << i)) + atomic_sub(1 << (i * 6), &er->ref); + } + return &constraint_empty; +} + +static u64 snbep_cbox_filter_mask(int fields) +{ + u64 mask = 0; + + if (fields & 0x1) + mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_TID; + if (fields & 0x2) + mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_NID; + if (fields & 0x4) + mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_STATE; + if (fields & 0x8) + mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_OPC; + + return mask; +} + +static struct event_constraint * +snbep_cbox_get_constraint(struct 
intel_uncore_box *box, struct perf_event *event) +{ + return __snbep_cbox_get_constraint(box, event, snbep_cbox_filter_mask); +} + +static int snbep_cbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct extra_reg *er; + int idx = 0; + + for (er = snbep_uncore_cbox_extra_regs; er->msr; er++) { + if (er->event != (event->hw.config & er->config_mask)) + continue; + idx |= er->idx; + } + + if (idx) { + reg1->reg = SNBEP_C0_MSR_PMON_BOX_FILTER + + SNBEP_CBO_MSR_OFFSET * box->pmu->pmu_idx; + reg1->config = event->attr.config1 & snbep_cbox_filter_mask(idx); + reg1->idx = idx; + } + return 0; +} + +static struct intel_uncore_ops snbep_uncore_cbox_ops = { + SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), + .hw_config = snbep_cbox_hw_config, + .get_constraint = snbep_cbox_get_constraint, + .put_constraint = snbep_cbox_put_constraint, +}; + +static struct intel_uncore_type snbep_uncore_cbox = { + .name = "cbox", + .num_counters = 4, + .num_boxes = 8, + .perf_ctr_bits = 44, + .event_ctl = SNBEP_C0_MSR_PMON_CTL0, + .perf_ctr = SNBEP_C0_MSR_PMON_CTR0, + .event_mask = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_C0_MSR_PMON_BOX_CTL, + .msr_offset = SNBEP_CBO_MSR_OFFSET, + .num_shared_regs = 1, + .constraints = snbep_uncore_cbox_constraints, + .ops = &snbep_uncore_cbox_ops, + .format_group = &snbep_uncore_cbox_format_group, +}; + +static u64 snbep_pcu_alter_er(struct perf_event *event, int new_idx, bool modify) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + u64 config = reg1->config; + + if (new_idx > reg1->idx) + config <<= 8 * (new_idx - reg1->idx); + else + config >>= 8 * (reg1->idx - new_idx); + + if (modify) { + hwc->config += new_idx - reg1->idx; + reg1->config = config; + reg1->idx = new_idx; + } + return config; +} + +static struct event_constraint * +snbep_pcu_get_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct intel_uncore_extra_reg *er = &box->shared_regs[0]; + unsigned long flags; + int idx = reg1->idx; + u64 mask, config1 = reg1->config; + bool ok = false; + + if (reg1->idx == EXTRA_REG_NONE || + (!uncore_box_is_fake(box) && reg1->alloc)) + return NULL; +again: + mask = 0xffULL << (idx * 8); + raw_spin_lock_irqsave(&er->lock, flags); + if (!__BITS_VALUE(atomic_read(&er->ref), idx, 8) || + !((config1 ^ er->config) & mask)) { + atomic_add(1 << (idx * 8), &er->ref); + er->config &= ~mask; + er->config |= config1 & mask; + ok = true; + } + raw_spin_unlock_irqrestore(&er->lock, flags); + + if (!ok) { + idx = (idx + 1) % 4; + if (idx != reg1->idx) { + config1 = snbep_pcu_alter_er(event, idx, false); + goto again; + } + return &constraint_empty; + } + + if (!uncore_box_is_fake(box)) { + if (idx != reg1->idx) + snbep_pcu_alter_er(event, idx, true); + reg1->alloc = 1; + } + return NULL; +} + +static void snbep_pcu_put_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct intel_uncore_extra_reg *er = &box->shared_regs[0]; + + if (uncore_box_is_fake(box) || !reg1->alloc) + return; + + atomic_sub(1 << (reg1->idx * 8), &er->ref); + reg1->alloc = 0; +} + +static int snbep_pcu_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + int ev_sel = hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK; + + 
if (ev_sel >= 0xb && ev_sel <= 0xe) {
+		reg1->reg = SNBEP_PCU_MSR_PMON_BOX_FILTER;
+		reg1->idx = ev_sel - 0xb;
+		reg1->config = event->attr.config1 & (0xff << (reg1->idx * 8));
+	}
+	return 0;
+}
+
+static struct intel_uncore_ops snbep_uncore_pcu_ops = {
+	SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+	.hw_config		= snbep_pcu_hw_config,
+	.get_constraint		= snbep_pcu_get_constraint,
+	.put_constraint		= snbep_pcu_put_constraint,
+};
+
+static struct intel_uncore_type snbep_uncore_pcu = {
+	.name			= "pcu",
+	.num_counters		= 4,
+	.num_boxes		= 1,
+	.perf_ctr_bits		= 48,
+	.perf_ctr		= SNBEP_PCU_MSR_PMON_CTR0,
+	.event_ctl		= SNBEP_PCU_MSR_PMON_CTL0,
+	.event_mask		= SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK,
+	.box_ctl		= SNBEP_PCU_MSR_PMON_BOX_CTL,
+	.num_shared_regs	= 1,
+	.ops			= &snbep_uncore_pcu_ops,
+	.format_group		= &snbep_uncore_pcu_format_group,
+};
+
+static struct intel_uncore_type *snbep_msr_uncores[] = {
+	&snbep_uncore_ubox,
+	&snbep_uncore_cbox,
+	&snbep_uncore_pcu,
+	NULL,
+};
+
+enum {
+	SNBEP_PCI_QPI_PORT0_FILTER,
+	SNBEP_PCI_QPI_PORT1_FILTER,
+};
+
+static int snbep_qpi_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+	struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+
+	if ((hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK) == 0x38) {
+		reg1->idx = 0;
+		reg1->reg = SNBEP_Q_Py_PCI_PMON_PKT_MATCH0;
+		reg1->config = event->attr.config1;
+		reg2->reg = SNBEP_Q_Py_PCI_PMON_PKT_MASK0;
+		reg2->config = event->attr.config2;
+	}
+	return 0;
+}
+
+static void snbep_qpi_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct pci_dev *pdev = box->pci_dev;
+	struct hw_perf_event *hwc = &event->hw;
+	struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+	struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+
+	if (reg1->idx != EXTRA_REG_NONE) {
+		int idx = box->pmu->pmu_idx + SNBEP_PCI_QPI_PORT0_FILTER;
+		struct pci_dev *filter_pdev = extra_pci_dev[box->phys_id][idx];
+		WARN_ON_ONCE(!filter_pdev);
+		if (filter_pdev) {
+			pci_write_config_dword(filter_pdev, reg1->reg,
+						(u32)reg1->config);
+			pci_write_config_dword(filter_pdev, reg1->reg + 4,
+						(u32)(reg1->config >> 32));
+			pci_write_config_dword(filter_pdev, reg2->reg,
+						(u32)reg2->config);
+			pci_write_config_dword(filter_pdev, reg2->reg + 4,
+						(u32)(reg2->config >> 32));
+		}
+	}
+
+	pci_write_config_dword(pdev, hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static struct intel_uncore_ops snbep_uncore_qpi_ops = {
+	SNBEP_UNCORE_PCI_OPS_COMMON_INIT(),
+	.enable_event		= snbep_qpi_enable_event,
+	.hw_config		= snbep_qpi_hw_config,
+	.get_constraint		= uncore_get_constraint,
+	.put_constraint		= uncore_put_constraint,
+};
+
+#define SNBEP_UNCORE_PCI_COMMON_INIT()				\
+	.perf_ctr	= SNBEP_PCI_PMON_CTR0,			\
+	.event_ctl	= SNBEP_PCI_PMON_CTL0,			\
+	.event_mask	= SNBEP_PMON_RAW_EVENT_MASK,		\
+	.box_ctl	= SNBEP_PCI_PMON_BOX_CTL,		\
+	.ops		= &snbep_uncore_pci_ops,		\
+	.format_group	= &snbep_uncore_format_group
+
+static struct intel_uncore_type snbep_uncore_ha = {
+	.name		= "ha",
+	.num_counters	= 4,
+	.num_boxes	= 1,
+	.perf_ctr_bits	= 48,
+	SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type snbep_uncore_imc = {
+	.name		= "imc",
+	.num_counters	= 4,
+	.num_boxes	= 4,
+	.perf_ctr_bits	= 48,
+	.fixed_ctr_bits	= 48,
+	.fixed_ctr	= SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
+	.fixed_ctl	= SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
+	.event_descs	= snbep_uncore_imc_events,
+	SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type
snbep_uncore_qpi = { + .name = "qpi", + .num_counters = 4, + .num_boxes = 2, + .perf_ctr_bits = 48, + .perf_ctr = SNBEP_PCI_PMON_CTR0, + .event_ctl = SNBEP_PCI_PMON_CTL0, + .event_mask = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_PCI_PMON_BOX_CTL, + .num_shared_regs = 1, + .ops = &snbep_uncore_qpi_ops, + .event_descs = snbep_uncore_qpi_events, + .format_group = &snbep_uncore_qpi_format_group, +}; + + +static struct intel_uncore_type snbep_uncore_r2pcie = { + .name = "r2pcie", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 44, + .constraints = snbep_uncore_r2pcie_constraints, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct intel_uncore_type snbep_uncore_r3qpi = { + .name = "r3qpi", + .num_counters = 3, + .num_boxes = 2, + .perf_ctr_bits = 44, + .constraints = snbep_uncore_r3qpi_constraints, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +enum { + SNBEP_PCI_UNCORE_HA, + SNBEP_PCI_UNCORE_IMC, + SNBEP_PCI_UNCORE_QPI, + SNBEP_PCI_UNCORE_R2PCIE, + SNBEP_PCI_UNCORE_R3QPI, +}; + +static struct intel_uncore_type *snbep_pci_uncores[] = { + [SNBEP_PCI_UNCORE_HA] = &snbep_uncore_ha, + [SNBEP_PCI_UNCORE_IMC] = &snbep_uncore_imc, + [SNBEP_PCI_UNCORE_QPI] = &snbep_uncore_qpi, + [SNBEP_PCI_UNCORE_R2PCIE] = &snbep_uncore_r2pcie, + [SNBEP_PCI_UNCORE_R3QPI] = &snbep_uncore_r3qpi, + NULL, +}; + +static DEFINE_PCI_DEVICE_TABLE(snbep_uncore_pci_ids) = { + { /* Home Agent */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_HA), + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_HA, 0), + }, + { /* MC Channel 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC0), + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 0), + }, + { /* MC Channel 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC1), + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 1), + }, + { /* MC Channel 2 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC2), + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 2), + }, + { /* MC Channel 3 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC3), + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 3), + }, + { /* QPI Port 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI0), + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_QPI, 0), + }, + { /* QPI Port 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI1), + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_QPI, 1), + }, + { /* R2PCIe */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R2PCIE), + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R2PCIE, 0), + }, + { /* R3QPI Link 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI0), + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R3QPI, 0), + }, + { /* R3QPI Link 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI1), + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R3QPI, 1), + }, + { /* QPI Port 0 filter */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3c86), + .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, + SNBEP_PCI_QPI_PORT0_FILTER), + }, + { /* QPI Port 0 filter */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3c96), + .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, + SNBEP_PCI_QPI_PORT1_FILTER), + }, + { /* end: all zeroes */ } +}; + +static struct pci_driver snbep_uncore_pci_driver = { + .name = "snbep_uncore", + .id_table = snbep_uncore_pci_ids, +}; + +/* + * build pci bus to socket mapping + */ +static int snbep_pci2phy_map_init(int devid) +{ + struct pci_dev *ubox_dev = NULL; + 
int i, bus, nodeid; + int err = 0; + u32 config = 0; + + while (1) { + /* find the UBOX device */ + ubox_dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, ubox_dev); + if (!ubox_dev) + break; + bus = ubox_dev->bus->number; + /* get the Node ID of the local register */ + err = pci_read_config_dword(ubox_dev, 0x40, &config); + if (err) + break; + nodeid = config; + /* get the Node ID mapping */ + err = pci_read_config_dword(ubox_dev, 0x54, &config); + if (err) + break; + /* + * every three bits in the Node ID mapping register maps + * to a particular node. + */ + for (i = 0; i < 8; i++) { + if (nodeid == ((config >> (3 * i)) & 0x7)) { + pcibus_to_physid[bus] = i; + break; + } + } + } + + if (!err) { + /* + * For PCI bus with no UBOX device, find the next bus + * that has UBOX device and use its mapping. + */ + i = -1; + for (bus = 255; bus >= 0; bus--) { + if (pcibus_to_physid[bus] >= 0) + i = pcibus_to_physid[bus]; + else + pcibus_to_physid[bus] = i; + } + } + + if (ubox_dev) + pci_dev_put(ubox_dev); + + return err ? pcibios_err_to_errno(err) : 0; +} +/* end of Sandy Bridge-EP uncore support */ + +/* IvyTown uncore support */ +static void ivt_uncore_msr_init_box(struct intel_uncore_box *box) +{ + unsigned msr = uncore_msr_box_ctl(box); + if (msr) + wrmsrl(msr, IVT_PMON_BOX_CTL_INT); +} + +static void ivt_uncore_pci_init_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + + pci_write_config_dword(pdev, SNBEP_PCI_PMON_BOX_CTL, IVT_PMON_BOX_CTL_INT); +} + +#define IVT_UNCORE_MSR_OPS_COMMON_INIT() \ + .init_box = ivt_uncore_msr_init_box, \ + .disable_box = snbep_uncore_msr_disable_box, \ + .enable_box = snbep_uncore_msr_enable_box, \ + .disable_event = snbep_uncore_msr_disable_event, \ + .enable_event = snbep_uncore_msr_enable_event, \ + .read_counter = uncore_msr_read_counter + +static struct intel_uncore_ops ivt_uncore_msr_ops = { + IVT_UNCORE_MSR_OPS_COMMON_INIT(), +}; + +static struct intel_uncore_ops ivt_uncore_pci_ops = { + .init_box = ivt_uncore_pci_init_box, + .disable_box = snbep_uncore_pci_disable_box, + .enable_box = snbep_uncore_pci_enable_box, + .disable_event = snbep_uncore_pci_disable_event, + .enable_event = snbep_uncore_pci_enable_event, + .read_counter = snbep_uncore_pci_read_counter, +}; + +#define IVT_UNCORE_PCI_COMMON_INIT() \ + .perf_ctr = SNBEP_PCI_PMON_CTR0, \ + .event_ctl = SNBEP_PCI_PMON_CTL0, \ + .event_mask = IVT_PMON_RAW_EVENT_MASK, \ + .box_ctl = SNBEP_PCI_PMON_BOX_CTL, \ + .ops = &ivt_uncore_pci_ops, \ + .format_group = &ivt_uncore_format_group + +static struct attribute *ivt_uncore_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + NULL, +}; + +static struct attribute *ivt_uncore_ubox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh5.attr, + NULL, +}; + +static struct attribute *ivt_uncore_cbox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_tid_en.attr, + &format_attr_thresh8.attr, + &format_attr_filter_tid.attr, + &format_attr_filter_link.attr, + &format_attr_filter_state2.attr, + &format_attr_filter_nid2.attr, + &format_attr_filter_opc2.attr, + NULL, +}; + +static struct attribute *ivt_uncore_pcu_formats_attr[] = { + &format_attr_event_ext.attr, + &format_attr_occ_sel.attr, + &format_attr_edge.attr, + &format_attr_thresh5.attr, + &format_attr_occ_invert.attr, + 
&format_attr_occ_edge.attr, + &format_attr_filter_band0.attr, + &format_attr_filter_band1.attr, + &format_attr_filter_band2.attr, + &format_attr_filter_band3.attr, + NULL, +}; + +static struct attribute *ivt_uncore_qpi_formats_attr[] = { + &format_attr_event_ext.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_thresh8.attr, + &format_attr_match_rds.attr, + &format_attr_match_rnid30.attr, + &format_attr_match_rnid4.attr, + &format_attr_match_dnid.attr, + &format_attr_match_mc.attr, + &format_attr_match_opc.attr, + &format_attr_match_vnw.attr, + &format_attr_match0.attr, + &format_attr_match1.attr, + &format_attr_mask_rds.attr, + &format_attr_mask_rnid30.attr, + &format_attr_mask_rnid4.attr, + &format_attr_mask_dnid.attr, + &format_attr_mask_mc.attr, + &format_attr_mask_opc.attr, + &format_attr_mask_vnw.attr, + &format_attr_mask0.attr, + &format_attr_mask1.attr, + NULL, +}; + +static struct attribute_group ivt_uncore_format_group = { + .name = "format", + .attrs = ivt_uncore_formats_attr, +}; + +static struct attribute_group ivt_uncore_ubox_format_group = { + .name = "format", + .attrs = ivt_uncore_ubox_formats_attr, +}; + +static struct attribute_group ivt_uncore_cbox_format_group = { + .name = "format", + .attrs = ivt_uncore_cbox_formats_attr, +}; + +static struct attribute_group ivt_uncore_pcu_format_group = { + .name = "format", + .attrs = ivt_uncore_pcu_formats_attr, +}; + +static struct attribute_group ivt_uncore_qpi_format_group = { + .name = "format", + .attrs = ivt_uncore_qpi_formats_attr, +}; + +static struct intel_uncore_type ivt_uncore_ubox = { + .name = "ubox", + .num_counters = 2, + .num_boxes = 1, + .perf_ctr_bits = 44, + .fixed_ctr_bits = 48, + .perf_ctr = SNBEP_U_MSR_PMON_CTR0, + .event_ctl = SNBEP_U_MSR_PMON_CTL0, + .event_mask = IVT_U_MSR_PMON_RAW_EVENT_MASK, + .fixed_ctr = SNBEP_U_MSR_PMON_UCLK_FIXED_CTR, + .fixed_ctl = SNBEP_U_MSR_PMON_UCLK_FIXED_CTL, + .ops = &ivt_uncore_msr_ops, + .format_group = &ivt_uncore_ubox_format_group, +}; + +static struct extra_reg ivt_uncore_cbox_extra_regs[] = { + SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN, + SNBEP_CBO_PMON_CTL_TID_EN, 0x1), + SNBEP_CBO_EVENT_EXTRA_REG(0x1031, 0x10ff, 0x2), + + SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0xc), + SNBEP_CBO_EVENT_EXTRA_REG(0x5134, 0xffff, 0xc), + SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x4334, 0xffff, 0xc), + SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x4534, 0xffff, 0xc), + SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x4934, 0xffff, 0xc), + SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x2135, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x2335, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0x18), + SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0x18), + SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x8135, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x8335, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x2136, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x2336, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0x18), + 
SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0x18), + SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x5036, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x8136, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x8336, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x8), + EVENT_EXTRA_END +}; + +static u64 ivt_cbox_filter_mask(int fields) +{ + u64 mask = 0; + + if (fields & 0x1) + mask |= IVT_CB0_MSR_PMON_BOX_FILTER_TID; + if (fields & 0x2) + mask |= IVT_CB0_MSR_PMON_BOX_FILTER_LINK; + if (fields & 0x4) + mask |= IVT_CB0_MSR_PMON_BOX_FILTER_STATE; + if (fields & 0x8) + mask |= IVT_CB0_MSR_PMON_BOX_FILTER_NID; + if (fields & 0x10) + mask |= IVT_CB0_MSR_PMON_BOX_FILTER_OPC; + + return mask; +} + +static struct event_constraint * +ivt_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + return __snbep_cbox_get_constraint(box, event, ivt_cbox_filter_mask); +} + +static int ivt_cbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct extra_reg *er; + int idx = 0; + + for (er = ivt_uncore_cbox_extra_regs; er->msr; er++) { + if (er->event != (event->hw.config & er->config_mask)) + continue; + idx |= er->idx; + } + + if (idx) { + reg1->reg = SNBEP_C0_MSR_PMON_BOX_FILTER + + SNBEP_CBO_MSR_OFFSET * box->pmu->pmu_idx; + reg1->config = event->attr.config1 & ivt_cbox_filter_mask(idx); + reg1->idx = idx; + } + return 0; +} + +static void ivt_cbox_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + + if (reg1->idx != EXTRA_REG_NONE) { + u64 filter = uncore_shared_reg_config(box, 0); + wrmsrl(reg1->reg, filter & 0xffffffff); + wrmsrl(reg1->reg + 6, filter >> 32); + } + + wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); +} + +static struct intel_uncore_ops ivt_uncore_cbox_ops = { + .init_box = ivt_uncore_msr_init_box, + .disable_box = snbep_uncore_msr_disable_box, + .enable_box = snbep_uncore_msr_enable_box, + .disable_event = snbep_uncore_msr_disable_event, + .enable_event = ivt_cbox_enable_event, + .read_counter = uncore_msr_read_counter, + .hw_config = ivt_cbox_hw_config, + .get_constraint = ivt_cbox_get_constraint, + .put_constraint = snbep_cbox_put_constraint, +}; + +static struct intel_uncore_type ivt_uncore_cbox = { + .name = "cbox", + .num_counters = 4, + .num_boxes = 15, + .perf_ctr_bits = 44, + .event_ctl = SNBEP_C0_MSR_PMON_CTL0, + .perf_ctr = SNBEP_C0_MSR_PMON_CTR0, + .event_mask = IVT_CBO_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_C0_MSR_PMON_BOX_CTL, + .msr_offset = SNBEP_CBO_MSR_OFFSET, + .num_shared_regs = 1, + .constraints = snbep_uncore_cbox_constraints, + .ops = &ivt_uncore_cbox_ops, + .format_group = &ivt_uncore_cbox_format_group, +}; + +static struct intel_uncore_ops ivt_uncore_pcu_ops = { + IVT_UNCORE_MSR_OPS_COMMON_INIT(), + .hw_config = snbep_pcu_hw_config, + .get_constraint = snbep_pcu_get_constraint, + .put_constraint = snbep_pcu_put_constraint, +}; + +static struct intel_uncore_type ivt_uncore_pcu = { + .name = "pcu", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .perf_ctr = SNBEP_PCU_MSR_PMON_CTR0, + .event_ctl = SNBEP_PCU_MSR_PMON_CTL0, + .event_mask = IVT_PCU_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_PCU_MSR_PMON_BOX_CTL, + .num_shared_regs = 1, + .ops = &ivt_uncore_pcu_ops, + .format_group = 
&ivt_uncore_pcu_format_group, +}; + +static struct intel_uncore_type *ivt_msr_uncores[] = { + &ivt_uncore_ubox, + &ivt_uncore_cbox, + &ivt_uncore_pcu, + NULL, +}; + +static struct intel_uncore_type ivt_uncore_ha = { + .name = "ha", + .num_counters = 4, + .num_boxes = 2, + .perf_ctr_bits = 48, + IVT_UNCORE_PCI_COMMON_INIT(), +}; + +static struct intel_uncore_type ivt_uncore_imc = { + .name = "imc", + .num_counters = 4, + .num_boxes = 8, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR, + .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL, + IVT_UNCORE_PCI_COMMON_INIT(), +}; + +/* registers in IRP boxes are not properly aligned */ +static unsigned ivt_uncore_irp_ctls[] = {0xd8, 0xdc, 0xe0, 0xe4}; +static unsigned ivt_uncore_irp_ctrs[] = {0xa0, 0xb0, 0xb8, 0xc0}; + +static void ivt_uncore_irp_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + + pci_write_config_dword(pdev, ivt_uncore_irp_ctls[hwc->idx], + hwc->config | SNBEP_PMON_CTL_EN); +} + +static void ivt_uncore_irp_disable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + + pci_write_config_dword(pdev, ivt_uncore_irp_ctls[hwc->idx], hwc->config); +} + +static u64 ivt_uncore_irp_read_counter(struct intel_uncore_box *box, struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + u64 count = 0; + + pci_read_config_dword(pdev, ivt_uncore_irp_ctrs[hwc->idx], (u32 *)&count); + pci_read_config_dword(pdev, ivt_uncore_irp_ctrs[hwc->idx] + 4, (u32 *)&count + 1); + + return count; +} + +static struct intel_uncore_ops ivt_uncore_irp_ops = { + .init_box = ivt_uncore_pci_init_box, + .disable_box = snbep_uncore_pci_disable_box, + .enable_box = snbep_uncore_pci_enable_box, + .disable_event = ivt_uncore_irp_disable_event, + .enable_event = ivt_uncore_irp_enable_event, + .read_counter = ivt_uncore_irp_read_counter, +}; + +static struct intel_uncore_type ivt_uncore_irp = { + .name = "irp", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .event_mask = IVT_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_PCI_PMON_BOX_CTL, + .ops = &ivt_uncore_irp_ops, + .format_group = &ivt_uncore_format_group, +}; + +static struct intel_uncore_ops ivt_uncore_qpi_ops = { + .init_box = ivt_uncore_pci_init_box, + .disable_box = snbep_uncore_pci_disable_box, + .enable_box = snbep_uncore_pci_enable_box, + .disable_event = snbep_uncore_pci_disable_event, + .enable_event = snbep_qpi_enable_event, + .read_counter = snbep_uncore_pci_read_counter, + .hw_config = snbep_qpi_hw_config, + .get_constraint = uncore_get_constraint, + .put_constraint = uncore_put_constraint, +}; + +static struct intel_uncore_type ivt_uncore_qpi = { + .name = "qpi", + .num_counters = 4, + .num_boxes = 3, + .perf_ctr_bits = 48, + .perf_ctr = SNBEP_PCI_PMON_CTR0, + .event_ctl = SNBEP_PCI_PMON_CTL0, + .event_mask = IVT_QPI_PCI_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_PCI_PMON_BOX_CTL, + .num_shared_regs = 1, + .ops = &ivt_uncore_qpi_ops, + .format_group = &ivt_uncore_qpi_format_group, +}; + +static struct intel_uncore_type ivt_uncore_r2pcie = { + .name = "r2pcie", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 44, + .constraints = snbep_uncore_r2pcie_constraints, + IVT_UNCORE_PCI_COMMON_INIT(), +}; + +static struct intel_uncore_type ivt_uncore_r3qpi = { + .name = "r3qpi", + .num_counters = 3, + .num_boxes 
= 2, + .perf_ctr_bits = 44, + .constraints = snbep_uncore_r3qpi_constraints, + IVT_UNCORE_PCI_COMMON_INIT(), +}; + +enum { + IVT_PCI_UNCORE_HA, + IVT_PCI_UNCORE_IMC, + IVT_PCI_UNCORE_IRP, + IVT_PCI_UNCORE_QPI, + IVT_PCI_UNCORE_R2PCIE, + IVT_PCI_UNCORE_R3QPI, +}; + +static struct intel_uncore_type *ivt_pci_uncores[] = { + [IVT_PCI_UNCORE_HA] = &ivt_uncore_ha, + [IVT_PCI_UNCORE_IMC] = &ivt_uncore_imc, + [IVT_PCI_UNCORE_IRP] = &ivt_uncore_irp, + [IVT_PCI_UNCORE_QPI] = &ivt_uncore_qpi, + [IVT_PCI_UNCORE_R2PCIE] = &ivt_uncore_r2pcie, + [IVT_PCI_UNCORE_R3QPI] = &ivt_uncore_r3qpi, + NULL, +}; + +static DEFINE_PCI_DEVICE_TABLE(ivt_uncore_pci_ids) = { + { /* Home Agent 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe30), + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_HA, 0), + }, + { /* Home Agent 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe38), + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_HA, 1), + }, + { /* MC0 Channel 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb4), + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 0), + }, + { /* MC0 Channel 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb5), + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 1), + }, + { /* MC0 Channel 3 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb0), + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 2), + }, + { /* MC0 Channel 4 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb1), + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 3), + }, + { /* MC1 Channel 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef4), + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 4), + }, + { /* MC1 Channel 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef5), + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 5), + }, + { /* MC1 Channel 3 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef0), + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 6), + }, + { /* MC1 Channel 4 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef1), + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 7), + }, + { /* IRP */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe39), + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IRP, 0), + }, + { /* QPI0 Port 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe32), + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_QPI, 0), + }, + { /* QPI0 Port 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe33), + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_QPI, 1), + }, + { /* QPI1 Port 2 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3a), + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_QPI, 2), + }, + { /* R2PCIe */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe34), + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_R2PCIE, 0), + }, + { /* R3QPI0 Link 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe36), + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_R3QPI, 0), + }, + { /* R3QPI0 Link 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe37), + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_R3QPI, 1), + }, + { /* R3QPI1 Link 2 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3e), + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_R3QPI, 2), + }, + { /* QPI Port 0 filter */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe86), + .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, + SNBEP_PCI_QPI_PORT0_FILTER), + }, + { /* QPI Port 0 filter */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe96), + .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, + SNBEP_PCI_QPI_PORT1_FILTER), + }, + { /* end: all zeroes */ } +}; + +static struct pci_driver ivt_uncore_pci_driver = { + .name = "ivt_uncore", + .id_table = ivt_uncore_pci_ids, +}; +/* end of IvyTown uncore support */ + +/* Sandy Bridge uncore 
support */ +static void snb_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (hwc->idx < UNCORE_PMC_IDX_FIXED) + wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN); + else + wrmsrl(hwc->config_base, SNB_UNC_CTL_EN); +} + +static void snb_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + wrmsrl(event->hw.config_base, 0); +} + +static void snb_uncore_msr_init_box(struct intel_uncore_box *box) +{ + if (box->pmu->pmu_idx == 0) { + wrmsrl(SNB_UNC_PERF_GLOBAL_CTL, + SNB_UNC_GLOBAL_CTL_EN | SNB_UNC_GLOBAL_CTL_CORE_ALL); + } +} + +static struct uncore_event_desc snb_uncore_events[] = { + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"), + { /* end: all zeroes */ }, +}; + +static struct attribute *snb_uncore_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_cmask5.attr, + NULL, +}; + +static struct attribute_group snb_uncore_format_group = { + .name = "format", + .attrs = snb_uncore_formats_attr, +}; + +static struct intel_uncore_ops snb_uncore_msr_ops = { + .init_box = snb_uncore_msr_init_box, + .disable_event = snb_uncore_msr_disable_event, + .enable_event = snb_uncore_msr_enable_event, + .read_counter = uncore_msr_read_counter, +}; + +static struct event_constraint snb_uncore_cbox_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x80, 0x1), + UNCORE_EVENT_CONSTRAINT(0x83, 0x1), + EVENT_CONSTRAINT_END +}; + +static struct intel_uncore_type snb_uncore_cbox = { + .name = "cbox", + .num_counters = 2, + .num_boxes = 4, + .perf_ctr_bits = 44, + .fixed_ctr_bits = 48, + .perf_ctr = SNB_UNC_CBO_0_PER_CTR0, + .event_ctl = SNB_UNC_CBO_0_PERFEVTSEL0, + .fixed_ctr = SNB_UNC_FIXED_CTR, + .fixed_ctl = SNB_UNC_FIXED_CTR_CTRL, + .single_fixed = 1, + .event_mask = SNB_UNC_RAW_EVENT_MASK, + .msr_offset = SNB_UNC_CBO_MSR_OFFSET, + .constraints = snb_uncore_cbox_constraints, + .ops = &snb_uncore_msr_ops, + .format_group = &snb_uncore_format_group, + .event_descs = snb_uncore_events, +}; + +static struct intel_uncore_type *snb_msr_uncores[] = { + &snb_uncore_cbox, + NULL, +}; + +enum { + SNB_PCI_UNCORE_IMC, +}; + +static struct uncore_event_desc snb_uncore_imc_events[] = { + INTEL_UNCORE_EVENT_DESC(data_reads, "event=0x01"), + INTEL_UNCORE_EVENT_DESC(data_reads.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(data_reads.unit, "MiB"), + + INTEL_UNCORE_EVENT_DESC(data_writes, "event=0x02"), + INTEL_UNCORE_EVENT_DESC(data_writes.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(data_writes.unit, "MiB"), + + { /* end: all zeroes */ }, +}; + +#define SNB_UNCORE_PCI_IMC_EVENT_MASK 0xff +#define SNB_UNCORE_PCI_IMC_BAR_OFFSET 0x48 + +/* page size multiple covering all config regs */ +#define SNB_UNCORE_PCI_IMC_MAP_SIZE 0x6000 + +#define SNB_UNCORE_PCI_IMC_DATA_READS 0x1 +#define SNB_UNCORE_PCI_IMC_DATA_READS_BASE 0x5050 +#define SNB_UNCORE_PCI_IMC_DATA_WRITES 0x2 +#define SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE 0x5054 +#define SNB_UNCORE_PCI_IMC_CTR_BASE SNB_UNCORE_PCI_IMC_DATA_READS_BASE + +static struct attribute *snb_uncore_imc_formats_attr[] = { + &format_attr_event.attr, + NULL, +}; + +static struct attribute_group snb_uncore_imc_format_group = { + .name = "format", + .attrs = snb_uncore_imc_formats_attr, +}; + +static void snb_uncore_imc_init_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + int where = SNB_UNCORE_PCI_IMC_BAR_OFFSET; + resource_size_t addr; + u32 pci_dword; + 
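+	/*
+	 * The client IMC counters live in an MMIO region, not in PCI config
+	 * space; read the (possibly 64-bit) base address from the BAR at
+	 * offset 0x48 and map it so the counters can be read directly.
+	 */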
+ pci_read_config_dword(pdev, where, &pci_dword); + addr = pci_dword; + +#ifdef CONFIG_PHYS_ADDR_T_64BIT + pci_read_config_dword(pdev, where + 4, &pci_dword); + addr |= ((resource_size_t)pci_dword << 32); +#endif + + addr &= ~(PAGE_SIZE - 1); + + box->io_addr = ioremap(addr, SNB_UNCORE_PCI_IMC_MAP_SIZE); + box->hrtimer_duration = UNCORE_SNB_IMC_HRTIMER_INTERVAL; +} + +static void snb_uncore_imc_enable_box(struct intel_uncore_box *box) +{} + +static void snb_uncore_imc_disable_box(struct intel_uncore_box *box) +{} + +static void snb_uncore_imc_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{} + +static void snb_uncore_imc_disable_event(struct intel_uncore_box *box, struct perf_event *event) +{} + +static u64 snb_uncore_imc_read_counter(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + return (u64)*(unsigned int *)(box->io_addr + hwc->event_base); +} + +/* + * custom event_init() function because we define our own fixed, free + * running counters, so we do not want to conflict with generic uncore + * logic. Also simplifies processing + */ +static int snb_uncore_imc_event_init(struct perf_event *event) +{ + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + struct hw_perf_event *hwc = &event->hw; + u64 cfg = event->attr.config & SNB_UNCORE_PCI_IMC_EVENT_MASK; + int idx, base; + + if (event->attr.type != event->pmu->type) + return -ENOENT; + + pmu = uncore_event_to_pmu(event); + /* no device found for this pmu */ + if (pmu->func_id < 0) + return -ENOENT; + + /* Sampling not supported yet */ + if (hwc->sample_period) + return -EINVAL; + + /* unsupported modes and filters */ + if (event->attr.exclude_user || + event->attr.exclude_kernel || + event->attr.exclude_hv || + event->attr.exclude_idle || + event->attr.exclude_host || + event->attr.exclude_guest || + event->attr.sample_period) /* no sampling */ + return -EINVAL; + + /* + * Place all uncore events for a particular physical package + * onto a single cpu + */ + if (event->cpu < 0) + return -EINVAL; + + /* check only supported bits are set */ + if (event->attr.config & ~SNB_UNCORE_PCI_IMC_EVENT_MASK) + return -EINVAL; + + box = uncore_pmu_to_box(pmu, event->cpu); + if (!box || box->cpu < 0) + return -EINVAL; + + event->cpu = box->cpu; + + event->hw.idx = -1; + event->hw.last_tag = ~0ULL; + event->hw.extra_reg.idx = EXTRA_REG_NONE; + event->hw.branch_reg.idx = EXTRA_REG_NONE; + /* + * check event is known (whitelist, determines counter) + */ + switch (cfg) { + case SNB_UNCORE_PCI_IMC_DATA_READS: + base = SNB_UNCORE_PCI_IMC_DATA_READS_BASE; + idx = UNCORE_PMC_IDX_FIXED; + break; + case SNB_UNCORE_PCI_IMC_DATA_WRITES: + base = SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE; + idx = UNCORE_PMC_IDX_FIXED + 1; + break; + default: + return -EINVAL; + } + + /* must be done before validate_group */ + event->hw.event_base = base; + event->hw.config = cfg; + event->hw.idx = idx; + + /* no group validation needed, we have free running counters */ + + return 0; +} + +static int snb_uncore_imc_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + return 0; +} + +static void snb_uncore_imc_event_start(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + u64 count; + + if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) + return; + + event->hw.state = 0; + box->n_active++; + + list_add_tail(&event->active_entry, &box->active_list); + + count = snb_uncore_imc_read_counter(box, event); + 
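+	/* seed the initial snapshot; later updates account only the delta */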
local64_set(&event->hw.prev_count, count); + + if (box->n_active == 1) + uncore_pmu_start_hrtimer(box); +} + +static void snb_uncore_imc_event_stop(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + struct hw_perf_event *hwc = &event->hw; + + if (!(hwc->state & PERF_HES_STOPPED)) { + box->n_active--; + + WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); + hwc->state |= PERF_HES_STOPPED; + + list_del(&event->active_entry); + + if (box->n_active == 0) + uncore_pmu_cancel_hrtimer(box); + } + + if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { + /* + * Drain the remaining delta count out of a event + * that we are disabling: + */ + uncore_perf_event_update(box, event); + hwc->state |= PERF_HES_UPTODATE; + } +} + +static int snb_uncore_imc_event_add(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + struct hw_perf_event *hwc = &event->hw; + + if (!box) + return -ENODEV; + + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + if (!(flags & PERF_EF_START)) + hwc->state |= PERF_HES_ARCH; + + snb_uncore_imc_event_start(event, 0); + + box->n_events++; + + return 0; +} + +static void snb_uncore_imc_event_del(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + int i; + + snb_uncore_imc_event_stop(event, PERF_EF_UPDATE); + + for (i = 0; i < box->n_events; i++) { + if (event == box->event_list[i]) { + --box->n_events; + break; + } + } +} + +static int snb_pci2phy_map_init(int devid) +{ + struct pci_dev *dev = NULL; + int bus; + + dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, dev); + if (!dev) + return -ENOTTY; + + bus = dev->bus->number; + + pcibus_to_physid[bus] = 0; + + pci_dev_put(dev); + + return 0; +} + +static struct pmu snb_uncore_imc_pmu = { + .task_ctx_nr = perf_invalid_context, + .event_init = snb_uncore_imc_event_init, + .add = snb_uncore_imc_event_add, + .del = snb_uncore_imc_event_del, + .start = snb_uncore_imc_event_start, + .stop = snb_uncore_imc_event_stop, + .read = uncore_pmu_event_read, +}; + +static struct intel_uncore_ops snb_uncore_imc_ops = { + .init_box = snb_uncore_imc_init_box, + .enable_box = snb_uncore_imc_enable_box, + .disable_box = snb_uncore_imc_disable_box, + .disable_event = snb_uncore_imc_disable_event, + .enable_event = snb_uncore_imc_enable_event, + .hw_config = snb_uncore_imc_hw_config, + .read_counter = snb_uncore_imc_read_counter, +}; + +static struct intel_uncore_type snb_uncore_imc = { + .name = "imc", + .num_counters = 2, + .num_boxes = 1, + .fixed_ctr_bits = 32, + .fixed_ctr = SNB_UNCORE_PCI_IMC_CTR_BASE, + .event_descs = snb_uncore_imc_events, + .format_group = &snb_uncore_imc_format_group, + .perf_ctr = SNB_UNCORE_PCI_IMC_DATA_READS_BASE, + .event_mask = SNB_UNCORE_PCI_IMC_EVENT_MASK, + .ops = &snb_uncore_imc_ops, + .pmu = &snb_uncore_imc_pmu, +}; + +static struct intel_uncore_type *snb_pci_uncores[] = { + [SNB_PCI_UNCORE_IMC] = &snb_uncore_imc, + NULL, +}; + +static DEFINE_PCI_DEVICE_TABLE(snb_uncore_pci_ids) = { + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SNB_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* end: all zeroes */ }, +}; + +static DEFINE_PCI_DEVICE_TABLE(ivb_uncore_pci_ids) = { + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* end: all zeroes */ }, +}; + +static DEFINE_PCI_DEVICE_TABLE(hsw_uncore_pci_ids) = { + { /* IMC */ 
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* end: all zeroes */ }, +}; + +static struct pci_driver snb_uncore_pci_driver = { + .name = "snb_uncore", + .id_table = snb_uncore_pci_ids, +}; + +static struct pci_driver ivb_uncore_pci_driver = { + .name = "ivb_uncore", + .id_table = ivb_uncore_pci_ids, +}; + +static struct pci_driver hsw_uncore_pci_driver = { + .name = "hsw_uncore", + .id_table = hsw_uncore_pci_ids, +}; + +/* end of Sandy Bridge uncore support */ + +/* Nehalem uncore support */ +static void nhm_uncore_msr_disable_box(struct intel_uncore_box *box) +{ + wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, 0); +} + +static void nhm_uncore_msr_enable_box(struct intel_uncore_box *box) +{ + wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, NHM_UNC_GLOBAL_CTL_EN_PC_ALL | NHM_UNC_GLOBAL_CTL_EN_FC); +} + +static void nhm_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (hwc->idx < UNCORE_PMC_IDX_FIXED) + wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN); + else + wrmsrl(hwc->config_base, NHM_UNC_FIXED_CTR_CTL_EN); +} + +static struct attribute *nhm_uncore_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_cmask8.attr, + NULL, +}; + +static struct attribute_group nhm_uncore_format_group = { + .name = "format", + .attrs = nhm_uncore_formats_attr, +}; + +static struct uncore_event_desc nhm_uncore_events[] = { + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"), + INTEL_UNCORE_EVENT_DESC(qmc_writes_full_any, "event=0x2f,umask=0x0f"), + INTEL_UNCORE_EVENT_DESC(qmc_normal_reads_any, "event=0x2c,umask=0x0f"), + INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_reads, "event=0x20,umask=0x01"), + INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_writes, "event=0x20,umask=0x02"), + INTEL_UNCORE_EVENT_DESC(qhl_request_remote_reads, "event=0x20,umask=0x04"), + INTEL_UNCORE_EVENT_DESC(qhl_request_remote_writes, "event=0x20,umask=0x08"), + INTEL_UNCORE_EVENT_DESC(qhl_request_local_reads, "event=0x20,umask=0x10"), + INTEL_UNCORE_EVENT_DESC(qhl_request_local_writes, "event=0x20,umask=0x20"), + { /* end: all zeroes */ }, +}; + +static struct intel_uncore_ops nhm_uncore_msr_ops = { + .disable_box = nhm_uncore_msr_disable_box, + .enable_box = nhm_uncore_msr_enable_box, + .disable_event = snb_uncore_msr_disable_event, + .enable_event = nhm_uncore_msr_enable_event, + .read_counter = uncore_msr_read_counter, +}; + +static struct intel_uncore_type nhm_uncore = { + .name = "", + .num_counters = 8, + .num_boxes = 1, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .event_ctl = NHM_UNC_PERFEVTSEL0, + .perf_ctr = NHM_UNC_UNCORE_PMC0, + .fixed_ctr = NHM_UNC_FIXED_CTR, + .fixed_ctl = NHM_UNC_FIXED_CTR_CTRL, + .event_mask = NHM_UNC_RAW_EVENT_MASK, + .event_descs = nhm_uncore_events, + .ops = &nhm_uncore_msr_ops, + .format_group = &nhm_uncore_format_group, +}; + +static struct intel_uncore_type *nhm_msr_uncores[] = { + &nhm_uncore, + NULL, +}; +/* end of Nehalem uncore support */ + +/* Nehalem-EX uncore support */ +DEFINE_UNCORE_FORMAT_ATTR(event5, event, "config:1-5"); +DEFINE_UNCORE_FORMAT_ATTR(counter, counter, "config:6-7"); +DEFINE_UNCORE_FORMAT_ATTR(match, match, "config1:0-63"); +DEFINE_UNCORE_FORMAT_ATTR(mask, mask, "config2:0-63"); + +static void nhmex_uncore_msr_init_box(struct intel_uncore_box *box) +{ + wrmsrl(NHMEX_U_MSR_PMON_GLOBAL_CTL, NHMEX_U_PMON_GLOBAL_EN_ALL); +} + +static void 
nhmex_uncore_msr_disable_box(struct intel_uncore_box *box) +{ + unsigned msr = uncore_msr_box_ctl(box); + u64 config; + + if (msr) { + rdmsrl(msr, config); + config &= ~((1ULL << uncore_num_counters(box)) - 1); + /* WBox has a fixed counter */ + if (uncore_msr_fixed_ctl(box)) + config &= ~NHMEX_W_PMON_GLOBAL_FIXED_EN; + wrmsrl(msr, config); + } +} + +static void nhmex_uncore_msr_enable_box(struct intel_uncore_box *box) +{ + unsigned msr = uncore_msr_box_ctl(box); + u64 config; + + if (msr) { + rdmsrl(msr, config); + config |= (1ULL << uncore_num_counters(box)) - 1; + /* WBox has a fixed counter */ + if (uncore_msr_fixed_ctl(box)) + config |= NHMEX_W_PMON_GLOBAL_FIXED_EN; + wrmsrl(msr, config); + } +} + +static void nhmex_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + wrmsrl(event->hw.config_base, 0); +} + +static void nhmex_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (hwc->idx >= UNCORE_PMC_IDX_FIXED) + wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0); + else if (box->pmu->type->event_mask & NHMEX_PMON_CTL_EN_BIT0) + wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22); + else + wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0); +} + +#define NHMEX_UNCORE_OPS_COMMON_INIT() \ + .init_box = nhmex_uncore_msr_init_box, \ + .disable_box = nhmex_uncore_msr_disable_box, \ + .enable_box = nhmex_uncore_msr_enable_box, \ + .disable_event = nhmex_uncore_msr_disable_event, \ + .read_counter = uncore_msr_read_counter + +static struct intel_uncore_ops nhmex_uncore_ops = { + NHMEX_UNCORE_OPS_COMMON_INIT(), + .enable_event = nhmex_uncore_msr_enable_event, +}; + +static struct attribute *nhmex_uncore_ubox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_edge.attr, + NULL, +}; + +static struct attribute_group nhmex_uncore_ubox_format_group = { + .name = "format", + .attrs = nhmex_uncore_ubox_formats_attr, +}; + +static struct intel_uncore_type nhmex_uncore_ubox = { + .name = "ubox", + .num_counters = 1, + .num_boxes = 1, + .perf_ctr_bits = 48, + .event_ctl = NHMEX_U_MSR_PMON_EV_SEL, + .perf_ctr = NHMEX_U_MSR_PMON_CTR, + .event_mask = NHMEX_U_PMON_RAW_EVENT_MASK, + .box_ctl = NHMEX_U_MSR_PMON_GLOBAL_CTL, + .ops = &nhmex_uncore_ops, + .format_group = &nhmex_uncore_ubox_format_group +}; + +static struct attribute *nhmex_uncore_cbox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + NULL, +}; + +static struct attribute_group nhmex_uncore_cbox_format_group = { + .name = "format", + .attrs = nhmex_uncore_cbox_formats_attr, +}; + +/* msr offset for each instance of cbox */ +static unsigned nhmex_cbox_msr_offsets[] = { + 0x0, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, 0x240, 0x2c0, +}; + +static struct intel_uncore_type nhmex_uncore_cbox = { + .name = "cbox", + .num_counters = 6, + .num_boxes = 10, + .perf_ctr_bits = 48, + .event_ctl = NHMEX_C0_MSR_PMON_EV_SEL0, + .perf_ctr = NHMEX_C0_MSR_PMON_CTR0, + .event_mask = NHMEX_PMON_RAW_EVENT_MASK, + .box_ctl = NHMEX_C0_MSR_PMON_GLOBAL_CTL, + .msr_offsets = nhmex_cbox_msr_offsets, + .pair_ctr_ctl = 1, + .ops = &nhmex_uncore_ops, + .format_group = &nhmex_uncore_cbox_format_group +}; + +static struct uncore_event_desc nhmex_uncore_wbox_events[] = { + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0"), + { /* end: all zeroes */ }, +}; + +static struct intel_uncore_type nhmex_uncore_wbox = { + .name = 
"wbox", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .event_ctl = NHMEX_W_MSR_PMON_CNT0, + .perf_ctr = NHMEX_W_MSR_PMON_EVT_SEL0, + .fixed_ctr = NHMEX_W_MSR_PMON_FIXED_CTR, + .fixed_ctl = NHMEX_W_MSR_PMON_FIXED_CTL, + .event_mask = NHMEX_PMON_RAW_EVENT_MASK, + .box_ctl = NHMEX_W_MSR_GLOBAL_CTL, + .pair_ctr_ctl = 1, + .event_descs = nhmex_uncore_wbox_events, + .ops = &nhmex_uncore_ops, + .format_group = &nhmex_uncore_cbox_format_group +}; + +static int nhmex_bbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + int ctr, ev_sel; + + ctr = (hwc->config & NHMEX_B_PMON_CTR_MASK) >> + NHMEX_B_PMON_CTR_SHIFT; + ev_sel = (hwc->config & NHMEX_B_PMON_CTL_EV_SEL_MASK) >> + NHMEX_B_PMON_CTL_EV_SEL_SHIFT; + + /* events that do not use the match/mask registers */ + if ((ctr == 0 && ev_sel > 0x3) || (ctr == 1 && ev_sel > 0x6) || + (ctr == 2 && ev_sel != 0x4) || ctr == 3) + return 0; + + if (box->pmu->pmu_idx == 0) + reg1->reg = NHMEX_B0_MSR_MATCH; + else + reg1->reg = NHMEX_B1_MSR_MATCH; + reg1->idx = 0; + reg1->config = event->attr.config1; + reg2->config = event->attr.config2; + return 0; +} + +static void nhmex_bbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + + if (reg1->idx != EXTRA_REG_NONE) { + wrmsrl(reg1->reg, reg1->config); + wrmsrl(reg1->reg + 1, reg2->config); + } + wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 | + (hwc->config & NHMEX_B_PMON_CTL_EV_SEL_MASK)); +} + +/* + * The Bbox has 4 counters, but each counter monitors different events. + * Use bits 6-7 in the event config to select counter. 
+ */ +static struct event_constraint nhmex_uncore_bbox_constraints[] = { + EVENT_CONSTRAINT(0 , 1, 0xc0), + EVENT_CONSTRAINT(0x40, 2, 0xc0), + EVENT_CONSTRAINT(0x80, 4, 0xc0), + EVENT_CONSTRAINT(0xc0, 8, 0xc0), + EVENT_CONSTRAINT_END, +}; + +static struct attribute *nhmex_uncore_bbox_formats_attr[] = { + &format_attr_event5.attr, + &format_attr_counter.attr, + &format_attr_match.attr, + &format_attr_mask.attr, + NULL, +}; + +static struct attribute_group nhmex_uncore_bbox_format_group = { + .name = "format", + .attrs = nhmex_uncore_bbox_formats_attr, +}; + +static struct intel_uncore_ops nhmex_uncore_bbox_ops = { + NHMEX_UNCORE_OPS_COMMON_INIT(), + .enable_event = nhmex_bbox_msr_enable_event, + .hw_config = nhmex_bbox_hw_config, + .get_constraint = uncore_get_constraint, + .put_constraint = uncore_put_constraint, +}; + +static struct intel_uncore_type nhmex_uncore_bbox = { + .name = "bbox", + .num_counters = 4, + .num_boxes = 2, + .perf_ctr_bits = 48, + .event_ctl = NHMEX_B0_MSR_PMON_CTL0, + .perf_ctr = NHMEX_B0_MSR_PMON_CTR0, + .event_mask = NHMEX_B_PMON_RAW_EVENT_MASK, + .box_ctl = NHMEX_B0_MSR_PMON_GLOBAL_CTL, + .msr_offset = NHMEX_B_MSR_OFFSET, + .pair_ctr_ctl = 1, + .num_shared_regs = 1, + .constraints = nhmex_uncore_bbox_constraints, + .ops = &nhmex_uncore_bbox_ops, + .format_group = &nhmex_uncore_bbox_format_group +}; + +static int nhmex_sbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + + /* only TO_R_PROG_EV event uses the match/mask register */ + if ((hwc->config & NHMEX_PMON_CTL_EV_SEL_MASK) != + NHMEX_S_EVENT_TO_R_PROG_EV) + return 0; + + if (box->pmu->pmu_idx == 0) + reg1->reg = NHMEX_S0_MSR_MM_CFG; + else + reg1->reg = NHMEX_S1_MSR_MM_CFG; + reg1->idx = 0; + reg1->config = event->attr.config1; + reg2->config = event->attr.config2; + return 0; +} + +static void nhmex_sbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + + if (reg1->idx != EXTRA_REG_NONE) { + wrmsrl(reg1->reg, 0); + wrmsrl(reg1->reg + 1, reg1->config); + wrmsrl(reg1->reg + 2, reg2->config); + wrmsrl(reg1->reg, NHMEX_S_PMON_MM_CFG_EN); + } + wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22); +} + +static struct attribute *nhmex_uncore_sbox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + &format_attr_match.attr, + &format_attr_mask.attr, + NULL, +}; + +static struct attribute_group nhmex_uncore_sbox_format_group = { + .name = "format", + .attrs = nhmex_uncore_sbox_formats_attr, +}; + +static struct intel_uncore_ops nhmex_uncore_sbox_ops = { + NHMEX_UNCORE_OPS_COMMON_INIT(), + .enable_event = nhmex_sbox_msr_enable_event, + .hw_config = nhmex_sbox_hw_config, + .get_constraint = uncore_get_constraint, + .put_constraint = uncore_put_constraint, +}; + +static struct intel_uncore_type nhmex_uncore_sbox = { + .name = "sbox", + .num_counters = 4, + .num_boxes = 2, + .perf_ctr_bits = 48, + .event_ctl = NHMEX_S0_MSR_PMON_CTL0, + .perf_ctr = NHMEX_S0_MSR_PMON_CTR0, + .event_mask = NHMEX_PMON_RAW_EVENT_MASK, + .box_ctl = NHMEX_S0_MSR_PMON_GLOBAL_CTL, + .msr_offset = NHMEX_S_MSR_OFFSET, + .pair_ctr_ctl = 1, + .num_shared_regs = 1, + .ops = 
&nhmex_uncore_sbox_ops, + .format_group = &nhmex_uncore_sbox_format_group +}; + +enum { + EXTRA_REG_NHMEX_M_FILTER, + EXTRA_REG_NHMEX_M_DSP, + EXTRA_REG_NHMEX_M_ISS, + EXTRA_REG_NHMEX_M_MAP, + EXTRA_REG_NHMEX_M_MSC_THR, + EXTRA_REG_NHMEX_M_PGT, + EXTRA_REG_NHMEX_M_PLD, + EXTRA_REG_NHMEX_M_ZDP_CTL_FVC, +}; + +static struct extra_reg nhmex_uncore_mbox_extra_regs[] = { + MBOX_INC_SEL_EXTAR_REG(0x0, DSP), + MBOX_INC_SEL_EXTAR_REG(0x4, MSC_THR), + MBOX_INC_SEL_EXTAR_REG(0x5, MSC_THR), + MBOX_INC_SEL_EXTAR_REG(0x9, ISS), + /* event 0xa uses two extra registers */ + MBOX_INC_SEL_EXTAR_REG(0xa, ISS), + MBOX_INC_SEL_EXTAR_REG(0xa, PLD), + MBOX_INC_SEL_EXTAR_REG(0xb, PLD), + /* events 0xd ~ 0x10 use the same extra register */ + MBOX_INC_SEL_EXTAR_REG(0xd, ZDP_CTL_FVC), + MBOX_INC_SEL_EXTAR_REG(0xe, ZDP_CTL_FVC), + MBOX_INC_SEL_EXTAR_REG(0xf, ZDP_CTL_FVC), + MBOX_INC_SEL_EXTAR_REG(0x10, ZDP_CTL_FVC), + MBOX_INC_SEL_EXTAR_REG(0x16, PGT), + MBOX_SET_FLAG_SEL_EXTRA_REG(0x0, DSP), + MBOX_SET_FLAG_SEL_EXTRA_REG(0x1, ISS), + MBOX_SET_FLAG_SEL_EXTRA_REG(0x5, PGT), + MBOX_SET_FLAG_SEL_EXTRA_REG(0x6, MAP), + EVENT_EXTRA_END +}; + +/* Nehalem-EX or Westmere-EX ? */ +static bool uncore_nhmex; + +static bool nhmex_mbox_get_shared_reg(struct intel_uncore_box *box, int idx, u64 config) +{ + struct intel_uncore_extra_reg *er; + unsigned long flags; + bool ret = false; + u64 mask; + + if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) { + er = &box->shared_regs[idx]; + raw_spin_lock_irqsave(&er->lock, flags); + if (!atomic_read(&er->ref) || er->config == config) { + atomic_inc(&er->ref); + er->config = config; + ret = true; + } + raw_spin_unlock_irqrestore(&er->lock, flags); + + return ret; + } + /* + * The ZDP_CTL_FVC MSR has 4 fields which are used to control + * events 0xd ~ 0x10. Besides these 4 fields, there are additional + * fields which are shared. 
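+ * The shared bits must therefore match the value already programmed,
+ * while each of the 4 event fields is reference-counted in its own
+ * 8-bit slot of er->ref.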
+ */
+	idx -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
+	if (WARN_ON_ONCE(idx >= 4))
+		return false;
+
+	/* mask of the shared fields */
+	if (uncore_nhmex)
+		mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK;
+	else
+		mask = WSMEX_M_PMON_ZDP_CTL_FVC_MASK;
+	er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC];
+
+	raw_spin_lock_irqsave(&er->lock, flags);
+	/* add mask of the non-shared field if it's in use */
+	if (__BITS_VALUE(atomic_read(&er->ref), idx, 8)) {
+		if (uncore_nhmex)
+			mask |= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+		else
+			mask |= WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+	}
+
+	if (!atomic_read(&er->ref) || !((er->config ^ config) & mask)) {
+		atomic_add(1 << (idx * 8), &er->ref);
+		if (uncore_nhmex)
+			mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK |
+				NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+		else
+			mask = WSMEX_M_PMON_ZDP_CTL_FVC_MASK |
+				WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+		er->config &= ~mask;
+		er->config |= (config & mask);
+		ret = true;
+	}
+	raw_spin_unlock_irqrestore(&er->lock, flags);
+
+	return ret;
+}
+
+static void nhmex_mbox_put_shared_reg(struct intel_uncore_box *box, int idx)
+{
+	struct intel_uncore_extra_reg *er;
+
+	if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) {
+		er = &box->shared_regs[idx];
+		atomic_dec(&er->ref);
+		return;
+	}
+
+	idx -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
+	er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC];
+	atomic_sub(1 << (idx * 8), &er->ref);
+}
+
+static u64 nhmex_mbox_alter_er(struct perf_event *event, int new_idx, bool modify)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+	u64 idx, orig_idx = __BITS_VALUE(reg1->idx, 0, 8);
+	u64 config = reg1->config;
+
+	/* get the non-shared control bits and shift them */
+	idx = orig_idx - EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
+	if (uncore_nhmex)
+		config &= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+	else
+		config &= WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+	if (new_idx > orig_idx) {
+		idx = new_idx - orig_idx;
+		config <<= 3 * idx;
+	} else {
+		idx = orig_idx - new_idx;
+		config >>= 3 * idx;
+	}
+
+	/* add the shared control bits back */
+	if (uncore_nhmex)
+		config |= NHMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config;
+	else
+		config |= WSMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config;
+	if (modify) {
+		/* adjust the main event selector */
+		if (new_idx > orig_idx)
+			hwc->config += idx << NHMEX_M_PMON_CTL_INC_SEL_SHIFT;
+		else
+			hwc->config -= idx << NHMEX_M_PMON_CTL_INC_SEL_SHIFT;
+		reg1->config = config;
+		reg1->idx = ~0xff | new_idx;
+	}
+	return config;
+}
+
+static struct event_constraint *
+nhmex_mbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+	struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
+	int i, idx[2], alloc = 0;
+	u64 config1 = reg1->config;
+
+	idx[0] = __BITS_VALUE(reg1->idx, 0, 8);
+	idx[1] = __BITS_VALUE(reg1->idx, 1, 8);
+again:
+	for (i = 0; i < 2; i++) {
+		if (!uncore_box_is_fake(box) && (reg1->alloc & (0x1 << i)))
+			idx[i] = 0xff;
+
+		if (idx[i] == 0xff)
+			continue;
+
+		if (!nhmex_mbox_get_shared_reg(box, idx[i],
+				__BITS_VALUE(config1, i, 32)))
+			goto fail;
+		alloc |= (0x1 << i);
+	}
+
+	/* for the match/mask registers */
+	if (reg2->idx != EXTRA_REG_NONE &&
+	    (uncore_box_is_fake(box) || !reg2->alloc) &&
+	    !nhmex_mbox_get_shared_reg(box, reg2->idx, reg2->config))
+		goto fail;
+
+	/*
+	 * If it's a fake box -- as per validate_{group,event}() we
+	 * shouldn't touch event state and we can avoid doing so
+	 * since both will only call get_event_constraints() once
+	 * on each event; this avoids the need for reg->alloc.
+	 */
+	if (!uncore_box_is_fake(box)) {
+		if (idx[0] != 0xff && idx[0] != __BITS_VALUE(reg1->idx, 0, 8))
+			nhmex_mbox_alter_er(event, idx[0], true);
+		reg1->alloc |= alloc;
+		if (reg2->idx != EXTRA_REG_NONE)
+			reg2->alloc = 1;
+	}
+	return NULL;
+fail:
+	if (idx[0] != 0xff && !(alloc & 0x1) &&
+	    idx[0] >= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) {
+		/*
+		 * events 0xd ~ 0x10 are functionally identical, but are
+		 * controlled by different fields in the ZDP_CTL_FVC
+		 * register. If we failed to take one field, try the
+		 * remaining 3 choices.
+		 */
+		BUG_ON(__BITS_VALUE(reg1->idx, 1, 8) != 0xff);
+		idx[0] -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
+		idx[0] = (idx[0] + 1) % 4;
+		idx[0] += EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
+		if (idx[0] != __BITS_VALUE(reg1->idx, 0, 8)) {
+			config1 = nhmex_mbox_alter_er(event, idx[0], false);
+			goto again;
+		}
+	}
+
+	if (alloc & 0x1)
+		nhmex_mbox_put_shared_reg(box, idx[0]);
+	if (alloc & 0x2)
+		nhmex_mbox_put_shared_reg(box, idx[1]);
+	return &constraint_empty;
+}
+
+static void nhmex_mbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+	struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
+
+	if (uncore_box_is_fake(box))
+		return;
+
+	if (reg1->alloc & 0x1)
+		nhmex_mbox_put_shared_reg(box, __BITS_VALUE(reg1->idx, 0, 8));
+	if (reg1->alloc & 0x2)
+		nhmex_mbox_put_shared_reg(box, __BITS_VALUE(reg1->idx, 1, 8));
+	reg1->alloc = 0;
+
+	if (reg2->alloc) {
+		nhmex_mbox_put_shared_reg(box, reg2->idx);
+		reg2->alloc = 0;
+	}
+}
+
+static int nhmex_mbox_extra_reg_idx(struct extra_reg *er)
+{
+	if (er->idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC)
+		return er->idx;
+	return er->idx + (er->event >> NHMEX_M_PMON_CTL_INC_SEL_SHIFT) - 0xd;
+}
+
+static int nhmex_mbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct intel_uncore_type *type = box->pmu->type;
+	struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+	struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
+	struct extra_reg *er;
+	unsigned msr;
+	int reg_idx = 0;
+	/*
+	 * The mbox events may require at most 2 extra MSRs. But only
+	 * the lower 32 bits in these MSRs are significant, so we can use
+	 * config1 to pass two MSRs' config.
+	 */
+	for (er = nhmex_uncore_mbox_extra_regs; er->msr; er++) {
+		if (er->event != (event->hw.config & er->config_mask))
+			continue;
+		if (event->attr.config1 & ~er->valid_mask)
+			return -EINVAL;
+
+		msr = er->msr + type->msr_offset * box->pmu->pmu_idx;
+		if (WARN_ON_ONCE(msr >= 0xffff || er->idx >= 0xff))
+			return -EINVAL;
+
+		/* always use the 32~63 bits to pass the PLD config */
+		if (er->idx == EXTRA_REG_NHMEX_M_PLD)
+			reg_idx = 1;
+		else if (WARN_ON_ONCE(reg_idx > 0))
+			return -EINVAL;
+
+		reg1->idx &= ~(0xff << (reg_idx * 8));
+		reg1->reg &= ~(0xffff << (reg_idx * 16));
+		reg1->idx |= nhmex_mbox_extra_reg_idx(er) << (reg_idx * 8);
+		reg1->reg |= msr << (reg_idx * 16);
+		reg1->config = event->attr.config1;
+		reg_idx++;
+	}
+	/*
+	 * The mbox only provides the ability to perform address matching
+	 * for the PLD events.
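+ * If config2 does not have NHMEX_M_PMON_MM_CFG_EN set, the ~0ULL
+ * sentinel is stored and enable_event() leaves matching disabled.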
+ */ + if (reg_idx == 2) { + reg2->idx = EXTRA_REG_NHMEX_M_FILTER; + if (event->attr.config2 & NHMEX_M_PMON_MM_CFG_EN) + reg2->config = event->attr.config2; + else + reg2->config = ~0ULL; + if (box->pmu->pmu_idx == 0) + reg2->reg = NHMEX_M0_MSR_PMU_MM_CFG; + else + reg2->reg = NHMEX_M1_MSR_PMU_MM_CFG; + } + return 0; +} + +static u64 nhmex_mbox_shared_reg_config(struct intel_uncore_box *box, int idx) +{ + struct intel_uncore_extra_reg *er; + unsigned long flags; + u64 config; + + if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) + return box->shared_regs[idx].config; + + er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC]; + raw_spin_lock_irqsave(&er->lock, flags); + config = er->config; + raw_spin_unlock_irqrestore(&er->lock, flags); + return config; +} + +static void nhmex_mbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + int idx; + + idx = __BITS_VALUE(reg1->idx, 0, 8); + if (idx != 0xff) + wrmsrl(__BITS_VALUE(reg1->reg, 0, 16), + nhmex_mbox_shared_reg_config(box, idx)); + idx = __BITS_VALUE(reg1->idx, 1, 8); + if (idx != 0xff) + wrmsrl(__BITS_VALUE(reg1->reg, 1, 16), + nhmex_mbox_shared_reg_config(box, idx)); + + if (reg2->idx != EXTRA_REG_NONE) { + wrmsrl(reg2->reg, 0); + if (reg2->config != ~0ULL) { + wrmsrl(reg2->reg + 1, + reg2->config & NHMEX_M_PMON_ADDR_MATCH_MASK); + wrmsrl(reg2->reg + 2, NHMEX_M_PMON_ADDR_MASK_MASK & + (reg2->config >> NHMEX_M_PMON_ADDR_MASK_SHIFT)); + wrmsrl(reg2->reg, NHMEX_M_PMON_MM_CFG_EN); + } + } + + wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0); +} + +DEFINE_UNCORE_FORMAT_ATTR(count_mode, count_mode, "config:2-3"); +DEFINE_UNCORE_FORMAT_ATTR(storage_mode, storage_mode, "config:4-5"); +DEFINE_UNCORE_FORMAT_ATTR(wrap_mode, wrap_mode, "config:6"); +DEFINE_UNCORE_FORMAT_ATTR(flag_mode, flag_mode, "config:7"); +DEFINE_UNCORE_FORMAT_ATTR(inc_sel, inc_sel, "config:9-13"); +DEFINE_UNCORE_FORMAT_ATTR(set_flag_sel, set_flag_sel, "config:19-21"); +DEFINE_UNCORE_FORMAT_ATTR(filter_cfg_en, filter_cfg_en, "config2:63"); +DEFINE_UNCORE_FORMAT_ATTR(filter_match, filter_match, "config2:0-33"); +DEFINE_UNCORE_FORMAT_ATTR(filter_mask, filter_mask, "config2:34-61"); +DEFINE_UNCORE_FORMAT_ATTR(dsp, dsp, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(thr, thr, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(fvc, fvc, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(pgt, pgt, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(map, map, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(iss, iss, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(pld, pld, "config1:32-63"); + +static struct attribute *nhmex_uncore_mbox_formats_attr[] = { + &format_attr_count_mode.attr, + &format_attr_storage_mode.attr, + &format_attr_wrap_mode.attr, + &format_attr_flag_mode.attr, + &format_attr_inc_sel.attr, + &format_attr_set_flag_sel.attr, + &format_attr_filter_cfg_en.attr, + &format_attr_filter_match.attr, + &format_attr_filter_mask.attr, + &format_attr_dsp.attr, + &format_attr_thr.attr, + &format_attr_fvc.attr, + &format_attr_pgt.attr, + &format_attr_map.attr, + &format_attr_iss.attr, + &format_attr_pld.attr, + NULL, +}; + +static struct attribute_group nhmex_uncore_mbox_format_group = { + .name = "format", + .attrs = nhmex_uncore_mbox_formats_attr, +}; + +static struct uncore_event_desc nhmex_uncore_mbox_events[] = { + INTEL_UNCORE_EVENT_DESC(bbox_cmds_read, "inc_sel=0xd,fvc=0x2800"), + INTEL_UNCORE_EVENT_DESC(bbox_cmds_write, 
"inc_sel=0xd,fvc=0x2820"), + { /* end: all zeroes */ }, +}; + +static struct uncore_event_desc wsmex_uncore_mbox_events[] = { + INTEL_UNCORE_EVENT_DESC(bbox_cmds_read, "inc_sel=0xd,fvc=0x5000"), + INTEL_UNCORE_EVENT_DESC(bbox_cmds_write, "inc_sel=0xd,fvc=0x5040"), + { /* end: all zeroes */ }, +}; + +static struct intel_uncore_ops nhmex_uncore_mbox_ops = { + NHMEX_UNCORE_OPS_COMMON_INIT(), + .enable_event = nhmex_mbox_msr_enable_event, + .hw_config = nhmex_mbox_hw_config, + .get_constraint = nhmex_mbox_get_constraint, + .put_constraint = nhmex_mbox_put_constraint, +}; + +static struct intel_uncore_type nhmex_uncore_mbox = { + .name = "mbox", + .num_counters = 6, + .num_boxes = 2, + .perf_ctr_bits = 48, + .event_ctl = NHMEX_M0_MSR_PMU_CTL0, + .perf_ctr = NHMEX_M0_MSR_PMU_CNT0, + .event_mask = NHMEX_M_PMON_RAW_EVENT_MASK, + .box_ctl = NHMEX_M0_MSR_GLOBAL_CTL, + .msr_offset = NHMEX_M_MSR_OFFSET, + .pair_ctr_ctl = 1, + .num_shared_regs = 8, + .event_descs = nhmex_uncore_mbox_events, + .ops = &nhmex_uncore_mbox_ops, + .format_group = &nhmex_uncore_mbox_format_group, +}; + +static void nhmex_rbox_alter_er(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + + /* adjust the main event selector and extra register index */ + if (reg1->idx % 2) { + reg1->idx--; + hwc->config -= 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT; + } else { + reg1->idx++; + hwc->config += 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT; + } + + /* adjust extra register config */ + switch (reg1->idx % 6) { + case 2: + /* shift the 8~15 bits to the 0~7 bits */ + reg1->config >>= 8; + break; + case 3: + /* shift the 0~7 bits to the 8~15 bits */ + reg1->config <<= 8; + break; + }; +} + +/* + * Each rbox has 4 event set which monitor PQI port 0~3 or 4~7. + * An event set consists of 6 events, the 3rd and 4th events in + * an event set use the same extra register. So an event set uses + * 5 extra registers. + */ +static struct event_constraint * +nhmex_rbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + struct intel_uncore_extra_reg *er; + unsigned long flags; + int idx, er_idx; + u64 config1; + bool ok = false; + + if (!uncore_box_is_fake(box) && reg1->alloc) + return NULL; + + idx = reg1->idx % 6; + config1 = reg1->config; +again: + er_idx = idx; + /* the 3rd and 4th events use the same extra register */ + if (er_idx > 2) + er_idx--; + er_idx += (reg1->idx / 6) * 5; + + er = &box->shared_regs[er_idx]; + raw_spin_lock_irqsave(&er->lock, flags); + if (idx < 2) { + if (!atomic_read(&er->ref) || er->config == reg1->config) { + atomic_inc(&er->ref); + er->config = reg1->config; + ok = true; + } + } else if (idx == 2 || idx == 3) { + /* + * these two events use different fields in a extra register, + * the 0~7 bits and the 8~15 bits respectively. 
+ */ + u64 mask = 0xff << ((idx - 2) * 8); + if (!__BITS_VALUE(atomic_read(&er->ref), idx - 2, 8) || + !((er->config ^ config1) & mask)) { + atomic_add(1 << ((idx - 2) * 8), &er->ref); + er->config &= ~mask; + er->config |= config1 & mask; + ok = true; + } + } else { + if (!atomic_read(&er->ref) || + (er->config == (hwc->config >> 32) && + er->config1 == reg1->config && + er->config2 == reg2->config)) { + atomic_inc(&er->ref); + er->config = (hwc->config >> 32); + er->config1 = reg1->config; + er->config2 = reg2->config; + ok = true; + } + } + raw_spin_unlock_irqrestore(&er->lock, flags); + + if (!ok) { + /* + * The Rbox events are always in pairs. The paired + * events are functionally identical, but use different + * extra registers. If we failed to take an extra + * register, try the alternative. + */ + if (idx % 2) + idx--; + else + idx++; + if (idx != reg1->idx % 6) { + if (idx == 2) + config1 >>= 8; + else if (idx == 3) + config1 <<= 8; + goto again; + } + } else { + if (!uncore_box_is_fake(box)) { + if (idx != reg1->idx % 6) + nhmex_rbox_alter_er(box, event); + reg1->alloc = 1; + } + return NULL; + } + return &constraint_empty; +} + +static void nhmex_rbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + struct intel_uncore_extra_reg *er; + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + int idx, er_idx; + + if (uncore_box_is_fake(box) || !reg1->alloc) + return; + + idx = reg1->idx % 6; + er_idx = idx; + if (er_idx > 2) + er_idx--; + er_idx += (reg1->idx / 6) * 5; + + er = &box->shared_regs[er_idx]; + if (idx == 2 || idx == 3) + atomic_sub(1 << ((idx - 2) * 8), &er->ref); + else + atomic_dec(&er->ref); + + reg1->alloc = 0; +} + +static int nhmex_rbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct hw_perf_event_extra *reg2 = &event->hw.branch_reg; + int idx; + + idx = (event->hw.config & NHMEX_R_PMON_CTL_EV_SEL_MASK) >> + NHMEX_R_PMON_CTL_EV_SEL_SHIFT; + if (idx >= 0x18) + return -EINVAL; + + reg1->idx = idx; + reg1->config = event->attr.config1; + + switch (idx % 6) { + case 4: + case 5: + hwc->config |= event->attr.config & (~0ULL << 32); + reg2->config = event->attr.config2; + break; + } + return 0; +} + +static void nhmex_rbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + int idx, port; + + idx = reg1->idx; + port = idx / 6 + box->pmu->pmu_idx * 4; + + switch (idx % 6) { + case 0: + wrmsrl(NHMEX_R_MSR_PORTN_IPERF_CFG0(port), reg1->config); + break; + case 1: + wrmsrl(NHMEX_R_MSR_PORTN_IPERF_CFG1(port), reg1->config); + break; + case 2: + case 3: + wrmsrl(NHMEX_R_MSR_PORTN_QLX_CFG(port), + uncore_shared_reg_config(box, 2 + (idx / 6) * 5)); + break; + case 4: + wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(port), + hwc->config >> 32); + wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MATCH(port), reg1->config); + wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MASK(port), reg2->config); + break; + case 5: + wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(port), + hwc->config >> 32); + wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MATCH(port), reg1->config); + wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MASK(port), reg2->config); + break; + } + + wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 | + (hwc->config & NHMEX_R_PMON_CTL_EV_SEL_MASK)); +} + +DEFINE_UNCORE_FORMAT_ATTR(xbr_mm_cfg,
xbr_mm_cfg, "config:32-63"); +DEFINE_UNCORE_FORMAT_ATTR(xbr_match, xbr_match, "config1:0-63"); +DEFINE_UNCORE_FORMAT_ATTR(xbr_mask, xbr_mask, "config2:0-63"); +DEFINE_UNCORE_FORMAT_ATTR(qlx_cfg, qlx_cfg, "config1:0-15"); +DEFINE_UNCORE_FORMAT_ATTR(iperf_cfg, iperf_cfg, "config1:0-31"); + +static struct attribute *nhmex_uncore_rbox_formats_attr[] = { + &format_attr_event5.attr, + &format_attr_xbr_mm_cfg.attr, + &format_attr_xbr_match.attr, + &format_attr_xbr_mask.attr, + &format_attr_qlx_cfg.attr, + &format_attr_iperf_cfg.attr, + NULL, +}; + +static struct attribute_group nhmex_uncore_rbox_format_group = { + .name = "format", + .attrs = nhmex_uncore_rbox_formats_attr, +}; + +static struct uncore_event_desc nhmex_uncore_rbox_events[] = { + INTEL_UNCORE_EVENT_DESC(qpi0_flit_send, "event=0x0,iperf_cfg=0x80000000"), + INTEL_UNCORE_EVENT_DESC(qpi1_filt_send, "event=0x6,iperf_cfg=0x80000000"), + INTEL_UNCORE_EVENT_DESC(qpi0_idle_filt, "event=0x0,iperf_cfg=0x40000000"), + INTEL_UNCORE_EVENT_DESC(qpi1_idle_filt, "event=0x6,iperf_cfg=0x40000000"), + INTEL_UNCORE_EVENT_DESC(qpi0_date_response, "event=0x0,iperf_cfg=0xc4"), + INTEL_UNCORE_EVENT_DESC(qpi1_date_response, "event=0x6,iperf_cfg=0xc4"), + { /* end: all zeroes */ }, +}; + +static struct intel_uncore_ops nhmex_uncore_rbox_ops = { + NHMEX_UNCORE_OPS_COMMON_INIT(), + .enable_event = nhmex_rbox_msr_enable_event, + .hw_config = nhmex_rbox_hw_config, + .get_constraint = nhmex_rbox_get_constraint, + .put_constraint = nhmex_rbox_put_constraint, +}; + +static struct intel_uncore_type nhmex_uncore_rbox = { + .name = "rbox", + .num_counters = 8, + .num_boxes = 2, + .perf_ctr_bits = 48, + .event_ctl = NHMEX_R_MSR_PMON_CTL0, + .perf_ctr = NHMEX_R_MSR_PMON_CNT0, + .event_mask = NHMEX_R_PMON_RAW_EVENT_MASK, + .box_ctl = NHMEX_R_MSR_GLOBAL_CTL, + .msr_offset = NHMEX_R_MSR_OFFSET, + .pair_ctr_ctl = 1, + .num_shared_regs = 20, + .event_descs = nhmex_uncore_rbox_events, + .ops = &nhmex_uncore_rbox_ops, + .format_group = &nhmex_uncore_rbox_format_group +}; + +static struct intel_uncore_type *nhmex_msr_uncores[] = { + &nhmex_uncore_ubox, + &nhmex_uncore_cbox, + &nhmex_uncore_bbox, + &nhmex_uncore_sbox, + &nhmex_uncore_mbox, + &nhmex_uncore_rbox, + &nhmex_uncore_wbox, + NULL, +}; +/* end of Nehalem-EX uncore support */ + +static void uncore_assign_hw_event(struct intel_uncore_box *box, struct perf_event *event, int idx) +{ + struct hw_perf_event *hwc = &event->hw; + + hwc->idx = idx; + hwc->last_tag = ++box->tags[idx]; + + if (hwc->idx == UNCORE_PMC_IDX_FIXED) { + hwc->event_base = uncore_fixed_ctr(box); + hwc->config_base = uncore_fixed_ctl(box); + return; + } + + hwc->config_base = uncore_event_ctl(box, hwc->idx); + hwc->event_base = uncore_perf_ctr(box, hwc->idx); +} + +static void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event) +{ + u64 prev_count, new_count, delta; + int shift; + + if (event->hw.idx >= UNCORE_PMC_IDX_FIXED) + shift = 64 - uncore_fixed_ctr_bits(box); + else + shift = 64 - uncore_perf_ctr_bits(box); + + /* the hrtimer might modify the previous event value */ +again: + prev_count = local64_read(&event->hw.prev_count); + new_count = uncore_read_counter(box, event); + if (local64_xchg(&event->hw.prev_count, new_count) != prev_count) + goto again; + + delta = (new_count << shift) - (prev_count << shift); + delta >>= shift; + + local64_add(delta, &event->count); +} + +/* + * The overflow interrupt is unavailable for SandyBridge-EP, is broken + * for SandyBridge. 
So we use hrtimer to periodically poll the counter + * to avoid overflow. + */ +static enum hrtimer_restart uncore_pmu_hrtimer(struct hrtimer *hrtimer) +{ + struct intel_uncore_box *box; + struct perf_event *event; + unsigned long flags; + int bit; + + box = container_of(hrtimer, struct intel_uncore_box, hrtimer); + if (!box->n_active || box->cpu != smp_processor_id()) + return HRTIMER_NORESTART; + /* + * disable local interrupts to prevent uncore_pmu_event_start/stop + * from interrupting the update process + */ + local_irq_save(flags); + + /* + * handle boxes with an active event list as opposed to active + * counters + */ + list_for_each_entry(event, &box->active_list, active_entry) { + uncore_perf_event_update(box, event); + } + + for_each_set_bit(bit, box->active_mask, UNCORE_PMC_IDX_MAX) + uncore_perf_event_update(box, box->events[bit]); + + local_irq_restore(flags); + + hrtimer_forward_now(hrtimer, ns_to_ktime(box->hrtimer_duration)); + return HRTIMER_RESTART; +} + +static void uncore_pmu_start_hrtimer(struct intel_uncore_box *box) +{ + __hrtimer_start_range_ns(&box->hrtimer, + ns_to_ktime(box->hrtimer_duration), 0, + HRTIMER_MODE_REL_PINNED, 0); +} + +static void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box) +{ + hrtimer_cancel(&box->hrtimer); +} + +static void uncore_pmu_init_hrtimer(struct intel_uncore_box *box) +{ + hrtimer_init(&box->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + box->hrtimer.function = uncore_pmu_hrtimer; +} + +static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, int node) +{ + struct intel_uncore_box *box; + int i, size; + + size = sizeof(*box) + type->num_shared_regs * sizeof(struct intel_uncore_extra_reg); + + box = kzalloc_node(size, GFP_KERNEL, node); + if (!box) + return NULL; + + for (i = 0; i < type->num_shared_regs; i++) + raw_spin_lock_init(&box->shared_regs[i].lock); + + uncore_pmu_init_hrtimer(box); + atomic_set(&box->refcnt, 1); + box->cpu = -1; + box->phys_id = -1; + + /* set default hrtimer timeout */ + box->hrtimer_duration = UNCORE_PMU_HRTIMER_INTERVAL; + + INIT_LIST_HEAD(&box->active_list); + + return box; +} + +static int +uncore_collect_events(struct intel_uncore_box *box, struct perf_event *leader, bool dogrp) +{ + struct perf_event *event; + int n, max_count; + + max_count = box->pmu->type->num_counters; + if (box->pmu->type->fixed_ctl) + max_count++; + + if (box->n_events >= max_count) + return -EINVAL; + + n = box->n_events; + box->event_list[n] = leader; + n++; + if (!dogrp) + return n; + + list_for_each_entry(event, &leader->sibling_list, group_entry) { + if (event->state <= PERF_EVENT_STATE_OFF) + continue; + + if (n >= max_count) + return -EINVAL; + + box->event_list[n] = event; + n++; + } + return n; +} + +static struct event_constraint * +uncore_get_event_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + struct intel_uncore_type *type = box->pmu->type; + struct event_constraint *c; + + if (type->ops->get_constraint) { + c = type->ops->get_constraint(box, event); + if (c) + return c; + } + + if (event->attr.config == UNCORE_FIXED_EVENT) + return &constraint_fixed; + + if (type->constraints) { + for_each_event_constraint(c, type->constraints) { + if ((event->hw.config & c->cmask) == c->code) + return c; + } + } + + return &type->unconstrainted; +} + +static void uncore_put_event_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + if (box->pmu->type->ops->put_constraint) + box->pmu->type->ops->put_constraint(box, event); +} + +static int
uncore_assign_events(struct intel_uncore_box *box, int assign[], int n) +{ + unsigned long used_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)]; + struct event_constraint *c; + int i, wmin, wmax, ret = 0; + struct hw_perf_event *hwc; + + bitmap_zero(used_mask, UNCORE_PMC_IDX_MAX); + + for (i = 0, wmin = UNCORE_PMC_IDX_MAX, wmax = 0; i < n; i++) { + hwc = &box->event_list[i]->hw; + c = uncore_get_event_constraint(box, box->event_list[i]); + hwc->constraint = c; + wmin = min(wmin, c->weight); + wmax = max(wmax, c->weight); + } + + /* fastpath, try to reuse previous register */ + for (i = 0; i < n; i++) { + hwc = &box->event_list[i]->hw; + c = hwc->constraint; + + /* never assigned */ + if (hwc->idx == -1) + break; + + /* constraint still honored */ + if (!test_bit(hwc->idx, c->idxmsk)) + break; + + /* not already used */ + if (test_bit(hwc->idx, used_mask)) + break; + + __set_bit(hwc->idx, used_mask); + if (assign) + assign[i] = hwc->idx; + } + /* slow path */ + if (i != n) + ret = perf_assign_events(box->event_list, n, + wmin, wmax, assign); + + if (!assign || ret) { + for (i = 0; i < n; i++) + uncore_put_event_constraint(box, box->event_list[i]); + } + return ret ? -EINVAL : 0; +} + +static void uncore_pmu_event_start(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + int idx = event->hw.idx; + + if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) + return; + + if (WARN_ON_ONCE(idx == -1 || idx >= UNCORE_PMC_IDX_MAX)) + return; + + event->hw.state = 0; + box->events[idx] = event; + box->n_active++; + __set_bit(idx, box->active_mask); + + local64_set(&event->hw.prev_count, uncore_read_counter(box, event)); + uncore_enable_event(box, event); + + if (box->n_active == 1) { + uncore_enable_box(box); + uncore_pmu_start_hrtimer(box); + } +} + +static void uncore_pmu_event_stop(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + struct hw_perf_event *hwc = &event->hw; + + if (__test_and_clear_bit(hwc->idx, box->active_mask)) { + uncore_disable_event(box, event); + box->n_active--; + box->events[hwc->idx] = NULL; + WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); + hwc->state |= PERF_HES_STOPPED; + + if (box->n_active == 0) { + uncore_disable_box(box); + uncore_pmu_cancel_hrtimer(box); + } + } + + if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { + /* + * Drain the remaining delta count out of an event + * that we are disabling: + */ + uncore_perf_event_update(box, event); + hwc->state |= PERF_HES_UPTODATE; + } +} + +static int uncore_pmu_event_add(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + struct hw_perf_event *hwc = &event->hw; + int assign[UNCORE_PMC_IDX_MAX]; + int i, n, ret; + + if (!box) + return -ENODEV; + + ret = n = uncore_collect_events(box, event, false); + if (ret < 0) + return ret; + + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + if (!(flags & PERF_EF_START)) + hwc->state |= PERF_HES_ARCH; + + ret = uncore_assign_events(box, assign, n); + if (ret) + return ret; + + /* save events moving to new counters */ + for (i = 0; i < box->n_events; i++) { + event = box->event_list[i]; + hwc = &event->hw; + + if (hwc->idx == assign[i] && + hwc->last_tag == box->tags[assign[i]]) + continue; + /* + * Ensure we don't accidentally enable a stopped + * counter simply because we rescheduled.
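+ * PERF_HES_ARCH doubles as a "do not restart" flag here: the + * reprogramming loop below leaves any event still carrying it + * stopped.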
+ */ + if (hwc->state & PERF_HES_STOPPED) + hwc->state |= PERF_HES_ARCH; + + uncore_pmu_event_stop(event, PERF_EF_UPDATE); + } + + /* reprogram moved events into new counters */ + for (i = 0; i < n; i++) { + event = box->event_list[i]; + hwc = &event->hw; + + if (hwc->idx != assign[i] || + hwc->last_tag != box->tags[assign[i]]) + uncore_assign_hw_event(box, event, assign[i]); + else if (i < box->n_events) + continue; + + if (hwc->state & PERF_HES_ARCH) + continue; + + uncore_pmu_event_start(event, 0); + } + box->n_events = n; + + return 0; +} + +static void uncore_pmu_event_del(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + int i; + + uncore_pmu_event_stop(event, PERF_EF_UPDATE); + + for (i = 0; i < box->n_events; i++) { + if (event == box->event_list[i]) { + uncore_put_event_constraint(box, event); + + while (++i < box->n_events) + box->event_list[i - 1] = box->event_list[i]; + + --box->n_events; + break; + } + } + + event->hw.idx = -1; + event->hw.last_tag = ~0ULL; +} + +static void uncore_pmu_event_read(struct perf_event *event) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + uncore_perf_event_update(box, event); +} + +/* + * validation ensures the group can be loaded onto the + * PMU if it was the only group available. + */ +static int uncore_validate_group(struct intel_uncore_pmu *pmu, + struct perf_event *event) +{ + struct perf_event *leader = event->group_leader; + struct intel_uncore_box *fake_box; + int ret = -EINVAL, n; + + fake_box = uncore_alloc_box(pmu->type, NUMA_NO_NODE); + if (!fake_box) + return -ENOMEM; + + fake_box->pmu = pmu; + /* + * the event is not yet connected with its + * siblings, therefore we must first collect + * existing siblings, then add the new event + * before we can simulate the scheduling + */ + n = uncore_collect_events(fake_box, leader, true); + if (n < 0) + goto out; + + fake_box->n_events = n; + n = uncore_collect_events(fake_box, event, false); + if (n < 0) + goto out; + + fake_box->n_events = n; + + ret = uncore_assign_events(fake_box, NULL, n); +out: + kfree(fake_box); + return ret; +} + +static int uncore_pmu_event_init(struct perf_event *event) +{ + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + struct hw_perf_event *hwc = &event->hw; + int ret; + + if (event->attr.type != event->pmu->type) + return -ENOENT; + + pmu = uncore_event_to_pmu(event); + /* no device found for this pmu */ + if (pmu->func_id < 0) + return -ENOENT; + + /* + * The uncore PMU measures at all privilege levels all the time, + * so it doesn't make sense to specify any exclude bits.
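+ * (The boxes count for the package as a whole; the expected usage + * is a system-wide "perf stat -a" style session, and any attempt + * at per-privilege filtering is rejected below.)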
+ */ + if (event->attr.exclude_user || event->attr.exclude_kernel || + event->attr.exclude_hv || event->attr.exclude_idle) + return -EINVAL; + + /* Sampling not supported yet */ + if (hwc->sample_period) + return -EINVAL; + + /* + * Place all uncore events for a particular physical package + * onto a single cpu + */ + if (event->cpu < 0) + return -EINVAL; + box = uncore_pmu_to_box(pmu, event->cpu); + if (!box || box->cpu < 0) + return -EINVAL; + event->cpu = box->cpu; + + event->hw.idx = -1; + event->hw.last_tag = ~0ULL; + event->hw.extra_reg.idx = EXTRA_REG_NONE; + event->hw.branch_reg.idx = EXTRA_REG_NONE; + + if (event->attr.config == UNCORE_FIXED_EVENT) { + /* no fixed counter */ + if (!pmu->type->fixed_ctl) + return -EINVAL; + /* + * if there is only one fixed counter, only the first pmu + * can access the fixed counter + */ + if (pmu->type->single_fixed && pmu->pmu_idx > 0) + return -EINVAL; + + /* fixed counters have event field hardcoded to zero */ + hwc->config = 0ULL; + } else { + hwc->config = event->attr.config & pmu->type->event_mask; + if (pmu->type->ops->hw_config) { + ret = pmu->type->ops->hw_config(box, event); + if (ret) + return ret; + } + } + + if (event->group_leader != event) + ret = uncore_validate_group(pmu, event); + else + ret = 0; + + return ret; +} + +static ssize_t uncore_get_attr_cpumask(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int n = cpulist_scnprintf(buf, PAGE_SIZE - 2, &uncore_cpu_mask); + + buf[n++] = '\n'; + buf[n] = '\0'; + return n; +} + +static DEVICE_ATTR(cpumask, S_IRUGO, uncore_get_attr_cpumask, NULL); + +static struct attribute *uncore_pmu_attrs[] = { + &dev_attr_cpumask.attr, + NULL, +}; + +static struct attribute_group uncore_pmu_attr_group = { + .attrs = uncore_pmu_attrs, +}; + +static int __init uncore_pmu_register(struct intel_uncore_pmu *pmu) +{ + int ret; + + if (!pmu->type->pmu) { + pmu->pmu = (struct pmu) { + .attr_groups = pmu->type->attr_groups, + .task_ctx_nr = perf_invalid_context, + .event_init = uncore_pmu_event_init, + .add = uncore_pmu_event_add, + .del = uncore_pmu_event_del, + .start = uncore_pmu_event_start, + .stop = uncore_pmu_event_stop, + .read = uncore_pmu_event_read, + }; + } else { + pmu->pmu = *pmu->type->pmu; + pmu->pmu.attr_groups = pmu->type->attr_groups; + } + + if (pmu->type->num_boxes == 1) { + if (strlen(pmu->type->name) > 0) + sprintf(pmu->name, "uncore_%s", pmu->type->name); + else + sprintf(pmu->name, "uncore"); + } else { + sprintf(pmu->name, "uncore_%s_%d", pmu->type->name, + pmu->pmu_idx); + } + + ret = perf_pmu_register(&pmu->pmu, pmu->name, -1); + return ret; +} + +static void __init uncore_type_exit(struct intel_uncore_type *type) +{ + int i; + + for (i = 0; i < type->num_boxes; i++) + free_percpu(type->pmus[i].box); + kfree(type->pmus); + type->pmus = NULL; + kfree(type->events_group); + type->events_group = NULL; +} + +static void __init uncore_types_exit(struct intel_uncore_type **types) +{ + int i; + for (i = 0; types[i]; i++) + uncore_type_exit(types[i]); +} + +static int __init uncore_type_init(struct intel_uncore_type *type) +{ + struct intel_uncore_pmu *pmus; + struct attribute_group *attr_group; + struct attribute **attrs; + int i, j; + + pmus = kzalloc(sizeof(*pmus) * type->num_boxes, GFP_KERNEL); + if (!pmus) + return -ENOMEM; + + type->pmus = pmus; + + type->unconstrainted = (struct event_constraint) + __EVENT_CONSTRAINT(0, (1ULL << type->num_counters) - 1, + 0, type->num_counters, 0, 0); + + for (i = 0; i < type->num_boxes; i++) { + pmus[i].func_id = -1; + 
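/* + * func_id stays -1 until real hardware shows up: the pci probe + * or the cpu prepare path fills it in, and event_init returns + * -ENOENT for any pmu whose func_id is still negative. + */ +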
pmus[i].pmu_idx = i; + pmus[i].type = type; + INIT_LIST_HEAD(&pmus[i].box_list); + pmus[i].box = alloc_percpu(struct intel_uncore_box *); + if (!pmus[i].box) + goto fail; + } + + if (type->event_descs) { + i = 0; + while (type->event_descs[i].attr.attr.name) + i++; + + attr_group = kzalloc(sizeof(struct attribute *) * (i + 1) + + sizeof(*attr_group), GFP_KERNEL); + if (!attr_group) + goto fail; + + attrs = (struct attribute **)(attr_group + 1); + attr_group->name = "events"; + attr_group->attrs = attrs; + + for (j = 0; j < i; j++) + attrs[j] = &type->event_descs[j].attr.attr; + + type->events_group = attr_group; + } + + type->pmu_group = &uncore_pmu_attr_group; + return 0; +fail: + uncore_type_exit(type); + return -ENOMEM; +} + +static int __init uncore_types_init(struct intel_uncore_type **types) +{ + int i, ret; + + for (i = 0; types[i]; i++) { + ret = uncore_type_init(types[i]); + if (ret) + goto fail; + } + return 0; +fail: + while (--i >= 0) + uncore_type_exit(types[i]); + return ret; +} + +static struct pci_driver *uncore_pci_driver; +static bool pcidrv_registered; + +/* + * add a pci uncore device + */ +static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) +{ + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + struct intel_uncore_type *type; + int phys_id; + + phys_id = pcibus_to_physid[pdev->bus->number]; + if (phys_id < 0) + return -ENODEV; + + if (UNCORE_PCI_DEV_TYPE(id->driver_data) == UNCORE_EXTRA_PCI_DEV) { + extra_pci_dev[phys_id][UNCORE_PCI_DEV_IDX(id->driver_data)] = pdev; + pci_set_drvdata(pdev, NULL); + return 0; + } + + type = pci_uncores[UNCORE_PCI_DEV_TYPE(id->driver_data)]; + box = uncore_alloc_box(type, NUMA_NO_NODE); + if (!box) + return -ENOMEM; + + /* + * for performance monitoring unit with multiple boxes, + * each box has a different function id. 
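+ * The devfn seen first is remembered in func_id; probing the same + * box index on another socket must observe the same devfn, which + * the WARN_ON_ONCE below double-checks.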
+ */ + pmu = &type->pmus[UNCORE_PCI_DEV_IDX(id->driver_data)]; + if (pmu->func_id < 0) + pmu->func_id = pdev->devfn; + else + WARN_ON_ONCE(pmu->func_id != pdev->devfn); + + box->phys_id = phys_id; + box->pci_dev = pdev; + box->pmu = pmu; + uncore_box_init(box); + pci_set_drvdata(pdev, box); + + raw_spin_lock(&uncore_box_lock); + list_add_tail(&box->list, &pmu->box_list); + raw_spin_unlock(&uncore_box_lock); + + return 0; +} + +static void uncore_pci_remove(struct pci_dev *pdev) +{ + struct intel_uncore_box *box = pci_get_drvdata(pdev); + struct intel_uncore_pmu *pmu; + int i, cpu, phys_id = pcibus_to_physid[pdev->bus->number]; + + box = pci_get_drvdata(pdev); + if (!box) { + for (i = 0; i < UNCORE_EXTRA_PCI_DEV_MAX; i++) { + if (extra_pci_dev[phys_id][i] == pdev) { + extra_pci_dev[phys_id][i] = NULL; + break; + } + } + WARN_ON_ONCE(i >= UNCORE_EXTRA_PCI_DEV_MAX); + return; + } + + pmu = box->pmu; + if (WARN_ON_ONCE(phys_id != box->phys_id)) + return; + + pci_set_drvdata(pdev, NULL); + + raw_spin_lock(&uncore_box_lock); + list_del(&box->list); + raw_spin_unlock(&uncore_box_lock); + + for_each_possible_cpu(cpu) { + if (*per_cpu_ptr(pmu->box, cpu) == box) { + *per_cpu_ptr(pmu->box, cpu) = NULL; + atomic_dec(&box->refcnt); + } + } + + WARN_ON_ONCE(atomic_read(&box->refcnt) != 1); + kfree(box); +} + +static int __init uncore_pci_init(void) +{ + int ret; + + switch (boot_cpu_data.x86_model) { + case 45: /* Sandy Bridge-EP */ + ret = snbep_pci2phy_map_init(0x3ce0); + if (ret) + return ret; + pci_uncores = snbep_pci_uncores; + uncore_pci_driver = &snbep_uncore_pci_driver; + break; + case 62: /* IvyTown */ + ret = snbep_pci2phy_map_init(0x0e1e); + if (ret) + return ret; + pci_uncores = ivt_pci_uncores; + uncore_pci_driver = &ivt_uncore_pci_driver; + break; + case 42: /* Sandy Bridge */ + ret = snb_pci2phy_map_init(PCI_DEVICE_ID_INTEL_SNB_IMC); + if (ret) + return ret; + pci_uncores = snb_pci_uncores; + uncore_pci_driver = &snb_uncore_pci_driver; + break; + case 58: /* Ivy Bridge */ + ret = snb_pci2phy_map_init(PCI_DEVICE_ID_INTEL_IVB_IMC); + if (ret) + return ret; + pci_uncores = snb_pci_uncores; + uncore_pci_driver = &ivb_uncore_pci_driver; + break; + case 60: /* Haswell */ + case 69: /* Haswell Celeron */ + ret = snb_pci2phy_map_init(PCI_DEVICE_ID_INTEL_HSW_IMC); + if (ret) + return ret; + pci_uncores = snb_pci_uncores; + uncore_pci_driver = &hsw_uncore_pci_driver; + break; + default: + return 0; + } + + ret = uncore_types_init(pci_uncores); + if (ret) + return ret; + + uncore_pci_driver->probe = uncore_pci_probe; + uncore_pci_driver->remove = uncore_pci_remove; + + ret = pci_register_driver(uncore_pci_driver); + if (ret == 0) + pcidrv_registered = true; + else + uncore_types_exit(pci_uncores); + + return ret; +} + +static void __init uncore_pci_exit(void) +{ + if (pcidrv_registered) { + pcidrv_registered = false; + pci_unregister_driver(uncore_pci_driver); + uncore_types_exit(pci_uncores); + } +} + +/* CPU hot plug/unplug are serialized by cpu_add_remove_lock mutex */ +static LIST_HEAD(boxes_to_free); + +static void uncore_kfree_boxes(void) +{ + struct intel_uncore_box *box; + + while (!list_empty(&boxes_to_free)) { + box = list_entry(boxes_to_free.next, + struct intel_uncore_box, list); + list_del(&box->list); + kfree(box); + } +} + +static void uncore_cpu_dying(int cpu) +{ + struct intel_uncore_type *type; + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + int i, j; + + for (i = 0; msr_uncores[i]; i++) { + type = msr_uncores[i]; + for (j = 0; j < type->num_boxes; j++) { + pmu = 
&type->pmus[j]; + box = *per_cpu_ptr(pmu->box, cpu); + *per_cpu_ptr(pmu->box, cpu) = NULL; + if (box && atomic_dec_and_test(&box->refcnt)) + list_add(&box->list, &boxes_to_free); + } + } +} + +static int uncore_cpu_starting(int cpu) +{ + struct intel_uncore_type *type; + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box, *exist; + int i, j, k, phys_id; + + phys_id = topology_physical_package_id(cpu); + + for (i = 0; msr_uncores[i]; i++) { + type = msr_uncores[i]; + for (j = 0; j < type->num_boxes; j++) { + pmu = &type->pmus[j]; + box = *per_cpu_ptr(pmu->box, cpu); + /* called by uncore_cpu_init? */ + if (box && box->phys_id >= 0) { + uncore_box_init(box); + continue; + } + + for_each_online_cpu(k) { + exist = *per_cpu_ptr(pmu->box, k); + if (exist && exist->phys_id == phys_id) { + atomic_inc(&exist->refcnt); + *per_cpu_ptr(pmu->box, cpu) = exist; + if (box) { + list_add(&box->list, + &boxes_to_free); + box = NULL; + } + break; + } + } + + if (box) { + box->phys_id = phys_id; + uncore_box_init(box); + } + } + } + return 0; +} + +static int uncore_cpu_prepare(int cpu, int phys_id) +{ + struct intel_uncore_type *type; + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + int i, j; + + for (i = 0; msr_uncores[i]; i++) { + type = msr_uncores[i]; + for (j = 0; j < type->num_boxes; j++) { + pmu = &type->pmus[j]; + if (pmu->func_id < 0) + pmu->func_id = j; + + box = uncore_alloc_box(type, cpu_to_node(cpu)); + if (!box) + return -ENOMEM; + + box->pmu = pmu; + box->phys_id = phys_id; + *per_cpu_ptr(pmu->box, cpu) = box; + } + } + return 0; +} + +static void +uncore_change_context(struct intel_uncore_type **uncores, int old_cpu, int new_cpu) +{ + struct intel_uncore_type *type; + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + int i, j; + + for (i = 0; uncores[i]; i++) { + type = uncores[i]; + for (j = 0; j < type->num_boxes; j++) { + pmu = &type->pmus[j]; + if (old_cpu < 0) + box = uncore_pmu_to_box(pmu, new_cpu); + else + box = uncore_pmu_to_box(pmu, old_cpu); + if (!box) + continue; + + if (old_cpu < 0) { + WARN_ON_ONCE(box->cpu != -1); + box->cpu = new_cpu; + continue; + } + + WARN_ON_ONCE(box->cpu != old_cpu); + if (new_cpu >= 0) { + uncore_pmu_cancel_hrtimer(box); + perf_pmu_migrate_context(&pmu->pmu, + old_cpu, new_cpu); + box->cpu = new_cpu; + } else { + box->cpu = -1; + } + } + } +} + +static void uncore_event_exit_cpu(int cpu) +{ + int i, phys_id, target; + + /* if exiting cpu is used for collecting uncore events */ + if (!cpumask_test_and_clear_cpu(cpu, &uncore_cpu_mask)) + return; + + /* find a new cpu to collect uncore events */ + phys_id = topology_physical_package_id(cpu); + target = -1; + for_each_online_cpu(i) { + if (i == cpu) + continue; + if (phys_id == topology_physical_package_id(i)) { + target = i; + break; + } + } + + /* migrate uncore events to the new cpu */ + if (target >= 0) + cpumask_set_cpu(target, &uncore_cpu_mask); + + uncore_change_context(msr_uncores, cpu, target); + uncore_change_context(pci_uncores, cpu, target); +} + +static void uncore_event_init_cpu(int cpu) +{ + int i, phys_id; + + phys_id = topology_physical_package_id(cpu); + for_each_cpu(i, &uncore_cpu_mask) { + if (phys_id == topology_physical_package_id(i)) + return; + } + + cpumask_set_cpu(cpu, &uncore_cpu_mask); + + uncore_change_context(msr_uncores, -1, cpu); + uncore_change_context(pci_uncores, -1, cpu); +} + +static int uncore_cpu_notifier(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (long)hcpu; + + /* allocate/free 
data structure for uncore box */ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_UP_PREPARE: + uncore_cpu_prepare(cpu, -1); + break; + case CPU_STARTING: + uncore_cpu_starting(cpu); + break; + case CPU_UP_CANCELED: + case CPU_DYING: + uncore_cpu_dying(cpu); + break; + case CPU_ONLINE: + case CPU_DEAD: + uncore_kfree_boxes(); + break; + default: + break; + } + + /* select the cpu that collects uncore events */ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DOWN_FAILED: + case CPU_STARTING: + uncore_event_init_cpu(cpu); + break; + case CPU_DOWN_PREPARE: + uncore_event_exit_cpu(cpu); + break; + default: + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block uncore_cpu_nb = { + .notifier_call = uncore_cpu_notifier, + /* + * to migrate uncore events, our notifier should be executed + * before perf core's notifier. + */ + .priority = CPU_PRI_PERF + 1, +}; + +static void __init uncore_cpu_setup(void *dummy) +{ + uncore_cpu_starting(smp_processor_id()); +} + +static int __init uncore_cpu_init(void) +{ + int ret, max_cores; + + max_cores = boot_cpu_data.x86_max_cores; + switch (boot_cpu_data.x86_model) { + case 26: /* Nehalem */ + case 30: + case 37: /* Westmere */ + case 44: + msr_uncores = nhm_msr_uncores; + break; + case 42: /* Sandy Bridge */ + case 58: /* Ivy Bridge */ + if (snb_uncore_cbox.num_boxes > max_cores) + snb_uncore_cbox.num_boxes = max_cores; + msr_uncores = snb_msr_uncores; + break; + case 45: /* Sandy Bridge-EP */ + if (snbep_uncore_cbox.num_boxes > max_cores) + snbep_uncore_cbox.num_boxes = max_cores; + msr_uncores = snbep_msr_uncores; + break; + case 46: /* Nehalem-EX */ + uncore_nhmex = true; + /* fall through */ + case 47: /* Westmere-EX aka. Xeon E7 */ + if (!uncore_nhmex) + nhmex_uncore_mbox.event_descs = wsmex_uncore_mbox_events; + if (nhmex_uncore_cbox.num_boxes > max_cores) + nhmex_uncore_cbox.num_boxes = max_cores; + msr_uncores = nhmex_msr_uncores; + break; + case 62: /* IvyTown */ + if (ivt_uncore_cbox.num_boxes > max_cores) + ivt_uncore_cbox.num_boxes = max_cores; + msr_uncores = ivt_msr_uncores; + break; + + default: + return 0; + } + + ret = uncore_types_init(msr_uncores); + if (ret) + return ret; + + return 0; +} + +static int __init uncore_pmus_register(void) +{ + struct intel_uncore_pmu *pmu; + struct intel_uncore_type *type; + int i, j; + + for (i = 0; msr_uncores[i]; i++) { + type = msr_uncores[i]; + for (j = 0; j < type->num_boxes; j++) { + pmu = &type->pmus[j]; + uncore_pmu_register(pmu); + } + } + + for (i = 0; pci_uncores[i]; i++) { + type = pci_uncores[i]; + for (j = 0; j < type->num_boxes; j++) { + pmu = &type->pmus[j]; + uncore_pmu_register(pmu); + } + } + + return 0; +} + +static void __init uncore_cpumask_init(void) +{ + int cpu; + + /* + * only invoked once, from either the msr or the pci init code + */ + if (!cpumask_empty(&uncore_cpu_mask)) + return; + + cpu_notifier_register_begin(); + + for_each_online_cpu(cpu) { + int i, phys_id = topology_physical_package_id(cpu); + + for_each_cpu(i, &uncore_cpu_mask) { + if (phys_id == topology_physical_package_id(i)) { + phys_id = -1; + break; + } + } + if (phys_id < 0) + continue; + + uncore_cpu_prepare(cpu, phys_id); + uncore_event_init_cpu(cpu); + } + on_each_cpu(uncore_cpu_setup, NULL, 1); + + __register_cpu_notifier(&uncore_cpu_nb); + + cpu_notifier_register_done(); +} + + +static int __init intel_uncore_init(void) +{ + int ret; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return -ENODEV; + + if (cpu_has_hypervisor) + return -ENODEV; + + ret = uncore_pci_init(); + if (ret) + goto fail; + ret =
uncore_cpu_init(); + if (ret) { + uncore_pci_exit(); + goto fail; + } + uncore_cpumask_init(); + + uncore_pmus_register(); + return 0; +fail: + return ret; +} +device_initcall(intel_uncore_init); diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h new file mode 100644 index 00000000000..90236f0c94a --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h @@ -0,0 +1,696 @@ +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/pci.h> +#include <linux/perf_event.h> +#include "perf_event.h" + +#define UNCORE_PMU_NAME_LEN 32 +#define UNCORE_PMU_HRTIMER_INTERVAL (60LL * NSEC_PER_SEC) +#define UNCORE_SNB_IMC_HRTIMER_INTERVAL (5ULL * NSEC_PER_SEC) + +#define UNCORE_FIXED_EVENT 0xff +#define UNCORE_PMC_IDX_MAX_GENERIC 8 +#define UNCORE_PMC_IDX_FIXED UNCORE_PMC_IDX_MAX_GENERIC +#define UNCORE_PMC_IDX_MAX (UNCORE_PMC_IDX_FIXED + 1) + +#define UNCORE_PCI_DEV_DATA(type, idx) ((type << 8) | idx) +#define UNCORE_PCI_DEV_TYPE(data) ((data >> 8) & 0xff) +#define UNCORE_PCI_DEV_IDX(data) (data & 0xff) +#define UNCORE_EXTRA_PCI_DEV 0xff +#define UNCORE_EXTRA_PCI_DEV_MAX 2 + +/* support up to 8 sockets */ +#define UNCORE_SOCKET_MAX 8 + +#define UNCORE_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, 0xff) + +/* SNB event control */ +#define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff +#define SNB_UNC_CTL_UMASK_MASK 0x0000ff00 +#define SNB_UNC_CTL_EDGE_DET (1 << 18) +#define SNB_UNC_CTL_EN (1 << 22) +#define SNB_UNC_CTL_INVERT (1 << 23) +#define SNB_UNC_CTL_CMASK_MASK 0x1f000000 +#define NHM_UNC_CTL_CMASK_MASK 0xff000000 +#define NHM_UNC_FIXED_CTR_CTL_EN (1 << 0) + +#define SNB_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \ + SNB_UNC_CTL_UMASK_MASK | \ + SNB_UNC_CTL_EDGE_DET | \ + SNB_UNC_CTL_INVERT | \ + SNB_UNC_CTL_CMASK_MASK) + +#define NHM_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \ + SNB_UNC_CTL_UMASK_MASK | \ + SNB_UNC_CTL_EDGE_DET | \ + SNB_UNC_CTL_INVERT | \ + NHM_UNC_CTL_CMASK_MASK) + +/* SNB global control register */ +#define SNB_UNC_PERF_GLOBAL_CTL 0x391 +#define SNB_UNC_FIXED_CTR_CTRL 0x394 +#define SNB_UNC_FIXED_CTR 0x395 + +/* SNB uncore global control */ +#define SNB_UNC_GLOBAL_CTL_CORE_ALL ((1 << 4) - 1) +#define SNB_UNC_GLOBAL_CTL_EN (1 << 29) + +/* SNB Cbo register */ +#define SNB_UNC_CBO_0_PERFEVTSEL0 0x700 +#define SNB_UNC_CBO_0_PER_CTR0 0x706 +#define SNB_UNC_CBO_MSR_OFFSET 0x10 + +/* NHM global control register */ +#define NHM_UNC_PERF_GLOBAL_CTL 0x391 +#define NHM_UNC_FIXED_CTR 0x394 +#define NHM_UNC_FIXED_CTR_CTRL 0x395 + +/* NHM uncore global control */ +#define NHM_UNC_GLOBAL_CTL_EN_PC_ALL ((1ULL << 8) - 1) +#define NHM_UNC_GLOBAL_CTL_EN_FC (1ULL << 32) + +/* NHM uncore register */ +#define NHM_UNC_PERFEVTSEL0 0x3c0 +#define NHM_UNC_UNCORE_PMC0 0x3b0 + +/* SNB-EP Box level control */ +#define SNBEP_PMON_BOX_CTL_RST_CTRL (1 << 0) +#define SNBEP_PMON_BOX_CTL_RST_CTRS (1 << 1) +#define SNBEP_PMON_BOX_CTL_FRZ (1 << 8) +#define SNBEP_PMON_BOX_CTL_FRZ_EN (1 << 16) +#define SNBEP_PMON_BOX_CTL_INT (SNBEP_PMON_BOX_CTL_RST_CTRL | \ + SNBEP_PMON_BOX_CTL_RST_CTRS | \ + SNBEP_PMON_BOX_CTL_FRZ_EN) +/* SNB-EP event control */ +#define SNBEP_PMON_CTL_EV_SEL_MASK 0x000000ff +#define SNBEP_PMON_CTL_UMASK_MASK 0x0000ff00 +#define SNBEP_PMON_CTL_RST (1 << 17) +#define SNBEP_PMON_CTL_EDGE_DET (1 << 18) +#define SNBEP_PMON_CTL_EV_SEL_EXT (1 << 21) +#define SNBEP_PMON_CTL_EN (1 << 22) +#define SNBEP_PMON_CTL_INVERT (1 << 23) +#define SNBEP_PMON_CTL_TRESH_MASK 0xff000000 +#define SNBEP_PMON_RAW_EVENT_MASK (SNBEP_PMON_CTL_EV_SEL_MASK | \ 
+ SNBEP_PMON_CTL_UMASK_MASK | \ + SNBEP_PMON_CTL_EDGE_DET | \ + SNBEP_PMON_CTL_INVERT | \ + SNBEP_PMON_CTL_TRESH_MASK) + +/* SNB-EP Ubox event control */ +#define SNBEP_U_MSR_PMON_CTL_TRESH_MASK 0x1f000000 +#define SNBEP_U_MSR_PMON_RAW_EVENT_MASK \ + (SNBEP_PMON_CTL_EV_SEL_MASK | \ + SNBEP_PMON_CTL_UMASK_MASK | \ + SNBEP_PMON_CTL_EDGE_DET | \ + SNBEP_PMON_CTL_INVERT | \ + SNBEP_U_MSR_PMON_CTL_TRESH_MASK) + +#define SNBEP_CBO_PMON_CTL_TID_EN (1 << 19) +#define SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK (SNBEP_PMON_RAW_EVENT_MASK | \ + SNBEP_CBO_PMON_CTL_TID_EN) + +/* SNB-EP PCU event control */ +#define SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK 0x0000c000 +#define SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK 0x1f000000 +#define SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT (1 << 30) +#define SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET (1 << 31) +#define SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK \ + (SNBEP_PMON_CTL_EV_SEL_MASK | \ + SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \ + SNBEP_PMON_CTL_EDGE_DET | \ + SNBEP_PMON_CTL_EV_SEL_EXT | \ + SNBEP_PMON_CTL_INVERT | \ + SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \ + SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \ + SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET) + +#define SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK \ + (SNBEP_PMON_RAW_EVENT_MASK | \ + SNBEP_PMON_CTL_EV_SEL_EXT) + +/* SNB-EP pci control register */ +#define SNBEP_PCI_PMON_BOX_CTL 0xf4 +#define SNBEP_PCI_PMON_CTL0 0xd8 +/* SNB-EP pci counter register */ +#define SNBEP_PCI_PMON_CTR0 0xa0 + +/* SNB-EP home agent register */ +#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH0 0x40 +#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH1 0x44 +#define SNBEP_HA_PCI_PMON_BOX_OPCODEMATCH 0x48 +/* SNB-EP memory controller register */ +#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTL 0xf0 +#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTR 0xd0 +/* SNB-EP QPI register */ +#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH0 0x228 +#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH1 0x22c +#define SNBEP_Q_Py_PCI_PMON_PKT_MASK0 0x238 +#define SNBEP_Q_Py_PCI_PMON_PKT_MASK1 0x23c + +/* SNB-EP Ubox register */ +#define SNBEP_U_MSR_PMON_CTR0 0xc16 +#define SNBEP_U_MSR_PMON_CTL0 0xc10 + +#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTL 0xc08 +#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTR 0xc09 + +/* SNB-EP Cbo register */ +#define SNBEP_C0_MSR_PMON_CTR0 0xd16 +#define SNBEP_C0_MSR_PMON_CTL0 0xd10 +#define SNBEP_C0_MSR_PMON_BOX_CTL 0xd04 +#define SNBEP_C0_MSR_PMON_BOX_FILTER 0xd14 +#define SNBEP_CBO_MSR_OFFSET 0x20 + +#define SNBEP_CB0_MSR_PMON_BOX_FILTER_TID 0x1f +#define SNBEP_CB0_MSR_PMON_BOX_FILTER_NID 0x3fc00 +#define SNBEP_CB0_MSR_PMON_BOX_FILTER_STATE 0x7c0000 +#define SNBEP_CB0_MSR_PMON_BOX_FILTER_OPC 0xff800000 + +#define SNBEP_CBO_EVENT_EXTRA_REG(e, m, i) { \ + .event = (e), \ + .msr = SNBEP_C0_MSR_PMON_BOX_FILTER, \ + .config_mask = (m), \ + .idx = (i) \ +} + +/* SNB-EP PCU register */ +#define SNBEP_PCU_MSR_PMON_CTR0 0xc36 +#define SNBEP_PCU_MSR_PMON_CTL0 0xc30 +#define SNBEP_PCU_MSR_PMON_BOX_CTL 0xc24 +#define SNBEP_PCU_MSR_PMON_BOX_FILTER 0xc34 +#define SNBEP_PCU_MSR_PMON_BOX_FILTER_MASK 0xffffffff +#define SNBEP_PCU_MSR_CORE_C3_CTR 0x3fc +#define SNBEP_PCU_MSR_CORE_C6_CTR 0x3fd + +/* IVT event control */ +#define IVT_PMON_BOX_CTL_INT (SNBEP_PMON_BOX_CTL_RST_CTRL | \ + SNBEP_PMON_BOX_CTL_RST_CTRS) +#define IVT_PMON_RAW_EVENT_MASK (SNBEP_PMON_CTL_EV_SEL_MASK | \ + SNBEP_PMON_CTL_UMASK_MASK | \ + SNBEP_PMON_CTL_EDGE_DET | \ + SNBEP_PMON_CTL_TRESH_MASK) +/* IVT Ubox */ +#define IVT_U_MSR_PMON_GLOBAL_CTL 0xc00 +#define IVT_U_PMON_GLOBAL_FRZ_ALL (1 << 31) +#define IVT_U_PMON_GLOBAL_UNFRZ_ALL (1 << 29) + +#define IVT_U_MSR_PMON_RAW_EVENT_MASK \ + 
(SNBEP_PMON_CTL_EV_SEL_MASK | \ + SNBEP_PMON_CTL_UMASK_MASK | \ + SNBEP_PMON_CTL_EDGE_DET | \ + SNBEP_U_MSR_PMON_CTL_TRESH_MASK) +/* IVT Cbo */ +#define IVT_CBO_MSR_PMON_RAW_EVENT_MASK (IVT_PMON_RAW_EVENT_MASK | \ + SNBEP_CBO_PMON_CTL_TID_EN) + +#define IVT_CB0_MSR_PMON_BOX_FILTER_TID (0x1fULL << 0) +#define IVT_CB0_MSR_PMON_BOX_FILTER_LINK (0xfULL << 5) +#define IVT_CB0_MSR_PMON_BOX_FILTER_STATE (0x3fULL << 17) +#define IVT_CB0_MSR_PMON_BOX_FILTER_NID (0xffffULL << 32) +#define IVT_CB0_MSR_PMON_BOX_FILTER_OPC (0x1ffULL << 52) +#define IVT_CB0_MSR_PMON_BOX_FILTER_C6 (0x1ULL << 61) +#define IVT_CB0_MSR_PMON_BOX_FILTER_NC (0x1ULL << 62) +#define IVT_CB0_MSR_PMON_BOX_FILTER_IOSC (0x1ULL << 63) + +/* IVT home agent */ +#define IVT_HA_PCI_PMON_CTL_Q_OCC_RST (1 << 16) +#define IVT_HA_PCI_PMON_RAW_EVENT_MASK \ + (IVT_PMON_RAW_EVENT_MASK | \ + IVT_HA_PCI_PMON_CTL_Q_OCC_RST) +/* IVT PCU */ +#define IVT_PCU_MSR_PMON_RAW_EVENT_MASK \ + (SNBEP_PMON_CTL_EV_SEL_MASK | \ + SNBEP_PMON_CTL_EV_SEL_EXT | \ + SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \ + SNBEP_PMON_CTL_EDGE_DET | \ + SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \ + SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \ + SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET) +/* IVT QPI */ +#define IVT_QPI_PCI_PMON_RAW_EVENT_MASK \ + (IVT_PMON_RAW_EVENT_MASK | \ + SNBEP_PMON_CTL_EV_SEL_EXT) + +/* NHM-EX event control */ +#define NHMEX_PMON_CTL_EV_SEL_MASK 0x000000ff +#define NHMEX_PMON_CTL_UMASK_MASK 0x0000ff00 +#define NHMEX_PMON_CTL_EN_BIT0 (1 << 0) +#define NHMEX_PMON_CTL_EDGE_DET (1 << 18) +#define NHMEX_PMON_CTL_PMI_EN (1 << 20) +#define NHMEX_PMON_CTL_EN_BIT22 (1 << 22) +#define NHMEX_PMON_CTL_INVERT (1 << 23) +#define NHMEX_PMON_CTL_TRESH_MASK 0xff000000 +#define NHMEX_PMON_RAW_EVENT_MASK (NHMEX_PMON_CTL_EV_SEL_MASK | \ + NHMEX_PMON_CTL_UMASK_MASK | \ + NHMEX_PMON_CTL_EDGE_DET | \ + NHMEX_PMON_CTL_INVERT | \ + NHMEX_PMON_CTL_TRESH_MASK) + +/* NHM-EX Ubox */ +#define NHMEX_U_MSR_PMON_GLOBAL_CTL 0xc00 +#define NHMEX_U_MSR_PMON_CTR 0xc11 +#define NHMEX_U_MSR_PMON_EV_SEL 0xc10 + +#define NHMEX_U_PMON_GLOBAL_EN (1 << 0) +#define NHMEX_U_PMON_GLOBAL_PMI_CORE_SEL 0x0000001e +#define NHMEX_U_PMON_GLOBAL_EN_ALL (1 << 28) +#define NHMEX_U_PMON_GLOBAL_RST_ALL (1 << 29) +#define NHMEX_U_PMON_GLOBAL_FRZ_ALL (1 << 31) + +#define NHMEX_U_PMON_RAW_EVENT_MASK \ + (NHMEX_PMON_CTL_EV_SEL_MASK | \ + NHMEX_PMON_CTL_EDGE_DET) + +/* NHM-EX Cbox */ +#define NHMEX_C0_MSR_PMON_GLOBAL_CTL 0xd00 +#define NHMEX_C0_MSR_PMON_CTR0 0xd11 +#define NHMEX_C0_MSR_PMON_EV_SEL0 0xd10 +#define NHMEX_C_MSR_OFFSET 0x20 + +/* NHM-EX Bbox */ +#define NHMEX_B0_MSR_PMON_GLOBAL_CTL 0xc20 +#define NHMEX_B0_MSR_PMON_CTR0 0xc31 +#define NHMEX_B0_MSR_PMON_CTL0 0xc30 +#define NHMEX_B_MSR_OFFSET 0x40 +#define NHMEX_B0_MSR_MATCH 0xe45 +#define NHMEX_B0_MSR_MASK 0xe46 +#define NHMEX_B1_MSR_MATCH 0xe4d +#define NHMEX_B1_MSR_MASK 0xe4e + +#define NHMEX_B_PMON_CTL_EN (1 << 0) +#define NHMEX_B_PMON_CTL_EV_SEL_SHIFT 1 +#define NHMEX_B_PMON_CTL_EV_SEL_MASK \ + (0x1f << NHMEX_B_PMON_CTL_EV_SEL_SHIFT) +#define NHMEX_B_PMON_CTR_SHIFT 6 +#define NHMEX_B_PMON_CTR_MASK \ + (0x3 << NHMEX_B_PMON_CTR_SHIFT) +#define NHMEX_B_PMON_RAW_EVENT_MASK \ + (NHMEX_B_PMON_CTL_EV_SEL_MASK | \ + NHMEX_B_PMON_CTR_MASK) + +/* NHM-EX Sbox */ +#define NHMEX_S0_MSR_PMON_GLOBAL_CTL 0xc40 +#define NHMEX_S0_MSR_PMON_CTR0 0xc51 +#define NHMEX_S0_MSR_PMON_CTL0 0xc50 +#define NHMEX_S_MSR_OFFSET 0x80 +#define NHMEX_S0_MSR_MM_CFG 0xe48 +#define NHMEX_S0_MSR_MATCH 0xe49 +#define NHMEX_S0_MSR_MASK 0xe4a +#define NHMEX_S1_MSR_MM_CFG 0xe58 +#define NHMEX_S1_MSR_MATCH 0xe59 
+#define NHMEX_S1_MSR_MASK 0xe5a + +#define NHMEX_S_PMON_MM_CFG_EN (0x1ULL << 63) +#define NHMEX_S_EVENT_TO_R_PROG_EV 0 + +/* NHM-EX Mbox */ +#define NHMEX_M0_MSR_GLOBAL_CTL 0xca0 +#define NHMEX_M0_MSR_PMU_DSP 0xca5 +#define NHMEX_M0_MSR_PMU_ISS 0xca6 +#define NHMEX_M0_MSR_PMU_MAP 0xca7 +#define NHMEX_M0_MSR_PMU_MSC_THR 0xca8 +#define NHMEX_M0_MSR_PMU_PGT 0xca9 +#define NHMEX_M0_MSR_PMU_PLD 0xcaa +#define NHMEX_M0_MSR_PMU_ZDP_CTL_FVC 0xcab +#define NHMEX_M0_MSR_PMU_CTL0 0xcb0 +#define NHMEX_M0_MSR_PMU_CNT0 0xcb1 +#define NHMEX_M_MSR_OFFSET 0x40 +#define NHMEX_M0_MSR_PMU_MM_CFG 0xe54 +#define NHMEX_M1_MSR_PMU_MM_CFG 0xe5c + +#define NHMEX_M_PMON_MM_CFG_EN (1ULL << 63) +#define NHMEX_M_PMON_ADDR_MATCH_MASK 0x3ffffffffULL +#define NHMEX_M_PMON_ADDR_MASK_MASK 0x7ffffffULL +#define NHMEX_M_PMON_ADDR_MASK_SHIFT 34 + +#define NHMEX_M_PMON_CTL_EN (1 << 0) +#define NHMEX_M_PMON_CTL_PMI_EN (1 << 1) +#define NHMEX_M_PMON_CTL_COUNT_MODE_SHIFT 2 +#define NHMEX_M_PMON_CTL_COUNT_MODE_MASK \ + (0x3 << NHMEX_M_PMON_CTL_COUNT_MODE_SHIFT) +#define NHMEX_M_PMON_CTL_STORAGE_MODE_SHIFT 4 +#define NHMEX_M_PMON_CTL_STORAGE_MODE_MASK \ + (0x3 << NHMEX_M_PMON_CTL_STORAGE_MODE_SHIFT) +#define NHMEX_M_PMON_CTL_WRAP_MODE (1 << 6) +#define NHMEX_M_PMON_CTL_FLAG_MODE (1 << 7) +#define NHMEX_M_PMON_CTL_INC_SEL_SHIFT 9 +#define NHMEX_M_PMON_CTL_INC_SEL_MASK \ + (0x1f << NHMEX_M_PMON_CTL_INC_SEL_SHIFT) +#define NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT 19 +#define NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK \ + (0x7 << NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT) +#define NHMEX_M_PMON_RAW_EVENT_MASK \ + (NHMEX_M_PMON_CTL_COUNT_MODE_MASK | \ + NHMEX_M_PMON_CTL_STORAGE_MODE_MASK | \ + NHMEX_M_PMON_CTL_WRAP_MODE | \ + NHMEX_M_PMON_CTL_FLAG_MODE | \ + NHMEX_M_PMON_CTL_INC_SEL_MASK | \ + NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK) + +#define NHMEX_M_PMON_ZDP_CTL_FVC_MASK (((1 << 11) - 1) | (1 << 23)) +#define NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7ULL << (11 + 3 * (n))) + +#define WSMEX_M_PMON_ZDP_CTL_FVC_MASK (((1 << 12) - 1) | (1 << 24)) +#define WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7ULL << (12 + 3 * (n))) + +/* + * use the 9~13 bits to select event If the 7th bit is not set, + * otherwise use the 19~21 bits to select event. + */ +#define MBOX_INC_SEL(x) ((x) << NHMEX_M_PMON_CTL_INC_SEL_SHIFT) +#define MBOX_SET_FLAG_SEL(x) (((x) << NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT) | \ + NHMEX_M_PMON_CTL_FLAG_MODE) +#define MBOX_INC_SEL_MASK (NHMEX_M_PMON_CTL_INC_SEL_MASK | \ + NHMEX_M_PMON_CTL_FLAG_MODE) +#define MBOX_SET_FLAG_SEL_MASK (NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK | \ + NHMEX_M_PMON_CTL_FLAG_MODE) +#define MBOX_INC_SEL_EXTAR_REG(c, r) \ + EVENT_EXTRA_REG(MBOX_INC_SEL(c), NHMEX_M0_MSR_PMU_##r, \ + MBOX_INC_SEL_MASK, (u64)-1, NHMEX_M_##r) +#define MBOX_SET_FLAG_SEL_EXTRA_REG(c, r) \ + EVENT_EXTRA_REG(MBOX_SET_FLAG_SEL(c), NHMEX_M0_MSR_PMU_##r, \ + MBOX_SET_FLAG_SEL_MASK, \ + (u64)-1, NHMEX_M_##r) + +/* NHM-EX Rbox */ +#define NHMEX_R_MSR_GLOBAL_CTL 0xe00 +#define NHMEX_R_MSR_PMON_CTL0 0xe10 +#define NHMEX_R_MSR_PMON_CNT0 0xe11 +#define NHMEX_R_MSR_OFFSET 0x20 + +#define NHMEX_R_MSR_PORTN_QLX_CFG(n) \ + ((n) < 4 ? (0xe0c + (n)) : (0xe2c + (n) - 4)) +#define NHMEX_R_MSR_PORTN_IPERF_CFG0(n) (0xe04 + (n)) +#define NHMEX_R_MSR_PORTN_IPERF_CFG1(n) (0xe24 + (n)) +#define NHMEX_R_MSR_PORTN_XBR_OFFSET(n) \ + (((n) < 4 ? 
0 : 0x10) + (n) * 4) +#define NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) \ + (0xe60 + NHMEX_R_MSR_PORTN_XBR_OFFSET(n)) +#define NHMEX_R_MSR_PORTN_XBR_SET1_MATCH(n) \ + (NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) + 1) +#define NHMEX_R_MSR_PORTN_XBR_SET1_MASK(n) \ + (NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) + 2) +#define NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) \ + (0xe70 + NHMEX_R_MSR_PORTN_XBR_OFFSET(n)) +#define NHMEX_R_MSR_PORTN_XBR_SET2_MATCH(n) \ + (NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) + 1) +#define NHMEX_R_MSR_PORTN_XBR_SET2_MASK(n) \ + (NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) + 2) + +#define NHMEX_R_PMON_CTL_EN (1 << 0) +#define NHMEX_R_PMON_CTL_EV_SEL_SHIFT 1 +#define NHMEX_R_PMON_CTL_EV_SEL_MASK \ + (0x1f << NHMEX_R_PMON_CTL_EV_SEL_SHIFT) +#define NHMEX_R_PMON_CTL_PMI_EN (1 << 6) +#define NHMEX_R_PMON_RAW_EVENT_MASK NHMEX_R_PMON_CTL_EV_SEL_MASK + +/* NHM-EX Wbox */ +#define NHMEX_W_MSR_GLOBAL_CTL 0xc80 +#define NHMEX_W_MSR_PMON_CNT0 0xc90 +#define NHMEX_W_MSR_PMON_EVT_SEL0 0xc91 +#define NHMEX_W_MSR_PMON_FIXED_CTR 0x394 +#define NHMEX_W_MSR_PMON_FIXED_CTL 0x395 + +#define NHMEX_W_PMON_GLOBAL_FIXED_EN (1ULL << 31) + +struct intel_uncore_ops; +struct intel_uncore_pmu; +struct intel_uncore_box; +struct uncore_event_desc; + +struct intel_uncore_type { + const char *name; + int num_counters; + int num_boxes; + int perf_ctr_bits; + int fixed_ctr_bits; + unsigned perf_ctr; + unsigned event_ctl; + unsigned event_mask; + unsigned fixed_ctr; + unsigned fixed_ctl; + unsigned box_ctl; + unsigned msr_offset; + unsigned num_shared_regs:8; + unsigned single_fixed:1; + unsigned pair_ctr_ctl:1; + unsigned *msr_offsets; + struct event_constraint unconstrainted; + struct event_constraint *constraints; + struct intel_uncore_pmu *pmus; + struct intel_uncore_ops *ops; + struct uncore_event_desc *event_descs; + const struct attribute_group *attr_groups[4]; + struct pmu *pmu; /* for custom pmu ops */ +}; + +#define pmu_group attr_groups[0] +#define format_group attr_groups[1] +#define events_group attr_groups[2] + +struct intel_uncore_ops { + void (*init_box)(struct intel_uncore_box *); + void (*disable_box)(struct intel_uncore_box *); + void (*enable_box)(struct intel_uncore_box *); + void (*disable_event)(struct intel_uncore_box *, struct perf_event *); + void (*enable_event)(struct intel_uncore_box *, struct perf_event *); + u64 (*read_counter)(struct intel_uncore_box *, struct perf_event *); + int (*hw_config)(struct intel_uncore_box *, struct perf_event *); + struct event_constraint *(*get_constraint)(struct intel_uncore_box *, + struct perf_event *); + void (*put_constraint)(struct intel_uncore_box *, struct perf_event *); +}; + +struct intel_uncore_pmu { + struct pmu pmu; + char name[UNCORE_PMU_NAME_LEN]; + int pmu_idx; + int func_id; + struct intel_uncore_type *type; + struct intel_uncore_box ** __percpu box; + struct list_head box_list; +}; + +struct intel_uncore_extra_reg { + raw_spinlock_t lock; + u64 config, config1, config2; + atomic_t ref; +}; + +struct intel_uncore_box { + int phys_id; + int n_active; /* number of active events */ + int n_events; + int cpu; /* cpu to collect events */ + unsigned long flags; + atomic_t refcnt; + struct perf_event *events[UNCORE_PMC_IDX_MAX]; + struct perf_event *event_list[UNCORE_PMC_IDX_MAX]; + unsigned long active_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)]; + u64 tags[UNCORE_PMC_IDX_MAX]; + struct pci_dev *pci_dev; + struct intel_uncore_pmu *pmu; + u64 hrtimer_duration; /* hrtimer timeout for this box */ + struct hrtimer hrtimer; + struct list_head list; + struct list_head 
active_list; + void *io_addr; + struct intel_uncore_extra_reg shared_regs[0]; +}; + +#define UNCORE_BOX_FLAG_INITIATED 0 + +struct uncore_event_desc { + struct kobj_attribute attr; + const char *config; +}; + +#define INTEL_UNCORE_EVENT_DESC(_name, _config) \ +{ \ + .attr = __ATTR(_name, 0444, uncore_event_show, NULL), \ + .config = _config, \ +} + +#define DEFINE_UNCORE_FORMAT_ATTR(_var, _name, _format) \ +static ssize_t __uncore_##_var##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + char *page) \ +{ \ + BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ + return sprintf(page, _format "\n"); \ +} \ +static struct kobj_attribute format_attr_##_var = \ + __ATTR(_name, 0444, __uncore_##_var##_show, NULL) + + +static ssize_t uncore_event_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct uncore_event_desc *event = + container_of(attr, struct uncore_event_desc, attr); + return sprintf(buf, "%s", event->config); +} + +static inline unsigned uncore_pci_box_ctl(struct intel_uncore_box *box) +{ + return box->pmu->type->box_ctl; +} + +static inline unsigned uncore_pci_fixed_ctl(struct intel_uncore_box *box) +{ + return box->pmu->type->fixed_ctl; +} + +static inline unsigned uncore_pci_fixed_ctr(struct intel_uncore_box *box) +{ + return box->pmu->type->fixed_ctr; +} + +static inline +unsigned uncore_pci_event_ctl(struct intel_uncore_box *box, int idx) +{ + return idx * 4 + box->pmu->type->event_ctl; +} + +static inline +unsigned uncore_pci_perf_ctr(struct intel_uncore_box *box, int idx) +{ + return idx * 8 + box->pmu->type->perf_ctr; +} + +static inline unsigned uncore_msr_box_offset(struct intel_uncore_box *box) +{ + struct intel_uncore_pmu *pmu = box->pmu; + return pmu->type->msr_offsets ? + pmu->type->msr_offsets[pmu->pmu_idx] : + pmu->type->msr_offset * pmu->pmu_idx; +} + +static inline unsigned uncore_msr_box_ctl(struct intel_uncore_box *box) +{ + if (!box->pmu->type->box_ctl) + return 0; + return box->pmu->type->box_ctl + uncore_msr_box_offset(box); +} + +static inline unsigned uncore_msr_fixed_ctl(struct intel_uncore_box *box) +{ + if (!box->pmu->type->fixed_ctl) + return 0; + return box->pmu->type->fixed_ctl + uncore_msr_box_offset(box); +} + +static inline unsigned uncore_msr_fixed_ctr(struct intel_uncore_box *box) +{ + return box->pmu->type->fixed_ctr + uncore_msr_box_offset(box); +} + +static inline +unsigned uncore_msr_event_ctl(struct intel_uncore_box *box, int idx) +{ + return box->pmu->type->event_ctl + + (box->pmu->type->pair_ctr_ctl ? 2 * idx : idx) + + uncore_msr_box_offset(box); +} + +static inline +unsigned uncore_msr_perf_ctr(struct intel_uncore_box *box, int idx) +{ + return box->pmu->type->perf_ctr + + (box->pmu->type->pair_ctr_ctl ? 
2 * idx : idx) + + uncore_msr_box_offset(box); +} + +static inline +unsigned uncore_fixed_ctl(struct intel_uncore_box *box) +{ + if (box->pci_dev) + return uncore_pci_fixed_ctl(box); + else + return uncore_msr_fixed_ctl(box); +} + +static inline +unsigned uncore_fixed_ctr(struct intel_uncore_box *box) +{ + if (box->pci_dev) + return uncore_pci_fixed_ctr(box); + else + return uncore_msr_fixed_ctr(box); +} + +static inline +unsigned uncore_event_ctl(struct intel_uncore_box *box, int idx) +{ + if (box->pci_dev) + return uncore_pci_event_ctl(box, idx); + else + return uncore_msr_event_ctl(box, idx); +} + +static inline +unsigned uncore_perf_ctr(struct intel_uncore_box *box, int idx) +{ + if (box->pci_dev) + return uncore_pci_perf_ctr(box, idx); + else + return uncore_msr_perf_ctr(box, idx); +} + +static inline int uncore_perf_ctr_bits(struct intel_uncore_box *box) +{ + return box->pmu->type->perf_ctr_bits; +} + +static inline int uncore_fixed_ctr_bits(struct intel_uncore_box *box) +{ + return box->pmu->type->fixed_ctr_bits; +} + +static inline int uncore_num_counters(struct intel_uncore_box *box) +{ + return box->pmu->type->num_counters; +} + +static inline void uncore_disable_box(struct intel_uncore_box *box) +{ + if (box->pmu->type->ops->disable_box) + box->pmu->type->ops->disable_box(box); +} + +static inline void uncore_enable_box(struct intel_uncore_box *box) +{ + if (box->pmu->type->ops->enable_box) + box->pmu->type->ops->enable_box(box); +} + +static inline void uncore_disable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + box->pmu->type->ops->disable_event(box, event); +} + +static inline void uncore_enable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + box->pmu->type->ops->enable_event(box, event); +} + +static inline u64 uncore_read_counter(struct intel_uncore_box *box, + struct perf_event *event) +{ + return box->pmu->type->ops->read_counter(box, event); +} + +static inline void uncore_box_init(struct intel_uncore_box *box) +{ + if (!test_and_set_bit(UNCORE_BOX_FLAG_INITIATED, &box->flags)) { + if (box->pmu->type->ops->init_box) + box->pmu->type->ops->init_box(box); + } +} + +static inline bool uncore_box_is_fake(struct intel_uncore_box *box) +{ + return (box->phys_id < 0); +} diff --git a/arch/x86/kernel/cpu/perf_event_knc.c b/arch/x86/kernel/cpu/perf_event_knc.c new file mode 100644 index 00000000000..838fa8772c6 --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_knc.c @@ -0,0 +1,319 @@ +/* Driver for Intel Xeon Phi "Knights Corner" PMU */ + +#include <linux/perf_event.h> +#include <linux/types.h> + +#include <asm/hardirq.h> + +#include "perf_event.h" + +static const u64 knc_perfmon_event_map[] = +{ + [PERF_COUNT_HW_CPU_CYCLES] = 0x002a, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x0016, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0028, + [PERF_COUNT_HW_CACHE_MISSES] = 0x0029, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x0012, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x002b, +}; + +static const u64 __initconst knc_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + /* On Xeon Phi event "0" is a valid DATA_READ */ + /* (L1 Data Cache Reads) Instruction. */ + /* We code this as ARCH_PERFMON_EVENTSEL_INT as this */ + /* bit will always be set in x86_pmu_hw_config(). 
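A literal 0 here would look like an unsupported generic cache + event, while OR-ing in an always-set bit leaves the final event + config unchanged. +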
*/ + [ C(RESULT_ACCESS) ] = ARCH_PERFMON_EVENTSEL_INT, + /* DATA_READ */ + [ C(RESULT_MISS) ] = 0x0003, /* DATA_READ_MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0001, /* DATA_WRITE */ + [ C(RESULT_MISS) ] = 0x0004, /* DATA_WRITE_MISS */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0011, /* L1_DATA_PF1 */ + [ C(RESULT_MISS) ] = 0x001c, /* L1_DATA_PF1_MISS */ + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x000c, /* CODE_READ */ + [ C(RESULT_MISS) ] = 0x000e, /* CODE_CACHE_MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0x10cb, /* L2_READ_MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x10cc, /* L2_WRITE_HIT */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x10fc, /* L2_DATA_PF2 */ + [ C(RESULT_MISS) ] = 0x10fe, /* L2_DATA_PF2_MISS */ + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = ARCH_PERFMON_EVENTSEL_INT, + /* DATA_READ */ + /* see note on L1 OP_READ */ + [ C(RESULT_MISS) ] = 0x0002, /* DATA_PAGE_WALK */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0001, /* DATA_WRITE */ + [ C(RESULT_MISS) ] = 0x0002, /* DATA_PAGE_WALK */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x000c, /* CODE_READ */ + [ C(RESULT_MISS) ] = 0x000d, /* CODE_PAGE_WALK */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0012, /* BRANCHES */ + [ C(RESULT_MISS) ] = 0x002b, /* BRANCHES_MISPREDICTED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + + +static u64 knc_pmu_event_map(int hw_event) +{ + return knc_perfmon_event_map[hw_event]; +} + +static struct event_constraint knc_event_constraints[] = +{ + INTEL_EVENT_CONSTRAINT(0xc3, 0x1), /* HWP_L2HIT */ + INTEL_EVENT_CONSTRAINT(0xc4, 0x1), /* HWP_L2MISS */ + INTEL_EVENT_CONSTRAINT(0xc8, 0x1), /* L2_READ_HIT_E */ + INTEL_EVENT_CONSTRAINT(0xc9, 0x1), /* L2_READ_HIT_M */ + INTEL_EVENT_CONSTRAINT(0xca, 0x1), /* L2_READ_HIT_S */ + INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* L2_READ_MISS */ + INTEL_EVENT_CONSTRAINT(0xcc, 0x1), /* L2_WRITE_HIT */ + INTEL_EVENT_CONSTRAINT(0xce, 0x1), /* L2_STRONGLY_ORDERED_STREAMING_VSTORES_MISS */ + INTEL_EVENT_CONSTRAINT(0xcf, 0x1), /* L2_WEAKLY_ORDERED_STREAMING_VSTORE_MISS */ + INTEL_EVENT_CONSTRAINT(0xd7, 0x1), /* L2_VICTIM_REQ_WITH_DATA */ + INTEL_EVENT_CONSTRAINT(0xe3, 0x1), /* SNP_HITM_BUNIT */ + INTEL_EVENT_CONSTRAINT(0xe6, 0x1), /* SNP_HIT_L2 */ + INTEL_EVENT_CONSTRAINT(0xe7, 0x1), /* SNP_HITM_L2 */ + INTEL_EVENT_CONSTRAINT(0xf1, 0x1), /* L2_DATA_READ_MISS_CACHE_FILL */ + INTEL_EVENT_CONSTRAINT(0xf2, 0x1), /* L2_DATA_WRITE_MISS_CACHE_FILL */ + INTEL_EVENT_CONSTRAINT(0xf6, 0x1), /* L2_DATA_READ_MISS_MEM_FILL */ + INTEL_EVENT_CONSTRAINT(0xf7, 0x1), /* L2_DATA_WRITE_MISS_MEM_FILL */ + INTEL_EVENT_CONSTRAINT(0xfc, 0x1), /* L2_DATA_PF2 */ + INTEL_EVENT_CONSTRAINT(0xfd, 0x1), /* L2_DATA_PF2_DROP */ + INTEL_EVENT_CONSTRAINT(0xfe, 0x1), /* 
L2_DATA_PF2_MISS */ + INTEL_EVENT_CONSTRAINT(0xff, 0x1), /* L2_DATA_HIT_INFLIGHT_PF2 */ + EVENT_CONSTRAINT_END +}; + +#define MSR_KNC_IA32_PERF_GLOBAL_STATUS 0x0000002d +#define MSR_KNC_IA32_PERF_GLOBAL_OVF_CONTROL 0x0000002e +#define MSR_KNC_IA32_PERF_GLOBAL_CTRL 0x0000002f + +#define KNC_ENABLE_COUNTER0 0x00000001 +#define KNC_ENABLE_COUNTER1 0x00000002 + +static void knc_pmu_disable_all(void) +{ + u64 val; + + rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val); + val &= ~(KNC_ENABLE_COUNTER0|KNC_ENABLE_COUNTER1); + wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val); +} + +static void knc_pmu_enable_all(int added) +{ + u64 val; + + rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val); + val |= (KNC_ENABLE_COUNTER0|KNC_ENABLE_COUNTER1); + wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val); +} + +static inline void +knc_pmu_disable_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 val; + + val = hwc->config; + val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; + + (void)wrmsrl_safe(hwc->config_base + hwc->idx, val); +} + +static void knc_pmu_enable_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 val; + + val = hwc->config; + val |= ARCH_PERFMON_EVENTSEL_ENABLE; + + (void)wrmsrl_safe(hwc->config_base + hwc->idx, val); +} + +static inline u64 knc_pmu_get_status(void) +{ + u64 status; + + rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_STATUS, status); + + return status; +} + +static inline void knc_pmu_ack_status(u64 ack) +{ + wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_OVF_CONTROL, ack); +} + +static int knc_pmu_handle_irq(struct pt_regs *regs) +{ + struct perf_sample_data data; + struct cpu_hw_events *cpuc; + int handled = 0; + int bit, loops; + u64 status; + + cpuc = &__get_cpu_var(cpu_hw_events); + + knc_pmu_disable_all(); + + status = knc_pmu_get_status(); + if (!status) { + knc_pmu_enable_all(0); + return handled; + } + + loops = 0; +again: + knc_pmu_ack_status(status); + if (++loops > 100) { + WARN_ONCE(1, "perf: irq loop stuck!\n"); + perf_event_print_debug(); + goto done; + } + + inc_irq_stat(apic_perf_irqs); + + for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { + struct perf_event *event = cpuc->events[bit]; + + handled++; + + if (!test_bit(bit, cpuc->active_mask)) + continue; + + if (!intel_pmu_save_and_restart(event)) + continue; + + perf_sample_data_init(&data, 0, event->hw.last_period); + + if (perf_event_overflow(event, &data, regs)) + x86_pmu_stop(event, 0); + } + + /* + * Repeat if there is more work to be done: + */ + status = knc_pmu_get_status(); + if (status) + goto again; + +done: + knc_pmu_enable_all(0); + + return handled; +} + + +PMU_FORMAT_ATTR(event, "config:0-7" ); +PMU_FORMAT_ATTR(umask, "config:8-15" ); +PMU_FORMAT_ATTR(edge, "config:18" ); +PMU_FORMAT_ATTR(inv, "config:23" ); +PMU_FORMAT_ATTR(cmask, "config:24-31" ); + +static struct attribute *intel_knc_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_cmask.attr, + NULL, +}; + +static const struct x86_pmu knc_pmu __initconst = { + .name = "knc", + .handle_irq = knc_pmu_handle_irq, + .disable_all = knc_pmu_disable_all, + .enable_all = knc_pmu_enable_all, + .enable = knc_pmu_enable_event, + .disable = knc_pmu_disable_event, + .hw_config = x86_pmu_hw_config, + .schedule_events = x86_schedule_events, + .eventsel = MSR_KNC_EVNTSEL0, + .perfctr = MSR_KNC_PERFCTR0, + .event_map = knc_pmu_event_map, + .max_events = ARRAY_SIZE(knc_perfmon_event_map), + .apic = 1, + .max_period = (1ULL << 39) - 1, + .version = 0, + 
.num_counters = 2, + .cntval_bits = 40, + .cntval_mask = (1ULL << 40) - 1, + .get_event_constraints = x86_get_event_constraints, + .event_constraints = knc_event_constraints, + .format_attrs = intel_knc_formats_attr, +}; + +__init int knc_pmu_init(void) +{ + x86_pmu = knc_pmu; + + memcpy(hw_cache_event_ids, knc_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + return 0; +} diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c new file mode 100644 index 00000000000..5d466b7d860 --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -0,0 +1,1376 @@ +/* + * Netburst Performance Events (P4, old Xeon) + * + * Copyright (C) 2010 Parallels, Inc., Cyrill Gorcunov <gorcunov@openvz.org> + * Copyright (C) 2010 Intel Corporation, Lin Ming <ming.m.lin@intel.com> + * + * For licencing details see kernel-base/COPYING + */ + +#include <linux/perf_event.h> + +#include <asm/perf_event_p4.h> +#include <asm/hardirq.h> +#include <asm/apic.h> + +#include "perf_event.h" + +#define P4_CNTR_LIMIT 3 +/* + * array indices: 0,1 - HT threads, used with HT enabled cpu + */ +struct p4_event_bind { + unsigned int opcode; /* Event code and ESCR selector */ + unsigned int escr_msr[2]; /* ESCR MSR for this event */ + unsigned int escr_emask; /* valid ESCR EventMask bits */ + unsigned int shared; /* event is shared across threads */ + char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on abscence */ +}; + +struct p4_pebs_bind { + unsigned int metric_pebs; + unsigned int metric_vert; +}; + +/* it sets P4_PEBS_ENABLE_UOP_TAG as well */ +#define P4_GEN_PEBS_BIND(name, pebs, vert) \ + [P4_PEBS_METRIC__##name] = { \ + .metric_pebs = pebs | P4_PEBS_ENABLE_UOP_TAG, \ + .metric_vert = vert, \ + } + +/* + * note we have P4_PEBS_ENABLE_UOP_TAG always set here + * + * it's needed for mapping P4_PEBS_CONFIG_METRIC_MASK bits of + * event configuration to find out which values are to be + * written into MSR_IA32_PEBS_ENABLE and MSR_P4_PEBS_MATRIX_VERT + * resgisters + */ +static struct p4_pebs_bind p4_pebs_bind_map[] = { + P4_GEN_PEBS_BIND(1stl_cache_load_miss_retired, 0x0000001, 0x0000001), + P4_GEN_PEBS_BIND(2ndl_cache_load_miss_retired, 0x0000002, 0x0000001), + P4_GEN_PEBS_BIND(dtlb_load_miss_retired, 0x0000004, 0x0000001), + P4_GEN_PEBS_BIND(dtlb_store_miss_retired, 0x0000004, 0x0000002), + P4_GEN_PEBS_BIND(dtlb_all_miss_retired, 0x0000004, 0x0000003), + P4_GEN_PEBS_BIND(tagged_mispred_branch, 0x0018000, 0x0000010), + P4_GEN_PEBS_BIND(mob_load_replay_retired, 0x0000200, 0x0000001), + P4_GEN_PEBS_BIND(split_load_retired, 0x0000400, 0x0000001), + P4_GEN_PEBS_BIND(split_store_retired, 0x0000400, 0x0000002), +}; + +/* + * Note that we don't use CCCR1 here, there is an + * exception for P4_BSQ_ALLOCATION but we just have + * no workaround + * + * consider this binding as resources which particular + * event may borrow, it doesn't contain EventMask, + * Tags and friends -- they are left to a caller + */ +static struct p4_event_bind p4_event_bind_map[] = { + [P4_EVENT_TC_DELIVER_MODE] = { + .opcode = P4_OPCODE(P4_EVENT_TC_DELIVER_MODE), + .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DD) | + P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DB) | + P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DI) | + P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BD) | + P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BB) | + P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BI) | + P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, ID), + .shared = 1, + .cntr = { {4, 5, 
-1}, {6, 7, -1} }, + }, + [P4_EVENT_BPU_FETCH_REQUEST] = { + .opcode = P4_OPCODE(P4_EVENT_BPU_FETCH_REQUEST), + .escr_msr = { MSR_P4_BPU_ESCR0, MSR_P4_BPU_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_BPU_FETCH_REQUEST, TCMISS), + .cntr = { {0, -1, -1}, {2, -1, -1} }, + }, + [P4_EVENT_ITLB_REFERENCE] = { + .opcode = P4_OPCODE(P4_EVENT_ITLB_REFERENCE), + .escr_msr = { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, HIT) | + P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, MISS) | + P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, HIT_UK), + .cntr = { {0, -1, -1}, {2, -1, -1} }, + }, + [P4_EVENT_MEMORY_CANCEL] = { + .opcode = P4_OPCODE(P4_EVENT_MEMORY_CANCEL), + .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_CANCEL, ST_RB_FULL) | + P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_CANCEL, 64K_CONF), + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_MEMORY_COMPLETE] = { + .opcode = P4_OPCODE(P4_EVENT_MEMORY_COMPLETE), + .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_COMPLETE, LSC) | + P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_COMPLETE, SSC), + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_LOAD_PORT_REPLAY] = { + .opcode = P4_OPCODE(P4_EVENT_LOAD_PORT_REPLAY), + .escr_msr = { MSR_P4_SAAT_ESCR0, MSR_P4_SAAT_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_LOAD_PORT_REPLAY, SPLIT_LD), + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_STORE_PORT_REPLAY] = { + .opcode = P4_OPCODE(P4_EVENT_STORE_PORT_REPLAY), + .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_STORE_PORT_REPLAY, SPLIT_ST), + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_MOB_LOAD_REPLAY] = { + .opcode = P4_OPCODE(P4_EVENT_MOB_LOAD_REPLAY), + .escr_msr = { MSR_P4_MOB_ESCR0, MSR_P4_MOB_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, NO_STA) | + P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, NO_STD) | + P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, PARTIAL_DATA) | + P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, UNALGN_ADDR), + .cntr = { {0, -1, -1}, {2, -1, -1} }, + }, + [P4_EVENT_PAGE_WALK_TYPE] = { + .opcode = P4_OPCODE(P4_EVENT_PAGE_WALK_TYPE), + .escr_msr = { MSR_P4_PMH_ESCR0, MSR_P4_PMH_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_PAGE_WALK_TYPE, DTMISS) | + P4_ESCR_EMASK_BIT(P4_EVENT_PAGE_WALK_TYPE, ITMISS), + .shared = 1, + .cntr = { {0, -1, -1}, {2, -1, -1} }, + }, + [P4_EVENT_BSQ_CACHE_REFERENCE] = { + .opcode = P4_OPCODE(P4_EVENT_BSQ_CACHE_REFERENCE), + .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS), + .cntr = { {0, -1, -1}, {2, -1, -1} }, + }, + [P4_EVENT_IOQ_ALLOCATION] = { + .opcode = P4_OPCODE(P4_EVENT_IOQ_ALLOCATION), + .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, DEFAULT) | + 
P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, ALL_READ) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, ALL_WRITE) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_UC) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WC) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WT) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WP) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WB) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, OWN) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, OTHER) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, PREFETCH), + .cntr = { {0, -1, -1}, {2, -1, -1} }, + }, + [P4_EVENT_IOQ_ACTIVE_ENTRIES] = { /* shared ESCR */ + .opcode = P4_OPCODE(P4_EVENT_IOQ_ACTIVE_ENTRIES), + .escr_msr = { MSR_P4_FSB_ESCR1, MSR_P4_FSB_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, DEFAULT) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_READ) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_WRITE) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_UC) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WC) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WT) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WP) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WB) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, OWN) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, OTHER) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, PREFETCH), + .cntr = { {2, -1, -1}, {3, -1, -1} }, + }, + [P4_EVENT_FSB_DATA_ACTIVITY] = { + .opcode = P4_OPCODE(P4_EVENT_FSB_DATA_ACTIVITY), + .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV) | + P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN) | + P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OTHER) | + P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_DRV) | + P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OWN) | + P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OTHER), + .shared = 1, + .cntr = { {0, -1, -1}, {2, -1, -1} }, + }, + [P4_EVENT_BSQ_ALLOCATION] = { /* shared ESCR, broken CCCR1 */ + .opcode = P4_OPCODE(P4_EVENT_BSQ_ALLOCATION), + .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR0 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE0) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE1) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LEN0) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LEN1) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_IO_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LOCK_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_CACHE_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_SPLIT_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_DEM_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_ORD_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE0) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE1) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE2), + .cntr = { {0, -1, -1}, {1, -1, -1} }, + }, + [P4_EVENT_BSQ_ACTIVE_ENTRIES] = { /* shared ESCR */ + .opcode = P4_OPCODE(P4_EVENT_BSQ_ACTIVE_ENTRIES), + .escr_msr = { MSR_P4_BSU_ESCR1 , MSR_P4_BSU_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE0) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE1) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN0) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN1) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_IO_TYPE) | + 
P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LOCK_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_CACHE_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_SPLIT_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_DEM_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_ORD_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE0) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE1) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE2), + .cntr = { {2, -1, -1}, {3, -1, -1} }, + }, + [P4_EVENT_SSE_INPUT_ASSIST] = { + .opcode = P4_OPCODE(P4_EVENT_SSE_INPUT_ASSIST), + .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_SSE_INPUT_ASSIST, ALL), + .shared = 1, + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_PACKED_SP_UOP] = { + .opcode = P4_OPCODE(P4_EVENT_PACKED_SP_UOP), + .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_PACKED_SP_UOP, ALL), + .shared = 1, + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_PACKED_DP_UOP] = { + .opcode = P4_OPCODE(P4_EVENT_PACKED_DP_UOP), + .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_PACKED_DP_UOP, ALL), + .shared = 1, + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_SCALAR_SP_UOP] = { + .opcode = P4_OPCODE(P4_EVENT_SCALAR_SP_UOP), + .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_SCALAR_SP_UOP, ALL), + .shared = 1, + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_SCALAR_DP_UOP] = { + .opcode = P4_OPCODE(P4_EVENT_SCALAR_DP_UOP), + .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_SCALAR_DP_UOP, ALL), + .shared = 1, + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_64BIT_MMX_UOP] = { + .opcode = P4_OPCODE(P4_EVENT_64BIT_MMX_UOP), + .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_64BIT_MMX_UOP, ALL), + .shared = 1, + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_128BIT_MMX_UOP] = { + .opcode = P4_OPCODE(P4_EVENT_128BIT_MMX_UOP), + .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_128BIT_MMX_UOP, ALL), + .shared = 1, + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_X87_FP_UOP] = { + .opcode = P4_OPCODE(P4_EVENT_X87_FP_UOP), + .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_X87_FP_UOP, ALL), + .shared = 1, + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_TC_MISC] = { + .opcode = P4_OPCODE(P4_EVENT_TC_MISC), + .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_TC_MISC, FLUSH), + .cntr = { {4, 5, -1}, {6, 7, -1} }, + }, + [P4_EVENT_GLOBAL_POWER_EVENTS] = { + .opcode = P4_OPCODE(P4_EVENT_GLOBAL_POWER_EVENTS), + .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING), + .cntr = { {0, -1, -1}, {2, -1, -1} }, + }, + [P4_EVENT_TC_MS_XFER] = { + .opcode = P4_OPCODE(P4_EVENT_TC_MS_XFER), + .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_TC_MS_XFER, CISC), + .cntr = { {4, 5, -1}, {6, 7, -1} }, + }, + [P4_EVENT_UOP_QUEUE_WRITES] = { + .opcode = P4_OPCODE(P4_EVENT_UOP_QUEUE_WRITES), + .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 }, + .escr_emask = + 
P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_BUILD) | + P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_DELIVER) | + P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_ROM), + .cntr = { {4, 5, -1}, {6, 7, -1} }, + }, + [P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE] = { + .opcode = P4_OPCODE(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE), + .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR0 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CONDITIONAL) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CALL) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, RETURN) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, INDIRECT), + .cntr = { {4, 5, -1}, {6, 7, -1} }, + }, + [P4_EVENT_RETIRED_BRANCH_TYPE] = { + .opcode = P4_OPCODE(P4_EVENT_RETIRED_BRANCH_TYPE), + .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CALL) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT), + .cntr = { {4, 5, -1}, {6, 7, -1} }, + }, + [P4_EVENT_RESOURCE_STALL] = { + .opcode = P4_OPCODE(P4_EVENT_RESOURCE_STALL), + .escr_msr = { MSR_P4_ALF_ESCR0, MSR_P4_ALF_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_RESOURCE_STALL, SBFULL), + .cntr = { {12, 13, 16}, {14, 15, 17} }, + }, + [P4_EVENT_WC_BUFFER] = { + .opcode = P4_OPCODE(P4_EVENT_WC_BUFFER), + .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_WC_BUFFER, WCB_EVICTS) | + P4_ESCR_EMASK_BIT(P4_EVENT_WC_BUFFER, WCB_FULL_EVICTS), + .shared = 1, + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_B2B_CYCLES] = { + .opcode = P4_OPCODE(P4_EVENT_B2B_CYCLES), + .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, + .escr_emask = 0, + .cntr = { {0, -1, -1}, {2, -1, -1} }, + }, + [P4_EVENT_BNR] = { + .opcode = P4_OPCODE(P4_EVENT_BNR), + .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, + .escr_emask = 0, + .cntr = { {0, -1, -1}, {2, -1, -1} }, + }, + [P4_EVENT_SNOOP] = { + .opcode = P4_OPCODE(P4_EVENT_SNOOP), + .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, + .escr_emask = 0, + .cntr = { {0, -1, -1}, {2, -1, -1} }, + }, + [P4_EVENT_RESPONSE] = { + .opcode = P4_OPCODE(P4_EVENT_RESPONSE), + .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, + .escr_emask = 0, + .cntr = { {0, -1, -1}, {2, -1, -1} }, + }, + [P4_EVENT_FRONT_END_EVENT] = { + .opcode = P4_OPCODE(P4_EVENT_FRONT_END_EVENT), + .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_FRONT_END_EVENT, NBOGUS) | + P4_ESCR_EMASK_BIT(P4_EVENT_FRONT_END_EVENT, BOGUS), + .cntr = { {12, 13, 16}, {14, 15, 17} }, + }, + [P4_EVENT_EXECUTION_EVENT] = { + .opcode = P4_OPCODE(P4_EVENT_EXECUTION_EVENT), + .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3), + .cntr = { {12, 13, 16}, {14, 15, 17} }, + }, + [P4_EVENT_REPLAY_EVENT] = { + .opcode = P4_OPCODE(P4_EVENT_REPLAY_EVENT), + .escr_msr = { MSR_P4_CRU_ESCR2, 
MSR_P4_CRU_ESCR3 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, NBOGUS) | + P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, BOGUS), + .cntr = { {12, 13, 16}, {14, 15, 17} }, + }, + [P4_EVENT_INSTR_RETIRED] = { + .opcode = P4_OPCODE(P4_EVENT_INSTR_RETIRED), + .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG) | + P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSTAG) | + P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSNTAG) | + P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSTAG), + .cntr = { {12, 13, 16}, {14, 15, 17} }, + }, + [P4_EVENT_UOPS_RETIRED] = { + .opcode = P4_OPCODE(P4_EVENT_UOPS_RETIRED), + .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_UOPS_RETIRED, NBOGUS) | + P4_ESCR_EMASK_BIT(P4_EVENT_UOPS_RETIRED, BOGUS), + .cntr = { {12, 13, 16}, {14, 15, 17} }, + }, + [P4_EVENT_UOP_TYPE] = { + .opcode = P4_OPCODE(P4_EVENT_UOP_TYPE), + .escr_msr = { MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_UOP_TYPE, TAGLOADS) | + P4_ESCR_EMASK_BIT(P4_EVENT_UOP_TYPE, TAGSTORES), + .cntr = { {12, 13, 16}, {14, 15, 17} }, + }, + [P4_EVENT_BRANCH_RETIRED] = { + .opcode = P4_OPCODE(P4_EVENT_BRANCH_RETIRED), + .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMNP) | + P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMNM) | + P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMTP) | + P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMTM), + .cntr = { {12, 13, 16}, {14, 15, 17} }, + }, + [P4_EVENT_MISPRED_BRANCH_RETIRED] = { + .opcode = P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED), + .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS), + .cntr = { {12, 13, 16}, {14, 15, 17} }, + }, + [P4_EVENT_X87_ASSIST] = { + .opcode = P4_OPCODE(P4_EVENT_X87_ASSIST), + .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, FPSU) | + P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, FPSO) | + P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, POAO) | + P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, POAU) | + P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, PREA), + .cntr = { {12, 13, 16}, {14, 15, 17} }, + }, + [P4_EVENT_MACHINE_CLEAR] = { + .opcode = P4_OPCODE(P4_EVENT_MACHINE_CLEAR), + .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, CLEAR) | + P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, MOCLEAR) | + P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, SMCLEAR), + .cntr = { {12, 13, 16}, {14, 15, 17} }, + }, + [P4_EVENT_INSTR_COMPLETED] = { + .opcode = P4_OPCODE(P4_EVENT_INSTR_COMPLETED), + .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_COMPLETED, NBOGUS) | + P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_COMPLETED, BOGUS), + .cntr = { {12, 13, 16}, {14, 15, 17} }, + }, +}; + +#define P4_GEN_CACHE_EVENT(event, bit, metric) \ + p4_config_pack_escr(P4_ESCR_EVENT(event) | \ + P4_ESCR_EMASK_BIT(event, bit)) | \ + p4_config_pack_cccr(metric | \ + P4_CCCR_ESEL(P4_OPCODE_ESEL(P4_OPCODE(event)))) + +static __initconst const u64 p4_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, + P4_PEBS_METRIC__1stl_cache_load_miss_retired), + }, + }, + 
[ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, + P4_PEBS_METRIC__2ndl_cache_load_miss_retired), + }, +}, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, + P4_PEBS_METRIC__dtlb_load_miss_retired), + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, + P4_PEBS_METRIC__dtlb_store_miss_retired), + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, HIT, + P4_PEBS_METRIC__none), + [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, MISS, + P4_PEBS_METRIC__none), + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +/* + * Because of Netburst being quite restricted in how many + * identical events may run simultaneously, we introduce event aliases, + * ie the different events which have the same functionality but + * utilize non-intersected resources (ESCR/CCCR/counter registers). + * + * This allow us to relax restrictions a bit and run two or more + * identical events together. + * + * Never set any custom internal bits such as P4_CONFIG_HT, + * P4_CONFIG_ALIASABLE or bits for P4_PEBS_METRIC, they are + * either up to date automatically or not applicable at all. + */ +struct p4_event_alias { + u64 original; + u64 alternative; +} p4_event_aliases[] = { + { + /* + * Non-halted cycles can be substituted with non-sleeping cycles (see + * Intel SDM Vol3b for details). We need this alias to be able + * to run nmi-watchdog and 'perf top' (or any other user space tool + * which is interested in running PERF_COUNT_HW_CPU_CYCLES) + * simultaneously. + */ + .original = + p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) | + P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)), + .alternative = + p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_EXECUTION_EVENT) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0)| + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1)| + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2)| + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3)| + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3))| + p4_config_pack_cccr(P4_CCCR_THRESHOLD(15) | P4_CCCR_COMPLEMENT | + P4_CCCR_COMPARE), + }, +}; + +static u64 p4_get_alias_event(u64 config) +{ + u64 config_match; + int i; + + /* + * Only event with special mark is allowed, + * we're to be sure it didn't come as malformed + * RAW event. 
+ */ + if (!(config & P4_CONFIG_ALIASABLE)) + return 0; + + config_match = config & P4_CONFIG_EVENT_ALIAS_MASK; + + for (i = 0; i < ARRAY_SIZE(p4_event_aliases); i++) { + if (config_match == p4_event_aliases[i].original) { + config_match = p4_event_aliases[i].alternative; + break; + } else if (config_match == p4_event_aliases[i].alternative) { + config_match = p4_event_aliases[i].original; + break; + } + } + + if (i >= ARRAY_SIZE(p4_event_aliases)) + return 0; + + return config_match | (config & P4_CONFIG_EVENT_ALIAS_IMMUTABLE_BITS); +} + +static u64 p4_general_events[PERF_COUNT_HW_MAX] = { + /* non-halted CPU clocks */ + [PERF_COUNT_HW_CPU_CYCLES] = + p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) | + P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)) | + P4_CONFIG_ALIASABLE, + + /* + * retired instructions + * in a sake of simplicity we don't use the FSB tagging + */ + [PERF_COUNT_HW_INSTRUCTIONS] = + p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_INSTR_RETIRED) | + P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG) | + P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSNTAG)), + + /* cache hits */ + [PERF_COUNT_HW_CACHE_REFERENCES] = + p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_BSQ_CACHE_REFERENCE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM)), + + /* cache misses */ + [PERF_COUNT_HW_CACHE_MISSES] = + p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_BSQ_CACHE_REFERENCE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS)), + + /* branch instructions retired */ + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = + p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_RETIRED_BRANCH_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CALL) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT)), + + /* mispredicted branches retired */ + [PERF_COUNT_HW_BRANCH_MISSES] = + p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_MISPRED_BRANCH_RETIRED) | + P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS)), + + /* bus ready clocks (cpu is driving #DRDY_DRV\#DRDY_OWN): */ + [PERF_COUNT_HW_BUS_CYCLES] = + p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_FSB_DATA_ACTIVITY) | + P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV) | + P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN)) | + p4_config_pack_cccr(P4_CCCR_EDGE | P4_CCCR_COMPARE), +}; + +static struct p4_event_bind *p4_config_get_bind(u64 config) +{ + unsigned int evnt = p4_config_unpack_event(config); + struct p4_event_bind *bind = NULL; + + if (evnt < ARRAY_SIZE(p4_event_bind_map)) + bind = &p4_event_bind_map[evnt]; + + return bind; +} + +static u64 p4_pmu_event_map(int hw_event) +{ + struct p4_event_bind *bind; + unsigned int esel; + u64 config; + + config = p4_general_events[hw_event]; + bind = p4_config_get_bind(config); + esel = P4_OPCODE_ESEL(bind->opcode); + config |= p4_config_pack_cccr(P4_CCCR_ESEL(esel)); + + return config; +} + +/* check cpu model specifics */ +static bool p4_event_match_cpu_model(unsigned int 
event_idx) +{ + /* INSTR_COMPLETED event only exist for model 3, 4, 6 (Prescott) */ + if (event_idx == P4_EVENT_INSTR_COMPLETED) { + if (boot_cpu_data.x86_model != 3 && + boot_cpu_data.x86_model != 4 && + boot_cpu_data.x86_model != 6) + return false; + } + + /* + * For info + * - IQ_ESCR0, IQ_ESCR1 only for models 1 and 2 + */ + + return true; +} + +static int p4_validate_raw_event(struct perf_event *event) +{ + unsigned int v, emask; + + /* User data may have out-of-bound event index */ + v = p4_config_unpack_event(event->attr.config); + if (v >= ARRAY_SIZE(p4_event_bind_map)) + return -EINVAL; + + /* It may be unsupported: */ + if (!p4_event_match_cpu_model(v)) + return -EINVAL; + + /* + * NOTE: P4_CCCR_THREAD_ANY has not the same meaning as + * in Architectural Performance Monitoring, it means not + * on _which_ logical cpu to count but rather _when_, ie it + * depends on logical cpu state -- count event if one cpu active, + * none, both or any, so we just allow user to pass any value + * desired. + * + * In turn we always set Tx_OS/Tx_USR bits bound to logical + * cpu without their propagation to another cpu + */ + + /* + * if an event is shared across the logical threads + * the user needs special permissions to be able to use it + */ + if (p4_ht_active() && p4_event_bind_map[v].shared) { + if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) + return -EACCES; + } + + /* ESCR EventMask bits may be invalid */ + emask = p4_config_unpack_escr(event->attr.config) & P4_ESCR_EVENTMASK_MASK; + if (emask & ~p4_event_bind_map[v].escr_emask) + return -EINVAL; + + /* + * it may have some invalid PEBS bits + */ + if (p4_config_pebs_has(event->attr.config, P4_PEBS_CONFIG_ENABLE)) + return -EINVAL; + + v = p4_config_unpack_metric(event->attr.config); + if (v >= ARRAY_SIZE(p4_pebs_bind_map)) + return -EINVAL; + + return 0; +} + +static int p4_hw_config(struct perf_event *event) +{ + int cpu = get_cpu(); + int rc = 0; + u32 escr, cccr; + + /* + * the reason we use cpu that early is that: if we get scheduled + * first time on the same cpu -- we will not need swap thread + * specific flags in config (and will save some cpu cycles) + */ + + cccr = p4_default_cccr_conf(cpu); + escr = p4_default_escr_conf(cpu, event->attr.exclude_kernel, + event->attr.exclude_user); + event->hw.config = p4_config_pack_escr(escr) | + p4_config_pack_cccr(cccr); + + if (p4_ht_active() && p4_ht_thread(cpu)) + event->hw.config = p4_set_ht_bit(event->hw.config); + + if (event->attr.type == PERF_TYPE_RAW) { + struct p4_event_bind *bind; + unsigned int esel; + /* + * Clear bits we reserve to be managed by kernel itself + * and never allowed from a user space + */ + event->attr.config &= P4_CONFIG_MASK; + + rc = p4_validate_raw_event(event); + if (rc) + goto out; + + /* + * Note that for RAW events we allow user to use P4_CCCR_RESERVED + * bits since we keep additional info here (for cache events and etc) + */ + event->hw.config |= event->attr.config; + bind = p4_config_get_bind(event->attr.config); + if (!bind) { + rc = -EINVAL; + goto out; + } + esel = P4_OPCODE_ESEL(bind->opcode); + event->hw.config |= p4_config_pack_cccr(P4_CCCR_ESEL(esel)); + } + + rc = x86_setup_perfctr(event); +out: + put_cpu(); + return rc; +} + +static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc) +{ + u64 v; + + /* an official way for overflow indication */ + rdmsrl(hwc->config_base, v); + if (v & P4_CCCR_OVF) { + wrmsrl(hwc->config_base, v & ~P4_CCCR_OVF); + return 1; + } + + /* + * In some circumstances the overflow might issue an NMI but 
did + * not set P4_CCCR_OVF bit. Because a counter holds a negative value + * we simply check for high bit being set, if it's cleared it means + * the counter has reached zero value and continued counting before + * real NMI signal was received: + */ + rdmsrl(hwc->event_base, v); + if (!(v & ARCH_P4_UNFLAGGED_BIT)) + return 1; + + return 0; +} + +static void p4_pmu_disable_pebs(void) +{ + /* + * FIXME + * + * It's still allowed that two threads setup same cache + * events so we can't simply clear metrics until we knew + * no one is depending on us, so we need kind of counter + * for "ReplayEvent" users. + * + * What is more complex -- RAW events, if user (for some + * reason) will pass some cache event metric with improper + * event opcode -- it's fine from hardware point of view + * but completely nonsense from "meaning" of such action. + * + * So at moment let leave metrics turned on forever -- it's + * ok for now but need to be revisited! + * + * (void)wrmsrl_safe(MSR_IA32_PEBS_ENABLE, 0); + * (void)wrmsrl_safe(MSR_P4_PEBS_MATRIX_VERT, 0); + */ +} + +static inline void p4_pmu_disable_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + /* + * If event gets disabled while counter is in overflowed + * state we need to clear P4_CCCR_OVF, otherwise interrupt get + * asserted again and again + */ + (void)wrmsrl_safe(hwc->config_base, + p4_config_unpack_cccr(hwc->config) & ~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED); +} + +static void p4_pmu_disable_all(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + int idx; + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + struct perf_event *event = cpuc->events[idx]; + if (!test_bit(idx, cpuc->active_mask)) + continue; + p4_pmu_disable_event(event); + } + + p4_pmu_disable_pebs(); +} + +/* configuration must be valid */ +static void p4_pmu_enable_pebs(u64 config) +{ + struct p4_pebs_bind *bind; + unsigned int idx; + + BUILD_BUG_ON(P4_PEBS_METRIC__max > P4_PEBS_CONFIG_METRIC_MASK); + + idx = p4_config_unpack_metric(config); + if (idx == P4_PEBS_METRIC__none) + return; + + bind = &p4_pebs_bind_map[idx]; + + (void)wrmsrl_safe(MSR_IA32_PEBS_ENABLE, (u64)bind->metric_pebs); + (void)wrmsrl_safe(MSR_P4_PEBS_MATRIX_VERT, (u64)bind->metric_vert); +} + +static void p4_pmu_enable_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + int thread = p4_ht_config_thread(hwc->config); + u64 escr_conf = p4_config_unpack_escr(p4_clear_ht_bit(hwc->config)); + unsigned int idx = p4_config_unpack_event(hwc->config); + struct p4_event_bind *bind; + u64 escr_addr, cccr; + + bind = &p4_event_bind_map[idx]; + escr_addr = bind->escr_msr[thread]; + + /* + * - we dont support cascaded counters yet + * - and counter 1 is broken (erratum) + */ + WARN_ON_ONCE(p4_is_event_cascaded(hwc->config)); + WARN_ON_ONCE(hwc->idx == 1); + + /* we need a real Event value */ + escr_conf &= ~P4_ESCR_EVENT_MASK; + escr_conf |= P4_ESCR_EVENT(P4_OPCODE_EVNT(bind->opcode)); + + cccr = p4_config_unpack_cccr(hwc->config); + + /* + * it could be Cache event so we need to write metrics + * into additional MSRs + */ + p4_pmu_enable_pebs(hwc->config); + + (void)wrmsrl_safe(escr_addr, escr_conf); + (void)wrmsrl_safe(hwc->config_base, + (cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE); +} + +static void p4_pmu_enable_all(int added) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + int idx; + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + struct perf_event *event = cpuc->events[idx]; + if (!test_bit(idx, 
cpuc->active_mask)) + continue; + p4_pmu_enable_event(event); + } +} + +static int p4_pmu_handle_irq(struct pt_regs *regs) +{ + struct perf_sample_data data; + struct cpu_hw_events *cpuc; + struct perf_event *event; + struct hw_perf_event *hwc; + int idx, handled = 0; + u64 val; + + cpuc = &__get_cpu_var(cpu_hw_events); + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + int overflow; + + if (!test_bit(idx, cpuc->active_mask)) { + /* catch in-flight IRQs */ + if (__test_and_clear_bit(idx, cpuc->running)) + handled++; + continue; + } + + event = cpuc->events[idx]; + hwc = &event->hw; + + WARN_ON_ONCE(hwc->idx != idx); + + /* it might be unflagged overflow */ + overflow = p4_pmu_clear_cccr_ovf(hwc); + + val = x86_perf_event_update(event); + if (!overflow && (val & (1ULL << (x86_pmu.cntval_bits - 1)))) + continue; + + handled += overflow; + + /* event overflow for sure */ + perf_sample_data_init(&data, 0, hwc->last_period); + + if (!x86_perf_event_set_period(event)) + continue; + + + if (perf_event_overflow(event, &data, regs)) + x86_pmu_stop(event, 0); + } + + if (handled) + inc_irq_stat(apic_perf_irqs); + + /* + * When dealing with the unmasking of the LVTPC on P4 perf hw, it has + * been observed that the OVF bit flag has to be cleared first _before_ + * the LVTPC can be unmasked. + * + * The reason is the NMI line will continue to be asserted while the OVF + * bit is set. This causes a second NMI to generate if the LVTPC is + * unmasked before the OVF bit is cleared, leading to unknown NMI + * messages. + */ + apic_write(APIC_LVTPC, APIC_DM_NMI); + + return handled; +} + +/* + * swap thread specific fields according to a thread + * we are going to run on + */ +static void p4_pmu_swap_config_ts(struct hw_perf_event *hwc, int cpu) +{ + u32 escr, cccr; + + /* + * we either lucky and continue on same cpu or no HT support + */ + if (!p4_should_swap_ts(hwc->config, cpu)) + return; + + /* + * the event is migrated from an another logical + * cpu, so we need to swap thread specific flags + */ + + escr = p4_config_unpack_escr(hwc->config); + cccr = p4_config_unpack_cccr(hwc->config); + + if (p4_ht_thread(cpu)) { + cccr &= ~P4_CCCR_OVF_PMI_T0; + cccr |= P4_CCCR_OVF_PMI_T1; + if (escr & P4_ESCR_T0_OS) { + escr &= ~P4_ESCR_T0_OS; + escr |= P4_ESCR_T1_OS; + } + if (escr & P4_ESCR_T0_USR) { + escr &= ~P4_ESCR_T0_USR; + escr |= P4_ESCR_T1_USR; + } + hwc->config = p4_config_pack_escr(escr); + hwc->config |= p4_config_pack_cccr(cccr); + hwc->config |= P4_CONFIG_HT; + } else { + cccr &= ~P4_CCCR_OVF_PMI_T1; + cccr |= P4_CCCR_OVF_PMI_T0; + if (escr & P4_ESCR_T1_OS) { + escr &= ~P4_ESCR_T1_OS; + escr |= P4_ESCR_T0_OS; + } + if (escr & P4_ESCR_T1_USR) { + escr &= ~P4_ESCR_T1_USR; + escr |= P4_ESCR_T0_USR; + } + hwc->config = p4_config_pack_escr(escr); + hwc->config |= p4_config_pack_cccr(cccr); + hwc->config &= ~P4_CONFIG_HT; + } +} + +/* + * ESCR address hashing is tricky, ESCRs are not sequential + * in memory but all starts from MSR_P4_BSU_ESCR0 (0x03a0) and + * the metric between any ESCRs is laid in range [0xa0,0xe1] + * + * so we make ~70% filled hashtable + */ + +#define P4_ESCR_MSR_BASE 0x000003a0 +#define P4_ESCR_MSR_MAX 0x000003e1 +#define P4_ESCR_MSR_TABLE_SIZE (P4_ESCR_MSR_MAX - P4_ESCR_MSR_BASE + 1) +#define P4_ESCR_MSR_IDX(msr) (msr - P4_ESCR_MSR_BASE) +#define P4_ESCR_MSR_TABLE_ENTRY(msr) [P4_ESCR_MSR_IDX(msr)] = msr + +static const unsigned int p4_escr_table[P4_ESCR_MSR_TABLE_SIZE] = { + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ALF_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ALF_ESCR1), + 
P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BPU_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BPU_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BSU_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BSU_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR2), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR3), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR4), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR5), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_DAC_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_DAC_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FIRM_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FIRM_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FLAME_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FLAME_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FSB_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FSB_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IQ_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IQ_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IS_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IS_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ITLB_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ITLB_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IX_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IX_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MOB_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MOB_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MS_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MS_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_PMH_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_PMH_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_RAT_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_RAT_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SAAT_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SAAT_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SSU_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SSU_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TBPU_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TBPU_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TC_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TC_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_U2L_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_U2L_ESCR1), +}; + +static int p4_get_escr_idx(unsigned int addr) +{ + unsigned int idx = P4_ESCR_MSR_IDX(addr); + + if (unlikely(idx >= P4_ESCR_MSR_TABLE_SIZE || + !p4_escr_table[idx] || + p4_escr_table[idx] != addr)) { + WARN_ONCE(1, "P4 PMU: Wrong address passed: %x\n", addr); + return -1; + } + + return idx; +} + +static int p4_next_cntr(int thread, unsigned long *used_mask, + struct p4_event_bind *bind) +{ + int i, j; + + for (i = 0; i < P4_CNTR_LIMIT; i++) { + j = bind->cntr[thread][i]; + if (j != -1 && !test_bit(j, used_mask)) + return j; + } + + return -1; +} + +static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) +{ + unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + unsigned long escr_mask[BITS_TO_LONGS(P4_ESCR_MSR_TABLE_SIZE)]; + int cpu = smp_processor_id(); + struct hw_perf_event *hwc; + struct p4_event_bind *bind; + unsigned int i, thread, num; + int cntr_idx, escr_idx; + u64 config_alias; + int pass; + + bitmap_zero(used_mask, X86_PMC_IDX_MAX); + bitmap_zero(escr_mask, P4_ESCR_MSR_TABLE_SIZE); + + for (i = 0, num = n; i < n; i++, num--) { + + hwc = &cpuc->event_list[i]->hw; + thread = p4_ht_thread(cpu); + pass = 0; + +again: + /* + * It's possible to hit a circular lock + * between original and alternative events + * if both are scheduled already. 
+ */ + if (pass > 2) + goto done; + + bind = p4_config_get_bind(hwc->config); + escr_idx = p4_get_escr_idx(bind->escr_msr[thread]); + if (unlikely(escr_idx == -1)) + goto done; + + if (hwc->idx != -1 && !p4_should_swap_ts(hwc->config, cpu)) { + cntr_idx = hwc->idx; + if (assign) + assign[i] = hwc->idx; + goto reserve; + } + + cntr_idx = p4_next_cntr(thread, used_mask, bind); + if (cntr_idx == -1 || test_bit(escr_idx, escr_mask)) { + /* + * Check whether an event alias is still available. + */ + config_alias = p4_get_alias_event(hwc->config); + if (!config_alias) + goto done; + hwc->config = config_alias; + pass++; + goto again; + } + /* + * Perf does test runs to see if a whole group can be assigned + * together succesfully. There can be multiple rounds of this. + * Unfortunately, p4_pmu_swap_config_ts touches the hwc->config + * bits, such that the next round of group assignments will + * cause the above p4_should_swap_ts to pass instead of fail. + * This leads to counters exclusive to thread0 being used by + * thread1. + * + * Solve this with a cheap hack, reset the idx back to -1 to + * force a new lookup (p4_next_cntr) to get the right counter + * for the right thread. + * + * This probably doesn't comply with the general spirit of how + * perf wants to work, but P4 is special. :-( + */ + if (p4_should_swap_ts(hwc->config, cpu)) + hwc->idx = -1; + p4_pmu_swap_config_ts(hwc, cpu); + if (assign) + assign[i] = cntr_idx; +reserve: + set_bit(cntr_idx, used_mask); + set_bit(escr_idx, escr_mask); + } + +done: + return num ? -EINVAL : 0; +} + +PMU_FORMAT_ATTR(cccr, "config:0-31" ); +PMU_FORMAT_ATTR(escr, "config:32-62"); +PMU_FORMAT_ATTR(ht, "config:63" ); + +static struct attribute *intel_p4_formats_attr[] = { + &format_attr_cccr.attr, + &format_attr_escr.attr, + &format_attr_ht.attr, + NULL, +}; + +static __initconst const struct x86_pmu p4_pmu = { + .name = "Netburst P4/Xeon", + .handle_irq = p4_pmu_handle_irq, + .disable_all = p4_pmu_disable_all, + .enable_all = p4_pmu_enable_all, + .enable = p4_pmu_enable_event, + .disable = p4_pmu_disable_event, + .eventsel = MSR_P4_BPU_CCCR0, + .perfctr = MSR_P4_BPU_PERFCTR0, + .event_map = p4_pmu_event_map, + .max_events = ARRAY_SIZE(p4_general_events), + .get_event_constraints = x86_get_event_constraints, + /* + * IF HT disabled we may need to use all + * ARCH_P4_MAX_CCCR counters simulaneously + * though leave it restricted at moment assuming + * HT is on + */ + .num_counters = ARCH_P4_MAX_CCCR, + .apic = 1, + .cntval_bits = ARCH_P4_CNTRVAL_BITS, + .cntval_mask = ARCH_P4_CNTRVAL_MASK, + .max_period = (1ULL << (ARCH_P4_CNTRVAL_BITS - 1)) - 1, + .hw_config = p4_hw_config, + .schedule_events = p4_pmu_schedule_events, + /* + * This handles erratum N15 in intel doc 249199-029, + * the counter may not be updated correctly on write + * so we need a second write operation to do the trick + * (the official workaround didn't work) + * + * the former idea is taken from OProfile code + */ + .perfctr_second_write = 1, + + .format_attrs = intel_p4_formats_attr, +}; + +__init int p4_pmu_init(void) +{ + unsigned int low, high; + int i, reg; + + /* If we get stripped -- indexing fails */ + BUILD_BUG_ON(ARCH_P4_MAX_CCCR > INTEL_PMC_MAX_GENERIC); + + rdmsr(MSR_IA32_MISC_ENABLE, low, high); + if (!(low & (1 << 7))) { + pr_cont("unsupported Netburst CPU model %d ", + boot_cpu_data.x86_model); + return -ENODEV; + } + + memcpy(hw_cache_event_ids, p4_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + pr_cont("Netburst events, "); + + x86_pmu = p4_pmu; + + /* + * Even 
though the counters are configured to interrupt a particular + * logical processor when an overflow happens, testing has shown that + * on kdump kernels (which uses a single cpu), thread1's counter + * continues to run and will report an NMI on thread0. Due to the + * overflow bug, this leads to a stream of unknown NMIs. + * + * Solve this by zero'ing out the registers to mimic a reset. + */ + for (i = 0; i < x86_pmu.num_counters; i++) { + reg = x86_pmu_config_addr(i); + wrmsrl_safe(reg, 0ULL); + } + + return 0; +} diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c new file mode 100644 index 00000000000..7c1a0c07b60 --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_p6.c @@ -0,0 +1,279 @@ +#include <linux/perf_event.h> +#include <linux/types.h> + +#include "perf_event.h" + +/* + * Not sure about some of these + */ +static const u64 p6_perfmon_event_map[] = +{ + [PERF_COUNT_HW_CPU_CYCLES] = 0x0079, /* CPU_CLK_UNHALTED */ + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, /* INST_RETIRED */ + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e, /* L2_RQSTS:M:E:S:I */ + [PERF_COUNT_HW_CACHE_MISSES] = 0x012e, /* L2_RQSTS:I */ + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, /* BR_INST_RETIRED */ + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, /* BR_MISS_PRED_RETIRED */ + [PERF_COUNT_HW_BUS_CYCLES] = 0x0062, /* BUS_DRDY_CLOCKS */ + [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00a2, /* RESOURCE_STALLS */ + +}; + +static const u64 __initconst p6_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0043, /* DATA_MEM_REFS */ + [ C(RESULT_MISS) ] = 0x0045, /* DCU_LINES_IN */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0x0f29, /* L2_LD:M:E:S:I */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* IFU_IFETCH */ + [ C(RESULT_MISS) ] = 0x0f28, /* L2_IFETCH:M:E:S:I */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0x0025, /* L2_M_LINES_INM */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0043, /* DATA_MEM_REFS */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* IFU_IFETCH */ + [ C(RESULT_MISS) ] = 0x0085, /* ITLB_MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED */ + [ C(RESULT_MISS) ] = 0x00c5, /* BR_MISS_PRED_RETIRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +static u64 p6_pmu_event_map(int hw_event) +{ + return 
p6_perfmon_event_map[hw_event]; +} + +/* + * Event setting that is specified not to count anything. + * We use this to effectively disable a counter. + * + * L2_RQSTS with 0 MESI unit mask. + */ +#define P6_NOP_EVENT 0x0000002EULL + +static struct event_constraint p6_event_constraints[] = +{ + INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */ + INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ + INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ + INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ + INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ + INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ + EVENT_CONSTRAINT_END +}; + +static void p6_pmu_disable_all(void) +{ + u64 val; + + /* p6 only has one enable register */ + rdmsrl(MSR_P6_EVNTSEL0, val); + val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; + wrmsrl(MSR_P6_EVNTSEL0, val); +} + +static void p6_pmu_enable_all(int added) +{ + unsigned long val; + + /* p6 only has one enable register */ + rdmsrl(MSR_P6_EVNTSEL0, val); + val |= ARCH_PERFMON_EVENTSEL_ENABLE; + wrmsrl(MSR_P6_EVNTSEL0, val); +} + +static inline void +p6_pmu_disable_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 val = P6_NOP_EVENT; + + (void)wrmsrl_safe(hwc->config_base, val); +} + +static void p6_pmu_enable_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 val; + + val = hwc->config; + + /* + * p6 only has a global event enable, set on PerfEvtSel0 + * We "disable" events by programming P6_NOP_EVENT + * and we rely on p6_pmu_enable_all() being called + * to actually enable the events. + */ + + (void)wrmsrl_safe(hwc->config_base, val); +} + +PMU_FORMAT_ATTR(event, "config:0-7" ); +PMU_FORMAT_ATTR(umask, "config:8-15" ); +PMU_FORMAT_ATTR(edge, "config:18" ); +PMU_FORMAT_ATTR(pc, "config:19" ); +PMU_FORMAT_ATTR(inv, "config:23" ); +PMU_FORMAT_ATTR(cmask, "config:24-31" ); + +static struct attribute *intel_p6_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_pc.attr, + &format_attr_inv.attr, + &format_attr_cmask.attr, + NULL, +}; + +static __initconst const struct x86_pmu p6_pmu = { + .name = "p6", + .handle_irq = x86_pmu_handle_irq, + .disable_all = p6_pmu_disable_all, + .enable_all = p6_pmu_enable_all, + .enable = p6_pmu_enable_event, + .disable = p6_pmu_disable_event, + .hw_config = x86_pmu_hw_config, + .schedule_events = x86_schedule_events, + .eventsel = MSR_P6_EVNTSEL0, + .perfctr = MSR_P6_PERFCTR0, + .event_map = p6_pmu_event_map, + .max_events = ARRAY_SIZE(p6_perfmon_event_map), + .apic = 1, + .max_period = (1ULL << 31) - 1, + .version = 0, + .num_counters = 2, + /* + * Events have 40 bits implemented. However they are designed such + * that bits [32-39] are sign extensions of bit 31. As such the + * effective width of a event for P6-like PMU is 32 bits only. + * + * See IA-32 Intel Architecture Software developer manual Vol 3B + */ + .cntval_bits = 32, + .cntval_mask = (1ULL << 32) - 1, + .get_event_constraints = x86_get_event_constraints, + .event_constraints = p6_event_constraints, + + .format_attrs = intel_p6_formats_attr, + .events_sysfs_show = intel_event_sysfs_show, + +}; + +static __init void p6_pmu_rdpmc_quirk(void) +{ + if (boot_cpu_data.x86_mask < 9) { + /* + * PPro erratum 26; fixed in stepping 9 and above. 
+		 */
+		pr_warn("Userspace RDPMC support disabled due to a CPU erratum\n");
+		x86_pmu.attr_rdpmc_broken = 1;
+		x86_pmu.attr_rdpmc = 0;
+	}
+}
+
+__init int p6_pmu_init(void)
+{
+	x86_pmu = p6_pmu;
+
+	switch (boot_cpu_data.x86_model) {
+	case 1: /* Pentium Pro */
+		x86_add_quirk(p6_pmu_rdpmc_quirk);
+		break;
+
+	case 3: /* Pentium II - Klamath */
+	case 5: /* Pentium II - Deschutes */
+	case 6: /* Pentium II - Mendocino */
+		break;
+
+	case 7: /* Pentium III - Katmai */
+	case 8: /* Pentium III - Coppermine */
+	case 10: /* Pentium III Xeon */
+	case 11: /* Pentium III - Tualatin */
+		break;
+
+	case 9: /* Pentium M - Banias */
+	case 13: /* Pentium M - Dothan */
+		break;
+
+	default:
+		pr_cont("unsupported p6 CPU model %d ", boot_cpu_data.x86_model);
+		return -ENODEV;
+	}
+
+	memcpy(hw_cache_event_ids, p6_hw_cache_event_ids,
+	       sizeof(hw_cache_event_ids));
+
+	return 0;
+}
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
new file mode 100644
index 00000000000..2e8caf03f59
--- /dev/null
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -0,0 +1,160 @@
+/*
+ * Local APIC based NMI watchdog for various CPUs.
+ *
+ * This file also handles reservation of performance counters for coordination
+ * with other users (like oprofile).
+ *
+ * Note that these events normally don't tick when the CPU idles. This means
+ * the frequency varies with CPU load.
+ *
+ * Original code for K7/P6 written by Keith Owens.
+ */
+
+#include <linux/percpu.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/bitops.h>
+#include <linux/smp.h>
+#include <asm/nmi.h>
+#include <linux/kprobes.h>
+
+#include <asm/apic.h>
+#include <asm/perf_event.h>
+
+/*
+ * This number is calculated from Intel's MSR_P4_CRU_ESCR5 register and its
+ * offset from MSR_P4_BSU_ESCR0.
+ *
+ * It will be the max for all platforms (for now).
+ */
+#define NMI_MAX_COUNTER_BITS 66
+
+/*
+ * perfctr_nmi_owner tracks the ownership of the perfctr registers;
+ * evntsel_nmi_owner tracks the ownership of the event selection registers.
+ * Different performance counters / event selection registers may be
+ * reserved by different subsystems; this reservation system just tries
+ * to coordinate things a little.
+ */
+static DECLARE_BITMAP(perfctr_nmi_owner, NMI_MAX_COUNTER_BITS);
+static DECLARE_BITMAP(evntsel_nmi_owner, NMI_MAX_COUNTER_BITS);
+
+/* converts an msr to an appropriate reservation bit */
+static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
+{
+	/* returns the bit offset of the performance counter register */
+	switch (boot_cpu_data.x86_vendor) {
+	case X86_VENDOR_AMD:
+		if (msr >= MSR_F15H_PERF_CTR)
+			return (msr - MSR_F15H_PERF_CTR) >> 1;
+		return msr - MSR_K7_PERFCTR0;
+	case X86_VENDOR_INTEL:
+		if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
+			return msr - MSR_ARCH_PERFMON_PERFCTR0;
+
+		switch (boot_cpu_data.x86) {
+		case 6:
+			return msr - MSR_P6_PERFCTR0;
+		case 11:
+			return msr - MSR_KNC_PERFCTR0;
+		case 15:
+			return msr - MSR_P4_BPU_PERFCTR0;
+		}
+	}
+	return 0;
+}
+
+/*
+ * converts an msr to an appropriate reservation bit
+ * returns the bit offset of the event selection register
+ */
+static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
+{
+	switch (boot_cpu_data.x86_vendor) {
+	case X86_VENDOR_AMD:
+		if (msr >= MSR_F15H_PERF_CTL)
+			return (msr - MSR_F15H_PERF_CTL) >> 1;
+		return msr - MSR_K7_EVNTSEL0;
+	case X86_VENDOR_INTEL:
+		if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
+			return msr - MSR_ARCH_PERFMON_EVENTSEL0;
+
+		switch (boot_cpu_data.x86) {
+		case 6:
+			return msr - MSR_P6_EVNTSEL0;
+		case 11:
+			return msr - MSR_KNC_EVNTSEL0;
+		case 15:
+			return msr - MSR_P4_BSU_ESCR0;
+		}
+	}
+	return 0;
+}
+
+/* checks for a bit availability (hack for oprofile) */
+int avail_to_resrv_perfctr_nmi_bit(unsigned int counter)
+{
+	BUG_ON(counter > NMI_MAX_COUNTER_BITS);
+
+	return !test_bit(counter, perfctr_nmi_owner);
+}
+EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
+
+int reserve_perfctr_nmi(unsigned int msr)
+{
+	unsigned int counter;
+
+	counter = nmi_perfctr_msr_to_bit(msr);
+	/* register not managed by the allocator? */
+	if (counter > NMI_MAX_COUNTER_BITS)
+		return 1;
+
+	if (!test_and_set_bit(counter, perfctr_nmi_owner))
+		return 1;
+	return 0;
+}
+EXPORT_SYMBOL(reserve_perfctr_nmi);
+
+void release_perfctr_nmi(unsigned int msr)
+{
+	unsigned int counter;
+
+	counter = nmi_perfctr_msr_to_bit(msr);
+	/* register not managed by the allocator? */
+	if (counter > NMI_MAX_COUNTER_BITS)
+		return;
+
+	clear_bit(counter, perfctr_nmi_owner);
+}
+EXPORT_SYMBOL(release_perfctr_nmi);
+
+int reserve_evntsel_nmi(unsigned int msr)
+{
+	unsigned int counter;
+
+	counter = nmi_evntsel_msr_to_bit(msr);
+	/* register not managed by the allocator? */
+	if (counter > NMI_MAX_COUNTER_BITS)
+		return 1;
+
+	if (!test_and_set_bit(counter, evntsel_nmi_owner))
+		return 1;
+	return 0;
+}
+EXPORT_SYMBOL(reserve_evntsel_nmi);
+
+void release_evntsel_nmi(unsigned int msr)
+{
+	unsigned int counter;
+
+	counter = nmi_evntsel_msr_to_bit(msr);
+	/* register not managed by the allocator?
*/ + if (counter > NMI_MAX_COUNTER_BITS) + return; + + clear_bit(counter, evntsel_nmi_owner); +} +EXPORT_SYMBOL(release_evntsel_nmi); diff --git a/arch/x86/kernel/cpu/powerflags.c b/arch/x86/kernel/cpu/powerflags.c new file mode 100644 index 00000000000..31f0f335ed2 --- /dev/null +++ b/arch/x86/kernel/cpu/powerflags.c @@ -0,0 +1,21 @@ +/* + * Strings for the various x86 power flags + * + * This file must not contain any executable code. + */ + +#include <asm/cpufeature.h> + +const char *const x86_power_flags[32] = { + "ts", /* temperature sensor */ + "fid", /* frequency id control */ + "vid", /* voltage id control */ + "ttp", /* thermal trip */ + "tm", /* hardware thermal control */ + "stc", /* software thermal control */ + "100mhzsteps", /* 100 MHz multiplier control */ + "hwpstate", /* hardware P-state control */ + "", /* tsc invariant mapped to constant_tsc */ + "cpb", /* core performance boost */ + "eff_freq_ro", /* Readonly aperf/mperf */ +}; diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c new file mode 100644 index 00000000000..06fe3ed8b85 --- /dev/null +++ b/arch/x86/kernel/cpu/proc.c @@ -0,0 +1,154 @@ +#include <linux/smp.h> +#include <linux/timex.h> +#include <linux/string.h> +#include <linux/seq_file.h> +#include <linux/cpufreq.h> + +/* + * Get CPU information for use by the procfs. + */ +static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c, + unsigned int cpu) +{ +#ifdef CONFIG_SMP + seq_printf(m, "physical id\t: %d\n", c->phys_proc_id); + seq_printf(m, "siblings\t: %d\n", cpumask_weight(cpu_core_mask(cpu))); + seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id); + seq_printf(m, "cpu cores\t: %d\n", c->booted_cores); + seq_printf(m, "apicid\t\t: %d\n", c->apicid); + seq_printf(m, "initial apicid\t: %d\n", c->initial_apicid); +#endif +} + +#ifdef CONFIG_X86_32 +static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c) +{ + seq_printf(m, + "fdiv_bug\t: %s\n" + "f00f_bug\t: %s\n" + "coma_bug\t: %s\n" + "fpu\t\t: %s\n" + "fpu_exception\t: %s\n" + "cpuid level\t: %d\n" + "wp\t\t: %s\n", + static_cpu_has_bug(X86_BUG_FDIV) ? "yes" : "no", + static_cpu_has_bug(X86_BUG_F00F) ? "yes" : "no", + static_cpu_has_bug(X86_BUG_COMA) ? "yes" : "no", + static_cpu_has(X86_FEATURE_FPU) ? "yes" : "no", + static_cpu_has(X86_FEATURE_FPU) ? "yes" : "no", + c->cpuid_level, + c->wp_works_ok ? "yes" : "no"); +} +#else +static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c) +{ + seq_printf(m, + "fpu\t\t: yes\n" + "fpu_exception\t: yes\n" + "cpuid level\t: %d\n" + "wp\t\t: yes\n", + c->cpuid_level); +} +#endif + +static int show_cpuinfo(struct seq_file *m, void *v) +{ + struct cpuinfo_x86 *c = v; + unsigned int cpu; + int i; + + cpu = c->cpu_index; + seq_printf(m, "processor\t: %u\n" + "vendor_id\t: %s\n" + "cpu family\t: %d\n" + "model\t\t: %u\n" + "model name\t: %s\n", + cpu, + c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown", + c->x86, + c->x86_model, + c->x86_model_id[0] ? 
c->x86_model_id : "unknown"); + + if (c->x86_mask || c->cpuid_level >= 0) + seq_printf(m, "stepping\t: %d\n", c->x86_mask); + else + seq_printf(m, "stepping\t: unknown\n"); + if (c->microcode) + seq_printf(m, "microcode\t: 0x%x\n", c->microcode); + + if (cpu_has(c, X86_FEATURE_TSC)) { + unsigned int freq = cpufreq_quick_get(cpu); + + if (!freq) + freq = cpu_khz; + seq_printf(m, "cpu MHz\t\t: %u.%03u\n", + freq / 1000, (freq % 1000)); + } + + /* Cache size */ + if (c->x86_cache_size >= 0) + seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size); + + show_cpuinfo_core(m, c, cpu); + show_cpuinfo_misc(m, c); + + seq_printf(m, "flags\t\t:"); + for (i = 0; i < 32*NCAPINTS; i++) + if (cpu_has(c, i) && x86_cap_flags[i] != NULL) + seq_printf(m, " %s", x86_cap_flags[i]); + + seq_printf(m, "\nbogomips\t: %lu.%02lu\n", + c->loops_per_jiffy/(500000/HZ), + (c->loops_per_jiffy/(5000/HZ)) % 100); + +#ifdef CONFIG_X86_64 + if (c->x86_tlbsize > 0) + seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize); +#endif + seq_printf(m, "clflush size\t: %u\n", c->x86_clflush_size); + seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment); + seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n", + c->x86_phys_bits, c->x86_virt_bits); + + seq_printf(m, "power management:"); + for (i = 0; i < 32; i++) { + if (c->x86_power & (1 << i)) { + if (i < ARRAY_SIZE(x86_power_flags) && + x86_power_flags[i]) + seq_printf(m, "%s%s", + x86_power_flags[i][0] ? " " : "", + x86_power_flags[i]); + else + seq_printf(m, " [%d]", i); + } + } + + seq_printf(m, "\n\n"); + + return 0; +} + +static void *c_start(struct seq_file *m, loff_t *pos) +{ + *pos = cpumask_next(*pos - 1, cpu_online_mask); + if ((*pos) < nr_cpu_ids) + return &cpu_data(*pos); + return NULL; +} + +static void *c_next(struct seq_file *m, void *v, loff_t *pos) +{ + (*pos)++; + return c_start(m, pos); +} + +static void c_stop(struct seq_file *m, void *v) +{ +} + +const struct seq_operations cpuinfo_op = { + .start = c_start, + .next = c_next, + .stop = c_stop, + .show = show_cpuinfo, +}; diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c new file mode 100644 index 00000000000..136ac74dee8 --- /dev/null +++ b/arch/x86/kernel/cpu/rdrand.c @@ -0,0 +1,60 @@ +/* + * This file is part of the Linux kernel. + * + * Copyright (c) 2011, Intel Corporation + * Authors: Fenghua Yu <fenghua.yu@intel.com>, + * H. Peter Anvin <hpa@linux.intel.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * + */ + +#include <asm/processor.h> +#include <asm/archrandom.h> +#include <asm/sections.h> + +static int __init x86_rdrand_setup(char *s) +{ + setup_clear_cpu_cap(X86_FEATURE_RDRAND); + setup_clear_cpu_cap(X86_FEATURE_RDSEED); + return 1; +} +__setup("nordrand", x86_rdrand_setup); + +/* + * Force a reseed cycle; we are architecturally guaranteed a reseed + * after no more than 512 128-bit chunks of random data. 
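+ * (RESEED_LOOP below is sized generously, well past 512 chunks' worth
+ * of machine words, so at least one reseed boundary is crossed.)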
This also + * acts as a test of the CPU capability. + */ +#define RESEED_LOOP ((512*128)/sizeof(unsigned long)) + +void x86_init_rdrand(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_ARCH_RANDOM + unsigned long tmp; + int i, count, ok; + + if (!cpu_has(c, X86_FEATURE_RDRAND)) + return; /* Nothing to do */ + + for (count = i = 0; i < RESEED_LOOP; i++) { + ok = rdrand_long(&tmp); + if (ok) + count++; + } + + if (count != RESEED_LOOP) + clear_cpu_cap(c, X86_FEATURE_RDRAND); +#endif +} diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c new file mode 100644 index 00000000000..b6f794aa169 --- /dev/null +++ b/arch/x86/kernel/cpu/scattered.c @@ -0,0 +1,72 @@ +/* + * Routines to identify additional cpu features that are scattered in + * cpuid space. + */ +#include <linux/cpu.h> + +#include <asm/pat.h> +#include <asm/processor.h> + +#include <asm/apic.h> + +struct cpuid_bit { + u16 feature; + u8 reg; + u8 bit; + u32 level; + u32 sub_leaf; +}; + +enum cpuid_regs { + CR_EAX = 0, + CR_ECX, + CR_EDX, + CR_EBX +}; + +void init_scattered_cpuid_features(struct cpuinfo_x86 *c) +{ + u32 max_level; + u32 regs[4]; + const struct cpuid_bit *cb; + + static const struct cpuid_bit cpuid_bits[] = { + { X86_FEATURE_DTHERM, CR_EAX, 0, 0x00000006, 0 }, + { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006, 0 }, + { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 }, + { X86_FEATURE_PLN, CR_EAX, 4, 0x00000006, 0 }, + { X86_FEATURE_PTS, CR_EAX, 6, 0x00000006, 0 }, + { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 }, + { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 }, + { X86_FEATURE_XSAVEOPT, CR_EAX, 0, 0x0000000d, 1 }, + { X86_FEATURE_HW_PSTATE, CR_EDX, 7, 0x80000007, 0 }, + { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 }, + { X86_FEATURE_PROC_FEEDBACK, CR_EDX,11, 0x80000007, 0 }, + { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a, 0 }, + { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 }, + { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 }, + { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a, 0 }, + { X86_FEATURE_TSCRATEMSR, CR_EDX, 4, 0x8000000a, 0 }, + { X86_FEATURE_VMCBCLEAN, CR_EDX, 5, 0x8000000a, 0 }, + { X86_FEATURE_FLUSHBYASID, CR_EDX, 6, 0x8000000a, 0 }, + { X86_FEATURE_DECODEASSISTS, CR_EDX, 7, 0x8000000a, 0 }, + { X86_FEATURE_PAUSEFILTER, CR_EDX,10, 0x8000000a, 0 }, + { X86_FEATURE_PFTHRESHOLD, CR_EDX,12, 0x8000000a, 0 }, + { 0, 0, 0, 0, 0 } + }; + + for (cb = cpuid_bits; cb->feature; cb++) { + + /* Verify that the level is valid */ + max_level = cpuid_eax(cb->level & 0xffff0000); + if (max_level < cb->level || + max_level > (cb->level | 0xffff)) + continue; + + cpuid_count(cb->level, cb->sub_leaf, ®s[CR_EAX], + ®s[CR_EBX], ®s[CR_ECX], ®s[CR_EDX]); + + if (regs[cb->reg] & (1 << cb->bit)) + set_cpu_cap(c, cb->feature); + } +} diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c new file mode 100644 index 00000000000..4c60eaf0571 --- /dev/null +++ b/arch/x86/kernel/cpu/topology.c @@ -0,0 +1,99 @@ +/* + * Check for extended topology enumeration cpuid leaf 0xb and if it + * exists, use it for populating initial_apicid and cpu topology + * detection. 
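+ *
+ * Each sub-leaf of cpuid 0xb reports the number of logical processors
+ * at that level (SMT, then core) and the number of APIC-id bits to
+ * shift right to reach the next level; %edx carries the full 32-bit
+ * x2apic id.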
+ */ + +#include <linux/cpu.h> +#include <asm/apic.h> +#include <asm/pat.h> +#include <asm/processor.h> + +/* leaf 0xb SMT level */ +#define SMT_LEVEL 0 + +/* leaf 0xb sub-leaf types */ +#define INVALID_TYPE 0 +#define SMT_TYPE 1 +#define CORE_TYPE 2 + +#define LEAFB_SUBTYPE(ecx) (((ecx) >> 8) & 0xff) +#define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f) +#define LEVEL_MAX_SIBLINGS(ebx) ((ebx) & 0xffff) + +/* + * Check for extended topology enumeration cpuid leaf 0xb and if it + * exists, use it for populating initial_apicid and cpu topology + * detection. + */ +void detect_extended_topology(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_SMP + unsigned int eax, ebx, ecx, edx, sub_index; + unsigned int ht_mask_width, core_plus_mask_width; + unsigned int core_select_mask, core_level_siblings; + static bool printed; + + if (c->cpuid_level < 0xb) + return; + + cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx); + + /* + * check if the cpuid leaf 0xb is actually implemented. + */ + if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE)) + return; + + set_cpu_cap(c, X86_FEATURE_XTOPOLOGY); + + /* + * initial apic id, which also represents 32-bit extended x2apic id. + */ + c->initial_apicid = edx; + + /* + * Populate HT related information from sub-leaf level 0. + */ + core_level_siblings = smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx); + core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); + + sub_index = 1; + do { + cpuid_count(0xb, sub_index, &eax, &ebx, &ecx, &edx); + + /* + * Check for the Core type in the implemented sub leaves. + */ + if (LEAFB_SUBTYPE(ecx) == CORE_TYPE) { + core_level_siblings = LEVEL_MAX_SIBLINGS(ebx); + core_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); + break; + } + + sub_index++; + } while (LEAFB_SUBTYPE(ecx) != INVALID_TYPE); + + core_select_mask = (~(-1 << core_plus_mask_width)) >> ht_mask_width; + + c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, ht_mask_width) + & core_select_mask; + c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, core_plus_mask_width); + /* + * Reinit the apicid, now that we have extended initial_apicid. 
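+	 * (A shift count of 0 makes phys_pkg_id() return the full
+	 * extended id.)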
+ */ + c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); + + c->x86_max_cores = (core_level_siblings / smp_num_siblings); + + if (!printed) { + printk(KERN_INFO "CPU: Physical Processor ID: %d\n", + c->phys_proc_id); + if (c->x86_max_cores > 1) + printk(KERN_INFO "CPU: Processor Core ID: %d\n", + c->cpu_core_id); + printed = 1; + } + return; +#endif +} diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c new file mode 100644 index 00000000000..3fa0e5ad86b --- /dev/null +++ b/arch/x86/kernel/cpu/transmeta.c @@ -0,0 +1,108 @@ +#include <linux/kernel.h> +#include <linux/mm.h> +#include <asm/processor.h> +#include <asm/msr.h> +#include "cpu.h" + +static void early_init_transmeta(struct cpuinfo_x86 *c) +{ + u32 xlvl; + + /* Transmeta-defined flags: level 0x80860001 */ + xlvl = cpuid_eax(0x80860000); + if ((xlvl & 0xffff0000) == 0x80860000) { + if (xlvl >= 0x80860001) + c->x86_capability[2] = cpuid_edx(0x80860001); + } +} + +static void init_transmeta(struct cpuinfo_x86 *c) +{ + unsigned int cap_mask, uk, max, dummy; + unsigned int cms_rev1, cms_rev2; + unsigned int cpu_rev, cpu_freq = 0, cpu_flags, new_cpu_rev; + char cpu_info[65]; + + early_init_transmeta(c); + + cpu_detect_cache_sizes(c); + + /* Print CMS and CPU revision */ + max = cpuid_eax(0x80860000); + cpu_rev = 0; + if (max >= 0x80860001) { + cpuid(0x80860001, &dummy, &cpu_rev, &cpu_freq, &cpu_flags); + if (cpu_rev != 0x02000000) { + printk(KERN_INFO "CPU: Processor revision %u.%u.%u.%u, %u MHz\n", + (cpu_rev >> 24) & 0xff, + (cpu_rev >> 16) & 0xff, + (cpu_rev >> 8) & 0xff, + cpu_rev & 0xff, + cpu_freq); + } + } + if (max >= 0x80860002) { + cpuid(0x80860002, &new_cpu_rev, &cms_rev1, &cms_rev2, &dummy); + if (cpu_rev == 0x02000000) { + printk(KERN_INFO "CPU: Processor revision %08X, %u MHz\n", + new_cpu_rev, cpu_freq); + } + printk(KERN_INFO "CPU: Code Morphing Software revision %u.%u.%u-%u-%u\n", + (cms_rev1 >> 24) & 0xff, + (cms_rev1 >> 16) & 0xff, + (cms_rev1 >> 8) & 0xff, + cms_rev1 & 0xff, + cms_rev2); + } + if (max >= 0x80860006) { + cpuid(0x80860003, + (void *)&cpu_info[0], + (void *)&cpu_info[4], + (void *)&cpu_info[8], + (void *)&cpu_info[12]); + cpuid(0x80860004, + (void *)&cpu_info[16], + (void *)&cpu_info[20], + (void *)&cpu_info[24], + (void *)&cpu_info[28]); + cpuid(0x80860005, + (void *)&cpu_info[32], + (void *)&cpu_info[36], + (void *)&cpu_info[40], + (void *)&cpu_info[44]); + cpuid(0x80860006, + (void *)&cpu_info[48], + (void *)&cpu_info[52], + (void *)&cpu_info[56], + (void *)&cpu_info[60]); + cpu_info[64] = '\0'; + printk(KERN_INFO "CPU: %s\n", cpu_info); + } + + /* Unhide possibly hidden capability flags */ + rdmsr(0x80860004, cap_mask, uk); + wrmsr(0x80860004, ~0, uk); + c->x86_capability[0] = cpuid_edx(0x00000001); + wrmsr(0x80860004, cap_mask, uk); + + /* All Transmeta CPUs have a constant TSC */ + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + +#ifdef CONFIG_SYSCTL + /* + * randomize_va_space slows us down enormously; + * it probably triggers retranslation of x86->native bytecode + */ + randomize_va_space = 0; +#endif +} + +static const struct cpu_dev transmeta_cpu_dev = { + .c_vendor = "Transmeta", + .c_ident = { "GenuineTMx86", "TransmetaCPU" }, + .c_early_init = early_init_transmeta, + .c_init = init_transmeta, + .c_x86_vendor = X86_VENDOR_TRANSMETA, +}; + +cpu_dev_register(transmeta_cpu_dev); diff --git a/arch/x86/kernel/cpu/umc.c b/arch/x86/kernel/cpu/umc.c new file mode 100644 index 00000000000..ef9c2a0078b --- /dev/null +++ b/arch/x86/kernel/cpu/umc.c @@ -0,0 +1,25 @@ +#include 
<linux/kernel.h> +#include <asm/processor.h> +#include "cpu.h" + +/* + * UMC chips appear to be only either 386 or 486, + * so no special init takes place. + */ + +static const struct cpu_dev umc_cpu_dev = { + .c_vendor = "UMC", + .c_ident = { "UMC UMC UMC" }, + .legacy_models = { + { .family = 4, .model_names = + { + [1] = "U5D", + [2] = "U5S", + } + }, + }, + .c_x86_vendor = X86_VENDOR_UMC, +}; + +cpu_dev_register(umc_cpu_dev); + diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c new file mode 100644 index 00000000000..628a059a9a0 --- /dev/null +++ b/arch/x86/kernel/cpu/vmware.c @@ -0,0 +1,147 @@ +/* + * VMware Detection code. + * + * Copyright (C) 2008, VMware, Inc. + * Author : Alok N Kataria <akataria@vmware.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + */ + +#include <linux/dmi.h> +#include <linux/module.h> +#include <asm/div64.h> +#include <asm/x86_init.h> +#include <asm/hypervisor.h> + +#define CPUID_VMWARE_INFO_LEAF 0x40000000 +#define VMWARE_HYPERVISOR_MAGIC 0x564D5868 +#define VMWARE_HYPERVISOR_PORT 0x5658 + +#define VMWARE_PORT_CMD_GETVERSION 10 +#define VMWARE_PORT_CMD_GETHZ 45 +#define VMWARE_PORT_CMD_GETVCPU_INFO 68 +#define VMWARE_PORT_CMD_LEGACY_X2APIC 3 +#define VMWARE_PORT_CMD_VCPU_RESERVED 31 + +#define VMWARE_PORT(cmd, eax, ebx, ecx, edx) \ + __asm__("inl (%%dx)" : \ + "=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) : \ + "0"(VMWARE_HYPERVISOR_MAGIC), \ + "1"(VMWARE_PORT_CMD_##cmd), \ + "2"(VMWARE_HYPERVISOR_PORT), "3"(UINT_MAX) : \ + "memory"); + +static inline int __vmware_platform(void) +{ + uint32_t eax, ebx, ecx, edx; + VMWARE_PORT(GETVERSION, eax, ebx, ecx, edx); + return eax != (uint32_t)-1 && ebx == VMWARE_HYPERVISOR_MAGIC; +} + +static unsigned long vmware_get_tsc_khz(void) +{ + uint64_t tsc_hz, lpj; + uint32_t eax, ebx, ecx, edx; + + VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); + + tsc_hz = eax | (((uint64_t)ebx) << 32); + do_div(tsc_hz, 1000); + BUG_ON(tsc_hz >> 32); + printk(KERN_INFO "TSC freq read from hypervisor : %lu.%03lu MHz\n", + (unsigned long) tsc_hz / 1000, + (unsigned long) tsc_hz % 1000); + + if (!preset_lpj) { + lpj = ((u64)tsc_hz * 1000); + do_div(lpj, HZ); + preset_lpj = lpj; + } + + return tsc_hz; +} + +static void __init vmware_platform_setup(void) +{ + uint32_t eax, ebx, ecx, edx; + + VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); + + if (ebx != UINT_MAX) + x86_platform.calibrate_tsc = vmware_get_tsc_khz; + else + printk(KERN_WARNING + "Failed to get TSC freq from the hypervisor\n"); +} + +/* + * While checking the dmi string information, just checking the product + * serial key should be enough, as this will always have a VMware + * specific string when running under VMware hypervisor. 
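+ *
+ * Note that the DMI fallback below is only consulted when the CPUID
+ * hypervisor bit is not set; see vmware_platform().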
+ */ +static uint32_t __init vmware_platform(void) +{ + if (cpu_has_hypervisor) { + unsigned int eax; + unsigned int hyper_vendor_id[3]; + + cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &hyper_vendor_id[0], + &hyper_vendor_id[1], &hyper_vendor_id[2]); + if (!memcmp(hyper_vendor_id, "VMwareVMware", 12)) + return CPUID_VMWARE_INFO_LEAF; + } else if (dmi_available && dmi_name_in_serial("VMware") && + __vmware_platform()) + return 1; + + return 0; +} + +/* + * VMware hypervisor takes care of exporting a reliable TSC to the guest. + * Still, due to timing difference when running on virtual cpus, the TSC can + * be marked as unstable in some cases. For example, the TSC sync check at + * bootup can fail due to a marginal offset between vcpus' TSCs (though the + * TSCs do not drift from each other). Also, the ACPI PM timer clocksource + * is not suitable as a watchdog when running on a hypervisor because the + * kernel may miss a wrap of the counter if the vcpu is descheduled for a + * long time. To skip these checks at runtime we set these capability bits, + * so that the kernel could just trust the hypervisor with providing a + * reliable virtual TSC that is suitable for timekeeping. + */ +static void vmware_set_cpu_features(struct cpuinfo_x86 *c) +{ + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE); +} + +/* Checks if hypervisor supports x2apic without VT-D interrupt remapping. */ +static bool __init vmware_legacy_x2apic_available(void) +{ + uint32_t eax, ebx, ecx, edx; + VMWARE_PORT(GETVCPU_INFO, eax, ebx, ecx, edx); + return (eax & (1 << VMWARE_PORT_CMD_VCPU_RESERVED)) == 0 && + (eax & (1 << VMWARE_PORT_CMD_LEGACY_X2APIC)) != 0; +} + +const __refconst struct hypervisor_x86 x86_hyper_vmware = { + .name = "VMware", + .detect = vmware_platform, + .set_cpu_features = vmware_set_cpu_features, + .init_platform = vmware_platform_setup, + .x2apic_available = vmware_legacy_x2apic_available, +}; +EXPORT_SYMBOL(x86_hyper_vmware); diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c new file mode 100644 index 00000000000..3225ae6c518 --- /dev/null +++ b/arch/x86/kernel/cpuid.c @@ -0,0 +1,245 @@ +/* ----------------------------------------------------------------------- * + * + * Copyright 2000-2008 H. Peter Anvin - All Rights Reserved + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139, + * USA; either version 2 of the License, or (at your option) any later + * version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * x86 CPUID access device + * + * This device is accessed by lseek() to the appropriate CPUID level + * and then read in chunks of 16 bytes. A larger size means multiple + * reads of consecutive levels. + * + * The lower 32 bits of the file position is used as the incoming %eax, + * and the upper 32 bits of the file position as the incoming %ecx, + * the latter intended for "counting" eax levels like eax=4. + * + * This driver uses /dev/cpu/%d/cpuid where %d is the minor number, and on + * an SMP box will direct the access to CPU %d. 
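+ *
+ * Illustrative user-space usage (not part of this file):
+ *
+ *	int fd = open("/dev/cpu/0/cpuid", O_RDONLY);
+ *	unsigned int regs[4];	// eax, ebx, ecx, edx
+ *	// leaf 4, sub-leaf 2: the sub-leaf goes into the upper 32 bits
+ *	pread(fd, regs, sizeof(regs), (2ULL << 32) | 4);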
+ */ + +#include <linux/module.h> + +#include <linux/types.h> +#include <linux/errno.h> +#include <linux/fcntl.h> +#include <linux/init.h> +#include <linux/poll.h> +#include <linux/smp.h> +#include <linux/major.h> +#include <linux/fs.h> +#include <linux/device.h> +#include <linux/cpu.h> +#include <linux/notifier.h> +#include <linux/uaccess.h> +#include <linux/gfp.h> + +#include <asm/processor.h> +#include <asm/msr.h> + +static struct class *cpuid_class; + +struct cpuid_regs { + u32 eax, ebx, ecx, edx; +}; + +static void cpuid_smp_cpuid(void *cmd_block) +{ + struct cpuid_regs *cmd = (struct cpuid_regs *)cmd_block; + + cpuid_count(cmd->eax, cmd->ecx, + &cmd->eax, &cmd->ebx, &cmd->ecx, &cmd->edx); +} + +static loff_t cpuid_seek(struct file *file, loff_t offset, int orig) +{ + loff_t ret; + struct inode *inode = file->f_mapping->host; + + mutex_lock(&inode->i_mutex); + switch (orig) { + case 0: + file->f_pos = offset; + ret = file->f_pos; + break; + case 1: + file->f_pos += offset; + ret = file->f_pos; + break; + default: + ret = -EINVAL; + } + mutex_unlock(&inode->i_mutex); + return ret; +} + +static ssize_t cpuid_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + char __user *tmp = buf; + struct cpuid_regs cmd; + int cpu = iminor(file_inode(file)); + u64 pos = *ppos; + ssize_t bytes = 0; + int err = 0; + + if (count % 16) + return -EINVAL; /* Invalid chunk size */ + + for (; count; count -= 16) { + cmd.eax = pos; + cmd.ecx = pos >> 32; + err = smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1); + if (err) + break; + if (copy_to_user(tmp, &cmd, 16)) { + err = -EFAULT; + break; + } + tmp += 16; + bytes += 16; + *ppos = ++pos; + } + + return bytes ? bytes : err; +} + +static int cpuid_open(struct inode *inode, struct file *file) +{ + unsigned int cpu; + struct cpuinfo_x86 *c; + + cpu = iminor(file_inode(file)); + if (cpu >= nr_cpu_ids || !cpu_online(cpu)) + return -ENXIO; /* No such CPU */ + + c = &cpu_data(cpu); + if (c->cpuid_level < 0) + return -EIO; /* CPUID not supported */ + + return 0; +} + +/* + * File operations we support + */ +static const struct file_operations cpuid_fops = { + .owner = THIS_MODULE, + .llseek = cpuid_seek, + .read = cpuid_read, + .open = cpuid_open, +}; + +static int cpuid_device_create(int cpu) +{ + struct device *dev; + + dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), NULL, + "cpu%d", cpu); + return IS_ERR(dev) ? 
PTR_ERR(dev) : 0; +} + +static void cpuid_device_destroy(int cpu) +{ + device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu)); +} + +static int cpuid_class_cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + int err = 0; + + switch (action) { + case CPU_UP_PREPARE: + err = cpuid_device_create(cpu); + break; + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + case CPU_DEAD: + cpuid_device_destroy(cpu); + break; + } + return notifier_from_errno(err); +} + +static struct notifier_block __refdata cpuid_class_cpu_notifier = +{ + .notifier_call = cpuid_class_cpu_callback, +}; + +static char *cpuid_devnode(struct device *dev, umode_t *mode) +{ + return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt)); +} + +static int __init cpuid_init(void) +{ + int i, err = 0; + i = 0; + + if (__register_chrdev(CPUID_MAJOR, 0, NR_CPUS, + "cpu/cpuid", &cpuid_fops)) { + printk(KERN_ERR "cpuid: unable to get major %d for cpuid\n", + CPUID_MAJOR); + err = -EBUSY; + goto out; + } + cpuid_class = class_create(THIS_MODULE, "cpuid"); + if (IS_ERR(cpuid_class)) { + err = PTR_ERR(cpuid_class); + goto out_chrdev; + } + cpuid_class->devnode = cpuid_devnode; + + cpu_notifier_register_begin(); + for_each_online_cpu(i) { + err = cpuid_device_create(i); + if (err != 0) + goto out_class; + } + __register_hotcpu_notifier(&cpuid_class_cpu_notifier); + cpu_notifier_register_done(); + + err = 0; + goto out; + +out_class: + i = 0; + for_each_online_cpu(i) { + cpuid_device_destroy(i); + } + cpu_notifier_register_done(); + class_destroy(cpuid_class); +out_chrdev: + __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid"); +out: + return err; +} + +static void __exit cpuid_exit(void) +{ + int cpu = 0; + + cpu_notifier_register_begin(); + for_each_online_cpu(cpu) + cpuid_device_destroy(cpu); + class_destroy(cpuid_class); + __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid"); + __unregister_hotcpu_notifier(&cpuid_class_cpu_notifier); + cpu_notifier_register_done(); +} + +module_init(cpuid_init); +module_exit(cpuid_exit); + +MODULE_AUTHOR("H. Peter Anvin <hpa@zytor.com>"); +MODULE_DESCRIPTION("x86 generic CPUID driver"); +MODULE_LICENSE("GPL"); diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c new file mode 100644 index 00000000000..507de806659 --- /dev/null +++ b/arch/x86/kernel/crash.c @@ -0,0 +1,137 @@ +/* + * Architecture specific (i386/x86_64) functions for kexec based crash dumps. + * + * Created by: Hariprasad Nellitheertha (hari@in.ibm.com) + * + * Copyright (C) IBM Corporation, 2004. All rights reserved. + * + */ + +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/smp.h> +#include <linux/reboot.h> +#include <linux/kexec.h> +#include <linux/delay.h> +#include <linux/elf.h> +#include <linux/elfcore.h> +#include <linux/module.h> + +#include <asm/processor.h> +#include <asm/hardirq.h> +#include <asm/nmi.h> +#include <asm/hw_irq.h> +#include <asm/apic.h> +#include <asm/hpet.h> +#include <linux/kdebug.h> +#include <asm/cpu.h> +#include <asm/reboot.h> +#include <asm/virtext.h> + +int in_crash_kexec; + +/* + * This is used to VMCLEAR all VMCSs loaded on the + * processor. And when loading kvm_intel module, the + * callback function pointer will be assigned. + * + * protected by rcu. 
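+ *
+ * (Roughly, kvm_intel does
+ *	rcu_assign_pointer(crash_vmclear_loaded_vmcss, callback);
+ * on load and resets the pointer to NULL on unload; see
+ * cpu_crash_vmclear_loaded_vmcss() below for the read side.)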
+ */ +crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss = NULL; +EXPORT_SYMBOL_GPL(crash_vmclear_loaded_vmcss); + +static inline void cpu_crash_vmclear_loaded_vmcss(void) +{ + crash_vmclear_fn *do_vmclear_operation = NULL; + + rcu_read_lock(); + do_vmclear_operation = rcu_dereference(crash_vmclear_loaded_vmcss); + if (do_vmclear_operation) + do_vmclear_operation(); + rcu_read_unlock(); +} + +#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) + +static void kdump_nmi_callback(int cpu, struct pt_regs *regs) +{ +#ifdef CONFIG_X86_32 + struct pt_regs fixed_regs; + + if (!user_mode_vm(regs)) { + crash_fixup_ss_esp(&fixed_regs, regs); + regs = &fixed_regs; + } +#endif + crash_save_cpu(regs, cpu); + + /* + * VMCLEAR VMCSs loaded on all cpus if needed. + */ + cpu_crash_vmclear_loaded_vmcss(); + + /* Disable VMX or SVM if needed. + * + * We need to disable virtualization on all CPUs. + * Having VMX or SVM enabled on any CPU may break rebooting + * after the kdump kernel has finished its task. + */ + cpu_emergency_vmxoff(); + cpu_emergency_svm_disable(); + + disable_local_APIC(); +} + +static void kdump_nmi_shootdown_cpus(void) +{ + in_crash_kexec = 1; + nmi_shootdown_cpus(kdump_nmi_callback); + + disable_local_APIC(); +} + +#else +static void kdump_nmi_shootdown_cpus(void) +{ + /* There are no cpus to shootdown */ +} +#endif + +void native_machine_crash_shutdown(struct pt_regs *regs) +{ + /* This function is only called after the system + * has panicked or is otherwise in a critical state. + * The minimum amount of code to allow a kexec'd kernel + * to run successfully needs to happen here. + * + * In practice this means shooting down the other cpus in + * an SMP system. + */ + /* The kernel is broken so disable interrupts */ + local_irq_disable(); + + kdump_nmi_shootdown_cpus(); + + /* + * VMCLEAR VMCSs loaded on this cpu if needed. + */ + cpu_crash_vmclear_loaded_vmcss(); + + /* Booting kdump kernel with VMX or SVM enabled won't work, + * because (among other limitations) we can't disable paging + * with the virt flags. + */ + cpu_emergency_vmxoff(); + cpu_emergency_svm_disable(); + +#ifdef CONFIG_X86_IO_APIC + /* Prevent crash_kexec() from deadlocking on ioapic_lock. */ + ioapic_zap_locks(); + disable_IO_APIC(); +#endif + lapic_shutdown(); +#ifdef CONFIG_HPET_TIMER + hpet_disable(); +#endif + crash_save_cpu(regs, safe_smp_processor_id()); +} diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c new file mode 100644 index 00000000000..11891ca7b71 --- /dev/null +++ b/arch/x86/kernel/crash_dump_32.c @@ -0,0 +1,95 @@ +/* + * Memory preserving reboot related code. + * + * Created by: Hariprasad Nellitheertha (hari@in.ibm.com) + * Copyright (C) IBM Corporation, 2004. All rights reserved + */ + +#include <linux/slab.h> +#include <linux/errno.h> +#include <linux/highmem.h> +#include <linux/crash_dump.h> + +#include <asm/uaccess.h> + +static void *kdump_buf_page; + +static inline bool is_crashed_pfn_valid(unsigned long pfn) +{ +#ifndef CONFIG_X86_PAE + /* + * non-PAE kdump kernel executed from a PAE one will crop high pte + * bits and poke unwanted space counting again from address 0, we + * don't want that. pte must fit into unsigned long. In fact the + * test checks high 12 bits for being zero (pfn will be shifted left + * by PAGE_SHIFT). 
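+	 * The pfn_pte()/pte_pfn() round trip below drops any bits that
+	 * do not fit in a pte, so the comparison catches exactly those
+	 * pfns.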
+ */ + return pte_pfn(pfn_pte(pfn, __pgprot(0))) == pfn; +#else + return true; +#endif +} + +/** + * copy_oldmem_page - copy one page from "oldmem" + * @pfn: page frame number to be copied + * @buf: target memory address for the copy; this can be in kernel address + * space or user address space (see @userbuf) + * @csize: number of bytes to copy + * @offset: offset in bytes into the page (based on pfn) to begin the copy + * @userbuf: if set, @buf is in user address space, use copy_to_user(), + * otherwise @buf is in kernel address space, use memcpy(). + * + * Copy a page from "oldmem". For this page, there is no pte mapped + * in the current kernel. We stitch up a pte, similar to kmap_atomic. + * + * Calling copy_to_user() in atomic context is not desirable. Hence first + * copying the data to a pre-allocated kernel page and then copying to user + * space in non-atomic context. + */ +ssize_t copy_oldmem_page(unsigned long pfn, char *buf, + size_t csize, unsigned long offset, int userbuf) +{ + void *vaddr; + + if (!csize) + return 0; + + if (!is_crashed_pfn_valid(pfn)) + return -EFAULT; + + vaddr = kmap_atomic_pfn(pfn); + + if (!userbuf) { + memcpy(buf, (vaddr + offset), csize); + kunmap_atomic(vaddr); + } else { + if (!kdump_buf_page) { + printk(KERN_WARNING "Kdump: Kdump buffer page not" + " allocated\n"); + kunmap_atomic(vaddr); + return -EFAULT; + } + copy_page(kdump_buf_page, vaddr); + kunmap_atomic(vaddr); + if (copy_to_user(buf, (kdump_buf_page + offset), csize)) + return -EFAULT; + } + + return csize; +} + +static int __init kdump_buf_page_init(void) +{ + int ret = 0; + + kdump_buf_page = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!kdump_buf_page) { + printk(KERN_WARNING "Kdump: Failed to allocate kdump buffer" + " page\n"); + ret = -ENOMEM; + } + + return ret; +} +arch_initcall(kdump_buf_page_init); diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c new file mode 100644 index 00000000000..afa64adb75e --- /dev/null +++ b/arch/x86/kernel/crash_dump_64.c @@ -0,0 +1,49 @@ +/* + * Memory preserving reboot related code. + * + * Created by: Hariprasad Nellitheertha (hari@in.ibm.com) + * Copyright (C) IBM Corporation, 2004. All rights reserved + */ + +#include <linux/errno.h> +#include <linux/crash_dump.h> +#include <linux/uaccess.h> +#include <linux/io.h> + +/** + * copy_oldmem_page - copy one page from "oldmem" + * @pfn: page frame number to be copied + * @buf: target memory address for the copy; this can be in kernel address + * space or user address space (see @userbuf) + * @csize: number of bytes to copy + * @offset: offset in bytes into the page (based on pfn) to begin the copy + * @userbuf: if set, @buf is in user address space, use copy_to_user(), + * otherwise @buf is in kernel address space, use memcpy(). + * + * Copy a page from "oldmem". For this page, there is no pte mapped + * in the current kernel. We stitch up a pte, similar to kmap_atomic. 
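+ *
+ * Unlike the 32-bit variant there is no highmem to deal with here, so
+ * the page is mapped with ioremap_cache() rather than kmap_atomic_pfn().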
+ */ +ssize_t copy_oldmem_page(unsigned long pfn, char *buf, + size_t csize, unsigned long offset, int userbuf) +{ + void *vaddr; + + if (!csize) + return 0; + + vaddr = ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE); + if (!vaddr) + return -ENOMEM; + + if (userbuf) { + if (copy_to_user(buf, vaddr + offset, csize)) { + iounmap(vaddr); + return -EFAULT; + } + } else + memcpy(buf, vaddr + offset, csize); + + set_iounmap_nonlazy(); + iounmap(vaddr); + return csize; +} diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c new file mode 100644 index 00000000000..7db54b5d5f8 --- /dev/null +++ b/arch/x86/kernel/devicetree.c @@ -0,0 +1,385 @@ +/* + * Architecture specific OF callbacks. + */ +#include <linux/bootmem.h> +#include <linux/export.h> +#include <linux/io.h> +#include <linux/irqdomain.h> +#include <linux/interrupt.h> +#include <linux/list.h> +#include <linux/of.h> +#include <linux/of_fdt.h> +#include <linux/of_address.h> +#include <linux/of_platform.h> +#include <linux/of_irq.h> +#include <linux/slab.h> +#include <linux/pci.h> +#include <linux/of_pci.h> +#include <linux/initrd.h> + +#include <asm/hpet.h> +#include <asm/apic.h> +#include <asm/pci_x86.h> +#include <asm/setup.h> + +__initdata u64 initial_dtb; +char __initdata cmd_line[COMMAND_LINE_SIZE]; + +int __initdata of_ioapic; + +void __init early_init_dt_scan_chosen_arch(unsigned long node) +{ + BUG(); +} + +void __init early_init_dt_add_memory_arch(u64 base, u64 size) +{ + BUG(); +} + +void * __init early_init_dt_alloc_memory_arch(u64 size, u64 align) +{ + return __alloc_bootmem(size, align, __pa(MAX_DMA_ADDRESS)); +} + +void __init add_dtb(u64 data) +{ + initial_dtb = data + offsetof(struct setup_data, data); +} + +/* + * CE4100 ids. Will be moved to machine_device_initcall() once we have it. 
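+ * (CE4100 is an Atom-based Intel media SoC; it is the platform this
+ * x86 device-tree support was written for.)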
+ */ +static struct of_device_id __initdata ce4100_ids[] = { + { .compatible = "intel,ce4100-cp", }, + { .compatible = "isa", }, + { .compatible = "pci", }, + {}, +}; + +static int __init add_bus_probe(void) +{ + if (!of_have_populated_dt()) + return 0; + + return of_platform_bus_probe(NULL, ce4100_ids, NULL); +} +module_init(add_bus_probe); + +#ifdef CONFIG_PCI +struct device_node *pcibios_get_phb_of_node(struct pci_bus *bus) +{ + struct device_node *np; + + for_each_node_by_type(np, "pci") { + const void *prop; + unsigned int bus_min; + + prop = of_get_property(np, "bus-range", NULL); + if (!prop) + continue; + bus_min = be32_to_cpup(prop); + if (bus->number == bus_min) + return np; + } + return NULL; +} + +static int x86_of_pci_irq_enable(struct pci_dev *dev) +{ + u32 virq; + int ret; + u8 pin; + + ret = pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); + if (ret) + return ret; + if (!pin) + return 0; + + virq = of_irq_parse_and_map_pci(dev, 0, 0); + if (virq == 0) + return -EINVAL; + dev->irq = virq; + return 0; +} + +static void x86_of_pci_irq_disable(struct pci_dev *dev) +{ +} + +void x86_of_pci_init(void) +{ + pcibios_enable_irq = x86_of_pci_irq_enable; + pcibios_disable_irq = x86_of_pci_irq_disable; +} +#endif + +static void __init dtb_setup_hpet(void) +{ +#ifdef CONFIG_HPET_TIMER + struct device_node *dn; + struct resource r; + int ret; + + dn = of_find_compatible_node(NULL, NULL, "intel,ce4100-hpet"); + if (!dn) + return; + ret = of_address_to_resource(dn, 0, &r); + if (ret) { + WARN_ON(1); + return; + } + hpet_address = r.start; +#endif +} + +static void __init dtb_lapic_setup(void) +{ +#ifdef CONFIG_X86_LOCAL_APIC + struct device_node *dn; + struct resource r; + int ret; + + dn = of_find_compatible_node(NULL, NULL, "intel,ce4100-lapic"); + if (!dn) + return; + + ret = of_address_to_resource(dn, 0, &r); + if (WARN_ON(ret)) + return; + + /* Did the boot loader setup the local APIC ? 
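+	 * If not, try to force-enable it at the address the device
+	 * tree provides; bail out if that fails.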
*/ + if (!cpu_has_apic) { + if (apic_force_enable(r.start)) + return; + } + smp_found_config = 1; + pic_mode = 1; + register_lapic_address(r.start); + generic_processor_info(boot_cpu_physical_apicid, + GET_APIC_VERSION(apic_read(APIC_LVR))); +#endif +} + +#ifdef CONFIG_X86_IO_APIC +static unsigned int ioapic_id; + +static void __init dtb_add_ioapic(struct device_node *dn) +{ + struct resource r; + int ret; + + ret = of_address_to_resource(dn, 0, &r); + if (ret) { + printk(KERN_ERR "Can't obtain address from node %s.\n", + dn->full_name); + return; + } + mp_register_ioapic(++ioapic_id, r.start, gsi_top); +} + +static void __init dtb_ioapic_setup(void) +{ + struct device_node *dn; + + for_each_compatible_node(dn, NULL, "intel,ce4100-ioapic") + dtb_add_ioapic(dn); + + if (nr_ioapics) { + of_ioapic = 1; + return; + } + printk(KERN_ERR "Error: No information about IO-APIC in OF.\n"); +} +#else +static void __init dtb_ioapic_setup(void) {} +#endif + +static void __init dtb_apic_setup(void) +{ + dtb_lapic_setup(); + dtb_ioapic_setup(); +} + +#ifdef CONFIG_OF_FLATTREE +static void __init x86_flattree_get_config(void) +{ + u32 size, map_len; + void *dt; + + if (!initial_dtb) + return; + + map_len = max(PAGE_SIZE - (initial_dtb & ~PAGE_MASK), (u64)128); + + initial_boot_params = dt = early_memremap(initial_dtb, map_len); + size = of_get_flat_dt_size(); + if (map_len < size) { + early_iounmap(dt, map_len); + initial_boot_params = dt = early_memremap(initial_dtb, size); + map_len = size; + } + + unflatten_and_copy_device_tree(); + early_iounmap(dt, map_len); +} +#else +static inline void x86_flattree_get_config(void) { } +#endif + +void __init x86_dtb_init(void) +{ + x86_flattree_get_config(); + + if (!of_have_populated_dt()) + return; + + dtb_setup_hpet(); + dtb_apic_setup(); +} + +#ifdef CONFIG_X86_IO_APIC + +struct of_ioapic_type { + u32 out_type; + u32 trigger; + u32 polarity; +}; + +static struct of_ioapic_type of_ioapic_type[] = +{ + { + .out_type = IRQ_TYPE_EDGE_RISING, + .trigger = IOAPIC_EDGE, + .polarity = 1, + }, + { + .out_type = IRQ_TYPE_LEVEL_LOW, + .trigger = IOAPIC_LEVEL, + .polarity = 0, + }, + { + .out_type = IRQ_TYPE_LEVEL_HIGH, + .trigger = IOAPIC_LEVEL, + .polarity = 1, + }, + { + .out_type = IRQ_TYPE_EDGE_FALLING, + .trigger = IOAPIC_EDGE, + .polarity = 0, + }, +}; + +static int ioapic_xlate(struct irq_domain *domain, + struct device_node *controller, + const u32 *intspec, u32 intsize, + irq_hw_number_t *out_hwirq, u32 *out_type) +{ + struct io_apic_irq_attr attr; + struct of_ioapic_type *it; + u32 line, idx; + int rc; + + if (WARN_ON(intsize < 2)) + return -EINVAL; + + line = intspec[0]; + + if (intspec[1] >= ARRAY_SIZE(of_ioapic_type)) + return -EINVAL; + + it = &of_ioapic_type[intspec[1]]; + + idx = (u32) domain->host_data; + set_io_apic_irq_attr(&attr, idx, line, it->trigger, it->polarity); + + rc = io_apic_setup_irq_pin_once(irq_find_mapping(domain, line), + cpu_to_node(0), &attr); + if (rc) + return rc; + + *out_hwirq = line; + *out_type = it->out_type; + return 0; +} + +const struct irq_domain_ops ioapic_irq_domain_ops = { + .xlate = ioapic_xlate, +}; + +static void dt_add_ioapic_domain(unsigned int ioapic_num, + struct device_node *np) +{ + struct irq_domain *id; + struct mp_ioapic_gsi *gsi_cfg; + int ret; + int num; + + gsi_cfg = mp_ioapic_gsi_routing(ioapic_num); + num = gsi_cfg->gsi_end - gsi_cfg->gsi_base + 1; + + id = irq_domain_add_linear(np, num, &ioapic_irq_domain_ops, + (void *)ioapic_num); + BUG_ON(!id); + if (gsi_cfg->gsi_base == 0) { + /* + * The first 
NR_IRQS_LEGACY irq descs are allocated in + * early_irq_init() and need just a mapping. The + * remaining irqs need both. All of them are preallocated + * and assigned so we can keep the 1:1 mapping which the ioapic + * is having. + */ + irq_domain_associate_many(id, 0, 0, NR_IRQS_LEGACY); + + if (num > NR_IRQS_LEGACY) { + ret = irq_create_strict_mappings(id, NR_IRQS_LEGACY, + NR_IRQS_LEGACY, num - NR_IRQS_LEGACY); + if (ret) + pr_err("Error creating mapping for the " + "remaining IRQs: %d\n", ret); + } + irq_set_default_host(id); + } else { + ret = irq_create_strict_mappings(id, gsi_cfg->gsi_base, 0, num); + if (ret) + pr_err("Error creating IRQ mapping: %d\n", ret); + } +} + +static void __init ioapic_add_ofnode(struct device_node *np) +{ + struct resource r; + int i, ret; + + ret = of_address_to_resource(np, 0, &r); + if (ret) { + printk(KERN_ERR "Failed to obtain address for %s\n", + np->full_name); + return; + } + + for (i = 0; i < nr_ioapics; i++) { + if (r.start == mpc_ioapic_addr(i)) { + dt_add_ioapic_domain(i, np); + return; + } + } + printk(KERN_ERR "IOxAPIC at %s is not registered.\n", np->full_name); +} + +void __init x86_add_irq_domains(void) +{ + struct device_node *dp; + + if (!of_have_populated_dt()) + return; + + for_each_node_with_property(dp, "interrupt-controller") { + if (of_device_is_compatible(dp, "intel,ce4100-ioapic")) + ioapic_add_ofnode(dp); + } +} +#else +void __init x86_add_irq_domains(void) { } +#endif diff --git a/arch/x86/kernel/doublefault.c b/arch/x86/kernel/doublefault.c new file mode 100644 index 00000000000..f6dfd9334b6 --- /dev/null +++ b/arch/x86/kernel/doublefault.c @@ -0,0 +1,83 @@ +#include <linux/mm.h> +#include <linux/sched.h> +#include <linux/init_task.h> +#include <linux/fs.h> + +#include <asm/uaccess.h> +#include <asm/pgtable.h> +#include <asm/processor.h> +#include <asm/desc.h> + +#ifdef CONFIG_X86_32 + +#define DOUBLEFAULT_STACKSIZE (1024) +static unsigned long doublefault_stack[DOUBLEFAULT_STACKSIZE]; +#define STACK_START (unsigned long)(doublefault_stack+DOUBLEFAULT_STACKSIZE) + +#define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + MAXMEM) + +static void doublefault_fn(void) +{ + struct desc_ptr gdt_desc = {0, 0}; + unsigned long gdt, tss; + + native_store_gdt(&gdt_desc); + gdt = gdt_desc.address; + + printk(KERN_EMERG "PANIC: double fault, gdt at %08lx [%d bytes]\n", gdt, gdt_desc.size); + + if (ptr_ok(gdt)) { + gdt += GDT_ENTRY_TSS << 3; + tss = get_desc_base((struct desc_struct *)gdt); + printk(KERN_EMERG "double fault, tss at %08lx\n", tss); + + if (ptr_ok(tss)) { + struct x86_hw_tss *t = (struct x86_hw_tss *)tss; + + printk(KERN_EMERG "eip = %08lx, esp = %08lx\n", + t->ip, t->sp); + + printk(KERN_EMERG "eax = %08lx, ebx = %08lx, ecx = %08lx, edx = %08lx\n", + t->ax, t->bx, t->cx, t->dx); + printk(KERN_EMERG "esi = %08lx, edi = %08lx\n", + t->si, t->di); + } + } + + for (;;) + cpu_relax(); +} + +struct tss_struct doublefault_tss __cacheline_aligned = { + .x86_tss = { + .sp0 = STACK_START, + .ss0 = __KERNEL_DS, + .ldt = 0, + .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, + + .ip = (unsigned long) doublefault_fn, + /* 0x2 bit is always set */ + .flags = X86_EFLAGS_SF | 0x2, + .sp = STACK_START, + .es = __USER_DS, + .cs = __KERNEL_CS, + .ss = __KERNEL_DS, + .ds = __USER_DS, + .fs = __KERNEL_PERCPU, + + .__cr3 = __pa_nodebug(swapper_pg_dir), + } +}; + +/* dummy for do_double_fault() call */ +void df_debug(struct pt_regs *regs, long error_code) {} + +#else /* !CONFIG_X86_32 */ + +void df_debug(struct pt_regs *regs, long error_code) +{ + 
pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code); + show_regs(regs); + panic("Machine halted."); +} +#endif diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c new file mode 100644 index 00000000000..b74ebc7c440 --- /dev/null +++ b/arch/x86/kernel/dumpstack.c @@ -0,0 +1,349 @@ +/* + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs + */ +#include <linux/kallsyms.h> +#include <linux/kprobes.h> +#include <linux/uaccess.h> +#include <linux/utsname.h> +#include <linux/hardirq.h> +#include <linux/kdebug.h> +#include <linux/module.h> +#include <linux/ptrace.h> +#include <linux/ftrace.h> +#include <linux/kexec.h> +#include <linux/bug.h> +#include <linux/nmi.h> +#include <linux/sysfs.h> + +#include <asm/stacktrace.h> + + +int panic_on_unrecovered_nmi; +int panic_on_io_nmi; +unsigned int code_bytes = 64; +int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE; +static int die_counter; + +static void printk_stack_address(unsigned long address, int reliable) +{ + pr_cont(" [<%p>] %s%pB\n", + (void *)address, reliable ? "" : "? ", (void *)address); +} + +void printk_address(unsigned long address) +{ + pr_cont(" [<%p>] %pS\n", (void *)address, (void *)address); +} + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static void +print_ftrace_graph_addr(unsigned long addr, void *data, + const struct stacktrace_ops *ops, + struct thread_info *tinfo, int *graph) +{ + struct task_struct *task; + unsigned long ret_addr; + int index; + + if (addr != (unsigned long)return_to_handler) + return; + + task = tinfo->task; + index = task->curr_ret_stack; + + if (!task->ret_stack || index < *graph) + return; + + index -= *graph; + ret_addr = task->ret_stack[index].ret; + + ops->address(data, ret_addr, 1); + + (*graph)++; +} +#else +static inline void +print_ftrace_graph_addr(unsigned long addr, void *data, + const struct stacktrace_ops *ops, + struct thread_info *tinfo, int *graph) +{ } +#endif + +/* + * x86-64 can have up to three kernel stacks: + * process stack + * interrupt stack + * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack + */ + +static inline int valid_stack_ptr(struct thread_info *tinfo, + void *p, unsigned int size, void *end) +{ + void *t = tinfo; + if (end) { + if (p < end && p >= (end-THREAD_SIZE)) + return 1; + else + return 0; + } + return p > t && p < t + THREAD_SIZE - size; +} + +unsigned long +print_context_stack(struct thread_info *tinfo, + unsigned long *stack, unsigned long bp, + const struct stacktrace_ops *ops, void *data, + unsigned long *end, int *graph) +{ + struct stack_frame *frame = (struct stack_frame *)bp; + + while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) { + unsigned long addr; + + addr = *stack; + if (__kernel_text_address(addr)) { + if ((unsigned long) stack == bp + sizeof(long)) { + ops->address(data, addr, 1); + frame = frame->next_frame; + bp = (unsigned long) frame; + } else { + ops->address(data, addr, 0); + } + print_ftrace_graph_addr(addr, data, ops, tinfo, graph); + } + stack++; + } + return bp; +} +EXPORT_SYMBOL_GPL(print_context_stack); + +unsigned long +print_context_stack_bp(struct thread_info *tinfo, + unsigned long *stack, unsigned long bp, + const struct stacktrace_ops *ops, void *data, + unsigned long *end, int *graph) +{ + struct stack_frame *frame = (struct stack_frame *)bp; + unsigned long *ret_addr = &frame->return_address; + + while (valid_stack_ptr(tinfo, ret_addr, sizeof(*ret_addr), end)) { + unsigned long addr = *ret_addr; + + if 
(!__kernel_text_address(addr)) + break; + + ops->address(data, addr, 1); + frame = frame->next_frame; + ret_addr = &frame->return_address; + print_ftrace_graph_addr(addr, data, ops, tinfo, graph); + } + + return (unsigned long)frame; +} +EXPORT_SYMBOL_GPL(print_context_stack_bp); + +static int print_trace_stack(void *data, char *name) +{ + printk("%s <%s> ", (char *)data, name); + return 0; +} + +/* + * Print one address/symbol entries per line. + */ +static void print_trace_address(void *data, unsigned long addr, int reliable) +{ + touch_nmi_watchdog(); + printk(data); + printk_stack_address(addr, reliable); +} + +static const struct stacktrace_ops print_trace_ops = { + .stack = print_trace_stack, + .address = print_trace_address, + .walk_stack = print_context_stack, +}; + +void +show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, unsigned long bp, char *log_lvl) +{ + printk("%sCall Trace:\n", log_lvl); + dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); +} + +void show_trace(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, unsigned long bp) +{ + show_trace_log_lvl(task, regs, stack, bp, ""); +} + +void show_stack(struct task_struct *task, unsigned long *sp) +{ + unsigned long bp = 0; + unsigned long stack; + + /* + * Stack frames below this one aren't interesting. Don't show them + * if we're printing for %current. + */ + if (!sp && (!task || task == current)) { + sp = &stack; + bp = stack_frame(current, NULL); + } + + show_stack_log_lvl(task, NULL, sp, bp, ""); +} + +static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED; +static int die_owner = -1; +static unsigned int die_nest_count; + +unsigned long oops_begin(void) +{ + int cpu; + unsigned long flags; + + oops_enter(); + + /* racy, but better than risking deadlock. */ + raw_local_irq_save(flags); + cpu = smp_processor_id(); + if (!arch_spin_trylock(&die_lock)) { + if (cpu == die_owner) + /* nested oops. should stop eventually */; + else + arch_spin_lock(&die_lock); + } + die_nest_count++; + die_owner = cpu; + console_verbose(); + bust_spinlocks(1); + return flags; +} +EXPORT_SYMBOL_GPL(oops_begin); +NOKPROBE_SYMBOL(oops_begin); + +void oops_end(unsigned long flags, struct pt_regs *regs, int signr) +{ + if (regs && kexec_should_crash(current)) + crash_kexec(regs); + + bust_spinlocks(0); + die_owner = -1; + add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE); + die_nest_count--; + if (!die_nest_count) + /* Nest count reaches zero, release the lock. 
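+		 * A nested oops on this CPU leaves die_nest_count
+		 * raised, so only the outermost oops_end() releases it.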
*/ + arch_spin_unlock(&die_lock); + raw_local_irq_restore(flags); + oops_exit(); + + if (!signr) + return; + if (in_interrupt()) + panic("Fatal exception in interrupt"); + if (panic_on_oops) + panic("Fatal exception"); + do_exit(signr); +} +NOKPROBE_SYMBOL(oops_end); + +int __die(const char *str, struct pt_regs *regs, long err) +{ +#ifdef CONFIG_X86_32 + unsigned short ss; + unsigned long sp; +#endif + printk(KERN_DEFAULT + "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); +#ifdef CONFIG_PREEMPT + printk("PREEMPT "); +#endif +#ifdef CONFIG_SMP + printk("SMP "); +#endif +#ifdef CONFIG_DEBUG_PAGEALLOC + printk("DEBUG_PAGEALLOC"); +#endif + printk("\n"); + if (notify_die(DIE_OOPS, str, regs, err, + current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP) + return 1; + + print_modules(); + show_regs(regs); +#ifdef CONFIG_X86_32 + if (user_mode_vm(regs)) { + sp = regs->sp; + ss = regs->ss & 0xffff; + } else { + sp = kernel_stack_pointer(regs); + savesegment(ss, ss); + } + printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); + print_symbol("%s", regs->ip); + printk(" SS:ESP %04x:%08lx\n", ss, sp); +#else + /* Executive summary in case the oops scrolled away */ + printk(KERN_ALERT "RIP "); + printk_address(regs->ip); + printk(" RSP <%016lx>\n", regs->sp); +#endif + return 0; +} +NOKPROBE_SYMBOL(__die); + +/* + * This is gone through when something in the kernel has done something bad + * and is about to be terminated: + */ +void die(const char *str, struct pt_regs *regs, long err) +{ + unsigned long flags = oops_begin(); + int sig = SIGSEGV; + + if (!user_mode_vm(regs)) + report_bug(regs->ip, regs); + + if (__die(str, regs, err)) + sig = 0; + oops_end(flags, regs, sig); +} + +static int __init kstack_setup(char *s) +{ + ssize_t ret; + unsigned long val; + + if (!s) + return -EINVAL; + + ret = kstrtoul(s, 0, &val); + if (ret) + return ret; + kstack_depth_to_print = val; + return 0; +} +early_param("kstack", kstack_setup); + +static int __init code_bytes_setup(char *s) +{ + ssize_t ret; + unsigned long val; + + if (!s) + return -EINVAL; + + ret = kstrtoul(s, 0, &val); + if (ret) + return ret; + + code_bytes = val; + if (code_bytes > 8192) + code_bytes = 8192; + + return 1; +} +__setup("code_bytes=", code_bytes_setup); diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c new file mode 100644 index 00000000000..5abd4cd4230 --- /dev/null +++ b/arch/x86/kernel/dumpstack_32.c @@ -0,0 +1,174 @@ +/* + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs + */ +#include <linux/kallsyms.h> +#include <linux/kprobes.h> +#include <linux/uaccess.h> +#include <linux/hardirq.h> +#include <linux/kdebug.h> +#include <linux/module.h> +#include <linux/ptrace.h> +#include <linux/kexec.h> +#include <linux/sysfs.h> +#include <linux/bug.h> +#include <linux/nmi.h> + +#include <asm/stacktrace.h> + +static void *is_irq_stack(void *p, void *irq) +{ + if (p < irq || p >= (irq + THREAD_SIZE)) + return NULL; + return irq + THREAD_SIZE; +} + + +static void *is_hardirq_stack(unsigned long *stack, int cpu) +{ + void *irq = per_cpu(hardirq_stack, cpu); + + return is_irq_stack(stack, irq); +} + +static void *is_softirq_stack(unsigned long *stack, int cpu) +{ + void *irq = per_cpu(softirq_stack, cpu); + + return is_irq_stack(stack, irq); +} + +void dump_trace(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, unsigned long bp, + const struct stacktrace_ops *ops, void *data) +{ + const unsigned cpu = get_cpu(); + int graph = 0; + u32 *prev_esp; + + if 
(!task)
+		task = current;
+
+	if (!stack) {
+		unsigned long dummy;
+
+		stack = &dummy;
+		if (task != current)
+			stack = (unsigned long *)task->thread.sp;
+	}
+
+	if (!bp)
+		bp = stack_frame(task, regs);
+
+	for (;;) {
+		struct thread_info *context;
+		void *end_stack;
+
+		end_stack = is_hardirq_stack(stack, cpu);
+		if (!end_stack)
+			end_stack = is_softirq_stack(stack, cpu);
+
+		context = task_thread_info(task);
+		bp = ops->walk_stack(context, stack, bp, ops, data,
+				     end_stack, &graph);
+
+		/* Stop if not on irq stack */
+		if (!end_stack)
+			break;
+
+		/* The previous esp is saved on the bottom of the stack */
+		prev_esp = (u32 *)(end_stack - THREAD_SIZE);
+		stack = (unsigned long *)*prev_esp;
+		if (!stack)
+			break;
+
+		if (ops->stack(data, "IRQ") < 0)
+			break;
+		touch_nmi_watchdog();
+	}
+	put_cpu();
+}
+EXPORT_SYMBOL(dump_trace);
+
+void
+show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
+		   unsigned long *sp, unsigned long bp, char *log_lvl)
+{
+	unsigned long *stack;
+	int i;
+
+	if (sp == NULL) {
+		if (task)
+			sp = (unsigned long *)task->thread.sp;
+		else
+			sp = (unsigned long *)&sp;
+	}
+
+	stack = sp;
+	for (i = 0; i < kstack_depth_to_print; i++) {
+		if (kstack_end(stack))
+			break;
+		if (i && ((i % STACKSLOTS_PER_LINE) == 0))
+			pr_cont("\n");
+		pr_cont(" %08lx", *stack++);
+		touch_nmi_watchdog();
+	}
+	pr_cont("\n");
+	show_trace_log_lvl(task, regs, sp, bp, log_lvl);
+}
+
+
+void show_regs(struct pt_regs *regs)
+{
+	int i;
+
+	show_regs_print_info(KERN_EMERG);
+	__show_regs(regs, !user_mode_vm(regs));
+
+	/*
+	 * When in-kernel, we also print out the stack and code at the
+	 * time of the fault..
+	 */
+	if (!user_mode_vm(regs)) {
+		unsigned int code_prologue = code_bytes * 43 / 64;
+		unsigned int code_len = code_bytes;
+		unsigned char c;
+		u8 *ip;
+
+		pr_emerg("Stack:\n");
+		show_stack_log_lvl(NULL, regs, &regs->sp, 0, KERN_EMERG);
+
+		pr_emerg("Code:");
+
+		ip = (u8 *)regs->ip - code_prologue;
+		if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
+			/* try starting at IP */
+			ip = (u8 *)regs->ip;
+			code_len = code_len - code_prologue + 1;
+		}
+		for (i = 0; i < code_len; i++, ip++) {
+			if (ip < (u8 *)PAGE_OFFSET ||
+			    probe_kernel_address(ip, c)) {
+				pr_cont(" Bad EIP value.");
+				break;
+			}
+			if (ip == (u8 *)regs->ip)
+				pr_cont(" <%02x>", c);
+			else
+				pr_cont(" %02x", c);
+		}
+	}
+	pr_cont("\n");
+}
+
+int is_valid_bugaddr(unsigned long ip)
+{
+	unsigned short ud2;
+
+	if (ip < PAGE_OFFSET)
+		return 0;
+	if (probe_kernel_address((unsigned short *)ip, ud2))
+		return 0;
+
+	return ud2 == 0x0b0f;
+}
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
new file mode 100644
index 00000000000..1abcb50b48a
--- /dev/null
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -0,0 +1,352 @@
+/*
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
+ */
+#include <linux/kallsyms.h>
+#include <linux/kprobes.h>
+#include <linux/uaccess.h>
+#include <linux/hardirq.h>
+#include <linux/kdebug.h>
+#include <linux/module.h>
+#include <linux/ptrace.h>
+#include <linux/kexec.h>
+#include <linux/sysfs.h>
+#include <linux/bug.h>
+#include <linux/nmi.h>
+
+#include <asm/stacktrace.h>
+
+
+#define N_EXCEPTION_STACKS_END \
+	(N_EXCEPTION_STACKS + DEBUG_STKSZ/EXCEPTION_STKSZ - 2)
+
+static char x86_stack_ids[][8] = {
+	[ DEBUG_STACK-1		]	= "#DB",
+	[ NMI_STACK-1		]	= "NMI",
+	[ DOUBLEFAULT_STACK-1	]	= "#DF",
+	[ STACKFAULT_STACK-1	]	= "#SS",
+	[ MCE_STACK-1		]	= "#MC",
+#if DEBUG_STKSZ > EXCEPTION_STKSZ
+	[ N_EXCEPTION_STACKS ...
+ N_EXCEPTION_STACKS_END ] = "#DB[?]" +#endif +}; + +static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, + unsigned *usedp, char **idp) +{ + unsigned k; + + /* + * Iterate over all exception stacks, and figure out whether + * 'stack' is in one of them: + */ + for (k = 0; k < N_EXCEPTION_STACKS; k++) { + unsigned long end = per_cpu(orig_ist, cpu).ist[k]; + /* + * Is 'stack' above this exception frame's end? + * If yes then skip to the next frame. + */ + if (stack >= end) + continue; + /* + * Is 'stack' above this exception frame's start address? + * If yes then we found the right frame. + */ + if (stack >= end - EXCEPTION_STKSZ) { + /* + * Make sure we only iterate through an exception + * stack once. If it comes up for the second time + * then there's something wrong going on - just + * break out and return NULL: + */ + if (*usedp & (1U << k)) + break; + *usedp |= 1U << k; + *idp = x86_stack_ids[k]; + return (unsigned long *)end; + } + /* + * If this is a debug stack, and if it has a larger size than + * the usual exception stacks, then 'stack' might still + * be within the lower portion of the debug stack: + */ +#if DEBUG_STKSZ > EXCEPTION_STKSZ + if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) { + unsigned j = N_EXCEPTION_STACKS - 1; + + /* + * Black magic. A large debug stack is composed of + * multiple exception stack entries, which we + * iterate through now. Dont look: + */ + do { + ++j; + end -= EXCEPTION_STKSZ; + x86_stack_ids[j][4] = '1' + + (j - N_EXCEPTION_STACKS); + } while (stack < end - EXCEPTION_STKSZ); + if (*usedp & (1U << j)) + break; + *usedp |= 1U << j; + *idp = x86_stack_ids[j]; + return (unsigned long *)end; + } +#endif + } + return NULL; +} + +static inline int +in_irq_stack(unsigned long *stack, unsigned long *irq_stack, + unsigned long *irq_stack_end) +{ + return (stack >= irq_stack && stack < irq_stack_end); +} + +static const unsigned long irq_stack_size = + (IRQ_STACK_SIZE - 64) / sizeof(unsigned long); + +enum stack_type { + STACK_IS_UNKNOWN, + STACK_IS_NORMAL, + STACK_IS_EXCEPTION, + STACK_IS_IRQ, +}; + +static enum stack_type +analyze_stack(int cpu, struct task_struct *task, unsigned long *stack, + unsigned long **stack_end, unsigned long *irq_stack, + unsigned *used, char **id) +{ + unsigned long addr; + + addr = ((unsigned long)stack & (~(THREAD_SIZE - 1))); + if ((unsigned long)task_stack_page(task) == addr) + return STACK_IS_NORMAL; + + *stack_end = in_exception_stack(cpu, (unsigned long)stack, + used, id); + if (*stack_end) + return STACK_IS_EXCEPTION; + + if (!irq_stack) + return STACK_IS_NORMAL; + + *stack_end = irq_stack; + irq_stack = irq_stack - irq_stack_size; + + if (in_irq_stack(stack, irq_stack, *stack_end)) + return STACK_IS_IRQ; + + return STACK_IS_UNKNOWN; +} + +/* + * x86-64 can have up to three kernel stacks: + * process stack + * interrupt stack + * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack + */ + +void dump_trace(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, unsigned long bp, + const struct stacktrace_ops *ops, void *data) +{ + const unsigned cpu = get_cpu(); + struct thread_info *tinfo; + unsigned long *irq_stack = (unsigned long *)per_cpu(irq_stack_ptr, cpu); + unsigned long dummy; + unsigned used = 0; + int graph = 0; + int done = 0; + + if (!task) + task = current; + + if (!stack) { + if (regs) + stack = (unsigned long *)regs->sp; + else if (task != current) + stack = (unsigned long *)task->thread.sp; + else + stack = &dummy; + } + + if (!bp) 
+ bp = stack_frame(task, regs); + /* + * Print function call entries in all stacks, starting at the + * current stack address. If the stacks consist of nested + * exceptions + */ + tinfo = task_thread_info(task); + while (!done) { + unsigned long *stack_end; + enum stack_type stype; + char *id; + + stype = analyze_stack(cpu, task, stack, &stack_end, + irq_stack, &used, &id); + + /* Default finish unless specified to continue */ + done = 1; + + switch (stype) { + + /* Break out early if we are on the thread stack */ + case STACK_IS_NORMAL: + break; + + case STACK_IS_EXCEPTION: + + if (ops->stack(data, id) < 0) + break; + + bp = ops->walk_stack(tinfo, stack, bp, ops, + data, stack_end, &graph); + ops->stack(data, "<EOE>"); + /* + * We link to the next stack via the + * second-to-last pointer (index -2 to end) in the + * exception stack: + */ + stack = (unsigned long *) stack_end[-2]; + done = 0; + break; + + case STACK_IS_IRQ: + + if (ops->stack(data, "IRQ") < 0) + break; + bp = ops->walk_stack(tinfo, stack, bp, + ops, data, stack_end, &graph); + /* + * We link to the next stack (which would be + * the process stack normally) the last + * pointer (index -1 to end) in the IRQ stack: + */ + stack = (unsigned long *) (stack_end[-1]); + irq_stack = NULL; + ops->stack(data, "EOI"); + done = 0; + break; + + case STACK_IS_UNKNOWN: + ops->stack(data, "UNK"); + break; + } + } + + /* + * This handles the process stack: + */ + bp = ops->walk_stack(tinfo, stack, bp, ops, data, NULL, &graph); + put_cpu(); +} +EXPORT_SYMBOL(dump_trace); + +void +show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *sp, unsigned long bp, char *log_lvl) +{ + unsigned long *irq_stack_end; + unsigned long *irq_stack; + unsigned long *stack; + int cpu; + int i; + + preempt_disable(); + cpu = smp_processor_id(); + + irq_stack_end = (unsigned long *)(per_cpu(irq_stack_ptr, cpu)); + irq_stack = (unsigned long *)(per_cpu(irq_stack_ptr, cpu) - IRQ_STACK_SIZE); + + /* + * Debugging aid: "show_stack(NULL, NULL);" prints the + * back trace for this cpu: + */ + if (sp == NULL) { + if (task) + sp = (unsigned long *)task->thread.sp; + else + sp = (unsigned long *)&sp; + } + + stack = sp; + for (i = 0; i < kstack_depth_to_print; i++) { + if (stack >= irq_stack && stack <= irq_stack_end) { + if (stack == irq_stack_end) { + stack = (unsigned long *) (irq_stack_end[-1]); + pr_cont(" <EOI> "); + } + } else { + if (((long) stack & (THREAD_SIZE-1)) == 0) + break; + } + if (i && ((i % STACKSLOTS_PER_LINE) == 0)) + pr_cont("\n"); + pr_cont(" %016lx", *stack++); + touch_nmi_watchdog(); + } + preempt_enable(); + + pr_cont("\n"); + show_trace_log_lvl(task, regs, sp, bp, log_lvl); +} + +void show_regs(struct pt_regs *regs) +{ + int i; + unsigned long sp; + + sp = regs->sp; + show_regs_print_info(KERN_DEFAULT); + __show_regs(regs, 1); + + /* + * When in-kernel, we also print out the stack and code at the + * time of the fault.. 
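+	 * (we dump code_bytes bytes of code, starting code_bytes * 43/64
+	 * bytes before regs->ip so the faulting instruction sits inside
+	 * the window; see the "code_bytes=" boot parameter handled in
+	 * dumpstack.c)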
+ */ + if (!user_mode(regs)) { + unsigned int code_prologue = code_bytes * 43 / 64; + unsigned int code_len = code_bytes; + unsigned char c; + u8 *ip; + + printk(KERN_DEFAULT "Stack:\n"); + show_stack_log_lvl(NULL, regs, (unsigned long *)sp, + 0, KERN_DEFAULT); + + printk(KERN_DEFAULT "Code: "); + + ip = (u8 *)regs->ip - code_prologue; + if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { + /* try starting at IP */ + ip = (u8 *)regs->ip; + code_len = code_len - code_prologue + 1; + } + for (i = 0; i < code_len; i++, ip++) { + if (ip < (u8 *)PAGE_OFFSET || + probe_kernel_address(ip, c)) { + pr_cont(" Bad RIP value."); + break; + } + if (ip == (u8 *)regs->ip) + pr_cont("<%02x> ", c); + else + pr_cont("%02x ", c); + } + } + pr_cont("\n"); +} + +int is_valid_bugaddr(unsigned long ip) +{ + unsigned short ud2; + + if (__copy_from_user(&ud2, (const void __user *) ip, sizeof(ud2))) + return 0; + + return ud2 == 0x0b0f; +} diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c new file mode 100644 index 00000000000..988c00a1f60 --- /dev/null +++ b/arch/x86/kernel/e820.c @@ -0,0 +1,1132 @@ +/* + * Handle the memory map. + * The functions here do the job until bootmem takes over. + * + * Getting sanitize_e820_map() in sync with i386 version by applying change: + * - Provisions for empty E820 memory regions (reported by certain BIOSes). + * Alex Achenbach <xela@slit.de>, December 2002. + * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> + * + */ +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/init.h> +#include <linux/crash_dump.h> +#include <linux/export.h> +#include <linux/bootmem.h> +#include <linux/pfn.h> +#include <linux/suspend.h> +#include <linux/acpi.h> +#include <linux/firmware-map.h> +#include <linux/memblock.h> +#include <linux/sort.h> + +#include <asm/e820.h> +#include <asm/proto.h> +#include <asm/setup.h> + +/* + * The e820 map is the map that gets modified e.g. with command line parameters + * and that is also registered with modifications in the kernel resource tree + * with the iomem_resource as parent. + * + * The e820_saved is directly saved after the BIOS-provided memory map is + * copied. It doesn't get modified afterwards. It's registered for the + * /sys/firmware/memmap interface. + * + * That memory map is not modified and is used as base for kexec. The kexec'd + * kernel should get the same memory map as the firmware provides. Then the + * user can e.g. boot the original kernel with mem=1G while still booting the + * next kernel with full memory. + */ +struct e820map e820; +struct e820map e820_saved; + +/* For PCI or other memory-mapped resources */ +unsigned long pci_mem_start = 0xaeedbabe; +#ifdef CONFIG_PCI +EXPORT_SYMBOL(pci_mem_start); +#endif + +/* + * This function checks if any part of the range <start,end> is mapped + * with type. + */ +int +e820_any_mapped(u64 start, u64 end, unsigned type) +{ + int i; + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + + if (type && ei->type != type) + continue; + if (ei->addr >= end || ei->addr + ei->size <= start) + continue; + return 1; + } + return 0; +} +EXPORT_SYMBOL_GPL(e820_any_mapped); + +/* + * This function checks if the entire range <start,end> is mapped with type. 
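+ *
+ * As an editor's illustration, a minimal userspace model of the same
+ * walk (hypothetical names, not kernel API), kept under #if 0 so it is
+ * never compiled:
+ */
+#if 0
+struct range { unsigned long long addr, size; unsigned type; };
+
+/* 'map' must be sorted and non-overlapping, as assumed above */
+static int all_mapped(const struct range *map, int n,
+		      unsigned long long start, unsigned long long end,
+		      unsigned type)
+{
+	int i;
+
+	for (i = 0; i < n; i++) {
+		if (map[i].type != type)
+			continue;
+		if (map[i].addr >= end || map[i].addr + map[i].size <= start)
+			continue;
+		if (map[i].addr <= start)	/* covers the head, advance */
+			start = map[i].addr + map[i].size;
+		if (start >= end)		/* nothing left uncovered */
+			return 1;
+	}
+	return 0;
+}
+#endif
+/*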
+ * + * Note: this function only works correct if the e820 table is sorted and + * not-overlapping, which is the case + */ +int __init e820_all_mapped(u64 start, u64 end, unsigned type) +{ + int i; + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + + if (type && ei->type != type) + continue; + /* is the region (part) in overlap with the current region ?*/ + if (ei->addr >= end || ei->addr + ei->size <= start) + continue; + + /* if the region is at the beginning of <start,end> we move + * start to the end of the region since it's ok until there + */ + if (ei->addr <= start) + start = ei->addr + ei->size; + /* + * if start is now at or beyond end, we're done, full + * coverage + */ + if (start >= end) + return 1; + } + return 0; +} + +/* + * Add a memory region to the kernel e820 map. + */ +static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size, + int type) +{ + int x = e820x->nr_map; + + if (x >= ARRAY_SIZE(e820x->map)) { + printk(KERN_ERR "e820: too many entries; ignoring [mem %#010llx-%#010llx]\n", + (unsigned long long) start, + (unsigned long long) (start + size - 1)); + return; + } + + e820x->map[x].addr = start; + e820x->map[x].size = size; + e820x->map[x].type = type; + e820x->nr_map++; +} + +void __init e820_add_region(u64 start, u64 size, int type) +{ + __e820_add_region(&e820, start, size, type); +} + +static void __init e820_print_type(u32 type) +{ + switch (type) { + case E820_RAM: + case E820_RESERVED_KERN: + printk(KERN_CONT "usable"); + break; + case E820_RESERVED: + printk(KERN_CONT "reserved"); + break; + case E820_ACPI: + printk(KERN_CONT "ACPI data"); + break; + case E820_NVS: + printk(KERN_CONT "ACPI NVS"); + break; + case E820_UNUSABLE: + printk(KERN_CONT "unusable"); + break; + default: + printk(KERN_CONT "type %u", type); + break; + } +} + +void __init e820_print_map(char *who) +{ + int i; + + for (i = 0; i < e820.nr_map; i++) { + printk(KERN_INFO "%s: [mem %#018Lx-%#018Lx] ", who, + (unsigned long long) e820.map[i].addr, + (unsigned long long) + (e820.map[i].addr + e820.map[i].size - 1)); + e820_print_type(e820.map[i].type); + printk(KERN_CONT "\n"); + } +} + +/* + * Sanitize the BIOS e820 map. + * + * Some e820 responses include overlapping entries. The following + * replaces the original e820 map with a new one, removing overlaps, + * and resolving conflicting memory types in favor of highest + * numbered type. + * + * The input parameter biosmap points to an array of 'struct + * e820entry' which on entry has elements in the range [0, *pnr_map) + * valid, and which has space for up to max_nr_map entries. + * On return, the resulting sanitized e820 map entries will be in + * overwritten in the same location, starting at biosmap. + * + * The integer pointed to by pnr_map must be valid on entry (the + * current number of valid entries located at biosmap) and will + * be updated on return, with the new number of valid entries + * (something no more than max_nr_map.) + * + * The return value from sanitize_e820_map() is zero if it + * successfully 'sanitized' the map entries passed in, and is -1 + * if it did nothing, which can happen if either of (1) it was + * only passed one map entry, or (2) any of the input map entries + * were invalid (start + size < start, meaning that the size was + * so big the described memory range wrapped around through zero.) + * + * Visually we're performing the following + * (1,2,3,4 = memory types)... 
+ * + * Sample memory map (w/overlaps): + * ____22__________________ + * ______________________4_ + * ____1111________________ + * _44_____________________ + * 11111111________________ + * ____________________33__ + * ___________44___________ + * __________33333_________ + * ______________22________ + * ___________________2222_ + * _________111111111______ + * _____________________11_ + * _________________4______ + * + * Sanitized equivalent (no overlap): + * 1_______________________ + * _44_____________________ + * ___1____________________ + * ____22__________________ + * ______11________________ + * _________1______________ + * __________3_____________ + * ___________44___________ + * _____________33_________ + * _______________2________ + * ________________1_______ + * _________________4______ + * ___________________2____ + * ____________________33__ + * ______________________4_ + */ +struct change_member { + struct e820entry *pbios; /* pointer to original bios entry */ + unsigned long long addr; /* address for this change point */ +}; + +static int __init cpcompare(const void *a, const void *b) +{ + struct change_member * const *app = a, * const *bpp = b; + const struct change_member *ap = *app, *bp = *bpp; + + /* + * Inputs are pointers to two elements of change_point[]. If their + * addresses are unequal, their difference dominates. If the addresses + * are equal, then consider one that represents the end of its region + * to be greater than one that does not. + */ + if (ap->addr != bp->addr) + return ap->addr > bp->addr ? 1 : -1; + + return (ap->addr != ap->pbios->addr) - (bp->addr != bp->pbios->addr); +} + +int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, + u32 *pnr_map) +{ + static struct change_member change_point_list[2*E820_X_MAX] __initdata; + static struct change_member *change_point[2*E820_X_MAX] __initdata; + static struct e820entry *overlap_list[E820_X_MAX] __initdata; + static struct e820entry new_bios[E820_X_MAX] __initdata; + unsigned long current_type, last_type; + unsigned long long last_addr; + int chgidx; + int overlap_entries; + int new_bios_entry; + int old_nr, new_nr, chg_nr; + int i; + + /* if there's only one memory region, don't bother */ + if (*pnr_map < 2) + return -1; + + old_nr = *pnr_map; + BUG_ON(old_nr > max_nr_map); + + /* bail out if we find any unreasonable addresses in bios map */ + for (i = 0; i < old_nr; i++) + if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) + return -1; + + /* create pointers for initial change-point information (for sorting) */ + for (i = 0; i < 2 * old_nr; i++) + change_point[i] = &change_point_list[i]; + + /* record all known change-points (starting and ending addresses), + omitting those that are for empty memory regions */ + chgidx = 0; + for (i = 0; i < old_nr; i++) { + if (biosmap[i].size != 0) { + change_point[chgidx]->addr = biosmap[i].addr; + change_point[chgidx++]->pbios = &biosmap[i]; + change_point[chgidx]->addr = biosmap[i].addr + + biosmap[i].size; + change_point[chgidx++]->pbios = &biosmap[i]; + } + } + chg_nr = chgidx; + + /* sort change-point list by memory addresses (low -> high) */ + sort(change_point, chg_nr, sizeof *change_point, cpcompare, NULL); + + /* create a new bios memory map, removing overlaps */ + overlap_entries = 0; /* number of entries in the overlap table */ + new_bios_entry = 0; /* index for creating new bios map entries */ + last_type = 0; /* start with undefined memory type */ + last_addr = 0; /* start with 0 as last starting address */ + + /* loop through 
change-points, determining effect on the new bios map */
+	for (chgidx = 0; chgidx < chg_nr; chgidx++) {
+		/* keep track of all overlapping bios entries */
+		if (change_point[chgidx]->addr ==
+		    change_point[chgidx]->pbios->addr) {
+			/*
+			 * add map entry to overlap list (> 1 entry
+			 * implies an overlap)
+			 */
+			overlap_list[overlap_entries++] =
+				change_point[chgidx]->pbios;
+		} else {
+			/*
+			 * remove entry from list (order independent,
+			 * so swap with last)
+			 */
+			for (i = 0; i < overlap_entries; i++) {
+				if (overlap_list[i] ==
+				    change_point[chgidx]->pbios)
+					overlap_list[i] =
+						overlap_list[overlap_entries-1];
+			}
+			overlap_entries--;
+		}
+		/*
+		 * if there are overlapping entries, decide which
+		 * "type" to use (larger value takes precedence --
+		 * 1=usable, 2,3,4,4+=unusable)
+		 */
+		current_type = 0;
+		for (i = 0; i < overlap_entries; i++)
+			if (overlap_list[i]->type > current_type)
+				current_type = overlap_list[i]->type;
+		/*
+		 * continue building up new bios map based on this
+		 * information
+		 */
+		if (current_type != last_type) {
+			if (last_type != 0) {
+				new_bios[new_bios_entry].size =
+					change_point[chgidx]->addr - last_addr;
+				/*
+				 * move forward only if the new size
+				 * was non-zero
+				 */
+				if (new_bios[new_bios_entry].size != 0)
+					/*
+					 * no more space left for new
+					 * bios entries?
+					 */
+					if (++new_bios_entry >= max_nr_map)
+						break;
+			}
+			if (current_type != 0) {
+				new_bios[new_bios_entry].addr =
+					change_point[chgidx]->addr;
+				new_bios[new_bios_entry].type = current_type;
+				last_addr = change_point[chgidx]->addr;
+			}
+			last_type = current_type;
+		}
+	}
+	/* retain count for new bios entries */
+	new_nr = new_bios_entry;
+
+	/* copy new bios mapping into original location */
+	memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry));
+	*pnr_map = new_nr;
+
+	return 0;
+}
+
+static int __init __append_e820_map(struct e820entry *biosmap, int nr_map)
+{
+	while (nr_map) {
+		u64 start = biosmap->addr;
+		u64 size = biosmap->size;
+		u64 end = start + size;
+		u32 type = biosmap->type;
+
+		/* Overflow in 64 bits? Ignore the memory map. */
+		if (start > end)
+			return -1;
+
+		e820_add_region(start, size, type);
+
+		biosmap++;
+		nr_map--;
+	}
+	return 0;
+}
+
+/*
+ * Copy the BIOS e820 map into a safe place.
+ *
+ * Sanity-check it while we're at it..
+ *
+ * If we're lucky and live on a modern system, the setup code
+ * will have given us a memory map that we can use to properly
+ * set up memory. If we aren't, we'll fake a memory map.
+ */
+static int __init append_e820_map(struct e820entry *biosmap, int nr_map)
+{
+	/* Only one memory region (or negative)? Ignore it */
+	if (nr_map < 2)
+		return -1;
+
+	return __append_e820_map(biosmap, nr_map);
+}
+
+static u64 __init __e820_update_range(struct e820map *e820x, u64 start,
+				      u64 size, unsigned old_type,
+				      unsigned new_type)
+{
+	u64 end;
+	unsigned int i;
+	u64 real_updated_size = 0;
+
+	BUG_ON(old_type == new_type);
+
+	if (size > (ULLONG_MAX - start))
+		size = ULLONG_MAX - start;
+
+	end = start + size;
+	printk(KERN_DEBUG "e820: update [mem %#010Lx-%#010Lx] ",
+	       (unsigned long long) start, (unsigned long long) (end - 1));
+	e820_print_type(old_type);
+	printk(KERN_CONT " ==> ");
+	e820_print_type(new_type);
+	printk(KERN_CONT "\n");
+
+	for (i = 0; i < e820x->nr_map; i++) {
+		struct e820entry *ei = &e820x->map[i];
+		u64 final_start, final_end;
+		u64 ei_end;
+
+		if (ei->type != old_type)
+			continue;
+
+		ei_end = ei->addr + ei->size;
+		/* totally covered by new range?
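+		   (i.e. the existing entry lies fully inside [start, end),
+		   so it simply changes type; the branches below handle an
+		   entry that surrounds the new range, then partial
+		   head/tail overlap)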
*/ + if (ei->addr >= start && ei_end <= end) { + ei->type = new_type; + real_updated_size += ei->size; + continue; + } + + /* new range is totally covered? */ + if (ei->addr < start && ei_end > end) { + __e820_add_region(e820x, start, size, new_type); + __e820_add_region(e820x, end, ei_end - end, ei->type); + ei->size = start - ei->addr; + real_updated_size += size; + continue; + } + + /* partially covered */ + final_start = max(start, ei->addr); + final_end = min(end, ei_end); + if (final_start >= final_end) + continue; + + __e820_add_region(e820x, final_start, final_end - final_start, + new_type); + + real_updated_size += final_end - final_start; + + /* + * left range could be head or tail, so need to update + * size at first. + */ + ei->size -= final_end - final_start; + if (ei->addr < final_start) + continue; + ei->addr = final_end; + } + return real_updated_size; +} + +u64 __init e820_update_range(u64 start, u64 size, unsigned old_type, + unsigned new_type) +{ + return __e820_update_range(&e820, start, size, old_type, new_type); +} + +static u64 __init e820_update_range_saved(u64 start, u64 size, + unsigned old_type, unsigned new_type) +{ + return __e820_update_range(&e820_saved, start, size, old_type, + new_type); +} + +/* make e820 not cover the range */ +u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type, + int checktype) +{ + int i; + u64 end; + u64 real_removed_size = 0; + + if (size > (ULLONG_MAX - start)) + size = ULLONG_MAX - start; + + end = start + size; + printk(KERN_DEBUG "e820: remove [mem %#010Lx-%#010Lx] ", + (unsigned long long) start, (unsigned long long) (end - 1)); + if (checktype) + e820_print_type(old_type); + printk(KERN_CONT "\n"); + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + u64 final_start, final_end; + u64 ei_end; + + if (checktype && ei->type != old_type) + continue; + + ei_end = ei->addr + ei->size; + /* totally covered? */ + if (ei->addr >= start && ei_end <= end) { + real_removed_size += ei->size; + memset(ei, 0, sizeof(struct e820entry)); + continue; + } + + /* new range is totally covered? */ + if (ei->addr < start && ei_end > end) { + e820_add_region(end, ei_end - end, ei->type); + ei->size = start - ei->addr; + real_removed_size += size; + continue; + } + + /* partially covered */ + final_start = max(start, ei->addr); + final_end = min(end, ei_end); + if (final_start >= final_end) + continue; + real_removed_size += final_end - final_start; + + /* + * left range could be head or tail, so need to update + * size at first. + */ + ei->size -= final_end - final_start; + if (ei->addr < final_start) + continue; + ei->addr = final_end; + } + return real_removed_size; +} + +void __init update_e820(void) +{ + u32 nr_map; + + nr_map = e820.nr_map; + if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr_map)) + return; + e820.nr_map = nr_map; + printk(KERN_INFO "e820: modified physical RAM map:\n"); + e820_print_map("modified"); +} +static void __init update_e820_saved(void) +{ + u32 nr_map; + + nr_map = e820_saved.nr_map; + if (sanitize_e820_map(e820_saved.map, ARRAY_SIZE(e820_saved.map), &nr_map)) + return; + e820_saved.nr_map = nr_map; +} +#define MAX_GAP_END 0x100000000ull +/* + * Search for a gap in the e820 memory space from start_addr to end_addr. 
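+ *
+ * Returns 1 when a gap at least *gapsize bytes wide is found below
+ * end_addr (capped at 4GB); *gapstart and *gapsize are updated in
+ * place, with the initial *gapsize acting as the minimum acceptable
+ * size.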
+ */
+__init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize,
+		unsigned long start_addr, unsigned long long end_addr)
+{
+	unsigned long long last;
+	int i = e820.nr_map;
+	int found = 0;
+
+	last = (end_addr && end_addr < MAX_GAP_END) ? end_addr : MAX_GAP_END;
+
+	while (--i >= 0) {
+		unsigned long long start = e820.map[i].addr;
+		unsigned long long end = start + e820.map[i].size;
+
+		if (end < start_addr)
+			continue;
+
+		/*
+		 * Since "last" is at most 4GB, we know we'll
+		 * fit in 32 bits if this condition is true
+		 */
+		if (last > end) {
+			unsigned long gap = last - end;
+
+			if (gap >= *gapsize) {
+				*gapsize = gap;
+				*gapstart = end;
+				found = 1;
+			}
+		}
+		if (start < last)
+			last = start;
+	}
+	return found;
+}
+
+/*
+ * Search for the biggest gap in the low 32 bits of the e820
+ * memory space. We pass this space to PCI to assign MMIO resources
+ * for hotplug or unconfigured devices in.
+ * Hopefully the BIOS left enough space for us.
+ */
+__init void e820_setup_gap(void)
+{
+	unsigned long gapstart, gapsize;
+	int found;
+
+	gapstart = 0x10000000;
+	gapsize = 0x400000;
+	found = e820_search_gap(&gapstart, &gapsize, 0, MAX_GAP_END);
+
+#ifdef CONFIG_X86_64
+	if (!found) {
+		gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024;
+		printk(KERN_ERR
+	"e820: cannot find a gap in the 32bit address range\n"
+	"e820: PCI devices with unassigned 32bit BARs may break!\n");
+	}
+#endif
+
+	/*
+	 * e820_reserve_resources_late() already protects the stolen RAM
+	 */
+	pci_mem_start = gapstart;
+
+	printk(KERN_INFO
+	       "e820: [mem %#010lx-%#010lx] available for PCI devices\n",
+	       gapstart, gapstart + gapsize - 1);
+}
+
+/**
+ * Because of the size limitation of struct boot_params, only the first
+ * 128 E820 memory entries are passed to the kernel via
+ * boot_params.e820_map; the rest are passed via the SETUP_E820_EXT node
+ * of the setup_data linked list, which is parsed here.
+ */
+void __init parse_e820_ext(u64 phys_addr, u32 data_len)
+{
+	int entries;
+	struct e820entry *extmap;
+	struct setup_data *sdata;
+
+	sdata = early_memremap(phys_addr, data_len);
+	entries = sdata->len / sizeof(struct e820entry);
+	extmap = (struct e820entry *)(sdata->data);
+	__append_e820_map(extmap, entries);
+	sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+	early_iounmap(sdata, data_len);
+	printk(KERN_INFO "e820: extended physical RAM map:\n");
+	e820_print_map("extended");
+}
+
+#if defined(CONFIG_X86_64) || \
+	(defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION))
+/**
+ * Find the ranges of physical addresses that do not correspond to
+ * e820 RAM areas and mark the corresponding pages as nosave for
+ * hibernation (32 bit) or software suspend and suspend to RAM (64 bit).
+ *
+ * This function requires the e820 map to be sorted and without any
+ * overlapping entries and assumes the first e820 area to be RAM.
+ */
+void __init e820_mark_nosave_regions(unsigned long limit_pfn)
+{
+	int i;
+	unsigned long pfn;
+
+	pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size);
+	for (i = 1; i < e820.nr_map; i++) {
+		struct e820entry *ei = &e820.map[i];
+
+		if (pfn < PFN_UP(ei->addr))
+			register_nosave_region(pfn, PFN_UP(ei->addr));
+
+		pfn = PFN_DOWN(ei->addr + ei->size);
+		if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN)
+			register_nosave_region(PFN_UP(ei->addr), pfn);
+
+		if (pfn >= limit_pfn)
+			break;
+	}
+}
+#endif
+
+#ifdef CONFIG_ACPI
+/**
+ * Mark ACPI NVS memory region, so that we can save/restore it during
+ * hibernation and the subsequent resume.
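+ * Each E820_NVS entry is handed to acpi_nvs_register() below, which
+ * arranges for the region to be saved and restored across the
+ * hibernate/resume cycle.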
+ */ +static int __init e820_mark_nvs_memory(void) +{ + int i; + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + + if (ei->type == E820_NVS) + acpi_nvs_register(ei->addr, ei->size); + } + + return 0; +} +core_initcall(e820_mark_nvs_memory); +#endif + +/* + * pre allocated 4k and reserved it in memblock and e820_saved + */ +u64 __init early_reserve_e820(u64 size, u64 align) +{ + u64 addr; + + addr = __memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); + if (addr) { + e820_update_range_saved(addr, size, E820_RAM, E820_RESERVED); + printk(KERN_INFO "e820: update e820_saved for early_reserve_e820\n"); + update_e820_saved(); + } + + return addr; +} + +#ifdef CONFIG_X86_32 +# ifdef CONFIG_X86_PAE +# define MAX_ARCH_PFN (1ULL<<(36-PAGE_SHIFT)) +# else +# define MAX_ARCH_PFN (1ULL<<(32-PAGE_SHIFT)) +# endif +#else /* CONFIG_X86_32 */ +# define MAX_ARCH_PFN MAXMEM>>PAGE_SHIFT +#endif + +/* + * Find the highest page frame number we have available + */ +static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type) +{ + int i; + unsigned long last_pfn = 0; + unsigned long max_arch_pfn = MAX_ARCH_PFN; + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + unsigned long start_pfn; + unsigned long end_pfn; + + if (ei->type != type) + continue; + + start_pfn = ei->addr >> PAGE_SHIFT; + end_pfn = (ei->addr + ei->size) >> PAGE_SHIFT; + + if (start_pfn >= limit_pfn) + continue; + if (end_pfn > limit_pfn) { + last_pfn = limit_pfn; + break; + } + if (end_pfn > last_pfn) + last_pfn = end_pfn; + } + + if (last_pfn > max_arch_pfn) + last_pfn = max_arch_pfn; + + printk(KERN_INFO "e820: last_pfn = %#lx max_arch_pfn = %#lx\n", + last_pfn, max_arch_pfn); + return last_pfn; +} +unsigned long __init e820_end_of_ram_pfn(void) +{ + return e820_end_pfn(MAX_ARCH_PFN, E820_RAM); +} + +unsigned long __init e820_end_of_low_ram_pfn(void) +{ + return e820_end_pfn(1UL<<(32 - PAGE_SHIFT), E820_RAM); +} + +static void early_panic(char *msg) +{ + early_printk(msg); + panic(msg); +} + +static int userdef __initdata; + +/* "mem=nopentium" disables the 4MB page tables. */ +static int __init parse_memopt(char *p) +{ + u64 mem_size; + + if (!p) + return -EINVAL; + + if (!strcmp(p, "nopentium")) { +#ifdef CONFIG_X86_32 + setup_clear_cpu_cap(X86_FEATURE_PSE); + return 0; +#else + printk(KERN_WARNING "mem=nopentium ignored! (only supported on x86_32)\n"); + return -EINVAL; +#endif + } + + userdef = 1; + mem_size = memparse(p, &p); + /* don't remove all of memory when handling "mem={invalid}" param */ + if (mem_size == 0) + return -EINVAL; + e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1); + + return 0; +} +early_param("mem", parse_memopt); + +static int __init parse_memmap_one(char *p) +{ + char *oldp; + u64 start_at, mem_size; + + if (!p) + return -EINVAL; + + if (!strncmp(p, "exactmap", 8)) { +#ifdef CONFIG_CRASH_DUMP + /* + * If we are doing a crash dump, we still need to know + * the real mem size before original memory map is + * reset. 
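+	 * ("memmap=exactmap" discards the firmware-provided map
+	 * entirely; the user is expected to rebuild it with further
+	 * memmap= options, e.g., with illustrative values only:
+	 *   memmap=exactmap memmap=640K@0 memmap=1G@1M)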
+ */ + saved_max_pfn = e820_end_of_ram_pfn(); +#endif + e820.nr_map = 0; + userdef = 1; + return 0; + } + + oldp = p; + mem_size = memparse(p, &p); + if (p == oldp) + return -EINVAL; + + userdef = 1; + if (*p == '@') { + start_at = memparse(p+1, &p); + e820_add_region(start_at, mem_size, E820_RAM); + } else if (*p == '#') { + start_at = memparse(p+1, &p); + e820_add_region(start_at, mem_size, E820_ACPI); + } else if (*p == '$') { + start_at = memparse(p+1, &p); + e820_add_region(start_at, mem_size, E820_RESERVED); + } else + e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1); + + return *p == '\0' ? 0 : -EINVAL; +} +static int __init parse_memmap_opt(char *str) +{ + while (str) { + char *k = strchr(str, ','); + + if (k) + *k++ = 0; + + parse_memmap_one(str); + str = k; + } + + return 0; +} +early_param("memmap", parse_memmap_opt); + +void __init finish_e820_parsing(void) +{ + if (userdef) { + u32 nr = e820.nr_map; + + if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr) < 0) + early_panic("Invalid user supplied memory map"); + e820.nr_map = nr; + + printk(KERN_INFO "e820: user-defined physical RAM map:\n"); + e820_print_map("user"); + } +} + +static inline const char *e820_type_to_string(int e820_type) +{ + switch (e820_type) { + case E820_RESERVED_KERN: + case E820_RAM: return "System RAM"; + case E820_ACPI: return "ACPI Tables"; + case E820_NVS: return "ACPI Non-volatile Storage"; + case E820_UNUSABLE: return "Unusable memory"; + default: return "reserved"; + } +} + +/* + * Mark e820 reserved areas as busy for the resource manager. + */ +static struct resource __initdata *e820_res; +void __init e820_reserve_resources(void) +{ + int i; + struct resource *res; + u64 end; + + res = alloc_bootmem(sizeof(struct resource) * e820.nr_map); + e820_res = res; + for (i = 0; i < e820.nr_map; i++) { + end = e820.map[i].addr + e820.map[i].size - 1; + if (end != (resource_size_t)end) { + res++; + continue; + } + res->name = e820_type_to_string(e820.map[i].type); + res->start = e820.map[i].addr; + res->end = end; + + res->flags = IORESOURCE_MEM; + + /* + * don't register the region that could be conflicted with + * pci device BAR resource and insert them later in + * pcibios_resource_survey() + */ + if (e820.map[i].type != E820_RESERVED || res->start < (1ULL<<20)) { + res->flags |= IORESOURCE_BUSY; + insert_resource(&iomem_resource, res); + } + res++; + } + + for (i = 0; i < e820_saved.nr_map; i++) { + struct e820entry *entry = &e820_saved.map[i]; + firmware_map_add_early(entry->addr, + entry->addr + entry->size, + e820_type_to_string(entry->type)); + } +} + +/* How much should we pad RAM ending depending on where it is? 
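+   (64KiB within the first megabyte, 1MiB below 16MB, 64MB above that;
+   see the tiers below)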
*/ +static unsigned long ram_alignment(resource_size_t pos) +{ + unsigned long mb = pos >> 20; + + /* To 64kB in the first megabyte */ + if (!mb) + return 64*1024; + + /* To 1MB in the first 16MB */ + if (mb < 16) + return 1024*1024; + + /* To 64MB for anything above that */ + return 64*1024*1024; +} + +#define MAX_RESOURCE_SIZE ((resource_size_t)-1) + +void __init e820_reserve_resources_late(void) +{ + int i; + struct resource *res; + + res = e820_res; + for (i = 0; i < e820.nr_map; i++) { + if (!res->parent && res->end) + insert_resource_expand_to_fit(&iomem_resource, res); + res++; + } + + /* + * Try to bump up RAM regions to reasonable boundaries to + * avoid stolen RAM: + */ + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *entry = &e820.map[i]; + u64 start, end; + + if (entry->type != E820_RAM) + continue; + start = entry->addr + entry->size; + end = round_up(start, ram_alignment(start)) - 1; + if (end > MAX_RESOURCE_SIZE) + end = MAX_RESOURCE_SIZE; + if (start >= end) + continue; + printk(KERN_DEBUG + "e820: reserve RAM buffer [mem %#010llx-%#010llx]\n", + start, end); + reserve_region_with_split(&iomem_resource, start, end, + "RAM buffer"); + } +} + +char *__init default_machine_specific_memory_setup(void) +{ + char *who = "BIOS-e820"; + u32 new_nr; + /* + * Try to copy the BIOS-supplied E820-map. + * + * Otherwise fake a memory map; one section from 0k->640k, + * the next section from 1mb->appropriate_mem_k + */ + new_nr = boot_params.e820_entries; + sanitize_e820_map(boot_params.e820_map, + ARRAY_SIZE(boot_params.e820_map), + &new_nr); + boot_params.e820_entries = new_nr; + if (append_e820_map(boot_params.e820_map, boot_params.e820_entries) + < 0) { + u64 mem_size; + + /* compare results from other methods and take the greater */ + if (boot_params.alt_mem_k + < boot_params.screen_info.ext_mem_k) { + mem_size = boot_params.screen_info.ext_mem_k; + who = "BIOS-88"; + } else { + mem_size = boot_params.alt_mem_k; + who = "BIOS-e801"; + } + + e820.nr_map = 0; + e820_add_region(0, LOWMEMSIZE(), E820_RAM); + e820_add_region(HIGH_MEMORY, mem_size << 10, E820_RAM); + } + + /* In case someone cares... 
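+	   (the returned name tags each line that e820_print_map()
+	   prints, e.g. "BIOS-e820" vs. "BIOS-88" vs. "BIOS-e801")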
*/
+	return who;
+}
+
+void __init setup_memory_map(void)
+{
+	char *who;
+
+	who = x86_init.resources.memory_setup();
+	memcpy(&e820_saved, &e820, sizeof(struct e820map));
+	printk(KERN_INFO "e820: BIOS-provided physical RAM map:\n");
+	e820_print_map(who);
+}
+
+void __init memblock_x86_fill(void)
+{
+	int i;
+	u64 end;
+
+	/*
+	 * EFI may have more than 128 entries.
+	 * We are safe to enable resizing, because memblock_x86_fill()
+	 * runs rather late for x86.
+	 */
+	memblock_allow_resize();
+
+	for (i = 0; i < e820.nr_map; i++) {
+		struct e820entry *ei = &e820.map[i];
+
+		end = ei->addr + ei->size;
+		if (end != (resource_size_t)end)
+			continue;
+
+		if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN)
+			continue;
+
+		memblock_add(ei->addr, ei->size);
+	}
+
+	/* throw away partial pages */
+	memblock_trim_memory(PAGE_SIZE);
+
+	memblock_dump_all();
+}
+
+void __init memblock_find_dma_reserve(void)
+{
+#ifdef CONFIG_X86_64
+	u64 nr_pages = 0, nr_free_pages = 0;
+	unsigned long start_pfn, end_pfn;
+	phys_addr_t start, end;
+	int i;
+	u64 u;
+
+	/*
+	 * need to find out used area below MAX_DMA_PFN
+	 * need to use memblock to get free size in [0, MAX_DMA_PFN]
+	 * at first, and assume boot_mem will not take below MAX_DMA_PFN
+	 */
+	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
+		start_pfn = min_t(unsigned long, start_pfn, MAX_DMA_PFN);
+		end_pfn = min_t(unsigned long, end_pfn, MAX_DMA_PFN);
+		nr_pages += end_pfn - start_pfn;
+	}
+
+	for_each_free_mem_range(u, NUMA_NO_NODE, &start, &end, NULL) {
+		start_pfn = min_t(unsigned long, PFN_UP(start), MAX_DMA_PFN);
+		end_pfn = min_t(unsigned long, PFN_DOWN(end), MAX_DMA_PFN);
+		if (start_pfn < end_pfn)
+			nr_free_pages += end_pfn - start_pfn;
+	}
+
+	set_dma_reserve(nr_pages - nr_free_pages);
+#endif
+}
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
new file mode 100644
index 00000000000..2e1a6853e00
--- /dev/null
+++ b/arch/x86/kernel/early-quirks.c
@@ -0,0 +1,680 @@
+/* Various workarounds for chipset bugs.
+   This code runs very early and can't use the regular PCI subsystem.
+   The entries are keyed to PCI bridges which usually identify chipsets
+   uniquely.
+   This is only for whole classes of chipsets with specific problems which
+   need early invasive action (e.g. before the timers are initialized).
+   Most PCI device specific workarounds can be done later and should be
+   in standard PCI quirks.
+   Mainboard specific bugs should be handled by DMI entries.
+ CPU specific bugs in setup.c */ + +#include <linux/pci.h> +#include <linux/acpi.h> +#include <linux/pci_ids.h> +#include <drm/i915_drm.h> +#include <asm/pci-direct.h> +#include <asm/dma.h> +#include <asm/io_apic.h> +#include <asm/apic.h> +#include <asm/hpet.h> +#include <asm/iommu.h> +#include <asm/gart.h> +#include <asm/irq_remapping.h> + +static void __init fix_hypertransport_config(int num, int slot, int func) +{ + u32 htcfg; + /* + * we found a hypertransport bus + * make sure that we are broadcasting + * interrupts to all cpus on the ht bus + * if we're using extended apic ids + */ + htcfg = read_pci_config(num, slot, func, 0x68); + if (htcfg & (1 << 18)) { + printk(KERN_INFO "Detected use of extended apic ids " + "on hypertransport bus\n"); + if ((htcfg & (1 << 17)) == 0) { + printk(KERN_INFO "Enabling hypertransport extended " + "apic interrupt broadcast\n"); + printk(KERN_INFO "Note this is a bios bug, " + "please contact your hw vendor\n"); + htcfg |= (1 << 17); + write_pci_config(num, slot, func, 0x68, htcfg); + } + } + + +} + +static void __init via_bugs(int num, int slot, int func) +{ +#ifdef CONFIG_GART_IOMMU + if ((max_pfn > MAX_DMA32_PFN || force_iommu) && + !gart_iommu_aperture_allowed) { + printk(KERN_INFO + "Looks like a VIA chipset. Disabling IOMMU." + " Override with iommu=allowed\n"); + gart_iommu_aperture_disabled = 1; + } +#endif +} + +#ifdef CONFIG_ACPI +#ifdef CONFIG_X86_IO_APIC + +static int __init nvidia_hpet_check(struct acpi_table_header *header) +{ + return 0; +} +#endif /* CONFIG_X86_IO_APIC */ +#endif /* CONFIG_ACPI */ + +static void __init nvidia_bugs(int num, int slot, int func) +{ +#ifdef CONFIG_ACPI +#ifdef CONFIG_X86_IO_APIC + /* + * All timer overrides on Nvidia are + * wrong unless HPET is enabled. + * Unfortunately that's not true on many Asus boards. + * We don't know yet how to detect this automatically, but + * at least allow a command line override. + */ + if (acpi_use_timer_override) + return; + + if (acpi_table_parse(ACPI_SIG_HPET, nvidia_hpet_check)) { + acpi_skip_timer_override = 1; + printk(KERN_INFO "Nvidia board " + "detected. Ignoring ACPI " + "timer override.\n"); + printk(KERN_INFO "If you got timer trouble " + "try acpi_use_timer_override\n"); + } +#endif +#endif + /* RED-PEN skip them on mptables too? 
*/ + +} + +#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC) +static u32 __init ati_ixp4x0_rev(int num, int slot, int func) +{ + u32 d; + u8 b; + + b = read_pci_config_byte(num, slot, func, 0xac); + b &= ~(1<<5); + write_pci_config_byte(num, slot, func, 0xac, b); + + d = read_pci_config(num, slot, func, 0x70); + d |= 1<<8; + write_pci_config(num, slot, func, 0x70, d); + + d = read_pci_config(num, slot, func, 0x8); + d &= 0xff; + return d; +} + +static void __init ati_bugs(int num, int slot, int func) +{ + u32 d; + u8 b; + + if (acpi_use_timer_override) + return; + + d = ati_ixp4x0_rev(num, slot, func); + if (d < 0x82) + acpi_skip_timer_override = 1; + else { + /* check for IRQ0 interrupt swap */ + outb(0x72, 0xcd6); b = inb(0xcd7); + if (!(b & 0x2)) + acpi_skip_timer_override = 1; + } + + if (acpi_skip_timer_override) { + printk(KERN_INFO "SB4X0 revision 0x%x\n", d); + printk(KERN_INFO "Ignoring ACPI timer override.\n"); + printk(KERN_INFO "If you got timer trouble " + "try acpi_use_timer_override\n"); + } +} + +static u32 __init ati_sbx00_rev(int num, int slot, int func) +{ + u32 d; + + d = read_pci_config(num, slot, func, 0x8); + d &= 0xff; + + return d; +} + +static void __init ati_bugs_contd(int num, int slot, int func) +{ + u32 d, rev; + + rev = ati_sbx00_rev(num, slot, func); + if (rev >= 0x40) + acpi_fix_pin2_polarity = 1; + + /* + * SB600: revisions 0x11, 0x12, 0x13, 0x14, ... + * SB700: revisions 0x39, 0x3a, ... + * SB800: revisions 0x40, 0x41, ... + */ + if (rev >= 0x39) + return; + + if (acpi_use_timer_override) + return; + + /* check for IRQ0 interrupt swap */ + d = read_pci_config(num, slot, func, 0x64); + if (!(d & (1<<14))) + acpi_skip_timer_override = 1; + + if (acpi_skip_timer_override) { + printk(KERN_INFO "SB600 revision 0x%x\n", rev); + printk(KERN_INFO "Ignoring ACPI timer override.\n"); + printk(KERN_INFO "If you got timer trouble " + "try acpi_use_timer_override\n"); + } +} +#else +static void __init ati_bugs(int num, int slot, int func) +{ +} + +static void __init ati_bugs_contd(int num, int slot, int func) +{ +} +#endif + +static void __init intel_remapping_check(int num, int slot, int func) +{ + u8 revision; + u16 device; + + device = read_pci_config_16(num, slot, func, PCI_DEVICE_ID); + revision = read_pci_config_byte(num, slot, func, PCI_REVISION_ID); + + /* + * Revision <= 13 of all triggering devices id in this quirk + * have a problem draining interrupts when irq remapping is + * enabled, and should be flagged as broken. Additionally + * revision 0x22 of device id 0x3405 has this problem. + */ + if (revision <= 0x13) + set_irq_remapping_broken(); + else if (device == 0x3405 && revision == 0x22) + set_irq_remapping_broken(); +} + +/* + * Systems with Intel graphics controllers set aside memory exclusively + * for gfx driver use. This memory is not marked in the E820 as reserved + * or as RAM, and so is subject to overlap from E820 manipulation later + * in the boot process. On some systems, MMIO space is allocated on top, + * despite the efforts of the "RAM buffer" approach, which simply rounds + * memory boundaries up to 64M to try to catch space that may decode + * as RAM and so is not suitable for MMIO. + * + * And yes, so far on current devices the base addr is always under 4G. + */ +static u32 __init intel_stolen_base(int num, int slot, int func, size_t stolen_size) +{ + u32 base; + + /* + * For the PCI IDs in this quirk, the stolen base is always + * in 0x5c, aka the BDSM register (yes that's really what + * it's called). 
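+	 * Only bits 31:20 of BDSM hold the base; the low bits are
+	 * masked off below, so the returned address is 1MiB aligned.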
+ */ + base = read_pci_config(num, slot, func, 0x5c); + base &= ~((1<<20) - 1); + + return base; +} + +#define KB(x) ((x) * 1024UL) +#define MB(x) (KB (KB (x))) +#define GB(x) (MB (KB (x))) + +static size_t __init i830_tseg_size(void) +{ + u8 tmp = read_pci_config_byte(0, 0, 0, I830_ESMRAMC); + + if (!(tmp & TSEG_ENABLE)) + return 0; + + if (tmp & I830_TSEG_SIZE_1M) + return MB(1); + else + return KB(512); +} + +static size_t __init i845_tseg_size(void) +{ + u8 tmp = read_pci_config_byte(0, 0, 0, I845_ESMRAMC); + + if (!(tmp & TSEG_ENABLE)) + return 0; + + switch (tmp & I845_TSEG_SIZE_MASK) { + case I845_TSEG_SIZE_512K: + return KB(512); + case I845_TSEG_SIZE_1M: + return MB(1); + default: + WARN_ON(1); + return 0; + } +} + +static size_t __init i85x_tseg_size(void) +{ + u8 tmp = read_pci_config_byte(0, 0, 0, I85X_ESMRAMC); + + if (!(tmp & TSEG_ENABLE)) + return 0; + + return MB(1); +} + +static size_t __init i830_mem_size(void) +{ + return read_pci_config_byte(0, 0, 0, I830_DRB3) * MB(32); +} + +static size_t __init i85x_mem_size(void) +{ + return read_pci_config_byte(0, 0, 1, I85X_DRB3) * MB(32); +} + +/* + * On 830/845/85x the stolen memory base isn't available in any + * register. We need to calculate it as TOM-TSEG_SIZE-stolen_size. + */ +static u32 __init i830_stolen_base(int num, int slot, int func, size_t stolen_size) +{ + return i830_mem_size() - i830_tseg_size() - stolen_size; +} + +static u32 __init i845_stolen_base(int num, int slot, int func, size_t stolen_size) +{ + return i830_mem_size() - i845_tseg_size() - stolen_size; +} + +static u32 __init i85x_stolen_base(int num, int slot, int func, size_t stolen_size) +{ + return i85x_mem_size() - i85x_tseg_size() - stolen_size; +} + +static u32 __init i865_stolen_base(int num, int slot, int func, size_t stolen_size) +{ + /* + * FIXME is the graphics stolen memory region + * always at TOUD? Ie. is it always the last + * one to be allocated by the BIOS? 
+ */ + return read_pci_config_16(0, 0, 0, I865_TOUD) << 16; +} + +static size_t __init i830_stolen_size(int num, int slot, int func) +{ + size_t stolen_size; + u16 gmch_ctrl; + + gmch_ctrl = read_pci_config_16(0, 0, 0, I830_GMCH_CTRL); + + switch (gmch_ctrl & I830_GMCH_GMS_MASK) { + case I830_GMCH_GMS_STOLEN_512: + stolen_size = KB(512); + break; + case I830_GMCH_GMS_STOLEN_1024: + stolen_size = MB(1); + break; + case I830_GMCH_GMS_STOLEN_8192: + stolen_size = MB(8); + break; + case I830_GMCH_GMS_LOCAL: + /* local memory isn't part of the normal address space */ + stolen_size = 0; + break; + default: + return 0; + } + + return stolen_size; +} + +static size_t __init gen3_stolen_size(int num, int slot, int func) +{ + size_t stolen_size; + u16 gmch_ctrl; + + gmch_ctrl = read_pci_config_16(0, 0, 0, I830_GMCH_CTRL); + + switch (gmch_ctrl & I855_GMCH_GMS_MASK) { + case I855_GMCH_GMS_STOLEN_1M: + stolen_size = MB(1); + break; + case I855_GMCH_GMS_STOLEN_4M: + stolen_size = MB(4); + break; + case I855_GMCH_GMS_STOLEN_8M: + stolen_size = MB(8); + break; + case I855_GMCH_GMS_STOLEN_16M: + stolen_size = MB(16); + break; + case I855_GMCH_GMS_STOLEN_32M: + stolen_size = MB(32); + break; + case I915_GMCH_GMS_STOLEN_48M: + stolen_size = MB(48); + break; + case I915_GMCH_GMS_STOLEN_64M: + stolen_size = MB(64); + break; + case G33_GMCH_GMS_STOLEN_128M: + stolen_size = MB(128); + break; + case G33_GMCH_GMS_STOLEN_256M: + stolen_size = MB(256); + break; + case INTEL_GMCH_GMS_STOLEN_96M: + stolen_size = MB(96); + break; + case INTEL_GMCH_GMS_STOLEN_160M: + stolen_size = MB(160); + break; + case INTEL_GMCH_GMS_STOLEN_224M: + stolen_size = MB(224); + break; + case INTEL_GMCH_GMS_STOLEN_352M: + stolen_size = MB(352); + break; + default: + stolen_size = 0; + break; + } + + return stolen_size; +} + +static size_t __init gen6_stolen_size(int num, int slot, int func) +{ + u16 gmch_ctrl; + + gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL); + gmch_ctrl >>= SNB_GMCH_GMS_SHIFT; + gmch_ctrl &= SNB_GMCH_GMS_MASK; + + return gmch_ctrl << 25; /* 32 MB units */ +} + +static size_t __init gen8_stolen_size(int num, int slot, int func) +{ + u16 gmch_ctrl; + + gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL); + gmch_ctrl >>= BDW_GMCH_GMS_SHIFT; + gmch_ctrl &= BDW_GMCH_GMS_MASK; + return gmch_ctrl << 25; /* 32 MB units */ +} + +static size_t __init chv_stolen_size(int num, int slot, int func) +{ + u16 gmch_ctrl; + + gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL); + gmch_ctrl >>= SNB_GMCH_GMS_SHIFT; + gmch_ctrl &= SNB_GMCH_GMS_MASK; + + /* + * 0x0 to 0x10: 32MB increments starting at 0MB + * 0x11 to 0x16: 4MB increments starting at 8MB + * 0x17 to 0x1d: 4MB increments start at 36MB + */ + if (gmch_ctrl < 0x11) + return gmch_ctrl << 25; + else if (gmch_ctrl < 0x17) + return (gmch_ctrl - 0x11 + 2) << 22; + else + return (gmch_ctrl - 0x17 + 9) << 22; +} + +struct intel_stolen_funcs { + size_t (*size)(int num, int slot, int func); + u32 (*base)(int num, int slot, int func, size_t size); +}; + +static const struct intel_stolen_funcs i830_stolen_funcs __initconst = { + .base = i830_stolen_base, + .size = i830_stolen_size, +}; + +static const struct intel_stolen_funcs i845_stolen_funcs __initconst = { + .base = i845_stolen_base, + .size = i830_stolen_size, +}; + +static const struct intel_stolen_funcs i85x_stolen_funcs __initconst = { + .base = i85x_stolen_base, + .size = gen3_stolen_size, +}; + +static const struct intel_stolen_funcs i865_stolen_funcs __initconst = { + .base = 
i865_stolen_base, + .size = gen3_stolen_size, +}; + +static const struct intel_stolen_funcs gen3_stolen_funcs __initconst = { + .base = intel_stolen_base, + .size = gen3_stolen_size, +}; + +static const struct intel_stolen_funcs gen6_stolen_funcs __initconst = { + .base = intel_stolen_base, + .size = gen6_stolen_size, +}; + +static const struct intel_stolen_funcs gen8_stolen_funcs __initconst = { + .base = intel_stolen_base, + .size = gen8_stolen_size, +}; + +static const struct intel_stolen_funcs chv_stolen_funcs __initconst = { + .base = intel_stolen_base, + .size = chv_stolen_size, +}; + +static const struct pci_device_id intel_stolen_ids[] __initconst = { + INTEL_I830_IDS(&i830_stolen_funcs), + INTEL_I845G_IDS(&i845_stolen_funcs), + INTEL_I85X_IDS(&i85x_stolen_funcs), + INTEL_I865G_IDS(&i865_stolen_funcs), + INTEL_I915G_IDS(&gen3_stolen_funcs), + INTEL_I915GM_IDS(&gen3_stolen_funcs), + INTEL_I945G_IDS(&gen3_stolen_funcs), + INTEL_I945GM_IDS(&gen3_stolen_funcs), + INTEL_VLV_M_IDS(&gen6_stolen_funcs), + INTEL_VLV_D_IDS(&gen6_stolen_funcs), + INTEL_PINEVIEW_IDS(&gen3_stolen_funcs), + INTEL_I965G_IDS(&gen3_stolen_funcs), + INTEL_G33_IDS(&gen3_stolen_funcs), + INTEL_I965GM_IDS(&gen3_stolen_funcs), + INTEL_GM45_IDS(&gen3_stolen_funcs), + INTEL_G45_IDS(&gen3_stolen_funcs), + INTEL_IRONLAKE_D_IDS(&gen3_stolen_funcs), + INTEL_IRONLAKE_M_IDS(&gen3_stolen_funcs), + INTEL_SNB_D_IDS(&gen6_stolen_funcs), + INTEL_SNB_M_IDS(&gen6_stolen_funcs), + INTEL_IVB_M_IDS(&gen6_stolen_funcs), + INTEL_IVB_D_IDS(&gen6_stolen_funcs), + INTEL_HSW_D_IDS(&gen6_stolen_funcs), + INTEL_HSW_M_IDS(&gen6_stolen_funcs), + INTEL_BDW_M_IDS(&gen8_stolen_funcs), + INTEL_BDW_D_IDS(&gen8_stolen_funcs), + INTEL_CHV_IDS(&chv_stolen_funcs), +}; + +static void __init intel_graphics_stolen(int num, int slot, int func) +{ + size_t size; + int i; + u32 start; + u16 device, subvendor, subdevice; + + device = read_pci_config_16(num, slot, func, PCI_DEVICE_ID); + subvendor = read_pci_config_16(num, slot, func, + PCI_SUBSYSTEM_VENDOR_ID); + subdevice = read_pci_config_16(num, slot, func, PCI_SUBSYSTEM_ID); + + for (i = 0; i < ARRAY_SIZE(intel_stolen_ids); i++) { + if (intel_stolen_ids[i].device == device) { + const struct intel_stolen_funcs *stolen_funcs = + (const struct intel_stolen_funcs *)intel_stolen_ids[i].driver_data; + size = stolen_funcs->size(num, slot, func); + start = stolen_funcs->base(num, slot, func, size); + if (size && start) { + printk(KERN_INFO "Reserving Intel graphics stolen memory at 0x%x-0x%x\n", + start, start + (u32)size - 1); + /* Mark this space as reserved */ + e820_add_region(start, size, E820_RESERVED); + sanitize_e820_map(e820.map, + ARRAY_SIZE(e820.map), + &e820.nr_map); + } + return; + } + } +} + +static void __init force_disable_hpet(int num, int slot, int func) +{ +#ifdef CONFIG_HPET_TIMER + boot_hpet_disable = 1; + pr_info("x86/hpet: Will disable the HPET for this platform because it's not reliable\n"); +#endif +} + + +#define QFLAG_APPLY_ONCE 0x1 +#define QFLAG_APPLIED 0x2 +#define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED) +struct chipset { + u32 vendor; + u32 device; + u32 class; + u32 class_mask; + u32 flags; + void (*f)(int num, int slot, int func); +}; + +/* + * Only works for devices on the root bus. If you add any devices + * not on bus 0 readd another loop level in early_quirks(). But + * be careful because at least the Nvidia quirk here relies on + * only matching on bus 0. 
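+ *
+ * An editor's sketch of how a new quirk plugs in (the vendor/device
+ * IDs and handler name are made up), kept under #if 0 so it is never
+ * compiled:
+ */
+#if 0
+static void __init foo_bugs(int num, int slot, int func)
+{
+	/* inspect/adjust config space with read_pci_config() etc. */
+}
+
+/* ...and the matching early_qrk[] entry:
+ *	{ 0x1234, 0x5678, PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID,
+ *	  QFLAG_APPLY_ONCE, foo_bugs },
+ */
+#endif
+/* The quirk table itself: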
+ */ +static struct chipset early_qrk[] __initdata = { + { PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID, + PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, nvidia_bugs }, + { PCI_VENDOR_ID_VIA, PCI_ANY_ID, + PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, via_bugs }, + { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB, + PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, fix_hypertransport_config }, + { PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS, + PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs }, + { PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS, + PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs_contd }, + { PCI_VENDOR_ID_INTEL, 0x3403, PCI_CLASS_BRIDGE_HOST, + PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check }, + { PCI_VENDOR_ID_INTEL, 0x3405, PCI_CLASS_BRIDGE_HOST, + PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check }, + { PCI_VENDOR_ID_INTEL, 0x3406, PCI_CLASS_BRIDGE_HOST, + PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check }, + { PCI_VENDOR_ID_INTEL, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA, PCI_ANY_ID, + QFLAG_APPLY_ONCE, intel_graphics_stolen }, + /* + * HPET on current version of Baytrail platform has accuracy + * problems, disable it for now: + */ + { PCI_VENDOR_ID_INTEL, 0x0f00, + PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, + {} +}; + +/** + * check_dev_quirk - apply early quirks to a given PCI device + * @num: bus number + * @slot: slot number + * @func: PCI function + * + * Check the vendor & device ID against the early quirks table. + * + * If the device is single function, let early_quirks() know so we don't + * poke at this device again. + */ +static int __init check_dev_quirk(int num, int slot, int func) +{ + u16 class; + u16 vendor; + u16 device; + u8 type; + int i; + + class = read_pci_config_16(num, slot, func, PCI_CLASS_DEVICE); + + if (class == 0xffff) + return -1; /* no class, treat as single function */ + + vendor = read_pci_config_16(num, slot, func, PCI_VENDOR_ID); + + device = read_pci_config_16(num, slot, func, PCI_DEVICE_ID); + + for (i = 0; early_qrk[i].f != NULL; i++) { + if (((early_qrk[i].vendor == PCI_ANY_ID) || + (early_qrk[i].vendor == vendor)) && + ((early_qrk[i].device == PCI_ANY_ID) || + (early_qrk[i].device == device)) && + (!((early_qrk[i].class ^ class) & + early_qrk[i].class_mask))) { + if ((early_qrk[i].flags & + QFLAG_DONE) != QFLAG_DONE) + early_qrk[i].f(num, slot, func); + early_qrk[i].flags |= QFLAG_APPLIED; + } + } + + type = read_pci_config_byte(num, slot, func, + PCI_HEADER_TYPE); + if (!(type & 0x80)) + return -1; + + return 0; +} + +void __init early_quirks(void) +{ + int slot, func; + + if (!early_pci_allowed()) + return; + + /* Poor man's PCI discovery */ + /* Only scan the root bus */ + for (slot = 0; slot < 32; slot++) + for (func = 0; func < 8; func++) { + /* Only probe function 0 on single fn devices */ + if (check_dev_quirk(0, slot, func)) + break; + } +} diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c new file mode 100644 index 00000000000..01d1c187c9f --- /dev/null +++ b/arch/x86/kernel/early_printk.c @@ -0,0 +1,249 @@ +#include <linux/console.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/string.h> +#include <linux/screen_info.h> +#include <linux/usb/ch9.h> +#include <linux/pci_regs.h> +#include <linux/pci_ids.h> +#include <linux/errno.h> +#include <asm/io.h> +#include <asm/processor.h> +#include <asm/fcntl.h> +#include <asm/setup.h> +#include <xen/hvc-console.h> +#include <asm/pci-direct.h> +#include <asm/fixmap.h> +#include <asm/intel-mid.h> +#include <asm/pgtable.h> +#include 
<linux/usb/ehci_def.h> +#include <linux/efi.h> +#include <asm/efi.h> + +/* Simple VGA output */ +#define VGABASE (__ISA_IO_base + 0xb8000) + +static int max_ypos = 25, max_xpos = 80; +static int current_ypos = 25, current_xpos; + +static void early_vga_write(struct console *con, const char *str, unsigned n) +{ + char c; + int i, k, j; + + while ((c = *str++) != '\0' && n-- > 0) { + if (current_ypos >= max_ypos) { + /* scroll 1 line up */ + for (k = 1, j = 0; k < max_ypos; k++, j++) { + for (i = 0; i < max_xpos; i++) { + writew(readw(VGABASE+2*(max_xpos*k+i)), + VGABASE + 2*(max_xpos*j + i)); + } + } + for (i = 0; i < max_xpos; i++) + writew(0x720, VGABASE + 2*(max_xpos*j + i)); + current_ypos = max_ypos-1; + } +#ifdef CONFIG_KGDB_KDB + if (c == '\b') { + if (current_xpos > 0) + current_xpos--; + } else if (c == '\r') { + current_xpos = 0; + } else +#endif + if (c == '\n') { + current_xpos = 0; + current_ypos++; + } else if (c != '\r') { + writew(((0x7 << 8) | (unsigned short) c), + VGABASE + 2*(max_xpos*current_ypos + + current_xpos++)); + if (current_xpos >= max_xpos) { + current_xpos = 0; + current_ypos++; + } + } + } +} + +static struct console early_vga_console = { + .name = "earlyvga", + .write = early_vga_write, + .flags = CON_PRINTBUFFER, + .index = -1, +}; + +/* Serial functions loosely based on a similar package from Klaus P. Gerlicher */ + +static int early_serial_base = 0x3f8; /* ttyS0 */ + +#define XMTRDY 0x20 + +#define DLAB 0x80 + +#define TXR 0 /* Transmit register (WRITE) */ +#define RXR 0 /* Receive register (READ) */ +#define IER 1 /* Interrupt Enable */ +#define IIR 2 /* Interrupt ID */ +#define FCR 2 /* FIFO control */ +#define LCR 3 /* Line control */ +#define MCR 4 /* Modem control */ +#define LSR 5 /* Line Status */ +#define MSR 6 /* Modem Status */ +#define DLL 0 /* Divisor Latch Low */ +#define DLH 1 /* Divisor latch High */ + +static int early_serial_putc(unsigned char ch) +{ + unsigned timeout = 0xffff; + + while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout) + cpu_relax(); + outb(ch, early_serial_base + TXR); + return timeout ? 
0 : -1; +} + +static void early_serial_write(struct console *con, const char *s, unsigned n) +{ + while (*s && n-- > 0) { + if (*s == '\n') + early_serial_putc('\r'); + early_serial_putc(*s); + s++; + } +} + +#define DEFAULT_BAUD 9600 + +static __init void early_serial_init(char *s) +{ + unsigned char c; + unsigned divisor; + unsigned baud = DEFAULT_BAUD; + char *e; + + if (*s == ',') + ++s; + + if (*s) { + unsigned port; + if (!strncmp(s, "0x", 2)) { + early_serial_base = simple_strtoul(s, &e, 16); + } else { + static const int __initconst bases[] = { 0x3f8, 0x2f8 }; + + if (!strncmp(s, "ttyS", 4)) + s += 4; + port = simple_strtoul(s, &e, 10); + if (port > 1 || s == e) + port = 0; + early_serial_base = bases[port]; + } + s += strcspn(s, ","); + if (*s == ',') + s++; + } + + outb(0x3, early_serial_base + LCR); /* 8n1 */ + outb(0, early_serial_base + IER); /* no interrupt */ + outb(0, early_serial_base + FCR); /* no fifo */ + outb(0x3, early_serial_base + MCR); /* DTR + RTS */ + + if (*s) { + baud = simple_strtoul(s, &e, 0); + if (baud == 0 || s == e) + baud = DEFAULT_BAUD; + } + + divisor = 115200 / baud; + c = inb(early_serial_base + LCR); + outb(c | DLAB, early_serial_base + LCR); + outb(divisor & 0xff, early_serial_base + DLL); + outb((divisor >> 8) & 0xff, early_serial_base + DLH); + outb(c & ~DLAB, early_serial_base + LCR); +} + +static struct console early_serial_console = { + .name = "earlyser", + .write = early_serial_write, + .flags = CON_PRINTBUFFER, + .index = -1, +}; + +static inline void early_console_register(struct console *con, int keep_early) +{ + if (con->index != -1) { + printk(KERN_CRIT "ERROR: earlyprintk= %s already used\n", + con->name); + return; + } + early_console = con; + if (keep_early) + early_console->flags &= ~CON_BOOT; + else + early_console->flags |= CON_BOOT; + register_console(early_console); +} + +static int __init setup_early_printk(char *buf) +{ + int keep; + + if (!buf) + return 0; + + if (early_console) + return 0; + + keep = (strstr(buf, "keep") != NULL); + + while (*buf != '\0') { + if (!strncmp(buf, "serial", 6)) { + buf += 6; + early_serial_init(buf); + early_console_register(&early_serial_console, keep); + if (!strncmp(buf, ",ttyS", 5)) + buf += 5; + } + if (!strncmp(buf, "ttyS", 4)) { + early_serial_init(buf + 4); + early_console_register(&early_serial_console, keep); + } + if (!strncmp(buf, "vga", 3) && + boot_params.screen_info.orig_video_isVGA == 1) { + max_xpos = boot_params.screen_info.orig_video_cols; + max_ypos = boot_params.screen_info.orig_video_lines; + current_ypos = boot_params.screen_info.orig_y; + early_console_register(&early_vga_console, keep); + } +#ifdef CONFIG_EARLY_PRINTK_DBGP + if (!strncmp(buf, "dbgp", 4) && !early_dbgp_init(buf + 4)) + early_console_register(&early_dbgp_console, keep); +#endif +#ifdef CONFIG_HVC_XEN + if (!strncmp(buf, "xen", 3)) + early_console_register(&xenboot_console, keep); +#endif +#ifdef CONFIG_EARLY_PRINTK_INTEL_MID + if (!strncmp(buf, "mrst", 4)) { + mrst_early_console_init(); + early_console_register(&early_mrst_console, keep); + } + + if (!strncmp(buf, "hsu", 3)) { + hsu_early_console_init(buf + 3); + early_console_register(&early_hsu_console, keep); + } +#endif +#ifdef CONFIG_EARLY_PRINTK_EFI + if (!strncmp(buf, "efi", 3)) + early_console_register(&early_efi_console, keep); +#endif + + buf++; + } + return 0; +} + +early_param("earlyprintk", setup_early_printk); diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S new file mode 100644 index 00000000000..0d0c9d4ab6d --- /dev/null 
+++ b/arch/x86/kernel/entry_32.S @@ -0,0 +1,1440 @@ +/* + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +/* + * entry.S contains the system-call and fault low-level handling routines. + * This also contains the timer-interrupt handler, as well as all interrupts + * and faults that can result in a task-switch. + * + * NOTE: This code handles signal-recognition, which happens every time + * after a timer-interrupt and after each system call. + * + * I changed all the .align's to 4 (16 byte alignment), as that's faster + * on a 486. + * + * Stack layout in 'syscall_exit': + * ptrace needs to have all regs on the stack. + * if the order here is changed, it needs to be + * updated in fork.c:copy_process, signal.c:do_signal, + * ptrace.c and ptrace.h + * + * 0(%esp) - %ebx + * 4(%esp) - %ecx + * 8(%esp) - %edx + * C(%esp) - %esi + * 10(%esp) - %edi + * 14(%esp) - %ebp + * 18(%esp) - %eax + * 1C(%esp) - %ds + * 20(%esp) - %es + * 24(%esp) - %fs + * 28(%esp) - %gs saved iff !CONFIG_X86_32_LAZY_GS + * 2C(%esp) - orig_eax + * 30(%esp) - %eip + * 34(%esp) - %cs + * 38(%esp) - %eflags + * 3C(%esp) - %oldesp + * 40(%esp) - %oldss + * + * "current" is in register %ebx during any slow entries. + */ + +#include <linux/linkage.h> +#include <linux/err.h> +#include <asm/thread_info.h> +#include <asm/irqflags.h> +#include <asm/errno.h> +#include <asm/segment.h> +#include <asm/smp.h> +#include <asm/page_types.h> +#include <asm/percpu.h> +#include <asm/dwarf2.h> +#include <asm/processor-flags.h> +#include <asm/ftrace.h> +#include <asm/irq_vectors.h> +#include <asm/cpufeature.h> +#include <asm/alternative-asm.h> +#include <asm/asm.h> +#include <asm/smap.h> + +/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ +#include <linux/elf-em.h> +#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE) +#define __AUDIT_ARCH_LE 0x40000000 + +#ifndef CONFIG_AUDITSYSCALL +#define sysenter_audit syscall_trace_entry +#define sysexit_audit syscall_exit_work +#endif + + .section .entry.text, "ax" + +/* + * We use macros for low-level operations which need to be overridden + * for paravirtualization. The following will never clobber any registers: + * INTERRUPT_RETURN (aka. "iret") + * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax") + * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit"). + * + * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must + * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY). + * Allowing a register to be clobbered can shrink the paravirt replacement + * enough to patch inline, increasing performance. + */ + +#ifdef CONFIG_PREEMPT +#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF +#else +#define preempt_stop(clobbers) +#define resume_kernel restore_all +#endif + +.macro TRACE_IRQS_IRET +#ifdef CONFIG_TRACE_IRQFLAGS + testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off? + jz 1f + TRACE_IRQS_ON +1: +#endif +.endm + +/* + * User gs save/restore + * + * %gs is used for userland TLS and kernel only uses it for stack + * canary which is required to be at %gs:20 by gcc. Read the comment + * at the top of stackprotector.h for more info. + * + * Local labels 98 and 99 are used. 
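+ *
+ * With CONFIG_X86_32_LAZY_GS the save slot is only a placeholder (a
+ * zero is pushed, since push/pop cannot be omitted entirely), and %gs
+ * itself is left untouched on kernel entry.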
+ */ +#ifdef CONFIG_X86_32_LAZY_GS + + /* unfortunately push/pop can't be no-op */ +.macro PUSH_GS + pushl_cfi $0 +.endm +.macro POP_GS pop=0 + addl $(4 + \pop), %esp + CFI_ADJUST_CFA_OFFSET -(4 + \pop) +.endm +.macro POP_GS_EX +.endm + + /* all the rest are no-op */ +.macro PTGS_TO_GS +.endm +.macro PTGS_TO_GS_EX +.endm +.macro GS_TO_REG reg +.endm +.macro REG_TO_PTGS reg +.endm +.macro SET_KERNEL_GS reg +.endm + +#else /* CONFIG_X86_32_LAZY_GS */ + +.macro PUSH_GS + pushl_cfi %gs + /*CFI_REL_OFFSET gs, 0*/ +.endm + +.macro POP_GS pop=0 +98: popl_cfi %gs + /*CFI_RESTORE gs*/ + .if \pop <> 0 + add $\pop, %esp + CFI_ADJUST_CFA_OFFSET -\pop + .endif +.endm +.macro POP_GS_EX +.pushsection .fixup, "ax" +99: movl $0, (%esp) + jmp 98b +.popsection + _ASM_EXTABLE(98b,99b) +.endm + +.macro PTGS_TO_GS +98: mov PT_GS(%esp), %gs +.endm +.macro PTGS_TO_GS_EX +.pushsection .fixup, "ax" +99: movl $0, PT_GS(%esp) + jmp 98b +.popsection + _ASM_EXTABLE(98b,99b) +.endm + +.macro GS_TO_REG reg + movl %gs, \reg + /*CFI_REGISTER gs, \reg*/ +.endm +.macro REG_TO_PTGS reg + movl \reg, PT_GS(%esp) + /*CFI_REL_OFFSET gs, PT_GS*/ +.endm +.macro SET_KERNEL_GS reg + movl $(__KERNEL_STACK_CANARY), \reg + movl \reg, %gs +.endm + +#endif /* CONFIG_X86_32_LAZY_GS */ + +.macro SAVE_ALL + cld + PUSH_GS + pushl_cfi %fs + /*CFI_REL_OFFSET fs, 0;*/ + pushl_cfi %es + /*CFI_REL_OFFSET es, 0;*/ + pushl_cfi %ds + /*CFI_REL_OFFSET ds, 0;*/ + pushl_cfi %eax + CFI_REL_OFFSET eax, 0 + pushl_cfi %ebp + CFI_REL_OFFSET ebp, 0 + pushl_cfi %edi + CFI_REL_OFFSET edi, 0 + pushl_cfi %esi + CFI_REL_OFFSET esi, 0 + pushl_cfi %edx + CFI_REL_OFFSET edx, 0 + pushl_cfi %ecx + CFI_REL_OFFSET ecx, 0 + pushl_cfi %ebx + CFI_REL_OFFSET ebx, 0 + movl $(__USER_DS), %edx + movl %edx, %ds + movl %edx, %es + movl $(__KERNEL_PERCPU), %edx + movl %edx, %fs + SET_KERNEL_GS %edx +.endm + +.macro RESTORE_INT_REGS + popl_cfi %ebx + CFI_RESTORE ebx + popl_cfi %ecx + CFI_RESTORE ecx + popl_cfi %edx + CFI_RESTORE edx + popl_cfi %esi + CFI_RESTORE esi + popl_cfi %edi + CFI_RESTORE edi + popl_cfi %ebp + CFI_RESTORE ebp + popl_cfi %eax + CFI_RESTORE eax +.endm + +.macro RESTORE_REGS pop=0 + RESTORE_INT_REGS +1: popl_cfi %ds + /*CFI_RESTORE ds;*/ +2: popl_cfi %es + /*CFI_RESTORE es;*/ +3: popl_cfi %fs + /*CFI_RESTORE fs;*/ + POP_GS \pop +.pushsection .fixup, "ax" +4: movl $0, (%esp) + jmp 1b +5: movl $0, (%esp) + jmp 2b +6: movl $0, (%esp) + jmp 3b +.popsection + _ASM_EXTABLE(1b,4b) + _ASM_EXTABLE(2b,5b) + _ASM_EXTABLE(3b,6b) + POP_GS_EX +.endm + +.macro RING0_INT_FRAME + CFI_STARTPROC simple + CFI_SIGNAL_FRAME + CFI_DEF_CFA esp, 3*4 + /*CFI_OFFSET cs, -2*4;*/ + CFI_OFFSET eip, -3*4 +.endm + +.macro RING0_EC_FRAME + CFI_STARTPROC simple + CFI_SIGNAL_FRAME + CFI_DEF_CFA esp, 4*4 + /*CFI_OFFSET cs, -2*4;*/ + CFI_OFFSET eip, -3*4 +.endm + +.macro RING0_PTREGS_FRAME + CFI_STARTPROC simple + CFI_SIGNAL_FRAME + CFI_DEF_CFA esp, PT_OLDESP-PT_EBX + /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/ + CFI_OFFSET eip, PT_EIP-PT_OLDESP + /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/ + /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/ + CFI_OFFSET eax, PT_EAX-PT_OLDESP + CFI_OFFSET ebp, PT_EBP-PT_OLDESP + CFI_OFFSET edi, PT_EDI-PT_OLDESP + CFI_OFFSET esi, PT_ESI-PT_OLDESP + CFI_OFFSET edx, PT_EDX-PT_OLDESP + CFI_OFFSET ecx, PT_ECX-PT_OLDESP + CFI_OFFSET ebx, PT_EBX-PT_OLDESP +.endm + +ENTRY(ret_from_fork) + CFI_STARTPROC + pushl_cfi %eax + call schedule_tail + GET_THREAD_INFO(%ebp) + popl_cfi %eax + pushl_cfi $0x0202 # Reset kernel eflags + popfl_cfi + jmp syscall_exit + CFI_ENDPROC +END(ret_from_fork) + 
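+/*
+ * Reading aid: the PT_* offsets used throughout this file name slots in
+ * the frame that SAVE_ALL lays down, matching the stack layout table in
+ * the header comment above.  As a C sketch (illustrative only -- the
+ * struct name and types here are assumptions, not the kernel's actual
+ * struct pt_regs definition):
+ *
+ *	struct pt_regs_sketch {
+ *		unsigned long ebx;	// 0x00, pushed last by SAVE_ALL
+ *		unsigned long ecx;	// 0x04
+ *		unsigned long edx;	// 0x08
+ *		unsigned long esi;	// 0x0c
+ *		unsigned long edi;	// 0x10
+ *		unsigned long ebp;	// 0x14
+ *		unsigned long eax;	// 0x18
+ *		unsigned long ds;	// 0x1c
+ *		unsigned long es;	// 0x20
+ *		unsigned long fs;	// 0x24
+ *		unsigned long gs;	// 0x28, live iff !CONFIG_X86_32_LAZY_GS
+ *		unsigned long orig_eax;	// 0x2c, pushed before SAVE_ALL
+ *		unsigned long eip;	// 0x30, hardware frame from here down
+ *		unsigned long cs;	// 0x34
+ *		unsigned long eflags;	// 0x38
+ *		unsigned long oldesp;	// 0x3c, only present on a CPL change
+ *		unsigned long oldss;	// 0x40, only present on a CPL change
+ *	};
+ */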
+ENTRY(ret_from_kernel_thread) + CFI_STARTPROC + pushl_cfi %eax + call schedule_tail + GET_THREAD_INFO(%ebp) + popl_cfi %eax + pushl_cfi $0x0202 # Reset kernel eflags + popfl_cfi + movl PT_EBP(%esp),%eax + call *PT_EBX(%esp) + movl $0,PT_EAX(%esp) + jmp syscall_exit + CFI_ENDPROC +ENDPROC(ret_from_kernel_thread) + +/* + * Return to user mode is not as complex as all this looks, + * but we want the default path for a system call return to + * go as quickly as possible which is why some of this is + * less clear than it otherwise should be. + */ + + # userspace resumption stub bypassing syscall exit tracing + ALIGN + RING0_PTREGS_FRAME +ret_from_exception: + preempt_stop(CLBR_ANY) +ret_from_intr: + GET_THREAD_INFO(%ebp) +#ifdef CONFIG_VM86 + movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS + movb PT_CS(%esp), %al + andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax +#else + /* + * We can be coming here from child spawned by kernel_thread(). + */ + movl PT_CS(%esp), %eax + andl $SEGMENT_RPL_MASK, %eax +#endif + cmpl $USER_RPL, %eax + jb resume_kernel # not returning to v8086 or userspace + +ENTRY(resume_userspace) + LOCKDEP_SYS_EXIT + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt + # setting need_resched or sigpending + # between sampling and the iret + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + andl $_TIF_WORK_MASK, %ecx # is there any work to be done on + # int/exception return? + jne work_pending + jmp restore_all +END(ret_from_exception) + +#ifdef CONFIG_PREEMPT +ENTRY(resume_kernel) + DISABLE_INTERRUPTS(CLBR_ANY) +need_resched: + cmpl $0,PER_CPU_VAR(__preempt_count) + jnz restore_all + testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off (exception path) ? + jz restore_all + call preempt_schedule_irq + jmp need_resched +END(resume_kernel) +#endif + CFI_ENDPROC + +/* SYSENTER_RETURN points to after the "sysenter" instruction in + the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */ + + # sysenter call handler stub +ENTRY(ia32_sysenter_target) + CFI_STARTPROC simple + CFI_SIGNAL_FRAME + CFI_DEF_CFA esp, 0 + CFI_REGISTER esp, ebp + movl TSS_sysenter_sp0(%esp),%esp +sysenter_past_esp: + /* + * Interrupts are disabled here, but we can't trace it until + * enough kernel state to call TRACE_IRQS_OFF can be called - but + * we immediately enable interrupts at that point anyway. + */ + pushl_cfi $__USER_DS + /*CFI_REL_OFFSET ss, 0*/ + pushl_cfi %ebp + CFI_REL_OFFSET esp, 0 + pushfl_cfi + orl $X86_EFLAGS_IF, (%esp) + pushl_cfi $__USER_CS + /*CFI_REL_OFFSET cs, 0*/ + /* + * Push current_thread_info()->sysenter_return to the stack. + * A tiny bit of offset fixup is necessary - 4*4 means the 4 words + * pushed above; +8 corresponds to copy_thread's esp0 setting. + */ + pushl_cfi ((TI_sysenter_return)-THREAD_SIZE+8+4*4)(%esp) + CFI_REL_OFFSET eip, 0 + + pushl_cfi %eax + SAVE_ALL + ENABLE_INTERRUPTS(CLBR_NONE) + +/* + * Load the potential sixth argument from user stack. + * Careful about security. 
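+ *
+ * The cmpl below sends %ebp values at or above __PAGE_OFFSET-3 to
+ * syscall_fault, so the dereference cannot reach kernel addresses, and
+ * the _ASM_EXTABLE entry routes a faulting user load there as well.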
+ */ + cmpl $__PAGE_OFFSET-3,%ebp + jae syscall_fault + ASM_STAC +1: movl (%ebp),%ebp + ASM_CLAC + movl %ebp,PT_EBP(%esp) + _ASM_EXTABLE(1b,syscall_fault) + + GET_THREAD_INFO(%ebp) + + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) + jnz sysenter_audit +sysenter_do_call: + cmpl $(NR_syscalls), %eax + jae sysenter_badsys + call *sys_call_table(,%eax,4) +sysenter_after_call: + movl %eax,PT_EAX(%esp) + LOCKDEP_SYS_EXIT + DISABLE_INTERRUPTS(CLBR_ANY) + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + testl $_TIF_ALLWORK_MASK, %ecx + jne sysexit_audit +sysenter_exit: +/* if something modifies registers it must also disable sysexit */ + movl PT_EIP(%esp), %edx + movl PT_OLDESP(%esp), %ecx + xorl %ebp,%ebp + TRACE_IRQS_ON +1: mov PT_FS(%esp), %fs + PTGS_TO_GS + ENABLE_INTERRUPTS_SYSEXIT + +#ifdef CONFIG_AUDITSYSCALL +sysenter_audit: + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp) + jnz syscall_trace_entry + addl $4,%esp + CFI_ADJUST_CFA_OFFSET -4 + /* %esi already in 8(%esp) 6th arg: 4th syscall arg */ + /* %edx already in 4(%esp) 5th arg: 3rd syscall arg */ + /* %ecx already in 0(%esp) 4th arg: 2nd syscall arg */ + movl %ebx,%ecx /* 3rd arg: 1st syscall arg */ + movl %eax,%edx /* 2nd arg: syscall number */ + movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */ + call __audit_syscall_entry + pushl_cfi %ebx + movl PT_EAX(%esp),%eax /* reload syscall number */ + jmp sysenter_do_call + +sysexit_audit: + testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx + jne syscall_exit_work + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_ANY) + movl %eax,%edx /* second arg, syscall return value */ + cmpl $-MAX_ERRNO,%eax /* is it an error ? */ + setbe %al /* 1 if so, 0 if not */ + movzbl %al,%eax /* zero-extend that */ + call __audit_syscall_exit + DISABLE_INTERRUPTS(CLBR_ANY) + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx + jne syscall_exit_work + movl PT_EAX(%esp),%eax /* reload syscall return value */ + jmp sysenter_exit +#endif + + CFI_ENDPROC +.pushsection .fixup,"ax" +2: movl $0,PT_FS(%esp) + jmp 1b +.popsection + _ASM_EXTABLE(1b,2b) + PTGS_TO_GS_EX +ENDPROC(ia32_sysenter_target) + + # system call handler stub +ENTRY(system_call) + RING0_INT_FRAME # can't unwind into user space anyway + ASM_CLAC + pushl_cfi %eax # save orig_eax + SAVE_ALL + GET_THREAD_INFO(%ebp) + # system call tracing in operation / emulation + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) + jnz syscall_trace_entry + cmpl $(NR_syscalls), %eax + jae syscall_badsys +syscall_call: + call *sys_call_table(,%eax,4) +syscall_after_call: + movl %eax,PT_EAX(%esp) # store the return value +syscall_exit: + LOCKDEP_SYS_EXIT + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt + # setting need_resched or sigpending + # between sampling and the iret + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + testl $_TIF_ALLWORK_MASK, %ecx # current->work + jne syscall_exit_work + +restore_all: + TRACE_IRQS_IRET +restore_all_notrace: +#ifdef CONFIG_X86_ESPFIX32 + movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS + # Warning: PT_OLDSS(%esp) contains the wrong/random values if we + # are returning to the kernel. + # See comments in process.c:copy_thread() for details. 
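	# The instructions below pack the whole check into one word:
	# EFLAGS.VM stays in %eax, the saved SS's TI bit lands in %ah and
	# the saved CS's RPL in %al, so a single compare detects an iret
	# to a stack segment in the LDT at user RPL.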
+ movb PT_OLDSS(%esp), %ah + movb PT_CS(%esp), %al + andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax + cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax + CFI_REMEMBER_STATE + je ldt_ss # returning to user-space with LDT SS +#endif +restore_nocheck: + RESTORE_REGS 4 # skip orig_eax/error_code +irq_return: + INTERRUPT_RETURN +.section .fixup,"ax" +ENTRY(iret_exc) + pushl $0 # no error code + pushl $do_iret_error + jmp error_code +.previous + _ASM_EXTABLE(irq_return,iret_exc) + +#ifdef CONFIG_X86_ESPFIX32 + CFI_RESTORE_STATE +ldt_ss: +#ifdef CONFIG_PARAVIRT + /* + * The kernel can't run on a non-flat stack if paravirt mode + * is active. Rather than try to fixup the high bits of + * ESP, bypass this code entirely. This may break DOSemu + * and/or Wine support in a paravirt VM, although the option + * is still available to implement the setting of the high + * 16-bits in the INTERRUPT_RETURN paravirt-op. + */ + cmpl $0, pv_info+PARAVIRT_enabled + jne restore_nocheck +#endif + +/* + * Setup and switch to ESPFIX stack + * + * We're returning to userspace with a 16 bit stack. The CPU will not + * restore the high word of ESP for us on executing iret... This is an + * "official" bug of all the x86-compatible CPUs, which we can work + * around to make dosemu and wine happy. We do this by preloading the + * high word of ESP with the high word of the userspace ESP while + * compensating for the offset by changing to the ESPFIX segment with + * a base address that matches for the difference. + */ +#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8) + mov %esp, %edx /* load kernel esp */ + mov PT_OLDESP(%esp), %eax /* load userspace esp */ + mov %dx, %ax /* eax: new kernel esp */ + sub %eax, %edx /* offset (low word is 0) */ + shr $16, %edx + mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */ + mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */ + pushl_cfi $__ESPFIX_SS + pushl_cfi %eax /* new kernel esp */ + /* Disable interrupts, but do not irqtrace this section: we + * will soon execute iret and the tracer was already set to + * the irqstate after the iret */ + DISABLE_INTERRUPTS(CLBR_EAX) + lss (%esp), %esp /* switch to espfix segment */ + CFI_ADJUST_CFA_OFFSET -8 + jmp restore_nocheck +#endif + CFI_ENDPROC +ENDPROC(system_call) + + # perform work that needs to be done immediately before resumption + ALIGN + RING0_PTREGS_FRAME # can't unwind into user space anyway +work_pending: + testb $_TIF_NEED_RESCHED, %cl + jz work_notifysig +work_resched: + call schedule + LOCKDEP_SYS_EXIT + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt + # setting need_resched or sigpending + # between sampling and the iret + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + andl $_TIF_WORK_MASK, %ecx # is there any work to be done other + # than syscall tracing? 
+	jz restore_all
+	testb $_TIF_NEED_RESCHED, %cl
+	jnz work_resched
+
+work_notifysig:				# deal with pending signals and
+					# notify-resume requests
+#ifdef CONFIG_VM86
+	testl $X86_EFLAGS_VM, PT_EFLAGS(%esp)
+	movl %esp, %eax
+	jne work_notifysig_v86		# returning to kernel-space or
+					# vm86-space
+1:
+#else
+	movl %esp, %eax
+#endif
+	TRACE_IRQS_ON
+	ENABLE_INTERRUPTS(CLBR_NONE)
+	movb PT_CS(%esp), %bl
+	andb $SEGMENT_RPL_MASK, %bl
+	cmpb $USER_RPL, %bl
+	jb resume_kernel
+	xorl %edx, %edx
+	call do_notify_resume
+	jmp resume_userspace
+
+#ifdef CONFIG_VM86
+	ALIGN
+work_notifysig_v86:
+	pushl_cfi %ecx			# save ti_flags for do_notify_resume
+	call save_v86_state		# %eax contains pt_regs pointer
+	popl_cfi %ecx
+	movl %eax, %esp
+	jmp 1b
+#endif
+END(work_pending)
+
+	# perform syscall entry tracing
+	ALIGN
+syscall_trace_entry:
+	movl $-ENOSYS,PT_EAX(%esp)
+	movl %esp, %eax
+	call syscall_trace_enter
+	/* What it returned is what we'll actually use.  */
+	cmpl $(NR_syscalls), %eax
+	jnae syscall_call
+	jmp syscall_exit
+END(syscall_trace_entry)
+
+	# perform syscall exit tracing
+	ALIGN
+syscall_exit_work:
+	testl $_TIF_WORK_SYSCALL_EXIT, %ecx
+	jz work_pending
+	TRACE_IRQS_ON
+	ENABLE_INTERRUPTS(CLBR_ANY)	# could let syscall_trace_leave() call
+					# schedule() instead
+	movl %esp, %eax
+	call syscall_trace_leave
+	jmp resume_userspace
+END(syscall_exit_work)
+	CFI_ENDPROC
+
+	RING0_INT_FRAME			# can't unwind into user space anyway
+syscall_fault:
+	ASM_CLAC
+	GET_THREAD_INFO(%ebp)
+	movl $-EFAULT,PT_EAX(%esp)
+	jmp resume_userspace
+END(syscall_fault)
+
+syscall_badsys:
+	movl $-ENOSYS,%eax
+	jmp syscall_after_call
+END(syscall_badsys)
+
+sysenter_badsys:
+	movl $-ENOSYS,%eax
+	jmp sysenter_after_call
+END(sysenter_badsys)
+	CFI_ENDPROC
+
+.macro FIXUP_ESPFIX_STACK
+/*
+ * Switch back from the ESPFIX stack to the normal zero-based stack
+ *
+ * We can't call C functions using the ESPFIX stack. This code reads
+ * the high word of the segment base from the GDT and switches to the
+ * normal stack and adjusts ESP with the matching offset.
+ */
+#ifdef CONFIG_X86_ESPFIX32
+	/* fixup the stack */
+	mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */
+	mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */
+	shl $16, %eax
+	addl %esp, %eax			/* the adjusted stack pointer */
+	pushl_cfi $__KERNEL_DS
+	pushl_cfi %eax
+	lss (%esp), %esp		/* switch to the normal stack segment */
+	CFI_ADJUST_CFA_OFFSET -8
+#endif
+.endm
+.macro UNWIND_ESPFIX_STACK
+#ifdef CONFIG_X86_ESPFIX32
+	movl %ss, %eax
+	/* see if on espfix stack */
+	cmpw $__ESPFIX_SS, %ax
+	jne 27f
+	movl $__KERNEL_DS, %eax
+	movl %eax, %ds
+	movl %eax, %es
+	/* switch to normal stack */
+	FIXUP_ESPFIX_STACK
+27:
+#endif
+.endm
+
+/*
+ * Build the entry stubs and pointer table with some assembler magic.
+ * We pack 7 stubs into a single 32-byte chunk, which will fit in a
+ * single cache line on all modern x86 implementations.
+ */ +.section .init.rodata,"a" +ENTRY(interrupt) +.section .entry.text, "ax" + .p2align 5 + .p2align CONFIG_X86_L1_CACHE_SHIFT +ENTRY(irq_entries_start) + RING0_INT_FRAME +vector=FIRST_EXTERNAL_VECTOR +.rept (NR_VECTORS-FIRST_EXTERNAL_VECTOR+6)/7 + .balign 32 + .rept 7 + .if vector < NR_VECTORS + .if vector <> FIRST_EXTERNAL_VECTOR + CFI_ADJUST_CFA_OFFSET -4 + .endif +1: pushl_cfi $(~vector+0x80) /* Note: always in signed byte range */ + .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6 + jmp 2f + .endif + .previous + .long 1b + .section .entry.text, "ax" +vector=vector+1 + .endif + .endr +2: jmp common_interrupt +.endr +END(irq_entries_start) + +.previous +END(interrupt) +.previous + +/* + * the CPU automatically disables interrupts when executing an IRQ vector, + * so IRQ-flags tracing has to follow that: + */ + .p2align CONFIG_X86_L1_CACHE_SHIFT +common_interrupt: + ASM_CLAC + addl $-0x80,(%esp) /* Adjust vector into the [-256,-1] range */ + SAVE_ALL + TRACE_IRQS_OFF + movl %esp,%eax + call do_IRQ + jmp ret_from_intr +ENDPROC(common_interrupt) + CFI_ENDPROC + +#define BUILD_INTERRUPT3(name, nr, fn) \ +ENTRY(name) \ + RING0_INT_FRAME; \ + ASM_CLAC; \ + pushl_cfi $~(nr); \ + SAVE_ALL; \ + TRACE_IRQS_OFF \ + movl %esp,%eax; \ + call fn; \ + jmp ret_from_intr; \ + CFI_ENDPROC; \ +ENDPROC(name) + + +#ifdef CONFIG_TRACING +#define TRACE_BUILD_INTERRUPT(name, nr) \ + BUILD_INTERRUPT3(trace_##name, nr, smp_trace_##name) +#else +#define TRACE_BUILD_INTERRUPT(name, nr) +#endif + +#define BUILD_INTERRUPT(name, nr) \ + BUILD_INTERRUPT3(name, nr, smp_##name); \ + TRACE_BUILD_INTERRUPT(name, nr) + +/* The include is where all of the SMP etc. interrupts come from */ +#include <asm/entry_arch.h> + +ENTRY(coprocessor_error) + RING0_INT_FRAME + ASM_CLAC + pushl_cfi $0 + pushl_cfi $do_coprocessor_error + jmp error_code + CFI_ENDPROC +END(coprocessor_error) + +ENTRY(simd_coprocessor_error) + RING0_INT_FRAME + ASM_CLAC + pushl_cfi $0 +#ifdef CONFIG_X86_INVD_BUG + /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */ +661: pushl_cfi $do_general_protection +662: +.section .altinstructions,"a" + altinstruction_entry 661b, 663f, X86_FEATURE_XMM, 662b-661b, 664f-663f +.previous +.section .altinstr_replacement,"ax" +663: pushl $do_simd_coprocessor_error +664: +.previous +#else + pushl_cfi $do_simd_coprocessor_error +#endif + jmp error_code + CFI_ENDPROC +END(simd_coprocessor_error) + +ENTRY(device_not_available) + RING0_INT_FRAME + ASM_CLAC + pushl_cfi $-1 # mark this as an int + pushl_cfi $do_device_not_available + jmp error_code + CFI_ENDPROC +END(device_not_available) + +#ifdef CONFIG_PARAVIRT +ENTRY(native_iret) + iret + _ASM_EXTABLE(native_iret, iret_exc) +END(native_iret) + +ENTRY(native_irq_enable_sysexit) + sti + sysexit +END(native_irq_enable_sysexit) +#endif + +ENTRY(overflow) + RING0_INT_FRAME + ASM_CLAC + pushl_cfi $0 + pushl_cfi $do_overflow + jmp error_code + CFI_ENDPROC +END(overflow) + +ENTRY(bounds) + RING0_INT_FRAME + ASM_CLAC + pushl_cfi $0 + pushl_cfi $do_bounds + jmp error_code + CFI_ENDPROC +END(bounds) + +ENTRY(invalid_op) + RING0_INT_FRAME + ASM_CLAC + pushl_cfi $0 + pushl_cfi $do_invalid_op + jmp error_code + CFI_ENDPROC +END(invalid_op) + +ENTRY(coprocessor_segment_overrun) + RING0_INT_FRAME + ASM_CLAC + pushl_cfi $0 + pushl_cfi $do_coprocessor_segment_overrun + jmp error_code + CFI_ENDPROC +END(coprocessor_segment_overrun) + +ENTRY(invalid_TSS) + RING0_EC_FRAME + ASM_CLAC + pushl_cfi $do_invalid_TSS + jmp error_code + CFI_ENDPROC +END(invalid_TSS) + 
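+/*
+ * Every stub in this block follows one pattern: push a zero error code
+ * when the CPU does not supply one, push the address of the C handler,
+ * and jump to the common error_code path, which builds the pt_regs
+ * frame and dispatches via "call *%edi" with the regs pointer in %eax
+ * and the error code in %edx.  A sketch of the handler shape this
+ * convention implies (do_example_trap is a hypothetical name, and
+ * regparm(3) reflects the 32-bit kernel's register-argument build):
+ *
+ *	struct pt_regs;
+ *
+ *	__attribute__((regparm(3)))
+ *	void do_example_trap(struct pt_regs *regs, long error_code)
+ *	{
+ *		// examine regs->ip, regs->cs and error_code,
+ *		// then fix the fault up or kill the task
+ *	}
+ */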
+ENTRY(segment_not_present) + RING0_EC_FRAME + ASM_CLAC + pushl_cfi $do_segment_not_present + jmp error_code + CFI_ENDPROC +END(segment_not_present) + +ENTRY(stack_segment) + RING0_EC_FRAME + ASM_CLAC + pushl_cfi $do_stack_segment + jmp error_code + CFI_ENDPROC +END(stack_segment) + +ENTRY(alignment_check) + RING0_EC_FRAME + ASM_CLAC + pushl_cfi $do_alignment_check + jmp error_code + CFI_ENDPROC +END(alignment_check) + +ENTRY(divide_error) + RING0_INT_FRAME + ASM_CLAC + pushl_cfi $0 # no error code + pushl_cfi $do_divide_error + jmp error_code + CFI_ENDPROC +END(divide_error) + +#ifdef CONFIG_X86_MCE +ENTRY(machine_check) + RING0_INT_FRAME + ASM_CLAC + pushl_cfi $0 + pushl_cfi machine_check_vector + jmp error_code + CFI_ENDPROC +END(machine_check) +#endif + +ENTRY(spurious_interrupt_bug) + RING0_INT_FRAME + ASM_CLAC + pushl_cfi $0 + pushl_cfi $do_spurious_interrupt_bug + jmp error_code + CFI_ENDPROC +END(spurious_interrupt_bug) + +#ifdef CONFIG_XEN +/* Xen doesn't set %esp to be precisely what the normal sysenter + entrypoint expects, so fix it up before using the normal path. */ +ENTRY(xen_sysenter_target) + RING0_INT_FRAME + addl $5*4, %esp /* remove xen-provided frame */ + CFI_ADJUST_CFA_OFFSET -5*4 + jmp sysenter_past_esp + CFI_ENDPROC + +ENTRY(xen_hypervisor_callback) + CFI_STARTPROC + pushl_cfi $-1 /* orig_ax = -1 => not a system call */ + SAVE_ALL + TRACE_IRQS_OFF + + /* Check to see if we got the event in the critical + region in xen_iret_direct, after we've reenabled + events and checked for pending events. This simulates + iret instruction's behaviour where it delivers a + pending interrupt when enabling interrupts. */ + movl PT_EIP(%esp),%eax + cmpl $xen_iret_start_crit,%eax + jb 1f + cmpl $xen_iret_end_crit,%eax + jae 1f + + jmp xen_iret_crit_fixup + +ENTRY(xen_do_upcall) +1: mov %esp, %eax + call xen_evtchn_do_upcall + jmp ret_from_intr + CFI_ENDPROC +ENDPROC(xen_hypervisor_callback) + +# Hypervisor uses this for application faults while it executes. +# We get here for two reasons: +# 1. Fault while reloading DS, ES, FS or GS +# 2. Fault while executing IRET +# Category 1 we fix up by reattempting the load, and zeroing the segment +# register if the load fails. +# Category 2 we fix up by jumping to do_iret_error. We cannot use the +# normal Linux return path in this case because if we use the IRET hypercall +# to pop the stack frame we end up in an infinite loop of failsafe callbacks. +# We distinguish between categories by maintaining a status value in EAX. 
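+# The fixup entries at labels 6: through 9: below implement Category 1:
+# each zeroes %eax and the offending selector image on the stack, then
+# retries the load with a NULL selector, so %eax reaches the testl as 0
+# only if one of the segment loads faulted.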
+ENTRY(xen_failsafe_callback)
+	CFI_STARTPROC
+	pushl_cfi %eax
+	movl $1,%eax
+1:	mov 4(%esp),%ds
+2:	mov 8(%esp),%es
+3:	mov 12(%esp),%fs
+4:	mov 16(%esp),%gs
+	/* EAX == 0 => Category 1 (Bad segment)
+	   EAX != 0 => Category 2 (Bad IRET) */
+	testl %eax,%eax
+	popl_cfi %eax
+	lea 16(%esp),%esp
+	CFI_ADJUST_CFA_OFFSET -16
+	jz 5f
+	jmp iret_exc
+5:	pushl_cfi $-1 /* orig_ax = -1 => not a system call */
+	SAVE_ALL
+	jmp ret_from_exception
+	CFI_ENDPROC
+
+.section .fixup,"ax"
+6:	xorl %eax,%eax
+	movl %eax,4(%esp)
+	jmp 1b
+7:	xorl %eax,%eax
+	movl %eax,8(%esp)
+	jmp 2b
+8:	xorl %eax,%eax
+	movl %eax,12(%esp)
+	jmp 3b
+9:	xorl %eax,%eax
+	movl %eax,16(%esp)
+	jmp 4b
+.previous
+	_ASM_EXTABLE(1b,6b)
+	_ASM_EXTABLE(2b,7b)
+	_ASM_EXTABLE(3b,8b)
+	_ASM_EXTABLE(4b,9b)
+ENDPROC(xen_failsafe_callback)
+
+BUILD_INTERRUPT3(xen_hvm_callback_vector, HYPERVISOR_CALLBACK_VECTOR,
+		xen_evtchn_do_upcall)
+
+#endif /* CONFIG_XEN */
+
+#if IS_ENABLED(CONFIG_HYPERV)
+
+BUILD_INTERRUPT3(hyperv_callback_vector, HYPERVISOR_CALLBACK_VECTOR,
+	hyperv_vector_handler)
+
+#endif /* CONFIG_HYPERV */
+
+#ifdef CONFIG_FUNCTION_TRACER
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+ENTRY(mcount)
+	ret
+END(mcount)
+
+ENTRY(ftrace_caller)
+	cmpl $0, function_trace_stop
+	jne ftrace_stub
+
+	pushl %eax
+	pushl %ecx
+	pushl %edx
+	pushl $0	/* Pass NULL as regs pointer */
+	movl 4*4(%esp), %eax
+	movl 0x4(%ebp), %edx
+	movl function_trace_op, %ecx
+	subl $MCOUNT_INSN_SIZE, %eax
+
+.globl ftrace_call
+ftrace_call:
+	call ftrace_stub
+
+	addl $4,%esp	/* skip NULL pointer */
+	popl %edx
+	popl %ecx
+	popl %eax
+ftrace_ret:
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+.globl ftrace_graph_call
+ftrace_graph_call:
+	jmp ftrace_stub
+#endif
+
+.globl ftrace_stub
+ftrace_stub:
+	ret
+END(ftrace_caller)
+
+ENTRY(ftrace_regs_caller)
+	pushf	/* push flags before compare (in cs location) */
+	cmpl $0, function_trace_stop
+	jne ftrace_restore_flags
+
+	/*
+	 * i386 does not save SS and ESP when coming from kernel.
+	 * Instead, to get sp, &regs->sp is used (see ptrace.h).
+	 * Unfortunately, that means eflags must be at the same location
+	 * as the current return ip is. We move the return ip into the
+	 * ip location, and move flags into the return ip location.
+	 */
+	pushl 4(%esp)	/* save return ip into ip slot */
+
+	pushl $0	/* Load 0 into orig_ax */
+	pushl %gs
+	pushl %fs
+	pushl %es
+	pushl %ds
+	pushl %eax
+	pushl %ebp
+	pushl %edi
+	pushl %esi
+	pushl %edx
+	pushl %ecx
+	pushl %ebx
+
+	movl 13*4(%esp), %eax	/* Get the saved flags */
+	movl %eax, 14*4(%esp)	/* Move saved flags into regs->flags location */
+				/* clobbering return ip */
+	movl $__KERNEL_CS,13*4(%esp)
+
+	movl 12*4(%esp), %eax	/* Load ip (1st parameter) */
+	subl $MCOUNT_INSN_SIZE, %eax	/* Adjust ip */
+	movl 0x4(%ebp), %edx	/* Load parent ip (2nd parameter) */
+	movl function_trace_op, %ecx	/* Save ftrace_pos in 3rd parameter */
+	pushl %esp	/* Save pt_regs as 4th parameter */
+
+GLOBAL(ftrace_regs_call)
+	call ftrace_stub
+
+	addl $4, %esp	/* Skip pt_regs */
+	movl 14*4(%esp), %eax	/* Move flags back into cs */
+	movl %eax, 13*4(%esp)	/* Needed to keep addl from modifying flags */
+	movl 12*4(%esp), %eax	/* Get return ip from regs->ip */
+	movl %eax, 14*4(%esp)	/* Put return ip back for ret */
+
+	popl %ebx
+	popl %ecx
+	popl %edx
+	popl %esi
+	popl %edi
+	popl %ebp
+	popl %eax
+	popl %ds
+	popl %es
+	popl %fs
+	popl %gs
+	addl $8, %esp	/* Skip orig_ax and ip */
+	popf	/* Pop flags at end (no addl to corrupt flags) */
+	jmp ftrace_ret
+
+ftrace_restore_flags:
+	popf
+	jmp ftrace_stub
+#else /* !
CONFIG_DYNAMIC_FTRACE */ + +ENTRY(mcount) + cmpl $__PAGE_OFFSET, %esp + jb ftrace_stub /* Paging not enabled yet? */ + + cmpl $0, function_trace_stop + jne ftrace_stub + + cmpl $ftrace_stub, ftrace_trace_function + jnz trace +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + cmpl $ftrace_stub, ftrace_graph_return + jnz ftrace_graph_caller + + cmpl $ftrace_graph_entry_stub, ftrace_graph_entry + jnz ftrace_graph_caller +#endif +.globl ftrace_stub +ftrace_stub: + ret + + /* taken from glibc */ +trace: + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %eax + movl 0x4(%ebp), %edx + subl $MCOUNT_INSN_SIZE, %eax + + call *ftrace_trace_function + + popl %edx + popl %ecx + popl %eax + jmp ftrace_stub +END(mcount) +#endif /* CONFIG_DYNAMIC_FTRACE */ +#endif /* CONFIG_FUNCTION_TRACER */ + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +ENTRY(ftrace_graph_caller) + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %edx + lea 0x4(%ebp), %eax + movl (%ebp), %ecx + subl $MCOUNT_INSN_SIZE, %edx + call prepare_ftrace_return + popl %edx + popl %ecx + popl %eax + ret +END(ftrace_graph_caller) + +.globl return_to_handler +return_to_handler: + pushl %eax + pushl %edx + movl %ebp, %eax + call ftrace_return_to_handler + movl %eax, %ecx + popl %edx + popl %eax + jmp *%ecx +#endif + +#ifdef CONFIG_TRACING +ENTRY(trace_page_fault) + RING0_EC_FRAME + ASM_CLAC + pushl_cfi $trace_do_page_fault + jmp error_code + CFI_ENDPROC +END(trace_page_fault) +#endif + +ENTRY(page_fault) + RING0_EC_FRAME + ASM_CLAC + pushl_cfi $do_page_fault + ALIGN +error_code: + /* the function address is in %gs's slot on the stack */ + pushl_cfi %fs + /*CFI_REL_OFFSET fs, 0*/ + pushl_cfi %es + /*CFI_REL_OFFSET es, 0*/ + pushl_cfi %ds + /*CFI_REL_OFFSET ds, 0*/ + pushl_cfi %eax + CFI_REL_OFFSET eax, 0 + pushl_cfi %ebp + CFI_REL_OFFSET ebp, 0 + pushl_cfi %edi + CFI_REL_OFFSET edi, 0 + pushl_cfi %esi + CFI_REL_OFFSET esi, 0 + pushl_cfi %edx + CFI_REL_OFFSET edx, 0 + pushl_cfi %ecx + CFI_REL_OFFSET ecx, 0 + pushl_cfi %ebx + CFI_REL_OFFSET ebx, 0 + cld + movl $(__KERNEL_PERCPU), %ecx + movl %ecx, %fs + UNWIND_ESPFIX_STACK + GS_TO_REG %ecx + movl PT_GS(%esp), %edi # get the function address + movl PT_ORIG_EAX(%esp), %edx # get the error code + movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart + REG_TO_PTGS %ecx + SET_KERNEL_GS %ecx + movl $(__USER_DS), %ecx + movl %ecx, %ds + movl %ecx, %es + TRACE_IRQS_OFF + movl %esp,%eax # pt_regs pointer + call *%edi + jmp ret_from_exception + CFI_ENDPROC +END(page_fault) + +/* + * Debug traps and NMI can happen at the one SYSENTER instruction + * that sets up the real kernel stack. Check here, since we can't + * allow the wrong stack to be used. + * + * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have + * already pushed 3 words if it hits on the sysenter instruction: + * eflags, cs and eip. + * + * We just load the right stack, and push the three (known) values + * by hand onto the new stack - while updating the return eip past + * the instruction that would have done it for sysenter. 
+ */ +.macro FIX_STACK offset ok label + cmpw $__KERNEL_CS, 4(%esp) + jne \ok +\label: + movl TSS_sysenter_sp0 + \offset(%esp), %esp + CFI_DEF_CFA esp, 0 + CFI_UNDEFINED eip + pushfl_cfi + pushl_cfi $__KERNEL_CS + pushl_cfi $sysenter_past_esp + CFI_REL_OFFSET eip, 0 +.endm + +ENTRY(debug) + RING0_INT_FRAME + ASM_CLAC + cmpl $ia32_sysenter_target,(%esp) + jne debug_stack_correct + FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn +debug_stack_correct: + pushl_cfi $-1 # mark this as an int + SAVE_ALL + TRACE_IRQS_OFF + xorl %edx,%edx # error code 0 + movl %esp,%eax # pt_regs pointer + call do_debug + jmp ret_from_exception + CFI_ENDPROC +END(debug) + +/* + * NMI is doubly nasty. It can happen _while_ we're handling + * a debug fault, and the debug fault hasn't yet been able to + * clear up the stack. So we first check whether we got an + * NMI on the sysenter entry path, but after that we need to + * check whether we got an NMI on the debug path where the debug + * fault happened on the sysenter path. + */ +ENTRY(nmi) + RING0_INT_FRAME + ASM_CLAC +#ifdef CONFIG_X86_ESPFIX32 + pushl_cfi %eax + movl %ss, %eax + cmpw $__ESPFIX_SS, %ax + popl_cfi %eax + je nmi_espfix_stack +#endif + cmpl $ia32_sysenter_target,(%esp) + je nmi_stack_fixup + pushl_cfi %eax + movl %esp,%eax + /* Do not access memory above the end of our stack page, + * it might not exist. + */ + andl $(THREAD_SIZE-1),%eax + cmpl $(THREAD_SIZE-20),%eax + popl_cfi %eax + jae nmi_stack_correct + cmpl $ia32_sysenter_target,12(%esp) + je nmi_debug_stack_check +nmi_stack_correct: + /* We have a RING0_INT_FRAME here */ + pushl_cfi %eax + SAVE_ALL + xorl %edx,%edx # zero error code + movl %esp,%eax # pt_regs pointer + call do_nmi + jmp restore_all_notrace + CFI_ENDPROC + +nmi_stack_fixup: + RING0_INT_FRAME + FIX_STACK 12, nmi_stack_correct, 1 + jmp nmi_stack_correct + +nmi_debug_stack_check: + /* We have a RING0_INT_FRAME here */ + cmpw $__KERNEL_CS,16(%esp) + jne nmi_stack_correct + cmpl $debug,(%esp) + jb nmi_stack_correct + cmpl $debug_esp_fix_insn,(%esp) + ja nmi_stack_correct + FIX_STACK 24, nmi_stack_correct, 1 + jmp nmi_stack_correct + +#ifdef CONFIG_X86_ESPFIX32 +nmi_espfix_stack: + /* We have a RING0_INT_FRAME here. 
+ *
+ * create the pointer to lss back
+ */
+	pushl_cfi %ss
+	pushl_cfi %esp
+	addl $4, (%esp)
+	/* copy the iret frame of 12 bytes */
+	.rept 3
+	pushl_cfi 16(%esp)
+	.endr
+	pushl_cfi %eax
+	SAVE_ALL
+	FIXUP_ESPFIX_STACK		# %eax == %esp
+	xorl %edx,%edx			# zero error code
+	call do_nmi
+	RESTORE_REGS
+	lss 12+4(%esp), %esp		# back to espfix stack
+	CFI_ADJUST_CFA_OFFSET -24
+	jmp irq_return
+#endif
+	CFI_ENDPROC
+END(nmi)
+
+ENTRY(int3)
+	RING0_INT_FRAME
+	ASM_CLAC
+	pushl_cfi $-1			# mark this as an int
+	SAVE_ALL
+	TRACE_IRQS_OFF
+	xorl %edx,%edx			# zero error code
+	movl %esp,%eax			# pt_regs pointer
+	call do_int3
+	jmp ret_from_exception
+	CFI_ENDPROC
+END(int3)
+
+ENTRY(general_protection)
+	RING0_EC_FRAME
+	pushl_cfi $do_general_protection
+	jmp error_code
+	CFI_ENDPROC
+END(general_protection)
+
+#ifdef CONFIG_KVM_GUEST
+ENTRY(async_page_fault)
+	RING0_EC_FRAME
+	ASM_CLAC
+	pushl_cfi $do_async_page_fault
+	jmp error_code
+	CFI_ENDPROC
+END(async_page_fault)
+#endif
+
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
new file mode 100644
index 00000000000..c844f0816ab
--- /dev/null
+++ b/arch/x86/kernel/entry_64.S
@@ -0,0 +1,1719 @@
+/*
+ * linux/arch/x86_64/entry.S
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
+ * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
+ */
+
+/*
+ * entry.S contains the system-call and fault low-level handling routines.
+ *
+ * Some of this is documented in Documentation/x86/entry_64.txt
+ *
+ * NOTE: This code handles signal-recognition, which happens every time
+ * after an interrupt and after each system call.
+ *
+ * Normal syscalls and interrupts don't save a full stack frame; this is
+ * only done for syscall tracing, signals or fork/exec et al.
+ *
+ * A note on terminology:
+ * - top of stack: Architecture defined interrupt frame from SS to RIP
+ * at the top of the kernel process stack.
+ * - partial stack frame: partially saved registers up to R11.
+ * - full stack frame: Like partial stack frame, but all registers saved.
+ *
+ * Some macro usage:
+ * - CFI macros are used to generate dwarf2 unwind information for better
+ * backtraces. They don't change any code.
+ * - SAVE_ALL/RESTORE_ALL - Save/restore all registers
+ * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify.
+ * There are unfortunately lots of special cases where some registers
+ * are not touched. The macro is a big mess that should be cleaned up.
+ * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS.
+ * Gives a full stack frame.
+ * - ENTRY/END - Define functions in the symbol table.
+ * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack
+ * frame that is otherwise undefined after a SYSCALL.
+ * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging.
+ * - idtentry - Define exception entry points.
+ */
+
+#include <linux/linkage.h>
+#include <asm/segment.h>
+#include <asm/cache.h>
+#include <asm/errno.h>
+#include <asm/dwarf2.h>
+#include <asm/calling.h>
+#include <asm/asm-offsets.h>
+#include <asm/msr.h>
+#include <asm/unistd.h>
+#include <asm/thread_info.h>
+#include <asm/hw_irq.h>
+#include <asm/page_types.h>
+#include <asm/irqflags.h>
+#include <asm/paravirt.h>
+#include <asm/percpu.h>
+#include <asm/asm.h>
+#include <asm/context_tracking.h>
+#include <asm/smap.h>
+#include <asm/pgtable_types.h>
+#include <linux/err.h>
+
+/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
+#include <linux/elf-em.h>
+#define AUDIT_ARCH_X86_64	(EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
+#define __AUDIT_ARCH_64BIT 0x80000000
+#define __AUDIT_ARCH_LE	   0x40000000
+
+	.code64
+	.section .entry.text, "ax"
+
+
+#ifndef CONFIG_PREEMPT
+#define retint_kernel retint_restore_args
+#endif
+
+#ifdef CONFIG_PARAVIRT
+ENTRY(native_usergs_sysret64)
+	swapgs
+	sysretq
+ENDPROC(native_usergs_sysret64)
+#endif /* CONFIG_PARAVIRT */
+
+
+.macro TRACE_IRQS_IRETQ offset=ARGOFFSET
+#ifdef CONFIG_TRACE_IRQFLAGS
+	bt $9,EFLAGS-\offset(%rsp)	/* interrupts off? */
+	jnc 1f
+	TRACE_IRQS_ON
+1:
+#endif
+.endm
+
+/*
+ * When the dynamic function tracer is enabled it will add a breakpoint
+ * to all locations that it is about to modify, sync CPUs, update
+ * all the code, sync CPUs, then remove the breakpoints. During this
+ * time, if lockdep is enabled, it might jump back into the debug handler
+ * outside the updating of the IST protection (TRACE_IRQS_ON/OFF).
+ *
+ * We need to change the IDT table before calling TRACE_IRQS_ON/OFF to
+ * make sure the stack pointer does not get reset back to the top
+ * of the debug stack, and instead just reuses the current stack.
+ */
+#if defined(CONFIG_DYNAMIC_FTRACE) && defined(CONFIG_TRACE_IRQFLAGS)
+
+.macro TRACE_IRQS_OFF_DEBUG
+	call debug_stack_set_zero
+	TRACE_IRQS_OFF
+	call debug_stack_reset
+.endm
+
+.macro TRACE_IRQS_ON_DEBUG
+	call debug_stack_set_zero
+	TRACE_IRQS_ON
+	call debug_stack_reset
+.endm
+
+.macro TRACE_IRQS_IRETQ_DEBUG offset=ARGOFFSET
+	bt $9,EFLAGS-\offset(%rsp)	/* interrupts off? */
+	jnc 1f
+	TRACE_IRQS_ON_DEBUG
+1:
+.endm
+
+#else
+# define TRACE_IRQS_OFF_DEBUG		TRACE_IRQS_OFF
+# define TRACE_IRQS_ON_DEBUG		TRACE_IRQS_ON
+# define TRACE_IRQS_IRETQ_DEBUG		TRACE_IRQS_IRETQ
+#endif
+
+/*
+ * C code is not supposed to know about the undefined top of stack. Every
+ * time a C function with a pt_regs argument is called from the SYSCALL
+ * based fast path, FIXUP_TOP_OF_STACK is needed.
+ * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs
+ * manipulation.
+ */ + + /* %rsp:at FRAMEEND */ + .macro FIXUP_TOP_OF_STACK tmp offset=0 + movq PER_CPU_VAR(old_rsp),\tmp + movq \tmp,RSP+\offset(%rsp) + movq $__USER_DS,SS+\offset(%rsp) + movq $__USER_CS,CS+\offset(%rsp) + movq $-1,RCX+\offset(%rsp) + movq R11+\offset(%rsp),\tmp /* get eflags */ + movq \tmp,EFLAGS+\offset(%rsp) + .endm + + .macro RESTORE_TOP_OF_STACK tmp offset=0 + movq RSP+\offset(%rsp),\tmp + movq \tmp,PER_CPU_VAR(old_rsp) + movq EFLAGS+\offset(%rsp),\tmp + movq \tmp,R11+\offset(%rsp) + .endm + + .macro FAKE_STACK_FRAME child_rip + /* push in order ss, rsp, eflags, cs, rip */ + xorl %eax, %eax + pushq_cfi $__KERNEL_DS /* ss */ + /*CFI_REL_OFFSET ss,0*/ + pushq_cfi %rax /* rsp */ + CFI_REL_OFFSET rsp,0 + pushq_cfi $(X86_EFLAGS_IF|X86_EFLAGS_FIXED) /* eflags - interrupts on */ + /*CFI_REL_OFFSET rflags,0*/ + pushq_cfi $__KERNEL_CS /* cs */ + /*CFI_REL_OFFSET cs,0*/ + pushq_cfi \child_rip /* rip */ + CFI_REL_OFFSET rip,0 + pushq_cfi %rax /* orig rax */ + .endm + + .macro UNFAKE_STACK_FRAME + addq $8*6, %rsp + CFI_ADJUST_CFA_OFFSET -(6*8) + .endm + +/* + * initial frame state for interrupts (and exceptions without error code) + */ + .macro EMPTY_FRAME start=1 offset=0 + .if \start + CFI_STARTPROC simple + CFI_SIGNAL_FRAME + CFI_DEF_CFA rsp,8+\offset + .else + CFI_DEF_CFA_OFFSET 8+\offset + .endif + .endm + +/* + * initial frame state for interrupts (and exceptions without error code) + */ + .macro INTR_FRAME start=1 offset=0 + EMPTY_FRAME \start, SS+8+\offset-RIP + /*CFI_REL_OFFSET ss, SS+\offset-RIP*/ + CFI_REL_OFFSET rsp, RSP+\offset-RIP + /*CFI_REL_OFFSET rflags, EFLAGS+\offset-RIP*/ + /*CFI_REL_OFFSET cs, CS+\offset-RIP*/ + CFI_REL_OFFSET rip, RIP+\offset-RIP + .endm + +/* + * initial frame state for exceptions with error code (and interrupts + * with vector already pushed) + */ + .macro XCPT_FRAME start=1 offset=0 + INTR_FRAME \start, RIP+\offset-ORIG_RAX + /*CFI_REL_OFFSET orig_rax, ORIG_RAX-ORIG_RAX*/ + .endm + +/* + * frame that enables calling into C. + */ + .macro PARTIAL_FRAME start=1 offset=0 + XCPT_FRAME \start, ORIG_RAX+\offset-ARGOFFSET + CFI_REL_OFFSET rdi, RDI+\offset-ARGOFFSET + CFI_REL_OFFSET rsi, RSI+\offset-ARGOFFSET + CFI_REL_OFFSET rdx, RDX+\offset-ARGOFFSET + CFI_REL_OFFSET rcx, RCX+\offset-ARGOFFSET + CFI_REL_OFFSET rax, RAX+\offset-ARGOFFSET + CFI_REL_OFFSET r8, R8+\offset-ARGOFFSET + CFI_REL_OFFSET r9, R9+\offset-ARGOFFSET + CFI_REL_OFFSET r10, R10+\offset-ARGOFFSET + CFI_REL_OFFSET r11, R11+\offset-ARGOFFSET + .endm + +/* + * frame that enables passing a complete pt_regs to a C function. + */ + .macro DEFAULT_FRAME start=1 offset=0 + PARTIAL_FRAME \start, R11+\offset-R15 + CFI_REL_OFFSET rbx, RBX+\offset + CFI_REL_OFFSET rbp, RBP+\offset + CFI_REL_OFFSET r12, R12+\offset + CFI_REL_OFFSET r13, R13+\offset + CFI_REL_OFFSET r14, R14+\offset + CFI_REL_OFFSET r15, R15+\offset + .endm + +/* save partial stack frame */ + .macro SAVE_ARGS_IRQ + cld + /* start from rbp in pt_regs and jump over */ + movq_cfi rdi, (RDI-RBP) + movq_cfi rsi, (RSI-RBP) + movq_cfi rdx, (RDX-RBP) + movq_cfi rcx, (RCX-RBP) + movq_cfi rax, (RAX-RBP) + movq_cfi r8, (R8-RBP) + movq_cfi r9, (R9-RBP) + movq_cfi r10, (R10-RBP) + movq_cfi r11, (R11-RBP) + + /* Save rbp so that we can unwind from get_irq_regs() */ + movq_cfi rbp, 0 + + /* Save previous stack value */ + movq %rsp, %rsi + + leaq -RBP(%rsp),%rdi /* arg1 for handler */ + testl $3, CS-RBP(%rsi) + je 1f + SWAPGS + /* + * irq_count is used to check if a CPU is already on an interrupt stack + * or not. 
While this is essentially redundant with preempt_count it is + * a little cheaper to use a separate counter in the PDA (short of + * moving irq_enter into assembly, which would be too much work) + */ +1: incl PER_CPU_VAR(irq_count) + cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp + CFI_DEF_CFA_REGISTER rsi + + /* Store previous stack value */ + pushq %rsi + CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \ + 0x77 /* DW_OP_breg7 */, 0, \ + 0x06 /* DW_OP_deref */, \ + 0x08 /* DW_OP_const1u */, SS+8-RBP, \ + 0x22 /* DW_OP_plus */ + /* We entered an interrupt context - irqs are off: */ + TRACE_IRQS_OFF + .endm + +ENTRY(save_paranoid) + XCPT_FRAME 1 RDI+8 + cld + movq_cfi rdi, RDI+8 + movq_cfi rsi, RSI+8 + movq_cfi rdx, RDX+8 + movq_cfi rcx, RCX+8 + movq_cfi rax, RAX+8 + movq_cfi r8, R8+8 + movq_cfi r9, R9+8 + movq_cfi r10, R10+8 + movq_cfi r11, R11+8 + movq_cfi rbx, RBX+8 + movq_cfi rbp, RBP+8 + movq_cfi r12, R12+8 + movq_cfi r13, R13+8 + movq_cfi r14, R14+8 + movq_cfi r15, R15+8 + movl $1,%ebx + movl $MSR_GS_BASE,%ecx + rdmsr + testl %edx,%edx + js 1f /* negative -> in kernel */ + SWAPGS + xorl %ebx,%ebx +1: ret + CFI_ENDPROC +END(save_paranoid) + +/* + * A newly forked process directly context switches into this address. + * + * rdi: prev task we switched from + */ +ENTRY(ret_from_fork) + DEFAULT_FRAME + + LOCK ; btr $TIF_FORK,TI_flags(%r8) + + pushq_cfi $0x0002 + popfq_cfi # reset kernel eflags + + call schedule_tail # rdi: 'prev' task parameter + + GET_THREAD_INFO(%rcx) + + RESTORE_REST + + testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread? + jz 1f + + testl $_TIF_IA32, TI_flags(%rcx) # 32-bit compat task needs IRET + jnz int_ret_from_sys_call + + RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET + jmp ret_from_sys_call # go to the SYSRET fastpath + +1: + subq $REST_SKIP, %rsp # leave space for volatiles + CFI_ADJUST_CFA_OFFSET REST_SKIP + movq %rbp, %rdi + call *%rbx + movl $0, RAX(%rsp) + RESTORE_REST + jmp int_ret_from_sys_call + CFI_ENDPROC +END(ret_from_fork) + +/* + * System call entry. Up to 6 arguments in registers are supported. + * + * SYSCALL does not save anything on the stack and does not change the + * stack pointer. However, it does mask the flags register for us, so + * CLD and CLAC are not needed. + */ + +/* + * Register setup: + * rax system call number + * rdi arg0 + * rcx return address for syscall/sysret, C arg3 + * rsi arg1 + * rdx arg2 + * r10 arg3 (--> moved to rcx for C) + * r8 arg4 + * r9 arg5 + * r11 eflags for syscall/sysret, temporary for C + * r12-r15,rbp,rbx saved by C code, not touched. + * + * Interrupts are off on entry. + * Only called from user space. + * + * XXX if we had a free scratch register we could save the RSP into the stack frame + * and report it properly in ps. Unfortunately we haven't. + * + * When user can change the frames always force IRET. That is because + * it deals with uncanonical addresses better. SYSRET has trouble + * with them due to bugs in both AMD and Intel CPUs. + */ + +ENTRY(system_call) + CFI_STARTPROC simple + CFI_SIGNAL_FRAME + CFI_DEF_CFA rsp,KERNEL_STACK_OFFSET + CFI_REGISTER rip,rcx + /*CFI_REGISTER rflags,r11*/ + SWAPGS_UNSAFE_STACK + /* + * A hypervisor implementation might want to use a label + * after the swapgs, so that it can do the swapgs + * for the guest and jump here on syscall. 
+ */ +GLOBAL(system_call_after_swapgs) + + movq %rsp,PER_CPU_VAR(old_rsp) + movq PER_CPU_VAR(kernel_stack),%rsp + /* + * No need to follow this irqs off/on section - it's straight + * and short: + */ + ENABLE_INTERRUPTS(CLBR_NONE) + SAVE_ARGS 8,0 + movq %rax,ORIG_RAX-ARGOFFSET(%rsp) + movq %rcx,RIP-ARGOFFSET(%rsp) + CFI_REL_OFFSET rip,RIP-ARGOFFSET + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) + jnz tracesys +system_call_fastpath: +#if __SYSCALL_MASK == ~0 + cmpq $__NR_syscall_max,%rax +#else + andl $__SYSCALL_MASK,%eax + cmpl $__NR_syscall_max,%eax +#endif + ja badsys + movq %r10,%rcx + call *sys_call_table(,%rax,8) # XXX: rip relative + movq %rax,RAX-ARGOFFSET(%rsp) +/* + * Syscall return path ending with SYSRET (fast path) + * Has incomplete stack frame and undefined top of stack. + */ +ret_from_sys_call: + movl $_TIF_ALLWORK_MASK,%edi + /* edi: flagmask */ +sysret_check: + LOCKDEP_SYS_EXIT + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + movl TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET),%edx + andl %edi,%edx + jnz sysret_careful + CFI_REMEMBER_STATE + /* + * sysretq will re-enable interrupts: + */ + TRACE_IRQS_ON + movq RIP-ARGOFFSET(%rsp),%rcx + CFI_REGISTER rip,rcx + RESTORE_ARGS 1,-ARG_SKIP,0 + /*CFI_REGISTER rflags,r11*/ + movq PER_CPU_VAR(old_rsp), %rsp + USERGS_SYSRET64 + + CFI_RESTORE_STATE + /* Handle reschedules */ + /* edx: work, edi: workmask */ +sysret_careful: + bt $TIF_NEED_RESCHED,%edx + jnc sysret_signal + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) + pushq_cfi %rdi + SCHEDULE_USER + popq_cfi %rdi + jmp sysret_check + + /* Handle a signal */ +sysret_signal: + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) +#ifdef CONFIG_AUDITSYSCALL + bt $TIF_SYSCALL_AUDIT,%edx + jc sysret_audit +#endif + /* + * We have a signal, or exit tracing or single-step. + * These all wind up with the iret return path anyway, + * so just join that path right now. + */ + FIXUP_TOP_OF_STACK %r11, -ARGOFFSET + jmp int_check_syscall_exit_work + +badsys: + movq $-ENOSYS,RAX-ARGOFFSET(%rsp) + jmp ret_from_sys_call + +#ifdef CONFIG_AUDITSYSCALL + /* + * Fast path for syscall audit without full syscall trace. + * We just call __audit_syscall_entry() directly, and then + * jump back to the normal fast path. + */ +auditsys: + movq %r10,%r9 /* 6th arg: 4th syscall arg */ + movq %rdx,%r8 /* 5th arg: 3rd syscall arg */ + movq %rsi,%rcx /* 4th arg: 2nd syscall arg */ + movq %rdi,%rdx /* 3rd arg: 1st syscall arg */ + movq %rax,%rsi /* 2nd arg: syscall number */ + movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */ + call __audit_syscall_entry + LOAD_ARGS 0 /* reload call-clobbered registers */ + jmp system_call_fastpath + + /* + * Return fast path for syscall audit. Call __audit_syscall_exit() + * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT + * masked off. + */ +sysret_audit: + movq RAX-ARGOFFSET(%rsp),%rsi /* second arg, syscall return value */ + cmpq $-MAX_ERRNO,%rsi /* is it < -MAX_ERRNO? 
*/ + setbe %al /* 1 if so, 0 if not */ + movzbl %al,%edi /* zero-extend that into %edi */ + call __audit_syscall_exit + movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi + jmp sysret_check +#endif /* CONFIG_AUDITSYSCALL */ + + /* Do syscall tracing */ +tracesys: +#ifdef CONFIG_AUDITSYSCALL + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) + jz auditsys +#endif + SAVE_REST + movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ + FIXUP_TOP_OF_STACK %rdi + movq %rsp,%rdi + call syscall_trace_enter + /* + * Reload arg registers from stack in case ptrace changed them. + * We don't reload %rax because syscall_trace_enter() returned + * the value it wants us to use in the table lookup. + */ + LOAD_ARGS ARGOFFSET, 1 + RESTORE_REST +#if __SYSCALL_MASK == ~0 + cmpq $__NR_syscall_max,%rax +#else + andl $__SYSCALL_MASK,%eax + cmpl $__NR_syscall_max,%eax +#endif + ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */ + movq %r10,%rcx /* fixup for C */ + call *sys_call_table(,%rax,8) + movq %rax,RAX-ARGOFFSET(%rsp) + /* Use IRET because user could have changed frame */ + +/* + * Syscall return path ending with IRET. + * Has correct top of stack, but partial stack frame. + */ +GLOBAL(int_ret_from_sys_call) + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + movl $_TIF_ALLWORK_MASK,%edi + /* edi: mask to check */ +GLOBAL(int_with_check) + LOCKDEP_SYS_EXIT_IRQ + GET_THREAD_INFO(%rcx) + movl TI_flags(%rcx),%edx + andl %edi,%edx + jnz int_careful + andl $~TS_COMPAT,TI_status(%rcx) + jmp retint_swapgs + + /* Either reschedule or signal or syscall exit tracking needed. */ + /* First do a reschedule test. */ + /* edx: work, edi: workmask */ +int_careful: + bt $TIF_NEED_RESCHED,%edx + jnc int_very_careful + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) + pushq_cfi %rdi + SCHEDULE_USER + popq_cfi %rdi + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + jmp int_with_check + + /* handle signals and tracing -- both require a full stack frame */ +int_very_careful: + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) +int_check_syscall_exit_work: + SAVE_REST + /* Check for syscall exit trace */ + testl $_TIF_WORK_SYSCALL_EXIT,%edx + jz int_signal + pushq_cfi %rdi + leaq 8(%rsp),%rdi # &ptregs -> arg1 + call syscall_trace_leave + popq_cfi %rdi + andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi + jmp int_restore_rest + +int_signal: + testl $_TIF_DO_NOTIFY_MASK,%edx + jz 1f + movq %rsp,%rdi # &ptregs -> arg1 + xorl %esi,%esi # oldset -> arg2 + call do_notify_resume +1: movl $_TIF_WORK_MASK,%edi +int_restore_rest: + RESTORE_REST + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + jmp int_with_check + CFI_ENDPROC +END(system_call) + + .macro FORK_LIKE func +ENTRY(stub_\func) + CFI_STARTPROC + popq %r11 /* save return address */ + PARTIAL_FRAME 0 + SAVE_REST + pushq %r11 /* put it back on stack */ + FIXUP_TOP_OF_STACK %r11, 8 + DEFAULT_FRAME 0 8 /* offset 8: return address */ + call sys_\func + RESTORE_TOP_OF_STACK %r11, 8 + ret $REST_SKIP /* pop extended registers */ + CFI_ENDPROC +END(stub_\func) + .endm + + .macro FIXED_FRAME label,func +ENTRY(\label) + CFI_STARTPROC + PARTIAL_FRAME 0 8 /* offset 8: return address */ + FIXUP_TOP_OF_STACK %r11, 8-ARGOFFSET + call \func + RESTORE_TOP_OF_STACK %r11, 8-ARGOFFSET + ret + CFI_ENDPROC +END(\label) + .endm + + FORK_LIKE clone + FORK_LIKE fork + FORK_LIKE vfork + FIXED_FRAME stub_iopl, sys_iopl + +ENTRY(ptregscall_common) + DEFAULT_FRAME 1 8 /* offset 8: return address */ + RESTORE_TOP_OF_STACK %r11, 8 + 
movq_cfi_restore R15+8, r15 + movq_cfi_restore R14+8, r14 + movq_cfi_restore R13+8, r13 + movq_cfi_restore R12+8, r12 + movq_cfi_restore RBP+8, rbp + movq_cfi_restore RBX+8, rbx + ret $REST_SKIP /* pop extended registers */ + CFI_ENDPROC +END(ptregscall_common) + +ENTRY(stub_execve) + CFI_STARTPROC + addq $8, %rsp + PARTIAL_FRAME 0 + SAVE_REST + FIXUP_TOP_OF_STACK %r11 + call sys_execve + movq %rax,RAX(%rsp) + RESTORE_REST + jmp int_ret_from_sys_call + CFI_ENDPROC +END(stub_execve) + +/* + * sigreturn is special because it needs to restore all registers on return. + * This cannot be done with SYSRET, so use the IRET return path instead. + */ +ENTRY(stub_rt_sigreturn) + CFI_STARTPROC + addq $8, %rsp + PARTIAL_FRAME 0 + SAVE_REST + FIXUP_TOP_OF_STACK %r11 + call sys_rt_sigreturn + movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer + RESTORE_REST + jmp int_ret_from_sys_call + CFI_ENDPROC +END(stub_rt_sigreturn) + +#ifdef CONFIG_X86_X32_ABI +ENTRY(stub_x32_rt_sigreturn) + CFI_STARTPROC + addq $8, %rsp + PARTIAL_FRAME 0 + SAVE_REST + FIXUP_TOP_OF_STACK %r11 + call sys32_x32_rt_sigreturn + movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer + RESTORE_REST + jmp int_ret_from_sys_call + CFI_ENDPROC +END(stub_x32_rt_sigreturn) + +ENTRY(stub_x32_execve) + CFI_STARTPROC + addq $8, %rsp + PARTIAL_FRAME 0 + SAVE_REST + FIXUP_TOP_OF_STACK %r11 + call compat_sys_execve + RESTORE_TOP_OF_STACK %r11 + movq %rax,RAX(%rsp) + RESTORE_REST + jmp int_ret_from_sys_call + CFI_ENDPROC +END(stub_x32_execve) + +#endif + +/* + * Build the entry stubs and pointer table with some assembler magic. + * We pack 7 stubs into a single 32-byte chunk, which will fit in a + * single cache line on all modern x86 implementations. + */ + .section .init.rodata,"a" +ENTRY(interrupt) + .section .entry.text + .p2align 5 + .p2align CONFIG_X86_L1_CACHE_SHIFT +ENTRY(irq_entries_start) + INTR_FRAME +vector=FIRST_EXTERNAL_VECTOR +.rept (NR_VECTORS-FIRST_EXTERNAL_VECTOR+6)/7 + .balign 32 + .rept 7 + .if vector < NR_VECTORS + .if vector <> FIRST_EXTERNAL_VECTOR + CFI_ADJUST_CFA_OFFSET -8 + .endif +1: pushq_cfi $(~vector+0x80) /* Note: always in signed byte range */ + .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6 + jmp 2f + .endif + .previous + .quad 1b + .section .entry.text +vector=vector+1 + .endif + .endr +2: jmp common_interrupt +.endr + CFI_ENDPROC +END(irq_entries_start) + +.previous +END(interrupt) +.previous + +/* + * Interrupt entry/exit. + * + * Interrupt entry points save only callee clobbered registers in fast path. + * + * Entry runs with interrupts off. + */ + +/* 0(%rsp): ~(interrupt number) */ + .macro interrupt func + /* reserve pt_regs for scratch regs and rbp */ + subq $ORIG_RAX-RBP, %rsp + CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP + SAVE_ARGS_IRQ + call \func + .endm + + /* + * The interrupt stubs push (~vector+0x80) onto the stack and + * then jump to common_interrupt. 
+ */ + .p2align CONFIG_X86_L1_CACHE_SHIFT +common_interrupt: + XCPT_FRAME + ASM_CLAC + addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */ + interrupt do_IRQ + /* 0(%rsp): old_rsp-ARGOFFSET */ +ret_from_intr: + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + decl PER_CPU_VAR(irq_count) + + /* Restore saved previous stack */ + popq %rsi + CFI_DEF_CFA rsi,SS+8-RBP /* reg/off reset after def_cfa_expr */ + leaq ARGOFFSET-RBP(%rsi), %rsp + CFI_DEF_CFA_REGISTER rsp + CFI_ADJUST_CFA_OFFSET RBP-ARGOFFSET + +exit_intr: + GET_THREAD_INFO(%rcx) + testl $3,CS-ARGOFFSET(%rsp) + je retint_kernel + + /* Interrupt came from user space */ + /* + * Has a correct top of stack, but a partial stack frame + * %rcx: thread info. Interrupts off. + */ +retint_with_reschedule: + movl $_TIF_WORK_MASK,%edi +retint_check: + LOCKDEP_SYS_EXIT_IRQ + movl TI_flags(%rcx),%edx + andl %edi,%edx + CFI_REMEMBER_STATE + jnz retint_careful + +retint_swapgs: /* return to user-space */ + /* + * The iretq could re-enable interrupts: + */ + DISABLE_INTERRUPTS(CLBR_ANY) + TRACE_IRQS_IRETQ + SWAPGS + jmp restore_args + +retint_restore_args: /* return to kernel space */ + DISABLE_INTERRUPTS(CLBR_ANY) + /* + * The iretq could re-enable interrupts: + */ + TRACE_IRQS_IRETQ +restore_args: + RESTORE_ARGS 1,8,1 + +irq_return: + INTERRUPT_RETURN + +ENTRY(native_iret) + /* + * Are we returning to a stack segment from the LDT? Note: in + * 64-bit mode SS:RSP on the exception stack is always valid. + */ +#ifdef CONFIG_X86_ESPFIX64 + testb $4,(SS-RIP)(%rsp) + jnz native_irq_return_ldt +#endif + +native_irq_return_iret: + iretq + _ASM_EXTABLE(native_irq_return_iret, bad_iret) + +#ifdef CONFIG_X86_ESPFIX64 +native_irq_return_ldt: + pushq_cfi %rax + pushq_cfi %rdi + SWAPGS + movq PER_CPU_VAR(espfix_waddr),%rdi + movq %rax,(0*8)(%rdi) /* RAX */ + movq (2*8)(%rsp),%rax /* RIP */ + movq %rax,(1*8)(%rdi) + movq (3*8)(%rsp),%rax /* CS */ + movq %rax,(2*8)(%rdi) + movq (4*8)(%rsp),%rax /* RFLAGS */ + movq %rax,(3*8)(%rdi) + movq (6*8)(%rsp),%rax /* SS */ + movq %rax,(5*8)(%rdi) + movq (5*8)(%rsp),%rax /* RSP */ + movq %rax,(4*8)(%rdi) + andl $0xffff0000,%eax + popq_cfi %rdi + orq PER_CPU_VAR(espfix_stack),%rax + SWAPGS + movq %rax,%rsp + popq_cfi %rax + jmp native_irq_return_iret +#endif + + .section .fixup,"ax" +bad_iret: + /* + * The iret traps when the %cs or %ss being restored is bogus. + * We've lost the original trap vector and error code. + * #GPF is the most likely one to get for an invalid selector. + * So pretend we completed the iret and took the #GPF in user mode. + * + * We are now running with the kernel GS after exception recovery. + * But error_entry expects us to have user GS to match the user %cs, + * so swap back. + */ + pushq $0 + + SWAPGS + jmp general_protection + + .previous + + /* edi: workmask, edx: work */ +retint_careful: + CFI_RESTORE_STATE + bt $TIF_NEED_RESCHED,%edx + jnc retint_signal + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) + pushq_cfi %rdi + SCHEDULE_USER + popq_cfi %rdi + GET_THREAD_INFO(%rcx) + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + jmp retint_check + +retint_signal: + testl $_TIF_DO_NOTIFY_MASK,%edx + jz retint_swapgs + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) + SAVE_REST + movq $-1,ORIG_RAX(%rsp) + xorl %esi,%esi # oldset + movq %rsp,%rdi # &pt_regs + call do_notify_resume + RESTORE_REST + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + GET_THREAD_INFO(%rcx) + jmp retint_with_reschedule + +#ifdef CONFIG_PREEMPT + /* Returning to kernel space. 
Check if we need preemption */ + /* rcx: threadinfo. interrupts off. */ +ENTRY(retint_kernel) + cmpl $0,PER_CPU_VAR(__preempt_count) + jnz retint_restore_args + bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */ + jnc retint_restore_args + call preempt_schedule_irq + jmp exit_intr +#endif + CFI_ENDPROC +END(common_interrupt) + + /* + * If IRET takes a fault on the espfix stack, then we + * end up promoting it to a doublefault. In that case, + * modify the stack to make it look like we just entered + * the #GP handler from user space, similar to bad_iret. + */ +#ifdef CONFIG_X86_ESPFIX64 + ALIGN +__do_double_fault: + XCPT_FRAME 1 RDI+8 + movq RSP(%rdi),%rax /* Trap on the espfix stack? */ + sarq $PGDIR_SHIFT,%rax + cmpl $ESPFIX_PGD_ENTRY,%eax + jne do_double_fault /* No, just deliver the fault */ + cmpl $__KERNEL_CS,CS(%rdi) + jne do_double_fault + movq RIP(%rdi),%rax + cmpq $native_irq_return_iret,%rax + jne do_double_fault /* This shouldn't happen... */ + movq PER_CPU_VAR(kernel_stack),%rax + subq $(6*8-KERNEL_STACK_OFFSET),%rax /* Reset to original stack */ + movq %rax,RSP(%rdi) + movq $0,(%rax) /* Missing (lost) #GP error code */ + movq $general_protection,RIP(%rdi) + retq + CFI_ENDPROC +END(__do_double_fault) +#else +# define __do_double_fault do_double_fault +#endif + +/* + * APIC interrupts. + */ +.macro apicinterrupt3 num sym do_sym +ENTRY(\sym) + INTR_FRAME + ASM_CLAC + pushq_cfi $~(\num) +.Lcommon_\sym: + interrupt \do_sym + jmp ret_from_intr + CFI_ENDPROC +END(\sym) +.endm + +#ifdef CONFIG_TRACING +#define trace(sym) trace_##sym +#define smp_trace(sym) smp_trace_##sym + +.macro trace_apicinterrupt num sym +apicinterrupt3 \num trace(\sym) smp_trace(\sym) +.endm +#else +.macro trace_apicinterrupt num sym do_sym +.endm +#endif + +.macro apicinterrupt num sym do_sym +apicinterrupt3 \num \sym \do_sym +trace_apicinterrupt \num \sym +.endm + +#ifdef CONFIG_SMP +apicinterrupt3 IRQ_MOVE_CLEANUP_VECTOR \ + irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt +apicinterrupt3 REBOOT_VECTOR \ + reboot_interrupt smp_reboot_interrupt +#endif + +#ifdef CONFIG_X86_UV +apicinterrupt3 UV_BAU_MESSAGE \ + uv_bau_message_intr1 uv_bau_message_interrupt +#endif +apicinterrupt LOCAL_TIMER_VECTOR \ + apic_timer_interrupt smp_apic_timer_interrupt +apicinterrupt X86_PLATFORM_IPI_VECTOR \ + x86_platform_ipi smp_x86_platform_ipi + +#ifdef CONFIG_HAVE_KVM +apicinterrupt3 POSTED_INTR_VECTOR \ + kvm_posted_intr_ipi smp_kvm_posted_intr_ipi +#endif + +#ifdef CONFIG_X86_MCE_THRESHOLD +apicinterrupt THRESHOLD_APIC_VECTOR \ + threshold_interrupt smp_threshold_interrupt +#endif + +#ifdef CONFIG_X86_THERMAL_VECTOR +apicinterrupt THERMAL_APIC_VECTOR \ + thermal_interrupt smp_thermal_interrupt +#endif + +#ifdef CONFIG_SMP +apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \ + call_function_single_interrupt smp_call_function_single_interrupt +apicinterrupt CALL_FUNCTION_VECTOR \ + call_function_interrupt smp_call_function_interrupt +apicinterrupt RESCHEDULE_VECTOR \ + reschedule_interrupt smp_reschedule_interrupt +#endif + +apicinterrupt ERROR_APIC_VECTOR \ + error_interrupt smp_error_interrupt +apicinterrupt SPURIOUS_APIC_VECTOR \ + spurious_interrupt smp_spurious_interrupt + +#ifdef CONFIG_IRQ_WORK +apicinterrupt IRQ_WORK_VECTOR \ + irq_work_interrupt smp_irq_work_interrupt +#endif + +/* + * Exception entry points. 
+ */ +#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8) + +.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 +ENTRY(\sym) + /* Sanity check */ + .if \shift_ist != -1 && \paranoid == 0 + .error "using shift_ist requires paranoid=1" + .endif + + .if \has_error_code + XCPT_FRAME + .else + INTR_FRAME + .endif + + ASM_CLAC + PARAVIRT_ADJUST_EXCEPTION_FRAME + + .ifeq \has_error_code + pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ + .endif + + subq $ORIG_RAX-R15, %rsp + CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 + + .if \paranoid + call save_paranoid + .else + call error_entry + .endif + + DEFAULT_FRAME 0 + + .if \paranoid + .if \shift_ist != -1 + TRACE_IRQS_OFF_DEBUG /* reload IDT in case of recursion */ + .else + TRACE_IRQS_OFF + .endif + .endif + + movq %rsp,%rdi /* pt_regs pointer */ + + .if \has_error_code + movq ORIG_RAX(%rsp),%rsi /* get error code */ + movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */ + .else + xorl %esi,%esi /* no error code */ + .endif + + .if \shift_ist != -1 + subq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist) + .endif + + call \do_sym + + .if \shift_ist != -1 + addq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist) + .endif + + .if \paranoid + jmp paranoid_exit /* %ebx: no swapgs flag */ + .else + jmp error_exit /* %ebx: no swapgs flag */ + .endif + + CFI_ENDPROC +END(\sym) +.endm + +#ifdef CONFIG_TRACING +.macro trace_idtentry sym do_sym has_error_code:req +idtentry trace(\sym) trace(\do_sym) has_error_code=\has_error_code +idtentry \sym \do_sym has_error_code=\has_error_code +.endm +#else +.macro trace_idtentry sym do_sym has_error_code:req +idtentry \sym \do_sym has_error_code=\has_error_code +.endm +#endif + +idtentry divide_error do_divide_error has_error_code=0 +idtentry overflow do_overflow has_error_code=0 +idtentry bounds do_bounds has_error_code=0 +idtentry invalid_op do_invalid_op has_error_code=0 +idtentry device_not_available do_device_not_available has_error_code=0 +idtentry double_fault __do_double_fault has_error_code=1 paranoid=1 +idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0 +idtentry invalid_TSS do_invalid_TSS has_error_code=1 +idtentry segment_not_present do_segment_not_present has_error_code=1 +idtentry spurious_interrupt_bug do_spurious_interrupt_bug has_error_code=0 +idtentry coprocessor_error do_coprocessor_error has_error_code=0 +idtentry alignment_check do_alignment_check has_error_code=1 +idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0 + + + /* Reload gs selector with exception handling */ + /* edi: new selector */ +ENTRY(native_load_gs_index) + CFI_STARTPROC + pushfq_cfi + DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI) + SWAPGS +gs_change: + movl %edi,%gs +2: mfence /* workaround */ + SWAPGS + popfq_cfi + ret + CFI_ENDPROC +END(native_load_gs_index) + + _ASM_EXTABLE(gs_change,bad_gs) + .section .fixup,"ax" + /* running with kernelgs */ +bad_gs: + SWAPGS /* switch back to user gs */ + xorl %eax,%eax + movl %eax,%gs + jmp 2b + .previous + +/* Call softirq on interrupt stack. Interrupts are off. 
*/ +ENTRY(do_softirq_own_stack) + CFI_STARTPROC + pushq_cfi %rbp + CFI_REL_OFFSET rbp,0 + mov %rsp,%rbp + CFI_DEF_CFA_REGISTER rbp + incl PER_CPU_VAR(irq_count) + cmove PER_CPU_VAR(irq_stack_ptr),%rsp + push %rbp # backlink for old unwinder + call __do_softirq + leaveq + CFI_RESTORE rbp + CFI_DEF_CFA_REGISTER rsp + CFI_ADJUST_CFA_OFFSET -8 + decl PER_CPU_VAR(irq_count) + ret + CFI_ENDPROC +END(do_softirq_own_stack) + +#ifdef CONFIG_XEN +idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0 + +/* + * A note on the "critical region" in our callback handler. + * We want to avoid stacking callback handlers due to events occurring + * during handling of the last event. To do this, we keep events disabled + * until we've done all processing. HOWEVER, we must enable events before + * popping the stack frame (can't be done atomically) and so it would still + * be possible to get enough handler activations to overflow the stack. + * Although unlikely, bugs of that kind are hard to track down, so we'd + * like to avoid the possibility. + * So, on entry to the handler we detect whether we interrupted an + * existing activation in its critical region -- if so, we pop the current + * activation and restart the handler using the previous one. + */ +ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs) + CFI_STARTPROC +/* + * Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will + * see the correct pointer to the pt_regs + */ + movq %rdi, %rsp # we don't return, adjust the stack frame + CFI_ENDPROC + DEFAULT_FRAME +11: incl PER_CPU_VAR(irq_count) + movq %rsp,%rbp + CFI_DEF_CFA_REGISTER rbp + cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp + pushq %rbp # backlink for old unwinder + call xen_evtchn_do_upcall + popq %rsp + CFI_DEF_CFA_REGISTER rsp + decl PER_CPU_VAR(irq_count) + jmp error_exit + CFI_ENDPROC +END(xen_do_hypervisor_callback) + +/* + * Hypervisor uses this for application faults while it executes. + * We get here for two reasons: + * 1. Fault while reloading DS, ES, FS or GS + * 2. Fault while executing IRET + * Category 1 we do not need to fix up as Xen has already reloaded all segment + * registers that could be reloaded and zeroed the others. + * Category 2 we fix up by killing the current process. We cannot use the + * normal Linux return path in this case because if we use the IRET hypercall + * to pop the stack frame we end up in an infinite loop of failsafe callbacks. + * We distinguish between categories by comparing each saved segment register + * with its current contents: any discrepancy means we in category 1. + */ +ENTRY(xen_failsafe_callback) + INTR_FRAME 1 (6*8) + /*CFI_REL_OFFSET gs,GS*/ + /*CFI_REL_OFFSET fs,FS*/ + /*CFI_REL_OFFSET es,ES*/ + /*CFI_REL_OFFSET ds,DS*/ + CFI_REL_OFFSET r11,8 + CFI_REL_OFFSET rcx,0 + movw %ds,%cx + cmpw %cx,0x10(%rsp) + CFI_REMEMBER_STATE + jne 1f + movw %es,%cx + cmpw %cx,0x18(%rsp) + jne 1f + movw %fs,%cx + cmpw %cx,0x20(%rsp) + jne 1f + movw %gs,%cx + cmpw %cx,0x28(%rsp) + jne 1f + /* All segments match their saved values => Category 2 (Bad IRET). */ + movq (%rsp),%rcx + CFI_RESTORE rcx + movq 8(%rsp),%r11 + CFI_RESTORE r11 + addq $0x30,%rsp + CFI_ADJUST_CFA_OFFSET -0x30 + pushq_cfi $0 /* RIP */ + pushq_cfi %r11 + pushq_cfi %rcx + jmp general_protection + CFI_RESTORE_STATE +1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. 
*/ + movq (%rsp),%rcx + CFI_RESTORE rcx + movq 8(%rsp),%r11 + CFI_RESTORE r11 + addq $0x30,%rsp + CFI_ADJUST_CFA_OFFSET -0x30 + pushq_cfi $-1 /* orig_ax = -1 => not a system call */ + SAVE_ALL + jmp error_exit + CFI_ENDPROC +END(xen_failsafe_callback) + +apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ + xen_hvm_callback_vector xen_evtchn_do_upcall + +#endif /* CONFIG_XEN */ + +#if IS_ENABLED(CONFIG_HYPERV) +apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ + hyperv_callback_vector hyperv_vector_handler +#endif /* CONFIG_HYPERV */ + +idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK +idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK +idtentry stack_segment do_stack_segment has_error_code=1 paranoid=1 +#ifdef CONFIG_XEN +idtentry xen_debug do_debug has_error_code=0 +idtentry xen_int3 do_int3 has_error_code=0 +idtentry xen_stack_segment do_stack_segment has_error_code=1 +#endif +idtentry general_protection do_general_protection has_error_code=1 +trace_idtentry page_fault do_page_fault has_error_code=1 +#ifdef CONFIG_KVM_GUEST +idtentry async_page_fault do_async_page_fault has_error_code=1 +#endif +#ifdef CONFIG_X86_MCE +idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip) +#endif + + /* + * "Paranoid" exit path from exception stack. + * Paranoid because this is used by NMIs and cannot take + * any kernel state for granted. + * We don't do kernel preemption checks here, because only + * NMI should be common and it does not enable IRQs and + * cannot get reschedule ticks. + * + * "trace" is 0 for the NMI handler only, because irq-tracing + * is fundamentally NMI-unsafe. (we cannot change the soft and + * hard flags at once, atomically) + */ + + /* ebx: no swapgs flag */ +ENTRY(paranoid_exit) + DEFAULT_FRAME + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF_DEBUG + testl %ebx,%ebx /* swapgs needed? */ + jnz paranoid_restore + testl $3,CS(%rsp) + jnz paranoid_userspace +paranoid_swapgs: + TRACE_IRQS_IRETQ 0 + SWAPGS_UNSAFE_STACK + RESTORE_ALL 8 + jmp irq_return +paranoid_restore: + TRACE_IRQS_IRETQ_DEBUG 0 + RESTORE_ALL 8 + jmp irq_return +paranoid_userspace: + GET_THREAD_INFO(%rcx) + movl TI_flags(%rcx),%ebx + andl $_TIF_WORK_MASK,%ebx + jz paranoid_swapgs + movq %rsp,%rdi /* &pt_regs */ + call sync_regs + movq %rax,%rsp /* switch stack for scheduling */ + testl $_TIF_NEED_RESCHED,%ebx + jnz paranoid_schedule + movl %ebx,%edx /* arg3: thread flags */ + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) + xorl %esi,%esi /* arg2: oldset */ + movq %rsp,%rdi /* arg1: &pt_regs */ + call do_notify_resume + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + jmp paranoid_userspace +paranoid_schedule: + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_ANY) + SCHEDULE_USER + DISABLE_INTERRUPTS(CLBR_ANY) + TRACE_IRQS_OFF + jmp paranoid_userspace + CFI_ENDPROC +END(paranoid_exit) + +/* + * Exception entry point. This expects an error code/orig_rax on the stack. + * returns in "no swapgs flag" in %ebx. 
+ */ +ENTRY(error_entry) + XCPT_FRAME + CFI_ADJUST_CFA_OFFSET 15*8 + /* oldrax contains error code */ + cld + movq_cfi rdi, RDI+8 + movq_cfi rsi, RSI+8 + movq_cfi rdx, RDX+8 + movq_cfi rcx, RCX+8 + movq_cfi rax, RAX+8 + movq_cfi r8, R8+8 + movq_cfi r9, R9+8 + movq_cfi r10, R10+8 + movq_cfi r11, R11+8 + movq_cfi rbx, RBX+8 + movq_cfi rbp, RBP+8 + movq_cfi r12, R12+8 + movq_cfi r13, R13+8 + movq_cfi r14, R14+8 + movq_cfi r15, R15+8 + xorl %ebx,%ebx + testl $3,CS+8(%rsp) + je error_kernelspace +error_swapgs: + SWAPGS +error_sti: + TRACE_IRQS_OFF + ret + +/* + * There are two places in the kernel that can potentially fault with + * usergs. Handle them here. The exception handlers after iret run with + * kernel gs again, so don't set the user space flag. B stepping K8s + * sometimes report an truncated RIP for IRET exceptions returning to + * compat mode. Check for these here too. + */ +error_kernelspace: + incl %ebx + leaq native_irq_return_iret(%rip),%rcx + cmpq %rcx,RIP+8(%rsp) + je error_swapgs + movl %ecx,%eax /* zero extend */ + cmpq %rax,RIP+8(%rsp) + je bstep_iret + cmpq $gs_change,RIP+8(%rsp) + je error_swapgs + jmp error_sti + +bstep_iret: + /* Fix truncated RIP */ + movq %rcx,RIP+8(%rsp) + jmp error_swapgs + CFI_ENDPROC +END(error_entry) + + +/* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */ +ENTRY(error_exit) + DEFAULT_FRAME + movl %ebx,%eax + RESTORE_REST + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + GET_THREAD_INFO(%rcx) + testl %eax,%eax + jne retint_kernel + LOCKDEP_SYS_EXIT_IRQ + movl TI_flags(%rcx),%edx + movl $_TIF_WORK_MASK,%edi + andl %edi,%edx + jnz retint_careful + jmp retint_swapgs + CFI_ENDPROC +END(error_exit) + +/* + * Test if a given stack is an NMI stack or not. + */ + .macro test_in_nmi reg stack nmi_ret normal_ret + cmpq %\reg, \stack + ja \normal_ret + subq $EXCEPTION_STKSZ, %\reg + cmpq %\reg, \stack + jb \normal_ret + jmp \nmi_ret + .endm + + /* runs on exception stack */ +ENTRY(nmi) + INTR_FRAME + PARAVIRT_ADJUST_EXCEPTION_FRAME + /* + * We allow breakpoints in NMIs. If a breakpoint occurs, then + * the iretq it performs will take us out of NMI context. + * This means that we can have nested NMIs where the next + * NMI is using the top of the stack of the previous NMI. We + * can't let it execute because the nested NMI will corrupt the + * stack of the previous NMI. NMI handlers are not re-entrant + * anyway. + * + * To handle this case we do the following: + * Check the a special location on the stack that contains + * a variable that is set when NMIs are executing. + * The interrupted task's stack is also checked to see if it + * is an NMI stack. + * If the variable is not set and the stack is not the NMI + * stack then: + * o Set the special variable on the stack + * o Copy the interrupt frame into a "saved" location on the stack + * o Copy the interrupt frame into a "copy" location on the stack + * o Continue processing the NMI + * If the variable is set or the previous stack is the NMI stack: + * o Modify the "copy" location to jump to the repeate_nmi + * o return back to the first NMI + * + * Now on exit of the first NMI, we first clear the stack variable + * The NMI stack will tell any nested NMIs at that point that it is + * nested. Then we pop the stack normally with iret, and if there was + * a nested NMI that updated the copy interrupt stack frame, a + * jump will be made to the repeat_nmi code that will handle the second + * NMI. 
+ */ + + /* Use %rdx as out temp variable throughout */ + pushq_cfi %rdx + CFI_REL_OFFSET rdx, 0 + + /* + * If %cs was not the kernel segment, then the NMI triggered in user + * space, which means it is definitely not nested. + */ + cmpl $__KERNEL_CS, 16(%rsp) + jne first_nmi + + /* + * Check the special variable on the stack to see if NMIs are + * executing. + */ + cmpl $1, -8(%rsp) + je nested_nmi + + /* + * Now test if the previous stack was an NMI stack. + * We need the double check. We check the NMI stack to satisfy the + * race when the first NMI clears the variable before returning. + * We check the variable because the first NMI could be in a + * breakpoint routine using a breakpoint stack. + */ + lea 6*8(%rsp), %rdx + test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi + CFI_REMEMBER_STATE + +nested_nmi: + /* + * Do nothing if we interrupted the fixup in repeat_nmi. + * It's about to repeat the NMI handler, so we are fine + * with ignoring this one. + */ + movq $repeat_nmi, %rdx + cmpq 8(%rsp), %rdx + ja 1f + movq $end_repeat_nmi, %rdx + cmpq 8(%rsp), %rdx + ja nested_nmi_out + +1: + /* Set up the interrupted NMIs stack to jump to repeat_nmi */ + leaq -1*8(%rsp), %rdx + movq %rdx, %rsp + CFI_ADJUST_CFA_OFFSET 1*8 + leaq -10*8(%rsp), %rdx + pushq_cfi $__KERNEL_DS + pushq_cfi %rdx + pushfq_cfi + pushq_cfi $__KERNEL_CS + pushq_cfi $repeat_nmi + + /* Put stack back */ + addq $(6*8), %rsp + CFI_ADJUST_CFA_OFFSET -6*8 + +nested_nmi_out: + popq_cfi %rdx + CFI_RESTORE rdx + + /* No need to check faults here */ + INTERRUPT_RETURN + + CFI_RESTORE_STATE +first_nmi: + /* + * Because nested NMIs will use the pushed location that we + * stored in rdx, we must keep that space available. + * Here's what our stack frame will look like: + * +-------------------------+ + * | original SS | + * | original Return RSP | + * | original RFLAGS | + * | original CS | + * | original RIP | + * +-------------------------+ + * | temp storage for rdx | + * +-------------------------+ + * | NMI executing variable | + * +-------------------------+ + * | copied SS | + * | copied Return RSP | + * | copied RFLAGS | + * | copied CS | + * | copied RIP | + * +-------------------------+ + * | Saved SS | + * | Saved Return RSP | + * | Saved RFLAGS | + * | Saved CS | + * | Saved RIP | + * +-------------------------+ + * | pt_regs | + * +-------------------------+ + * + * The saved stack frame is used to fix up the copied stack frame + * that a nested NMI may change to make the interrupted NMI iret jump + * to the repeat_nmi. The original stack frame and the temp storage + * is also used by nested NMIs and can not be trusted on exit. + */ + /* Do not pop rdx, nested NMIs will corrupt that part of the stack */ + movq (%rsp), %rdx + CFI_RESTORE rdx + + /* Set the NMI executing variable on the stack. */ + pushq_cfi $1 + + /* + * Leave room for the "copied" frame + */ + subq $(5*8), %rsp + CFI_ADJUST_CFA_OFFSET 5*8 + + /* Copy the stack frame to the Saved frame */ + .rept 5 + pushq_cfi 11*8(%rsp) + .endr + CFI_DEF_CFA_OFFSET SS+8-RIP + + /* Everything up to here is safe from nested NMIs */ + + /* + * If there was a nested NMI, the first NMI's iret will return + * here. But NMIs are still enabled and we can take another + * nested NMI. The nested NMI checks the interrupted RIP to see + * if it is between repeat_nmi and end_repeat_nmi, and if so + * it will just return, as we are about to repeat an NMI anyway. + * This makes it safe to copy to the stack frame that a nested + * NMI will update. 
+ */ +repeat_nmi: + /* + * Update the stack variable to say we are still in NMI (the update + * is benign for the non-repeat case, where 1 was pushed just above + * to this very stack slot). + */ + movq $1, 10*8(%rsp) + + /* Make another copy, this one may be modified by nested NMIs */ + addq $(10*8), %rsp + CFI_ADJUST_CFA_OFFSET -10*8 + .rept 5 + pushq_cfi -6*8(%rsp) + .endr + subq $(5*8), %rsp + CFI_DEF_CFA_OFFSET SS+8-RIP +end_repeat_nmi: + + /* + * Everything below this point can be preempted by a nested + * NMI if the first NMI took an exception and reset our iret stack + * so that we repeat another NMI. + */ + pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ + subq $ORIG_RAX-R15, %rsp + CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 + /* + * Use save_paranoid to handle SWAPGS, but no need to use paranoid_exit + * as we should not be calling schedule in NMI context. + * Even with normal interrupts enabled. An NMI should not be + * setting NEED_RESCHED or anything that normal interrupts and + * exceptions might do. + */ + call save_paranoid + DEFAULT_FRAME 0 + + /* + * Save off the CR2 register. If we take a page fault in the NMI then + * it could corrupt the CR2 value. If the NMI preempts a page fault + * handler before it was able to read the CR2 register, and then the + * NMI itself takes a page fault, the page fault that was preempted + * will read the information from the NMI page fault and not the + * origin fault. Save it off and restore it if it changes. + * Use the r12 callee-saved register. + */ + movq %cr2, %r12 + + /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ + movq %rsp,%rdi + movq $-1,%rsi + call do_nmi + + /* Did the NMI take a page fault? Restore cr2 if it did */ + movq %cr2, %rcx + cmpq %rcx, %r12 + je 1f + movq %r12, %cr2 +1: + + testl %ebx,%ebx /* swapgs needed? */ + jnz nmi_restore +nmi_swapgs: + SWAPGS_UNSAFE_STACK +nmi_restore: + /* Pop the extra iret frame at once */ + RESTORE_ALL 6*8 + + /* Clear the NMI executing stack variable */ + movq $0, 5*8(%rsp) + jmp irq_return + CFI_ENDPROC +END(nmi) + +ENTRY(ignore_sysret) + CFI_STARTPROC + mov $-ENOSYS,%eax + sysret + CFI_ENDPROC +END(ignore_sysret) + diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c new file mode 100644 index 00000000000..94d857fb103 --- /dev/null +++ b/arch/x86/kernel/espfix_64.c @@ -0,0 +1,208 @@ +/* ----------------------------------------------------------------------- * + * + * Copyright 2014 Intel Corporation; author: H. Peter Anvin + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * ----------------------------------------------------------------------- */ + +/* + * The IRET instruction, when returning to a 16-bit segment, only + * restores the bottom 16 bits of the user space stack pointer. This + * causes some 16-bit software to break, but it also leaks kernel state + * to user space. + * + * This works around this by creating percpu "ministacks", each of which + * is mapped 2^16 times 64K apart. When we detect that the return SS is + * on the LDT, we copy the IRET frame to the ministack and use the + * relevant alias to return to userspace. 
The ministacks are mapped + * readonly, so if the IRET fault we promote #GP to #DF which is an IST + * vector and thus has its own stack; we then do the fixup in the #DF + * handler. + * + * This file sets up the ministacks and the related page tables. The + * actual ministack invocation is in entry_64.S. + */ + +#include <linux/init.h> +#include <linux/init_task.h> +#include <linux/kernel.h> +#include <linux/percpu.h> +#include <linux/gfp.h> +#include <linux/random.h> +#include <asm/pgtable.h> +#include <asm/pgalloc.h> +#include <asm/setup.h> +#include <asm/espfix.h> + +/* + * Note: we only need 6*8 = 48 bytes for the espfix stack, but round + * it up to a cache line to avoid unnecessary sharing. + */ +#define ESPFIX_STACK_SIZE (8*8UL) +#define ESPFIX_STACKS_PER_PAGE (PAGE_SIZE/ESPFIX_STACK_SIZE) + +/* There is address space for how many espfix pages? */ +#define ESPFIX_PAGE_SPACE (1UL << (PGDIR_SHIFT-PAGE_SHIFT-16)) + +#define ESPFIX_MAX_CPUS (ESPFIX_STACKS_PER_PAGE * ESPFIX_PAGE_SPACE) +#if CONFIG_NR_CPUS > ESPFIX_MAX_CPUS +# error "Need more than one PGD for the ESPFIX hack" +#endif + +#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO) + +/* This contains the *bottom* address of the espfix stack */ +DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack); +DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr); + +/* Initialization mutex - should this be a spinlock? */ +static DEFINE_MUTEX(espfix_init_mutex); + +/* Page allocation bitmap - each page serves ESPFIX_STACKS_PER_PAGE CPUs */ +#define ESPFIX_MAX_PAGES DIV_ROUND_UP(CONFIG_NR_CPUS, ESPFIX_STACKS_PER_PAGE) +static void *espfix_pages[ESPFIX_MAX_PAGES]; + +static __page_aligned_bss pud_t espfix_pud_page[PTRS_PER_PUD] + __aligned(PAGE_SIZE); + +static unsigned int page_random, slot_random; + +/* + * This returns the bottom address of the espfix stack for a specific CPU. + * The math allows for a non-power-of-two ESPFIX_STACK_SIZE, in which case + * we have to account for some amount of padding at the end of each page. + */ +static inline unsigned long espfix_base_addr(unsigned int cpu) +{ + unsigned long page, slot; + unsigned long addr; + + page = (cpu / ESPFIX_STACKS_PER_PAGE) ^ page_random; + slot = (cpu + slot_random) % ESPFIX_STACKS_PER_PAGE; + addr = (page << PAGE_SHIFT) + (slot * ESPFIX_STACK_SIZE); + addr = (addr & 0xffffUL) | ((addr & ~0xffffUL) << 16); + addr += ESPFIX_BASE_ADDR; + return addr; +} + +#define PTE_STRIDE (65536/PAGE_SIZE) +#define ESPFIX_PTE_CLONES (PTRS_PER_PTE/PTE_STRIDE) +#define ESPFIX_PMD_CLONES PTRS_PER_PMD +#define ESPFIX_PUD_CLONES (65536/(ESPFIX_PTE_CLONES*ESPFIX_PMD_CLONES)) + +#define PGTABLE_PROT ((_KERNPG_TABLE & ~_PAGE_RW) | _PAGE_NX) + +static void init_espfix_random(void) +{ + unsigned long rand; + + /* + * This is run before the entropy pools are initialized, + * but this is hopefully better than nothing. 
+ */ + if (!arch_get_random_long(&rand)) { + /* The constant is an arbitrary large prime */ + rdtscll(rand); + rand *= 0xc345c6b72fd16123UL; + } + + slot_random = rand % ESPFIX_STACKS_PER_PAGE; + page_random = (rand / ESPFIX_STACKS_PER_PAGE) + & (ESPFIX_PAGE_SPACE - 1); +} + +void __init init_espfix_bsp(void) +{ + pgd_t *pgd_p; + pteval_t ptemask; + + ptemask = __supported_pte_mask; + + /* Install the espfix pud into the kernel page directory */ + pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)]; + pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page); + + /* Randomize the locations */ + init_espfix_random(); + + /* The rest is the same as for any other processor */ + init_espfix_ap(); +} + +void init_espfix_ap(void) +{ + unsigned int cpu, page; + unsigned long addr; + pud_t pud, *pud_p; + pmd_t pmd, *pmd_p; + pte_t pte, *pte_p; + int n; + void *stack_page; + pteval_t ptemask; + + /* We only have to do this once... */ + if (likely(this_cpu_read(espfix_stack))) + return; /* Already initialized */ + + cpu = smp_processor_id(); + addr = espfix_base_addr(cpu); + page = cpu/ESPFIX_STACKS_PER_PAGE; + + /* Did another CPU already set this up? */ + stack_page = ACCESS_ONCE(espfix_pages[page]); + if (likely(stack_page)) + goto done; + + mutex_lock(&espfix_init_mutex); + + /* Did we race on the lock? */ + stack_page = ACCESS_ONCE(espfix_pages[page]); + if (stack_page) + goto unlock_done; + + ptemask = __supported_pte_mask; + + pud_p = &espfix_pud_page[pud_index(addr)]; + pud = *pud_p; + if (!pud_present(pud)) { + pmd_p = (pmd_t *)__get_free_page(PGALLOC_GFP); + pud = __pud(__pa(pmd_p) | (PGTABLE_PROT & ptemask)); + paravirt_alloc_pmd(&init_mm, __pa(pmd_p) >> PAGE_SHIFT); + for (n = 0; n < ESPFIX_PUD_CLONES; n++) + set_pud(&pud_p[n], pud); + } + + pmd_p = pmd_offset(&pud, addr); + pmd = *pmd_p; + if (!pmd_present(pmd)) { + pte_p = (pte_t *)__get_free_page(PGALLOC_GFP); + pmd = __pmd(__pa(pte_p) | (PGTABLE_PROT & ptemask)); + paravirt_alloc_pte(&init_mm, __pa(pte_p) >> PAGE_SHIFT); + for (n = 0; n < ESPFIX_PMD_CLONES; n++) + set_pmd(&pmd_p[n], pmd); + } + + pte_p = pte_offset_kernel(&pmd, addr); + stack_page = (void *)__get_free_page(GFP_KERNEL); + pte = __pte(__pa(stack_page) | (__PAGE_KERNEL_RO & ptemask)); + for (n = 0; n < ESPFIX_PTE_CLONES; n++) + set_pte(&pte_p[n*PTE_STRIDE], pte); + + /* Job is done for this CPU and any CPU which shares this page */ + ACCESS_ONCE(espfix_pages[page]) = stack_page; + +unlock_done: + mutex_unlock(&espfix_init_mutex); +done: + this_cpu_write(espfix_stack, addr); + this_cpu_write(espfix_waddr, (unsigned long)stack_page + + (addr & ~PAGE_MASK)); +} diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c new file mode 100644 index 00000000000..cbc4a91b131 --- /dev/null +++ b/arch/x86/kernel/ftrace.c @@ -0,0 +1,754 @@ +/* + * Code for replacing ftrace calls with jumps. + * + * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com> + * + * Thanks goes to Ingo Molnar, for suggesting the idea. + * Mathieu Desnoyers, for suggesting postponing the modifications. + * Arjan van de Ven, for keeping me straight, and explaining to me + * the dangers of modifying code on the run. 
+ */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/spinlock.h> +#include <linux/hardirq.h> +#include <linux/uaccess.h> +#include <linux/ftrace.h> +#include <linux/percpu.h> +#include <linux/sched.h> +#include <linux/init.h> +#include <linux/list.h> +#include <linux/module.h> + +#include <trace/syscall.h> + +#include <asm/cacheflush.h> +#include <asm/kprobes.h> +#include <asm/ftrace.h> +#include <asm/nops.h> + +#ifdef CONFIG_DYNAMIC_FTRACE + +int ftrace_arch_code_modify_prepare(void) +{ + set_kernel_text_rw(); + set_all_modules_text_rw(); + return 0; +} + +int ftrace_arch_code_modify_post_process(void) +{ + set_all_modules_text_ro(); + set_kernel_text_ro(); + return 0; +} + +union ftrace_code_union { + char code[MCOUNT_INSN_SIZE]; + struct { + char e8; + int offset; + } __attribute__((packed)); +}; + +static int ftrace_calc_offset(long ip, long addr) +{ + return (int)(addr - ip); +} + +static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) +{ + static union ftrace_code_union calc; + + calc.e8 = 0xe8; + calc.offset = ftrace_calc_offset(ip + MCOUNT_INSN_SIZE, addr); + + /* + * No locking needed, this must be called via kstop_machine + * which in essence is like running on a uniprocessor machine. + */ + return calc.code; +} + +static inline int +within(unsigned long addr, unsigned long start, unsigned long end) +{ + return addr >= start && addr < end; +} + +static unsigned long text_ip_addr(unsigned long ip) +{ + /* + * On x86_64, kernel text mappings are mapped read-only with + * CONFIG_DEBUG_RODATA. So we use the kernel identity mapping instead + * of the kernel text mapping to modify the kernel text. + * + * For 32bit kernels, these mappings are same and we can use + * kernel identity mapping to modify code. + */ + if (within(ip, (unsigned long)_text, (unsigned long)_etext)) + ip = (unsigned long)__va(__pa_symbol(ip)); + + return ip; +} + +static const unsigned char *ftrace_nop_replace(void) +{ + return ideal_nops[NOP_ATOMIC5]; +} + +static int +ftrace_modify_code_direct(unsigned long ip, unsigned const char *old_code, + unsigned const char *new_code) +{ + unsigned char replaced[MCOUNT_INSN_SIZE]; + + /* + * Note: Due to modules and __init, code can + * disappear and change, we need to protect against faulting + * as well as code changing. We do this by using the + * probe_kernel_* functions. + * + * No real locking needed, this code is run through + * kstop_machine, or before SMP starts. + */ + + /* read the text we want to modify */ + if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE)) + return -EFAULT; + + /* Make sure it is what we expect it to be */ + if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0) + return -EINVAL; + + ip = text_ip_addr(ip); + + /* replace the text with the new text */ + if (probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE)) + return -EPERM; + + sync_core(); + + return 0; +} + +int ftrace_make_nop(struct module *mod, + struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned const char *new, *old; + unsigned long ip = rec->ip; + + old = ftrace_call_replace(ip, addr); + new = ftrace_nop_replace(); + + /* + * On boot up, and when modules are loaded, the MCOUNT_ADDR + * is converted to a nop, and will never become MCOUNT_ADDR + * again. This code is either running before SMP (on boot up) + * or before the code will ever be executed (module load). + * We do not want to use the breakpoint version in this case, + * just modify the code directly. 
+ */ + if (addr == MCOUNT_ADDR) + return ftrace_modify_code_direct(rec->ip, old, new); + + /* Normal cases use add_brk_on_nop */ + WARN_ONCE(1, "invalid use of ftrace_make_nop"); + return -EINVAL; +} + +int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned const char *new, *old; + unsigned long ip = rec->ip; + + old = ftrace_nop_replace(); + new = ftrace_call_replace(ip, addr); + + /* Should only be called when module is loaded */ + return ftrace_modify_code_direct(rec->ip, old, new); +} + +/* + * The modifying_ftrace_code is used to tell the breakpoint + * handler to call ftrace_int3_handler(). If it fails to + * call this handler for a breakpoint added by ftrace, then + * the kernel may crash. + * + * As atomic_writes on x86 do not need a barrier, we do not + * need to add smp_mb()s for this to work. It is also considered + * that we can not read the modifying_ftrace_code before + * executing the breakpoint. That would be quite remarkable if + * it could do that. Here's the flow that is required: + * + * CPU-0 CPU-1 + * + * atomic_inc(mfc); + * write int3s + * <trap-int3> // implicit (r)mb + * if (atomic_read(mfc)) + * call ftrace_int3_handler() + * + * Then when we are finished: + * + * atomic_dec(mfc); + * + * If we hit a breakpoint that was not set by ftrace, it does not + * matter if ftrace_int3_handler() is called or not. It will + * simply be ignored. But it is crucial that a ftrace nop/caller + * breakpoint is handled. No other user should ever place a + * breakpoint on an ftrace nop/caller location. It must only + * be done by this code. + */ +atomic_t modifying_ftrace_code __read_mostly; + +static int +ftrace_modify_code(unsigned long ip, unsigned const char *old_code, + unsigned const char *new_code); + +/* + * Should never be called: + * As it is only called by __ftrace_replace_code() which is called by + * ftrace_replace_code() that x86 overrides, and by ftrace_update_code() + * which is called to turn mcount into nops or nops into function calls + * but not to convert a function from not using regs to one that uses + * regs, which ftrace_modify_call() is for. + */ +int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, + unsigned long addr) +{ + WARN_ON(1); + return -EINVAL; +} + +static unsigned long ftrace_update_func; + +static int update_ftrace_func(unsigned long ip, void *new) +{ + unsigned char old[MCOUNT_INSN_SIZE]; + int ret; + + memcpy(old, (void *)ip, MCOUNT_INSN_SIZE); + + ftrace_update_func = ip; + /* Make sure the breakpoints see the ftrace_update_func update */ + smp_wmb(); + + /* See comment above by declaration of modifying_ftrace_code */ + atomic_inc(&modifying_ftrace_code); + + ret = ftrace_modify_code(ip, old, new); + + atomic_dec(&modifying_ftrace_code); + + return ret; +} + +int ftrace_update_ftrace_func(ftrace_func_t func) +{ + unsigned long ip = (unsigned long)(&ftrace_call); + unsigned char *new; + int ret; + + new = ftrace_call_replace(ip, (unsigned long)func); + ret = update_ftrace_func(ip, new); + + /* Also update the regs callback function */ + if (!ret) { + ip = (unsigned long)(&ftrace_regs_call); + new = ftrace_call_replace(ip, (unsigned long)func); + ret = update_ftrace_func(ip, new); + } + + return ret; +} + +static int is_ftrace_caller(unsigned long ip) +{ + if (ip == ftrace_update_func) + return 1; + + return 0; +} + +/* + * A breakpoint was added to the code address we are about to + * modify, and this is the handle that will just skip over it. 
+ * We are either changing a nop into a trace call, or a trace + * call to a nop. While the change is taking place, we treat + * it just like it was a nop. + */ +int ftrace_int3_handler(struct pt_regs *regs) +{ + unsigned long ip; + + if (WARN_ON_ONCE(!regs)) + return 0; + + ip = regs->ip - 1; + if (!ftrace_location(ip) && !is_ftrace_caller(ip)) + return 0; + + regs->ip += MCOUNT_INSN_SIZE - 1; + + return 1; +} + +static int ftrace_write(unsigned long ip, const char *val, int size) +{ + ip = text_ip_addr(ip); + + if (probe_kernel_write((void *)ip, val, size)) + return -EPERM; + + return 0; +} + +static int add_break(unsigned long ip, const char *old) +{ + unsigned char replaced[MCOUNT_INSN_SIZE]; + unsigned char brk = BREAKPOINT_INSTRUCTION; + + if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE)) + return -EFAULT; + + /* Make sure it is what we expect it to be */ + if (memcmp(replaced, old, MCOUNT_INSN_SIZE) != 0) + return -EINVAL; + + return ftrace_write(ip, &brk, 1); +} + +static int add_brk_on_call(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned const char *old; + unsigned long ip = rec->ip; + + old = ftrace_call_replace(ip, addr); + + return add_break(rec->ip, old); +} + + +static int add_brk_on_nop(struct dyn_ftrace *rec) +{ + unsigned const char *old; + + old = ftrace_nop_replace(); + + return add_break(rec->ip, old); +} + +static int add_breakpoints(struct dyn_ftrace *rec, int enable) +{ + unsigned long ftrace_addr; + int ret; + + ftrace_addr = ftrace_get_addr_curr(rec); + + ret = ftrace_test_record(rec, enable); + + switch (ret) { + case FTRACE_UPDATE_IGNORE: + return 0; + + case FTRACE_UPDATE_MAKE_CALL: + /* converting nop to call */ + return add_brk_on_nop(rec); + + case FTRACE_UPDATE_MODIFY_CALL: + case FTRACE_UPDATE_MAKE_NOP: + /* converting a call to a nop */ + return add_brk_on_call(rec, ftrace_addr); + } + return 0; +} + +/* + * On error, we need to remove breakpoints. This needs to + * be done caefully. If the address does not currently have a + * breakpoint, we know we are done. Otherwise, we look at the + * remaining 4 bytes of the instruction. If it matches a nop + * we replace the breakpoint with the nop. Otherwise we replace + * it with the call instruction. + */ +static int remove_breakpoint(struct dyn_ftrace *rec) +{ + unsigned char ins[MCOUNT_INSN_SIZE]; + unsigned char brk = BREAKPOINT_INSTRUCTION; + const unsigned char *nop; + unsigned long ftrace_addr; + unsigned long ip = rec->ip; + + /* If we fail the read, just give up */ + if (probe_kernel_read(ins, (void *)ip, MCOUNT_INSN_SIZE)) + return -EFAULT; + + /* If this does not have a breakpoint, we are done */ + if (ins[0] != brk) + return 0; + + nop = ftrace_nop_replace(); + + /* + * If the last 4 bytes of the instruction do not match + * a nop, then we assume that this is a call to ftrace_addr. + */ + if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0) { + /* + * For extra paranoidism, we check if the breakpoint is on + * a call that would actually jump to the ftrace_addr. + * If not, don't touch the breakpoint, we make just create + * a disaster. 
+ */ + ftrace_addr = ftrace_get_addr_new(rec); + nop = ftrace_call_replace(ip, ftrace_addr); + + if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) == 0) + goto update; + + /* Check both ftrace_addr and ftrace_old_addr */ + ftrace_addr = ftrace_get_addr_curr(rec); + nop = ftrace_call_replace(ip, ftrace_addr); + + if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0) + return -EINVAL; + } + + update: + return ftrace_write(ip, nop, 1); +} + +static int add_update_code(unsigned long ip, unsigned const char *new) +{ + /* skip breakpoint */ + ip++; + new++; + return ftrace_write(ip, new, MCOUNT_INSN_SIZE - 1); +} + +static int add_update_call(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned long ip = rec->ip; + unsigned const char *new; + + new = ftrace_call_replace(ip, addr); + return add_update_code(ip, new); +} + +static int add_update_nop(struct dyn_ftrace *rec) +{ + unsigned long ip = rec->ip; + unsigned const char *new; + + new = ftrace_nop_replace(); + return add_update_code(ip, new); +} + +static int add_update(struct dyn_ftrace *rec, int enable) +{ + unsigned long ftrace_addr; + int ret; + + ret = ftrace_test_record(rec, enable); + + ftrace_addr = ftrace_get_addr_new(rec); + + switch (ret) { + case FTRACE_UPDATE_IGNORE: + return 0; + + case FTRACE_UPDATE_MODIFY_CALL: + case FTRACE_UPDATE_MAKE_CALL: + /* converting nop to call */ + return add_update_call(rec, ftrace_addr); + + case FTRACE_UPDATE_MAKE_NOP: + /* converting a call to a nop */ + return add_update_nop(rec); + } + + return 0; +} + +static int finish_update_call(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned long ip = rec->ip; + unsigned const char *new; + + new = ftrace_call_replace(ip, addr); + + return ftrace_write(ip, new, 1); +} + +static int finish_update_nop(struct dyn_ftrace *rec) +{ + unsigned long ip = rec->ip; + unsigned const char *new; + + new = ftrace_nop_replace(); + + return ftrace_write(ip, new, 1); +} + +static int finish_update(struct dyn_ftrace *rec, int enable) +{ + unsigned long ftrace_addr; + int ret; + + ret = ftrace_update_record(rec, enable); + + ftrace_addr = ftrace_get_addr_new(rec); + + switch (ret) { + case FTRACE_UPDATE_IGNORE: + return 0; + + case FTRACE_UPDATE_MODIFY_CALL: + case FTRACE_UPDATE_MAKE_CALL: + /* converting nop to call */ + return finish_update_call(rec, ftrace_addr); + + case FTRACE_UPDATE_MAKE_NOP: + /* converting a call to a nop */ + return finish_update_nop(rec); + } + + return 0; +} + +static void do_sync_core(void *data) +{ + sync_core(); +} + +static void run_sync(void) +{ + int enable_irqs = irqs_disabled(); + + /* We may be called with interrupts disbled (on bootup). 
*/ + if (enable_irqs) + local_irq_enable(); + on_each_cpu(do_sync_core, NULL, 1); + if (enable_irqs) + local_irq_disable(); +} + +void ftrace_replace_code(int enable) +{ + struct ftrace_rec_iter *iter; + struct dyn_ftrace *rec; + const char *report = "adding breakpoints"; + int count = 0; + int ret; + + for_ftrace_rec_iter(iter) { + rec = ftrace_rec_iter_record(iter); + + ret = add_breakpoints(rec, enable); + if (ret) + goto remove_breakpoints; + count++; + } + + run_sync(); + + report = "updating code"; + + for_ftrace_rec_iter(iter) { + rec = ftrace_rec_iter_record(iter); + + ret = add_update(rec, enable); + if (ret) + goto remove_breakpoints; + } + + run_sync(); + + report = "removing breakpoints"; + + for_ftrace_rec_iter(iter) { + rec = ftrace_rec_iter_record(iter); + + ret = finish_update(rec, enable); + if (ret) + goto remove_breakpoints; + } + + run_sync(); + + return; + + remove_breakpoints: + pr_warn("Failed on %s (%d):\n", report, count); + ftrace_bug(ret, rec ? rec->ip : 0); + for_ftrace_rec_iter(iter) { + rec = ftrace_rec_iter_record(iter); + /* + * Breakpoints are handled only when this function is in + * progress. The system could not work with them. + */ + if (remove_breakpoint(rec)) + BUG(); + } + run_sync(); +} + +static int +ftrace_modify_code(unsigned long ip, unsigned const char *old_code, + unsigned const char *new_code) +{ + int ret; + + ret = add_break(ip, old_code); + if (ret) + goto out; + + run_sync(); + + ret = add_update_code(ip, new_code); + if (ret) + goto fail_update; + + run_sync(); + + ret = ftrace_write(ip, new_code, 1); + /* + * The breakpoint is handled only when this function is in progress. + * The system could not work if we could not remove it. + */ + BUG_ON(ret); + out: + run_sync(); + return ret; + + fail_update: + /* Also here the system could not work with the breakpoint */ + if (ftrace_write(ip, old_code, 1)) + BUG(); + goto out; +} + +void arch_ftrace_update_code(int command) +{ + /* See comment above by declaration of modifying_ftrace_code */ + atomic_inc(&modifying_ftrace_code); + + ftrace_modify_all_code(command); + + atomic_dec(&modifying_ftrace_code); +} + +int __init ftrace_dyn_arch_init(void) +{ + return 0; +} +#endif + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + +#ifdef CONFIG_DYNAMIC_FTRACE +extern void ftrace_graph_call(void); + +static unsigned char *ftrace_jmp_replace(unsigned long ip, unsigned long addr) +{ + static union ftrace_code_union calc; + + /* Jmp not a call (ignore the .e8) */ + calc.e8 = 0xe9; + calc.offset = ftrace_calc_offset(ip + MCOUNT_INSN_SIZE, addr); + + /* + * ftrace external locks synchronize the access to the static variable. + */ + return calc.code; +} + +static int ftrace_mod_jmp(unsigned long ip, void *func) +{ + unsigned char *new; + + new = ftrace_jmp_replace(ip, (unsigned long)func); + + return update_ftrace_func(ip, new); +} + +int ftrace_enable_ftrace_graph_caller(void) +{ + unsigned long ip = (unsigned long)(&ftrace_graph_call); + + return ftrace_mod_jmp(ip, &ftrace_graph_caller); +} + +int ftrace_disable_ftrace_graph_caller(void) +{ + unsigned long ip = (unsigned long)(&ftrace_graph_call); + + return ftrace_mod_jmp(ip, &ftrace_stub); +} + +#endif /* !CONFIG_DYNAMIC_FTRACE */ + +/* + * Hook the return address and push it in the stack of return addrs + * in current thread info. 
+ */ +void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, + unsigned long frame_pointer) +{ + unsigned long old; + int faulted; + struct ftrace_graph_ent trace; + unsigned long return_hooker = (unsigned long) + &return_to_handler; + + if (unlikely(atomic_read(¤t->tracing_graph_pause))) + return; + + /* + * Protect against fault, even if it shouldn't + * happen. This tool is too much intrusive to + * ignore such a protection. + */ + asm volatile( + "1: " _ASM_MOV " (%[parent]), %[old]\n" + "2: " _ASM_MOV " %[return_hooker], (%[parent])\n" + " movl $0, %[faulted]\n" + "3:\n" + + ".section .fixup, \"ax\"\n" + "4: movl $1, %[faulted]\n" + " jmp 3b\n" + ".previous\n" + + _ASM_EXTABLE(1b, 4b) + _ASM_EXTABLE(2b, 4b) + + : [old] "=&r" (old), [faulted] "=r" (faulted) + : [parent] "r" (parent), [return_hooker] "r" (return_hooker) + : "memory" + ); + + if (unlikely(faulted)) { + ftrace_graph_stop(); + WARN_ON(1); + return; + } + + trace.func = self_addr; + trace.depth = current->curr_ret_stack + 1; + + /* Only trace if the calling function expects to */ + if (!ftrace_graph_entry(&trace)) { + *parent = old; + return; + } + + if (ftrace_push_return_trace(old, self_addr, &trace.depth, + frame_pointer) == -EBUSY) { + *parent = old; + return; + } +} +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c new file mode 100644 index 00000000000..992f442ca15 --- /dev/null +++ b/arch/x86/kernel/head.c @@ -0,0 +1,71 @@ +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/memblock.h> + +#include <asm/setup.h> +#include <asm/bios_ebda.h> + +/* + * The BIOS places the EBDA/XBDA at the top of conventional + * memory, and usually decreases the reported amount of + * conventional memory (int 0x12) too. This also contains a + * workaround for Dell systems that neglect to reserve EBDA. + * The same workaround also avoids a problem with the AMD768MPX + * chipset: reserve a page before VGA to prevent PCI prefetch + * into it (errata #56). Usually the page is reserved anyways, + * unless you have no PS/2 mouse plugged in. + * + * This functions is deliberately very conservative. Losing + * memory in the bottom megabyte is rarely a problem, as long + * as we have enough memory to install the trampoline. Using + * memory that is in use by the BIOS or by some DMA device + * the BIOS didn't shut down *is* a big problem. + */ + +#define BIOS_LOWMEM_KILOBYTES 0x413 +#define LOWMEM_CAP 0x9f000U /* Absolute maximum */ +#define INSANE_CUTOFF 0x20000U /* Less than this = insane */ + +void __init reserve_ebda_region(void) +{ + unsigned int lowmem, ebda_addr; + + /* + * To determine the position of the EBDA and the + * end of conventional memory, we need to look at + * the BIOS data area. In a paravirtual environment + * that area is absent. We'll just have to assume + * that the paravirt case can handle memory setup + * correctly, without our help. + */ + if (paravirt_enabled()) + return; + + /* end of low (conventional) memory */ + lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES); + lowmem <<= 10; + + /* start of EBDA area */ + ebda_addr = get_bios_ebda(); + + /* + * Note: some old Dells seem to need 4k EBDA without + * reporting so, so just consider the memory above 0x9f000 + * to be off limits (bugzilla 2990). 
+ */ + + /* If the EBDA address is below 128K, assume it is bogus */ + if (ebda_addr < INSANE_CUTOFF) + ebda_addr = LOWMEM_CAP; + + /* If lowmem is less than 128K, assume it is bogus */ + if (lowmem < INSANE_CUTOFF) + lowmem = LOWMEM_CAP; + + /* Use the lower of the lowmem and EBDA markers as the cutoff */ + lowmem = min(lowmem, ebda_addr); + lowmem = min(lowmem, LOWMEM_CAP); /* Absolute cap */ + + /* reserve all memory between lowmem and the 1MB mark */ + memblock_reserve(lowmem, 0x100000 - lowmem); +} diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c new file mode 100644 index 00000000000..d6c1b983699 --- /dev/null +++ b/arch/x86/kernel/head32.c @@ -0,0 +1,50 @@ +/* + * linux/arch/i386/kernel/head32.c -- prepare to run common code + * + * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE + * Copyright (C) 2007 Eric Biederman <ebiederm@xmission.com> + */ + +#include <linux/init.h> +#include <linux/start_kernel.h> +#include <linux/mm.h> +#include <linux/memblock.h> + +#include <asm/setup.h> +#include <asm/sections.h> +#include <asm/e820.h> +#include <asm/page.h> +#include <asm/apic.h> +#include <asm/io_apic.h> +#include <asm/bios_ebda.h> +#include <asm/tlbflush.h> +#include <asm/bootparam_utils.h> + +static void __init i386_default_early_setup(void) +{ + /* Initialize 32bit specific setup functions */ + x86_init.resources.reserve_resources = i386_reserve_resources; + x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc; + + reserve_ebda_region(); +} + +asmlinkage __visible void __init i386_start_kernel(void) +{ + sanitize_boot_params(&boot_params); + + /* Call the subarch specific early setup function */ + switch (boot_params.hdr.hardware_subarch) { + case X86_SUBARCH_INTEL_MID: + x86_intel_mid_early_setup(); + break; + case X86_SUBARCH_CE4100: + x86_ce4100_early_setup(); + break; + default: + i386_default_early_setup(); + break; + } + + start_kernel(); +} diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c new file mode 100644 index 00000000000..eda1a865641 --- /dev/null +++ b/arch/x86/kernel/head64.c @@ -0,0 +1,194 @@ +/* + * prepare to run common code + * + * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE + */ + +#include <linux/init.h> +#include <linux/linkage.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/percpu.h> +#include <linux/start_kernel.h> +#include <linux/io.h> +#include <linux/memblock.h> + +#include <asm/processor.h> +#include <asm/proto.h> +#include <asm/smp.h> +#include <asm/setup.h> +#include <asm/desc.h> +#include <asm/pgtable.h> +#include <asm/tlbflush.h> +#include <asm/sections.h> +#include <asm/kdebug.h> +#include <asm/e820.h> +#include <asm/bios_ebda.h> +#include <asm/bootparam_utils.h> +#include <asm/microcode.h> + +/* + * Manage page tables very early on. 
+ */ +extern pgd_t early_level4_pgt[PTRS_PER_PGD]; +extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD]; +static unsigned int __initdata next_early_pgt = 2; +pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX); + +/* Wipe all early page tables except for the kernel symbol map */ +static void __init reset_early_page_tables(void) +{ + unsigned long i; + + for (i = 0; i < PTRS_PER_PGD-1; i++) + early_level4_pgt[i].pgd = 0; + + next_early_pgt = 0; + + write_cr3(__pa(early_level4_pgt)); +} + +/* Create a new PMD entry */ +int __init early_make_pgtable(unsigned long address) +{ + unsigned long physaddr = address - __PAGE_OFFSET; + unsigned long i; + pgdval_t pgd, *pgd_p; + pudval_t pud, *pud_p; + pmdval_t pmd, *pmd_p; + + /* Invalid address or early pgt is done ? */ + if (physaddr >= MAXMEM || read_cr3() != __pa(early_level4_pgt)) + return -1; + +again: + pgd_p = &early_level4_pgt[pgd_index(address)].pgd; + pgd = *pgd_p; + + /* + * The use of __START_KERNEL_map rather than __PAGE_OFFSET here is + * critical -- __PAGE_OFFSET would point us back into the dynamic + * range and we might end up looping forever... + */ + if (pgd) + pud_p = (pudval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base); + else { + if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) { + reset_early_page_tables(); + goto again; + } + + pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++]; + for (i = 0; i < PTRS_PER_PUD; i++) + pud_p[i] = 0; + *pgd_p = (pgdval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; + } + pud_p += pud_index(address); + pud = *pud_p; + + if (pud) + pmd_p = (pmdval_t *)((pud & PTE_PFN_MASK) + __START_KERNEL_map - phys_base); + else { + if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) { + reset_early_page_tables(); + goto again; + } + + pmd_p = (pmdval_t *)early_dynamic_pgts[next_early_pgt++]; + for (i = 0; i < PTRS_PER_PMD; i++) + pmd_p[i] = 0; + *pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; + } + pmd = (physaddr & PMD_MASK) + early_pmd_flags; + pmd_p[pmd_index(address)] = pmd; + + return 0; +} + +/* Don't add a printk in there. printk relies on the PDA which is not initialized + yet. */ +static void __init clear_bss(void) +{ + memset(__bss_start, 0, + (unsigned long) __bss_stop - (unsigned long) __bss_start); +} + +static unsigned long get_cmd_line_ptr(void) +{ + unsigned long cmd_line_ptr = boot_params.hdr.cmd_line_ptr; + + cmd_line_ptr |= (u64)boot_params.ext_cmd_line_ptr << 32; + + return cmd_line_ptr; +} + +static void __init copy_bootdata(char *real_mode_data) +{ + char * command_line; + unsigned long cmd_line_ptr; + + memcpy(&boot_params, real_mode_data, sizeof boot_params); + sanitize_boot_params(&boot_params); + cmd_line_ptr = get_cmd_line_ptr(); + if (cmd_line_ptr) { + command_line = __va(cmd_line_ptr); + memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE); + } +} + +asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) +{ + int i; + + /* + * Build-time sanity checks on the kernel image and module + * area mappings. 
(these are purely build-time and produce no code) + */ + BUILD_BUG_ON(MODULES_VADDR < __START_KERNEL_map); + BUILD_BUG_ON(MODULES_VADDR - __START_KERNEL_map < KERNEL_IMAGE_SIZE); + BUILD_BUG_ON(MODULES_LEN + KERNEL_IMAGE_SIZE > 2*PUD_SIZE); + BUILD_BUG_ON((__START_KERNEL_map & ~PMD_MASK) != 0); + BUILD_BUG_ON((MODULES_VADDR & ~PMD_MASK) != 0); + BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL)); + BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) == + (__START_KERNEL & PGDIR_MASK))); + BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END); + + /* Kill off the identity-map trampoline */ + reset_early_page_tables(); + + /* clear bss before set_intr_gate with early_idt_handler */ + clear_bss(); + + for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) + set_intr_gate(i, early_idt_handlers[i]); + load_idt((const struct desc_ptr *)&idt_descr); + + copy_bootdata(__va(real_mode_data)); + + /* + * Load microcode early on BSP. + */ + load_ucode_bsp(); + + if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG) + early_printk("Kernel alive\n"); + + clear_page(init_level4_pgt); + /* set init_level4_pgt kernel high mapping*/ + init_level4_pgt[511] = early_level4_pgt[511]; + + x86_64_start_reservations(real_mode_data); +} + +void __init x86_64_start_reservations(char *real_mode_data) +{ + /* version is always not zero if it is copied */ + if (!boot_params.hdr.version) + copy_bootdata(__va(real_mode_data)); + + reserve_ebda_region(); + + start_kernel(); +} diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S new file mode 100644 index 00000000000..f36bd42d6f0 --- /dev/null +++ b/arch/x86/kernel/head_32.S @@ -0,0 +1,760 @@ +/* + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Enhanced CPU detection and feature setting code by Mike Jagdis + * and Martin Mares, November 1997. + */ + +.text +#include <linux/threads.h> +#include <linux/init.h> +#include <linux/linkage.h> +#include <asm/segment.h> +#include <asm/page_types.h> +#include <asm/pgtable_types.h> +#include <asm/cache.h> +#include <asm/thread_info.h> +#include <asm/asm-offsets.h> +#include <asm/setup.h> +#include <asm/processor-flags.h> +#include <asm/msr-index.h> +#include <asm/cpufeature.h> +#include <asm/percpu.h> +#include <asm/nops.h> + +/* Physical address */ +#define pa(X) ((X) - __PAGE_OFFSET) + +/* + * References to members of the new_cpu_data structure. + */ + +#define X86 new_cpu_data+CPUINFO_x86 +#define X86_VENDOR new_cpu_data+CPUINFO_x86_vendor +#define X86_MODEL new_cpu_data+CPUINFO_x86_model +#define X86_MASK new_cpu_data+CPUINFO_x86_mask +#define X86_HARD_MATH new_cpu_data+CPUINFO_hard_math +#define X86_CPUID new_cpu_data+CPUINFO_cpuid_level +#define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability +#define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id + +/* + * This is how much memory in addition to the memory covered up to + * and including _end we need mapped initially. + * We need: + * (KERNEL_IMAGE_SIZE/4096) / 1024 pages (worst case, non PAE) + * (KERNEL_IMAGE_SIZE/4096) / 512 + 4 pages (worst case for PAE) + * + * Modulo rounding, each megabyte assigned here requires a kilobyte of + * memory, which is currently unreclaimed. + * + * This should be a multiple of a page. 
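+ *
+ * Worked example, taking KERNEL_IMAGE_SIZE = 512 MB (the size of
+ * the kernel mapping used on 64-bit): non-PAE needs
+ * (512M/4096)/1024 = 128 page-table pages (512 KiB), while PAE
+ * needs (512M/4096)/512 + 4 = 260 pages (~1 MiB).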
+ * + * KERNEL_IMAGE_SIZE should be greater than pa(_end) + * and small than max_low_pfn, otherwise will waste some page table entries + */ + +#if PTRS_PER_PMD > 1 +#define PAGE_TABLE_SIZE(pages) (((pages) / PTRS_PER_PMD) + PTRS_PER_PGD) +#else +#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD) +#endif + +/* Number of possible pages in the lowmem region */ +LOWMEM_PAGES = (((1<<32) - __PAGE_OFFSET) >> PAGE_SHIFT) + +/* Enough space to fit pagetables for the low memory linear map */ +MAPPING_BEYOND_END = PAGE_TABLE_SIZE(LOWMEM_PAGES) << PAGE_SHIFT + +/* + * Worst-case size of the kernel mapping we need to make: + * a relocatable kernel can live anywhere in lowmem, so we need to be able + * to map all of lowmem. + */ +KERNEL_PAGES = LOWMEM_PAGES + +INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE +RESERVE_BRK(pagetables, INIT_MAP_SIZE) + +/* + * 32-bit kernel entrypoint; only used by the boot CPU. On entry, + * %esi points to the real-mode code as a 32-bit pointer. + * CS and DS must be 4 GB flat segments, but we don't depend on + * any particular GDT layout, because we load our own as soon as we + * can. + */ +__HEAD +ENTRY(startup_32) + movl pa(stack_start),%ecx + + /* test KEEP_SEGMENTS flag to see if the bootloader is asking + us to not reload segments */ + testb $(1<<6), BP_loadflags(%esi) + jnz 2f + +/* + * Set segments to known values. + */ + lgdt pa(boot_gdt_descr) + movl $(__BOOT_DS),%eax + movl %eax,%ds + movl %eax,%es + movl %eax,%fs + movl %eax,%gs + movl %eax,%ss +2: + leal -__PAGE_OFFSET(%ecx),%esp + +/* + * Clear BSS first so that there are no surprises... + */ + cld + xorl %eax,%eax + movl $pa(__bss_start),%edi + movl $pa(__bss_stop),%ecx + subl %edi,%ecx + shrl $2,%ecx + rep ; stosl +/* + * Copy bootup parameters out of the way. + * Note: %esi still has the pointer to the real-mode data. + * With the kexec as boot loader, parameter segment might be loaded beyond + * kernel image and might not even be addressable by early boot page tables. + * (kexec on panic case). Hence copy out the parameters before initializing + * page tables. + */ + movl $pa(boot_params),%edi + movl $(PARAM_SIZE/4),%ecx + cld + rep + movsl + movl pa(boot_params) + NEW_CL_POINTER,%esi + andl %esi,%esi + jz 1f # No command line + movl $pa(boot_command_line),%edi + movl $(COMMAND_LINE_SIZE/4),%ecx + rep + movsl +1: + +#ifdef CONFIG_OLPC + /* save OFW's pgdir table for later use when calling into OFW */ + movl %cr3, %eax + movl %eax, pa(olpc_ofw_pgd) +#endif + +#ifdef CONFIG_MICROCODE_EARLY + /* Early load ucode on BSP. */ + call load_ucode_bsp +#endif + +/* + * Initialize page tables. This creates a PDE and a set of page + * tables, which are located immediately beyond __brk_base. The variable + * _brk_end is set up to point to the first "safe" location. + * Mappings are created both at virtual address 0 (identity mapping) + * and PAGE_OFFSET for up to _end. + */ +#ifdef CONFIG_X86_PAE + + /* + * In PAE mode initial_page_table is statically defined to contain + * enough entries to cover the VMSPLIT option (that is the top 1, 2 or 3 + * entries). The identity mapping is handled by pointing two PGD entries + * to the first kernel PMD. + * + * Note the upper half of each PMD or PTE are always zero at this stage. 
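+ * The fill loop below therefore interleaves a stosl of the low
+ * word (pfn plus attributes) with a stosl of %ebx, which is
+ * pinned to zero, for the high word.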
+ */ + +#define KPMDS (((-__PAGE_OFFSET) >> 30) & 3) /* Number of kernel PMDs */ + + xorl %ebx,%ebx /* %ebx is kept at zero */ + + movl $pa(__brk_base), %edi + movl $pa(initial_pg_pmd), %edx + movl $PTE_IDENT_ATTR, %eax +10: + leal PDE_IDENT_ATTR(%edi),%ecx /* Create PMD entry */ + movl %ecx,(%edx) /* Store PMD entry */ + /* Upper half already zero */ + addl $8,%edx + movl $512,%ecx +11: + stosl + xchgl %eax,%ebx + stosl + xchgl %eax,%ebx + addl $0x1000,%eax + loop 11b + + /* + * End condition: we must map up to the end + MAPPING_BEYOND_END. + */ + movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp + cmpl %ebp,%eax + jb 10b +1: + addl $__PAGE_OFFSET, %edi + movl %edi, pa(_brk_end) + shrl $12, %eax + movl %eax, pa(max_pfn_mapped) + + /* Do early initialization of the fixmap area */ + movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax + movl %eax,pa(initial_pg_pmd+0x1000*KPMDS-8) +#else /* Not PAE */ + +page_pde_offset = (__PAGE_OFFSET >> 20); + + movl $pa(__brk_base), %edi + movl $pa(initial_page_table), %edx + movl $PTE_IDENT_ATTR, %eax +10: + leal PDE_IDENT_ATTR(%edi),%ecx /* Create PDE entry */ + movl %ecx,(%edx) /* Store identity PDE entry */ + movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */ + addl $4,%edx + movl $1024, %ecx +11: + stosl + addl $0x1000,%eax + loop 11b + /* + * End condition: we must map up to the end + MAPPING_BEYOND_END. + */ + movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp + cmpl %ebp,%eax + jb 10b + addl $__PAGE_OFFSET, %edi + movl %edi, pa(_brk_end) + shrl $12, %eax + movl %eax, pa(max_pfn_mapped) + + /* Do early initialization of the fixmap area */ + movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax + movl %eax,pa(initial_page_table+0xffc) +#endif + +#ifdef CONFIG_PARAVIRT + /* This is can only trip for a broken bootloader... */ + cmpw $0x207, pa(boot_params + BP_version) + jb default_entry + + /* Paravirt-compatible boot parameters. Look to see what architecture + we're booting under. */ + movl pa(boot_params + BP_hardware_subarch), %eax + cmpl $num_subarch_entries, %eax + jae bad_subarch + + movl pa(subarch_entries)(,%eax,4), %eax + subl $__PAGE_OFFSET, %eax + jmp *%eax + +bad_subarch: +WEAK(lguest_entry) +WEAK(xen_entry) + /* Unknown implementation; there's really + nothing we can do at this point. */ + ud2a + + __INITDATA + +subarch_entries: + .long default_entry /* normal x86/PC */ + .long lguest_entry /* lguest hypervisor */ + .long xen_entry /* Xen hypervisor */ + .long default_entry /* Moorestown MID */ +num_subarch_entries = (. - subarch_entries) / 4 +.previous +#else + jmp default_entry +#endif /* CONFIG_PARAVIRT */ + +#ifdef CONFIG_HOTPLUG_CPU +/* + * Boot CPU0 entry point. It's called from play_dead(). Everything has been set + * up already except stack. We just set up stack here. Then call + * start_secondary(). + */ +ENTRY(start_cpu0) + movl stack_start, %ecx + movl %ecx, %esp + jmp *(initial_code) +ENDPROC(start_cpu0) +#endif + +/* + * Non-boot CPU entry point; entered from trampoline.S + * We can't lgdt here, because lgdt itself uses a data segment, but + * we know the trampoline has already loaded the boot_gdt for us. + * + * If cpu hotplug is not supported then this code can go in init section + * which will be freed later + */ +ENTRY(startup_32_smp) + cld + movl $(__BOOT_DS),%eax + movl %eax,%ds + movl %eax,%es + movl %eax,%fs + movl %eax,%gs + movl pa(stack_start),%ecx + movl %eax,%ss + leal -__PAGE_OFFSET(%ecx),%esp + +#ifdef CONFIG_MICROCODE_EARLY + /* Early load ucode on AP. 
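+ * (The BSP did the same via load_ucode_bsp earlier in
+ * startup_32; each AP loads its own copy here.)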
*/ + call load_ucode_ap +#endif + + +default_entry: +#define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \ + X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \ + X86_CR0_PG) + movl $(CR0_STATE & ~X86_CR0_PG),%eax + movl %eax,%cr0 + +/* + * We want to start out with EFLAGS unambiguously cleared. Some BIOSes leave + * bits like NT set. This would confuse the debugger if this code is traced. So + * initialize them properly now before switching to protected mode. That means + * DF in particular (even though we have cleared it earlier after copying the + * command line) because GCC expects it. + */ + pushl $0 + popfl + +/* + * New page tables may be in 4Mbyte page mode and may be using the global pages. + * + * NOTE! If we are on a 486 we may have no cr4 at all! Specifically, cr4 exists + * if and only if CPUID exists and has flags other than the FPU flag set. + */ + movl $-1,pa(X86_CPUID) # preset CPUID level + movl $X86_EFLAGS_ID,%ecx + pushl %ecx + popfl # set EFLAGS=ID + pushfl + popl %eax # get EFLAGS + testl $X86_EFLAGS_ID,%eax # did EFLAGS.ID remained set? + jz enable_paging # hw disallowed setting of ID bit + # which means no CPUID and no CR4 + + xorl %eax,%eax + cpuid + movl %eax,pa(X86_CPUID) # save largest std CPUID function + + movl $1,%eax + cpuid + andl $~1,%edx # Ignore CPUID.FPU + jz enable_paging # No flags or only CPUID.FPU = no CR4 + + movl pa(mmu_cr4_features),%eax + movl %eax,%cr4 + + testb $X86_CR4_PAE, %al # check if PAE is enabled + jz enable_paging + + /* Check if extended functions are implemented */ + movl $0x80000000, %eax + cpuid + /* Value must be in the range 0x80000001 to 0x8000ffff */ + subl $0x80000001, %eax + cmpl $(0x8000ffff-0x80000001), %eax + ja enable_paging + + /* Clear bogus XD_DISABLE bits */ + call verify_cpu + + mov $0x80000001, %eax + cpuid + /* Execute Disable bit supported? */ + btl $(X86_FEATURE_NX & 31), %edx + jnc enable_paging + + /* Setup EFER (Extended Feature Enable Register) */ + movl $MSR_EFER, %ecx + rdmsr + + btsl $_EFER_NX, %eax + /* Make changes effective */ + wrmsr + +enable_paging: + +/* + * Enable paging + */ + movl $pa(initial_page_table), %eax + movl %eax,%cr3 /* set the page table pointer.. */ + movl $CR0_STATE,%eax + movl %eax,%cr0 /* ..and set paging (PG) bit */ + ljmp $__BOOT_CS,$1f /* Clear prefetch and normalize %eip */ +1: + /* Shift the stack pointer to a virtual address */ + addl $__PAGE_OFFSET, %esp + +/* + * start system 32-bit setup. We need to re-do some of the things done + * in 16-bit mode for the "real" operations. + */ + movl setup_once_ref,%eax + andl %eax,%eax + jz 1f # Did we do this already? + call *%eax +1: + +/* + * Check if it is 486 + */ + movb $4,X86 # at least 486 + cmpl $-1,X86_CPUID + je is486 + + /* get vendor info */ + xorl %eax,%eax # call CPUID with 0 -> return vendor ID + cpuid + movl %eax,X86_CPUID # save CPUID level + movl %ebx,X86_VENDOR_ID # lo 4 chars + movl %edx,X86_VENDOR_ID+4 # next 4 chars + movl %ecx,X86_VENDOR_ID+8 # last 4 chars + + orl %eax,%eax # do we have processor info as well? 
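+ # (CPUID leaf 0 returned the highest standard leaf in %eax;
+ # zero means leaf 1 -- and the family/model/stepping decode
+ # below -- is unavailable, so stay with the plain 486 defaults)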
+ je is486 + + movl $1,%eax # Use the CPUID instruction to get CPU type + cpuid + movb %al,%cl # save reg for future use + andb $0x0f,%ah # mask processor family + movb %ah,X86 + andb $0xf0,%al # mask model + shrb $4,%al + movb %al,X86_MODEL + andb $0x0f,%cl # mask mask revision + movb %cl,X86_MASK + movl %edx,X86_CAPABILITY + +is486: + movl $0x50022,%ecx # set AM, WP, NE and MP + movl %cr0,%eax + andl $0x80000011,%eax # Save PG,PE,ET + orl %ecx,%eax + movl %eax,%cr0 + + lgdt early_gdt_descr + lidt idt_descr + ljmp $(__KERNEL_CS),$1f +1: movl $(__KERNEL_DS),%eax # reload all the segment registers + movl %eax,%ss # after changing gdt. + + movl $(__USER_DS),%eax # DS/ES contains default USER segment + movl %eax,%ds + movl %eax,%es + + movl $(__KERNEL_PERCPU), %eax + movl %eax,%fs # set this cpu's percpu + + movl $(__KERNEL_STACK_CANARY),%eax + movl %eax,%gs + + xorl %eax,%eax # Clear LDT + lldt %ax + + pushl $0 # fake return address for unwinder + jmp *(initial_code) + +#include "verify_cpu.S" + +/* + * setup_once + * + * The setup work we only want to run on the BSP. + * + * Warning: %esi is live across this function. + */ +__INIT +setup_once: + /* + * Set up a idt with 256 entries pointing to ignore_int, + * interrupt gates. It doesn't actually load idt - that needs + * to be done on each CPU. Interrupts are enabled elsewhere, + * when we can be relatively sure everything is ok. + */ + + movl $idt_table,%edi + movl $early_idt_handlers,%eax + movl $NUM_EXCEPTION_VECTORS,%ecx +1: + movl %eax,(%edi) + movl %eax,4(%edi) + /* interrupt gate, dpl=0, present */ + movl $(0x8E000000 + __KERNEL_CS),2(%edi) + addl $9,%eax + addl $8,%edi + loop 1b + + movl $256 - NUM_EXCEPTION_VECTORS,%ecx + movl $ignore_int,%edx + movl $(__KERNEL_CS << 16),%eax + movw %dx,%ax /* selector = 0x0010 = cs */ + movw $0x8E00,%dx /* interrupt gate - dpl=0, present */ +2: + movl %eax,(%edi) + movl %edx,4(%edi) + addl $8,%edi + loop 2b + +#ifdef CONFIG_CC_STACKPROTECTOR + /* + * Configure the stack canary. The linker can't handle this by + * relocation. Manually set base address in stack canary + * segment descriptor. 
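+ * (A GDT descriptor scatters a 32-bit base address: bits 15:0 at
+ * byte offset 2, bits 23:16 at offset 4 and bits 31:24 at offset
+ * 7, hence the three separate stores below.)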
+ */ + movl $gdt_page,%eax + movl $stack_canary,%ecx + movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax) + shrl $16, %ecx + movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax) + movb %ch, 8 * GDT_ENTRY_STACK_CANARY + 7(%eax) +#endif + + andl $0,setup_once_ref /* Once is enough, thanks */ + ret + +ENTRY(early_idt_handlers) + # 36(%esp) %eflags + # 32(%esp) %cs + # 28(%esp) %eip + # 24(%rsp) error code + i = 0 + .rept NUM_EXCEPTION_VECTORS + .if (EXCEPTION_ERRCODE_MASK >> i) & 1 + ASM_NOP2 + .else + pushl $0 # Dummy error code, to make stack frame uniform + .endif + pushl $i # 20(%esp) Vector number + jmp early_idt_handler + i = i + 1 + .endr +ENDPROC(early_idt_handlers) + + /* This is global to keep gas from relaxing the jumps */ +ENTRY(early_idt_handler) + cld + + cmpl $2,(%esp) # X86_TRAP_NMI + je is_nmi # Ignore NMI + + cmpl $2,%ss:early_recursion_flag + je hlt_loop + incl %ss:early_recursion_flag + + push %eax # 16(%esp) + push %ecx # 12(%esp) + push %edx # 8(%esp) + push %ds # 4(%esp) + push %es # 0(%esp) + movl $(__KERNEL_DS),%eax + movl %eax,%ds + movl %eax,%es + + cmpl $(__KERNEL_CS),32(%esp) + jne 10f + + leal 28(%esp),%eax # Pointer to %eip + call early_fixup_exception + andl %eax,%eax + jnz ex_entry /* found an exception entry */ + +10: +#ifdef CONFIG_PRINTK + xorl %eax,%eax + movw %ax,2(%esp) /* clean up the segment values on some cpus */ + movw %ax,6(%esp) + movw %ax,34(%esp) + leal 40(%esp),%eax + pushl %eax /* %esp before the exception */ + pushl %ebx + pushl %ebp + pushl %esi + pushl %edi + movl %cr2,%eax + pushl %eax + pushl (20+6*4)(%esp) /* trapno */ + pushl $fault_msg + call printk +#endif + call dump_stack +hlt_loop: + hlt + jmp hlt_loop + +ex_entry: + pop %es + pop %ds + pop %edx + pop %ecx + pop %eax + decl %ss:early_recursion_flag +is_nmi: + addl $8,%esp /* drop vector number and error code */ + iret +ENDPROC(early_idt_handler) + +/* This is the default interrupt "handler" :-) */ + ALIGN +ignore_int: + cld +#ifdef CONFIG_PRINTK + pushl %eax + pushl %ecx + pushl %edx + pushl %es + pushl %ds + movl $(__KERNEL_DS),%eax + movl %eax,%ds + movl %eax,%es + cmpl $2,early_recursion_flag + je hlt_loop + incl early_recursion_flag + pushl 16(%esp) + pushl 24(%esp) + pushl 32(%esp) + pushl 40(%esp) + pushl $int_msg + call printk + + call dump_stack + + addl $(5*4),%esp + popl %ds + popl %es + popl %edx + popl %ecx + popl %eax +#endif + iret +ENDPROC(ignore_int) +__INITDATA + .align 4 +early_recursion_flag: + .long 0 + +__REFDATA + .align 4 +ENTRY(initial_code) + .long i386_start_kernel +ENTRY(setup_once_ref) + .long setup_once + +/* + * BSS section + */ +__PAGE_ALIGNED_BSS + .align PAGE_SIZE +#ifdef CONFIG_X86_PAE +initial_pg_pmd: + .fill 1024*KPMDS,4,0 +#else +ENTRY(initial_page_table) + .fill 1024,4,0 +#endif +initial_pg_fixmap: + .fill 1024,4,0 +ENTRY(empty_zero_page) + .fill 4096,1,0 +ENTRY(swapper_pg_dir) + .fill 1024,4,0 + +/* + * This starts the data section. + */ +#ifdef CONFIG_X86_PAE +__PAGE_ALIGNED_DATA + /* Page-aligned for the benefit of paravirt? 
*/ + .align PAGE_SIZE +ENTRY(initial_page_table) + .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */ +# if KPMDS == 3 + .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 + .long pa(initial_pg_pmd+PGD_IDENT_ATTR+0x1000),0 + .long pa(initial_pg_pmd+PGD_IDENT_ATTR+0x2000),0 +# elif KPMDS == 2 + .long 0,0 + .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 + .long pa(initial_pg_pmd+PGD_IDENT_ATTR+0x1000),0 +# elif KPMDS == 1 + .long 0,0 + .long 0,0 + .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 +# else +# error "Kernel PMDs should be 1, 2 or 3" +# endif + .align PAGE_SIZE /* needs to be page-sized too */ +#endif + +.data +.balign 4 +ENTRY(stack_start) + .long init_thread_union+THREAD_SIZE + +__INITRODATA +int_msg: + .asciz "Unknown interrupt or fault at: %p %p %p\n" + +fault_msg: +/* fault info: */ + .ascii "BUG: Int %d: CR2 %p\n" +/* regs pushed in early_idt_handler: */ + .ascii " EDI %p ESI %p EBP %p EBX %p\n" + .ascii " ESP %p ES %p DS %p\n" + .ascii " EDX %p ECX %p EAX %p\n" +/* fault frame: */ + .ascii " vec %p err %p EIP %p CS %p flg %p\n" + .ascii "Stack: %p %p %p %p %p %p %p %p\n" + .ascii " %p %p %p %p %p %p %p %p\n" + .asciz " %p %p %p %p %p %p %p %p\n" + +#include "../../x86/xen/xen-head.S" + +/* + * The IDT and GDT 'descriptors' are a strange 48-bit object + * only used by the lidt and lgdt instructions. They are not + * like usual segment descriptors - they consist of a 16-bit + * segment size, and 32-bit linear address value: + */ + + .data +.globl boot_gdt_descr +.globl idt_descr + + ALIGN +# early boot GDT descriptor (must use 1:1 address mapping) + .word 0 # 32 bit align gdt_desc.address +boot_gdt_descr: + .word __BOOT_DS+7 + .long boot_gdt - __PAGE_OFFSET + + .word 0 # 32-bit align idt_desc.address +idt_descr: + .word IDT_ENTRIES*8-1 # idt contains 256 entries + .long idt_table + +# boot GDT descriptor (later on used by CPU#0): + .word 0 # 32 bit align gdt_desc.address +ENTRY(early_gdt_descr) + .word GDT_ENTRIES*8-1 + .long gdt_page /* Overwritten for secondary CPUs */ + +/* + * The boot_gdt must mirror the equivalent in setup.S and is + * used only for booting. + */ + .align L1_CACHE_BYTES +ENTRY(boot_gdt) + .fill GDT_ENTRY_BOOT_CS,8,0 + .quad 0x00cf9a000000ffff /* kernel 4GB code at 0x00000000 */ + .quad 0x00cf92000000ffff /* kernel 4GB data at 0x00000000 */ diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S new file mode 100644 index 00000000000..a468c0a65c4 --- /dev/null +++ b/arch/x86/kernel/head_64.S @@ -0,0 +1,521 @@ +/* + * linux/arch/x86_64/kernel/head.S -- start in 32bit and switch to 64bit + * + * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE + * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> + * Copyright (C) 2000 Karsten Keil <kkeil@suse.de> + * Copyright (C) 2001,2002 Andi Kleen <ak@suse.de> + * Copyright (C) 2005 Eric Biederman <ebiederm@xmission.com> + */ + + +#include <linux/linkage.h> +#include <linux/threads.h> +#include <linux/init.h> +#include <asm/segment.h> +#include <asm/pgtable.h> +#include <asm/page.h> +#include <asm/msr.h> +#include <asm/cache.h> +#include <asm/processor-flags.h> +#include <asm/percpu.h> +#include <asm/nops.h> + +#ifdef CONFIG_PARAVIRT +#include <asm/asm-offsets.h> +#include <asm/paravirt.h> +#define GET_CR2_INTO(reg) GET_CR2_INTO_RAX ; movq %rax, reg +#else +#define GET_CR2_INTO(reg) movq %cr2, reg +#define INTERRUPT_RETURN iretq +#endif + +/* we are not able to switch in one step to the final KERNEL ADDRESS SPACE + * because we need identity-mapped pages. 
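+ * Instead the switch is staged: startup_64 below runs from the
+ * identity mapping, patches the page tables for wherever the
+ * kernel was actually loaded, and only then jumps to the high
+ * __START_KERNEL_map alias of the kernel text.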
+ * + */ + +#define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) + +L4_PAGE_OFFSET = pgd_index(__PAGE_OFFSET) +L3_PAGE_OFFSET = pud_index(__PAGE_OFFSET) +L4_START_KERNEL = pgd_index(__START_KERNEL_map) +L3_START_KERNEL = pud_index(__START_KERNEL_map) + + .text + __HEAD + .code64 + .globl startup_64 +startup_64: + /* + * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0, + * and someone has loaded an identity mapped page table + * for us. These identity mapped page tables map all of the + * kernel pages and possibly all of memory. + * + * %rsi holds a physical pointer to real_mode_data. + * + * We come here either directly from a 64bit bootloader, or from + * arch/x86_64/boot/compressed/head.S. + * + * We only come here initially at boot nothing else comes here. + * + * Since we may be loaded at an address different from what we were + * compiled to run at we first fixup the physical addresses in our page + * tables and then reload them. + */ + + /* + * Compute the delta between the address I am compiled to run at and the + * address I am actually running at. + */ + leaq _text(%rip), %rbp + subq $_text - __START_KERNEL_map, %rbp + + /* Is the address not 2M aligned? */ + movq %rbp, %rax + andl $~PMD_PAGE_MASK, %eax + testl %eax, %eax + jnz bad_address + + /* + * Is the address too large? + */ + leaq _text(%rip), %rax + shrq $MAX_PHYSMEM_BITS, %rax + jnz bad_address + + /* + * Fixup the physical addresses in the page table + */ + addq %rbp, early_level4_pgt + (L4_START_KERNEL*8)(%rip) + + addq %rbp, level3_kernel_pgt + (510*8)(%rip) + addq %rbp, level3_kernel_pgt + (511*8)(%rip) + + addq %rbp, level2_fixmap_pgt + (506*8)(%rip) + + /* + * Set up the identity mapping for the switchover. These + * entries should *NOT* have the global bit set! This also + * creates a bunch of nonsense entries but that is fine -- + * it avoids problems around wraparound. + */ + leaq _text(%rip), %rdi + leaq early_level4_pgt(%rip), %rbx + + movq %rdi, %rax + shrq $PGDIR_SHIFT, %rax + + leaq (4096 + _KERNPG_TABLE)(%rbx), %rdx + movq %rdx, 0(%rbx,%rax,8) + movq %rdx, 8(%rbx,%rax,8) + + addq $4096, %rdx + movq %rdi, %rax + shrq $PUD_SHIFT, %rax + andl $(PTRS_PER_PUD-1), %eax + movq %rdx, 4096(%rbx,%rax,8) + incl %eax + andl $(PTRS_PER_PUD-1), %eax + movq %rdx, 4096(%rbx,%rax,8) + + addq $8192, %rbx + movq %rdi, %rax + shrq $PMD_SHIFT, %rdi + addq $(__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL), %rax + leaq (_end - 1)(%rip), %rcx + shrq $PMD_SHIFT, %rcx + subq %rdi, %rcx + incl %ecx + +1: + andq $(PTRS_PER_PMD - 1), %rdi + movq %rax, (%rbx,%rdi,8) + incq %rdi + addq $PMD_SIZE, %rax + decl %ecx + jnz 1b + + /* + * Fixup the kernel text+data virtual addresses. Note that + * we might write invalid pmds, when the kernel is relocated + * cleanup_highmap() fixes this up along with the mappings + * beyond _end. + */ + leaq level2_kernel_pgt(%rip), %rdi + leaq 4096(%rdi), %r8 + /* See if it is a valid page table entry */ +1: testq $1, 0(%rdi) + jz 2f + addq %rbp, 0(%rdi) + /* Go to the next page */ +2: addq $8, %rdi + cmp %r8, %rdi + jne 1b + + /* Fixup phys_base */ + addq %rbp, phys_base(%rip) + + movq $(early_level4_pgt - __START_KERNEL_map), %rax + jmp 1f +ENTRY(secondary_startup_64) + /* + * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0, + * and someone has loaded a mapped page table. + * + * %rsi holds a physical pointer to real_mode_data. + * + * We come here either from startup_64 (using physical addresses) + * or from trampoline.S (using virtual addresses). 
+ * + * Using virtual addresses from trampoline.S removes the need + * to have any identity mapped pages in the kernel page table + * after the boot processor executes this code. + */ + + movq $(init_level4_pgt - __START_KERNEL_map), %rax +1: + + /* Enable PAE mode and PGE */ + movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx + movq %rcx, %cr4 + + /* Setup early boot stage 4 level pagetables. */ + addq phys_base(%rip), %rax + movq %rax, %cr3 + + /* Ensure I am executing from virtual addresses */ + movq $1f, %rax + jmp *%rax +1: + + /* Check if nx is implemented */ + movl $0x80000001, %eax + cpuid + movl %edx,%edi + + /* Setup EFER (Extended Feature Enable Register) */ + movl $MSR_EFER, %ecx + rdmsr + btsl $_EFER_SCE, %eax /* Enable System Call */ + btl $20,%edi /* No Execute supported? */ + jnc 1f + btsl $_EFER_NX, %eax + btsq $_PAGE_BIT_NX,early_pmd_flags(%rip) +1: wrmsr /* Make changes effective */ + + /* Setup cr0 */ +#define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \ + X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \ + X86_CR0_PG) + movl $CR0_STATE, %eax + /* Make changes effective */ + movq %rax, %cr0 + + /* Setup a boot time stack */ + movq stack_start(%rip), %rsp + + /* zero EFLAGS after setting rsp */ + pushq $0 + popfq + + /* + * We must switch to a new descriptor in kernel space for the GDT + * because soon the kernel won't have access anymore to the userspace + * addresses where we're currently running on. We have to do that here + * because in 32bit we couldn't load a 64bit linear address. + */ + lgdt early_gdt_descr(%rip) + + /* set up data segments */ + xorl %eax,%eax + movl %eax,%ds + movl %eax,%ss + movl %eax,%es + + /* + * We don't really need to load %fs or %gs, but load them anyway + * to kill any stale realmode selectors. This allows execution + * under VT hardware. + */ + movl %eax,%fs + movl %eax,%gs + + /* Set up %gs. + * + * The base of %gs always points to the bottom of the irqstack + * union. If the stack protector canary is enabled, it is + * located at %gs:40. Note that, on SMP, the boot cpu uses + * init data section till per cpu areas are set up. + */ + movl $MSR_GS_BASE,%ecx + movl initial_gs(%rip),%eax + movl initial_gs+4(%rip),%edx + wrmsr + + /* rsi is pointer to real mode structure with interesting info. + pass it to C */ + movq %rsi, %rdi + + /* Finally jump to run C code and to be on real kernel address + * Since we are running on identity-mapped space we have to jump + * to the full 64bit address, this is only possible as indirect + * jump. In addition we need to ensure %cs is set so we make this + * a far return. + * + * Note: do not change to far jump indirect with 64bit offset. + * + * AMD does not support far jump indirect with 64bit offset. + * AMD64 Architecture Programmer's Manual, Volume 3: states only + * JMP FAR mem16:16 FF /5 Far jump indirect, + * with the target specified by a far pointer in memory. + * JMP FAR mem16:32 FF /5 Far jump indirect, + * with the target specified by a far pointer in memory. + * + * Intel64 does support 64bit offset. + * Software Developer Manual Vol 2: states: + * FF /5 JMP m16:16 Jump far, absolute indirect, + * address given in m16:16 + * FF /5 JMP m16:32 Jump far, absolute indirect, + * address given in m16:32. + * REX.W + FF /5 JMP m16:64 Jump far, absolute indirect, + * address given in m16:64. 
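+ *
+ * Hence the push-%cs/push-target + lretq sequence below, which
+ * behaves identically on both vendors.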
+ */ + movq initial_code(%rip),%rax + pushq $0 # fake return address to stop unwinder + pushq $__KERNEL_CS # set correct cs + pushq %rax # target address in negative space + lretq + +#ifdef CONFIG_HOTPLUG_CPU +/* + * Boot CPU0 entry point. It's called from play_dead(). Everything has been set + * up already except stack. We just set up stack here. Then call + * start_secondary(). + */ +ENTRY(start_cpu0) + movq stack_start(%rip),%rsp + movq initial_code(%rip),%rax + pushq $0 # fake return address to stop unwinder + pushq $__KERNEL_CS # set correct cs + pushq %rax # target address in negative space + lretq +ENDPROC(start_cpu0) +#endif + + /* SMP bootup changes these two */ + __REFDATA + .balign 8 + GLOBAL(initial_code) + .quad x86_64_start_kernel + GLOBAL(initial_gs) + .quad INIT_PER_CPU_VAR(irq_stack_union) + + GLOBAL(stack_start) + .quad init_thread_union+THREAD_SIZE-8 + .word 0 + __FINITDATA + +bad_address: + jmp bad_address + + __INIT + .globl early_idt_handlers +early_idt_handlers: + # 104(%rsp) %rflags + # 96(%rsp) %cs + # 88(%rsp) %rip + # 80(%rsp) error code + i = 0 + .rept NUM_EXCEPTION_VECTORS + .if (EXCEPTION_ERRCODE_MASK >> i) & 1 + ASM_NOP2 + .else + pushq $0 # Dummy error code, to make stack frame uniform + .endif + pushq $i # 72(%rsp) Vector number + jmp early_idt_handler + i = i + 1 + .endr + +/* This is global to keep gas from relaxing the jumps */ +ENTRY(early_idt_handler) + cld + + cmpl $2,(%rsp) # X86_TRAP_NMI + je is_nmi # Ignore NMI + + cmpl $2,early_recursion_flag(%rip) + jz 1f + incl early_recursion_flag(%rip) + + pushq %rax # 64(%rsp) + pushq %rcx # 56(%rsp) + pushq %rdx # 48(%rsp) + pushq %rsi # 40(%rsp) + pushq %rdi # 32(%rsp) + pushq %r8 # 24(%rsp) + pushq %r9 # 16(%rsp) + pushq %r10 # 8(%rsp) + pushq %r11 # 0(%rsp) + + cmpl $__KERNEL_CS,96(%rsp) + jne 11f + + cmpl $14,72(%rsp) # Page fault? 
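+ # (vector 14 == X86_TRAP_PF; early page faults are expected
+ # here and are serviced by building page tables on the fly)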
+ jnz 10f + GET_CR2_INTO(%rdi) # can clobber any volatile register if pv + call early_make_pgtable + andl %eax,%eax + jz 20f # All good + +10: + leaq 88(%rsp),%rdi # Pointer to %rip + call early_fixup_exception + andl %eax,%eax + jnz 20f # Found an exception entry + +11: +#ifdef CONFIG_EARLY_PRINTK + GET_CR2_INTO(%r9) # can clobber any volatile register if pv + movl 80(%rsp),%r8d # error code + movl 72(%rsp),%esi # vector number + movl 96(%rsp),%edx # %cs + movq 88(%rsp),%rcx # %rip + xorl %eax,%eax + leaq early_idt_msg(%rip),%rdi + call early_printk + cmpl $2,early_recursion_flag(%rip) + jz 1f + call dump_stack +#ifdef CONFIG_KALLSYMS + leaq early_idt_ripmsg(%rip),%rdi + movq 40(%rsp),%rsi # %rip again + call __print_symbol +#endif +#endif /* EARLY_PRINTK */ +1: hlt + jmp 1b + +20: # Exception table entry found or page table generated + popq %r11 + popq %r10 + popq %r9 + popq %r8 + popq %rdi + popq %rsi + popq %rdx + popq %rcx + popq %rax + decl early_recursion_flag(%rip) +is_nmi: + addq $16,%rsp # drop vector number and error code + INTERRUPT_RETURN +ENDPROC(early_idt_handler) + + __INITDATA + + .balign 4 +early_recursion_flag: + .long 0 + +#ifdef CONFIG_EARLY_PRINTK +early_idt_msg: + .asciz "PANIC: early exception %02lx rip %lx:%lx error %lx cr2 %lx\n" +early_idt_ripmsg: + .asciz "RIP %s\n" +#endif /* CONFIG_EARLY_PRINTK */ + +#define NEXT_PAGE(name) \ + .balign PAGE_SIZE; \ +GLOBAL(name) + +/* Automate the creation of 1 to 1 mapping pmd entries */ +#define PMDS(START, PERM, COUNT) \ + i = 0 ; \ + .rept (COUNT) ; \ + .quad (START) + (i << PMD_SHIFT) + (PERM) ; \ + i = i + 1 ; \ + .endr + + __INITDATA +NEXT_PAGE(early_level4_pgt) + .fill 511,8,0 + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE + +NEXT_PAGE(early_dynamic_pgts) + .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0 + + .data + +#ifndef CONFIG_XEN +NEXT_PAGE(init_level4_pgt) + .fill 512,8,0 +#else +NEXT_PAGE(init_level4_pgt) + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .org init_level4_pgt + L4_PAGE_OFFSET*8, 0 + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .org init_level4_pgt + L4_START_KERNEL*8, 0 + /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE + +NEXT_PAGE(level3_ident_pgt) + .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .fill 511, 8, 0 +NEXT_PAGE(level2_ident_pgt) + /* Since I easily can, map the first 1G. + * Don't set NX because code runs from these pages. + */ + PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) +#endif + +NEXT_PAGE(level3_kernel_pgt) + .fill L3_START_KERNEL,8,0 + /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */ + .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE + .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE + +NEXT_PAGE(level2_kernel_pgt) + /* + * 512 MB kernel mapping. We spend a full page on this pagetable + * anyway. + * + * The kernel code+data+bss must not be bigger than that. + * + * (NOTE: at +512MB starts the module area, see MODULES_VADDR. + * If you want to increase this then increase MODULES_VADDR + * too.) 
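+ *
+ * (512 MB / PMD_SIZE = 512M/2M = 256 of the 512 PMD slots in
+ * this page are populated by the PMDS() expansion below.)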
+ */ + PMDS(0, __PAGE_KERNEL_LARGE_EXEC, + KERNEL_IMAGE_SIZE/PMD_SIZE) + +NEXT_PAGE(level2_fixmap_pgt) + .fill 506,8,0 + .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE + /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */ + .fill 5,8,0 + +NEXT_PAGE(level1_fixmap_pgt) + .fill 512,8,0 + +#undef PMDS + + .data + .align 16 + .globl early_gdt_descr +early_gdt_descr: + .word GDT_ENTRIES*8-1 +early_gdt_descr_base: + .quad INIT_PER_CPU_VAR(gdt_page) + +ENTRY(phys_base) + /* This must match the first entry in level2_kernel_pgt */ + .quad 0x0000000000000000 + +#include "../../x86/xen/xen-head.S" + + __PAGE_ALIGNED_BSS +NEXT_PAGE(empty_zero_page) + .skip PAGE_SIZE diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c new file mode 100644 index 00000000000..319bcb9372f --- /dev/null +++ b/arch/x86/kernel/hpet.c @@ -0,0 +1,1253 @@ +#include <linux/clocksource.h> +#include <linux/clockchips.h> +#include <linux/interrupt.h> +#include <linux/export.h> +#include <linux/delay.h> +#include <linux/errno.h> +#include <linux/i8253.h> +#include <linux/slab.h> +#include <linux/hpet.h> +#include <linux/init.h> +#include <linux/cpu.h> +#include <linux/pm.h> +#include <linux/io.h> + +#include <asm/fixmap.h> +#include <asm/hpet.h> +#include <asm/time.h> + +#define HPET_MASK CLOCKSOURCE_MASK(32) + +/* FSEC = 10^-15 + NSEC = 10^-9 */ +#define FSEC_PER_NSEC 1000000L + +#define HPET_DEV_USED_BIT 2 +#define HPET_DEV_USED (1 << HPET_DEV_USED_BIT) +#define HPET_DEV_VALID 0x8 +#define HPET_DEV_FSB_CAP 0x1000 +#define HPET_DEV_PERI_CAP 0x2000 + +#define HPET_MIN_CYCLES 128 +#define HPET_MIN_PROG_DELTA (HPET_MIN_CYCLES + (HPET_MIN_CYCLES >> 1)) + +/* + * HPET address is set in acpi/boot.c, when an ACPI entry exists + */ +unsigned long hpet_address; +u8 hpet_blockid; /* OS timer block num */ +u8 hpet_msi_disable; + +#ifdef CONFIG_PCI_MSI +static unsigned long hpet_num_timers; +#endif +static void __iomem *hpet_virt_address; + +struct hpet_dev { + struct clock_event_device evt; + unsigned int num; + int cpu; + unsigned int irq; + unsigned int flags; + char name[10]; +}; + +inline struct hpet_dev *EVT_TO_HPET_DEV(struct clock_event_device *evtdev) +{ + return container_of(evtdev, struct hpet_dev, evt); +} + +inline unsigned int hpet_readl(unsigned int a) +{ + return readl(hpet_virt_address + a); +} + +static inline void hpet_writel(unsigned int d, unsigned int a) +{ + writel(d, hpet_virt_address + a); +} + +#ifdef CONFIG_X86_64 +#include <asm/pgtable.h> +#endif + +static inline void hpet_set_mapping(void) +{ + hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE); +} + +static inline void hpet_clear_mapping(void) +{ + iounmap(hpet_virt_address); + hpet_virt_address = NULL; +} + +/* + * HPET command line enable / disable + */ +int boot_hpet_disable; +int hpet_force_user; +static int hpet_verbose; + +static int __init hpet_setup(char *str) +{ + while (str) { + char *next = strchr(str, ','); + + if (next) + *next++ = 0; + if (!strncmp("disable", str, 7)) + boot_hpet_disable = 1; + if (!strncmp("force", str, 5)) + hpet_force_user = 1; + if (!strncmp("verbose", str, 7)) + hpet_verbose = 1; + str = next; + } + return 1; +} +__setup("hpet=", hpet_setup); + +static int __init disable_hpet(char *str) +{ + boot_hpet_disable = 1; + return 1; +} +__setup("nohpet", disable_hpet); + +static inline int is_hpet_capable(void) +{ + return !boot_hpet_disable && hpet_address; +} + +/* + * HPET timer interrupt enable / disable + */ +static int hpet_legacy_int_enabled; + +/** + * is_hpet_enabled - check whether 
the hpet timer interrupt is enabled + */ +int is_hpet_enabled(void) +{ + return is_hpet_capable() && hpet_legacy_int_enabled; +} +EXPORT_SYMBOL_GPL(is_hpet_enabled); + +static void _hpet_print_config(const char *function, int line) +{ + u32 i, timers, l, h; + printk(KERN_INFO "hpet: %s(%d):\n", function, line); + l = hpet_readl(HPET_ID); + h = hpet_readl(HPET_PERIOD); + timers = ((l & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1; + printk(KERN_INFO "hpet: ID: 0x%x, PERIOD: 0x%x\n", l, h); + l = hpet_readl(HPET_CFG); + h = hpet_readl(HPET_STATUS); + printk(KERN_INFO "hpet: CFG: 0x%x, STATUS: 0x%x\n", l, h); + l = hpet_readl(HPET_COUNTER); + h = hpet_readl(HPET_COUNTER+4); + printk(KERN_INFO "hpet: COUNTER_l: 0x%x, COUNTER_h: 0x%x\n", l, h); + + for (i = 0; i < timers; i++) { + l = hpet_readl(HPET_Tn_CFG(i)); + h = hpet_readl(HPET_Tn_CFG(i)+4); + printk(KERN_INFO "hpet: T%d: CFG_l: 0x%x, CFG_h: 0x%x\n", + i, l, h); + l = hpet_readl(HPET_Tn_CMP(i)); + h = hpet_readl(HPET_Tn_CMP(i)+4); + printk(KERN_INFO "hpet: T%d: CMP_l: 0x%x, CMP_h: 0x%x\n", + i, l, h); + l = hpet_readl(HPET_Tn_ROUTE(i)); + h = hpet_readl(HPET_Tn_ROUTE(i)+4); + printk(KERN_INFO "hpet: T%d ROUTE_l: 0x%x, ROUTE_h: 0x%x\n", + i, l, h); + } +} + +#define hpet_print_config() \ +do { \ + if (hpet_verbose) \ + _hpet_print_config(__FUNCTION__, __LINE__); \ +} while (0) + +/* + * When the hpet driver (/dev/hpet) is enabled, we need to reserve + * timer 0 and timer 1 in case of RTC emulation. + */ +#ifdef CONFIG_HPET + +static void hpet_reserve_msi_timers(struct hpet_data *hd); + +static void hpet_reserve_platform_timers(unsigned int id) +{ + struct hpet __iomem *hpet = hpet_virt_address; + struct hpet_timer __iomem *timer = &hpet->hpet_timers[2]; + unsigned int nrtimers, i; + struct hpet_data hd; + + nrtimers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1; + + memset(&hd, 0, sizeof(hd)); + hd.hd_phys_address = hpet_address; + hd.hd_address = hpet; + hd.hd_nirqs = nrtimers; + hpet_reserve_timer(&hd, 0); + +#ifdef CONFIG_HPET_EMULATE_RTC + hpet_reserve_timer(&hd, 1); +#endif + + /* + * NOTE that hd_irq[] reflects IOAPIC input pins (LEGACY_8254 + * is wrong for i8259!) not the output IRQ. Many BIOS writers + * don't bother configuring *any* comparator interrupts. 
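+ * That is why hd_irq[0] and hd_irq[1] are hardwired to the
+ * legacy 8254 and RTC inputs below, while the remaining
+ * comparators get whatever their Tn_INT_ROUTE_CNF field reads.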
+ */ + hd.hd_irq[0] = HPET_LEGACY_8254; + hd.hd_irq[1] = HPET_LEGACY_RTC; + + for (i = 2; i < nrtimers; timer++, i++) { + hd.hd_irq[i] = (readl(&timer->hpet_config) & + Tn_INT_ROUTE_CNF_MASK) >> Tn_INT_ROUTE_CNF_SHIFT; + } + + hpet_reserve_msi_timers(&hd); + + hpet_alloc(&hd); + +} +#else +static void hpet_reserve_platform_timers(unsigned int id) { } +#endif + +/* + * Common hpet info + */ +static unsigned long hpet_freq; + +static void hpet_legacy_set_mode(enum clock_event_mode mode, + struct clock_event_device *evt); +static int hpet_legacy_next_event(unsigned long delta, + struct clock_event_device *evt); + +/* + * The hpet clock event device + */ +static struct clock_event_device hpet_clockevent = { + .name = "hpet", + .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, + .set_mode = hpet_legacy_set_mode, + .set_next_event = hpet_legacy_next_event, + .irq = 0, + .rating = 50, +}; + +static void hpet_stop_counter(void) +{ + unsigned long cfg = hpet_readl(HPET_CFG); + cfg &= ~HPET_CFG_ENABLE; + hpet_writel(cfg, HPET_CFG); +} + +static void hpet_reset_counter(void) +{ + hpet_writel(0, HPET_COUNTER); + hpet_writel(0, HPET_COUNTER + 4); +} + +static void hpet_start_counter(void) +{ + unsigned int cfg = hpet_readl(HPET_CFG); + cfg |= HPET_CFG_ENABLE; + hpet_writel(cfg, HPET_CFG); +} + +static void hpet_restart_counter(void) +{ + hpet_stop_counter(); + hpet_reset_counter(); + hpet_start_counter(); +} + +static void hpet_resume_device(void) +{ + force_hpet_resume(); +} + +static void hpet_resume_counter(struct clocksource *cs) +{ + hpet_resume_device(); + hpet_restart_counter(); +} + +static void hpet_enable_legacy_int(void) +{ + unsigned int cfg = hpet_readl(HPET_CFG); + + cfg |= HPET_CFG_LEGACY; + hpet_writel(cfg, HPET_CFG); + hpet_legacy_int_enabled = 1; +} + +static void hpet_legacy_clockevent_register(void) +{ + /* Start HPET legacy interrupts */ + hpet_enable_legacy_int(); + + /* + * Start hpet with the boot cpu mask and make it + * global after the IO_APIC has been initialized. + */ + hpet_clockevent.cpumask = cpumask_of(smp_processor_id()); + clockevents_config_and_register(&hpet_clockevent, hpet_freq, + HPET_MIN_PROG_DELTA, 0x7FFFFFFF); + global_clock_event = &hpet_clockevent; + printk(KERN_DEBUG "hpet clockevent registered\n"); +} + +static int hpet_setup_msi_irq(unsigned int irq); + +static void hpet_set_mode(enum clock_event_mode mode, + struct clock_event_device *evt, int timer) +{ + unsigned int cfg, cmp, now; + uint64_t delta; + + switch (mode) { + case CLOCK_EVT_MODE_PERIODIC: + hpet_stop_counter(); + delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * evt->mult; + delta >>= evt->shift; + now = hpet_readl(HPET_COUNTER); + cmp = now + (unsigned int) delta; + cfg = hpet_readl(HPET_Tn_CFG(timer)); + cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC | + HPET_TN_SETVAL | HPET_TN_32BIT; + hpet_writel(cfg, HPET_Tn_CFG(timer)); + hpet_writel(cmp, HPET_Tn_CMP(timer)); + udelay(1); + /* + * HPET on AMD 81xx needs a second write (with HPET_TN_SETVAL + * cleared) to T0_CMP to set the period. The HPET_TN_SETVAL + * bit is automatically cleared after the first write. 
+ * (See AMD-8111 HyperTransport I/O Hub Data Sheet, + * Publication # 24674) + */ + hpet_writel((unsigned int) delta, HPET_Tn_CMP(timer)); + hpet_start_counter(); + hpet_print_config(); + break; + + case CLOCK_EVT_MODE_ONESHOT: + cfg = hpet_readl(HPET_Tn_CFG(timer)); + cfg &= ~HPET_TN_PERIODIC; + cfg |= HPET_TN_ENABLE | HPET_TN_32BIT; + hpet_writel(cfg, HPET_Tn_CFG(timer)); + break; + + case CLOCK_EVT_MODE_UNUSED: + case CLOCK_EVT_MODE_SHUTDOWN: + cfg = hpet_readl(HPET_Tn_CFG(timer)); + cfg &= ~HPET_TN_ENABLE; + hpet_writel(cfg, HPET_Tn_CFG(timer)); + break; + + case CLOCK_EVT_MODE_RESUME: + if (timer == 0) { + hpet_enable_legacy_int(); + } else { + struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); + hpet_setup_msi_irq(hdev->irq); + disable_irq(hdev->irq); + irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu)); + enable_irq(hdev->irq); + } + hpet_print_config(); + break; + } +} + +static int hpet_next_event(unsigned long delta, + struct clock_event_device *evt, int timer) +{ + u32 cnt; + s32 res; + + cnt = hpet_readl(HPET_COUNTER); + cnt += (u32) delta; + hpet_writel(cnt, HPET_Tn_CMP(timer)); + + /* + * HPETs are a complete disaster. The compare register is + * based on a equal comparison and neither provides a less + * than or equal functionality (which would require to take + * the wraparound into account) nor a simple count down event + * mode. Further the write to the comparator register is + * delayed internally up to two HPET clock cycles in certain + * chipsets (ATI, ICH9,10). Some newer AMD chipsets have even + * longer delays. We worked around that by reading back the + * compare register, but that required another workaround for + * ICH9,10 chips where the first readout after write can + * return the old stale value. We already had a minimum + * programming delta of 5us enforced, but a NMI or SMI hitting + * between the counter readout and the comparator write can + * move us behind that point easily. Now instead of reading + * the compare register back several times, we make the ETIME + * decision based on the following: Return ETIME if the + * counter value after the write is less than HPET_MIN_CYCLES + * away from the event or if the counter is already ahead of + * the event. The minimum programming delta for the generic + * clockevents code is set to 1.5 * HPET_MIN_CYCLES. + */ + res = (s32)(cnt - hpet_readl(HPET_COUNTER)); + + return res < HPET_MIN_CYCLES ? 
-ETIME : 0; +} + +static void hpet_legacy_set_mode(enum clock_event_mode mode, + struct clock_event_device *evt) +{ + hpet_set_mode(mode, evt, 0); +} + +static int hpet_legacy_next_event(unsigned long delta, + struct clock_event_device *evt) +{ + return hpet_next_event(delta, evt, 0); +} + +/* + * HPET MSI Support + */ +#ifdef CONFIG_PCI_MSI + +static DEFINE_PER_CPU(struct hpet_dev *, cpu_hpet_dev); +static struct hpet_dev *hpet_devs; + +void hpet_msi_unmask(struct irq_data *data) +{ + struct hpet_dev *hdev = data->handler_data; + unsigned int cfg; + + /* unmask it */ + cfg = hpet_readl(HPET_Tn_CFG(hdev->num)); + cfg |= HPET_TN_ENABLE | HPET_TN_FSB; + hpet_writel(cfg, HPET_Tn_CFG(hdev->num)); +} + +void hpet_msi_mask(struct irq_data *data) +{ + struct hpet_dev *hdev = data->handler_data; + unsigned int cfg; + + /* mask it */ + cfg = hpet_readl(HPET_Tn_CFG(hdev->num)); + cfg &= ~(HPET_TN_ENABLE | HPET_TN_FSB); + hpet_writel(cfg, HPET_Tn_CFG(hdev->num)); +} + +void hpet_msi_write(struct hpet_dev *hdev, struct msi_msg *msg) +{ + hpet_writel(msg->data, HPET_Tn_ROUTE(hdev->num)); + hpet_writel(msg->address_lo, HPET_Tn_ROUTE(hdev->num) + 4); +} + +void hpet_msi_read(struct hpet_dev *hdev, struct msi_msg *msg) +{ + msg->data = hpet_readl(HPET_Tn_ROUTE(hdev->num)); + msg->address_lo = hpet_readl(HPET_Tn_ROUTE(hdev->num) + 4); + msg->address_hi = 0; +} + +static void hpet_msi_set_mode(enum clock_event_mode mode, + struct clock_event_device *evt) +{ + struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); + hpet_set_mode(mode, evt, hdev->num); +} + +static int hpet_msi_next_event(unsigned long delta, + struct clock_event_device *evt) +{ + struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); + return hpet_next_event(delta, evt, hdev->num); +} + +static int hpet_setup_msi_irq(unsigned int irq) +{ + if (x86_msi.setup_hpet_msi(irq, hpet_blockid)) { + irq_free_hwirq(irq); + return -EINVAL; + } + return 0; +} + +static int hpet_assign_irq(struct hpet_dev *dev) +{ + unsigned int irq = irq_alloc_hwirq(-1); + + if (!irq) + return -EINVAL; + + irq_set_handler_data(irq, dev); + + if (hpet_setup_msi_irq(irq)) + return -EINVAL; + + dev->irq = irq; + return 0; +} + +static irqreturn_t hpet_interrupt_handler(int irq, void *data) +{ + struct hpet_dev *dev = (struct hpet_dev *)data; + struct clock_event_device *hevt = &dev->evt; + + if (!hevt->event_handler) { + printk(KERN_INFO "Spurious HPET timer interrupt on HPET timer %d\n", + dev->num); + return IRQ_HANDLED; + } + + hevt->event_handler(hevt); + return IRQ_HANDLED; +} + +static int hpet_setup_irq(struct hpet_dev *dev) +{ + + if (request_irq(dev->irq, hpet_interrupt_handler, + IRQF_TIMER | IRQF_NOBALANCING, + dev->name, dev)) + return -1; + + disable_irq(dev->irq); + irq_set_affinity(dev->irq, cpumask_of(dev->cpu)); + enable_irq(dev->irq); + + printk(KERN_DEBUG "hpet: %s irq %d for MSI\n", + dev->name, dev->irq); + + return 0; +} + +/* This should be called in specific @cpu */ +static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu) +{ + struct clock_event_device *evt = &hdev->evt; + + WARN_ON(cpu != smp_processor_id()); + if (!(hdev->flags & HPET_DEV_VALID)) + return; + + if (hpet_setup_msi_irq(hdev->irq)) + return; + + hdev->cpu = cpu; + per_cpu(cpu_hpet_dev, cpu) = hdev; + evt->name = hdev->name; + hpet_setup_irq(hdev); + evt->irq = hdev->irq; + + evt->rating = 110; + evt->features = CLOCK_EVT_FEAT_ONESHOT; + if (hdev->flags & HPET_DEV_PERI_CAP) + evt->features |= CLOCK_EVT_FEAT_PERIODIC; + + evt->set_mode = hpet_msi_set_mode; + evt->set_next_event = 
hpet_msi_next_event; + evt->cpumask = cpumask_of(hdev->cpu); + + clockevents_config_and_register(evt, hpet_freq, HPET_MIN_PROG_DELTA, + 0x7FFFFFFF); +} + +#ifdef CONFIG_HPET +/* Reserve at least one timer for userspace (/dev/hpet) */ +#define RESERVE_TIMERS 1 +#else +#define RESERVE_TIMERS 0 +#endif + +static void hpet_msi_capability_lookup(unsigned int start_timer) +{ + unsigned int id; + unsigned int num_timers; + unsigned int num_timers_used = 0; + int i; + + if (hpet_msi_disable) + return; + + if (boot_cpu_has(X86_FEATURE_ARAT)) + return; + id = hpet_readl(HPET_ID); + + num_timers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT); + num_timers++; /* Value read out starts from 0 */ + hpet_print_config(); + + hpet_devs = kzalloc(sizeof(struct hpet_dev) * num_timers, GFP_KERNEL); + if (!hpet_devs) + return; + + hpet_num_timers = num_timers; + + for (i = start_timer; i < num_timers - RESERVE_TIMERS; i++) { + struct hpet_dev *hdev = &hpet_devs[num_timers_used]; + unsigned int cfg = hpet_readl(HPET_Tn_CFG(i)); + + /* Only consider HPET timer with MSI support */ + if (!(cfg & HPET_TN_FSB_CAP)) + continue; + + hdev->flags = 0; + if (cfg & HPET_TN_PERIODIC_CAP) + hdev->flags |= HPET_DEV_PERI_CAP; + hdev->num = i; + + sprintf(hdev->name, "hpet%d", i); + if (hpet_assign_irq(hdev)) + continue; + + hdev->flags |= HPET_DEV_FSB_CAP; + hdev->flags |= HPET_DEV_VALID; + num_timers_used++; + if (num_timers_used == num_possible_cpus()) + break; + } + + printk(KERN_INFO "HPET: %d timers in total, %d timers will be used for per-cpu timer\n", + num_timers, num_timers_used); +} + +#ifdef CONFIG_HPET +static void hpet_reserve_msi_timers(struct hpet_data *hd) +{ + int i; + + if (!hpet_devs) + return; + + for (i = 0; i < hpet_num_timers; i++) { + struct hpet_dev *hdev = &hpet_devs[i]; + + if (!(hdev->flags & HPET_DEV_VALID)) + continue; + + hd->hd_irq[hdev->num] = hdev->irq; + hpet_reserve_timer(hd, hdev->num); + } +} +#endif + +static struct hpet_dev *hpet_get_unused_timer(void) +{ + int i; + + if (!hpet_devs) + return NULL; + + for (i = 0; i < hpet_num_timers; i++) { + struct hpet_dev *hdev = &hpet_devs[i]; + + if (!(hdev->flags & HPET_DEV_VALID)) + continue; + if (test_and_set_bit(HPET_DEV_USED_BIT, + (unsigned long *)&hdev->flags)) + continue; + return hdev; + } + return NULL; +} + +struct hpet_work_struct { + struct delayed_work work; + struct completion complete; +}; + +static void hpet_work(struct work_struct *w) +{ + struct hpet_dev *hdev; + int cpu = smp_processor_id(); + struct hpet_work_struct *hpet_work; + + hpet_work = container_of(w, struct hpet_work_struct, work.work); + + hdev = hpet_get_unused_timer(); + if (hdev) + init_one_hpet_msi_clockevent(hdev, cpu); + + complete(&hpet_work->complete); +} + +static int hpet_cpuhp_notify(struct notifier_block *n, + unsigned long action, void *hcpu) +{ + unsigned long cpu = (unsigned long)hcpu; + struct hpet_work_struct work; + struct hpet_dev *hdev = per_cpu(cpu_hpet_dev, cpu); + + switch (action & 0xf) { + case CPU_ONLINE: + INIT_DELAYED_WORK_ONSTACK(&work.work, hpet_work); + init_completion(&work.complete); + /* FIXME: add schedule_work_on() */ + schedule_delayed_work_on(cpu, &work.work, 0); + wait_for_completion(&work.complete); + destroy_delayed_work_on_stack(&work.work); + break; + case CPU_DEAD: + if (hdev) { + free_irq(hdev->irq, hdev); + hdev->flags &= ~HPET_DEV_USED; + per_cpu(cpu_hpet_dev, cpu) = NULL; + } + break; + } + return NOTIFY_OK; +} +#else + +static int hpet_setup_msi_irq(unsigned int irq) +{ + return 0; +} +static void 
hpet_msi_capability_lookup(unsigned int start_timer) +{ + return; +} + +#ifdef CONFIG_HPET +static void hpet_reserve_msi_timers(struct hpet_data *hd) +{ + return; +} +#endif + +static int hpet_cpuhp_notify(struct notifier_block *n, + unsigned long action, void *hcpu) +{ + return NOTIFY_OK; +} + +#endif + +/* + * Clock source related code + */ +static cycle_t read_hpet(struct clocksource *cs) +{ + return (cycle_t)hpet_readl(HPET_COUNTER); +} + +static struct clocksource clocksource_hpet = { + .name = "hpet", + .rating = 250, + .read = read_hpet, + .mask = HPET_MASK, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, + .resume = hpet_resume_counter, + .archdata = { .vclock_mode = VCLOCK_HPET }, +}; + +static int hpet_clocksource_register(void) +{ + u64 start, now; + cycle_t t1; + + /* Start the counter */ + hpet_restart_counter(); + + /* Verify whether hpet counter works */ + t1 = hpet_readl(HPET_COUNTER); + rdtscll(start); + + /* + * We don't know the TSC frequency yet, but waiting for + * 200000 TSC cycles is safe: + * 4 GHz == 50us + * 1 GHz == 200us + */ + do { + rep_nop(); + rdtscll(now); + } while ((now - start) < 200000UL); + + if (t1 == hpet_readl(HPET_COUNTER)) { + printk(KERN_WARNING + "HPET counter not counting. HPET disabled\n"); + return -ENODEV; + } + + clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq); + return 0; +} + +static u32 *hpet_boot_cfg; + +/** + * hpet_enable - Try to setup the HPET timer. Returns 1 on success. + */ +int __init hpet_enable(void) +{ + u32 hpet_period, cfg, id; + u64 freq; + unsigned int i, last; + + if (!is_hpet_capable()) + return 0; + + hpet_set_mapping(); + + /* + * Read the period and check for a sane value: + */ + hpet_period = hpet_readl(HPET_PERIOD); + + /* + * AMD SB700 based systems with spread spectrum enabled use a + * SMM based HPET emulation to provide proper frequency + * setting. The SMM code is initialized with the first HPET + * register access and takes some time to complete. During + * this time the config register reads 0xffffffff. We check + * for max. 1000 loops whether the config register reads a non + * 0xffffffff value to make sure that HPET is up and running + * before we go further. A counting loop is safe, as the HPET + * access takes thousands of CPU cycles. On non SB700 based + * machines this check is only done once and has no side + * effects. + */ + for (i = 0; hpet_readl(HPET_CFG) == 0xFFFFFFFF; i++) { + if (i == 1000) { + printk(KERN_WARNING + "HPET config register value = 0xFFFFFFFF. " + "Disabling HPET\n"); + goto out_nohpet; + } + } + + if (hpet_period < HPET_MIN_PERIOD || hpet_period > HPET_MAX_PERIOD) + goto out_nohpet; + + /* + * The period is a femto seconds value. Convert it to a + * frequency. + */ + freq = FSEC_PER_SEC; + do_div(freq, hpet_period); + hpet_freq = freq; + + /* + * Read the HPET ID register to retrieve the IRQ routing + * information and the number of channels + */ + id = hpet_readl(HPET_ID); + hpet_print_config(); + + last = (id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT; + +#ifdef CONFIG_HPET_EMULATE_RTC + /* + * The legacy routing mode needs at least two channels, tick timer + * and the rtc emulation channel. 
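+ * ("last" holds the highest timer index, i.e. the channel count
+ * minus one, so a value of 0 means a single-channel HPET.)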
+ */ + if (!last) + goto out_nohpet; +#endif + + cfg = hpet_readl(HPET_CFG); + hpet_boot_cfg = kmalloc((last + 2) * sizeof(*hpet_boot_cfg), + GFP_KERNEL); + if (hpet_boot_cfg) + *hpet_boot_cfg = cfg; + else + pr_warn("HPET initial state will not be saved\n"); + cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY); + hpet_writel(cfg, HPET_CFG); + if (cfg) + pr_warn("HPET: Unrecognized bits %#x set in global cfg\n", + cfg); + + for (i = 0; i <= last; ++i) { + cfg = hpet_readl(HPET_Tn_CFG(i)); + if (hpet_boot_cfg) + hpet_boot_cfg[i + 1] = cfg; + cfg &= ~(HPET_TN_ENABLE | HPET_TN_LEVEL | HPET_TN_FSB); + hpet_writel(cfg, HPET_Tn_CFG(i)); + cfg &= ~(HPET_TN_PERIODIC | HPET_TN_PERIODIC_CAP + | HPET_TN_64BIT_CAP | HPET_TN_32BIT | HPET_TN_ROUTE + | HPET_TN_FSB | HPET_TN_FSB_CAP); + if (cfg) + pr_warn("HPET: Unrecognized bits %#x set in cfg#%u\n", + cfg, i); + } + hpet_print_config(); + + if (hpet_clocksource_register()) + goto out_nohpet; + + if (id & HPET_ID_LEGSUP) { + hpet_legacy_clockevent_register(); + return 1; + } + return 0; + +out_nohpet: + hpet_clear_mapping(); + hpet_address = 0; + return 0; +} + +/* + * Needs to be late, as the reserve_timer code calls kalloc ! + * + * Not a problem on i386 as hpet_enable is called from late_time_init, + * but on x86_64 it is necessary ! + */ +static __init int hpet_late_init(void) +{ + int cpu; + + if (boot_hpet_disable) + return -ENODEV; + + if (!hpet_address) { + if (!force_hpet_address) + return -ENODEV; + + hpet_address = force_hpet_address; + hpet_enable(); + } + + if (!hpet_virt_address) + return -ENODEV; + + if (hpet_readl(HPET_ID) & HPET_ID_LEGSUP) + hpet_msi_capability_lookup(2); + else + hpet_msi_capability_lookup(0); + + hpet_reserve_platform_timers(hpet_readl(HPET_ID)); + hpet_print_config(); + + if (hpet_msi_disable) + return 0; + + if (boot_cpu_has(X86_FEATURE_ARAT)) + return 0; + + cpu_notifier_register_begin(); + for_each_online_cpu(cpu) { + hpet_cpuhp_notify(NULL, CPU_ONLINE, (void *)(long)cpu); + } + + /* This notifier should be called after workqueue is ready */ + __hotcpu_notifier(hpet_cpuhp_notify, -20); + cpu_notifier_register_done(); + + return 0; +} +fs_initcall(hpet_late_init); + +void hpet_disable(void) +{ + if (is_hpet_capable() && hpet_virt_address) { + unsigned int cfg = hpet_readl(HPET_CFG), id, last; + + if (hpet_boot_cfg) + cfg = *hpet_boot_cfg; + else if (hpet_legacy_int_enabled) { + cfg &= ~HPET_CFG_LEGACY; + hpet_legacy_int_enabled = 0; + } + cfg &= ~HPET_CFG_ENABLE; + hpet_writel(cfg, HPET_CFG); + + if (!hpet_boot_cfg) + return; + + id = hpet_readl(HPET_ID); + last = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT); + + for (id = 0; id <= last; ++id) + hpet_writel(hpet_boot_cfg[id + 1], HPET_Tn_CFG(id)); + + if (*hpet_boot_cfg & HPET_CFG_ENABLE) + hpet_writel(*hpet_boot_cfg, HPET_CFG); + } +} + +#ifdef CONFIG_HPET_EMULATE_RTC + +/* HPET in LegacyReplacement Mode eats up RTC interrupt line. When, HPET + * is enabled, we support RTC interrupt functionality in software. + * RTC has 3 kinds of interrupts: + * 1) Update Interrupt - generate an interrupt, every sec, when RTC clock + * is updated + * 2) Alarm Interrupt - generate an interrupt at a specific time of day + * 3) Periodic Interrupt - generate periodic interrupt, with frequencies + * 2Hz-8192Hz (2Hz-64Hz for non-root user) (all freqs in powers of 2) + * (1) and (2) above are implemented using polling at a frequency of + * 64 Hz. The exact frequency is a tradeoff between accuracy and interrupt + * overhead. 
(DEFAULT_RTC_INT_FREQ) + * For (3), we use interrupts at 64Hz or user specified periodic + * frequency, whichever is higher. + */ +#include <linux/mc146818rtc.h> +#include <linux/rtc.h> +#include <asm/rtc.h> + +#define DEFAULT_RTC_INT_FREQ 64 +#define DEFAULT_RTC_SHIFT 6 +#define RTC_NUM_INTS 1 + +static unsigned long hpet_rtc_flags; +static int hpet_prev_update_sec; +static struct rtc_time hpet_alarm_time; +static unsigned long hpet_pie_count; +static u32 hpet_t1_cmp; +static u32 hpet_default_delta; +static u32 hpet_pie_delta; +static unsigned long hpet_pie_limit; + +static rtc_irq_handler irq_handler; + +/* + * Check that the hpet counter c1 is ahead of the c2 + */ +static inline int hpet_cnt_ahead(u32 c1, u32 c2) +{ + return (s32)(c2 - c1) < 0; +} + +/* + * Registers a IRQ handler. + */ +int hpet_register_irq_handler(rtc_irq_handler handler) +{ + if (!is_hpet_enabled()) + return -ENODEV; + if (irq_handler) + return -EBUSY; + + irq_handler = handler; + + return 0; +} +EXPORT_SYMBOL_GPL(hpet_register_irq_handler); + +/* + * Deregisters the IRQ handler registered with hpet_register_irq_handler() + * and does cleanup. + */ +void hpet_unregister_irq_handler(rtc_irq_handler handler) +{ + if (!is_hpet_enabled()) + return; + + irq_handler = NULL; + hpet_rtc_flags = 0; +} +EXPORT_SYMBOL_GPL(hpet_unregister_irq_handler); + +/* + * Timer 1 for RTC emulation. We use one shot mode, as periodic mode + * is not supported by all HPET implementations for timer 1. + * + * hpet_rtc_timer_init() is called when the rtc is initialized. + */ +int hpet_rtc_timer_init(void) +{ + unsigned int cfg, cnt, delta; + unsigned long flags; + + if (!is_hpet_enabled()) + return 0; + + if (!hpet_default_delta) { + uint64_t clc; + + clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC; + clc >>= hpet_clockevent.shift + DEFAULT_RTC_SHIFT; + hpet_default_delta = clc; + } + + if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit) + delta = hpet_default_delta; + else + delta = hpet_pie_delta; + + local_irq_save(flags); + + cnt = delta + hpet_readl(HPET_COUNTER); + hpet_writel(cnt, HPET_T1_CMP); + hpet_t1_cmp = cnt; + + cfg = hpet_readl(HPET_T1_CFG); + cfg &= ~HPET_TN_PERIODIC; + cfg |= HPET_TN_ENABLE | HPET_TN_32BIT; + hpet_writel(cfg, HPET_T1_CFG); + + local_irq_restore(flags); + + return 1; +} +EXPORT_SYMBOL_GPL(hpet_rtc_timer_init); + +static void hpet_disable_rtc_channel(void) +{ + unsigned long cfg; + cfg = hpet_readl(HPET_T1_CFG); + cfg &= ~HPET_TN_ENABLE; + hpet_writel(cfg, HPET_T1_CFG); +} + +/* + * The functions below are called from rtc driver. + * Return 0 if HPET is not being used. + * Otherwise do the necessary changes and return 1. 
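+ *
+ * Caller side sketch (illustrative only; program_cmos_divider() is a
+ * made-up fallback helper, not a real function):
+ *
+ *	if (!hpet_set_periodic_freq(freq))
+ *		program_cmos_divider(freq);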
+ */ +int hpet_mask_rtc_irq_bit(unsigned long bit_mask) +{ + if (!is_hpet_enabled()) + return 0; + + hpet_rtc_flags &= ~bit_mask; + if (unlikely(!hpet_rtc_flags)) + hpet_disable_rtc_channel(); + + return 1; +} +EXPORT_SYMBOL_GPL(hpet_mask_rtc_irq_bit); + +int hpet_set_rtc_irq_bit(unsigned long bit_mask) +{ + unsigned long oldbits = hpet_rtc_flags; + + if (!is_hpet_enabled()) + return 0; + + hpet_rtc_flags |= bit_mask; + + if ((bit_mask & RTC_UIE) && !(oldbits & RTC_UIE)) + hpet_prev_update_sec = -1; + + if (!oldbits) + hpet_rtc_timer_init(); + + return 1; +} +EXPORT_SYMBOL_GPL(hpet_set_rtc_irq_bit); + +int hpet_set_alarm_time(unsigned char hrs, unsigned char min, + unsigned char sec) +{ + if (!is_hpet_enabled()) + return 0; + + hpet_alarm_time.tm_hour = hrs; + hpet_alarm_time.tm_min = min; + hpet_alarm_time.tm_sec = sec; + + return 1; +} +EXPORT_SYMBOL_GPL(hpet_set_alarm_time); + +int hpet_set_periodic_freq(unsigned long freq) +{ + uint64_t clc; + + if (!is_hpet_enabled()) + return 0; + + if (freq <= DEFAULT_RTC_INT_FREQ) + hpet_pie_limit = DEFAULT_RTC_INT_FREQ / freq; + else { + clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC; + do_div(clc, freq); + clc >>= hpet_clockevent.shift; + hpet_pie_delta = clc; + hpet_pie_limit = 0; + } + return 1; +} +EXPORT_SYMBOL_GPL(hpet_set_periodic_freq); + +int hpet_rtc_dropped_irq(void) +{ + return is_hpet_enabled(); +} +EXPORT_SYMBOL_GPL(hpet_rtc_dropped_irq); + +static void hpet_rtc_timer_reinit(void) +{ + unsigned int delta; + int lost_ints = -1; + + if (unlikely(!hpet_rtc_flags)) + hpet_disable_rtc_channel(); + + if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit) + delta = hpet_default_delta; + else + delta = hpet_pie_delta; + + /* + * Increment the comparator value until we are ahead of the + * current count. 
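+ * hpet_cnt_ahead() compares the signed 32-bit difference of the two
+ * values, so the loop terminates correctly even when the HPET main
+ * counter wraps around between interrupts.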
+ */ + do { + hpet_t1_cmp += delta; + hpet_writel(hpet_t1_cmp, HPET_T1_CMP); + lost_ints++; + } while (!hpet_cnt_ahead(hpet_t1_cmp, hpet_readl(HPET_COUNTER))); + + if (lost_ints) { + if (hpet_rtc_flags & RTC_PIE) + hpet_pie_count += lost_ints; + if (printk_ratelimit()) + printk(KERN_WARNING "hpet1: lost %d rtc interrupts\n", + lost_ints); + } +} + +irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id) +{ + struct rtc_time curr_time; + unsigned long rtc_int_flag = 0; + + hpet_rtc_timer_reinit(); + memset(&curr_time, 0, sizeof(struct rtc_time)); + + if (hpet_rtc_flags & (RTC_UIE | RTC_AIE)) + get_rtc_time(&curr_time); + + if (hpet_rtc_flags & RTC_UIE && + curr_time.tm_sec != hpet_prev_update_sec) { + if (hpet_prev_update_sec >= 0) + rtc_int_flag = RTC_UF; + hpet_prev_update_sec = curr_time.tm_sec; + } + + if (hpet_rtc_flags & RTC_PIE && + ++hpet_pie_count >= hpet_pie_limit) { + rtc_int_flag |= RTC_PF; + hpet_pie_count = 0; + } + + if (hpet_rtc_flags & RTC_AIE && + (curr_time.tm_sec == hpet_alarm_time.tm_sec) && + (curr_time.tm_min == hpet_alarm_time.tm_min) && + (curr_time.tm_hour == hpet_alarm_time.tm_hour)) + rtc_int_flag |= RTC_AF; + + if (rtc_int_flag) { + rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8)); + if (irq_handler) + irq_handler(rtc_int_flag, dev_id); + } + return IRQ_HANDLED; +} +EXPORT_SYMBOL_GPL(hpet_rtc_interrupt); +#endif diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c new file mode 100644 index 00000000000..5f9cf20cdb6 --- /dev/null +++ b/arch/x86/kernel/hw_breakpoint.c @@ -0,0 +1,525 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) 2007 Alan Stern + * Copyright (C) 2009 IBM Corporation + * Copyright (C) 2009 Frederic Weisbecker <fweisbec@gmail.com> + * + * Authors: Alan Stern <stern@rowland.harvard.edu> + * K.Prasad <prasad@linux.vnet.ibm.com> + * Frederic Weisbecker <fweisbec@gmail.com> + */ + +/* + * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility, + * using the CPU's debug registers. 
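+ * On x86 these are the HBP_NUM address registers DR0-DR3, the status
+ * register DR6 and the control register DR7.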
+ */ + +#include <linux/perf_event.h> +#include <linux/hw_breakpoint.h> +#include <linux/irqflags.h> +#include <linux/notifier.h> +#include <linux/kallsyms.h> +#include <linux/percpu.h> +#include <linux/kdebug.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/smp.h> + +#include <asm/hw_breakpoint.h> +#include <asm/processor.h> +#include <asm/debugreg.h> + +/* Per cpu debug control register value */ +DEFINE_PER_CPU(unsigned long, cpu_dr7); +EXPORT_PER_CPU_SYMBOL(cpu_dr7); + +/* Per cpu debug address registers values */ +static DEFINE_PER_CPU(unsigned long, cpu_debugreg[HBP_NUM]); + +/* + * Stores the breakpoints currently in use on each breakpoint address + * register for each cpus + */ +static DEFINE_PER_CPU(struct perf_event *, bp_per_reg[HBP_NUM]); + + +static inline unsigned long +__encode_dr7(int drnum, unsigned int len, unsigned int type) +{ + unsigned long bp_info; + + bp_info = (len | type) & 0xf; + bp_info <<= (DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE); + bp_info |= (DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE)); + + return bp_info; +} + +/* + * Encode the length, type, Exact, and Enable bits for a particular breakpoint + * as stored in debug register 7. + */ +unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type) +{ + return __encode_dr7(drnum, len, type) | DR_GLOBAL_SLOWDOWN; +} + +/* + * Decode the length and type bits for a particular breakpoint as + * stored in debug register 7. Return the "enabled" status. + */ +int decode_dr7(unsigned long dr7, int bpnum, unsigned *len, unsigned *type) +{ + int bp_info = dr7 >> (DR_CONTROL_SHIFT + bpnum * DR_CONTROL_SIZE); + + *len = (bp_info & 0xc) | 0x40; + *type = (bp_info & 0x3) | 0x80; + + return (dr7 >> (bpnum * DR_ENABLE_SIZE)) & 0x3; +} + +/* + * Install a perf counter breakpoint. + * + * We seek a free debug address register and use it for this + * breakpoint. Eventually we enable it in the debug control register. + * + * Atomic: we hold the counter->ctx->lock and we only handle variables + * and registers local to this cpu. + */ +int arch_install_hw_breakpoint(struct perf_event *bp) +{ + struct arch_hw_breakpoint *info = counter_arch_bp(bp); + unsigned long *dr7; + int i; + + for (i = 0; i < HBP_NUM; i++) { + struct perf_event **slot = &__get_cpu_var(bp_per_reg[i]); + + if (!*slot) { + *slot = bp; + break; + } + } + + if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot")) + return -EBUSY; + + set_debugreg(info->address, i); + __this_cpu_write(cpu_debugreg[i], info->address); + + dr7 = &__get_cpu_var(cpu_dr7); + *dr7 |= encode_dr7(i, info->len, info->type); + + set_debugreg(*dr7, 7); + + return 0; +} + +/* + * Uninstall the breakpoint contained in the given counter. + * + * First we search the debug address register it uses and then we disable + * it. + * + * Atomic: we hold the counter->ctx->lock and we only handle variables + * and registers local to this cpu. 
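+ *
+ * The bits to clear are exactly those __encode_dr7() set at install
+ * time: an enable field at bit n * DR_ENABLE_SIZE plus a four bit
+ * len/type field at DR_CONTROL_SHIFT + n * DR_CONTROL_SIZE.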
+ */ +void arch_uninstall_hw_breakpoint(struct perf_event *bp) +{ + struct arch_hw_breakpoint *info = counter_arch_bp(bp); + unsigned long *dr7; + int i; + + for (i = 0; i < HBP_NUM; i++) { + struct perf_event **slot = &__get_cpu_var(bp_per_reg[i]); + + if (*slot == bp) { + *slot = NULL; + break; + } + } + + if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot")) + return; + + dr7 = &__get_cpu_var(cpu_dr7); + *dr7 &= ~__encode_dr7(i, info->len, info->type); + + set_debugreg(*dr7, 7); +} + +static int get_hbp_len(u8 hbp_len) +{ + unsigned int len_in_bytes = 0; + + switch (hbp_len) { + case X86_BREAKPOINT_LEN_1: + len_in_bytes = 1; + break; + case X86_BREAKPOINT_LEN_2: + len_in_bytes = 2; + break; + case X86_BREAKPOINT_LEN_4: + len_in_bytes = 4; + break; +#ifdef CONFIG_X86_64 + case X86_BREAKPOINT_LEN_8: + len_in_bytes = 8; + break; +#endif + } + return len_in_bytes; +} + +/* + * Check for virtual address in kernel space. + */ +int arch_check_bp_in_kernelspace(struct perf_event *bp) +{ + unsigned int len; + unsigned long va; + struct arch_hw_breakpoint *info = counter_arch_bp(bp); + + va = info->address; + len = get_hbp_len(info->len); + + return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE); +} + +int arch_bp_generic_fields(int x86_len, int x86_type, + int *gen_len, int *gen_type) +{ + /* Type */ + switch (x86_type) { + case X86_BREAKPOINT_EXECUTE: + if (x86_len != X86_BREAKPOINT_LEN_X) + return -EINVAL; + + *gen_type = HW_BREAKPOINT_X; + *gen_len = sizeof(long); + return 0; + case X86_BREAKPOINT_WRITE: + *gen_type = HW_BREAKPOINT_W; + break; + case X86_BREAKPOINT_RW: + *gen_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R; + break; + default: + return -EINVAL; + } + + /* Len */ + switch (x86_len) { + case X86_BREAKPOINT_LEN_1: + *gen_len = HW_BREAKPOINT_LEN_1; + break; + case X86_BREAKPOINT_LEN_2: + *gen_len = HW_BREAKPOINT_LEN_2; + break; + case X86_BREAKPOINT_LEN_4: + *gen_len = HW_BREAKPOINT_LEN_4; + break; +#ifdef CONFIG_X86_64 + case X86_BREAKPOINT_LEN_8: + *gen_len = HW_BREAKPOINT_LEN_8; + break; +#endif + default: + return -EINVAL; + } + + return 0; +} + + +static int arch_build_bp_info(struct perf_event *bp) +{ + struct arch_hw_breakpoint *info = counter_arch_bp(bp); + + info->address = bp->attr.bp_addr; + + /* Type */ + switch (bp->attr.bp_type) { + case HW_BREAKPOINT_W: + info->type = X86_BREAKPOINT_WRITE; + break; + case HW_BREAKPOINT_W | HW_BREAKPOINT_R: + info->type = X86_BREAKPOINT_RW; + break; + case HW_BREAKPOINT_X: + info->type = X86_BREAKPOINT_EXECUTE; + /* + * x86 inst breakpoints need to have a specific undefined len. + * But we still need to check userspace is not trying to setup + * an unsupported length, to get a range breakpoint for example. 
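+ * The hardware does not support ranged instruction breakpoints, so
+ * the only length accepted here is sizeof(long); everything else
+ * falls through to -EINVAL.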
+ */ + if (bp->attr.bp_len == sizeof(long)) { + info->len = X86_BREAKPOINT_LEN_X; + return 0; + } + default: + return -EINVAL; + } + + /* Len */ + switch (bp->attr.bp_len) { + case HW_BREAKPOINT_LEN_1: + info->len = X86_BREAKPOINT_LEN_1; + break; + case HW_BREAKPOINT_LEN_2: + info->len = X86_BREAKPOINT_LEN_2; + break; + case HW_BREAKPOINT_LEN_4: + info->len = X86_BREAKPOINT_LEN_4; + break; +#ifdef CONFIG_X86_64 + case HW_BREAKPOINT_LEN_8: + info->len = X86_BREAKPOINT_LEN_8; + break; +#endif + default: + return -EINVAL; + } + + return 0; +} +/* + * Validate the arch-specific HW Breakpoint register settings + */ +int arch_validate_hwbkpt_settings(struct perf_event *bp) +{ + struct arch_hw_breakpoint *info = counter_arch_bp(bp); + unsigned int align; + int ret; + + + ret = arch_build_bp_info(bp); + if (ret) + return ret; + + ret = -EINVAL; + + switch (info->len) { + case X86_BREAKPOINT_LEN_1: + align = 0; + break; + case X86_BREAKPOINT_LEN_2: + align = 1; + break; + case X86_BREAKPOINT_LEN_4: + align = 3; + break; +#ifdef CONFIG_X86_64 + case X86_BREAKPOINT_LEN_8: + align = 7; + break; +#endif + default: + return ret; + } + + /* + * Check that the low-order bits of the address are appropriate + * for the alignment implied by len. + */ + if (info->address & align) + return -EINVAL; + + return 0; +} + +/* + * Dump the debug register contents to the user. + * We can't dump our per cpu values because it + * may contain cpu wide breakpoint, something that + * doesn't belong to the current task. + * + * TODO: include non-ptrace user breakpoints (perf) + */ +void aout_dump_debugregs(struct user *dump) +{ + int i; + int dr7 = 0; + struct perf_event *bp; + struct arch_hw_breakpoint *info; + struct thread_struct *thread = ¤t->thread; + + for (i = 0; i < HBP_NUM; i++) { + bp = thread->ptrace_bps[i]; + + if (bp && !bp->attr.disabled) { + dump->u_debugreg[i] = bp->attr.bp_addr; + info = counter_arch_bp(bp); + dr7 |= encode_dr7(i, info->len, info->type); + } else { + dump->u_debugreg[i] = 0; + } + } + + dump->u_debugreg[4] = 0; + dump->u_debugreg[5] = 0; + dump->u_debugreg[6] = current->thread.debugreg6; + + dump->u_debugreg[7] = dr7; +} +EXPORT_SYMBOL_GPL(aout_dump_debugregs); + +/* + * Release the user breakpoints used by ptrace + */ +void flush_ptrace_hw_breakpoint(struct task_struct *tsk) +{ + int i; + struct thread_struct *t = &tsk->thread; + + for (i = 0; i < HBP_NUM; i++) { + unregister_hw_breakpoint(t->ptrace_bps[i]); + t->ptrace_bps[i] = NULL; + } + + t->debugreg6 = 0; + t->ptrace_dr7 = 0; +} + +void hw_breakpoint_restore(void) +{ + set_debugreg(__this_cpu_read(cpu_debugreg[0]), 0); + set_debugreg(__this_cpu_read(cpu_debugreg[1]), 1); + set_debugreg(__this_cpu_read(cpu_debugreg[2]), 2); + set_debugreg(__this_cpu_read(cpu_debugreg[3]), 3); + set_debugreg(current->thread.debugreg6, 6); + set_debugreg(__this_cpu_read(cpu_dr7), 7); +} +EXPORT_SYMBOL_GPL(hw_breakpoint_restore); + +/* + * Handle debug exception notifications. + * + * Return value is either NOTIFY_STOP or NOTIFY_DONE as explained below. + * + * NOTIFY_DONE returned if one of the following conditions is true. + * i) When the causative address is from user-space and the exception + * is a valid one, i.e. not triggered as a result of lazy debug register + * switching + * ii) When there are more bits than trap<n> set in DR6 register (such + * as BD, BS or BT) indicating that more than one debug condition is + * met and requires some more action in do_debug(). 
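+ * (hw_breakpoint_handler() below walks the DR_TRAP0 << i bits, one
+ * per HBP_NUM slot, clearing each as it is serviced.)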
+ * + * NOTIFY_STOP returned for all other cases + * + */ +static int hw_breakpoint_handler(struct die_args *args) +{ + int i, cpu, rc = NOTIFY_STOP; + struct perf_event *bp; + unsigned long dr7, dr6; + unsigned long *dr6_p; + + /* The DR6 value is pointed by args->err */ + dr6_p = (unsigned long *)ERR_PTR(args->err); + dr6 = *dr6_p; + + /* If it's a single step, TRAP bits are random */ + if (dr6 & DR_STEP) + return NOTIFY_DONE; + + /* Do an early return if no trap bits are set in DR6 */ + if ((dr6 & DR_TRAP_BITS) == 0) + return NOTIFY_DONE; + + get_debugreg(dr7, 7); + /* Disable breakpoints during exception handling */ + set_debugreg(0UL, 7); + /* + * Assert that local interrupts are disabled + * Reset the DRn bits in the virtualized register value. + * The ptrace trigger routine will add in whatever is needed. + */ + current->thread.debugreg6 &= ~DR_TRAP_BITS; + cpu = get_cpu(); + + /* Handle all the breakpoints that were triggered */ + for (i = 0; i < HBP_NUM; ++i) { + if (likely(!(dr6 & (DR_TRAP0 << i)))) + continue; + + /* + * The counter may be concurrently released but that can only + * occur from a call_rcu() path. We can then safely fetch + * the breakpoint, use its callback, touch its counter + * while we are in an rcu_read_lock() path. + */ + rcu_read_lock(); + + bp = per_cpu(bp_per_reg[i], cpu); + /* + * Reset the 'i'th TRAP bit in dr6 to denote completion of + * exception handling + */ + (*dr6_p) &= ~(DR_TRAP0 << i); + /* + * bp can be NULL due to lazy debug register switching + * or due to concurrent perf counter removing. + */ + if (!bp) { + rcu_read_unlock(); + break; + } + + perf_bp_event(bp, args->regs); + + /* + * Set up resume flag to avoid breakpoint recursion when + * returning back to origin. + */ + if (bp->hw.info.type == X86_BREAKPOINT_EXECUTE) + args->regs->flags |= X86_EFLAGS_RF; + + rcu_read_unlock(); + } + /* + * Further processing in do_debug() is needed for a) user-space + * breakpoints (to generate signals) and b) when the system has + * taken exception due to multiple causes + */ + if ((current->thread.debugreg6 & DR_TRAP_BITS) || + (dr6 & (~DR_TRAP_BITS))) + rc = NOTIFY_DONE; + + set_debugreg(dr7, 7); + put_cpu(); + + return rc; +} + +/* + * Handle debug exception notifications. + */ +int hw_breakpoint_exceptions_notify( + struct notifier_block *unused, unsigned long val, void *data) +{ + if (val != DIE_DEBUG) + return NOTIFY_DONE; + + return hw_breakpoint_handler(data); +} + +void hw_breakpoint_pmu_read(struct perf_event *bp) +{ + /* TODO */ +} diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c new file mode 100644 index 00000000000..05fd74f537d --- /dev/null +++ b/arch/x86/kernel/i386_ksyms_32.c @@ -0,0 +1,46 @@ +#include <linux/module.h> + +#include <asm/checksum.h> +#include <asm/pgtable.h> +#include <asm/desc.h> +#include <asm/ftrace.h> + +#ifdef CONFIG_FUNCTION_TRACER +/* mcount is defined in assembly */ +EXPORT_SYMBOL(mcount); +#endif + +/* + * Note, this is a prototype to get at the symbol for + * the export, but dont use it from C code, it is used + * by assembly code and is not using C calling convention! + */ +#ifndef CONFIG_X86_CMPXCHG64 +extern void cmpxchg8b_emu(void); +EXPORT_SYMBOL(cmpxchg8b_emu); +#endif + +/* Networking helper routines. 
*/ +EXPORT_SYMBOL(csum_partial_copy_generic); + +EXPORT_SYMBOL(__get_user_1); +EXPORT_SYMBOL(__get_user_2); +EXPORT_SYMBOL(__get_user_4); +EXPORT_SYMBOL(__get_user_8); + +EXPORT_SYMBOL(__put_user_1); +EXPORT_SYMBOL(__put_user_2); +EXPORT_SYMBOL(__put_user_4); +EXPORT_SYMBOL(__put_user_8); + +EXPORT_SYMBOL(strstr); + +EXPORT_SYMBOL(csum_partial); +EXPORT_SYMBOL(empty_zero_page); + +#ifdef CONFIG_PREEMPT +EXPORT_SYMBOL(___preempt_schedule); +#ifdef CONFIG_CONTEXT_TRACKING +EXPORT_SYMBOL(___preempt_schedule_context); +#endif +#endif diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c new file mode 100644 index 00000000000..d5dd8081441 --- /dev/null +++ b/arch/x86/kernel/i387.c @@ -0,0 +1,640 @@ +/* + * Copyright (C) 1994 Linus Torvalds + * + * Pentium III FXSR, SSE support + * General FPU state handling cleanups + * Gareth Hughes <gareth@valinux.com>, May 2000 + */ +#include <linux/module.h> +#include <linux/regset.h> +#include <linux/sched.h> +#include <linux/slab.h> + +#include <asm/sigcontext.h> +#include <asm/processor.h> +#include <asm/math_emu.h> +#include <asm/uaccess.h> +#include <asm/ptrace.h> +#include <asm/i387.h> +#include <asm/fpu-internal.h> +#include <asm/user.h> + +/* + * Were we in an interrupt that interrupted kernel mode? + * + * On others, we can do a kernel_fpu_begin/end() pair *ONLY* if that + * pair does nothing at all: the thread must not have fpu (so + * that we don't try to save the FPU state), and TS must + * be set (so that the clts/stts pair does nothing that is + * visible in the interrupted kernel thread). + * + * Except for the eagerfpu case when we return 1 unless we've already + * been eager and saved the state in kernel_fpu_begin(). + */ +static inline bool interrupted_kernel_fpu_idle(void) +{ + if (use_eager_fpu()) + return __thread_has_fpu(current); + + return !__thread_has_fpu(current) && + (read_cr0() & X86_CR0_TS); +} + +/* + * Were we in user mode (or vm86 mode) when we were + * interrupted? + * + * Doing kernel_fpu_begin/end() is ok if we are running + * in an interrupt context from user mode - we'll just + * save the FPU state as required. + */ +static inline bool interrupted_user_mode(void) +{ + struct pt_regs *regs = get_irq_regs(); + return regs && user_mode_vm(regs); +} + +/* + * Can we use the FPU in kernel mode with the + * whole "kernel_fpu_begin/end()" sequence? + * + * It's always ok in process context (ie "not interrupt") + * but it is sometimes ok even from an irq. + */ +bool irq_fpu_usable(void) +{ + return !in_interrupt() || + interrupted_user_mode() || + interrupted_kernel_fpu_idle(); +} +EXPORT_SYMBOL(irq_fpu_usable); + +void __kernel_fpu_begin(void) +{ + struct task_struct *me = current; + + if (__thread_has_fpu(me)) { + __thread_clear_has_fpu(me); + __save_init_fpu(me); + /* We do 'stts()' in __kernel_fpu_end() */ + } else if (!use_eager_fpu()) { + this_cpu_write(fpu_owner_task, NULL); + clts(); + } +} +EXPORT_SYMBOL(__kernel_fpu_begin); + +void __kernel_fpu_end(void) +{ + if (use_eager_fpu()) { + /* + * For eager fpu, most the time, tsk_used_math() is true. + * Restore the user math as we are done with the kernel usage. + * At few instances during thread exit, signal handling etc, + * tsk_used_math() is false. Those few places will take proper + * actions, so we don't need to restore the math here. 
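+ *
+ * Typical caller pattern (sketch; callers normally go through the
+ * kernel_fpu_begin()/kernel_fpu_end() wrappers rather than the
+ * double underscore variants exported here):
+ *
+ *	if (irq_fpu_usable()) {
+ *		kernel_fpu_begin();
+ *		... SSE/AVX code ...
+ *		kernel_fpu_end();
+ *	}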
+ */ + if (likely(tsk_used_math(current))) + math_state_restore(); + } else { + stts(); + } +} +EXPORT_SYMBOL(__kernel_fpu_end); + +void unlazy_fpu(struct task_struct *tsk) +{ + preempt_disable(); + if (__thread_has_fpu(tsk)) { + __save_init_fpu(tsk); + __thread_fpu_end(tsk); + } else + tsk->thread.fpu_counter = 0; + preempt_enable(); +} +EXPORT_SYMBOL(unlazy_fpu); + +unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu; +unsigned int xstate_size; +EXPORT_SYMBOL_GPL(xstate_size); +static struct i387_fxsave_struct fx_scratch; + +static void mxcsr_feature_mask_init(void) +{ + unsigned long mask = 0; + + if (cpu_has_fxsr) { + memset(&fx_scratch, 0, sizeof(struct i387_fxsave_struct)); + asm volatile("fxsave %0" : "+m" (fx_scratch)); + mask = fx_scratch.mxcsr_mask; + if (mask == 0) + mask = 0x0000ffbf; + } + mxcsr_feature_mask &= mask; +} + +static void init_thread_xstate(void) +{ + /* + * Note that xstate_size might be overwriten later during + * xsave_init(). + */ + + if (!cpu_has_fpu) { + /* + * Disable xsave as we do not support it if i387 + * emulation is enabled. + */ + setup_clear_cpu_cap(X86_FEATURE_XSAVE); + setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); + xstate_size = sizeof(struct i387_soft_struct); + return; + } + + if (cpu_has_fxsr) + xstate_size = sizeof(struct i387_fxsave_struct); + else + xstate_size = sizeof(struct i387_fsave_struct); +} + +/* + * Called at bootup to set up the initial FPU state that is later cloned + * into all processes. + */ + +void fpu_init(void) +{ + unsigned long cr0; + unsigned long cr4_mask = 0; + +#ifndef CONFIG_MATH_EMULATION + if (!cpu_has_fpu) { + pr_emerg("No FPU found and no math emulation present\n"); + pr_emerg("Giving up\n"); + for (;;) + asm volatile("hlt"); + } +#endif + if (cpu_has_fxsr) + cr4_mask |= X86_CR4_OSFXSR; + if (cpu_has_xmm) + cr4_mask |= X86_CR4_OSXMMEXCPT; + if (cr4_mask) + set_in_cr4(cr4_mask); + + cr0 = read_cr0(); + cr0 &= ~(X86_CR0_TS|X86_CR0_EM); /* clear TS and EM */ + if (!cpu_has_fpu) + cr0 |= X86_CR0_EM; + write_cr0(cr0); + + /* + * init_thread_xstate is only called once to avoid overriding + * xstate_size during boot time or during CPU hotplug. + */ + if (xstate_size == 0) + init_thread_xstate(); + + mxcsr_feature_mask_init(); + xsave_init(); + eager_fpu_init(); +} + +void fpu_finit(struct fpu *fpu) +{ + if (!cpu_has_fpu) { + finit_soft_fpu(&fpu->state->soft); + return; + } + + if (cpu_has_fxsr) { + fx_finit(&fpu->state->fxsave); + } else { + struct i387_fsave_struct *fp = &fpu->state->fsave; + memset(fp, 0, xstate_size); + fp->cwd = 0xffff037fu; + fp->swd = 0xffff0000u; + fp->twd = 0xffffffffu; + fp->fos = 0xffff0000u; + } +} +EXPORT_SYMBOL_GPL(fpu_finit); + +/* + * The _current_ task is using the FPU for the first time + * so initialize it and set the mxcsr to its default + * value at reset if we support XMM instructions and then + * remember the current task has used the FPU. + */ +int init_fpu(struct task_struct *tsk) +{ + int ret; + + if (tsk_used_math(tsk)) { + if (cpu_has_fpu && tsk == current) + unlazy_fpu(tsk); + tsk->thread.fpu.last_cpu = ~0; + return 0; + } + + /* + * Memory allocation at the first usage of the FPU and other state. 
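+ * fpu_alloc() defers the (possibly large) xstate buffer allocation
+ * until a task actually touches the FPU.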
+ */ + ret = fpu_alloc(&tsk->thread.fpu); + if (ret) + return ret; + + fpu_finit(&tsk->thread.fpu); + + set_stopped_child_used_math(tsk); + return 0; +} +EXPORT_SYMBOL_GPL(init_fpu); + +/* + * The xstateregs_active() routine is the same as the fpregs_active() routine, + * as the "regset->n" for the xstate regset will be updated based on the feature + * capabilites supported by the xsave. + */ +int fpregs_active(struct task_struct *target, const struct user_regset *regset) +{ + return tsk_used_math(target) ? regset->n : 0; +} + +int xfpregs_active(struct task_struct *target, const struct user_regset *regset) +{ + return (cpu_has_fxsr && tsk_used_math(target)) ? regset->n : 0; +} + +int xfpregs_get(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + int ret; + + if (!cpu_has_fxsr) + return -ENODEV; + + ret = init_fpu(target); + if (ret) + return ret; + + sanitize_i387_state(target); + + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &target->thread.fpu.state->fxsave, 0, -1); +} + +int xfpregs_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret; + + if (!cpu_has_fxsr) + return -ENODEV; + + ret = init_fpu(target); + if (ret) + return ret; + + sanitize_i387_state(target); + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.fpu.state->fxsave, 0, -1); + + /* + * mxcsr reserved bits must be masked to zero for security reasons. + */ + target->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask; + + /* + * update the header bits in the xsave header, indicating the + * presence of FP and SSE state. + */ + if (cpu_has_xsave) + target->thread.fpu.state->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE; + + return ret; +} + +int xstateregs_get(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + int ret; + + if (!cpu_has_xsave) + return -ENODEV; + + ret = init_fpu(target); + if (ret) + return ret; + + /* + * Copy the 48bytes defined by the software first into the xstate + * memory layout in the thread struct, so that we can copy the entire + * xstateregs to the user using one user_regset_copyout(). + */ + memcpy(&target->thread.fpu.state->fxsave.sw_reserved, + xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes)); + + /* + * Copy the xstate memory layout. + */ + ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &target->thread.fpu.state->xsave, 0, -1); + return ret; +} + +int xstateregs_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret; + struct xsave_hdr_struct *xsave_hdr; + + if (!cpu_has_xsave) + return -ENODEV; + + ret = init_fpu(target); + if (ret) + return ret; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.fpu.state->xsave, 0, -1); + + /* + * mxcsr reserved bits must be masked to zero for security reasons. + */ + target->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask; + + xsave_hdr = &target->thread.fpu.state->xsave.xsave_hdr; + + xsave_hdr->xstate_bv &= pcntxt_mask; + /* + * These bits must be zero. + */ + xsave_hdr->reserved1[0] = xsave_hdr->reserved1[1] = 0; + + return ret; +} + +#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION + +/* + * FPU tag word conversions. 
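+ *
+ * The legacy i387 tag word spends two bits per register (valid,
+ * zero, special, empty) while FXSR keeps a single empty/non-empty
+ * bit; twd_i387_to_fxsr() collapses each two bit pair and
+ * twd_fxsr_to_i387() re-derives the full tags by inspecting the
+ * saved register contents.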
+ */ + +static inline unsigned short twd_i387_to_fxsr(unsigned short twd) +{ + unsigned int tmp; /* to avoid 16 bit prefixes in the code */ + + /* Transform each pair of bits into 01 (valid) or 00 (empty) */ + tmp = ~twd; + tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */ + /* and move the valid bits to the lower byte. */ + tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */ + tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */ + tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */ + + return tmp; +} + +#define FPREG_ADDR(f, n) ((void *)&(f)->st_space + (n) * 16) +#define FP_EXP_TAG_VALID 0 +#define FP_EXP_TAG_ZERO 1 +#define FP_EXP_TAG_SPECIAL 2 +#define FP_EXP_TAG_EMPTY 3 + +static inline u32 twd_fxsr_to_i387(struct i387_fxsave_struct *fxsave) +{ + struct _fpxreg *st; + u32 tos = (fxsave->swd >> 11) & 7; + u32 twd = (unsigned long) fxsave->twd; + u32 tag; + u32 ret = 0xffff0000u; + int i; + + for (i = 0; i < 8; i++, twd >>= 1) { + if (twd & 0x1) { + st = FPREG_ADDR(fxsave, (i - tos) & 7); + + switch (st->exponent & 0x7fff) { + case 0x7fff: + tag = FP_EXP_TAG_SPECIAL; + break; + case 0x0000: + if (!st->significand[0] && + !st->significand[1] && + !st->significand[2] && + !st->significand[3]) + tag = FP_EXP_TAG_ZERO; + else + tag = FP_EXP_TAG_SPECIAL; + break; + default: + if (st->significand[3] & 0x8000) + tag = FP_EXP_TAG_VALID; + else + tag = FP_EXP_TAG_SPECIAL; + break; + } + } else { + tag = FP_EXP_TAG_EMPTY; + } + ret |= tag << (2 * i); + } + return ret; +} + +/* + * FXSR floating point environment conversions. + */ + +void +convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk) +{ + struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state->fxsave; + struct _fpreg *to = (struct _fpreg *) &env->st_space[0]; + struct _fpxreg *from = (struct _fpxreg *) &fxsave->st_space[0]; + int i; + + env->cwd = fxsave->cwd | 0xffff0000u; + env->swd = fxsave->swd | 0xffff0000u; + env->twd = twd_fxsr_to_i387(fxsave); + +#ifdef CONFIG_X86_64 + env->fip = fxsave->rip; + env->foo = fxsave->rdp; + /* + * should be actually ds/cs at fpu exception time, but + * that information is not available in 64bit mode. 
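+ * Fall back to the cs from pt_regs and the current (or saved) ds
+ * selector as the closest approximation.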
+ */ + env->fcs = task_pt_regs(tsk)->cs; + if (tsk == current) { + savesegment(ds, env->fos); + } else { + env->fos = tsk->thread.ds; + } + env->fos |= 0xffff0000; +#else + env->fip = fxsave->fip; + env->fcs = (u16) fxsave->fcs | ((u32) fxsave->fop << 16); + env->foo = fxsave->foo; + env->fos = fxsave->fos; +#endif + + for (i = 0; i < 8; ++i) + memcpy(&to[i], &from[i], sizeof(to[0])); +} + +void convert_to_fxsr(struct task_struct *tsk, + const struct user_i387_ia32_struct *env) + +{ + struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state->fxsave; + struct _fpreg *from = (struct _fpreg *) &env->st_space[0]; + struct _fpxreg *to = (struct _fpxreg *) &fxsave->st_space[0]; + int i; + + fxsave->cwd = env->cwd; + fxsave->swd = env->swd; + fxsave->twd = twd_i387_to_fxsr(env->twd); + fxsave->fop = (u16) ((u32) env->fcs >> 16); +#ifdef CONFIG_X86_64 + fxsave->rip = env->fip; + fxsave->rdp = env->foo; + /* cs and ds ignored */ +#else + fxsave->fip = env->fip; + fxsave->fcs = (env->fcs & 0xffff); + fxsave->foo = env->foo; + fxsave->fos = env->fos; +#endif + + for (i = 0; i < 8; ++i) + memcpy(&to[i], &from[i], sizeof(from[0])); +} + +int fpregs_get(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + struct user_i387_ia32_struct env; + int ret; + + ret = init_fpu(target); + if (ret) + return ret; + + if (!static_cpu_has(X86_FEATURE_FPU)) + return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf); + + if (!cpu_has_fxsr) + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &target->thread.fpu.state->fsave, 0, + -1); + + sanitize_i387_state(target); + + if (kbuf && pos == 0 && count == sizeof(env)) { + convert_from_fxsr(kbuf, target); + return 0; + } + + convert_from_fxsr(&env, target); + + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &env, 0, -1); +} + +int fpregs_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + struct user_i387_ia32_struct env; + int ret; + + ret = init_fpu(target); + if (ret) + return ret; + + sanitize_i387_state(target); + + if (!static_cpu_has(X86_FEATURE_FPU)) + return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); + + if (!cpu_has_fxsr) + return user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.fpu.state->fsave, 0, + -1); + + if (pos > 0 || count < sizeof(env)) + convert_from_fxsr(&env, target); + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &env, 0, -1); + if (!ret) + convert_to_fxsr(target, &env); + + /* + * update the header bit in the xsave header, indicating the + * presence of FP. + */ + if (cpu_has_xsave) + target->thread.fpu.state->xsave.xsave_hdr.xstate_bv |= XSTATE_FP; + return ret; +} + +/* + * FPU state for core dumps. + * This is only used for a.out dumps now. + * It is declared generically using elf_fpregset_t (which is + * struct user_i387_struct) but is in fact only used for 32-bit + * dumps, so on 64-bit it is really struct user_i387_ia32_struct. 
+ */ +int dump_fpu(struct pt_regs *regs, struct user_i387_struct *fpu) +{ + struct task_struct *tsk = current; + int fpvalid; + + fpvalid = !!used_math(); + if (fpvalid) + fpvalid = !fpregs_get(tsk, NULL, + 0, sizeof(struct user_i387_ia32_struct), + fpu, NULL); + + return fpvalid; +} +EXPORT_SYMBOL(dump_fpu); + +#endif /* CONFIG_X86_32 || CONFIG_IA32_EMULATION */ + +static int __init no_387(char *s) +{ + setup_clear_cpu_cap(X86_FEATURE_FPU); + return 1; +} + +__setup("no387", no_387); + +void fpu_detect(struct cpuinfo_x86 *c) +{ + unsigned long cr0; + u16 fsw, fcw; + + fsw = fcw = 0xffff; + + cr0 = read_cr0(); + cr0 &= ~(X86_CR0_TS | X86_CR0_EM); + write_cr0(cr0); + + asm volatile("fninit ; fnstsw %0 ; fnstcw %1" + : "+m" (fsw), "+m" (fcw)); + + if (fsw == 0 && (fcw & 0x103f) == 0x003f) + set_cpu_cap(c, X86_FEATURE_FPU); + else + clear_cpu_cap(c, X86_FEATURE_FPU); + + /* The final cr0 value is set in fpu_init() */ +} diff --git a/arch/x86/kernel/i8237.c b/arch/x86/kernel/i8237.c new file mode 100644 index 00000000000..8eeaa81de06 --- /dev/null +++ b/arch/x86/kernel/i8237.c @@ -0,0 +1,55 @@ +/* + * 8237A DMA controller suspend functions. + * + * Written by Pierre Ossman, 2005. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or (at + * your option) any later version. + */ + +#include <linux/init.h> +#include <linux/syscore_ops.h> + +#include <asm/dma.h> + +/* + * This module just handles suspend/resume issues with the + * 8237A DMA controller (used for ISA and LPC). + * Allocation is handled in kernel/dma.c and normal usage is + * in asm/dma.h. + */ + +static void i8237A_resume(void) +{ + unsigned long flags; + int i; + + flags = claim_dma_lock(); + + dma_outb(0, DMA1_RESET_REG); + dma_outb(0, DMA2_RESET_REG); + + for (i = 0; i < 8; i++) { + set_dma_addr(i, 0x000000); + /* DMA count is a bit weird so this is not 0 */ + set_dma_count(i, 1); + } + + /* Enable cascade DMA or channel 0-3 won't work */ + enable_dma(4); + + release_dma_lock(flags); +} + +static struct syscore_ops i8237_syscore_ops = { + .resume = i8237A_resume, +}; + +static int __init i8237A_init_ops(void) +{ + register_syscore_ops(&i8237_syscore_ops); + return 0; +} +device_initcall(i8237A_init_ops); diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c new file mode 100644 index 00000000000..f2b96de3c7c --- /dev/null +++ b/arch/x86/kernel/i8253.c @@ -0,0 +1,43 @@ +/* + * 8253/PIT functions + * + */ +#include <linux/clockchips.h> +#include <linux/module.h> +#include <linux/timex.h> +#include <linux/i8253.h> + +#include <asm/hpet.h> +#include <asm/time.h> +#include <asm/smp.h> + +/* + * HPET replaces the PIT, when enabled. 
So we need to know, which of + * the two timers is used + */ +struct clock_event_device *global_clock_event; + +void __init setup_pit_timer(void) +{ + clockevent_i8253_init(true); + global_clock_event = &i8253_clockevent; +} + +#ifndef CONFIG_X86_64 +static int __init init_pit_clocksource(void) +{ + /* + * Several reasons not to register PIT as a clocksource: + * + * - On SMP PIT does not scale due to i8253_lock + * - when HPET is enabled + * - when local APIC timer is active (PIT is switched off) + */ + if (num_possible_cpus() > 1 || is_hpet_enabled() || + i8253_clockevent.mode != CLOCK_EVT_MODE_PERIODIC) + return 0; + + return clocksource_i8253_init(); +} +arch_initcall(init_pit_clocksource); +#endif /* !CONFIG_X86_64 */ diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c new file mode 100644 index 00000000000..8af817105e2 --- /dev/null +++ b/arch/x86/kernel/i8259.c @@ -0,0 +1,418 @@ +#include <linux/linkage.h> +#include <linux/errno.h> +#include <linux/signal.h> +#include <linux/sched.h> +#include <linux/ioport.h> +#include <linux/interrupt.h> +#include <linux/timex.h> +#include <linux/random.h> +#include <linux/init.h> +#include <linux/kernel_stat.h> +#include <linux/syscore_ops.h> +#include <linux/bitops.h> +#include <linux/acpi.h> +#include <linux/io.h> +#include <linux/delay.h> + +#include <linux/atomic.h> +#include <asm/timer.h> +#include <asm/hw_irq.h> +#include <asm/pgtable.h> +#include <asm/desc.h> +#include <asm/apic.h> +#include <asm/i8259.h> + +/* + * This is the 'legacy' 8259A Programmable Interrupt Controller, + * present in the majority of PC/AT boxes. + * plus some generic x86 specific things if generic specifics makes + * any sense at all. + */ +static void init_8259A(int auto_eoi); + +static int i8259A_auto_eoi; +DEFINE_RAW_SPINLOCK(i8259A_lock); + +/* + * 8259A PIC functions to handle ISA devices: + */ + +/* + * This contains the irq mask for both 8259A irq controllers, + */ +unsigned int cached_irq_mask = 0xffff; + +/* + * Not all IRQs can be routed through the IO-APIC, eg. on certain (older) + * boards the timer interrupt is not really connected to any IO-APIC pin, + * it's fed to the master 8259A's IR0 line only. + * + * Any '1' bit in this mask means the IRQ is routed through the IO-APIC. + * this 'mixed mode' IRQ handling costs nothing because it's only used + * at IRQ setup time. 
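+ *
+ * Example: io_apic_irqs & (1 << irq) tells whether 'irq' is serviced
+ * by the IO-APIC or must stay on the 8259A.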
+ */ +unsigned long io_apic_irqs; + +static void mask_8259A_irq(unsigned int irq) +{ + unsigned int mask = 1 << irq; + unsigned long flags; + + raw_spin_lock_irqsave(&i8259A_lock, flags); + cached_irq_mask |= mask; + if (irq & 8) + outb(cached_slave_mask, PIC_SLAVE_IMR); + else + outb(cached_master_mask, PIC_MASTER_IMR); + raw_spin_unlock_irqrestore(&i8259A_lock, flags); +} + +static void disable_8259A_irq(struct irq_data *data) +{ + mask_8259A_irq(data->irq); +} + +static void unmask_8259A_irq(unsigned int irq) +{ + unsigned int mask = ~(1 << irq); + unsigned long flags; + + raw_spin_lock_irqsave(&i8259A_lock, flags); + cached_irq_mask &= mask; + if (irq & 8) + outb(cached_slave_mask, PIC_SLAVE_IMR); + else + outb(cached_master_mask, PIC_MASTER_IMR); + raw_spin_unlock_irqrestore(&i8259A_lock, flags); +} + +static void enable_8259A_irq(struct irq_data *data) +{ + unmask_8259A_irq(data->irq); +} + +static int i8259A_irq_pending(unsigned int irq) +{ + unsigned int mask = 1<<irq; + unsigned long flags; + int ret; + + raw_spin_lock_irqsave(&i8259A_lock, flags); + if (irq < 8) + ret = inb(PIC_MASTER_CMD) & mask; + else + ret = inb(PIC_SLAVE_CMD) & (mask >> 8); + raw_spin_unlock_irqrestore(&i8259A_lock, flags); + + return ret; +} + +static void make_8259A_irq(unsigned int irq) +{ + disable_irq_nosync(irq); + io_apic_irqs &= ~(1<<irq); + irq_set_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq, + i8259A_chip.name); + enable_irq(irq); +} + +/* + * This function assumes to be called rarely. Switching between + * 8259A registers is slow. + * This has to be protected by the irq controller spinlock + * before being called. + */ +static inline int i8259A_irq_real(unsigned int irq) +{ + int value; + int irqmask = 1<<irq; + + if (irq < 8) { + outb(0x0B, PIC_MASTER_CMD); /* ISR register */ + value = inb(PIC_MASTER_CMD) & irqmask; + outb(0x0A, PIC_MASTER_CMD); /* back to the IRR register */ + return value; + } + outb(0x0B, PIC_SLAVE_CMD); /* ISR register */ + value = inb(PIC_SLAVE_CMD) & (irqmask >> 8); + outb(0x0A, PIC_SLAVE_CMD); /* back to the IRR register */ + return value; +} + +/* + * Careful! The 8259A is a fragile beast, it pretty + * much _has_ to be done exactly like this (mask it + * first, _then_ send the EOI, and the order of EOI + * to the two 8259s is important! + */ +static void mask_and_ack_8259A(struct irq_data *data) +{ + unsigned int irq = data->irq; + unsigned int irqmask = 1 << irq; + unsigned long flags; + + raw_spin_lock_irqsave(&i8259A_lock, flags); + /* + * Lightweight spurious IRQ detection. We do not want + * to overdo spurious IRQ handling - it's usually a sign + * of hardware problems, so we only do the checks we can + * do without slowing down good hardware unnecessarily. + * + * Note that IRQ7 and IRQ15 (the two spurious IRQs + * usually resulting from the 8259A-1|2 PICs) occur + * even if the IRQ is masked in the 8259A. Thus we + * can check spurious 8259A IRQs without doing the + * quite slow i8259A_irq_real() call for every IRQ. + * This does not cover 100% of spurious interrupts, + * but should be enough to warn the user that there + * is something bad going on ... + */ + if (cached_irq_mask & irqmask) + goto spurious_8259A_irq; + cached_irq_mask |= irqmask; + +handle_real_irq: + if (irq & 8) { + inb(PIC_SLAVE_IMR); /* DUMMY - (do we need this?) 
*/ + outb(cached_slave_mask, PIC_SLAVE_IMR); + /* 'Specific EOI' to slave */ + outb(0x60+(irq&7), PIC_SLAVE_CMD); + /* 'Specific EOI' to master-IRQ2 */ + outb(0x60+PIC_CASCADE_IR, PIC_MASTER_CMD); + } else { + inb(PIC_MASTER_IMR); /* DUMMY - (do we need this?) */ + outb(cached_master_mask, PIC_MASTER_IMR); + outb(0x60+irq, PIC_MASTER_CMD); /* 'Specific EOI to master */ + } + raw_spin_unlock_irqrestore(&i8259A_lock, flags); + return; + +spurious_8259A_irq: + /* + * this is the slow path - should happen rarely. + */ + if (i8259A_irq_real(irq)) + /* + * oops, the IRQ _is_ in service according to the + * 8259A - not spurious, go handle it. + */ + goto handle_real_irq; + + { + static int spurious_irq_mask; + /* + * At this point we can be sure the IRQ is spurious, + * lets ACK and report it. [once per IRQ] + */ + if (!(spurious_irq_mask & irqmask)) { + printk(KERN_DEBUG + "spurious 8259A interrupt: IRQ%d.\n", irq); + spurious_irq_mask |= irqmask; + } + atomic_inc(&irq_err_count); + /* + * Theoretically we do not have to handle this IRQ, + * but in Linux this does not cause problems and is + * simpler for us. + */ + goto handle_real_irq; + } +} + +struct irq_chip i8259A_chip = { + .name = "XT-PIC", + .irq_mask = disable_8259A_irq, + .irq_disable = disable_8259A_irq, + .irq_unmask = enable_8259A_irq, + .irq_mask_ack = mask_and_ack_8259A, +}; + +static char irq_trigger[2]; +/** + * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ + */ +static void restore_ELCR(char *trigger) +{ + outb(trigger[0], 0x4d0); + outb(trigger[1], 0x4d1); +} + +static void save_ELCR(char *trigger) +{ + /* IRQ 0,1,2,8,13 are marked as reserved */ + trigger[0] = inb(0x4d0) & 0xF8; + trigger[1] = inb(0x4d1) & 0xDE; +} + +static void i8259A_resume(void) +{ + init_8259A(i8259A_auto_eoi); + restore_ELCR(irq_trigger); +} + +static int i8259A_suspend(void) +{ + save_ELCR(irq_trigger); + return 0; +} + +static void i8259A_shutdown(void) +{ + /* Put the i8259A into a quiescent state that + * the kernel initialization code can get it + * out of. + */ + outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ + outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ +} + +static struct syscore_ops i8259_syscore_ops = { + .suspend = i8259A_suspend, + .resume = i8259A_resume, + .shutdown = i8259A_shutdown, +}; + +static void mask_8259A(void) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&i8259A_lock, flags); + + outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ + outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ + + raw_spin_unlock_irqrestore(&i8259A_lock, flags); +} + +static void unmask_8259A(void) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&i8259A_lock, flags); + + outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */ + outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */ + + raw_spin_unlock_irqrestore(&i8259A_lock, flags); +} + +static void init_8259A(int auto_eoi) +{ + unsigned long flags; + unsigned char probe_val = ~(1 << PIC_CASCADE_IR); + unsigned char new_val; + + i8259A_auto_eoi = auto_eoi; + + raw_spin_lock_irqsave(&i8259A_lock, flags); + + /* + * Check to see if we have a PIC. + * Mask all except the cascade and read + * back the value we just wrote. If we don't + * have a PIC, we will read 0xff as opposed to the + * value we wrote. 
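+ * probe_val is ~(1 << PIC_CASCADE_IR), i.e. everything masked except
+ * the cascade line; a missing PIC therefore reads back 0xff and
+ * fails the comparison below.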
+ */ + outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ + outb(probe_val, PIC_MASTER_IMR); + new_val = inb(PIC_MASTER_IMR); + if (new_val != probe_val) { + printk(KERN_INFO "Using NULL legacy PIC\n"); + legacy_pic = &null_legacy_pic; + raw_spin_unlock_irqrestore(&i8259A_lock, flags); + return; + } + + outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ + + /* + * outb_pic - this has to work on a wide range of PC hardware. + */ + outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */ + + /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */ + outb_pic(IRQ0_VECTOR, PIC_MASTER_IMR); + + /* 8259A-1 (the master) has a slave on IR2 */ + outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR); + + if (auto_eoi) /* master does Auto EOI */ + outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR); + else /* master expects normal EOI */ + outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR); + + outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */ + + /* ICW2: 8259A-2 IR0-7 mapped to IRQ8_VECTOR */ + outb_pic(IRQ8_VECTOR, PIC_SLAVE_IMR); + /* 8259A-2 is a slave on master's IR2 */ + outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR); + /* (slave's support for AEOI in flat mode is to be investigated) */ + outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); + + if (auto_eoi) + /* + * In AEOI mode we just have to mask the interrupt + * when acking. + */ + i8259A_chip.irq_mask_ack = disable_8259A_irq; + else + i8259A_chip.irq_mask_ack = mask_and_ack_8259A; + + udelay(100); /* wait for 8259A to initialize */ + + outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */ + outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */ + + raw_spin_unlock_irqrestore(&i8259A_lock, flags); +} + +/* + * make i8259 a driver so that we can select pic functions at run time. the goal + * is to make x86 binary compatible among pc compatible and non-pc compatible + * platforms, such as x86 MID. + */ + +static void legacy_pic_noop(void) { }; +static void legacy_pic_uint_noop(unsigned int unused) { }; +static void legacy_pic_int_noop(int unused) { }; +static int legacy_pic_irq_pending_noop(unsigned int irq) +{ + return 0; +} + +struct legacy_pic null_legacy_pic = { + .nr_legacy_irqs = 0, + .chip = &dummy_irq_chip, + .mask = legacy_pic_uint_noop, + .unmask = legacy_pic_uint_noop, + .mask_all = legacy_pic_noop, + .restore_mask = legacy_pic_noop, + .init = legacy_pic_int_noop, + .irq_pending = legacy_pic_irq_pending_noop, + .make_irq = legacy_pic_uint_noop, +}; + +struct legacy_pic default_legacy_pic = { + .nr_legacy_irqs = NR_IRQS_LEGACY, + .chip = &i8259A_chip, + .mask = mask_8259A_irq, + .unmask = unmask_8259A_irq, + .mask_all = mask_8259A, + .restore_mask = unmask_8259A, + .init = init_8259A, + .irq_pending = i8259A_irq_pending, + .make_irq = make_8259A_irq, +}; + +struct legacy_pic *legacy_pic = &default_legacy_pic; + +static int __init i8259A_init_ops(void) +{ + if (legacy_pic == &default_legacy_pic) + register_syscore_ops(&i8259_syscore_ops); + + return 0; +} + +device_initcall(i8259A_init_ops); diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c new file mode 100644 index 00000000000..a979b5bd2fc --- /dev/null +++ b/arch/x86/kernel/io_delay.c @@ -0,0 +1,131 @@ +/* + * I/O delay strategies for inb_p/outb_p + * + * Allow for a DMI based override of port 0x80, needed for certain HP laptops + * and possibly other systems. Also allow for the gradual elimination of + * outb_p/inb_p API uses. 
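+ *
+ * The strategy is selectable on the kernel command line, parsed by
+ * io_delay_param() at the bottom of this file:
+ *
+ *	io_delay=0x80|0xed|udelay|none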
+ */ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/delay.h> +#include <linux/init.h> +#include <linux/dmi.h> +#include <linux/io.h> + +int io_delay_type __read_mostly = CONFIG_DEFAULT_IO_DELAY_TYPE; + +static int __initdata io_delay_override; + +/* + * Paravirt wants native_io_delay to be a constant. + */ +void native_io_delay(void) +{ + switch (io_delay_type) { + default: + case CONFIG_IO_DELAY_TYPE_0X80: + asm volatile ("outb %al, $0x80"); + break; + case CONFIG_IO_DELAY_TYPE_0XED: + asm volatile ("outb %al, $0xed"); + break; + case CONFIG_IO_DELAY_TYPE_UDELAY: + /* + * 2 usecs is an upper-bound for the outb delay but + * note that udelay doesn't have the bus-level + * side-effects that outb does, nor does udelay() have + * precise timings during very early bootup (the delays + * are shorter until calibrated): + */ + udelay(2); + case CONFIG_IO_DELAY_TYPE_NONE: + break; + } +} +EXPORT_SYMBOL(native_io_delay); + +static int __init dmi_io_delay_0xed_port(const struct dmi_system_id *id) +{ + if (io_delay_type == CONFIG_IO_DELAY_TYPE_0X80) { + pr_notice("%s: using 0xed I/O delay port\n", id->ident); + io_delay_type = CONFIG_IO_DELAY_TYPE_0XED; + } + + return 0; +} + +/* + * Quirk table for systems that misbehave (lock up, etc.) if port + * 0x80 is used: + */ +static struct dmi_system_id __initdata io_delay_0xed_port_dmi_table[] = { + { + .callback = dmi_io_delay_0xed_port, + .ident = "Compaq Presario V6000", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), + DMI_MATCH(DMI_BOARD_NAME, "30B7") + } + }, + { + .callback = dmi_io_delay_0xed_port, + .ident = "HP Pavilion dv9000z", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), + DMI_MATCH(DMI_BOARD_NAME, "30B9") + } + }, + { + .callback = dmi_io_delay_0xed_port, + .ident = "HP Pavilion dv6000", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), + DMI_MATCH(DMI_BOARD_NAME, "30B8") + } + }, + { + .callback = dmi_io_delay_0xed_port, + .ident = "HP Pavilion tx1000", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), + DMI_MATCH(DMI_BOARD_NAME, "30BF") + } + }, + { + .callback = dmi_io_delay_0xed_port, + .ident = "Presario F700", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), + DMI_MATCH(DMI_BOARD_NAME, "30D3") + } + }, + { } +}; + +void __init io_delay_init(void) +{ + if (!io_delay_override) + dmi_check_system(io_delay_0xed_port_dmi_table); +} + +static int __init io_delay_param(char *s) +{ + if (!s) + return -EINVAL; + + if (!strcmp(s, "0x80")) + io_delay_type = CONFIG_IO_DELAY_TYPE_0X80; + else if (!strcmp(s, "0xed")) + io_delay_type = CONFIG_IO_DELAY_TYPE_0XED; + else if (!strcmp(s, "udelay")) + io_delay_type = CONFIG_IO_DELAY_TYPE_UDELAY; + else if (!strcmp(s, "none")) + io_delay_type = CONFIG_IO_DELAY_TYPE_NONE; + else + return -EINVAL; + + io_delay_override = 1; + return 0; +} + +early_param("io_delay", io_delay_param); diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c new file mode 100644 index 00000000000..4ddaf66ea35 --- /dev/null +++ b/arch/x86/kernel/ioport.c @@ -0,0 +1,114 @@ +/* + * This contains the io-permission bitmap code - written by obz, with changes + * by Linus. 32/64 bits code unification by Miguel Botón. 
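+ *
+ * Userspace sketch (illustrative) of the matching glibc wrapper,
+ * needing CAP_SYS_RAWIO; <sys/io.h> supplies ioperm() and outb():
+ *
+ *	ioperm(0x378, 1, 1);
+ *	outb(0xff, 0x378);
+ *	ioperm(0x378, 1, 0);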
+ */ + +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/capability.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/ioport.h> +#include <linux/smp.h> +#include <linux/stddef.h> +#include <linux/slab.h> +#include <linux/thread_info.h> +#include <linux/syscalls.h> +#include <linux/bitmap.h> +#include <asm/syscalls.h> + +/* + * this changes the io permissions bitmap in the current task. + */ +asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) +{ + struct thread_struct *t = ¤t->thread; + struct tss_struct *tss; + unsigned int i, max_long, bytes, bytes_updated; + + if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) + return -EINVAL; + if (turn_on && !capable(CAP_SYS_RAWIO)) + return -EPERM; + + /* + * If it's the first ioperm() call in this thread's lifetime, set the + * IO bitmap up. ioperm() is much less timing critical than clone(), + * this is why we delay this operation until now: + */ + if (!t->io_bitmap_ptr) { + unsigned long *bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); + + if (!bitmap) + return -ENOMEM; + + memset(bitmap, 0xff, IO_BITMAP_BYTES); + t->io_bitmap_ptr = bitmap; + set_thread_flag(TIF_IO_BITMAP); + } + + /* + * do it in the per-thread copy and in the TSS ... + * + * Disable preemption via get_cpu() - we must not switch away + * because the ->io_bitmap_max value must match the bitmap + * contents: + */ + tss = &per_cpu(init_tss, get_cpu()); + + if (turn_on) + bitmap_clear(t->io_bitmap_ptr, from, num); + else + bitmap_set(t->io_bitmap_ptr, from, num); + + /* + * Search for a (possibly new) maximum. This is simple and stupid, + * to keep it obviously correct: + */ + max_long = 0; + for (i = 0; i < IO_BITMAP_LONGS; i++) + if (t->io_bitmap_ptr[i] != ~0UL) + max_long = i; + + bytes = (max_long + 1) * sizeof(unsigned long); + bytes_updated = max(bytes, t->io_bitmap_max); + + t->io_bitmap_max = bytes; + + /* Update the TSS: */ + memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated); + + put_cpu(); + + return 0; +} + +/* + * sys_iopl has to be used when you want to access the IO ports + * beyond the 0x3ff range: to get the full 65536 ports bitmapped + * you'd need 8kB of bitmaps/process, which is a bit excessive. + * + * Here we just change the flags value on the stack: we allow + * only the super-user to do it. This depends on the stack-layout + * on system-call entry - see also fork() and the signal handling + * code. + */ +SYSCALL_DEFINE1(iopl, unsigned int, level) +{ + struct pt_regs *regs = current_pt_regs(); + unsigned int old = (regs->flags >> 12) & 3; + struct thread_struct *t = ¤t->thread; + + if (level > 3) + return -EINVAL; + /* Trying to gain more privileges? */ + if (level > old) { + if (!capable(CAP_SYS_RAWIO)) + return -EPERM; + } + regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) | (level << 12); + t->iopl = level << 12; + set_iopl_mask(t->iopl); + + return 0; +} diff --git a/arch/x86/kernel/iosf_mbi.c b/arch/x86/kernel/iosf_mbi.c new file mode 100644 index 00000000000..d30acdc1229 --- /dev/null +++ b/arch/x86/kernel/iosf_mbi.c @@ -0,0 +1,237 @@ +/* + * IOSF-SB MailBox Interface Driver + * Copyright (c) 2013, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * + * The IOSF-SB is a fabric bus available on Atom based SOC's that uses a + * mailbox interface (MBI) to communicate with mutiple devices. This + * driver implements access to this interface for those platforms that can + * enumerate the device using PCI. + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/spinlock.h> +#include <linux/pci.h> + +#include <asm/iosf_mbi.h> + +#define PCI_DEVICE_ID_BAYTRAIL 0x0F00 +#define PCI_DEVICE_ID_QUARK_X1000 0x0958 + +static DEFINE_SPINLOCK(iosf_mbi_lock); + +static inline u32 iosf_mbi_form_mcr(u8 op, u8 port, u8 offset) +{ + return (op << 24) | (port << 16) | (offset << 8) | MBI_ENABLE; +} + +static struct pci_dev *mbi_pdev; /* one mbi device */ + +static int iosf_mbi_pci_read_mdr(u32 mcrx, u32 mcr, u32 *mdr) +{ + int result; + + if (!mbi_pdev) + return -ENODEV; + + if (mcrx) { + result = pci_write_config_dword(mbi_pdev, MBI_MCRX_OFFSET, + mcrx); + if (result < 0) + goto fail_read; + } + + result = pci_write_config_dword(mbi_pdev, MBI_MCR_OFFSET, mcr); + if (result < 0) + goto fail_read; + + result = pci_read_config_dword(mbi_pdev, MBI_MDR_OFFSET, mdr); + if (result < 0) + goto fail_read; + + return 0; + +fail_read: + dev_err(&mbi_pdev->dev, "PCI config access failed with %d\n", result); + return result; +} + +static int iosf_mbi_pci_write_mdr(u32 mcrx, u32 mcr, u32 mdr) +{ + int result; + + if (!mbi_pdev) + return -ENODEV; + + result = pci_write_config_dword(mbi_pdev, MBI_MDR_OFFSET, mdr); + if (result < 0) + goto fail_write; + + if (mcrx) { + result = pci_write_config_dword(mbi_pdev, MBI_MCRX_OFFSET, + mcrx); + if (result < 0) + goto fail_write; + } + + result = pci_write_config_dword(mbi_pdev, MBI_MCR_OFFSET, mcr); + if (result < 0) + goto fail_write; + + return 0; + +fail_write: + dev_err(&mbi_pdev->dev, "PCI config access failed with %d\n", result); + return result; +} + +int iosf_mbi_read(u8 port, u8 opcode, u32 offset, u32 *mdr) +{ + u32 mcr, mcrx; + unsigned long flags; + int ret; + + /*Access to the GFX unit is handled by GPU code */ + if (port == BT_MBI_UNIT_GFX) { + WARN_ON(1); + return -EPERM; + } + + mcr = iosf_mbi_form_mcr(opcode, port, offset & MBI_MASK_LO); + mcrx = offset & MBI_MASK_HI; + + spin_lock_irqsave(&iosf_mbi_lock, flags); + ret = iosf_mbi_pci_read_mdr(mcrx, mcr, mdr); + spin_unlock_irqrestore(&iosf_mbi_lock, flags); + + return ret; +} +EXPORT_SYMBOL(iosf_mbi_read); + +int iosf_mbi_write(u8 port, u8 opcode, u32 offset, u32 mdr) +{ + u32 mcr, mcrx; + unsigned long flags; + int ret; + + /*Access to the GFX unit is handled by GPU code */ + if (port == BT_MBI_UNIT_GFX) { + WARN_ON(1); + return -EPERM; + } + + mcr = iosf_mbi_form_mcr(opcode, port, offset & MBI_MASK_LO); + mcrx = offset & MBI_MASK_HI; + + spin_lock_irqsave(&iosf_mbi_lock, flags); + ret = iosf_mbi_pci_write_mdr(mcrx, mcr, mdr); + spin_unlock_irqrestore(&iosf_mbi_lock, flags); + + return ret; +} +EXPORT_SYMBOL(iosf_mbi_write); + +int iosf_mbi_modify(u8 port, u8 opcode, u32 offset, u32 mdr, u32 mask) +{ + u32 mcr, mcrx; + u32 value; + unsigned long flags; + int ret; + + /*Access to the GFX unit is handled by GPU code */ + if (port == BT_MBI_UNIT_GFX) { + WARN_ON(1); + return -EPERM; + } + + mcr = iosf_mbi_form_mcr(opcode, port, offset & MBI_MASK_LO); + mcrx = offset & MBI_MASK_HI; + + 
spin_lock_irqsave(&iosf_mbi_lock, flags); + + /* Read current mdr value */ + ret = iosf_mbi_pci_read_mdr(mcrx, mcr & MBI_RD_MASK, &value); + if (ret < 0) { + spin_unlock_irqrestore(&iosf_mbi_lock, flags); + return ret; + } + + /* Apply mask */ + value &= ~mask; + mdr &= mask; + value |= mdr; + + /* Write back */ + ret = iosf_mbi_pci_write_mdr(mcrx, mcr | MBI_WR_MASK, value); + + spin_unlock_irqrestore(&iosf_mbi_lock, flags); + + return ret; +} +EXPORT_SYMBOL(iosf_mbi_modify); + +bool iosf_mbi_available(void) +{ + /* Mbi isn't hot-pluggable. No remove routine is provided */ + return mbi_pdev; +} +EXPORT_SYMBOL(iosf_mbi_available); + +static int iosf_mbi_probe(struct pci_dev *pdev, + const struct pci_device_id *unused) +{ + int ret; + + ret = pci_enable_device(pdev); + if (ret < 0) { + dev_err(&pdev->dev, "error: could not enable device\n"); + return ret; + } + + mbi_pdev = pci_dev_get(pdev); + return 0; +} + +static DEFINE_PCI_DEVICE_TABLE(iosf_mbi_pci_ids) = { + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_BAYTRAIL) }, + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_QUARK_X1000) }, + { 0, }, +}; +MODULE_DEVICE_TABLE(pci, iosf_mbi_pci_ids); + +static struct pci_driver iosf_mbi_pci_driver = { + .name = "iosf_mbi_pci", + .probe = iosf_mbi_probe, + .id_table = iosf_mbi_pci_ids, +}; + +static int __init iosf_mbi_init(void) +{ + return pci_register_driver(&iosf_mbi_pci_driver); +} + +static void __exit iosf_mbi_exit(void) +{ + pci_unregister_driver(&iosf_mbi_pci_driver); + if (mbi_pdev) { + pci_dev_put(mbi_pdev); + mbi_pdev = NULL; + } +} + +module_init(iosf_mbi_init); +module_exit(iosf_mbi_exit); + +MODULE_AUTHOR("David E. Box <david.e.box@linux.intel.com>"); +MODULE_DESCRIPTION("IOSF Mailbox Interface accessor"); +MODULE_LICENSE("GPL v2"); diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c new file mode 100644 index 00000000000..922d2858102 --- /dev/null +++ b/arch/x86/kernel/irq.c @@ -0,0 +1,468 @@ +/* + * Common interrupt code for 32 and 64 bit + */ +#include <linux/cpu.h> +#include <linux/interrupt.h> +#include <linux/kernel_stat.h> +#include <linux/of.h> +#include <linux/seq_file.h> +#include <linux/smp.h> +#include <linux/ftrace.h> +#include <linux/delay.h> +#include <linux/export.h> + +#include <asm/apic.h> +#include <asm/io_apic.h> +#include <asm/irq.h> +#include <asm/idle.h> +#include <asm/mce.h> +#include <asm/hw_irq.h> +#include <asm/desc.h> + +#define CREATE_TRACE_POINTS +#include <asm/trace/irq_vectors.h> + +atomic_t irq_err_count; + +/* Function pointer for generic interrupt vector handling */ +void (*x86_platform_ipi_callback)(void) = NULL; + +/* + * 'what should we do if we get a hw irq event on an illegal vector'. + * each architecture has to answer this themselves. + */ +void ack_bad_irq(unsigned int irq) +{ + if (printk_ratelimit()) + pr_err("unexpected IRQ trap at vector %02x\n", irq); + + /* + * Currently unexpected vectors happen only on SMP and APIC. + * We _must_ ack these because every local APIC has only N + * irq slots per priority level, and a 'hanging, unacked' IRQ + * holds up an irq slot - in excessive cases (when multiple + * unexpected vectors occur) that might lock up the APIC + * completely. 
+ * But only ack when the APIC is enabled -AK + */ + ack_APIC_irq(); +} + +#define irq_stats(x) (&per_cpu(irq_stat, x)) +/* + * /proc/interrupts printing for arch specific interrupts + */ +int arch_show_interrupts(struct seq_file *p, int prec) +{ + int j; + + seq_printf(p, "%*s: ", prec, "NMI"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->__nmi_count); + seq_printf(p, " Non-maskable interrupts\n"); +#ifdef CONFIG_X86_LOCAL_APIC + seq_printf(p, "%*s: ", prec, "LOC"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs); + seq_printf(p, " Local timer interrupts\n"); + + seq_printf(p, "%*s: ", prec, "SPU"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count); + seq_printf(p, " Spurious interrupts\n"); + seq_printf(p, "%*s: ", prec, "PMI"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); + seq_printf(p, " Performance monitoring interrupts\n"); + seq_printf(p, "%*s: ", prec, "IWI"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->apic_irq_work_irqs); + seq_printf(p, " IRQ work interrupts\n"); + seq_printf(p, "%*s: ", prec, "RTR"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->icr_read_retry_count); + seq_printf(p, " APIC ICR read retries\n"); +#endif + if (x86_platform_ipi_callback) { + seq_printf(p, "%*s: ", prec, "PLT"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->x86_platform_ipis); + seq_printf(p, " Platform interrupts\n"); + } +#ifdef CONFIG_SMP + seq_printf(p, "%*s: ", prec, "RES"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_resched_count); + seq_printf(p, " Rescheduling interrupts\n"); + seq_printf(p, "%*s: ", prec, "CAL"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_call_count - + irq_stats(j)->irq_tlb_count); + seq_printf(p, " Function call interrupts\n"); + seq_printf(p, "%*s: ", prec, "TLB"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count); + seq_printf(p, " TLB shootdowns\n"); +#endif +#ifdef CONFIG_X86_THERMAL_VECTOR + seq_printf(p, "%*s: ", prec, "TRM"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count); + seq_printf(p, " Thermal event interrupts\n"); +#endif +#ifdef CONFIG_X86_MCE_THRESHOLD + seq_printf(p, "%*s: ", prec, "THR"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); + seq_printf(p, " Threshold APIC interrupts\n"); +#endif +#ifdef CONFIG_X86_MCE + seq_printf(p, "%*s: ", prec, "MCE"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", per_cpu(mce_exception_count, j)); + seq_printf(p, " Machine check exceptions\n"); + seq_printf(p, "%*s: ", prec, "MCP"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", per_cpu(mce_poll_count, j)); + seq_printf(p, " Machine check polls\n"); +#endif +#if IS_ENABLED(CONFIG_HYPERV) || defined(CONFIG_XEN) + seq_printf(p, "%*s: ", prec, "THR"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_hv_callback_count); + seq_printf(p, " Hypervisor callback interrupts\n"); +#endif + seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count)); +#if defined(CONFIG_X86_IO_APIC) + seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count)); +#endif + return 0; +} + +/* + * /proc/stat helpers + */ +u64 arch_irq_stat_cpu(unsigned int cpu) +{ + u64 sum = irq_stats(cpu)->__nmi_count; + +#ifdef CONFIG_X86_LOCAL_APIC + sum += irq_stats(cpu)->apic_timer_irqs; + sum += irq_stats(cpu)->irq_spurious_count; + sum += 
irq_stats(cpu)->apic_perf_irqs; + sum += irq_stats(cpu)->apic_irq_work_irqs; + sum += irq_stats(cpu)->icr_read_retry_count; +#endif + if (x86_platform_ipi_callback) + sum += irq_stats(cpu)->x86_platform_ipis; +#ifdef CONFIG_SMP + sum += irq_stats(cpu)->irq_resched_count; + sum += irq_stats(cpu)->irq_call_count; +#endif +#ifdef CONFIG_X86_THERMAL_VECTOR + sum += irq_stats(cpu)->irq_thermal_count; +#endif +#ifdef CONFIG_X86_MCE_THRESHOLD + sum += irq_stats(cpu)->irq_threshold_count; +#endif +#ifdef CONFIG_X86_MCE + sum += per_cpu(mce_exception_count, cpu); + sum += per_cpu(mce_poll_count, cpu); +#endif + return sum; +} + +u64 arch_irq_stat(void) +{ + u64 sum = atomic_read(&irq_err_count); + return sum; +} + + +/* + * do_IRQ handles all normal device IRQ's (the special + * SMP cross-CPU interrupts have their own specific + * handlers). + */ +__visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + /* high bit used in ret_from_ code */ + unsigned vector = ~regs->orig_ax; + unsigned irq; + + irq_enter(); + exit_idle(); + + irq = __this_cpu_read(vector_irq[vector]); + + if (!handle_irq(irq, regs)) { + ack_APIC_irq(); + + if (irq != VECTOR_RETRIGGERED) { + pr_emerg_ratelimited("%s: %d.%d No irq handler for vector (irq %d)\n", + __func__, smp_processor_id(), + vector, irq); + } else { + __this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED); + } + } + + irq_exit(); + + set_irq_regs(old_regs); + return 1; +} + +/* + * Handler for X86_PLATFORM_IPI_VECTOR. + */ +void __smp_x86_platform_ipi(void) +{ + inc_irq_stat(x86_platform_ipis); + + if (x86_platform_ipi_callback) + x86_platform_ipi_callback(); +} + +__visible void smp_x86_platform_ipi(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + entering_ack_irq(); + __smp_x86_platform_ipi(); + exiting_irq(); + set_irq_regs(old_regs); +} + +#ifdef CONFIG_HAVE_KVM +/* + * Handler for POSTED_INTERRUPT_VECTOR. + */ +__visible void smp_kvm_posted_intr_ipi(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + ack_APIC_irq(); + + irq_enter(); + + exit_idle(); + + inc_irq_stat(kvm_posted_intr_ipis); + + irq_exit(); + + set_irq_regs(old_regs); +} +#endif + +__visible void smp_trace_x86_platform_ipi(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + entering_ack_irq(); + trace_x86_platform_ipi_entry(X86_PLATFORM_IPI_VECTOR); + __smp_x86_platform_ipi(); + trace_x86_platform_ipi_exit(X86_PLATFORM_IPI_VECTOR); + exiting_irq(); + set_irq_regs(old_regs); +} + +EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); + +#ifdef CONFIG_HOTPLUG_CPU + +/* These two declarations are only used in check_irq_vectors_for_cpu_disable() + * below, which is protected by stop_machine(). Putting them on the stack + * results in a stack frame overflow. Dynamically allocating could result in a + * failure so declare these two cpumasks as global. + */ +static struct cpumask affinity_new, online_new; + +/* + * This cpu is going to be removed and its vectors migrated to the remaining + * online cpus. Check to see if there are enough vectors in the remaining cpus. + * This function is protected by stop_machine(). 
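+ *
+ * A rough, illustrative run of the arithmetic below: if this cpu is
+ * the last valid target for 3 irqs (this_count == 3) while the
+ * surviving cpus have only 2 free, unreserved vectors between them
+ * (count == 2), count < this_count and the hot-remove is refused
+ * with -ERANGE.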
+ */ +int check_irq_vectors_for_cpu_disable(void) +{ + int irq, cpu; + unsigned int this_cpu, vector, this_count, count; + struct irq_desc *desc; + struct irq_data *data; + + this_cpu = smp_processor_id(); + cpumask_copy(&online_new, cpu_online_mask); + cpu_clear(this_cpu, online_new); + + this_count = 0; + for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { + irq = __this_cpu_read(vector_irq[vector]); + if (irq >= 0) { + desc = irq_to_desc(irq); + data = irq_desc_get_irq_data(desc); + cpumask_copy(&affinity_new, data->affinity); + cpu_clear(this_cpu, affinity_new); + + /* Do not count inactive or per-cpu irqs. */ + if (!irq_has_action(irq) || irqd_is_per_cpu(data)) + continue; + + /* + * A single irq may be mapped to multiple + * cpu's vector_irq[] (for example IOAPIC cluster + * mode). In this case we have two + * possibilities: + * + * 1) the resulting affinity mask is empty; that is + * this the down'd cpu is the last cpu in the irq's + * affinity mask, or + * + * 2) the resulting affinity mask is no longer + * a subset of the online cpus but the affinity + * mask is not zero; that is the down'd cpu is the + * last online cpu in a user set affinity mask. + */ + if (cpumask_empty(&affinity_new) || + !cpumask_subset(&affinity_new, &online_new)) + this_count++; + } + } + + count = 0; + for_each_online_cpu(cpu) { + if (cpu == this_cpu) + continue; + /* + * We scan from FIRST_EXTERNAL_VECTOR to first system + * vector. If the vector is marked in the used vectors + * bitmap or an irq is assigned to it, we don't count + * it as available. + */ + for (vector = FIRST_EXTERNAL_VECTOR; + vector < first_system_vector; vector++) { + if (!test_bit(vector, used_vectors) && + per_cpu(vector_irq, cpu)[vector] < 0) + count++; + } + } + + if (count < this_count) { + pr_warn("CPU %d disable failed: CPU has %u vectors assigned and there are only %u available.\n", + this_cpu, this_count, count); + return -ERANGE; + } + return 0; +} + +/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */ +void fixup_irqs(void) +{ + unsigned int irq, vector; + static int warned; + struct irq_desc *desc; + struct irq_data *data; + struct irq_chip *chip; + int ret; + + for_each_irq_desc(irq, desc) { + int break_affinity = 0; + int set_affinity = 1; + const struct cpumask *affinity; + + if (!desc) + continue; + if (irq == 2) + continue; + + /* interrupt's are disabled at this point */ + raw_spin_lock(&desc->lock); + + data = irq_desc_get_irq_data(desc); + affinity = data->affinity; + if (!irq_has_action(irq) || irqd_is_per_cpu(data) || + cpumask_subset(affinity, cpu_online_mask)) { + raw_spin_unlock(&desc->lock); + continue; + } + + /* + * Complete the irq move. This cpu is going down and for + * non intr-remapping case, we can't wait till this interrupt + * arrives at this cpu before completing the irq move. + */ + irq_force_complete_move(irq); + + if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { + break_affinity = 1; + affinity = cpu_online_mask; + } + + chip = irq_data_get_irq_chip(data); + if (!irqd_can_move_in_process_context(data) && chip->irq_mask) + chip->irq_mask(data); + + if (chip->irq_set_affinity) { + ret = chip->irq_set_affinity(data, affinity, true); + if (ret == -ENOSPC) + pr_crit("IRQ %d set affinity failed because there are no available vectors. The device assigned to this IRQ is unstable.\n", irq); + } else { + if (!(warned++)) + set_affinity = 0; + } + + /* + * We unmask if the irq was not marked masked by the + * core code. 
That respects the lazy irq disable
+	 * behaviour.
+	 */
+	if (!irqd_can_move_in_process_context(data) &&
+	    !irqd_irq_masked(data) && chip->irq_unmask)
+		chip->irq_unmask(data);
+
+	raw_spin_unlock(&desc->lock);
+
+	if (break_affinity && set_affinity)
+		pr_notice("Broke affinity for irq %i\n", irq);
+	else if (!set_affinity)
+		pr_notice("Cannot set affinity for irq %i\n", irq);
+	}
+
+	/*
+	 * We can remove mdelay() and then send spurious interrupts to
+	 * new cpu targets for all the irqs that were handled previously by
+	 * this cpu. While it works, I have seen spurious interrupt messages
+	 * (nothing wrong but still...).
+	 *
+	 * So for now, retain mdelay(1) and check the IRR and then send those
+	 * interrupts to new targets as this cpu is already offlined...
+	 */
+	mdelay(1);
+
+	for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
+		unsigned int irr;
+
+		if (__this_cpu_read(vector_irq[vector]) <= VECTOR_UNDEFINED)
+			continue;
+
+		irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
+		if (irr & (1 << (vector % 32))) {
+			irq = __this_cpu_read(vector_irq[vector]);
+
+			desc = irq_to_desc(irq);
+			data = irq_desc_get_irq_data(desc);
+			chip = irq_data_get_irq_chip(data);
+			raw_spin_lock(&desc->lock);
+			if (chip->irq_retrigger) {
+				chip->irq_retrigger(data);
+				__this_cpu_write(vector_irq[vector], VECTOR_RETRIGGERED);
+			}
+			raw_spin_unlock(&desc->lock);
+		}
+		if (__this_cpu_read(vector_irq[vector]) != VECTOR_RETRIGGERED)
+			__this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED);
+	}
+}
+#endif
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
new file mode 100644
index 00000000000..63ce838e5a5
--- /dev/null
+++ b/arch/x86/kernel/irq_32.c
@@ -0,0 +1,182 @@
+/*
+ * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar
+ *
+ * This file contains the lowest level x86-specific interrupt
+ * entry, irq-stacks and irq statistics code. All the remaining
+ * irq logic is done by the generic kernel/irq/ code and
+ * by the x86-specific irq controller code. (e.g. i8259.c and
+ * io_apic.c.)
+ */
+
+#include <linux/module.h>
+#include <linux/seq_file.h>
+#include <linux/interrupt.h>
+#include <linux/kernel_stat.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/delay.h>
+#include <linux/uaccess.h>
+#include <linux/percpu.h>
+#include <linux/mm.h>
+
+#include <asm/apic.h>
+
+DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
+EXPORT_PER_CPU_SYMBOL(irq_stat);
+
+DEFINE_PER_CPU(struct pt_regs *, irq_regs);
+EXPORT_PER_CPU_SYMBOL(irq_regs);
+
+#ifdef CONFIG_DEBUG_STACKOVERFLOW
+
+int sysctl_panic_on_stackoverflow __read_mostly;
+
+/* Debugging check for stack overflow: is there less than 1KB free?
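+ *
+ * The asm below masks %esp with (THREAD_SIZE - 1), i.e. it computes
+ * the offset of the stack pointer within its THREAD_SIZE-aligned
+ * stack region; e.g. with THREAD_SIZE == 8192, an %esp of 0xc12e1f40
+ * yields offset 0x1f40. Overflow is flagged once that offset drops
+ * below sizeof(struct thread_info) + STACK_WARN.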
*/ +static int check_stack_overflow(void) +{ + long sp; + + __asm__ __volatile__("andl %%esp,%0" : + "=r" (sp) : "0" (THREAD_SIZE - 1)); + + return sp < (sizeof(struct thread_info) + STACK_WARN); +} + +static void print_stack_overflow(void) +{ + printk(KERN_WARNING "low stack detected by irq handler\n"); + dump_stack(); + if (sysctl_panic_on_stackoverflow) + panic("low stack detected by irq handler - check messages\n"); +} + +#else +static inline int check_stack_overflow(void) { return 0; } +static inline void print_stack_overflow(void) { } +#endif + +DEFINE_PER_CPU(struct irq_stack *, hardirq_stack); +DEFINE_PER_CPU(struct irq_stack *, softirq_stack); + +static void call_on_stack(void *func, void *stack) +{ + asm volatile("xchgl %%ebx,%%esp \n" + "call *%%edi \n" + "movl %%ebx,%%esp \n" + : "=b" (stack) + : "0" (stack), + "D"(func) + : "memory", "cc", "edx", "ecx", "eax"); +} + +/* how to get the current stack pointer from C */ +#define current_stack_pointer ({ \ + unsigned long sp; \ + asm("mov %%esp,%0" : "=g" (sp)); \ + sp; \ +}) + +static inline void *current_stack(void) +{ + return (void *)(current_stack_pointer & ~(THREAD_SIZE - 1)); +} + +static inline int +execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) +{ + struct irq_stack *curstk, *irqstk; + u32 *isp, *prev_esp, arg1, arg2; + + curstk = (struct irq_stack *) current_stack(); + irqstk = __this_cpu_read(hardirq_stack); + + /* + * this is where we switch to the IRQ stack. However, if we are + * already using the IRQ stack (because we interrupted a hardirq + * handler) we can't do that and just have to keep using the + * current stack (which is the irq stack already after all) + */ + if (unlikely(curstk == irqstk)) + return 0; + + isp = (u32 *) ((char *)irqstk + sizeof(*irqstk)); + + /* Save the next esp at the bottom of the stack */ + prev_esp = (u32 *)irqstk; + *prev_esp = current_stack_pointer; + + if (unlikely(overflow)) + call_on_stack(print_stack_overflow, isp); + + asm volatile("xchgl %%ebx,%%esp \n" + "call *%%edi \n" + "movl %%ebx,%%esp \n" + : "=a" (arg1), "=d" (arg2), "=b" (isp) + : "0" (irq), "1" (desc), "2" (isp), + "D" (desc->handle_irq) + : "memory", "cc", "ecx"); + return 1; +} + +/* + * allocate per-cpu stacks for hardirq and for softirq processing + */ +void irq_ctx_init(int cpu) +{ + struct irq_stack *irqstk; + + if (per_cpu(hardirq_stack, cpu)) + return; + + irqstk = page_address(alloc_pages_node(cpu_to_node(cpu), + THREADINFO_GFP, + THREAD_SIZE_ORDER)); + per_cpu(hardirq_stack, cpu) = irqstk; + + irqstk = page_address(alloc_pages_node(cpu_to_node(cpu), + THREADINFO_GFP, + THREAD_SIZE_ORDER)); + per_cpu(softirq_stack, cpu) = irqstk; + + printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n", + cpu, per_cpu(hardirq_stack, cpu), per_cpu(softirq_stack, cpu)); +} + +void do_softirq_own_stack(void) +{ + struct thread_info *curstk; + struct irq_stack *irqstk; + u32 *isp, *prev_esp; + + curstk = current_stack(); + irqstk = __this_cpu_read(softirq_stack); + + /* build the stack frame on the softirq stack */ + isp = (u32 *) ((char *)irqstk + sizeof(*irqstk)); + + /* Push the previous esp onto the stack */ + prev_esp = (u32 *)irqstk; + *prev_esp = current_stack_pointer; + + call_on_stack(__do_softirq, isp); +} + +bool handle_irq(unsigned irq, struct pt_regs *regs) +{ + struct irq_desc *desc; + int overflow; + + overflow = check_stack_overflow(); + + desc = irq_to_desc(irq); + if (unlikely(!desc)) + return false; + + if (user_mode_vm(regs) || !execute_on_irq_stack(overflow, desc, irq)) { + if 
(unlikely(overflow)) + print_stack_overflow(); + desc->handle_irq(irq, desc); + } + + return true; +} diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c new file mode 100644 index 00000000000..4d1c746892e --- /dev/null +++ b/arch/x86/kernel/irq_64.c @@ -0,0 +1,89 @@ +/* + * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar + * + * This file contains the lowest level x86_64-specific interrupt + * entry and irq statistics code. All the remaining irq logic is + * done by the generic kernel/irq/ code and in the + * x86_64-specific irq controller code. (e.g. i8259.c and + * io_apic.c.) + */ + +#include <linux/kernel_stat.h> +#include <linux/interrupt.h> +#include <linux/seq_file.h> +#include <linux/module.h> +#include <linux/delay.h> +#include <linux/ftrace.h> +#include <linux/uaccess.h> +#include <linux/smp.h> +#include <asm/io_apic.h> +#include <asm/idle.h> +#include <asm/apic.h> + +DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); +EXPORT_PER_CPU_SYMBOL(irq_stat); + +DEFINE_PER_CPU(struct pt_regs *, irq_regs); +EXPORT_PER_CPU_SYMBOL(irq_regs); + +int sysctl_panic_on_stackoverflow; + +/* + * Probabilistic stack overflow check: + * + * Only check the stack in process context, because everything else + * runs on the big interrupt stacks. Checking reliably is too expensive, + * so we just check from interrupts. + */ +static inline void stack_overflow_check(struct pt_regs *regs) +{ +#ifdef CONFIG_DEBUG_STACKOVERFLOW +#define STACK_TOP_MARGIN 128 + struct orig_ist *oist; + u64 irq_stack_top, irq_stack_bottom; + u64 estack_top, estack_bottom; + u64 curbase = (u64)task_stack_page(current); + + if (user_mode_vm(regs)) + return; + + if (regs->sp >= curbase + sizeof(struct thread_info) + + sizeof(struct pt_regs) + STACK_TOP_MARGIN && + regs->sp <= curbase + THREAD_SIZE) + return; + + irq_stack_top = (u64)__get_cpu_var(irq_stack_union.irq_stack) + + STACK_TOP_MARGIN; + irq_stack_bottom = (u64)__get_cpu_var(irq_stack_ptr); + if (regs->sp >= irq_stack_top && regs->sp <= irq_stack_bottom) + return; + + oist = &__get_cpu_var(orig_ist); + estack_top = (u64)oist->ist[0] - EXCEPTION_STKSZ + STACK_TOP_MARGIN; + estack_bottom = (u64)oist->ist[N_EXCEPTION_STACKS - 1]; + if (regs->sp >= estack_top && regs->sp <= estack_bottom) + return; + + WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx)\n", + current->comm, curbase, regs->sp, + irq_stack_top, irq_stack_bottom, + estack_top, estack_bottom); + + if (sysctl_panic_on_stackoverflow) + panic("low stack detected by irq handler - check messages\n"); +#endif +} + +bool handle_irq(unsigned irq, struct pt_regs *regs) +{ + struct irq_desc *desc; + + stack_overflow_check(regs); + + desc = irq_to_desc(irq); + if (unlikely(!desc)) + return false; + + generic_handle_irq_desc(irq, desc); + return true; +} diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c new file mode 100644 index 00000000000..1de84e3ab4e --- /dev/null +++ b/arch/x86/kernel/irq_work.c @@ -0,0 +1,50 @@ +/* + * x86 specific code for irq_work + * + * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + */ + +#include <linux/kernel.h> +#include <linux/irq_work.h> +#include <linux/hardirq.h> +#include <asm/apic.h> +#include <asm/trace/irq_vectors.h> + +static inline void irq_work_entering_irq(void) +{ + irq_enter(); + ack_APIC_irq(); +} + +static inline void __smp_irq_work_interrupt(void) +{ + inc_irq_stat(apic_irq_work_irqs); + irq_work_run(); +} + +__visible void 
smp_irq_work_interrupt(struct pt_regs *regs) +{ + irq_work_entering_irq(); + __smp_irq_work_interrupt(); + exiting_irq(); +} + +__visible void smp_trace_irq_work_interrupt(struct pt_regs *regs) +{ + irq_work_entering_irq(); + trace_irq_work_entry(IRQ_WORK_VECTOR); + __smp_irq_work_interrupt(); + trace_irq_work_exit(IRQ_WORK_VECTOR); + exiting_irq(); +} + +void arch_irq_work_raise(void) +{ +#ifdef CONFIG_X86_LOCAL_APIC + if (!cpu_has_apic) + return; + + apic->send_IPI_self(IRQ_WORK_VECTOR); + apic_wait_icr_idle(); +#endif +} diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c new file mode 100644 index 00000000000..7f50156542f --- /dev/null +++ b/arch/x86/kernel/irqinit.c @@ -0,0 +1,218 @@ +#include <linux/linkage.h> +#include <linux/errno.h> +#include <linux/signal.h> +#include <linux/sched.h> +#include <linux/ioport.h> +#include <linux/interrupt.h> +#include <linux/timex.h> +#include <linux/random.h> +#include <linux/kprobes.h> +#include <linux/init.h> +#include <linux/kernel_stat.h> +#include <linux/device.h> +#include <linux/bitops.h> +#include <linux/acpi.h> +#include <linux/io.h> +#include <linux/delay.h> + +#include <linux/atomic.h> +#include <asm/timer.h> +#include <asm/hw_irq.h> +#include <asm/pgtable.h> +#include <asm/desc.h> +#include <asm/apic.h> +#include <asm/setup.h> +#include <asm/i8259.h> +#include <asm/traps.h> +#include <asm/prom.h> + +/* + * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: + * (these are usually mapped to vectors 0x30-0x3f) + */ + +/* + * The IO-APIC gives us many more interrupt sources. Most of these + * are unused but an SMP system is supposed to have enough memory ... + * sometimes (mostly wrt. hw bugs) we get corrupted vectors all + * across the spectrum, so we really want to be prepared to get all + * of these. Plus, more powerful systems might have more than 64 + * IO-APIC registers. + * + * (these are usually mapped into the 0x30-0xff vector range) + */ + +/* + * IRQ2 is cascade interrupt to second interrupt controller + */ +static struct irqaction irq2 = { + .handler = no_action, + .name = "cascade", + .flags = IRQF_NO_THREAD, +}; + +DEFINE_PER_CPU(vector_irq_t, vector_irq) = { + [0 ... NR_VECTORS - 1] = VECTOR_UNDEFINED, +}; + +int vector_used_by_percpu_irq(unsigned int vector) +{ + int cpu; + + for_each_online_cpu(cpu) { + if (per_cpu(vector_irq, cpu)[vector] > VECTOR_UNDEFINED) + return 1; + } + + return 0; +} + +void __init init_ISA_irqs(void) +{ + struct irq_chip *chip = legacy_pic->chip; + const char *name = chip->name; + int i; + +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) + init_bsp_APIC(); +#endif + legacy_pic->init(0); + + for (i = 0; i < legacy_pic->nr_legacy_irqs; i++) + irq_set_chip_and_handler_name(i, chip, handle_level_irq, name); +} + +void __init init_IRQ(void) +{ + int i; + + /* + * We probably need a better place for this, but it works for + * now ... + */ + x86_add_irq_domains(); + + /* + * On cpu 0, Assign IRQ0_VECTOR..IRQ15_VECTOR's to IRQ 0..15. + * If these IRQ's are handled by legacy interrupt-controllers like PIC, + * then this configuration will likely be static after the boot. If + * these IRQ's are handled by more mordern controllers like IO-APIC, + * then this vector space can be freed and re-used dynamically as the + * irq's migrate etc. + */ + for (i = 0; i < legacy_pic->nr_legacy_irqs; i++) + per_cpu(vector_irq, 0)[IRQ0_VECTOR + i] = i; + + x86_init.irqs.intr_init(); +} + +/* + * Setup the vector to irq mappings. 
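+ *
+ * For the legacy irqs this is the fixed linear mapping noted at the
+ * top of this file: e.g. IRQ 0 -> IRQ0_VECTOR (vector 0x30), IRQ 1
+ * -> 0x31, ..., IRQ 15 -> 0x3f.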
+ */ +void setup_vector_irq(int cpu) +{ +#ifndef CONFIG_X86_IO_APIC + int irq; + + /* + * On most of the platforms, legacy PIC delivers the interrupts on the + * boot cpu. But there are certain platforms where PIC interrupts are + * delivered to multiple cpu's. If the legacy IRQ is handled by the + * legacy PIC, for the new cpu that is coming online, setup the static + * legacy vector to irq mapping: + */ + for (irq = 0; irq < legacy_pic->nr_legacy_irqs; irq++) + per_cpu(vector_irq, cpu)[IRQ0_VECTOR + irq] = irq; +#endif + + __setup_vector_irq(cpu); +} + +static void __init smp_intr_init(void) +{ +#ifdef CONFIG_SMP +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) + /* + * The reschedule interrupt is a CPU-to-CPU reschedule-helper + * IPI, driven by wakeup. + */ + alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); + + /* IPI for generic function call */ + alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); + + /* IPI for generic single function call */ + alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, + call_function_single_interrupt); + + /* Low priority IPI to cleanup after moving an irq */ + set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); + set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); + + /* IPI used for rebooting/stopping */ + alloc_intr_gate(REBOOT_VECTOR, reboot_interrupt); +#endif +#endif /* CONFIG_SMP */ +} + +static void __init apic_intr_init(void) +{ + smp_intr_init(); + +#ifdef CONFIG_X86_THERMAL_VECTOR + alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); +#endif +#ifdef CONFIG_X86_MCE_THRESHOLD + alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); +#endif + +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) + /* self generated IPI for local APIC timer */ + alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); + + /* IPI for X86 platform specific use */ + alloc_intr_gate(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi); +#ifdef CONFIG_HAVE_KVM + /* IPI for KVM to deliver posted interrupt */ + alloc_intr_gate(POSTED_INTR_VECTOR, kvm_posted_intr_ipi); +#endif + + /* IPI vectors for APIC spurious and error interrupts */ + alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); + alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); + + /* IRQ work interrupts: */ +# ifdef CONFIG_IRQ_WORK + alloc_intr_gate(IRQ_WORK_VECTOR, irq_work_interrupt); +# endif + +#endif +} + +void __init native_init_IRQ(void) +{ + int i; + + /* Execute any quirks before the call gates are initialised: */ + x86_init.irqs.pre_vector_init(); + + apic_intr_init(); + + /* + * Cover the whole vector space, no vector can escape + * us. (some of these will be overridden and become + * 'special' SMP interrupts) + */ + i = FIRST_EXTERNAL_VECTOR; + for_each_clear_bit_from(i, used_vectors, NR_VECTORS) { + /* IA32_SYSCALL_VECTOR could be used in trap_init already. 
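+		 * Vectors claimed that early are already set in used_vectors,
+		 * so the clear-bit walk here skips them instead of
+		 * overwriting their gates.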
*/ + set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]); + } + + if (!acpi_ioapic && !of_ioapic) + setup_irq(2, &irq2); + +#ifdef CONFIG_X86_32 + irq_ctx_init(smp_processor_id()); +#endif +} diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c new file mode 100644 index 00000000000..26d5a55a273 --- /dev/null +++ b/arch/x86/kernel/jump_label.c @@ -0,0 +1,144 @@ +/* + * jump label x86 support + * + * Copyright (C) 2009 Jason Baron <jbaron@redhat.com> + * + */ +#include <linux/jump_label.h> +#include <linux/memory.h> +#include <linux/uaccess.h> +#include <linux/module.h> +#include <linux/list.h> +#include <linux/jhash.h> +#include <linux/cpu.h> +#include <asm/kprobes.h> +#include <asm/alternative.h> + +#ifdef HAVE_JUMP_LABEL + +union jump_code_union { + char code[JUMP_LABEL_NOP_SIZE]; + struct { + char jump; + int offset; + } __attribute__((packed)); +}; + +static void bug_at(unsigned char *ip, int line) +{ + /* + * The location is not an op that we were expecting. + * Something went wrong. Crash the box, as something could be + * corrupting the kernel. + */ + pr_warning("Unexpected op at %pS [%p] (%02x %02x %02x %02x %02x) %s:%d\n", + ip, ip, ip[0], ip[1], ip[2], ip[3], ip[4], __FILE__, line); + BUG(); +} + +static void __jump_label_transform(struct jump_entry *entry, + enum jump_label_type type, + void *(*poker)(void *, const void *, size_t), + int init) +{ + union jump_code_union code; + const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP }; + const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5]; + + if (type == JUMP_LABEL_ENABLE) { + if (init) { + /* + * Jump label is enabled for the first time. + * So we expect a default_nop... + */ + if (unlikely(memcmp((void *)entry->code, default_nop, 5) + != 0)) + bug_at((void *)entry->code, __LINE__); + } else { + /* + * ...otherwise expect an ideal_nop. Otherwise + * something went horribly wrong. + */ + if (unlikely(memcmp((void *)entry->code, ideal_nop, 5) + != 0)) + bug_at((void *)entry->code, __LINE__); + } + + code.jump = 0xe9; + code.offset = entry->target - + (entry->code + JUMP_LABEL_NOP_SIZE); + } else { + /* + * We are disabling this jump label. If it is not what + * we think it is, then something must have gone wrong. + * If this is the first initialization call, then we + * are converting the default nop to the ideal nop. + */ + if (init) { + if (unlikely(memcmp((void *)entry->code, default_nop, 5) != 0)) + bug_at((void *)entry->code, __LINE__); + } else { + code.jump = 0xe9; + code.offset = entry->target - + (entry->code + JUMP_LABEL_NOP_SIZE); + if (unlikely(memcmp((void *)entry->code, &code, 5) != 0)) + bug_at((void *)entry->code, __LINE__); + } + memcpy(&code, ideal_nops[NOP_ATOMIC5], JUMP_LABEL_NOP_SIZE); + } + + /* + * Make text_poke_bp() a default fallback poker. 
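+	 *
+	 * (A NULL poker means we are patching live code, so the
+	 * int3-breakpoint based text_poke_bp() is used; boot-time and
+	 * module-load callers pass text_poke_early() instead, see
+	 * arch_jump_label_transform_static() below.)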
+ * + * At the time the change is being done, just ignore whether we + * are doing nop -> jump or jump -> nop transition, and assume + * always nop being the 'currently valid' instruction + * + */ + if (poker) + (*poker)((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE); + else + text_poke_bp((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE, + (void *)entry->code + JUMP_LABEL_NOP_SIZE); +} + +void arch_jump_label_transform(struct jump_entry *entry, + enum jump_label_type type) +{ + get_online_cpus(); + mutex_lock(&text_mutex); + __jump_label_transform(entry, type, NULL, 0); + mutex_unlock(&text_mutex); + put_online_cpus(); +} + +static enum { + JL_STATE_START, + JL_STATE_NO_UPDATE, + JL_STATE_UPDATE, +} jlstate __initdata_or_module = JL_STATE_START; + +__init_or_module void arch_jump_label_transform_static(struct jump_entry *entry, + enum jump_label_type type) +{ + /* + * This function is called at boot up and when modules are + * first loaded. Check if the default nop, the one that is + * inserted at compile time, is the ideal nop. If it is, then + * we do not need to update the nop, and we can leave it as is. + * If it is not, then we need to update the nop to the ideal nop. + */ + if (jlstate == JL_STATE_START) { + const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP }; + const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5]; + + if (memcmp(ideal_nop, default_nop, 5) != 0) + jlstate = JL_STATE_UPDATE; + else + jlstate = JL_STATE_NO_UPDATE; + } + if (jlstate == JL_STATE_UPDATE) + __jump_label_transform(entry, type, text_poke_early, 1); +} + +#endif diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c new file mode 100644 index 00000000000..dc1404bf8e4 --- /dev/null +++ b/arch/x86/kernel/kdebugfs.c @@ -0,0 +1,214 @@ +/* + * Architecture specific debugfs files + * + * Copyright (C) 2007, Intel Corp. + * Huang Ying <ying.huang@intel.com> + * + * This file is released under the GPLv2. 
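+ *
+ * The entries created below live in debugfs, conventionally mounted
+ * at /sys/kernel/debug; e.g. the boot parameters appear as
+ * /sys/kernel/debug/boot_params/{version,data}, plus one numbered
+ * subdirectory per setup_data node.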
+ */ +#include <linux/debugfs.h> +#include <linux/uaccess.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/init.h> +#include <linux/stat.h> +#include <linux/io.h> +#include <linux/mm.h> + +#include <asm/setup.h> + +struct dentry *arch_debugfs_dir; +EXPORT_SYMBOL(arch_debugfs_dir); + +#ifdef CONFIG_DEBUG_BOOT_PARAMS +struct setup_data_node { + u64 paddr; + u32 type; + u32 len; +}; + +static ssize_t setup_data_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct setup_data_node *node = file->private_data; + unsigned long remain; + loff_t pos = *ppos; + struct page *pg; + void *p; + u64 pa; + + if (pos < 0) + return -EINVAL; + + if (pos >= node->len) + return 0; + + if (count > node->len - pos) + count = node->len - pos; + + pa = node->paddr + sizeof(struct setup_data) + pos; + pg = pfn_to_page((pa + count - 1) >> PAGE_SHIFT); + if (PageHighMem(pg)) { + p = ioremap_cache(pa, count); + if (!p) + return -ENXIO; + } else + p = __va(pa); + + remain = copy_to_user(user_buf, p, count); + + if (PageHighMem(pg)) + iounmap(p); + + if (remain) + return -EFAULT; + + *ppos = pos + count; + + return count; +} + +static const struct file_operations fops_setup_data = { + .read = setup_data_read, + .open = simple_open, + .llseek = default_llseek, +}; + +static int __init +create_setup_data_node(struct dentry *parent, int no, + struct setup_data_node *node) +{ + struct dentry *d, *type, *data; + char buf[16]; + + sprintf(buf, "%d", no); + d = debugfs_create_dir(buf, parent); + if (!d) + return -ENOMEM; + + type = debugfs_create_x32("type", S_IRUGO, d, &node->type); + if (!type) + goto err_dir; + + data = debugfs_create_file("data", S_IRUGO, d, node, &fops_setup_data); + if (!data) + goto err_type; + + return 0; + +err_type: + debugfs_remove(type); +err_dir: + debugfs_remove(d); + return -ENOMEM; +} + +static int __init create_setup_data_nodes(struct dentry *parent) +{ + struct setup_data_node *node; + struct setup_data *data; + int error; + struct dentry *d; + struct page *pg; + u64 pa_data; + int no = 0; + + d = debugfs_create_dir("setup_data", parent); + if (!d) + return -ENOMEM; + + pa_data = boot_params.hdr.setup_data; + + while (pa_data) { + node = kmalloc(sizeof(*node), GFP_KERNEL); + if (!node) { + error = -ENOMEM; + goto err_dir; + } + + pg = pfn_to_page((pa_data+sizeof(*data)-1) >> PAGE_SHIFT); + if (PageHighMem(pg)) { + data = ioremap_cache(pa_data, sizeof(*data)); + if (!data) { + kfree(node); + error = -ENXIO; + goto err_dir; + } + } else + data = __va(pa_data); + + node->paddr = pa_data; + node->type = data->type; + node->len = data->len; + error = create_setup_data_node(d, no, node); + pa_data = data->next; + + if (PageHighMem(pg)) + iounmap(data); + if (error) + goto err_dir; + no++; + } + + return 0; + +err_dir: + debugfs_remove(d); + return error; +} + +static struct debugfs_blob_wrapper boot_params_blob = { + .data = &boot_params, + .size = sizeof(boot_params), +}; + +static int __init boot_params_kdebugfs_init(void) +{ + struct dentry *dbp, *version, *data; + int error = -ENOMEM; + + dbp = debugfs_create_dir("boot_params", NULL); + if (!dbp) + return -ENOMEM; + + version = debugfs_create_x16("version", S_IRUGO, dbp, + &boot_params.hdr.version); + if (!version) + goto err_dir; + + data = debugfs_create_blob("data", S_IRUGO, dbp, + &boot_params_blob); + if (!data) + goto err_version; + + error = create_setup_data_nodes(dbp); + if (error) + goto err_data; + + return 0; + +err_data: + debugfs_remove(data); +err_version: + 
debugfs_remove(version); +err_dir: + debugfs_remove(dbp); + return error; +} +#endif /* CONFIG_DEBUG_BOOT_PARAMS */ + +static int __init arch_kdebugfs_init(void) +{ + int error = 0; + + arch_debugfs_dir = debugfs_create_dir("x86", NULL); + if (!arch_debugfs_dir) + return -ENOMEM; + +#ifdef CONFIG_DEBUG_BOOT_PARAMS + error = boot_params_kdebugfs_init(); +#endif + + return error; +} +arch_initcall(arch_kdebugfs_init); diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c new file mode 100644 index 00000000000..7ec1d5f8d28 --- /dev/null +++ b/arch/x86/kernel/kgdb.c @@ -0,0 +1,814 @@ +/* + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + */ + +/* + * Copyright (C) 2004 Amit S. Kale <amitkale@linsyssoft.com> + * Copyright (C) 2000-2001 VERITAS Software Corporation. + * Copyright (C) 2002 Andi Kleen, SuSE Labs + * Copyright (C) 2004 LinSysSoft Technologies Pvt. Ltd. + * Copyright (C) 2007 MontaVista Software, Inc. + * Copyright (C) 2007-2008 Jason Wessel, Wind River Systems, Inc. + */ +/**************************************************************************** + * Contributor: Lake Stevens Instrument Division$ + * Written by: Glenn Engel $ + * Updated by: Amit Kale<akale@veritas.com> + * Updated by: Tom Rini <trini@kernel.crashing.org> + * Updated by: Jason Wessel <jason.wessel@windriver.com> + * Modified for 386 by Jim Kingdon, Cygnus Support. 
+ * Original kgdb, compatibility with 2.1.xx kernel by
+ * David Grothe <dave@gcom.com>
+ * Integrated into 2.2.5 kernel by Tigran Aivazian <tigran@sco.com>
+ * X86_64 changes from Andi Kleen's patch merged by Jim Houston
+ */
+#include <linux/spinlock.h>
+#include <linux/kdebug.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/ptrace.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/kgdb.h>
+#include <linux/smp.h>
+#include <linux/nmi.h>
+#include <linux/hw_breakpoint.h>
+#include <linux/uaccess.h>
+#include <linux/memory.h>
+
+#include <asm/debugreg.h>
+#include <asm/apicdef.h>
+#include <asm/apic.h>
+#include <asm/nmi.h>
+
+struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] =
+{
+#ifdef CONFIG_X86_32
+	{ "ax", 4, offsetof(struct pt_regs, ax) },
+	{ "cx", 4, offsetof(struct pt_regs, cx) },
+	{ "dx", 4, offsetof(struct pt_regs, dx) },
+	{ "bx", 4, offsetof(struct pt_regs, bx) },
+	{ "sp", 4, offsetof(struct pt_regs, sp) },
+	{ "bp", 4, offsetof(struct pt_regs, bp) },
+	{ "si", 4, offsetof(struct pt_regs, si) },
+	{ "di", 4, offsetof(struct pt_regs, di) },
+	{ "ip", 4, offsetof(struct pt_regs, ip) },
+	{ "flags", 4, offsetof(struct pt_regs, flags) },
+	{ "cs", 4, offsetof(struct pt_regs, cs) },
+	{ "ss", 4, offsetof(struct pt_regs, ss) },
+	{ "ds", 4, offsetof(struct pt_regs, ds) },
+	{ "es", 4, offsetof(struct pt_regs, es) },
+#else
+	{ "ax", 8, offsetof(struct pt_regs, ax) },
+	{ "bx", 8, offsetof(struct pt_regs, bx) },
+	{ "cx", 8, offsetof(struct pt_regs, cx) },
+	{ "dx", 8, offsetof(struct pt_regs, dx) },
+	{ "si", 8, offsetof(struct pt_regs, si) },
+	{ "di", 8, offsetof(struct pt_regs, di) },
+	{ "bp", 8, offsetof(struct pt_regs, bp) },
+	{ "sp", 8, offsetof(struct pt_regs, sp) },
+	{ "r8", 8, offsetof(struct pt_regs, r8) },
+	{ "r9", 8, offsetof(struct pt_regs, r9) },
+	{ "r10", 8, offsetof(struct pt_regs, r10) },
+	{ "r11", 8, offsetof(struct pt_regs, r11) },
+	{ "r12", 8, offsetof(struct pt_regs, r12) },
+	{ "r13", 8, offsetof(struct pt_regs, r13) },
+	{ "r14", 8, offsetof(struct pt_regs, r14) },
+	{ "r15", 8, offsetof(struct pt_regs, r15) },
+	{ "ip", 8, offsetof(struct pt_regs, ip) },
+	{ "flags", 4, offsetof(struct pt_regs, flags) },
+	{ "cs", 4, offsetof(struct pt_regs, cs) },
+	{ "ss", 4, offsetof(struct pt_regs, ss) },
+	{ "ds", 4, -1 },
+	{ "es", 4, -1 },
+#endif
+	{ "fs", 4, -1 },
+	{ "gs", 4, -1 },
+};
+
+int dbg_set_reg(int regno, void *mem, struct pt_regs *regs)
+{
+	if (
+#ifdef CONFIG_X86_32
+	    regno == GDB_SS || regno == GDB_FS || regno == GDB_GS ||
+#endif
+	    regno == GDB_SP || regno == GDB_ORIG_AX)
+		return 0;
+
+	if (dbg_reg_def[regno].offset != -1)
+		memcpy((void *)regs + dbg_reg_def[regno].offset, mem,
+		       dbg_reg_def[regno].size);
+	return 0;
+}
+
+char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs)
+{
+	if (regno == GDB_ORIG_AX) {
+		memcpy(mem, &regs->orig_ax, sizeof(regs->orig_ax));
+		return "orig_ax";
+	}
+	if (regno >= DBG_MAX_REG_NUM || regno < 0)
+		return NULL;
+
+	if (dbg_reg_def[regno].offset != -1)
+		memcpy(mem, (void *)regs + dbg_reg_def[regno].offset,
+		       dbg_reg_def[regno].size);
+
+#ifdef CONFIG_X86_32
+	switch (regno) {
+	case GDB_SS:
+		if (!user_mode_vm(regs))
+			*(unsigned long *)mem = __KERNEL_DS;
+		break;
+	case GDB_SP:
+		if (!user_mode_vm(regs))
+			*(unsigned long *)mem = kernel_stack_pointer(regs);
+		break;
+	case GDB_GS:
+	case GDB_FS:
+		*(unsigned long *)mem = 0xFFFF;
+		break;
+	}
+#endif
+	return dbg_reg_def[regno].name;
+}
+
+/**
+ * sleeping_thread_to_gdb_regs - Convert ptrace regs to GDB regs
+ * @gdb_regs: A pointer to hold the registers in the order GDB wants. + * @p: The &struct task_struct of the desired process. + * + * Convert the register values of the sleeping process in @p to + * the format that GDB expects. + * This function is called when kgdb does not have access to the + * &struct pt_regs and therefore it should fill the gdb registers + * @gdb_regs with what has been saved in &struct thread_struct + * thread field during switch_to. + */ +void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) +{ +#ifndef CONFIG_X86_32 + u32 *gdb_regs32 = (u32 *)gdb_regs; +#endif + gdb_regs[GDB_AX] = 0; + gdb_regs[GDB_BX] = 0; + gdb_regs[GDB_CX] = 0; + gdb_regs[GDB_DX] = 0; + gdb_regs[GDB_SI] = 0; + gdb_regs[GDB_DI] = 0; + gdb_regs[GDB_BP] = *(unsigned long *)p->thread.sp; +#ifdef CONFIG_X86_32 + gdb_regs[GDB_DS] = __KERNEL_DS; + gdb_regs[GDB_ES] = __KERNEL_DS; + gdb_regs[GDB_PS] = 0; + gdb_regs[GDB_CS] = __KERNEL_CS; + gdb_regs[GDB_PC] = p->thread.ip; + gdb_regs[GDB_SS] = __KERNEL_DS; + gdb_regs[GDB_FS] = 0xFFFF; + gdb_regs[GDB_GS] = 0xFFFF; +#else + gdb_regs32[GDB_PS] = *(unsigned long *)(p->thread.sp + 8); + gdb_regs32[GDB_CS] = __KERNEL_CS; + gdb_regs32[GDB_SS] = __KERNEL_DS; + gdb_regs[GDB_PC] = 0; + gdb_regs[GDB_R8] = 0; + gdb_regs[GDB_R9] = 0; + gdb_regs[GDB_R10] = 0; + gdb_regs[GDB_R11] = 0; + gdb_regs[GDB_R12] = 0; + gdb_regs[GDB_R13] = 0; + gdb_regs[GDB_R14] = 0; + gdb_regs[GDB_R15] = 0; +#endif + gdb_regs[GDB_SP] = p->thread.sp; +} + +static struct hw_breakpoint { + unsigned enabled; + unsigned long addr; + int len; + int type; + struct perf_event * __percpu *pev; +} breakinfo[HBP_NUM]; + +static unsigned long early_dr7; + +static void kgdb_correct_hw_break(void) +{ + int breakno; + + for (breakno = 0; breakno < HBP_NUM; breakno++) { + struct perf_event *bp; + struct arch_hw_breakpoint *info; + int val; + int cpu = raw_smp_processor_id(); + if (!breakinfo[breakno].enabled) + continue; + if (dbg_is_early) { + set_debugreg(breakinfo[breakno].addr, breakno); + early_dr7 |= encode_dr7(breakno, + breakinfo[breakno].len, + breakinfo[breakno].type); + set_debugreg(early_dr7, 7); + continue; + } + bp = *per_cpu_ptr(breakinfo[breakno].pev, cpu); + info = counter_arch_bp(bp); + if (bp->attr.disabled != 1) + continue; + bp->attr.bp_addr = breakinfo[breakno].addr; + bp->attr.bp_len = breakinfo[breakno].len; + bp->attr.bp_type = breakinfo[breakno].type; + info->address = breakinfo[breakno].addr; + info->len = breakinfo[breakno].len; + info->type = breakinfo[breakno].type; + val = arch_install_hw_breakpoint(bp); + if (!val) + bp->attr.disabled = 0; + } + if (!dbg_is_early) + hw_breakpoint_restore(); +} + +static int hw_break_reserve_slot(int breakno) +{ + int cpu; + int cnt = 0; + struct perf_event **pevent; + + if (dbg_is_early) + return 0; + + for_each_online_cpu(cpu) { + cnt++; + pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu); + if (dbg_reserve_bp_slot(*pevent)) + goto fail; + } + + return 0; + +fail: + for_each_online_cpu(cpu) { + cnt--; + if (!cnt) + break; + pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu); + dbg_release_bp_slot(*pevent); + } + return -1; +} + +static int hw_break_release_slot(int breakno) +{ + struct perf_event **pevent; + int cpu; + + if (dbg_is_early) + return 0; + + for_each_online_cpu(cpu) { + pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu); + if (dbg_release_bp_slot(*pevent)) + /* + * The debugger is responsible for handing the retry on + * remove failure. 
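+			 * (So we only report the failure here; slots already
+			 * released on other cpus stay released.)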
+ */ + return -1; + } + return 0; +} + +static int +kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) +{ + int i; + + for (i = 0; i < HBP_NUM; i++) + if (breakinfo[i].addr == addr && breakinfo[i].enabled) + break; + if (i == HBP_NUM) + return -1; + + if (hw_break_release_slot(i)) { + printk(KERN_ERR "Cannot remove hw breakpoint at %lx\n", addr); + return -1; + } + breakinfo[i].enabled = 0; + + return 0; +} + +static void kgdb_remove_all_hw_break(void) +{ + int i; + int cpu = raw_smp_processor_id(); + struct perf_event *bp; + + for (i = 0; i < HBP_NUM; i++) { + if (!breakinfo[i].enabled) + continue; + bp = *per_cpu_ptr(breakinfo[i].pev, cpu); + if (!bp->attr.disabled) { + arch_uninstall_hw_breakpoint(bp); + bp->attr.disabled = 1; + continue; + } + if (dbg_is_early) + early_dr7 &= ~encode_dr7(i, breakinfo[i].len, + breakinfo[i].type); + else if (hw_break_release_slot(i)) + printk(KERN_ERR "KGDB: hw bpt remove failed %lx\n", + breakinfo[i].addr); + breakinfo[i].enabled = 0; + } +} + +static int +kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) +{ + int i; + + for (i = 0; i < HBP_NUM; i++) + if (!breakinfo[i].enabled) + break; + if (i == HBP_NUM) + return -1; + + switch (bptype) { + case BP_HARDWARE_BREAKPOINT: + len = 1; + breakinfo[i].type = X86_BREAKPOINT_EXECUTE; + break; + case BP_WRITE_WATCHPOINT: + breakinfo[i].type = X86_BREAKPOINT_WRITE; + break; + case BP_ACCESS_WATCHPOINT: + breakinfo[i].type = X86_BREAKPOINT_RW; + break; + default: + return -1; + } + switch (len) { + case 1: + breakinfo[i].len = X86_BREAKPOINT_LEN_1; + break; + case 2: + breakinfo[i].len = X86_BREAKPOINT_LEN_2; + break; + case 4: + breakinfo[i].len = X86_BREAKPOINT_LEN_4; + break; +#ifdef CONFIG_X86_64 + case 8: + breakinfo[i].len = X86_BREAKPOINT_LEN_8; + break; +#endif + default: + return -1; + } + breakinfo[i].addr = addr; + if (hw_break_reserve_slot(i)) { + breakinfo[i].addr = 0; + return -1; + } + breakinfo[i].enabled = 1; + + return 0; +} + +/** + * kgdb_disable_hw_debug - Disable hardware debugging while we in kgdb. + * @regs: Current &struct pt_regs. + * + * This function will be called if the particular architecture must + * disable hardware debugging while it is processing gdb packets or + * handling exception. + */ +static void kgdb_disable_hw_debug(struct pt_regs *regs) +{ + int i; + int cpu = raw_smp_processor_id(); + struct perf_event *bp; + + /* Disable hardware debugging while we are in kgdb: */ + set_debugreg(0UL, 7); + for (i = 0; i < HBP_NUM; i++) { + if (!breakinfo[i].enabled) + continue; + if (dbg_is_early) { + early_dr7 &= ~encode_dr7(i, breakinfo[i].len, + breakinfo[i].type); + continue; + } + bp = *per_cpu_ptr(breakinfo[i].pev, cpu); + if (bp->attr.disabled == 1) + continue; + arch_uninstall_hw_breakpoint(bp); + bp->attr.disabled = 1; + } +} + +#ifdef CONFIG_SMP +/** + * kgdb_roundup_cpus - Get other CPUs into a holding pattern + * @flags: Current IRQ state + * + * On SMP systems, we need to get the attention of the other CPUs + * and get them be in a known state. This should do what is needed + * to get the other CPUs to call kgdb_wait(). Note that on some arches, + * the NMI approach is not used for rounding up all the CPUs. For example, + * in case of MIPS, smp_call_function() is used to roundup CPUs. In + * this case, we have to make sure that interrupts are enabled before + * calling smp_call_function(). The argument to this function is + * the flags that will be used when restoring the interrupts. 
There is + * local_irq_save() call before kgdb_roundup_cpus(). + * + * On non-SMP systems, this is not called. + */ +void kgdb_roundup_cpus(unsigned long flags) +{ + apic->send_IPI_allbutself(APIC_DM_NMI); +} +#endif + +/** + * kgdb_arch_handle_exception - Handle architecture specific GDB packets. + * @e_vector: The error vector of the exception that happened. + * @signo: The signal number of the exception that happened. + * @err_code: The error code of the exception that happened. + * @remcomInBuffer: The buffer of the packet we have read. + * @remcomOutBuffer: The buffer of %BUFMAX bytes to write a packet into. + * @linux_regs: The &struct pt_regs of the current process. + * + * This function MUST handle the 'c' and 's' command packets, + * as well packets to set / remove a hardware breakpoint, if used. + * If there are additional packets which the hardware needs to handle, + * they are handled here. The code should return -1 if it wants to + * process more packets, and a %0 or %1 if it wants to exit from the + * kgdb callback. + */ +int kgdb_arch_handle_exception(int e_vector, int signo, int err_code, + char *remcomInBuffer, char *remcomOutBuffer, + struct pt_regs *linux_regs) +{ + unsigned long addr; + char *ptr; + + switch (remcomInBuffer[0]) { + case 'c': + case 's': + /* try to read optional parameter, pc unchanged if no parm */ + ptr = &remcomInBuffer[1]; + if (kgdb_hex2long(&ptr, &addr)) + linux_regs->ip = addr; + case 'D': + case 'k': + /* clear the trace bit */ + linux_regs->flags &= ~X86_EFLAGS_TF; + atomic_set(&kgdb_cpu_doing_single_step, -1); + + /* set the trace bit if we're stepping */ + if (remcomInBuffer[0] == 's') { + linux_regs->flags |= X86_EFLAGS_TF; + atomic_set(&kgdb_cpu_doing_single_step, + raw_smp_processor_id()); + } + + return 0; + } + + /* this means that we do not want to exit from the handler: */ + return -1; +} + +static inline int +single_step_cont(struct pt_regs *regs, struct die_args *args) +{ + /* + * Single step exception from kernel space to user space so + * eat the exception and continue the process: + */ + printk(KERN_ERR "KGDB: trap/step from kernel to user space, " + "resuming...\n"); + kgdb_arch_handle_exception(args->trapnr, args->signr, + args->err, "c", "", regs); + /* + * Reset the BS bit in dr6 (pointed by args->err) to + * denote completion of processing + */ + (*(unsigned long *)ERR_PTR(args->err)) &= ~DR_STEP; + + return NOTIFY_STOP; +} + +static int was_in_debug_nmi[NR_CPUS]; + +static int kgdb_nmi_handler(unsigned int cmd, struct pt_regs *regs) +{ + switch (cmd) { + case NMI_LOCAL: + if (atomic_read(&kgdb_active) != -1) { + /* KGDB CPU roundup */ + kgdb_nmicallback(raw_smp_processor_id(), regs); + was_in_debug_nmi[raw_smp_processor_id()] = 1; + touch_nmi_watchdog(); + return NMI_HANDLED; + } + break; + + case NMI_UNKNOWN: + if (was_in_debug_nmi[raw_smp_processor_id()]) { + was_in_debug_nmi[raw_smp_processor_id()] = 0; + return NMI_HANDLED; + } + break; + default: + /* do nothing */ + break; + } + return NMI_DONE; +} + +static int __kgdb_notify(struct die_args *args, unsigned long cmd) +{ + struct pt_regs *regs = args->regs; + + switch (cmd) { + case DIE_DEBUG: + if (atomic_read(&kgdb_cpu_doing_single_step) != -1) { + if (user_mode(regs)) + return single_step_cont(regs, args); + break; + } else if (test_thread_flag(TIF_SINGLESTEP)) + /* This means a user thread is single stepping + * a system call which should be ignored + */ + return NOTIFY_DONE; + /* fall through */ + default: + if (user_mode(regs)) + return NOTIFY_DONE; + } + + if 
(kgdb_handle_exception(args->trapnr, args->signr, cmd, regs))
+		return NOTIFY_DONE;
+
+	/* Must touch the watchdog before returning to normal operation */
+	touch_nmi_watchdog();
+	return NOTIFY_STOP;
+}
+
+int kgdb_ll_trap(int cmd, const char *str,
+		 struct pt_regs *regs, long err, int trap, int sig)
+{
+	struct die_args args = {
+		.regs	= regs,
+		.str	= str,
+		.err	= err,
+		.trapnr	= trap,
+		.signr	= sig,
+	};
+
+	if (!kgdb_io_module_registered)
+		return NOTIFY_DONE;
+
+	return __kgdb_notify(&args, cmd);
+}
+
+static int
+kgdb_notify(struct notifier_block *self, unsigned long cmd, void *ptr)
+{
+	unsigned long flags;
+	int ret;
+
+	local_irq_save(flags);
+	ret = __kgdb_notify(ptr, cmd);
+	local_irq_restore(flags);
+
+	return ret;
+}
+
+static struct notifier_block kgdb_notifier = {
+	.notifier_call	= kgdb_notify,
+};
+
+/**
+ * kgdb_arch_init - Perform any architecture specific initialization.
+ *
+ * This function will handle the initialization of any architecture
+ * specific callbacks.
+ */
+int kgdb_arch_init(void)
+{
+	int retval;
+
+	retval = register_die_notifier(&kgdb_notifier);
+	if (retval)
+		goto out;
+
+	retval = register_nmi_handler(NMI_LOCAL, kgdb_nmi_handler,
+				      0, "kgdb");
+	if (retval)
+		goto out1;
+
+	retval = register_nmi_handler(NMI_UNKNOWN, kgdb_nmi_handler,
+				      0, "kgdb");
+	if (retval)
+		goto out2;
+
+	return retval;
+
+out2:
+	unregister_nmi_handler(NMI_LOCAL, "kgdb");
+out1:
+	unregister_die_notifier(&kgdb_notifier);
+out:
+	return retval;
+}
+
+static void kgdb_hw_overflow_handler(struct perf_event *event,
+		struct perf_sample_data *data, struct pt_regs *regs)
+{
+	struct task_struct *tsk = current;
+	int i;
+
+	for (i = 0; i < HBP_NUM; i++)
+		if (breakinfo[i].enabled)
+			tsk->thread.debugreg6 |= (DR_TRAP0 << i);
+}
+
+void kgdb_arch_late(void)
+{
+	int i, cpu;
+	struct perf_event_attr attr;
+	struct perf_event **pevent;
+
+	/*
+	 * Pre-allocate the hw breakpoint structures in the non-atomic
+	 * portion of kgdb because this operation requires mutexes to
+	 * complete.
+	 */
+	hw_breakpoint_init(&attr);
+	attr.bp_addr = (unsigned long)kgdb_arch_init;
+	attr.bp_len = HW_BREAKPOINT_LEN_1;
+	attr.bp_type = HW_BREAKPOINT_W;
+	attr.disabled = 1;
+	for (i = 0; i < HBP_NUM; i++) {
+		if (breakinfo[i].pev)
+			continue;
+		breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL, NULL);
+		if (IS_ERR((void * __force)breakinfo[i].pev)) {
+			printk(KERN_ERR "kgdb: Could not allocate hw "
+			       "breakpoints\nDisabling the kernel debugger\n");
+			breakinfo[i].pev = NULL;
+			kgdb_arch_exit();
+			return;
+		}
+		for_each_online_cpu(cpu) {
+			pevent = per_cpu_ptr(breakinfo[i].pev, cpu);
+			pevent[0]->hw.sample_period = 1;
+			pevent[0]->overflow_handler = kgdb_hw_overflow_handler;
+			if (pevent[0]->destroy != NULL) {
+				pevent[0]->destroy = NULL;
+				release_bp_slot(*pevent);
+			}
+		}
+	}
+}
+
+/**
+ * kgdb_arch_exit - Perform any architecture specific uninitialization.
+ *
+ * This function will handle the uninitialization of any architecture
+ * specific callbacks, for dynamic registration and unregistration.
+ */
+void kgdb_arch_exit(void)
+{
+	int i;
+
+	for (i = 0; i < HBP_NUM; i++) {
+		if (breakinfo[i].pev) {
+			unregister_wide_hw_breakpoint(breakinfo[i].pev);
+			breakinfo[i].pev = NULL;
+		}
+	}
+	unregister_nmi_handler(NMI_UNKNOWN, "kgdb");
+	unregister_nmi_handler(NMI_LOCAL, "kgdb");
+	unregister_die_notifier(&kgdb_notifier);
+}
+
+/**
+ * kgdb_skipexception - Bail out of KGDB when we've been triggered.
+ * @exception: Exception vector number
+ * @regs: Current &struct pt_regs.
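kgdb_arch_late() above is one consumer of the wide hw-breakpoint API. The same calls work from a standalone module; a minimal sketch modelled on samples/hw_breakpoint/data_breakpoint.c (the watched symbol and messages are illustrative, error handling trimmed):

#include <linux/module.h>
#include <linux/kallsyms.h>
#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>

static struct perf_event * __percpu *wp;

/* Same signature as kgdb_hw_overflow_handler() above. */
static void wp_handler(struct perf_event *bp, struct perf_sample_data *data,
		       struct pt_regs *regs)
{
	pr_info("watched symbol was written\n");
	dump_stack();
}

static int __init wp_init(void)
{
	struct perf_event_attr attr;

	hw_breakpoint_init(&attr);
	attr.bp_addr = kallsyms_lookup_name("jiffies"); /* illustrative */
	attr.bp_len  = HW_BREAKPOINT_LEN_4;
	attr.bp_type = HW_BREAKPOINT_W;

	wp = register_wide_hw_breakpoint(&attr, wp_handler, NULL);
	if (IS_ERR((void * __force)wp))
		return PTR_ERR((void * __force)wp);
	return 0;
}

static void __exit wp_exit(void)
{
	unregister_wide_hw_breakpoint(wp);
}

module_init(wp_init);
module_exit(wp_exit);
MODULE_LICENSE("GPL");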
+ * + * On some architectures we need to skip a breakpoint exception when + * it occurs after a breakpoint has been removed. + * + * Skip an int3 exception when it occurs after a breakpoint has been + * removed. Backtrack eip by 1 since the int3 would have caused it to + * increment by 1. + */ +int kgdb_skipexception(int exception, struct pt_regs *regs) +{ + if (exception == 3 && kgdb_isremovedbreak(regs->ip - 1)) { + regs->ip -= 1; + return 1; + } + return 0; +} + +unsigned long kgdb_arch_pc(int exception, struct pt_regs *regs) +{ + if (exception == 3) + return instruction_pointer(regs) - 1; + return instruction_pointer(regs); +} + +void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long ip) +{ + regs->ip = ip; +} + +int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) +{ + int err; +#ifdef CONFIG_DEBUG_RODATA + char opc[BREAK_INSTR_SIZE]; +#endif /* CONFIG_DEBUG_RODATA */ + + bpt->type = BP_BREAKPOINT; + err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr, + BREAK_INSTR_SIZE); + if (err) + return err; + err = probe_kernel_write((char *)bpt->bpt_addr, + arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE); +#ifdef CONFIG_DEBUG_RODATA + if (!err) + return err; + /* + * It is safe to call text_poke() because normal kernel execution + * is stopped on all cores, so long as the text_mutex is not locked. + */ + if (mutex_is_locked(&text_mutex)) + return -EBUSY; + text_poke((void *)bpt->bpt_addr, arch_kgdb_ops.gdb_bpt_instr, + BREAK_INSTR_SIZE); + err = probe_kernel_read(opc, (char *)bpt->bpt_addr, BREAK_INSTR_SIZE); + if (err) + return err; + if (memcmp(opc, arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE)) + return -EINVAL; + bpt->type = BP_POKE_BREAKPOINT; +#endif /* CONFIG_DEBUG_RODATA */ + return err; +} + +int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt) +{ +#ifdef CONFIG_DEBUG_RODATA + int err; + char opc[BREAK_INSTR_SIZE]; + + if (bpt->type != BP_POKE_BREAKPOINT) + goto knl_write; + /* + * It is safe to call text_poke() because normal kernel execution + * is stopped on all cores, so long as the text_mutex is not locked. 
+ */ + if (mutex_is_locked(&text_mutex)) + goto knl_write; + text_poke((void *)bpt->bpt_addr, bpt->saved_instr, BREAK_INSTR_SIZE); + err = probe_kernel_read(opc, (char *)bpt->bpt_addr, BREAK_INSTR_SIZE); + if (err || memcmp(opc, bpt->saved_instr, BREAK_INSTR_SIZE)) + goto knl_write; + return err; +knl_write: +#endif /* CONFIG_DEBUG_RODATA */ + return probe_kernel_write((char *)bpt->bpt_addr, + (char *)bpt->saved_instr, BREAK_INSTR_SIZE); +} + +struct kgdb_arch arch_kgdb_ops = { + /* Breakpoint instruction: */ + .gdb_bpt_instr = { 0xcc }, + .flags = KGDB_HW_BREAKPOINT, + .set_hw_breakpoint = kgdb_set_hw_break, + .remove_hw_breakpoint = kgdb_remove_hw_break, + .disable_hw_break = kgdb_disable_hw_debug, + .remove_all_hw_break = kgdb_remove_all_hw_break, + .correct_hw_break = kgdb_correct_hw_break, +}; diff --git a/arch/x86/kernel/kprobes/Makefile b/arch/x86/kernel/kprobes/Makefile new file mode 100644 index 00000000000..0d33169cc1a --- /dev/null +++ b/arch/x86/kernel/kprobes/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for kernel probes +# + +obj-$(CONFIG_KPROBES) += core.o +obj-$(CONFIG_OPTPROBES) += opt.o +obj-$(CONFIG_KPROBES_ON_FTRACE) += ftrace.o diff --git a/arch/x86/kernel/kprobes/common.h b/arch/x86/kernel/kprobes/common.h new file mode 100644 index 00000000000..c6ee63f927a --- /dev/null +++ b/arch/x86/kernel/kprobes/common.h @@ -0,0 +1,108 @@ +#ifndef __X86_KERNEL_KPROBES_COMMON_H +#define __X86_KERNEL_KPROBES_COMMON_H + +/* Kprobes and Optprobes common header */ + +#ifdef CONFIG_X86_64 +#define SAVE_REGS_STRING \ + /* Skip cs, ip, orig_ax. */ \ + " subq $24, %rsp\n" \ + " pushq %rdi\n" \ + " pushq %rsi\n" \ + " pushq %rdx\n" \ + " pushq %rcx\n" \ + " pushq %rax\n" \ + " pushq %r8\n" \ + " pushq %r9\n" \ + " pushq %r10\n" \ + " pushq %r11\n" \ + " pushq %rbx\n" \ + " pushq %rbp\n" \ + " pushq %r12\n" \ + " pushq %r13\n" \ + " pushq %r14\n" \ + " pushq %r15\n" +#define RESTORE_REGS_STRING \ + " popq %r15\n" \ + " popq %r14\n" \ + " popq %r13\n" \ + " popq %r12\n" \ + " popq %rbp\n" \ + " popq %rbx\n" \ + " popq %r11\n" \ + " popq %r10\n" \ + " popq %r9\n" \ + " popq %r8\n" \ + " popq %rax\n" \ + " popq %rcx\n" \ + " popq %rdx\n" \ + " popq %rsi\n" \ + " popq %rdi\n" \ + /* Skip orig_ax, ip, cs */ \ + " addq $24, %rsp\n" +#else +#define SAVE_REGS_STRING \ + /* Skip cs, ip, orig_ax and gs. */ \ + " subl $16, %esp\n" \ + " pushl %fs\n" \ + " pushl %es\n" \ + " pushl %ds\n" \ + " pushl %eax\n" \ + " pushl %ebp\n" \ + " pushl %edi\n" \ + " pushl %esi\n" \ + " pushl %edx\n" \ + " pushl %ecx\n" \ + " pushl %ebx\n" +#define RESTORE_REGS_STRING \ + " popl %ebx\n" \ + " popl %ecx\n" \ + " popl %edx\n" \ + " popl %esi\n" \ + " popl %edi\n" \ + " popl %ebp\n" \ + " popl %eax\n" \ + /* Skip ds, es, fs, gs, orig_ax, and ip. Note: don't pop cs here*/\ + " addl $24, %esp\n" +#endif + +/* Ensure if the instruction can be boostable */ +extern int can_boost(kprobe_opcode_t *instruction); +/* Recover instruction if given address is probed */ +extern unsigned long recover_probed_instruction(kprobe_opcode_t *buf, + unsigned long addr); +/* + * Copy an instruction and adjust the displacement if the instruction + * uses the %rip-relative addressing mode. 
+ */ +extern int __copy_instruction(u8 *dest, u8 *src); + +/* Generate a relative-jump/call instruction */ +extern void synthesize_reljump(void *from, void *to); +extern void synthesize_relcall(void *from, void *to); + +#ifdef CONFIG_OPTPROBES +extern int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter); +extern unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr); +#else /* !CONFIG_OPTPROBES */ +static inline int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter) +{ + return 0; +} +static inline unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr) +{ + return addr; +} +#endif + +#ifdef CONFIG_KPROBES_ON_FTRACE +extern int skip_singlestep(struct kprobe *p, struct pt_regs *regs, + struct kprobe_ctlblk *kcb); +#else +static inline int skip_singlestep(struct kprobe *p, struct pt_regs *regs, + struct kprobe_ctlblk *kcb) +{ + return 0; +} +#endif +#endif diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c new file mode 100644 index 00000000000..67e6d19ef1b --- /dev/null +++ b/arch/x86/kernel/kprobes/core.c @@ -0,0 +1,1090 @@ +/* + * Kernel Probes (KProbes) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2002, 2004 + * + * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel + * Probes initial implementation ( includes contributions from + * Rusty Russell). + * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes + * interface to access function arguments. + * 2004-Oct Jim Keniston <jkenisto@us.ibm.com> and Prasanna S Panchamukhi + * <prasanna@in.ibm.com> adapted for x86_64 from i386. + * 2005-Mar Roland McGrath <roland@redhat.com> + * Fixed to handle %rip-relative addressing mode correctly. + * 2005-May Hien Nguyen <hien@us.ibm.com>, Jim Keniston + * <jkenisto@us.ibm.com> and Prasanna S Panchamukhi + * <prasanna@in.ibm.com> added function-return probes. + * 2005-May Rusty Lynch <rusty.lynch@intel.com> + * Added function return probes functionality + * 2006-Feb Masami Hiramatsu <hiramatu@sdl.hitachi.co.jp> added + * kprobe-booster and kretprobe-booster for i386. + * 2007-Dec Masami Hiramatsu <mhiramat@redhat.com> added kprobe-booster + * and kretprobe-booster for x86-64 + * 2007-Dec Masami Hiramatsu <mhiramat@redhat.com>, Arjan van de Ven + * <arjan@infradead.org> and Jim Keniston <jkenisto@us.ibm.com> + * unified x86 kprobes code. 
+ */ +#include <linux/kprobes.h> +#include <linux/ptrace.h> +#include <linux/string.h> +#include <linux/slab.h> +#include <linux/hardirq.h> +#include <linux/preempt.h> +#include <linux/module.h> +#include <linux/kdebug.h> +#include <linux/kallsyms.h> +#include <linux/ftrace.h> + +#include <asm/cacheflush.h> +#include <asm/desc.h> +#include <asm/pgtable.h> +#include <asm/uaccess.h> +#include <asm/alternative.h> +#include <asm/insn.h> +#include <asm/debugreg.h> + +#include "common.h" + +void jprobe_return_end(void); + +DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; +DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); + +#define stack_addr(regs) ((unsigned long *)kernel_stack_pointer(regs)) + +#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\ + (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ + (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \ + (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \ + (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \ + << (row % 32)) + /* + * Undefined/reserved opcodes, conditional jump, Opcode Extension + * Groups, and some special opcodes can not boost. + * This is non-const and volatile to keep gcc from statically + * optimizing it out, as variable_test_bit makes gcc think only + * *(unsigned long*) is used. + */ +static volatile u32 twobyte_is_boostable[256 / 32] = { + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* ---------------------------------------------- */ + W(0x00, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0) | /* 00 */ + W(0x10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 10 */ + W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 20 */ + W(0x30, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */ + W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */ + W(0x50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 50 */ + W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1) | /* 60 */ + W(0x70, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) , /* 70 */ + W(0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 80 */ + W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */ + W(0xa0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) | /* a0 */ + W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1) , /* b0 */ + W(0xc0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */ + W(0xd0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) , /* d0 */ + W(0xe0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) | /* e0 */ + W(0xf0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0) /* f0 */ + /* ----------------------------------------------- */ + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ +}; +#undef W + +struct kretprobe_blackpoint kretprobe_blacklist[] = { + {"__switch_to", }, /* This function switches only current task, but + doesn't switch kernel stack.*/ + {NULL, NULL} /* Terminator */ +}; + +const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist); + +static nokprobe_inline void +__synthesize_relative_insn(void *from, void *to, u8 op) +{ + struct __arch_relative_insn { + u8 op; + s32 raddr; + } __packed *insn; + + insn = (struct __arch_relative_insn *)from; + insn->raddr = (s32)((long)(to) - ((long)(from) + 5)); + insn->op = op; +} + +/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/ +void synthesize_reljump(void *from, void *to) +{ + __synthesize_relative_insn(from, to, RELATIVEJUMP_OPCODE); +} +NOKPROBE_SYMBOL(synthesize_reljump); + +/* Insert a call 
instruction at address 'from', which calls address 'to'.*/ +void synthesize_relcall(void *from, void *to) +{ + __synthesize_relative_insn(from, to, RELATIVECALL_OPCODE); +} +NOKPROBE_SYMBOL(synthesize_relcall); + +/* + * Skip the prefixes of the instruction. + */ +static kprobe_opcode_t *skip_prefixes(kprobe_opcode_t *insn) +{ + insn_attr_t attr; + + attr = inat_get_opcode_attribute((insn_byte_t)*insn); + while (inat_is_legacy_prefix(attr)) { + insn++; + attr = inat_get_opcode_attribute((insn_byte_t)*insn); + } +#ifdef CONFIG_X86_64 + if (inat_is_rex_prefix(attr)) + insn++; +#endif + return insn; +} +NOKPROBE_SYMBOL(skip_prefixes); + +/* + * Returns non-zero if opcode is boostable. + * RIP relative instructions are adjusted at copying time in 64 bits mode + */ +int can_boost(kprobe_opcode_t *opcodes) +{ + kprobe_opcode_t opcode; + kprobe_opcode_t *orig_opcodes = opcodes; + + if (search_exception_tables((unsigned long)opcodes)) + return 0; /* Page fault may occur on this address. */ + +retry: + if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1) + return 0; + opcode = *(opcodes++); + + /* 2nd-byte opcode */ + if (opcode == 0x0f) { + if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1) + return 0; + return test_bit(*opcodes, + (unsigned long *)twobyte_is_boostable); + } + + switch (opcode & 0xf0) { +#ifdef CONFIG_X86_64 + case 0x40: + goto retry; /* REX prefix is boostable */ +#endif + case 0x60: + if (0x63 < opcode && opcode < 0x67) + goto retry; /* prefixes */ + /* can't boost Address-size override and bound */ + return (opcode != 0x62 && opcode != 0x67); + case 0x70: + return 0; /* can't boost conditional jump */ + case 0xc0: + /* can't boost software-interruptions */ + return (0xc1 < opcode && opcode < 0xcc) || opcode == 0xcf; + case 0xd0: + /* can boost AA* and XLAT */ + return (opcode == 0xd4 || opcode == 0xd5 || opcode == 0xd7); + case 0xe0: + /* can boost in/out and absolute jmps */ + return ((opcode & 0x04) || opcode == 0xea); + case 0xf0: + if ((opcode & 0x0c) == 0 && opcode != 0xf1) + goto retry; /* lock/rep(ne) prefix */ + /* clear and set flags are boostable */ + return (opcode == 0xf5 || (0xf7 < opcode && opcode < 0xfe)); + default: + /* segment override prefixes are boostable */ + if (opcode == 0x26 || opcode == 0x36 || opcode == 0x3e) + goto retry; /* prefixes */ + /* CS override prefix and call are not boostable */ + return (opcode != 0x2e && opcode != 0x9a); + } +} + +static unsigned long +__recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr) +{ + struct kprobe *kp; + + kp = get_kprobe((void *)addr); + /* There is no probe, return original address */ + if (!kp) + return addr; + + /* + * Basically, kp->ainsn.insn has an original instruction. + * However, RIP-relative instruction can not do single-stepping + * at different place, __copy_instruction() tweaks the displacement of + * that instruction. In that case, we can't recover the instruction + * from the kp->ainsn.insn. + * + * On the other hand, kp->opcode has a copy of the first byte of + * the probed instruction, which is overwritten by int3. And + * the instruction at kp->addr is not modified by kprobes except + * for the first byte, we can recover the original instruction + * from it and kp->opcode. + */ + memcpy(buf, kp->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); + buf[0] = kp->opcode; + return (unsigned long)buf; +} + +/* + * Recover the probed instruction at addr for further analysis. + * Caller must lock kprobes by kprobe_mutex, or disable preemption + * for preventing to release referencing kprobes. 
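An aside on the twobyte_is_boostable[] table built with W() earlier in this file: it packs sixteen one-bit flags per half-row into a 256-bit bitmap that can_boost() probes with test_bit(). A reduced user-space sketch of the same packing (the demo_* names are illustrative, not kernel identifiers):

#include <stdio.h>

/* Pack 16 one-bit flags; bit b0 lands at (row % 32), b1 above it, etc. */
#define W(row, b0, b1, b2, b3, b4, b5, b6, b7,				\
	  b8, b9, ba, bb, bc, bd, be, bf)				\
	(((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3)| \
	  (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7)| \
	  (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb)| \
	  (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \
	 << (row % 32))

/* Two rows OR'd into one 32-bit word: opcodes 0x00-0x1f. */
static const unsigned int demo_map[1] = {
	W(0x00, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0) |
	W(0x10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
};

/* Equivalent of test_bit() on the packed map. */
static int demo_test_bit(int nr, const unsigned int *map)
{
	return (map[nr / 32] >> (nr % 32)) & 1;
}

int main(void)
{
	printf("0x02 boostable? %d\n", demo_test_bit(0x02, demo_map)); /* 1 */
	printf("0x05 boostable? %d\n", demo_test_bit(0x05, demo_map)); /* 0 */
	return 0;
}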
+ */ +unsigned long recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr) +{ + unsigned long __addr; + + __addr = __recover_optprobed_insn(buf, addr); + if (__addr != addr) + return __addr; + + return __recover_probed_insn(buf, addr); +} + +/* Check if paddr is at an instruction boundary */ +static int can_probe(unsigned long paddr) +{ + unsigned long addr, __addr, offset = 0; + struct insn insn; + kprobe_opcode_t buf[MAX_INSN_SIZE]; + + if (!kallsyms_lookup_size_offset(paddr, NULL, &offset)) + return 0; + + /* Decode instructions */ + addr = paddr - offset; + while (addr < paddr) { + /* + * Check if the instruction has been modified by another + * kprobe, in which case we replace the breakpoint by the + * original instruction in our buffer. + * Also, jump optimization will change the breakpoint to + * relative-jump. Since the relative-jump itself is + * normally used, we just go through if there is no kprobe. + */ + __addr = recover_probed_instruction(buf, addr); + kernel_insn_init(&insn, (void *)__addr); + insn_get_length(&insn); + + /* + * Another debugging subsystem might insert this breakpoint. + * In that case, we can't recover it. + */ + if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) + return 0; + addr += insn.length; + } + + return (addr == paddr); +} + +/* + * Returns non-zero if opcode modifies the interrupt flag. + */ +static int is_IF_modifier(kprobe_opcode_t *insn) +{ + /* Skip prefixes */ + insn = skip_prefixes(insn); + + switch (*insn) { + case 0xfa: /* cli */ + case 0xfb: /* sti */ + case 0xcf: /* iret/iretd */ + case 0x9d: /* popf/popfd */ + return 1; + } + + return 0; +} + +/* + * Copy an instruction and adjust the displacement if the instruction + * uses the %rip-relative addressing mode. + * If it does, Return the address of the 32-bit displacement word. + * If not, return null. + * Only applicable to 64-bit x86. + */ +int __copy_instruction(u8 *dest, u8 *src) +{ + struct insn insn; + kprobe_opcode_t buf[MAX_INSN_SIZE]; + + kernel_insn_init(&insn, (void *)recover_probed_instruction(buf, (unsigned long)src)); + insn_get_length(&insn); + /* Another subsystem puts a breakpoint, failed to recover */ + if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) + return 0; + memcpy(dest, insn.kaddr, insn.length); + +#ifdef CONFIG_X86_64 + if (insn_rip_relative(&insn)) { + s64 newdisp; + u8 *disp; + kernel_insn_init(&insn, dest); + insn_get_displacement(&insn); + /* + * The copied instruction uses the %rip-relative addressing + * mode. Adjust the displacement for the difference between + * the original location of this instruction and the location + * of the copy that will actually be run. The tricky bit here + * is making sure that the sign extension happens correctly in + * this calculation, since we need a signed 32-bit result to + * be sign-extended to 64 bits when it's added to the %rip + * value and yield the same 64-bit result that the sign- + * extension of the original signed 32-bit displacement would + * have given. 
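A worked instance of the displacement fix-up described below in __copy_instruction(), with made-up addresses (purely illustrative):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* hypothetical addresses, for illustration only */
	uint64_t src  = 0xffffffff81000000ULL;	/* probed instruction  */
	uint64_t dest = 0xffffffffa0002000ULL;	/* out-of-line copy    */
	int32_t  disp = 0x1234;			/* original rel32 disp */

	/* same arithmetic as the kernel: shift disp so the target stays fixed */
	int64_t newdisp = (int64_t)(src + (int64_t)disp - dest);

	if ((int64_t)(int32_t)newdisp != newdisp)
		printf("does not fit in s32, copy refused\n");
	else
		printf("newdisp = %lld\n", (long long)newdisp);
	return 0;
}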
+ */ + newdisp = (u8 *) src + (s64) insn.displacement.value - (u8 *) dest; + if ((s64) (s32) newdisp != newdisp) { + pr_err("Kprobes error: new displacement does not fit into s32 (%llx)\n", newdisp); + pr_err("\tSrc: %p, Dest: %p, old disp: %x\n", src, dest, insn.displacement.value); + return 0; + } + disp = (u8 *) dest + insn_offset_displacement(&insn); + *(s32 *) disp = (s32) newdisp; + } +#endif + return insn.length; +} + +static int arch_copy_kprobe(struct kprobe *p) +{ + int ret; + + /* Copy an instruction with recovering if other optprobe modifies it.*/ + ret = __copy_instruction(p->ainsn.insn, p->addr); + if (!ret) + return -EINVAL; + + /* + * __copy_instruction can modify the displacement of the instruction, + * but it doesn't affect boostable check. + */ + if (can_boost(p->ainsn.insn)) + p->ainsn.boostable = 0; + else + p->ainsn.boostable = -1; + + /* Check whether the instruction modifies Interrupt Flag or not */ + p->ainsn.if_modifier = is_IF_modifier(p->ainsn.insn); + + /* Also, displacement change doesn't affect the first byte */ + p->opcode = p->ainsn.insn[0]; + + return 0; +} + +int arch_prepare_kprobe(struct kprobe *p) +{ + if (alternatives_text_reserved(p->addr, p->addr)) + return -EINVAL; + + if (!can_probe((unsigned long)p->addr)) + return -EILSEQ; + /* insn: must be on special executable page on x86. */ + p->ainsn.insn = get_insn_slot(); + if (!p->ainsn.insn) + return -ENOMEM; + + return arch_copy_kprobe(p); +} + +void arch_arm_kprobe(struct kprobe *p) +{ + text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1); +} + +void arch_disarm_kprobe(struct kprobe *p) +{ + text_poke(p->addr, &p->opcode, 1); +} + +void arch_remove_kprobe(struct kprobe *p) +{ + if (p->ainsn.insn) { + free_insn_slot(p->ainsn.insn, (p->ainsn.boostable == 1)); + p->ainsn.insn = NULL; + } +} + +static nokprobe_inline void +save_previous_kprobe(struct kprobe_ctlblk *kcb) +{ + kcb->prev_kprobe.kp = kprobe_running(); + kcb->prev_kprobe.status = kcb->kprobe_status; + kcb->prev_kprobe.old_flags = kcb->kprobe_old_flags; + kcb->prev_kprobe.saved_flags = kcb->kprobe_saved_flags; +} + +static nokprobe_inline void +restore_previous_kprobe(struct kprobe_ctlblk *kcb) +{ + __this_cpu_write(current_kprobe, kcb->prev_kprobe.kp); + kcb->kprobe_status = kcb->prev_kprobe.status; + kcb->kprobe_old_flags = kcb->prev_kprobe.old_flags; + kcb->kprobe_saved_flags = kcb->prev_kprobe.saved_flags; +} + +static nokprobe_inline void +set_current_kprobe(struct kprobe *p, struct pt_regs *regs, + struct kprobe_ctlblk *kcb) +{ + __this_cpu_write(current_kprobe, p); + kcb->kprobe_saved_flags = kcb->kprobe_old_flags + = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF)); + if (p->ainsn.if_modifier) + kcb->kprobe_saved_flags &= ~X86_EFLAGS_IF; +} + +static nokprobe_inline void clear_btf(void) +{ + if (test_thread_flag(TIF_BLOCKSTEP)) { + unsigned long debugctl = get_debugctlmsr(); + + debugctl &= ~DEBUGCTLMSR_BTF; + update_debugctlmsr(debugctl); + } +} + +static nokprobe_inline void restore_btf(void) +{ + if (test_thread_flag(TIF_BLOCKSTEP)) { + unsigned long debugctl = get_debugctlmsr(); + + debugctl |= DEBUGCTLMSR_BTF; + update_debugctlmsr(debugctl); + } +} + +void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) +{ + unsigned long *sara = stack_addr(regs); + + ri->ret_addr = (kprobe_opcode_t *) *sara; + + /* Replace the return addr with trampoline addr */ + *sara = (unsigned long) &kretprobe_trampoline; +} +NOKPROBE_SYMBOL(arch_prepare_kretprobe); + +static void setup_singlestep(struct kprobe *p, 
struct pt_regs *regs, + struct kprobe_ctlblk *kcb, int reenter) +{ + if (setup_detour_execution(p, regs, reenter)) + return; + +#if !defined(CONFIG_PREEMPT) + if (p->ainsn.boostable == 1 && !p->post_handler) { + /* Boost up -- we can execute copied instructions directly */ + if (!reenter) + reset_current_kprobe(); + /* + * Reentering boosted probe doesn't reset current_kprobe, + * nor set current_kprobe, because it doesn't use single + * stepping. + */ + regs->ip = (unsigned long)p->ainsn.insn; + preempt_enable_no_resched(); + return; + } +#endif + if (reenter) { + save_previous_kprobe(kcb); + set_current_kprobe(p, regs, kcb); + kcb->kprobe_status = KPROBE_REENTER; + } else + kcb->kprobe_status = KPROBE_HIT_SS; + /* Prepare real single stepping */ + clear_btf(); + regs->flags |= X86_EFLAGS_TF; + regs->flags &= ~X86_EFLAGS_IF; + /* single step inline if the instruction is an int3 */ + if (p->opcode == BREAKPOINT_INSTRUCTION) + regs->ip = (unsigned long)p->addr; + else + regs->ip = (unsigned long)p->ainsn.insn; +} +NOKPROBE_SYMBOL(setup_singlestep); + +/* + * We have reentered the kprobe_handler(), since another probe was hit while + * within the handler. We save the original kprobes variables and just single + * step on the instruction of the new probe without calling any user handlers. + */ +static int reenter_kprobe(struct kprobe *p, struct pt_regs *regs, + struct kprobe_ctlblk *kcb) +{ + switch (kcb->kprobe_status) { + case KPROBE_HIT_SSDONE: + case KPROBE_HIT_ACTIVE: + case KPROBE_HIT_SS: + kprobes_inc_nmissed_count(p); + setup_singlestep(p, regs, kcb, 1); + break; + case KPROBE_REENTER: + /* A probe has been hit in the codepath leading up to, or just + * after, single-stepping of a probed instruction. This entire + * codepath should strictly reside in .kprobes.text section. + * Raise a BUG or we'll continue in an endless reentering loop + * and eventually a stack overflow. + */ + printk(KERN_WARNING "Unrecoverable kprobe detected at %p.\n", + p->addr); + dump_kprobe(p); + BUG(); + default: + /* impossible cases */ + WARN_ON(1); + return 0; + } + + return 1; +} +NOKPROBE_SYMBOL(reenter_kprobe); + +/* + * Interrupts are disabled on entry as trap3 is an interrupt gate and they + * remain disabled throughout this function. + */ +int kprobe_int3_handler(struct pt_regs *regs) +{ + kprobe_opcode_t *addr; + struct kprobe *p; + struct kprobe_ctlblk *kcb; + + if (user_mode_vm(regs)) + return 0; + + addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t)); + /* + * We don't want to be preempted for the entire + * duration of kprobe processing. We conditionally + * re-enable preemption at the end of this function, + * and also in reenter_kprobe() and setup_singlestep(). + */ + preempt_disable(); + + kcb = get_kprobe_ctlblk(); + p = get_kprobe(addr); + + if (p) { + if (kprobe_running()) { + if (reenter_kprobe(p, regs, kcb)) + return 1; + } else { + set_current_kprobe(p, regs, kcb); + kcb->kprobe_status = KPROBE_HIT_ACTIVE; + + /* + * If we have no pre-handler or it returned 0, we + * continue with normal processing. If we have a + * pre-handler and it returned non-zero, it prepped + * for calling the break_handler below on re-entry + * for jprobe processing, so get out doing nothing + * more here. + */ + if (!p->pre_handler || !p->pre_handler(p, regs)) + setup_singlestep(p, regs, kcb, 0); + return 1; + } + } else if (*addr != BREAKPOINT_INSTRUCTION) { + /* + * The breakpoint instruction was removed right + * after we hit it. 
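For orientation, here is roughly what sits on the other side of this handler: a minimal kprobes client in the style of samples/kprobes/kprobe_example.c. Its pre_handler is what the `!p->pre_handler || !p->pre_handler(p, regs)` check below invokes (symbol choice and messages are illustrative):

#include <linux/module.h>
#include <linux/kprobes.h>

static struct kprobe kp = {
	.symbol_name = "do_fork",	/* illustrative probe point */
};

/* Runs from kprobe_int3_handler(); return 0 to continue single-stepping. */
static int handler_pre(struct kprobe *p, struct pt_regs *regs)
{
	pr_info("pre: ip=%lx flags=%lx\n", regs->ip, regs->flags);
	return 0;
}

/* Runs from kprobe_debug_handler() once the copied insn has been stepped. */
static void handler_post(struct kprobe *p, struct pt_regs *regs,
			 unsigned long flags)
{
	pr_info("post: ip=%lx\n", regs->ip);
}

static int __init kp_init(void)
{
	kp.pre_handler = handler_pre;
	kp.post_handler = handler_post;
	return register_kprobe(&kp);
}

static void __exit kp_exit(void)
{
	unregister_kprobe(&kp);
}

module_init(kp_init);
module_exit(kp_exit);
MODULE_LICENSE("GPL");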
Another cpu has removed + * either a probepoint or a debugger breakpoint + * at this address. In either case, no further + * handling of this interrupt is appropriate. + * Back up over the (now missing) int3 and run + * the original instruction. + */ + regs->ip = (unsigned long)addr; + preempt_enable_no_resched(); + return 1; + } else if (kprobe_running()) { + p = __this_cpu_read(current_kprobe); + if (p->break_handler && p->break_handler(p, regs)) { + if (!skip_singlestep(p, regs, kcb)) + setup_singlestep(p, regs, kcb, 0); + return 1; + } + } /* else: not a kprobe fault; let the kernel handle it */ + + preempt_enable_no_resched(); + return 0; +} +NOKPROBE_SYMBOL(kprobe_int3_handler); + +/* + * When a retprobed function returns, this code saves registers and + * calls trampoline_handler() runs, which calls the kretprobe's handler. + */ +static void __used kretprobe_trampoline_holder(void) +{ + asm volatile ( + ".global kretprobe_trampoline\n" + "kretprobe_trampoline: \n" +#ifdef CONFIG_X86_64 + /* We don't bother saving the ss register */ + " pushq %rsp\n" + " pushfq\n" + SAVE_REGS_STRING + " movq %rsp, %rdi\n" + " call trampoline_handler\n" + /* Replace saved sp with true return address. */ + " movq %rax, 152(%rsp)\n" + RESTORE_REGS_STRING + " popfq\n" +#else + " pushf\n" + SAVE_REGS_STRING + " movl %esp, %eax\n" + " call trampoline_handler\n" + /* Move flags to cs */ + " movl 56(%esp), %edx\n" + " movl %edx, 52(%esp)\n" + /* Replace saved flags with true return address. */ + " movl %eax, 56(%esp)\n" + RESTORE_REGS_STRING + " popf\n" +#endif + " ret\n"); +} +NOKPROBE_SYMBOL(kretprobe_trampoline_holder); +NOKPROBE_SYMBOL(kretprobe_trampoline); + +/* + * Called from kretprobe_trampoline + */ +__visible __used void *trampoline_handler(struct pt_regs *regs) +{ + struct kretprobe_instance *ri = NULL; + struct hlist_head *head, empty_rp; + struct hlist_node *tmp; + unsigned long flags, orig_ret_address = 0; + unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline; + kprobe_opcode_t *correct_ret_addr = NULL; + + INIT_HLIST_HEAD(&empty_rp); + kretprobe_hash_lock(current, &head, &flags); + /* fixup registers */ +#ifdef CONFIG_X86_64 + regs->cs = __KERNEL_CS; +#else + regs->cs = __KERNEL_CS | get_kernel_rpl(); + regs->gs = 0; +#endif + regs->ip = trampoline_address; + regs->orig_ax = ~0UL; + + /* + * It is possible to have multiple instances associated with a given + * task either because multiple functions in the call path have + * return probes installed on them, and/or more than one + * return probe was registered for a target function. + * + * We can handle this because: + * - instances are always pushed into the head of the list + * - when multiple return probes are registered for the same + * function, the (chronologically) first instance's ret_addr + * will be the real return address, and all the rest will + * point to kretprobe_trampoline. + */ + hlist_for_each_entry_safe(ri, tmp, head, hlist) { + if (ri->task != current) + /* another task is sharing our hash bucket */ + continue; + + orig_ret_address = (unsigned long)ri->ret_addr; + + if (orig_ret_address != trampoline_address) + /* + * This is the real return address. 
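The ri->rp->handler invoked from this walk belongs to the kretprobe client; a minimal sketch in the style of samples/kprobes/kretprobe_example.c (target symbol illustrative):

#include <linux/module.h>
#include <linux/kprobes.h>

/* Called via trampoline_handler() when the probed function returns. */
static int ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
{
	pr_info("%s returned %lu\n", ri->rp->kp.symbol_name,
		regs_return_value(regs));
	return 0;
}

static struct kretprobe my_kretprobe = {
	.handler	= ret_handler,
	.kp.symbol_name	= "do_fork",	/* illustrative target */
	.maxactive	= 20,	/* instances for concurrent calls */
};

static int __init krp_init(void)
{
	return register_kretprobe(&my_kretprobe);
}

static void __exit krp_exit(void)
{
	unregister_kretprobe(&my_kretprobe);
}

module_init(krp_init);
module_exit(krp_exit);
MODULE_LICENSE("GPL");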
Any other + * instances associated with this task are for + * other calls deeper on the call stack + */ + break; + } + + kretprobe_assert(ri, orig_ret_address, trampoline_address); + + correct_ret_addr = ri->ret_addr; + hlist_for_each_entry_safe(ri, tmp, head, hlist) { + if (ri->task != current) + /* another task is sharing our hash bucket */ + continue; + + orig_ret_address = (unsigned long)ri->ret_addr; + if (ri->rp && ri->rp->handler) { + __this_cpu_write(current_kprobe, &ri->rp->kp); + get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE; + ri->ret_addr = correct_ret_addr; + ri->rp->handler(ri, regs); + __this_cpu_write(current_kprobe, NULL); + } + + recycle_rp_inst(ri, &empty_rp); + + if (orig_ret_address != trampoline_address) + /* + * This is the real return address. Any other + * instances associated with this task are for + * other calls deeper on the call stack + */ + break; + } + + kretprobe_hash_unlock(current, &flags); + + hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) { + hlist_del(&ri->hlist); + kfree(ri); + } + return (void *)orig_ret_address; +} +NOKPROBE_SYMBOL(trampoline_handler); + +/* + * Called after single-stepping. p->addr is the address of the + * instruction whose first byte has been replaced by the "int 3" + * instruction. To avoid the SMP problems that can occur when we + * temporarily put back the original opcode to single-step, we + * single-stepped a copy of the instruction. The address of this + * copy is p->ainsn.insn. + * + * This function prepares to return from the post-single-step + * interrupt. We have to fix up the stack as follows: + * + * 0) Except in the case of absolute or indirect jump or call instructions, + * the new ip is relative to the copied instruction. We need to make + * it relative to the original instruction. + * + * 1) If the single-stepped instruction was pushfl, then the TF and IF + * flags are set in the just-pushed flags, and may need to be cleared. + * + * 2) If the single-stepped instruction was a call, the return address + * that is atop the stack is the address following the copied instruction. + * We need to make it the address following the original instruction. + * + * If this is the first time we've single-stepped the instruction at + * this probepoint, and the instruction is boostable, boost it: add a + * jump instruction after the copied instruction, that jumps to the next + * instruction after the probepoint. + */ +static void resume_execution(struct kprobe *p, struct pt_regs *regs, + struct kprobe_ctlblk *kcb) +{ + unsigned long *tos = stack_addr(regs); + unsigned long copy_ip = (unsigned long)p->ainsn.insn; + unsigned long orig_ip = (unsigned long)p->addr; + kprobe_opcode_t *insn = p->ainsn.insn; + + /* Skip prefixes */ + insn = skip_prefixes(insn); + + regs->flags &= ~X86_EFLAGS_TF; + switch (*insn) { + case 0x9c: /* pushfl */ + *tos &= ~(X86_EFLAGS_TF | X86_EFLAGS_IF); + *tos |= kcb->kprobe_old_flags; + break; + case 0xc2: /* iret/ret/lret */ + case 0xc3: + case 0xca: + case 0xcb: + case 0xcf: + case 0xea: /* jmp absolute -- ip is correct */ + /* ip is already adjusted, no more changes required */ + p->ainsn.boostable = 1; + goto no_change; + case 0xe8: /* call relative - Fix return addr */ + *tos = orig_ip + (*tos - copy_ip); + break; +#ifdef CONFIG_X86_32 + case 0x9a: /* call absolute -- same as call absolute, indirect */ + *tos = orig_ip + (*tos - copy_ip); + goto no_change; +#endif + case 0xff: + if ((insn[1] & 0x30) == 0x10) { + /* + * call absolute, indirect + * Fix return addr; ip is correct. 
+ * But this is not boostable + */ + *tos = orig_ip + (*tos - copy_ip); + goto no_change; + } else if (((insn[1] & 0x31) == 0x20) || + ((insn[1] & 0x31) == 0x21)) { + /* + * jmp near and far, absolute indirect + * ip is correct. And this is boostable + */ + p->ainsn.boostable = 1; + goto no_change; + } + default: + break; + } + + if (p->ainsn.boostable == 0) { + if ((regs->ip > copy_ip) && + (regs->ip - copy_ip) + 5 < MAX_INSN_SIZE) { + /* + * These instructions can be executed directly if it + * jumps back to correct address. + */ + synthesize_reljump((void *)regs->ip, + (void *)orig_ip + (regs->ip - copy_ip)); + p->ainsn.boostable = 1; + } else { + p->ainsn.boostable = -1; + } + } + + regs->ip += orig_ip - copy_ip; + +no_change: + restore_btf(); +} +NOKPROBE_SYMBOL(resume_execution); + +/* + * Interrupts are disabled on entry as trap1 is an interrupt gate and they + * remain disabled throughout this function. + */ +int kprobe_debug_handler(struct pt_regs *regs) +{ + struct kprobe *cur = kprobe_running(); + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + if (!cur) + return 0; + + resume_execution(cur, regs, kcb); + regs->flags |= kcb->kprobe_saved_flags; + + if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) { + kcb->kprobe_status = KPROBE_HIT_SSDONE; + cur->post_handler(cur, regs, 0); + } + + /* Restore back the original saved kprobes variables and continue. */ + if (kcb->kprobe_status == KPROBE_REENTER) { + restore_previous_kprobe(kcb); + goto out; + } + reset_current_kprobe(); +out: + preempt_enable_no_resched(); + + /* + * if somebody else is singlestepping across a probe point, flags + * will have TF set, in which case, continue the remaining processing + * of do_debug, as if this is not a probe hit. + */ + if (regs->flags & X86_EFLAGS_TF) + return 0; + + return 1; +} +NOKPROBE_SYMBOL(kprobe_debug_handler); + +int kprobe_fault_handler(struct pt_regs *regs, int trapnr) +{ + struct kprobe *cur = kprobe_running(); + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + if (unlikely(regs->ip == (unsigned long)cur->ainsn.insn)) { + /* This must happen on single-stepping */ + WARN_ON(kcb->kprobe_status != KPROBE_HIT_SS && + kcb->kprobe_status != KPROBE_REENTER); + /* + * We are here because the instruction being single + * stepped caused a page fault. We reset the current + * kprobe and the ip points back to the probe address + * and allow the page fault handler to continue as a + * normal page fault. + */ + regs->ip = (unsigned long)cur->addr; + regs->flags |= kcb->kprobe_old_flags; + if (kcb->kprobe_status == KPROBE_REENTER) + restore_previous_kprobe(kcb); + else + reset_current_kprobe(); + preempt_enable_no_resched(); + } else if (kcb->kprobe_status == KPROBE_HIT_ACTIVE || + kcb->kprobe_status == KPROBE_HIT_SSDONE) { + /* + * We increment the nmissed count for accounting, + * we can also use npre/npostfault count for accounting + * these specific fault cases. + */ + kprobes_inc_nmissed_count(cur); + + /* + * We come here because instructions in the pre/post + * handler caused the page_fault, this could happen + * if handler tries to access user space by + * copy_from_user(), get_user() etc. Let the + * user-specified handler try to fix it first. + */ + if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr)) + return 1; + + /* + * In case the user-specified fault handler returned + * zero, try to fix up. + */ + if (fixup_exception(regs)) + return 1; + + /* + * fixup routine could not handle it, + * Let do_page_fault() fix it. 
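The user-visible side of the fault path handled by kprobe_fault_handler() below is the optional fault_handler member of struct kprobe; a minimal sketch (message text illustrative):

#include <linux/kprobes.h>

/*
 * Invoked when a pre/post handler faults, e.g. on a bad copy_from_user().
 * Returning 1 claims the fault; 0 defers to normal kernel fault handling.
 */
static int handler_fault(struct kprobe *p, struct pt_regs *regs, int trapnr)
{
	pr_info("kprobe at %p: trap %d raised in handler\n", p->addr, trapnr);
	return 0;
}

/* wired up next to the other callbacks: kp.fault_handler = handler_fault; */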
+ */ + } + + return 0; +} +NOKPROBE_SYMBOL(kprobe_fault_handler); + +/* + * Wrapper routine for handling exceptions. + */ +int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, + void *data) +{ + struct die_args *args = data; + int ret = NOTIFY_DONE; + + if (args->regs && user_mode_vm(args->regs)) + return ret; + + if (val == DIE_GPF) { + /* + * To be potentially processing a kprobe fault and to + * trust the result from kprobe_running(), we have + * be non-preemptible. + */ + if (!preemptible() && kprobe_running() && + kprobe_fault_handler(args->regs, args->trapnr)) + ret = NOTIFY_STOP; + } + return ret; +} +NOKPROBE_SYMBOL(kprobe_exceptions_notify); + +int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) +{ + struct jprobe *jp = container_of(p, struct jprobe, kp); + unsigned long addr; + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + kcb->jprobe_saved_regs = *regs; + kcb->jprobe_saved_sp = stack_addr(regs); + addr = (unsigned long)(kcb->jprobe_saved_sp); + + /* + * As Linus pointed out, gcc assumes that the callee + * owns the argument space and could overwrite it, e.g. + * tailcall optimization. So, to be absolutely safe + * we also save and restore enough stack bytes to cover + * the argument area. + */ + memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr, + MIN_STACK_SIZE(addr)); + regs->flags &= ~X86_EFLAGS_IF; + trace_hardirqs_off(); + regs->ip = (unsigned long)(jp->entry); + return 1; +} +NOKPROBE_SYMBOL(setjmp_pre_handler); + +void jprobe_return(void) +{ + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + asm volatile ( +#ifdef CONFIG_X86_64 + " xchg %%rbx,%%rsp \n" +#else + " xchgl %%ebx,%%esp \n" +#endif + " int3 \n" + " .globl jprobe_return_end\n" + " jprobe_return_end: \n" + " nop \n"::"b" + (kcb->jprobe_saved_sp):"memory"); +} +NOKPROBE_SYMBOL(jprobe_return); +NOKPROBE_SYMBOL(jprobe_return_end); + +int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) +{ + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + u8 *addr = (u8 *) (regs->ip - 1); + struct jprobe *jp = container_of(p, struct jprobe, kp); + + if ((addr > (u8 *) jprobe_return) && + (addr < (u8 *) jprobe_return_end)) { + if (stack_addr(regs) != kcb->jprobe_saved_sp) { + struct pt_regs *saved_regs = &kcb->jprobe_saved_regs; + printk(KERN_ERR + "current sp %p does not match saved sp %p\n", + stack_addr(regs), kcb->jprobe_saved_sp); + printk(KERN_ERR "Saved registers for jprobe %p\n", jp); + show_regs(saved_regs); + printk(KERN_ERR "Current registers\n"); + show_regs(regs); + BUG(); + } + *regs = kcb->jprobe_saved_regs; + memcpy((kprobe_opcode_t *)(kcb->jprobe_saved_sp), + kcb->jprobes_stack, + MIN_STACK_SIZE(kcb->jprobe_saved_sp)); + preempt_enable_no_resched(); + return 1; + } + return 0; +} +NOKPROBE_SYMBOL(longjmp_break_handler); + +bool arch_within_kprobe_blacklist(unsigned long addr) +{ + return (addr >= (unsigned long)__kprobes_text_start && + addr < (unsigned long)__kprobes_text_end) || + (addr >= (unsigned long)__entry_text_start && + addr < (unsigned long)__entry_text_end); +} + +int __init arch_init_kprobes(void) +{ + return 0; +} + +int arch_trampoline_kprobe(struct kprobe *p) +{ + return 0; +} diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c new file mode 100644 index 00000000000..717b02a22e6 --- /dev/null +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -0,0 +1,96 @@ +/* + * Dynamic Ftrace based Kprobes Optimization + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU 
General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) Hitachi Ltd., 2012 + */ +#include <linux/kprobes.h> +#include <linux/ptrace.h> +#include <linux/hardirq.h> +#include <linux/preempt.h> +#include <linux/ftrace.h> + +#include "common.h" + +static nokprobe_inline +int __skip_singlestep(struct kprobe *p, struct pt_regs *regs, + struct kprobe_ctlblk *kcb) +{ + /* + * Emulate singlestep (and also recover regs->ip) + * as if there is a 5byte nop + */ + regs->ip = (unsigned long)p->addr + MCOUNT_INSN_SIZE; + if (unlikely(p->post_handler)) { + kcb->kprobe_status = KPROBE_HIT_SSDONE; + p->post_handler(p, regs, 0); + } + __this_cpu_write(current_kprobe, NULL); + return 1; +} + +int skip_singlestep(struct kprobe *p, struct pt_regs *regs, + struct kprobe_ctlblk *kcb) +{ + if (kprobe_ftrace(p)) + return __skip_singlestep(p, regs, kcb); + else + return 0; +} +NOKPROBE_SYMBOL(skip_singlestep); + +/* Ftrace callback handler for kprobes */ +void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *ops, struct pt_regs *regs) +{ + struct kprobe *p; + struct kprobe_ctlblk *kcb; + unsigned long flags; + + /* Disable irq for emulating a breakpoint and avoiding preempt */ + local_irq_save(flags); + + p = get_kprobe((kprobe_opcode_t *)ip); + if (unlikely(!p) || kprobe_disabled(p)) + goto end; + + kcb = get_kprobe_ctlblk(); + if (kprobe_running()) { + kprobes_inc_nmissed_count(p); + } else { + /* Kprobe handler expects regs->ip = ip + 1 as breakpoint hit */ + regs->ip = ip + sizeof(kprobe_opcode_t); + + __this_cpu_write(current_kprobe, p); + kcb->kprobe_status = KPROBE_HIT_ACTIVE; + if (!p->pre_handler || !p->pre_handler(p, regs)) + __skip_singlestep(p, regs, kcb); + /* + * If pre_handler returns !0, it sets regs->ip and + * resets current kprobe. + */ + } +end: + local_irq_restore(flags); +} +NOKPROBE_SYMBOL(kprobe_ftrace_handler); + +int arch_prepare_kprobe_ftrace(struct kprobe *p) +{ + p->ainsn.insn = NULL; + p->ainsn.boostable = -1; + return 0; +} diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c new file mode 100644 index 00000000000..f304773285a --- /dev/null +++ b/arch/x86/kernel/kprobes/opt.c @@ -0,0 +1,445 @@ +/* + * Kernel Probes Jump Optimization (Optprobes) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2002, 2004 + * Copyright (C) Hitachi Ltd., 2012 + */ +#include <linux/kprobes.h> +#include <linux/ptrace.h> +#include <linux/string.h> +#include <linux/slab.h> +#include <linux/hardirq.h> +#include <linux/preempt.h> +#include <linux/module.h> +#include <linux/kdebug.h> +#include <linux/kallsyms.h> +#include <linux/ftrace.h> + +#include <asm/cacheflush.h> +#include <asm/desc.h> +#include <asm/pgtable.h> +#include <asm/uaccess.h> +#include <asm/alternative.h> +#include <asm/insn.h> +#include <asm/debugreg.h> + +#include "common.h" + +unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr) +{ + struct optimized_kprobe *op; + struct kprobe *kp; + long offs; + int i; + + for (i = 0; i < RELATIVEJUMP_SIZE; i++) { + kp = get_kprobe((void *)addr - i); + /* This function only handles jump-optimized kprobe */ + if (kp && kprobe_optimized(kp)) { + op = container_of(kp, struct optimized_kprobe, kp); + /* If op->list is not empty, op is under optimizing */ + if (list_empty(&op->list)) + goto found; + } + } + + return addr; +found: + /* + * If the kprobe can be optimized, original bytes which can be + * overwritten by jump destination address. In this case, original + * bytes must be recovered from op->optinsn.copied_insn buffer. + */ + memcpy(buf, (void *)addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); + if (addr == (unsigned long)kp->addr) { + buf[0] = kp->opcode; + memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); + } else { + offs = addr - (unsigned long)kp->addr - 1; + memcpy(buf, op->optinsn.copied_insn + offs, RELATIVE_ADDR_SIZE - offs); + } + + return (unsigned long)buf; +} + +/* Insert a move instruction which sets a pointer to eax/rdi (1st arg). 
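Everything in this file revolves around the 5-byte rel32 jump/call, so a user-space sketch of the displacement formula behind synthesize_reljump()/synthesize_relcall() (mirroring __synthesize_relative_insn() from core.c) may help:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* Emit op + rel32 into a local buffer, the way core.c patches text. */
static void sketch_rel_insn(uint8_t *from, uint8_t *to, uint8_t op)
{
	int32_t raddr = (int32_t)((intptr_t)to - ((intptr_t)from + 5));

	from[0] = op;			/* 0xe9 = jmp, 0xe8 = call */
	memcpy(from + 1, &raddr, 4);	/* little-endian rel32     */
}

int main(void)
{
	uint8_t buf[69];

	sketch_rel_insn(buf, buf + 64, 0xe9);	/* jump 64 bytes ahead */
	printf("rel32 = %d\n", *(int32_t *)(buf + 1));	/* 59 = 64 - 5 */
	return 0;
}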
*/ +static void synthesize_set_arg1(kprobe_opcode_t *addr, unsigned long val) +{ +#ifdef CONFIG_X86_64 + *addr++ = 0x48; + *addr++ = 0xbf; +#else + *addr++ = 0xb8; +#endif + *(unsigned long *)addr = val; +} + +asm ( + ".global optprobe_template_entry\n" + "optprobe_template_entry:\n" +#ifdef CONFIG_X86_64 + /* We don't bother saving the ss register */ + " pushq %rsp\n" + " pushfq\n" + SAVE_REGS_STRING + " movq %rsp, %rsi\n" + ".global optprobe_template_val\n" + "optprobe_template_val:\n" + ASM_NOP5 + ASM_NOP5 + ".global optprobe_template_call\n" + "optprobe_template_call:\n" + ASM_NOP5 + /* Move flags to rsp */ + " movq 144(%rsp), %rdx\n" + " movq %rdx, 152(%rsp)\n" + RESTORE_REGS_STRING + /* Skip flags entry */ + " addq $8, %rsp\n" + " popfq\n" +#else /* CONFIG_X86_32 */ + " pushf\n" + SAVE_REGS_STRING + " movl %esp, %edx\n" + ".global optprobe_template_val\n" + "optprobe_template_val:\n" + ASM_NOP5 + ".global optprobe_template_call\n" + "optprobe_template_call:\n" + ASM_NOP5 + RESTORE_REGS_STRING + " addl $4, %esp\n" /* skip cs */ + " popf\n" +#endif + ".global optprobe_template_end\n" + "optprobe_template_end:\n"); + +#define TMPL_MOVE_IDX \ + ((long)&optprobe_template_val - (long)&optprobe_template_entry) +#define TMPL_CALL_IDX \ + ((long)&optprobe_template_call - (long)&optprobe_template_entry) +#define TMPL_END_IDX \ + ((long)&optprobe_template_end - (long)&optprobe_template_entry) + +#define INT3_SIZE sizeof(kprobe_opcode_t) + +/* Optimized kprobe call back function: called from optinsn */ +static void +optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs) +{ + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + unsigned long flags; + + /* This is possible if op is under delayed unoptimizing */ + if (kprobe_disabled(&op->kp)) + return; + + local_irq_save(flags); + if (kprobe_running()) { + kprobes_inc_nmissed_count(&op->kp); + } else { + /* Save skipped registers */ +#ifdef CONFIG_X86_64 + regs->cs = __KERNEL_CS; +#else + regs->cs = __KERNEL_CS | get_kernel_rpl(); + regs->gs = 0; +#endif + regs->ip = (unsigned long)op->kp.addr + INT3_SIZE; + regs->orig_ax = ~0UL; + + __this_cpu_write(current_kprobe, &op->kp); + kcb->kprobe_status = KPROBE_HIT_ACTIVE; + opt_pre_handler(&op->kp, regs); + __this_cpu_write(current_kprobe, NULL); + } + local_irq_restore(flags); +} +NOKPROBE_SYMBOL(optimized_callback); + +static int copy_optimized_instructions(u8 *dest, u8 *src) +{ + int len = 0, ret; + + while (len < RELATIVEJUMP_SIZE) { + ret = __copy_instruction(dest + len, src + len); + if (!ret || !can_boost(dest + len)) + return -EINVAL; + len += ret; + } + /* Check whether the address range is reserved */ + if (ftrace_text_reserved(src, src + len - 1) || + alternatives_text_reserved(src, src + len - 1) || + jump_label_text_reserved(src, src + len - 1)) + return -EBUSY; + + return len; +} + +/* Check whether insn is indirect jump */ +static int insn_is_indirect_jump(struct insn *insn) +{ + return ((insn->opcode.bytes[0] == 0xff && + (X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */ + insn->opcode.bytes[0] == 0xea); /* Segment based jump */ +} + +/* Check whether insn jumps into specified address range */ +static int insn_jump_into_range(struct insn *insn, unsigned long start, int len) +{ + unsigned long target = 0; + + switch (insn->opcode.bytes[0]) { + case 0xe0: /* loopne */ + case 0xe1: /* loope */ + case 0xe2: /* loop */ + case 0xe3: /* jcxz */ + case 0xe9: /* near relative jump */ + case 0xeb: /* short relative jump */ + break; + case 0x0f: + if ((insn->opcode.bytes[1] & 
0xf0) == 0x80) /* jcc near */ + break; + return 0; + default: + if ((insn->opcode.bytes[0] & 0xf0) == 0x70) /* jcc short */ + break; + return 0; + } + target = (unsigned long)insn->next_byte + insn->immediate.value; + + return (start <= target && target <= start + len); +} + +/* Decode whole function to ensure any instructions don't jump into target */ +static int can_optimize(unsigned long paddr) +{ + unsigned long addr, size = 0, offset = 0; + struct insn insn; + kprobe_opcode_t buf[MAX_INSN_SIZE]; + + /* Lookup symbol including addr */ + if (!kallsyms_lookup_size_offset(paddr, &size, &offset)) + return 0; + + /* + * Do not optimize in the entry code due to the unstable + * stack handling. + */ + if ((paddr >= (unsigned long)__entry_text_start) && + (paddr < (unsigned long)__entry_text_end)) + return 0; + + /* Check there is enough space for a relative jump. */ + if (size - offset < RELATIVEJUMP_SIZE) + return 0; + + /* Decode instructions */ + addr = paddr - offset; + while (addr < paddr - offset + size) { /* Decode until function end */ + if (search_exception_tables(addr)) + /* + * Since some fixup code will jumps into this function, + * we can't optimize kprobe in this function. + */ + return 0; + kernel_insn_init(&insn, (void *)recover_probed_instruction(buf, addr)); + insn_get_length(&insn); + /* Another subsystem puts a breakpoint */ + if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) + return 0; + /* Recover address */ + insn.kaddr = (void *)addr; + insn.next_byte = (void *)(addr + insn.length); + /* Check any instructions don't jump into target */ + if (insn_is_indirect_jump(&insn) || + insn_jump_into_range(&insn, paddr + INT3_SIZE, + RELATIVE_ADDR_SIZE)) + return 0; + addr += insn.length; + } + + return 1; +} + +/* Check optimized_kprobe can actually be optimized. */ +int arch_check_optimized_kprobe(struct optimized_kprobe *op) +{ + int i; + struct kprobe *p; + + for (i = 1; i < op->optinsn.size; i++) { + p = get_kprobe(op->kp.addr + i); + if (p && !kprobe_disabled(p)) + return -EEXIST; + } + + return 0; +} + +/* Check the addr is within the optimized instructions. */ +int arch_within_optimized_kprobe(struct optimized_kprobe *op, + unsigned long addr) +{ + return ((unsigned long)op->kp.addr <= addr && + (unsigned long)op->kp.addr + op->optinsn.size > addr); +} + +/* Free optimized instruction slot */ +static +void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty) +{ + if (op->optinsn.insn) { + free_optinsn_slot(op->optinsn.insn, dirty); + op->optinsn.insn = NULL; + op->optinsn.size = 0; + } +} + +void arch_remove_optimized_kprobe(struct optimized_kprobe *op) +{ + __arch_remove_optimized_kprobe(op, 1); +} + +/* + * Copy replacing target instructions + * Target instructions MUST be relocatable (checked inside) + * This is called when new aggr(opt)probe is allocated or reused. + */ +int arch_prepare_optimized_kprobe(struct optimized_kprobe *op) +{ + u8 *buf; + int ret; + long rel; + + if (!can_optimize((unsigned long)op->kp.addr)) + return -EILSEQ; + + op->optinsn.insn = get_optinsn_slot(); + if (!op->optinsn.insn) + return -ENOMEM; + + /* + * Verify if the address gap is in 2GB range, because this uses + * a relative jump. 
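For reference, the detour buffer assembled by arch_prepare_optimized_kprobe() below has a fixed shape; an editorial diagram in terms of the TMPL_*_IDX offsets defined earlier (annotation, not kernel source):

/*
 * op->optinsn.insn (the out-of-line detour buffer):
 *
 *  +0                template head: push regs, set up args (memcpy below)
 *  +TMPL_MOVE_IDX    mov $op, %rdi / %eax        <- synthesize_set_arg1()
 *  +TMPL_CALL_IDX    call optimized_callback     <- synthesize_relcall()
 *  +TMPL_END_IDX     copied original instructions
 *                                     <- copy_optimized_instructions()
 *  +TMPL_END_IDX + optinsn.size
 *                    jmp kp.addr + optinsn.size  <- synthesize_reljump()
 */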
+ */ + rel = (long)op->optinsn.insn - (long)op->kp.addr + RELATIVEJUMP_SIZE; + if (abs(rel) > 0x7fffffff) + return -ERANGE; + + buf = (u8 *)op->optinsn.insn; + + /* Copy instructions into the out-of-line buffer */ + ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr); + if (ret < 0) { + __arch_remove_optimized_kprobe(op, 0); + return ret; + } + op->optinsn.size = ret; + + /* Copy arch-dep-instance from template */ + memcpy(buf, &optprobe_template_entry, TMPL_END_IDX); + + /* Set probe information */ + synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op); + + /* Set probe function call */ + synthesize_relcall(buf + TMPL_CALL_IDX, optimized_callback); + + /* Set returning jmp instruction at the tail of out-of-line buffer */ + synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size, + (u8 *)op->kp.addr + op->optinsn.size); + + flush_icache_range((unsigned long) buf, + (unsigned long) buf + TMPL_END_IDX + + op->optinsn.size + RELATIVEJUMP_SIZE); + return 0; +} + +/* + * Replace breakpoints (int3) with relative jumps. + * Caller must call with locking kprobe_mutex and text_mutex. + */ +void arch_optimize_kprobes(struct list_head *oplist) +{ + struct optimized_kprobe *op, *tmp; + u8 insn_buf[RELATIVEJUMP_SIZE]; + + list_for_each_entry_safe(op, tmp, oplist, list) { + s32 rel = (s32)((long)op->optinsn.insn - + ((long)op->kp.addr + RELATIVEJUMP_SIZE)); + + WARN_ON(kprobe_disabled(&op->kp)); + + /* Backup instructions which will be replaced by jump address */ + memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE, + RELATIVE_ADDR_SIZE); + + insn_buf[0] = RELATIVEJUMP_OPCODE; + *(s32 *)(&insn_buf[1]) = rel; + + text_poke_bp(op->kp.addr, insn_buf, RELATIVEJUMP_SIZE, + op->optinsn.insn); + + list_del_init(&op->list); + } +} + +/* Replace a relative jump with a breakpoint (int3). */ +void arch_unoptimize_kprobe(struct optimized_kprobe *op) +{ + u8 insn_buf[RELATIVEJUMP_SIZE]; + + /* Set int3 to first byte for kprobes */ + insn_buf[0] = BREAKPOINT_INSTRUCTION; + memcpy(insn_buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); + text_poke_bp(op->kp.addr, insn_buf, RELATIVEJUMP_SIZE, + op->optinsn.insn); +} + +/* + * Recover original instructions and breakpoints from relative jumps. + * Caller must call with locking kprobe_mutex. + */ +extern void arch_unoptimize_kprobes(struct list_head *oplist, + struct list_head *done_list) +{ + struct optimized_kprobe *op, *tmp; + + list_for_each_entry_safe(op, tmp, oplist, list) { + arch_unoptimize_kprobe(op); + list_move(&op->list, done_list); + } +} + +int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter) +{ + struct optimized_kprobe *op; + + if (p->flags & KPROBE_FLAG_OPTIMIZED) { + /* This kprobe is really able to run optimized path. */ + op = container_of(p, struct optimized_kprobe, kp); + /* Detour through copied instructions */ + regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX; + if (!reenter) + reset_current_kprobe(); + preempt_enable_no_resched(); + return 1; + } + return 0; +} +NOKPROBE_SYMBOL(setup_detour_execution); diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c new file mode 100644 index 00000000000..c2bedaea11f --- /dev/null +++ b/arch/x86/kernel/ksysfs.c @@ -0,0 +1,340 @@ +/* + * Architecture specific sysfs attributes in /sys/kernel + * + * Copyright (C) 2007, Intel Corp. + * Huang Ying <ying.huang@intel.com> + * Copyright (C) 2013, 2013 Red Hat, Inc. 
+ * Dave Young <dyoung@redhat.com> + * + * This file is released under the GPLv2 + */ + +#include <linux/kobject.h> +#include <linux/string.h> +#include <linux/sysfs.h> +#include <linux/init.h> +#include <linux/stat.h> +#include <linux/slab.h> +#include <linux/mm.h> + +#include <asm/io.h> +#include <asm/setup.h> + +static ssize_t version_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "0x%04x\n", boot_params.hdr.version); +} + +static struct kobj_attribute boot_params_version_attr = __ATTR_RO(version); + +static ssize_t boot_params_data_read(struct file *fp, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, loff_t off, size_t count) +{ + memcpy(buf, (void *)&boot_params + off, count); + return count; +} + +static struct bin_attribute boot_params_data_attr = { + .attr = { + .name = "data", + .mode = S_IRUGO, + }, + .read = boot_params_data_read, + .size = sizeof(boot_params), +}; + +static struct attribute *boot_params_version_attrs[] = { + &boot_params_version_attr.attr, + NULL, +}; + +static struct bin_attribute *boot_params_data_attrs[] = { + &boot_params_data_attr, + NULL, +}; + +static struct attribute_group boot_params_attr_group = { + .attrs = boot_params_version_attrs, + .bin_attrs = boot_params_data_attrs, +}; + +static int kobj_to_setup_data_nr(struct kobject *kobj, int *nr) +{ + const char *name; + + name = kobject_name(kobj); + return kstrtoint(name, 10, nr); +} + +static int get_setup_data_paddr(int nr, u64 *paddr) +{ + int i = 0; + struct setup_data *data; + u64 pa_data = boot_params.hdr.setup_data; + + while (pa_data) { + if (nr == i) { + *paddr = pa_data; + return 0; + } + data = ioremap_cache(pa_data, sizeof(*data)); + if (!data) + return -ENOMEM; + + pa_data = data->next; + iounmap(data); + i++; + } + return -EINVAL; +} + +static int __init get_setup_data_size(int nr, size_t *size) +{ + int i = 0; + struct setup_data *data; + u64 pa_data = boot_params.hdr.setup_data; + + while (pa_data) { + data = ioremap_cache(pa_data, sizeof(*data)); + if (!data) + return -ENOMEM; + if (nr == i) { + *size = data->len; + iounmap(data); + return 0; + } + + pa_data = data->next; + iounmap(data); + i++; + } + return -EINVAL; +} + +static ssize_t type_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int nr, ret; + u64 paddr; + struct setup_data *data; + + ret = kobj_to_setup_data_nr(kobj, &nr); + if (ret) + return ret; + + ret = get_setup_data_paddr(nr, &paddr); + if (ret) + return ret; + data = ioremap_cache(paddr, sizeof(*data)); + if (!data) + return -ENOMEM; + + ret = sprintf(buf, "0x%x\n", data->type); + iounmap(data); + return ret; +} + +static ssize_t setup_data_data_read(struct file *fp, + struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, + loff_t off, size_t count) +{ + int nr, ret = 0; + u64 paddr; + struct setup_data *data; + void *p; + + ret = kobj_to_setup_data_nr(kobj, &nr); + if (ret) + return ret; + + ret = get_setup_data_paddr(nr, &paddr); + if (ret) + return ret; + data = ioremap_cache(paddr, sizeof(*data)); + if (!data) + return -ENOMEM; + + if (off > data->len) { + ret = -EINVAL; + goto out; + } + + if (count > data->len - off) + count = data->len - off; + + if (!count) + goto out; + + ret = count; + p = ioremap_cache(paddr + sizeof(*data), data->len); + if (!p) { + ret = -ENOMEM; + goto out; + } + memcpy(buf, p + off, count); + iounmap(p); +out: + iounmap(data); + return ret; +} + +static struct kobj_attribute type_attr = __ATTR_RO(type); + +static struct 
bin_attribute data_attr = { + .attr = { + .name = "data", + .mode = S_IRUGO, + }, + .read = setup_data_data_read, +}; + +static struct attribute *setup_data_type_attrs[] = { + &type_attr.attr, + NULL, +}; + +static struct bin_attribute *setup_data_data_attrs[] = { + &data_attr, + NULL, +}; + +static struct attribute_group setup_data_attr_group = { + .attrs = setup_data_type_attrs, + .bin_attrs = setup_data_data_attrs, +}; + +static int __init create_setup_data_node(struct kobject *parent, + struct kobject **kobjp, int nr) +{ + int ret = 0; + size_t size; + struct kobject *kobj; + char name[16]; /* should be enough for setup_data nodes numbers */ + snprintf(name, 16, "%d", nr); + + kobj = kobject_create_and_add(name, parent); + if (!kobj) + return -ENOMEM; + + ret = get_setup_data_size(nr, &size); + if (ret) + goto out_kobj; + + data_attr.size = size; + ret = sysfs_create_group(kobj, &setup_data_attr_group); + if (ret) + goto out_kobj; + *kobjp = kobj; + + return 0; +out_kobj: + kobject_put(kobj); + return ret; +} + +static void __init cleanup_setup_data_node(struct kobject *kobj) +{ + sysfs_remove_group(kobj, &setup_data_attr_group); + kobject_put(kobj); +} + +static int __init get_setup_data_total_num(u64 pa_data, int *nr) +{ + int ret = 0; + struct setup_data *data; + + *nr = 0; + while (pa_data) { + *nr += 1; + data = ioremap_cache(pa_data, sizeof(*data)); + if (!data) { + ret = -ENOMEM; + goto out; + } + pa_data = data->next; + iounmap(data); + } + +out: + return ret; +} + +static int __init create_setup_data_nodes(struct kobject *parent) +{ + struct kobject *setup_data_kobj, **kobjp; + u64 pa_data; + int i, j, nr, ret = 0; + + pa_data = boot_params.hdr.setup_data; + if (!pa_data) + return 0; + + setup_data_kobj = kobject_create_and_add("setup_data", parent); + if (!setup_data_kobj) { + ret = -ENOMEM; + goto out; + } + + ret = get_setup_data_total_num(pa_data, &nr); + if (ret) + goto out_setup_data_kobj; + + kobjp = kmalloc(sizeof(*kobjp) * nr, GFP_KERNEL); + if (!kobjp) { + ret = -ENOMEM; + goto out_setup_data_kobj; + } + + for (i = 0; i < nr; i++) { + ret = create_setup_data_node(setup_data_kobj, kobjp + i, i); + if (ret) + goto out_clean_nodes; + } + + kfree(kobjp); + return 0; + +out_clean_nodes: + for (j = i - 1; j > 0; j--) + cleanup_setup_data_node(*(kobjp + j)); + kfree(kobjp); +out_setup_data_kobj: + kobject_put(setup_data_kobj); +out: + return ret; +} + +static int __init boot_params_ksysfs_init(void) +{ + int ret; + struct kobject *boot_params_kobj; + + boot_params_kobj = kobject_create_and_add("boot_params", + kernel_kobj); + if (!boot_params_kobj) { + ret = -ENOMEM; + goto out; + } + + ret = sysfs_create_group(boot_params_kobj, &boot_params_attr_group); + if (ret) + goto out_boot_params_kobj; + + ret = create_setup_data_nodes(boot_params_kobj); + if (ret) + goto out_create_group; + + return 0; +out_create_group: + sysfs_remove_group(boot_params_kobj, &boot_params_attr_group); +out_boot_params_kobj: + kobject_put(boot_params_kobj); +out: + return ret; +} + +arch_initcall(boot_params_ksysfs_init); diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c new file mode 100644 index 00000000000..3dd8e2c4d74 --- /dev/null +++ b/arch/x86/kernel/kvm.c @@ -0,0 +1,829 @@ +/* + * KVM paravirt_ops implementation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * Copyright (C) 2007, Red Hat, Inc., Ingo Molnar <mingo@redhat.com> + * Copyright IBM Corporation, 2007 + * Authors: Anthony Liguori <aliguori@us.ibm.com> + */ + +#include <linux/context_tracking.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/kvm_para.h> +#include <linux/cpu.h> +#include <linux/mm.h> +#include <linux/highmem.h> +#include <linux/hardirq.h> +#include <linux/notifier.h> +#include <linux/reboot.h> +#include <linux/hash.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/kprobes.h> +#include <linux/debugfs.h> +#include <asm/timer.h> +#include <asm/cpu.h> +#include <asm/traps.h> +#include <asm/desc.h> +#include <asm/tlbflush.h> +#include <asm/idle.h> +#include <asm/apic.h> +#include <asm/apicdef.h> +#include <asm/hypervisor.h> +#include <asm/kvm_guest.h> + +static int kvmapf = 1; + +static int parse_no_kvmapf(char *arg) +{ + kvmapf = 0; + return 0; +} + +early_param("no-kvmapf", parse_no_kvmapf); + +static int steal_acc = 1; +static int parse_no_stealacc(char *arg) +{ + steal_acc = 0; + return 0; +} + +early_param("no-steal-acc", parse_no_stealacc); + +static int kvmclock_vsyscall = 1; +static int parse_no_kvmclock_vsyscall(char *arg) +{ + kvmclock_vsyscall = 0; + return 0; +} + +early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall); + +static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); +static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64); +static int has_steal_clock = 0; + +/* + * No need for any "IO delay" on KVM + */ +static void kvm_io_delay(void) +{ +} + +#define KVM_TASK_SLEEP_HASHBITS 8 +#define KVM_TASK_SLEEP_HASHSIZE (1<<KVM_TASK_SLEEP_HASHBITS) + +struct kvm_task_sleep_node { + struct hlist_node link; + wait_queue_head_t wq; + u32 token; + int cpu; + bool halted; +}; + +static struct kvm_task_sleep_head { + spinlock_t lock; + struct hlist_head list; +} async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE]; + +static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b, + u32 token) +{ + struct hlist_node *p; + + hlist_for_each(p, &b->list) { + struct kvm_task_sleep_node *n = + hlist_entry(p, typeof(*n), link); + if (n->token == token) + return n; + } + + return NULL; +} + +void kvm_async_pf_task_wait(u32 token) +{ + u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS); + struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; + struct kvm_task_sleep_node n, *e; + DEFINE_WAIT(wait); + + rcu_irq_enter(); + + spin_lock(&b->lock); + e = _find_apf_task(b, token); + if (e) { + /* dummy entry exist -> wake up was delivered ahead of PF */ + hlist_del(&e->link); + kfree(e); + spin_unlock(&b->lock); + + rcu_irq_exit(); + return; + } + + n.token = token; + n.cpu = smp_processor_id(); + n.halted = is_idle_task(current) || preempt_count() > 1; + init_waitqueue_head(&n.wq); + hlist_add_head(&n.link, &b->list); + spin_unlock(&b->lock); + + for (;;) { + if (!n.halted) + prepare_to_wait(&n.wq, &wait, TASK_UNINTERRUPTIBLE); + if (hlist_unhashed(&n.link)) + break; + + if (!n.halted) { + local_irq_enable(); + 
schedule(); + local_irq_disable(); + } else { + /* + * We cannot reschedule. So halt. + */ + rcu_irq_exit(); + native_safe_halt(); + rcu_irq_enter(); + local_irq_disable(); + } + } + if (!n.halted) + finish_wait(&n.wq, &wait); + + rcu_irq_exit(); + return; +} +EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait); + +static void apf_task_wake_one(struct kvm_task_sleep_node *n) +{ + hlist_del_init(&n->link); + if (n->halted) + smp_send_reschedule(n->cpu); + else if (waitqueue_active(&n->wq)) + wake_up(&n->wq); +} + +static void apf_task_wake_all(void) +{ + int i; + + for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) { + struct hlist_node *p, *next; + struct kvm_task_sleep_head *b = &async_pf_sleepers[i]; + spin_lock(&b->lock); + hlist_for_each_safe(p, next, &b->list) { + struct kvm_task_sleep_node *n = + hlist_entry(p, typeof(*n), link); + if (n->cpu == smp_processor_id()) + apf_task_wake_one(n); + } + spin_unlock(&b->lock); + } +} + +void kvm_async_pf_task_wake(u32 token) +{ + u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS); + struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; + struct kvm_task_sleep_node *n; + + if (token == ~0) { + apf_task_wake_all(); + return; + } + +again: + spin_lock(&b->lock); + n = _find_apf_task(b, token); + if (!n) { + /* + * async PF was not yet handled. + * Add dummy entry for the token. + */ + n = kzalloc(sizeof(*n), GFP_ATOMIC); + if (!n) { + /* + * Allocation failed! Busy wait while other cpu + * handles async PF. + */ + spin_unlock(&b->lock); + cpu_relax(); + goto again; + } + n->token = token; + n->cpu = smp_processor_id(); + init_waitqueue_head(&n->wq); + hlist_add_head(&n->link, &b->list); + } else + apf_task_wake_one(n); + spin_unlock(&b->lock); + return; +} +EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake); + +u32 kvm_read_and_reset_pf_reason(void) +{ + u32 reason = 0; + + if (__get_cpu_var(apf_reason).enabled) { + reason = __get_cpu_var(apf_reason).reason; + __get_cpu_var(apf_reason).reason = 0; + } + + return reason; +} +EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason); +NOKPROBE_SYMBOL(kvm_read_and_reset_pf_reason); + +dotraplinkage void +do_async_page_fault(struct pt_regs *regs, unsigned long error_code) +{ + enum ctx_state prev_state; + + switch (kvm_read_and_reset_pf_reason()) { + default: + trace_do_page_fault(regs, error_code); + break; + case KVM_PV_REASON_PAGE_NOT_PRESENT: + /* page is swapped out by the host. 
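Both the wait and wake paths above derive the bucket purely from the token, which is what lets whichever side runs first leave state the other side can find. hash_32() is the kernel's multiplicative hash; a self-contained sketch of the bucket selection, assuming the GOLDEN_RATIO_PRIME_32 constant from include/linux/hash.h of this era:

    #include <stdint.h>
    #include <stdio.h>

    #define GOLDEN_RATIO_PRIME_32	0x9e370001UL
    #define KVM_TASK_SLEEP_HASHBITS	8

    static uint32_t hash_32(uint32_t val, unsigned int bits)
    {
        /* multiply, then keep the high 'bits' bits of the product */
        return ((uint32_t)(val * GOLDEN_RATIO_PRIME_32)) >> (32 - bits);
    }

    int main(void)
    {
        uint32_t token = 0x12345;	/* illustrative token value */

        printf("token %#x -> bucket %u of %u\n", token,
               hash_32(token, KVM_TASK_SLEEP_HASHBITS),
               1u << KVM_TASK_SLEEP_HASHBITS);
        return 0;
    }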
*/ + prev_state = exception_enter(); + exit_idle(); + kvm_async_pf_task_wait((u32)read_cr2()); + exception_exit(prev_state); + break; + case KVM_PV_REASON_PAGE_READY: + rcu_irq_enter(); + exit_idle(); + kvm_async_pf_task_wake((u32)read_cr2()); + rcu_irq_exit(); + break; + } +} +NOKPROBE_SYMBOL(do_async_page_fault); + +static void __init paravirt_ops_setup(void) +{ + pv_info.name = "KVM"; + pv_info.paravirt_enabled = 1; + + if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY)) + pv_cpu_ops.io_delay = kvm_io_delay; + +#ifdef CONFIG_X86_IO_APIC + no_timer_check = 1; +#endif +} + +static void kvm_register_steal_time(void) +{ + int cpu = smp_processor_id(); + struct kvm_steal_time *st = &per_cpu(steal_time, cpu); + + if (!has_steal_clock) + return; + + memset(st, 0, sizeof(*st)); + + wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED)); + pr_info("kvm-stealtime: cpu %d, msr %llx\n", + cpu, (unsigned long long) slow_virt_to_phys(st)); +} + +static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED; + +static void kvm_guest_apic_eoi_write(u32 reg, u32 val) +{ + /** + * This relies on __test_and_clear_bit to modify the memory + * in a way that is atomic with respect to the local CPU. + * The hypervisor only accesses this memory from the local CPU so + * there's no need for lock or memory barriers. + * An optimization barrier is implied in apic write. + */ + if (__test_and_clear_bit(KVM_PV_EOI_BIT, &__get_cpu_var(kvm_apic_eoi))) + return; + apic_write(APIC_EOI, APIC_EOI_ACK); +} + +void kvm_guest_cpu_init(void) +{ + if (!kvm_para_available()) + return; + + if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) { + u64 pa = slow_virt_to_phys(&__get_cpu_var(apf_reason)); + +#ifdef CONFIG_PREEMPT + pa |= KVM_ASYNC_PF_SEND_ALWAYS; +#endif + wrmsrl(MSR_KVM_ASYNC_PF_EN, pa | KVM_ASYNC_PF_ENABLED); + __get_cpu_var(apf_reason).enabled = 1; + printk(KERN_INFO"KVM setup async PF for cpu %d\n", + smp_processor_id()); + } + + if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) { + unsigned long pa; + /* Size alignment is implied but just to make it explicit. */ + BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4); + __get_cpu_var(kvm_apic_eoi) = 0; + pa = slow_virt_to_phys(&__get_cpu_var(kvm_apic_eoi)) + | KVM_MSR_ENABLED; + wrmsrl(MSR_KVM_PV_EOI_EN, pa); + } + + if (has_steal_clock) + kvm_register_steal_time(); +} + +static void kvm_pv_disable_apf(void) +{ + if (!__get_cpu_var(apf_reason).enabled) + return; + + wrmsrl(MSR_KVM_ASYNC_PF_EN, 0); + __get_cpu_var(apf_reason).enabled = 0; + + printk(KERN_INFO"Unregister pv shared memory for cpu %d\n", + smp_processor_id()); +} + +static void kvm_pv_guest_cpu_reboot(void *unused) +{ + /* + * We disable PV EOI before we load a new kernel by kexec, + * since MSR_KVM_PV_EOI_EN stores a pointer into old kernel's memory. + * New kernel can re-enable when it boots. 
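The registration idiom recurring throughout this file is worth calling out: the guest publishes the physical address of a per-cpu structure through a paravirt MSR, with the low KVM_MSR_ENABLED bit acting as the valid flag, and detaches by writing 0 before that memory can be reused, as on this reboot path. The steal-time case above reduces to a sketch like this (same helpers as the surrounding code; not a literal excerpt):

    /* attach: tell the host where this cpu's buffer lives */
    wrmsrl(MSR_KVM_STEAL_TIME,
           slow_virt_to_phys(&per_cpu(steal_time, cpu)) | KVM_MSR_ENABLED);

    /* detach: clearing the enable bit makes the host stop writing to it */
    wrmsrl(MSR_KVM_STEAL_TIME, 0);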
+ */ + if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) + wrmsrl(MSR_KVM_PV_EOI_EN, 0); + kvm_pv_disable_apf(); + kvm_disable_steal_time(); +} + +static int kvm_pv_reboot_notify(struct notifier_block *nb, + unsigned long code, void *unused) +{ + if (code == SYS_RESTART) + on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1); + return NOTIFY_DONE; +} + +static struct notifier_block kvm_pv_reboot_nb = { + .notifier_call = kvm_pv_reboot_notify, +}; + +static u64 kvm_steal_clock(int cpu) +{ + u64 steal; + struct kvm_steal_time *src; + int version; + + src = &per_cpu(steal_time, cpu); + do { + version = src->version; + rmb(); + steal = src->steal; + rmb(); + } while ((version & 1) || (version != src->version)); + + return steal; +} + +void kvm_disable_steal_time(void) +{ + if (!has_steal_clock) + return; + + wrmsr(MSR_KVM_STEAL_TIME, 0, 0); +} + +#ifdef CONFIG_SMP +static void __init kvm_smp_prepare_boot_cpu(void) +{ + kvm_guest_cpu_init(); + native_smp_prepare_boot_cpu(); + kvm_spinlock_init(); +} + +static void kvm_guest_cpu_online(void *dummy) +{ + kvm_guest_cpu_init(); +} + +static void kvm_guest_cpu_offline(void *dummy) +{ + kvm_disable_steal_time(); + if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) + wrmsrl(MSR_KVM_PV_EOI_EN, 0); + kvm_pv_disable_apf(); + apf_task_wake_all(); +} + +static int kvm_cpu_notify(struct notifier_block *self, unsigned long action, + void *hcpu) +{ + int cpu = (unsigned long)hcpu; + switch (action) { + case CPU_ONLINE: + case CPU_DOWN_FAILED: + case CPU_ONLINE_FROZEN: + smp_call_function_single(cpu, kvm_guest_cpu_online, NULL, 0); + break; + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + smp_call_function_single(cpu, kvm_guest_cpu_offline, NULL, 1); + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block kvm_cpu_notifier = { + .notifier_call = kvm_cpu_notify, +}; +#endif + +static void __init kvm_apf_trap_init(void) +{ + set_intr_gate(14, async_page_fault); +} + +void __init kvm_guest_init(void) +{ + int i; + + if (!kvm_para_available()) + return; + + paravirt_ops_setup(); + register_reboot_notifier(&kvm_pv_reboot_nb); + for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) + spin_lock_init(&async_pf_sleepers[i].lock); + if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF)) + x86_init.irqs.trap_init = kvm_apf_trap_init; + + if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) { + has_steal_clock = 1; + pv_time_ops.steal_clock = kvm_steal_clock; + } + + if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) + apic_set_eoi_write(kvm_guest_apic_eoi_write); + + if (kvmclock_vsyscall) + kvm_setup_vsyscall_timeinfo(); + +#ifdef CONFIG_SMP + smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; + register_cpu_notifier(&kvm_cpu_notifier); +#else + kvm_guest_cpu_init(); +#endif +} + +static noinline uint32_t __kvm_cpuid_base(void) +{ + if (boot_cpu_data.cpuid_level < 0) + return 0; /* So we don't blow up on old processors */ + + if (cpu_has_hypervisor) + return hypervisor_cpuid_base("KVMKVMKVM\0\0\0", 0); + + return 0; +} + +static inline uint32_t kvm_cpuid_base(void) +{ + static int kvm_cpuid_base = -1; + + if (kvm_cpuid_base == -1) + kvm_cpuid_base = __kvm_cpuid_base(); + + return kvm_cpuid_base; +} + +bool kvm_para_available(void) +{ + return kvm_cpuid_base() != 0; +} +EXPORT_SYMBOL_GPL(kvm_para_available); + +unsigned int kvm_arch_para_features(void) +{ + return cpuid_eax(kvm_cpuid_base() | KVM_CPUID_FEATURES); +} + +static uint32_t __init kvm_detect(void) +{ + return kvm_cpuid_base(); +} + +const struct hypervisor_x86 x86_hyper_kvm __refconst = { + .name = 
"KVM", + .detect = kvm_detect, + .x2apic_available = kvm_para_available, +}; +EXPORT_SYMBOL_GPL(x86_hyper_kvm); + +static __init int activate_jump_labels(void) +{ + if (has_steal_clock) { + static_key_slow_inc(¶virt_steal_enabled); + if (steal_acc) + static_key_slow_inc(¶virt_steal_rq_enabled); + } + + return 0; +} +arch_initcall(activate_jump_labels); + +#ifdef CONFIG_PARAVIRT_SPINLOCKS + +/* Kick a cpu by its apicid. Used to wake up a halted vcpu */ +static void kvm_kick_cpu(int cpu) +{ + int apicid; + unsigned long flags = 0; + + apicid = per_cpu(x86_cpu_to_apicid, cpu); + kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid); +} + +enum kvm_contention_stat { + TAKEN_SLOW, + TAKEN_SLOW_PICKUP, + RELEASED_SLOW, + RELEASED_SLOW_KICKED, + NR_CONTENTION_STATS +}; + +#ifdef CONFIG_KVM_DEBUG_FS +#define HISTO_BUCKETS 30 + +static struct kvm_spinlock_stats +{ + u32 contention_stats[NR_CONTENTION_STATS]; + u32 histo_spin_blocked[HISTO_BUCKETS+1]; + u64 time_blocked; +} spinlock_stats; + +static u8 zero_stats; + +static inline void check_zero(void) +{ + u8 ret; + u8 old; + + old = ACCESS_ONCE(zero_stats); + if (unlikely(old)) { + ret = cmpxchg(&zero_stats, old, 0); + /* This ensures only one fellow resets the stat */ + if (ret == old) + memset(&spinlock_stats, 0, sizeof(spinlock_stats)); + } +} + +static inline void add_stats(enum kvm_contention_stat var, u32 val) +{ + check_zero(); + spinlock_stats.contention_stats[var] += val; +} + + +static inline u64 spin_time_start(void) +{ + return sched_clock(); +} + +static void __spin_time_accum(u64 delta, u32 *array) +{ + unsigned index; + + index = ilog2(delta); + check_zero(); + + if (index < HISTO_BUCKETS) + array[index]++; + else + array[HISTO_BUCKETS]++; +} + +static inline void spin_time_accum_blocked(u64 start) +{ + u32 delta; + + delta = sched_clock() - start; + __spin_time_accum(delta, spinlock_stats.histo_spin_blocked); + spinlock_stats.time_blocked += delta; +} + +static struct dentry *d_spin_debug; +static struct dentry *d_kvm_debug; + +struct dentry *kvm_init_debugfs(void) +{ + d_kvm_debug = debugfs_create_dir("kvm-guest", NULL); + if (!d_kvm_debug) + printk(KERN_WARNING "Could not create 'kvm' debugfs directory\n"); + + return d_kvm_debug; +} + +static int __init kvm_spinlock_debugfs(void) +{ + struct dentry *d_kvm; + + d_kvm = kvm_init_debugfs(); + if (d_kvm == NULL) + return -ENOMEM; + + d_spin_debug = debugfs_create_dir("spinlocks", d_kvm); + + debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats); + + debugfs_create_u32("taken_slow", 0444, d_spin_debug, + &spinlock_stats.contention_stats[TAKEN_SLOW]); + debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug, + &spinlock_stats.contention_stats[TAKEN_SLOW_PICKUP]); + + debugfs_create_u32("released_slow", 0444, d_spin_debug, + &spinlock_stats.contention_stats[RELEASED_SLOW]); + debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug, + &spinlock_stats.contention_stats[RELEASED_SLOW_KICKED]); + + debugfs_create_u64("time_blocked", 0444, d_spin_debug, + &spinlock_stats.time_blocked); + + debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug, + spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1); + + return 0; +} +fs_initcall(kvm_spinlock_debugfs); +#else /* !CONFIG_KVM_DEBUG_FS */ +static inline void add_stats(enum kvm_contention_stat var, u32 val) +{ +} + +static inline u64 spin_time_start(void) +{ + return 0; +} + +static inline void spin_time_accum_blocked(u64 start) +{ +} +#endif /* CONFIG_KVM_DEBUG_FS */ + +struct kvm_lock_waiting { + struct arch_spinlock *lock; 
+ __ticket_t want; +}; + +/* cpus 'waiting' on a spinlock to become available */ +static cpumask_t waiting_cpus; + +/* Track spinlock on which a cpu is waiting */ +static DEFINE_PER_CPU(struct kvm_lock_waiting, klock_waiting); + +__visible void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want) +{ + struct kvm_lock_waiting *w; + int cpu; + u64 start; + unsigned long flags; + + if (in_nmi()) + return; + + w = &__get_cpu_var(klock_waiting); + cpu = smp_processor_id(); + start = spin_time_start(); + + /* + * Make sure an interrupt handler can't upset things in a + * partially setup state. + */ + local_irq_save(flags); + + /* + * The ordering protocol on this is that the "lock" pointer + * may only be set non-NULL if the "want" ticket is correct. + * If we're updating "want", we must first clear "lock". + */ + w->lock = NULL; + smp_wmb(); + w->want = want; + smp_wmb(); + w->lock = lock; + + add_stats(TAKEN_SLOW, 1); + + /* + * This uses set_bit, which is atomic but we should not rely on its + * reordering gurantees. So barrier is needed after this call. + */ + cpumask_set_cpu(cpu, &waiting_cpus); + + barrier(); + + /* + * Mark entry to slowpath before doing the pickup test to make + * sure we don't deadlock with an unlocker. + */ + __ticket_enter_slowpath(lock); + + /* + * check again make sure it didn't become free while + * we weren't looking. + */ + if (ACCESS_ONCE(lock->tickets.head) == want) { + add_stats(TAKEN_SLOW_PICKUP, 1); + goto out; + } + + /* + * halt until it's our turn and kicked. Note that we do safe halt + * for irq enabled case to avoid hang when lock info is overwritten + * in irq spinlock slowpath and no spurious interrupt occur to save us. + */ + if (arch_irqs_disabled_flags(flags)) + halt(); + else + safe_halt(); + +out: + cpumask_clear_cpu(cpu, &waiting_cpus); + w->lock = NULL; + local_irq_restore(flags); + spin_time_accum_blocked(start); +} +PV_CALLEE_SAVE_REGS_THUNK(kvm_lock_spinning); + +/* Kick vcpu waiting on @lock->head to reach value @ticket */ +static void kvm_unlock_kick(struct arch_spinlock *lock, __ticket_t ticket) +{ + int cpu; + + add_stats(RELEASED_SLOW, 1); + for_each_cpu(cpu, &waiting_cpus) { + const struct kvm_lock_waiting *w = &per_cpu(klock_waiting, cpu); + if (ACCESS_ONCE(w->lock) == lock && + ACCESS_ONCE(w->want) == ticket) { + add_stats(RELEASED_SLOW_KICKED, 1); + kvm_kick_cpu(cpu); + break; + } + } +} + +/* + * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present. + */ +void __init kvm_spinlock_init(void) +{ + if (!kvm_para_available()) + return; + /* Does host kernel support KVM_FEATURE_PV_UNHALT? */ + if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) + return; + + pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(kvm_lock_spinning); + pv_lock_ops.unlock_kick = kvm_unlock_kick; +} + +static __init int kvm_spinlock_init_jump(void) +{ + if (!kvm_para_available()) + return 0; + if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) + return 0; + + static_key_slow_inc(¶virt_ticketlocks_enabled); + printk(KERN_INFO "KVM setup paravirtual spinlock\n"); + + return 0; +} +early_initcall(kvm_spinlock_init_jump); + +#endif /* CONFIG_PARAVIRT_SPINLOCKS */ diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c new file mode 100644 index 00000000000..d9156ceecdf --- /dev/null +++ b/arch/x86/kernel/kvmclock.c @@ -0,0 +1,308 @@ +/* KVM paravirtual clock driver. A clocksource implementation + Copyright (C) 2008 Glauber de Oliveira Costa, Red Hat Inc. 
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+
+#include <linux/clocksource.h>
+#include <linux/kvm_para.h>
+#include <asm/pvclock.h>
+#include <asm/msr.h>
+#include <asm/apic.h>
+#include <linux/percpu.h>
+#include <linux/hardirq.h>
+#include <linux/memblock.h>
+
+#include <asm/x86_init.h>
+#include <asm/reboot.h>
+
+static int kvmclock = 1;
+static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
+static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK;
+
+static int parse_no_kvmclock(char *arg)
+{
+	kvmclock = 0;
+	return 0;
+}
+early_param("no-kvmclock", parse_no_kvmclock);
+
+/* The hypervisor will periodically put time information here */
+static struct pvclock_vsyscall_time_info *hv_clock;
+static struct pvclock_wall_clock wall_clock;
+
+/*
+ * The wallclock is the time of day when we booted. Time may have
+ * elapsed since the hypervisor wrote the data, so we try to account
+ * for that with system time.
+ */
+static void kvm_get_wallclock(struct timespec *now)
+{
+	struct pvclock_vcpu_time_info *vcpu_time;
+	int low, high;
+	int cpu;
+
+	low = (int)__pa_symbol(&wall_clock);
+	high = ((u64)__pa_symbol(&wall_clock) >> 32);
+
+	native_write_msr(msr_kvm_wall_clock, low, high);
+
+	preempt_disable();
+	cpu = smp_processor_id();
+
+	vcpu_time = &hv_clock[cpu].pvti;
+	pvclock_read_wallclock(&wall_clock, vcpu_time, now);
+
+	preempt_enable();
+}
+
+static int kvm_set_wallclock(const struct timespec *now)
+{
+	return -1;
+}
+
+static cycle_t kvm_clock_read(void)
+{
+	struct pvclock_vcpu_time_info *src;
+	cycle_t ret;
+	int cpu;
+
+	preempt_disable_notrace();
+	cpu = smp_processor_id();
+	src = &hv_clock[cpu].pvti;
+	ret = pvclock_clocksource_read(src);
+	preempt_enable_notrace();
+	return ret;
+}
+
+static cycle_t kvm_clock_get_cycles(struct clocksource *cs)
+{
+	return kvm_clock_read();
+}
+
+/*
+ * If we don't preset lpj, there is the possibility that the guest
+ * will calibrate under heavy load - thus, getting a lower lpj -
+ * and execute the delays themselves without load. This is wrong,
+ * because no delay loop can finish beforehand.
+ * Any heuristic is subject to failure, because ultimately a large
+ * pool of guests can be running and trouble each other.
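kvm_clock_read() above delegates to pvclock_clocksource_read(), which snapshots the per-cpu pvti under the pvclock version protocol: the host makes the version odd while it updates the fields, so the guest retries until it reads the same even version before and after the copy. A simplified, self-contained reader, leaving out the mul/shift TSC scaling the real helper performs:

    #include <stdint.h>

    struct pvti_sample {
        volatile uint32_t version;
        volatile uint64_t system_time;
    };

    /* retry until a stable, even version is observed on both sides */
    static uint64_t pvti_read(const struct pvti_sample *p)
    {
        uint32_t version;
        uint64_t t;

        do {
            version = p->version;
            __sync_synchronize();	/* stands in for rmb() */
            t = p->system_time;
            __sync_synchronize();
        } while ((version & 1) || version != p->version);

        return t;
    }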
So we preset + * lpj here + */ +static unsigned long kvm_get_tsc_khz(void) +{ + struct pvclock_vcpu_time_info *src; + int cpu; + unsigned long tsc_khz; + + preempt_disable(); + cpu = smp_processor_id(); + src = &hv_clock[cpu].pvti; + tsc_khz = pvclock_tsc_khz(src); + preempt_enable(); + return tsc_khz; +} + +static void kvm_get_preset_lpj(void) +{ + unsigned long khz; + u64 lpj; + + khz = kvm_get_tsc_khz(); + + lpj = ((u64)khz * 1000); + do_div(lpj, HZ); + preset_lpj = lpj; +} + +bool kvm_check_and_clear_guest_paused(void) +{ + bool ret = false; + struct pvclock_vcpu_time_info *src; + int cpu = smp_processor_id(); + + if (!hv_clock) + return ret; + + src = &hv_clock[cpu].pvti; + if ((src->flags & PVCLOCK_GUEST_STOPPED) != 0) { + src->flags &= ~PVCLOCK_GUEST_STOPPED; + pvclock_touch_watchdogs(); + ret = true; + } + + return ret; +} + +static struct clocksource kvm_clock = { + .name = "kvm-clock", + .read = kvm_clock_get_cycles, + .rating = 400, + .mask = CLOCKSOURCE_MASK(64), + .flags = CLOCK_SOURCE_IS_CONTINUOUS, +}; + +int kvm_register_clock(char *txt) +{ + int cpu = smp_processor_id(); + int low, high, ret; + struct pvclock_vcpu_time_info *src; + + if (!hv_clock) + return 0; + + src = &hv_clock[cpu].pvti; + low = (int)slow_virt_to_phys(src) | 1; + high = ((u64)slow_virt_to_phys(src) >> 32); + ret = native_write_msr_safe(msr_kvm_system_time, low, high); + printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n", + cpu, high, low, txt); + + return ret; +} + +static void kvm_save_sched_clock_state(void) +{ +} + +static void kvm_restore_sched_clock_state(void) +{ + kvm_register_clock("primary cpu clock, resume"); +} + +#ifdef CONFIG_X86_LOCAL_APIC +static void kvm_setup_secondary_clock(void) +{ + /* + * Now that the first cpu already had this clocksource initialized, + * we shouldn't fail. + */ + WARN_ON(kvm_register_clock("secondary cpu clock")); +} +#endif + +/* + * After the clock is registered, the host will keep writing to the + * registered memory location. If the guest happens to shutdown, this memory + * won't be valid. In cases like kexec, in which you install a new kernel, this + * means a random memory location will be kept being written. 
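The preset-lpj logic above is plain unit conversion: tsc_khz is kilocycles per second, so multiplying by 1000 gives cycles per second and dividing by HZ gives cycles per jiffy, which is what loops_per_jiffy approximates. Worked through with assumed figures:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned long khz = 2400000;	/* assume a 2.4 GHz TSC */
        unsigned long hz  = 1000;	/* assume HZ=1000 */
        uint64_t lpj = (uint64_t)khz * 1000;	/* 2.4e9 cycles per second */

        lpj /= hz;	/* cycles per jiffy */
        printf("preset lpj = %llu\n", (unsigned long long)lpj);
        /* prints 2400000, matching kvm_get_preset_lpj()'s do_div() result */
        return 0;
    }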
So before any + * kind of shutdown from our side, we unregister the clock by writting anything + * that does not have the 'enable' bit set in the msr + */ +#ifdef CONFIG_KEXEC +static void kvm_crash_shutdown(struct pt_regs *regs) +{ + native_write_msr(msr_kvm_system_time, 0, 0); + kvm_disable_steal_time(); + native_machine_crash_shutdown(regs); +} +#endif + +static void kvm_shutdown(void) +{ + native_write_msr(msr_kvm_system_time, 0, 0); + kvm_disable_steal_time(); + native_machine_shutdown(); +} + +void __init kvmclock_init(void) +{ + unsigned long mem; + int size; + + size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS); + + if (!kvm_para_available()) + return; + + if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2)) { + msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW; + msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW; + } else if (!(kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE))) + return; + + printk(KERN_INFO "kvm-clock: Using msrs %x and %x", + msr_kvm_system_time, msr_kvm_wall_clock); + + mem = memblock_alloc(size, PAGE_SIZE); + if (!mem) + return; + hv_clock = __va(mem); + memset(hv_clock, 0, size); + + if (kvm_register_clock("primary cpu clock")) { + hv_clock = NULL; + memblock_free(mem, size); + return; + } + pv_time_ops.sched_clock = kvm_clock_read; + x86_platform.calibrate_tsc = kvm_get_tsc_khz; + x86_platform.get_wallclock = kvm_get_wallclock; + x86_platform.set_wallclock = kvm_set_wallclock; +#ifdef CONFIG_X86_LOCAL_APIC + x86_cpuinit.early_percpu_clock_init = + kvm_setup_secondary_clock; +#endif + x86_platform.save_sched_clock_state = kvm_save_sched_clock_state; + x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state; + machine_ops.shutdown = kvm_shutdown; +#ifdef CONFIG_KEXEC + machine_ops.crash_shutdown = kvm_crash_shutdown; +#endif + kvm_get_preset_lpj(); + clocksource_register_hz(&kvm_clock, NSEC_PER_SEC); + pv_info.paravirt_enabled = 1; + pv_info.name = "KVM"; + + if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) + pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT); +} + +int __init kvm_setup_vsyscall_timeinfo(void) +{ +#ifdef CONFIG_X86_64 + int cpu; + int ret; + u8 flags; + struct pvclock_vcpu_time_info *vcpu_time; + unsigned int size; + + if (!hv_clock) + return 0; + + size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS); + + preempt_disable(); + cpu = smp_processor_id(); + + vcpu_time = &hv_clock[cpu].pvti; + flags = pvclock_read_flags(vcpu_time); + + if (!(flags & PVCLOCK_TSC_STABLE_BIT)) { + preempt_enable(); + return 1; + } + + if ((ret = pvclock_init_vsyscall(hv_clock, size))) { + preempt_enable(); + return ret; + } + + preempt_enable(); + + kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK; +#endif + return 0; +} diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c new file mode 100644 index 00000000000..c37886d759c --- /dev/null +++ b/arch/x86/kernel/ldt.c @@ -0,0 +1,272 @@ +/* + * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds + * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com> + * Copyright (C) 2002 Andi Kleen + * + * This handles calls from both 32bit and 64bit mode. 
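A sense of scale for the code below: an LDT holds at most LDT_ENTRIES descriptors of LDT_ENTRY_SIZE bytes each, so a fully populated table is 64KB. That bound is why read_ldt() clamps bytecount and why alloc_ldt() must fall back from a single page to vmalloc():

    #include <stdio.h>

    #define LDT_ENTRIES	8192	/* values from asm/ldt.h */
    #define LDT_ENTRY_SIZE	8

    int main(void)
    {
        printf("max LDT = %d bytes\n", LDT_ENTRIES * LDT_ENTRY_SIZE);
        /* 65536 bytes; one 4K page covers only the first 512 entries */
        return 0;
    }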
+ */ + +#include <linux/errno.h> +#include <linux/gfp.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/smp.h> +#include <linux/vmalloc.h> +#include <linux/uaccess.h> + +#include <asm/ldt.h> +#include <asm/desc.h> +#include <asm/mmu_context.h> +#include <asm/syscalls.h> + +#ifdef CONFIG_SMP +static void flush_ldt(void *current_mm) +{ + if (current->active_mm == current_mm) + load_LDT(¤t->active_mm->context); +} +#endif + +static int alloc_ldt(mm_context_t *pc, int mincount, int reload) +{ + void *oldldt, *newldt; + int oldsize; + + if (mincount <= pc->size) + return 0; + oldsize = pc->size; + mincount = (mincount + (PAGE_SIZE / LDT_ENTRY_SIZE - 1)) & + (~(PAGE_SIZE / LDT_ENTRY_SIZE - 1)); + if (mincount * LDT_ENTRY_SIZE > PAGE_SIZE) + newldt = vmalloc(mincount * LDT_ENTRY_SIZE); + else + newldt = (void *)__get_free_page(GFP_KERNEL); + + if (!newldt) + return -ENOMEM; + + if (oldsize) + memcpy(newldt, pc->ldt, oldsize * LDT_ENTRY_SIZE); + oldldt = pc->ldt; + memset(newldt + oldsize * LDT_ENTRY_SIZE, 0, + (mincount - oldsize) * LDT_ENTRY_SIZE); + + paravirt_alloc_ldt(newldt, mincount); + +#ifdef CONFIG_X86_64 + /* CHECKME: Do we really need this ? */ + wmb(); +#endif + pc->ldt = newldt; + wmb(); + pc->size = mincount; + wmb(); + + if (reload) { +#ifdef CONFIG_SMP + preempt_disable(); + load_LDT(pc); + if (!cpumask_equal(mm_cpumask(current->mm), + cpumask_of(smp_processor_id()))) + smp_call_function(flush_ldt, current->mm, 1); + preempt_enable(); +#else + load_LDT(pc); +#endif + } + if (oldsize) { + paravirt_free_ldt(oldldt, oldsize); + if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE) + vfree(oldldt); + else + put_page(virt_to_page(oldldt)); + } + return 0; +} + +static inline int copy_ldt(mm_context_t *new, mm_context_t *old) +{ + int err = alloc_ldt(new, old->size, 0); + int i; + + if (err < 0) + return err; + + for (i = 0; i < old->size; i++) + write_ldt_entry(new->ldt, i, old->ldt + i * LDT_ENTRY_SIZE); + return 0; +} + +/* + * we do not have to muck with descriptors here, that is + * done in switch_mm() as needed. + */ +int init_new_context(struct task_struct *tsk, struct mm_struct *mm) +{ + struct mm_struct *old_mm; + int retval = 0; + + mutex_init(&mm->context.lock); + mm->context.size = 0; + old_mm = current->mm; + if (old_mm && old_mm->context.size > 0) { + mutex_lock(&old_mm->context.lock); + retval = copy_ldt(&mm->context, &old_mm->context); + mutex_unlock(&old_mm->context.lock); + } + return retval; +} + +/* + * No need to lock the MM as we are the last user + * + * 64bit: Don't touch the LDT register - we're already in the next thread. + */ +void destroy_context(struct mm_struct *mm) +{ + if (mm->context.size) { +#ifdef CONFIG_X86_32 + /* CHECKME: Can this ever happen ? 
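alloc_ldt() above rounds the requested entry count up to a whole number of pages' worth of entries; with PAGE_SIZE / LDT_ENTRY_SIZE = 512, the mask arithmetic rounds to the next multiple of 512. Checked standalone:

    #include <stdio.h>

    #define PAGE_SIZE	4096
    #define LDT_ENTRY_SIZE	8

    int main(void)
    {
        int per_page = PAGE_SIZE / LDT_ENTRY_SIZE;	/* 512 entries/page */
        int mincount = 513;	/* just past one page of entries */

        /* the same round-up expression used in alloc_ldt() */
        mincount = (mincount + per_page - 1) & ~(per_page - 1);
        printf("%d\n", mincount);	/* 1024 -> allocate two pages */
        return 0;
    }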
*/ + if (mm == current->active_mm) + clear_LDT(); +#endif + paravirt_free_ldt(mm->context.ldt, mm->context.size); + if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE) + vfree(mm->context.ldt); + else + put_page(virt_to_page(mm->context.ldt)); + mm->context.size = 0; + } +} + +static int read_ldt(void __user *ptr, unsigned long bytecount) +{ + int err; + unsigned long size; + struct mm_struct *mm = current->mm; + + if (!mm->context.size) + return 0; + if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES) + bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES; + + mutex_lock(&mm->context.lock); + size = mm->context.size * LDT_ENTRY_SIZE; + if (size > bytecount) + size = bytecount; + + err = 0; + if (copy_to_user(ptr, mm->context.ldt, size)) + err = -EFAULT; + mutex_unlock(&mm->context.lock); + if (err < 0) + goto error_return; + if (size != bytecount) { + /* zero-fill the rest */ + if (clear_user(ptr + size, bytecount - size) != 0) { + err = -EFAULT; + goto error_return; + } + } + return bytecount; +error_return: + return err; +} + +static int read_default_ldt(void __user *ptr, unsigned long bytecount) +{ + /* CHECKME: Can we use _one_ random number ? */ +#ifdef CONFIG_X86_32 + unsigned long size = 5 * sizeof(struct desc_struct); +#else + unsigned long size = 128; +#endif + if (bytecount > size) + bytecount = size; + if (clear_user(ptr, bytecount)) + return -EFAULT; + return bytecount; +} + +static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) +{ + struct mm_struct *mm = current->mm; + struct desc_struct ldt; + int error; + struct user_desc ldt_info; + + error = -EINVAL; + if (bytecount != sizeof(ldt_info)) + goto out; + error = -EFAULT; + if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info))) + goto out; + + error = -EINVAL; + if (ldt_info.entry_number >= LDT_ENTRIES) + goto out; + if (ldt_info.contents == 3) { + if (oldmode) + goto out; + if (ldt_info.seg_not_present == 0) + goto out; + } + + mutex_lock(&mm->context.lock); + if (ldt_info.entry_number >= mm->context.size) { + error = alloc_ldt(¤t->mm->context, + ldt_info.entry_number + 1, 1); + if (error < 0) + goto out_unlock; + } + + /* Allow LDTs to be cleared by the user. */ + if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { + if (oldmode || LDT_empty(&ldt_info)) { + memset(&ldt, 0, sizeof(ldt)); + goto install; + } + } + + if (!IS_ENABLED(CONFIG_X86_16BIT) && !ldt_info.seg_32bit) { + error = -EINVAL; + goto out_unlock; + } + + fill_ldt(&ldt, &ldt_info); + if (oldmode) + ldt.avl = 0; + + /* Install the new entry ... */ +install: + write_ldt_entry(mm->context.ldt, ldt_info.entry_number, &ldt); + error = 0; + +out_unlock: + mutex_unlock(&mm->context.lock); +out: + return error; +} + +asmlinkage int sys_modify_ldt(int func, void __user *ptr, + unsigned long bytecount) +{ + int ret = -ENOSYS; + + switch (func) { + case 0: + ret = read_ldt(ptr, bytecount); + break; + case 1: + ret = write_ldt(ptr, bytecount, 1); + break; + case 2: + ret = read_default_ldt(ptr, bytecount); + break; + case 0x11: + ret = write_ldt(ptr, bytecount, 0); + break; + } + return ret; +} diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c new file mode 100644 index 00000000000..1667b1de8d5 --- /dev/null +++ b/arch/x86/kernel/machine_kexec_32.c @@ -0,0 +1,271 @@ +/* + * handle transition of Linux booting another kernel + * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com> + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. 
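The code below revolves around a single control page into which relocate_kernel is copied. machine_kexec_prepare_page_tables() maps that page twice, once at its kernel virtual address and once identity-mapped at its physical address, so execution survives the %cr3 switch mid-flight. In outline, with illustrative addresses (pgd, pmd0/pmd1 and pte0/pte1 stand for the image->arch fields):

    void *va = page_address(image->control_code_page);	/* e.g. 0xc1234000 */
    unsigned long pa = __pa(va);			/* e.g. 0x01234000 */

    /* VA -> PA: lets the running kernel jump into the copied code */
    machine_kexec_page_table_set_one(pgd, pmd0, pte0, (unsigned long)va, pa);
    /* PA -> PA: keeps the instruction pointer valid after the cr3 switch */
    machine_kexec_page_table_set_one(pgd, pmd1, pte1, pa, pa);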
+ */
+
+#include <linux/mm.h>
+#include <linux/kexec.h>
+#include <linux/delay.h>
+#include <linux/numa.h>
+#include <linux/ftrace.h>
+#include <linux/suspend.h>
+#include <linux/gfp.h>
+#include <linux/io.h>
+
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
+#include <asm/apic.h>
+#include <asm/cpufeature.h>
+#include <asm/desc.h>
+#include <asm/cacheflush.h>
+#include <asm/debugreg.h>
+
+static void set_idt(void *newidt, __u16 limit)
+{
+	struct desc_ptr curidt;
+
+	/* ia32 supports unaligned loads & stores */
+	curidt.size = limit;
+	curidt.address = (unsigned long)newidt;
+
+	load_idt(&curidt);
+}
+
+
+static void set_gdt(void *newgdt, __u16 limit)
+{
+	struct desc_ptr curgdt;
+
+	/* ia32 supports unaligned loads & stores */
+	curgdt.size = limit;
+	curgdt.address = (unsigned long)newgdt;
+
+	load_gdt(&curgdt);
+}
+
+static void load_segments(void)
+{
+#define __STR(X) #X
+#define STR(X) __STR(X)
+
+	__asm__ __volatile__ (
+		"\tljmp $"STR(__KERNEL_CS)",$1f\n"
+		"\t1:\n"
+		"\tmovl $"STR(__KERNEL_DS)",%%eax\n"
+		"\tmovl %%eax,%%ds\n"
+		"\tmovl %%eax,%%es\n"
+		"\tmovl %%eax,%%fs\n"
+		"\tmovl %%eax,%%gs\n"
+		"\tmovl %%eax,%%ss\n"
+		: : : "eax", "memory");
+#undef STR
+#undef __STR
+}
+
+static void machine_kexec_free_page_tables(struct kimage *image)
+{
+	free_page((unsigned long)image->arch.pgd);
+#ifdef CONFIG_X86_PAE
+	free_page((unsigned long)image->arch.pmd0);
+	free_page((unsigned long)image->arch.pmd1);
+#endif
+	free_page((unsigned long)image->arch.pte0);
+	free_page((unsigned long)image->arch.pte1);
+}
+
+static int machine_kexec_alloc_page_tables(struct kimage *image)
+{
+	image->arch.pgd = (pgd_t *)get_zeroed_page(GFP_KERNEL);
+#ifdef CONFIG_X86_PAE
+	image->arch.pmd0 = (pmd_t *)get_zeroed_page(GFP_KERNEL);
+	image->arch.pmd1 = (pmd_t *)get_zeroed_page(GFP_KERNEL);
+#endif
+	image->arch.pte0 = (pte_t *)get_zeroed_page(GFP_KERNEL);
+	image->arch.pte1 = (pte_t *)get_zeroed_page(GFP_KERNEL);
+	if (!image->arch.pgd ||
+#ifdef CONFIG_X86_PAE
+	    !image->arch.pmd0 || !image->arch.pmd1 ||
+#endif
+	    !image->arch.pte0 || !image->arch.pte1) {
+		machine_kexec_free_page_tables(image);
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+static void machine_kexec_page_table_set_one(
+	pgd_t *pgd, pmd_t *pmd, pte_t *pte,
+	unsigned long vaddr, unsigned long paddr)
+{
+	pud_t *pud;
+
+	pgd += pgd_index(vaddr);
+#ifdef CONFIG_X86_PAE
+	if (!(pgd_val(*pgd) & _PAGE_PRESENT))
+		set_pgd(pgd, __pgd(__pa(pmd) | _PAGE_PRESENT));
+#endif
+	pud = pud_offset(pgd, vaddr);
+	pmd = pmd_offset(pud, vaddr);
+	if (!(pmd_val(*pmd) & _PAGE_PRESENT))
+		set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE));
+	pte = pte_offset_kernel(pmd, vaddr);
+	set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC));
+}
+
+static void machine_kexec_prepare_page_tables(struct kimage *image)
+{
+	void *control_page;
+	pmd_t *pmd = NULL;
+
+	control_page = page_address(image->control_code_page);
+#ifdef CONFIG_X86_PAE
+	pmd = image->arch.pmd0;
+#endif
+	machine_kexec_page_table_set_one(
+		image->arch.pgd, pmd, image->arch.pte0,
+		(unsigned long)control_page, __pa(control_page));
+#ifdef CONFIG_X86_PAE
+	pmd = image->arch.pmd1;
+#endif
+	machine_kexec_page_table_set_one(
+		image->arch.pgd, pmd, image->arch.pte1,
+		__pa(control_page), __pa(control_page));
+}
+
+/*
+ * An architecture hook called to validate the
+ * proposed image and prepare the control pages
+ * as needed. The pages for KEXEC_CONTROL_PAGE_SIZE
+ * have been allocated, but the segments have not yet
+ * been copied into the kernel.
+ *
+ * Do whatever setup is needed on the image and the
+ * reboot code buffer to allow us to avoid allocations
+ * later.
+ *
+ * - Make control page executable.
+ * - Allocate page tables
+ * - Setup page tables
+ */
+int machine_kexec_prepare(struct kimage *image)
+{
+	int error;
+
+	set_pages_x(image->control_code_page, 1);
+	error = machine_kexec_alloc_page_tables(image);
+	if (error)
+		return error;
+	machine_kexec_prepare_page_tables(image);
+	return 0;
+}
+
+/*
+ * Undo anything leftover by machine_kexec_prepare
+ * when an image is freed.
+ */
+void machine_kexec_cleanup(struct kimage *image)
+{
+	set_pages_nx(image->control_code_page, 1);
+	machine_kexec_free_page_tables(image);
+}
+
+/*
+ * Do not allocate memory (or fail in any way) in machine_kexec().
+ * We are past the point of no return, committed to rebooting now.
+ */
+void machine_kexec(struct kimage *image)
+{
+	unsigned long page_list[PAGES_NR];
+	void *control_page;
+	int save_ftrace_enabled;
+	asmlinkage unsigned long
+		(*relocate_kernel_ptr)(unsigned long indirection_page,
+				       unsigned long control_page,
+				       unsigned long start_address,
+				       unsigned int has_pae,
+				       unsigned int preserve_context);
+
+#ifdef CONFIG_KEXEC_JUMP
+	if (image->preserve_context)
+		save_processor_state();
+#endif
+
+	save_ftrace_enabled = __ftrace_enabled_save();
+
+	/* Interrupts aren't acceptable while we reboot */
+	local_irq_disable();
+	hw_breakpoint_disable();
+
+	if (image->preserve_context) {
+#ifdef CONFIG_X86_IO_APIC
+		/*
+		 * We need to put APICs in legacy mode so that we can
+		 * get timer interrupts in the second kernel. kexec/kdump
+		 * paths already have calls to disable_IO_APIC() in
+		 * one form or another. The kexec jump path also needs
+		 * one.
+		 */
+		disable_IO_APIC();
+#endif
+	}
+
+	control_page = page_address(image->control_code_page);
+	memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE);
+
+	relocate_kernel_ptr = control_page;
+	page_list[PA_CONTROL_PAGE] = __pa(control_page);
+	page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
+	page_list[PA_PGD] = __pa(image->arch.pgd);
+
+	if (image->type == KEXEC_TYPE_DEFAULT)
+		page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
+						<< PAGE_SHIFT);
+
+	/*
+	 * The segment registers are funny things, they have both a
+	 * visible and an invisible part. Whenever the visible part is
+	 * set to a specific selector, the invisible part is loaded
+	 * from a table in memory. At no other time is the
+	 * descriptor table in memory accessed.
+	 *
+	 * I take advantage of this here by force loading the
+	 * segments, before I zap the gdt with an invalid value.
+	 */
+	load_segments();
+	/*
+	 * The gdt & idt are now invalid.
+	 * If you want to load them you must set up your own idt & gdt.
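Pointing the descriptor-table registers at physical address 0 with a zero limit, which the next two calls do, is deliberate scorched earth: with a zero limit every descriptor fetch faults, and with no usable IDT such a fault escalates to a CPU reset rather than a walk through stale tables. In desc_ptr terms the calls below amount to:

    struct desc_ptr invalid = {
        .size    = 0,	/* limit 0: any descriptor access faults */
        .address = (unsigned long)phys_to_virt(0),
    };
    /* i.e. set_gdt(phys_to_virt(0), 0) and set_idt(phys_to_virt(0), 0) */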
+ */ + set_gdt(phys_to_virt(0), 0); + set_idt(phys_to_virt(0), 0); + + /* now call it */ + image->start = relocate_kernel_ptr((unsigned long)image->head, + (unsigned long)page_list, + image->start, cpu_has_pae, + image->preserve_context); + +#ifdef CONFIG_KEXEC_JUMP + if (image->preserve_context) + restore_processor_state(); +#endif + + __ftrace_enabled_restore(save_ftrace_enabled); +} + +void arch_crash_save_vmcoreinfo(void) +{ +#ifdef CONFIG_NUMA + VMCOREINFO_SYMBOL(node_data); + VMCOREINFO_LENGTH(node_data, MAX_NUMNODES); +#endif +#ifdef CONFIG_X86_PAE + VMCOREINFO_CONFIG(X86_PAE); +#endif +} + diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c new file mode 100644 index 00000000000..679cef0791c --- /dev/null +++ b/arch/x86/kernel/machine_kexec_64.c @@ -0,0 +1,285 @@ +/* + * handle transition of Linux booting another kernel + * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com> + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#include <linux/mm.h> +#include <linux/kexec.h> +#include <linux/string.h> +#include <linux/gfp.h> +#include <linux/reboot.h> +#include <linux/numa.h> +#include <linux/ftrace.h> +#include <linux/io.h> +#include <linux/suspend.h> + +#include <asm/init.h> +#include <asm/pgtable.h> +#include <asm/tlbflush.h> +#include <asm/mmu_context.h> +#include <asm/debugreg.h> + +static void free_transition_pgtable(struct kimage *image) +{ + free_page((unsigned long)image->arch.pud); + free_page((unsigned long)image->arch.pmd); + free_page((unsigned long)image->arch.pte); +} + +static int init_transition_pgtable(struct kimage *image, pgd_t *pgd) +{ + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + unsigned long vaddr, paddr; + int result = -ENOMEM; + + vaddr = (unsigned long)relocate_kernel; + paddr = __pa(page_address(image->control_code_page)+PAGE_SIZE); + pgd += pgd_index(vaddr); + if (!pgd_present(*pgd)) { + pud = (pud_t *)get_zeroed_page(GFP_KERNEL); + if (!pud) + goto err; + image->arch.pud = pud; + set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); + } + pud = pud_offset(pgd, vaddr); + if (!pud_present(*pud)) { + pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL); + if (!pmd) + goto err; + image->arch.pmd = pmd; + set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); + } + pmd = pmd_offset(pud, vaddr); + if (!pmd_present(*pmd)) { + pte = (pte_t *)get_zeroed_page(GFP_KERNEL); + if (!pte) + goto err; + image->arch.pte = pte; + set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE)); + } + pte = pte_offset_kernel(pmd, vaddr); + set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC)); + return 0; +err: + free_transition_pgtable(image); + return result; +} + +static void *alloc_pgt_page(void *data) +{ + struct kimage *image = (struct kimage *)data; + struct page *page; + void *p = NULL; + + page = kimage_alloc_control_pages(image, 0); + if (page) { + p = page_address(page); + clear_page(p); + } + + return p; +} + +static int init_pgtable(struct kimage *image, unsigned long start_pgtable) +{ + struct x86_mapping_info info = { + .alloc_pgt_page = alloc_pgt_page, + .context = image, + .pmd_flag = __PAGE_KERNEL_LARGE_EXEC, + }; + unsigned long mstart, mend; + pgd_t *level4p; + int result; + int i; + + level4p = (pgd_t *)__va(start_pgtable); + clear_page(level4p); + for (i = 0; i < nr_pfn_mapped; i++) { + mstart = pfn_mapped[i].start << PAGE_SHIFT; + mend = pfn_mapped[i].end << PAGE_SHIFT; + + result = kernel_ident_mapping_init(&info, + level4p, mstart, mend); + if (result) + 
return result; + } + + /* + * segments's mem ranges could be outside 0 ~ max_pfn, + * for example when jump back to original kernel from kexeced kernel. + * or first kernel is booted with user mem map, and second kernel + * could be loaded out of that range. + */ + for (i = 0; i < image->nr_segments; i++) { + mstart = image->segment[i].mem; + mend = mstart + image->segment[i].memsz; + + result = kernel_ident_mapping_init(&info, + level4p, mstart, mend); + + if (result) + return result; + } + + return init_transition_pgtable(image, level4p); +} + +static void set_idt(void *newidt, u16 limit) +{ + struct desc_ptr curidt; + + /* x86-64 supports unaliged loads & stores */ + curidt.size = limit; + curidt.address = (unsigned long)newidt; + + __asm__ __volatile__ ( + "lidtq %0\n" + : : "m" (curidt) + ); +}; + + +static void set_gdt(void *newgdt, u16 limit) +{ + struct desc_ptr curgdt; + + /* x86-64 supports unaligned loads & stores */ + curgdt.size = limit; + curgdt.address = (unsigned long)newgdt; + + __asm__ __volatile__ ( + "lgdtq %0\n" + : : "m" (curgdt) + ); +}; + +static void load_segments(void) +{ + __asm__ __volatile__ ( + "\tmovl %0,%%ds\n" + "\tmovl %0,%%es\n" + "\tmovl %0,%%ss\n" + "\tmovl %0,%%fs\n" + "\tmovl %0,%%gs\n" + : : "a" (__KERNEL_DS) : "memory" + ); +} + +int machine_kexec_prepare(struct kimage *image) +{ + unsigned long start_pgtable; + int result; + + /* Calculate the offsets */ + start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; + + /* Setup the identity mapped 64bit page table */ + result = init_pgtable(image, start_pgtable); + if (result) + return result; + + return 0; +} + +void machine_kexec_cleanup(struct kimage *image) +{ + free_transition_pgtable(image); +} + +/* + * Do not allocate memory (or fail in any way) in machine_kexec(). + * We are past the point of no return, committed to rebooting now. + */ +void machine_kexec(struct kimage *image) +{ + unsigned long page_list[PAGES_NR]; + void *control_page; + int save_ftrace_enabled; + +#ifdef CONFIG_KEXEC_JUMP + if (image->preserve_context) + save_processor_state(); +#endif + + save_ftrace_enabled = __ftrace_enabled_save(); + + /* Interrupts aren't acceptable while we reboot */ + local_irq_disable(); + hw_breakpoint_disable(); + + if (image->preserve_context) { +#ifdef CONFIG_X86_IO_APIC + /* + * We need to put APICs in legacy mode so that we can + * get timer interrupts in second kernel. kexec/kdump + * paths already have calls to disable_IO_APIC() in + * one form or other. kexec jump path also need + * one. + */ + disable_IO_APIC(); +#endif + } + + control_page = page_address(image->control_code_page) + PAGE_SIZE; + memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE); + + page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page); + page_list[VA_CONTROL_PAGE] = (unsigned long)control_page; + page_list[PA_TABLE_PAGE] = + (unsigned long)__pa(page_address(image->control_code_page)); + + if (image->type == KEXEC_TYPE_DEFAULT) + page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) + << PAGE_SHIFT); + + /* + * The segment registers are funny things, they have both a + * visible and an invisible part. Whenever the visible part is + * set to a specific selector, the invisible part is loaded + * with from a table in memory. At no other time is the + * descriptor table in memory accessed. + * + * I take advantage of this here by force loading the + * segments, before I zap the gdt with an invalid value. + */ + load_segments(); + /* + * The gdt & idt are now invalid. 
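For orientation, init_pgtable() earlier in this file identity-maps two classes of ranges: everything the current kernel already has mapped (pfn_mapped[]) and every destination segment, since, as the comment there notes, segments may lie outside the existing map. Stripped of error handling, the two loops reduce to:

    /* existing memory map */
    for (i = 0; i < nr_pfn_mapped; i++)
        kernel_ident_mapping_init(&info, level4p,
                                  pfn_mapped[i].start << PAGE_SHIFT,
                                  pfn_mapped[i].end << PAGE_SHIFT);

    /* load destinations, which may be outside that map */
    for (i = 0; i < image->nr_segments; i++)
        kernel_ident_mapping_init(&info, level4p,
                                  image->segment[i].mem,
                                  image->segment[i].mem + image->segment[i].memsz);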
+ * If you want to load them you must set up your own idt & gdt. + */ + set_gdt(phys_to_virt(0), 0); + set_idt(phys_to_virt(0), 0); + + /* now call it */ + image->start = relocate_kernel((unsigned long)image->head, + (unsigned long)page_list, + image->start, + image->preserve_context); + +#ifdef CONFIG_KEXEC_JUMP + if (image->preserve_context) + restore_processor_state(); +#endif + + __ftrace_enabled_restore(save_ftrace_enabled); +} + +void arch_crash_save_vmcoreinfo(void) +{ + VMCOREINFO_SYMBOL(phys_base); + VMCOREINFO_SYMBOL(init_level4_pgt); + +#ifdef CONFIG_NUMA + VMCOREINFO_SYMBOL(node_data); + VMCOREINFO_LENGTH(node_data, MAX_NUMNODES); +#endif + vmcoreinfo_append_str("KERNELOFFSET=%lx\n", + (unsigned long)&_text - __START_KERNEL); +} + diff --git a/arch/x86/kernel/mcount_64.S b/arch/x86/kernel/mcount_64.S new file mode 100644 index 00000000000..c050a015316 --- /dev/null +++ b/arch/x86/kernel/mcount_64.S @@ -0,0 +1,217 @@ +/* + * linux/arch/x86_64/mcount_64.S + * + * Copyright (C) 2014 Steven Rostedt, Red Hat Inc + */ + +#include <linux/linkage.h> +#include <asm/ptrace.h> +#include <asm/ftrace.h> + + + .code64 + .section .entry.text, "ax" + + +#ifdef CONFIG_FUNCTION_TRACER + +#ifdef CC_USING_FENTRY +# define function_hook __fentry__ +#else +# define function_hook mcount +#endif + +#ifdef CONFIG_DYNAMIC_FTRACE + +ENTRY(function_hook) + retq +END(function_hook) + +/* skip is set if stack has been adjusted */ +.macro ftrace_caller_setup skip=0 + MCOUNT_SAVE_FRAME \skip + + /* Load the ftrace_ops into the 3rd parameter */ + movq function_trace_op(%rip), %rdx + + /* Load ip into the first parameter */ + movq RIP(%rsp), %rdi + subq $MCOUNT_INSN_SIZE, %rdi + /* Load the parent_ip into the second parameter */ +#ifdef CC_USING_FENTRY + movq SS+16(%rsp), %rsi +#else + movq 8(%rbp), %rsi +#endif +.endm + +ENTRY(ftrace_caller) + /* Check if tracing was disabled (quick check) */ + cmpl $0, function_trace_stop + jne ftrace_stub + + ftrace_caller_setup + /* regs go into 4th parameter (but make it NULL) */ + movq $0, %rcx + +GLOBAL(ftrace_call) + call ftrace_stub + + MCOUNT_RESTORE_FRAME +ftrace_return: + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +GLOBAL(ftrace_graph_call) + jmp ftrace_stub +#endif + +GLOBAL(ftrace_stub) + retq +END(ftrace_caller) + +ENTRY(ftrace_regs_caller) + /* Save the current flags before compare (in SS location)*/ + pushfq + + /* Check if tracing was disabled (quick check) */ + cmpl $0, function_trace_stop + jne ftrace_restore_flags + + /* skip=8 to skip flags saved in SS */ + ftrace_caller_setup 8 + + /* Save the rest of pt_regs */ + movq %r15, R15(%rsp) + movq %r14, R14(%rsp) + movq %r13, R13(%rsp) + movq %r12, R12(%rsp) + movq %r11, R11(%rsp) + movq %r10, R10(%rsp) + movq %rbp, RBP(%rsp) + movq %rbx, RBX(%rsp) + /* Copy saved flags */ + movq SS(%rsp), %rcx + movq %rcx, EFLAGS(%rsp) + /* Kernel segments */ + movq $__KERNEL_DS, %rcx + movq %rcx, SS(%rsp) + movq $__KERNEL_CS, %rcx + movq %rcx, CS(%rsp) + /* Stack - skipping return address */ + leaq SS+16(%rsp), %rcx + movq %rcx, RSP(%rsp) + + /* regs go into 4th parameter */ + leaq (%rsp), %rcx + +GLOBAL(ftrace_regs_call) + call ftrace_stub + + /* Copy flags back to SS, to restore them */ + movq EFLAGS(%rsp), %rax + movq %rax, SS(%rsp) + + /* Handlers can change the RIP */ + movq RIP(%rsp), %rax + movq %rax, SS+8(%rsp) + + /* restore the rest of pt_regs */ + movq R15(%rsp), %r15 + movq R14(%rsp), %r14 + movq R13(%rsp), %r13 + movq R12(%rsp), %r12 + movq R10(%rsp), %r10 + movq RBP(%rsp), %rbp + movq RBX(%rsp), %rbx + + /* skip=8 to 
skip flags saved in SS */ + MCOUNT_RESTORE_FRAME 8 + + /* Restore flags */ + popfq + + jmp ftrace_return +ftrace_restore_flags: + popfq + jmp ftrace_stub + +END(ftrace_regs_caller) + + +#else /* ! CONFIG_DYNAMIC_FTRACE */ + +ENTRY(function_hook) + cmpl $0, function_trace_stop + jne ftrace_stub + + cmpq $ftrace_stub, ftrace_trace_function + jnz trace + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + cmpq $ftrace_stub, ftrace_graph_return + jnz ftrace_graph_caller + + cmpq $ftrace_graph_entry_stub, ftrace_graph_entry + jnz ftrace_graph_caller +#endif + +GLOBAL(ftrace_stub) + retq + +trace: + MCOUNT_SAVE_FRAME + + movq RIP(%rsp), %rdi +#ifdef CC_USING_FENTRY + movq SS+16(%rsp), %rsi +#else + movq 8(%rbp), %rsi +#endif + subq $MCOUNT_INSN_SIZE, %rdi + + call *ftrace_trace_function + + MCOUNT_RESTORE_FRAME + + jmp ftrace_stub +END(function_hook) +#endif /* CONFIG_DYNAMIC_FTRACE */ +#endif /* CONFIG_FUNCTION_TRACER */ + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +ENTRY(ftrace_graph_caller) + MCOUNT_SAVE_FRAME + +#ifdef CC_USING_FENTRY + leaq SS+16(%rsp), %rdi + movq $0, %rdx /* No framepointers needed */ +#else + leaq 8(%rbp), %rdi + movq (%rbp), %rdx +#endif + movq RIP(%rsp), %rsi + subq $MCOUNT_INSN_SIZE, %rsi + + call prepare_ftrace_return + + MCOUNT_RESTORE_FRAME + + retq +END(ftrace_graph_caller) + +GLOBAL(return_to_handler) + subq $24, %rsp + + /* Save the return values */ + movq %rax, (%rsp) + movq %rdx, 8(%rsp) + movq %rbp, %rdi + + call ftrace_return_to_handler + + movq %rax, %rdi + movq 8(%rsp), %rdx + movq (%rsp), %rax + addq $24, %rsp + jmp *%rdi +#endif diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c new file mode 100644 index 00000000000..f4c886d9165 --- /dev/null +++ b/arch/x86/kernel/mmconf-fam10h_64.c @@ -0,0 +1,237 @@ +/* + * AMD Family 10h mmconfig enablement + */ + +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/string.h> +#include <linux/pci.h> +#include <linux/dmi.h> +#include <linux/range.h> + +#include <asm/pci-direct.h> +#include <linux/sort.h> +#include <asm/io.h> +#include <asm/msr.h> +#include <asm/acpi.h> +#include <asm/mmconfig.h> +#include <asm/pci_x86.h> + +struct pci_hostbridge_probe { + u32 bus; + u32 slot; + u32 vendor; + u32 device; +}; + +static u64 fam10h_pci_mmconf_base; + +static struct pci_hostbridge_probe pci_probes[] = { + { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1200 }, + { 0xff, 0, PCI_VENDOR_ID_AMD, 0x1200 }, +}; + +static int cmp_range(const void *x1, const void *x2) +{ + const struct range *r1 = x1; + const struct range *r2 = x2; + int start1, start2; + + start1 = r1->start >> 32; + start2 = r2->start >> 32; + + return start1 - start2; +} + +#define MMCONF_UNIT (1ULL << FAM10H_MMIO_CONF_BASE_SHIFT) +#define MMCONF_MASK (~(MMCONF_UNIT - 1)) +#define MMCONF_SIZE (MMCONF_UNIT << 8) +/* need to avoid (0xfd<<32), (0xfe<<32), and (0xff<<32), ht used space */ +#define FAM10H_PCI_MMCONF_BASE (0xfcULL<<32) +#define BASE_VALID(b) ((b) + MMCONF_SIZE <= (0xfdULL<<32) || (b) >= (1ULL<<40)) +static void get_fam10h_pci_mmconf_base(void) +{ + int i; + unsigned bus; + unsigned slot; + int found; + + u64 val; + u32 address; + u64 tom2; + u64 base = FAM10H_PCI_MMCONF_BASE; + + int hi_mmio_num; + struct range range[8]; + + /* only try to get setting from BSP */ + if (fam10h_pci_mmconf_base) + return; + + if (!early_pci_allowed()) + return; + + found = 0; + for (i = 0; i < ARRAY_SIZE(pci_probes); i++) { + u32 id; + u16 device; + u16 vendor; + + bus = pci_probes[i].bus; + slot = pci_probes[i].slot; + id = read_pci_config(bus, slot, 0, 
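
With FAM10H_MMIO_CONF_BASE_SHIFT equal to 20 (the usual value from asm/mmconfig.h, assumed here), MMCONF_UNIT is 1 MiB, exactly one bus's worth of extended config space (32 devices x 8 functions x 4 KiB), and MMCONF_SIZE is 256 MiB for all 256 buses. A sketch of the address arithmetic, assuming the canonical ECAM layout:

#include <linux/types.h>

/* Assumes FAM10H_MMIO_CONF_BASE_SHIFT == 20: 1 MiB per bus,
 * so MMCONF_SIZE = MMCONF_UNIT << 8 = 256 MiB for 256 buses. */
static inline u64 ecam_addr(u64 base, unsigned int bus, unsigned int dev,
                            unsigned int fn, unsigned int reg)
{
        return base + ((u64)bus << 20)  /* 1 MiB per bus            */
                    + (dev << 15)       /* 32 KiB per device        */
                    + (fn  << 12)       /* 4 KiB per function       */
                    + reg;              /* byte offset in cfg space */
}
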
PCI_VENDOR_ID); + + vendor = id & 0xffff; + device = (id>>16) & 0xffff; + if (pci_probes[i].vendor == vendor && + pci_probes[i].device == device) { + found = 1; + break; + } + } + + if (!found) + return; + + /* SYS_CFG */ + address = MSR_K8_SYSCFG; + rdmsrl(address, val); + + /* TOP_MEM2 is not enabled? */ + if (!(val & (1<<21))) { + tom2 = 1ULL << 32; + } else { + /* TOP_MEM2 */ + address = MSR_K8_TOP_MEM2; + rdmsrl(address, val); + tom2 = max(val & 0xffffff800000ULL, 1ULL << 32); + } + + if (base <= tom2) + base = (tom2 + 2 * MMCONF_UNIT - 1) & MMCONF_MASK; + + /* + * need to check if the range is in the high mmio range that is + * above 4G + */ + hi_mmio_num = 0; + for (i = 0; i < 8; i++) { + u32 reg; + u64 start; + u64 end; + reg = read_pci_config(bus, slot, 1, 0x80 + (i << 3)); + if (!(reg & 3)) + continue; + + start = (u64)(reg & 0xffffff00) << 8; /* 39:16 on 31:8*/ + reg = read_pci_config(bus, slot, 1, 0x84 + (i << 3)); + end = ((u64)(reg & 0xffffff00) << 8) | 0xffff; /* 39:16 on 31:8*/ + + if (end < tom2) + continue; + + range[hi_mmio_num].start = start; + range[hi_mmio_num].end = end; + hi_mmio_num++; + } + + if (!hi_mmio_num) + goto out; + + /* sort the range */ + sort(range, hi_mmio_num, sizeof(struct range), cmp_range, NULL); + + if (range[hi_mmio_num - 1].end < base) + goto out; + if (range[0].start > base + MMCONF_SIZE) + goto out; + + /* need to find one window */ + base = (range[0].start & MMCONF_MASK) - MMCONF_UNIT; + if ((base > tom2) && BASE_VALID(base)) + goto out; + base = (range[hi_mmio_num - 1].end + MMCONF_UNIT) & MMCONF_MASK; + if (BASE_VALID(base)) + goto out; + /* need to find window between ranges */ + for (i = 1; i < hi_mmio_num; i++) { + base = (range[i - 1].end + MMCONF_UNIT) & MMCONF_MASK; + val = range[i].start & MMCONF_MASK; + if (val >= base + MMCONF_SIZE && BASE_VALID(base)) + goto out; + } + return; + +out: + fam10h_pci_mmconf_base = base; +} + +void fam10h_check_enable_mmcfg(void) +{ + u64 val; + u32 address; + + if (!(pci_probe & PCI_CHECK_ENABLE_AMD_MMCONF)) + return; + + address = MSR_FAM10H_MMIO_CONF_BASE; + rdmsrl(address, val); + + /* try to make sure that AP's setting is identical to BSP setting */ + if (val & FAM10H_MMIO_CONF_ENABLE) { + unsigned busnbits; + busnbits = (val >> FAM10H_MMIO_CONF_BUSRANGE_SHIFT) & + FAM10H_MMIO_CONF_BUSRANGE_MASK; + + /* only trust the one handle 256 buses, if acpi=off */ + if (!acpi_pci_disabled || busnbits >= 8) { + u64 base = val & MMCONF_MASK; + + if (!fam10h_pci_mmconf_base) { + fam10h_pci_mmconf_base = base; + return; + } else if (fam10h_pci_mmconf_base == base) + return; + } + } + + /* + * if it is not enabled, try to enable it and assume only one segment + * with 256 buses + */ + get_fam10h_pci_mmconf_base(); + if (!fam10h_pci_mmconf_base) { + pci_probe &= ~PCI_CHECK_ENABLE_AMD_MMCONF; + return; + } + + printk(KERN_INFO "Enable MMCONFIG on AMD Family 10h\n"); + val &= ~((FAM10H_MMIO_CONF_BASE_MASK<<FAM10H_MMIO_CONF_BASE_SHIFT) | + (FAM10H_MMIO_CONF_BUSRANGE_MASK<<FAM10H_MMIO_CONF_BUSRANGE_SHIFT)); + val |= fam10h_pci_mmconf_base | (8 << FAM10H_MMIO_CONF_BUSRANGE_SHIFT) | + FAM10H_MMIO_CONF_ENABLE; + wrmsrl(address, val); +} + +static int __init set_check_enable_amd_mmconf(const struct dmi_system_id *d) +{ + pci_probe |= PCI_CHECK_ENABLE_AMD_MMCONF; + return 0; +} + +static const struct dmi_system_id __initconst mmconf_dmi_table[] = { + { + .callback = set_check_enable_amd_mmconf, + .ident = "Sun Microsystems Machine", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Sun Microsystems"), + }, + }, + {} +}; + +/* 
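
fam10h_check_enable_mmcfg composes the MMIO config MSR from three fields: the enable bit, a BusRange field whose value n exposes 2^n buses (hence the 8 written for 256 buses), and the window base. A sketch of that composition; the exact bit positions follow the constants in asm/mmconfig.h and are stated here as assumptions:

/* Assumed MSR_FAM10H_MMIO_CONF_BASE layout, per the constants used in
 * this file: bit 0 enable, bits 5:2 bus-range log2, base from bit 20. */
static u64 compose_mmio_conf(u64 base, unsigned int busrange_log2)
{
        return (base & ~((1ULL << 20) - 1))     /* MMCONF_MASK-aligned base */
             | (busrange_log2 << 2)             /* 8 -> 2^8 = 256 buses     */
             | 1;                               /* FAM10H_MMIO_CONF_ENABLE  */
}
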
Called from a non __init function, but only on the BSP. */ +void __ref check_enable_amd_mmconf_dmi(void) +{ + dmi_check_system(mmconf_dmi_table); +} diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c new file mode 100644 index 00000000000..e69f9882bf9 --- /dev/null +++ b/arch/x86/kernel/module.c @@ -0,0 +1,253 @@ +/* Kernel module help for x86. + Copyright (C) 2001 Rusty Russell. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/moduleloader.h> +#include <linux/elf.h> +#include <linux/vmalloc.h> +#include <linux/fs.h> +#include <linux/string.h> +#include <linux/kernel.h> +#include <linux/bug.h> +#include <linux/mm.h> +#include <linux/gfp.h> +#include <linux/jump_label.h> +#include <linux/random.h> + +#include <asm/page.h> +#include <asm/pgtable.h> + +#if 0 +#define DEBUGP(fmt, ...) \ + printk(KERN_DEBUG fmt, ##__VA_ARGS__) +#else +#define DEBUGP(fmt, ...) \ +do { \ + if (0) \ + printk(KERN_DEBUG fmt, ##__VA_ARGS__); \ +} while (0) +#endif + +#ifdef CONFIG_RANDOMIZE_BASE +static unsigned long module_load_offset; +static int randomize_modules = 1; + +/* Mutex protects the module_load_offset. */ +static DEFINE_MUTEX(module_kaslr_mutex); + +static int __init parse_nokaslr(char *p) +{ + randomize_modules = 0; + return 0; +} +early_param("nokaslr", parse_nokaslr); + +static unsigned long int get_module_load_offset(void) +{ + if (randomize_modules) { + mutex_lock(&module_kaslr_mutex); + /* + * Calculate the module_load_offset the first time this + * code is called. Once calculated it stays the same until + * reboot. + */ + if (module_load_offset == 0) + module_load_offset = + (get_random_int() % 1024 + 1) * PAGE_SIZE; + mutex_unlock(&module_kaslr_mutex); + } + return module_load_offset; +} +#else +static unsigned long int get_module_load_offset(void) +{ + return 0; +} +#endif + +void *module_alloc(unsigned long size) +{ + if (PAGE_ALIGN(size) > MODULES_LEN) + return NULL; + return __vmalloc_node_range(size, 1, + MODULES_VADDR + get_module_load_offset(), + MODULES_END, GFP_KERNEL | __GFP_HIGHMEM, + PAGE_KERNEL_EXEC, NUMA_NO_NODE, + __builtin_return_address(0)); +} + +#ifdef CONFIG_X86_32 +int apply_relocate(Elf32_Shdr *sechdrs, + const char *strtab, + unsigned int symindex, + unsigned int relsec, + struct module *me) +{ + unsigned int i; + Elf32_Rel *rel = (void *)sechdrs[relsec].sh_addr; + Elf32_Sym *sym; + uint32_t *location; + + DEBUGP("Applying relocate section %u to %u\n", + relsec, sechdrs[relsec].sh_info); + for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { + /* This is where to make the change */ + location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr + + rel[i].r_offset; + /* This is the symbol it is referring to. Note that all + undefined symbols have been resolved. 
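
get_module_load_offset thus shifts the start of the module area by a whole number of pages, never zero: with 4 KiB pages the offset lands in [4 KiB, 4 MiB], computed once per boot so later module loads stay consistent. The arithmetic as a standalone sketch:

/* Sketch: with PAGE_SIZE == 4096, (rnd % 1024 + 1) * PAGE_SIZE yields
 * 4 KiB .. 4 MiB in page-sized steps -- never 0, so modules never sit
 * at the unrandomized MODULES_VADDR. */
static unsigned long offset_from(unsigned int rnd)  /* rnd = get_random_int() */
{
        return (rnd % 1024 + 1) * 4096UL;
}
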
*/ + sym = (Elf32_Sym *)sechdrs[symindex].sh_addr + + ELF32_R_SYM(rel[i].r_info); + + switch (ELF32_R_TYPE(rel[i].r_info)) { + case R_386_32: + /* We add the value into the location given */ + *location += sym->st_value; + break; + case R_386_PC32: + /* Add the value, subtract its position */ + *location += sym->st_value - (uint32_t)location; + break; + default: + pr_err("%s: Unknown relocation: %u\n", + me->name, ELF32_R_TYPE(rel[i].r_info)); + return -ENOEXEC; + } + } + return 0; +} +#else /*X86_64*/ +int apply_relocate_add(Elf64_Shdr *sechdrs, + const char *strtab, + unsigned int symindex, + unsigned int relsec, + struct module *me) +{ + unsigned int i; + Elf64_Rela *rel = (void *)sechdrs[relsec].sh_addr; + Elf64_Sym *sym; + void *loc; + u64 val; + + DEBUGP("Applying relocate section %u to %u\n", + relsec, sechdrs[relsec].sh_info); + for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { + /* This is where to make the change */ + loc = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr + + rel[i].r_offset; + + /* This is the symbol it is referring to. Note that all + undefined symbols have been resolved. */ + sym = (Elf64_Sym *)sechdrs[symindex].sh_addr + + ELF64_R_SYM(rel[i].r_info); + + DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n", + (int)ELF64_R_TYPE(rel[i].r_info), + sym->st_value, rel[i].r_addend, (u64)loc); + + val = sym->st_value + rel[i].r_addend; + + switch (ELF64_R_TYPE(rel[i].r_info)) { + case R_X86_64_NONE: + break; + case R_X86_64_64: + *(u64 *)loc = val; + break; + case R_X86_64_32: + *(u32 *)loc = val; + if (val != *(u32 *)loc) + goto overflow; + break; + case R_X86_64_32S: + *(s32 *)loc = val; + if ((s64)val != *(s32 *)loc) + goto overflow; + break; + case R_X86_64_PC32: + val -= (u64)loc; + *(u32 *)loc = val; +#if 0 + if ((s64)val != *(s32 *)loc) + goto overflow; +#endif + break; + default: + pr_err("%s: Unknown rela relocation: %llu\n", + me->name, ELF64_R_TYPE(rel[i].r_info)); + return -ENOEXEC; + } + } + return 0; + +overflow: + pr_err("overflow in relocation type %d val %Lx\n", + (int)ELF64_R_TYPE(rel[i].r_info), val); + pr_err("`%s' likely not compiled with -mcmodel=kernel\n", + me->name); + return -ENOEXEC; +} +#endif + +int module_finalize(const Elf_Ehdr *hdr, + const Elf_Shdr *sechdrs, + struct module *me) +{ + const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL, + *para = NULL; + char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; + + for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { + if (!strcmp(".text", secstrings + s->sh_name)) + text = s; + if (!strcmp(".altinstructions", secstrings + s->sh_name)) + alt = s; + if (!strcmp(".smp_locks", secstrings + s->sh_name)) + locks = s; + if (!strcmp(".parainstructions", secstrings + s->sh_name)) + para = s; + } + + if (alt) { + /* patch .altinstructions */ + void *aseg = (void *)alt->sh_addr; + apply_alternatives(aseg, aseg + alt->sh_size); + } + if (locks && text) { + void *lseg = (void *)locks->sh_addr; + void *tseg = (void *)text->sh_addr; + alternatives_smp_module_add(me, me->name, + lseg, lseg + locks->sh_size, + tseg, tseg + text->sh_size); + } + + if (para) { + void *pseg = (void *)para->sh_addr; + apply_paravirt(pseg, pseg + para->sh_size); + } + + /* make jump label nops */ + jump_label_apply_nops(me); + + return 0; +} + +void module_arch_cleanup(struct module *mod) +{ + alternatives_smp_module_del(mod); +} diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c new file mode 100644 index 00000000000..d2b56489d70 --- /dev/null +++ 
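
Both relocation loops reduce to the ELF formula S + A (symbol value plus addend), with P (the patched location) subtracted for PC-relative cases. RELA sections (x86-64) carry A explicitly, while REL sections (i386) keep it in the patched word itself, which is why apply_relocate uses "*location +=". A freestanding sketch of the PC32 case, including the overflow check that the 64-bit code keeps under #if 0:

#include <stdint.h>

/* S + A - P for R_X86_64_PC32 / R_386_PC32 (illustrative, not kernel code) */
static int patch_pc32(uint8_t *loc, uint64_t S, int64_t A)
{
        int64_t val = (int64_t)(S + A) - (int64_t)(uintptr_t)loc;

        if (val != (int32_t)val)
                return -1;      /* target too far: not -mcmodel=kernel? */
        *(int32_t *)loc = (int32_t)val;
        return 0;
}
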
b/arch/x86/kernel/mpparse.c @@ -0,0 +1,917 @@ +/* + * Intel Multiprocessor Specification 1.1 and 1.4 + * compliant MP-table parsing routines. + * + * (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk> + * (c) 1998, 1999, 2000, 2009 Ingo Molnar <mingo@redhat.com> + * (c) 2008 Alexey Starikovskiy <astarikovskiy@suse.de> + */ + +#include <linux/mm.h> +#include <linux/init.h> +#include <linux/delay.h> +#include <linux/bootmem.h> +#include <linux/memblock.h> +#include <linux/kernel_stat.h> +#include <linux/mc146818rtc.h> +#include <linux/bitops.h> +#include <linux/acpi.h> +#include <linux/module.h> +#include <linux/smp.h> +#include <linux/pci.h> + +#include <asm/mtrr.h> +#include <asm/mpspec.h> +#include <asm/pgalloc.h> +#include <asm/io_apic.h> +#include <asm/proto.h> +#include <asm/bios_ebda.h> +#include <asm/e820.h> +#include <asm/setup.h> +#include <asm/smp.h> + +#include <asm/apic.h> +/* + * Checksum an MP configuration block. + */ + +static int __init mpf_checksum(unsigned char *mp, int len) +{ + int sum = 0; + + while (len--) + sum += *mp++; + + return sum & 0xFF; +} + +int __init default_mpc_apic_id(struct mpc_cpu *m) +{ + return m->apicid; +} + +static void __init MP_processor_info(struct mpc_cpu *m) +{ + int apicid; + char *bootup_cpu = ""; + + if (!(m->cpuflag & CPU_ENABLED)) { + disabled_cpus++; + return; + } + + apicid = x86_init.mpparse.mpc_apic_id(m); + + if (m->cpuflag & CPU_BOOTPROCESSOR) { + bootup_cpu = " (Bootup-CPU)"; + boot_cpu_physical_apicid = m->apicid; + } + + printk(KERN_INFO "Processor #%d%s\n", m->apicid, bootup_cpu); + generic_processor_info(apicid, m->apicver); +} + +#ifdef CONFIG_X86_IO_APIC +void __init default_mpc_oem_bus_info(struct mpc_bus *m, char *str) +{ + memcpy(str, m->bustype, 6); + str[6] = 0; + apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->busid, str); +} + +static void __init MP_bus_info(struct mpc_bus *m) +{ + char str[7]; + + x86_init.mpparse.mpc_oem_bus_info(m, str); + +#if MAX_MP_BUSSES < 256 + if (m->busid >= MAX_MP_BUSSES) { + printk(KERN_WARNING "MP table busid value (%d) for bustype %s " + " is too large, max. 
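
mpf_checksum is the usual byte-sum check: a well-formed MP structure's bytes sum to zero modulo 256, so the producer stores the two's complement of the running sum in the checksum slot, which is exactly what update_mp_table does later with "mpc->checksum -= mpf_checksum(...)". As a sketch:

/* Choose the checksum byte so the whole structure sums to 0 mod 256. */
static unsigned char mp_make_checksum(const unsigned char *p, int len)
{
        unsigned int sum = 0;

        while (len--)
                sum += *p++;
        return (unsigned char)-sum;  /* verifier then sees sum + this == 0 */
}
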
supported is %d\n", + m->busid, str, MAX_MP_BUSSES - 1); + return; + } +#endif + + set_bit(m->busid, mp_bus_not_pci); + if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA) - 1) == 0) { +#ifdef CONFIG_EISA + mp_bus_id_to_type[m->busid] = MP_BUS_ISA; +#endif + } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) { + if (x86_init.mpparse.mpc_oem_pci_bus) + x86_init.mpparse.mpc_oem_pci_bus(m); + + clear_bit(m->busid, mp_bus_not_pci); +#ifdef CONFIG_EISA + mp_bus_id_to_type[m->busid] = MP_BUS_PCI; + } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) { + mp_bus_id_to_type[m->busid] = MP_BUS_EISA; +#endif + } else + printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str); +} + +static void __init MP_ioapic_info(struct mpc_ioapic *m) +{ + if (m->flags & MPC_APIC_USABLE) + mp_register_ioapic(m->apicid, m->apicaddr, gsi_top); +} + +static void __init print_mp_irq_info(struct mpc_intsrc *mp_irq) +{ + apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x," + " IRQ %02x, APIC ID %x, APIC INT %02x\n", + mp_irq->irqtype, mp_irq->irqflag & 3, + (mp_irq->irqflag >> 2) & 3, mp_irq->srcbus, + mp_irq->srcbusirq, mp_irq->dstapic, mp_irq->dstirq); +} + +#else /* CONFIG_X86_IO_APIC */ +static inline void __init MP_bus_info(struct mpc_bus *m) {} +static inline void __init MP_ioapic_info(struct mpc_ioapic *m) {} +#endif /* CONFIG_X86_IO_APIC */ + +static void __init MP_lintsrc_info(struct mpc_lintsrc *m) +{ + apic_printk(APIC_VERBOSE, "Lint: type %d, pol %d, trig %d, bus %02x," + " IRQ %02x, APIC ID %x, APIC LINT %02x\n", + m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbusid, + m->srcbusirq, m->destapic, m->destapiclint); +} + +/* + * Read/parse the MPC + */ +static int __init smp_check_mpc(struct mpc_table *mpc, char *oem, char *str) +{ + + if (memcmp(mpc->signature, MPC_SIGNATURE, 4)) { + printk(KERN_ERR "MPTABLE: bad signature [%c%c%c%c]!\n", + mpc->signature[0], mpc->signature[1], + mpc->signature[2], mpc->signature[3]); + return 0; + } + if (mpf_checksum((unsigned char *)mpc, mpc->length)) { + printk(KERN_ERR "MPTABLE: checksum error!\n"); + return 0; + } + if (mpc->spec != 0x01 && mpc->spec != 0x04) { + printk(KERN_ERR "MPTABLE: bad table version (%d)!!\n", + mpc->spec); + return 0; + } + if (!mpc->lapic) { + printk(KERN_ERR "MPTABLE: null local APIC address!\n"); + return 0; + } + memcpy(oem, mpc->oem, 8); + oem[8] = 0; + printk(KERN_INFO "MPTABLE: OEM ID: %s\n", oem); + + memcpy(str, mpc->productid, 12); + str[12] = 0; + + printk(KERN_INFO "MPTABLE: Product ID: %s\n", str); + + printk(KERN_INFO "MPTABLE: APIC at: 0x%X\n", mpc->lapic); + + return 1; +} + +static void skip_entry(unsigned char **ptr, int *count, int size) +{ + *ptr += size; + *count += size; +} + +static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt) +{ + printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n" + "type %x\n", *mpt); + print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16, + 1, mpc, mpc->length, 1); +} + +void __init default_smp_read_mpc_oem(struct mpc_table *mpc) { } + +static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) +{ + char str[16]; + char oem[10]; + + int count = sizeof(*mpc); + unsigned char *mpt = ((unsigned char *)mpc) + count; + + if (!smp_check_mpc(mpc, oem, str)) + return 0; + +#ifdef CONFIG_X86_32 + generic_mps_oem_check(mpc, oem, str); +#endif + /* Initialize the lapic mapping */ + if (!acpi_lapic) + register_lapic_address(mpc->lapic); + + if (early) + return 1; + + if (mpc->oemptr) + 
x86_init.mpparse.smp_read_mpc_oem(mpc); + + /* + * Now process the configuration blocks. + */ + x86_init.mpparse.mpc_record(0); + + while (count < mpc->length) { + switch (*mpt) { + case MP_PROCESSOR: + /* ACPI may have already provided this data */ + if (!acpi_lapic) + MP_processor_info((struct mpc_cpu *)mpt); + skip_entry(&mpt, &count, sizeof(struct mpc_cpu)); + break; + case MP_BUS: + MP_bus_info((struct mpc_bus *)mpt); + skip_entry(&mpt, &count, sizeof(struct mpc_bus)); + break; + case MP_IOAPIC: + MP_ioapic_info((struct mpc_ioapic *)mpt); + skip_entry(&mpt, &count, sizeof(struct mpc_ioapic)); + break; + case MP_INTSRC: + mp_save_irq((struct mpc_intsrc *)mpt); + skip_entry(&mpt, &count, sizeof(struct mpc_intsrc)); + break; + case MP_LINTSRC: + MP_lintsrc_info((struct mpc_lintsrc *)mpt); + skip_entry(&mpt, &count, sizeof(struct mpc_lintsrc)); + break; + default: + /* wrong mptable */ + smp_dump_mptable(mpc, mpt); + count = mpc->length; + break; + } + x86_init.mpparse.mpc_record(1); + } + + if (!num_processors) + printk(KERN_ERR "MPTABLE: no processors registered!\n"); + return num_processors; +} + +#ifdef CONFIG_X86_IO_APIC + +static int __init ELCR_trigger(unsigned int irq) +{ + unsigned int port; + + port = 0x4d0 + (irq >> 3); + return (inb(port) >> (irq & 7)) & 1; +} + +static void __init construct_default_ioirq_mptable(int mpc_default_type) +{ + struct mpc_intsrc intsrc; + int i; + int ELCR_fallback = 0; + + intsrc.type = MP_INTSRC; + intsrc.irqflag = 0; /* conforming */ + intsrc.srcbus = 0; + intsrc.dstapic = mpc_ioapic_id(0); + + intsrc.irqtype = mp_INT; + + /* + * If true, we have an ISA/PCI system with no IRQ entries + * in the MP table. To prevent the PCI interrupts from being set up + * incorrectly, we try to use the ELCR. The sanity check to see if + * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can + * never be level sensitive, so we simply see if the ELCR agrees. + * If it does, we assume it's valid. + */ + if (mpc_default_type == 5) { + printk(KERN_INFO "ISA/PCI bus type with no IRQ information... " + "falling back to ELCR\n"); + + if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || + ELCR_trigger(13)) + printk(KERN_ERR "ELCR contains invalid data... " + "not using ELCR\n"); + else { + printk(KERN_INFO + "Using ELCR to identify PCI interrupts\n"); + ELCR_fallback = 1; + } + } + + for (i = 0; i < 16; i++) { + switch (mpc_default_type) { + case 2: + if (i == 0 || i == 13) + continue; /* IRQ0 & IRQ13 not connected */ + /* fall through */ + default: + if (i == 2) + continue; /* IRQ2 is never connected */ + } + + if (ELCR_fallback) { + /* + * If the ELCR indicates a level-sensitive interrupt, we + * copy that information over to the MP table in the + * irqflag field (level sensitive, active high polarity). + */ + if (ELCR_trigger(i)) + intsrc.irqflag = 13; + else + intsrc.irqflag = 0; + } + + intsrc.srcbusirq = i; + intsrc.dstirq = i ? 
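
The irqflag value 13 used above decodes per the MP specification: the low two bits are polarity and the next two are trigger mode (0 = conforms to bus, 1 = active-high/edge, 3 = active-low/level), so 13 = 0b1101 means active-high polarity with level trigger, matching the comment. A small decode sketch:

/* MP INT entry flags: bits 1:0 polarity, bits 3:2 trigger mode.
 * 13 = 0b1101 -> polarity 1 (active high), trigger 3 (level).
 * 0x0f -> polarity 3 (active low), trigger 3 (level), typical PCI INTx,
 * which is what the later irqflag == 0x0f tests in this file look for. */
static void decode_irqflag(unsigned short irqflag, int *pol, int *trig)
{
        *pol  = irqflag & 3;
        *trig = (irqflag >> 2) & 3;
}
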
i : 2; /* IRQ0 to INTIN2 */ + mp_save_irq(&intsrc); + } + + intsrc.irqtype = mp_ExtINT; + intsrc.srcbusirq = 0; + intsrc.dstirq = 0; /* 8259A to INTIN0 */ + mp_save_irq(&intsrc); +} + + +static void __init construct_ioapic_table(int mpc_default_type) +{ + struct mpc_ioapic ioapic; + struct mpc_bus bus; + + bus.type = MP_BUS; + bus.busid = 0; + switch (mpc_default_type) { + default: + printk(KERN_ERR "???\nUnknown standard configuration %d\n", + mpc_default_type); + /* fall through */ + case 1: + case 5: + memcpy(bus.bustype, "ISA ", 6); + break; + case 2: + case 6: + case 3: + memcpy(bus.bustype, "EISA ", 6); + break; + } + MP_bus_info(&bus); + if (mpc_default_type > 4) { + bus.busid = 1; + memcpy(bus.bustype, "PCI ", 6); + MP_bus_info(&bus); + } + + ioapic.type = MP_IOAPIC; + ioapic.apicid = 2; + ioapic.apicver = mpc_default_type > 4 ? 0x10 : 0x01; + ioapic.flags = MPC_APIC_USABLE; + ioapic.apicaddr = IO_APIC_DEFAULT_PHYS_BASE; + MP_ioapic_info(&ioapic); + + /* + * We set up most of the low 16 IO-APIC pins according to MPS rules. + */ + construct_default_ioirq_mptable(mpc_default_type); +} +#else +static inline void __init construct_ioapic_table(int mpc_default_type) { } +#endif + +static inline void __init construct_default_ISA_mptable(int mpc_default_type) +{ + struct mpc_cpu processor; + struct mpc_lintsrc lintsrc; + int linttypes[2] = { mp_ExtINT, mp_NMI }; + int i; + + /* + * local APIC has default address + */ + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; + + /* + * 2 CPUs, numbered 0 & 1. + */ + processor.type = MP_PROCESSOR; + /* Either an integrated APIC or a discrete 82489DX. */ + processor.apicver = mpc_default_type > 4 ? 0x10 : 0x01; + processor.cpuflag = CPU_ENABLED; + processor.cpufeature = (boot_cpu_data.x86 << 8) | + (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask; + processor.featureflag = boot_cpu_data.x86_capability[0]; + processor.reserved[0] = 0; + processor.reserved[1] = 0; + for (i = 0; i < 2; i++) { + processor.apicid = i; + MP_processor_info(&processor); + } + + construct_ioapic_table(mpc_default_type); + + lintsrc.type = MP_LINTSRC; + lintsrc.irqflag = 0; /* conforming */ + lintsrc.srcbusid = 0; + lintsrc.srcbusirq = 0; + lintsrc.destapic = MP_APIC_ALL; + for (i = 0; i < 2; i++) { + lintsrc.irqtype = linttypes[i]; + lintsrc.destapiclint = i; + MP_lintsrc_info(&lintsrc); + } +} + +static struct mpf_intel *mpf_found; + +static unsigned long __init get_mpc_size(unsigned long physptr) +{ + struct mpc_table *mpc; + unsigned long size; + + mpc = early_ioremap(physptr, PAGE_SIZE); + size = mpc->length; + early_iounmap(mpc, PAGE_SIZE); + apic_printk(APIC_VERBOSE, " mpc: %lx-%lx\n", physptr, physptr + size); + + return size; +} + +static int __init check_physptr(struct mpf_intel *mpf, unsigned int early) +{ + struct mpc_table *mpc; + unsigned long size; + + size = get_mpc_size(mpf->physptr); + mpc = early_ioremap(mpf->physptr, size); + /* + * Read the physical hardware table. Anything here will + * override the defaults. + */ + if (!smp_read_mpc(mpc, early)) { +#ifdef CONFIG_X86_LOCAL_APIC + smp_found_config = 0; +#endif + printk(KERN_ERR "BIOS bug, MP table errors detected!...\n" + "... disabling SMP support. (tell your hw vendor)\n"); + early_iounmap(mpc, size); + return -1; + } + early_iounmap(mpc, size); + + if (early) + return -1; + +#ifdef CONFIG_X86_IO_APIC + /* + * If there are no explicit MP IRQ entries, then we are + * broken. We set up most of the low 16 IO-APIC pins to + * ISA defaults and hope it will work. 
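
The synthesized CPU entry packs the CPUID signature the way the MP spec expects: stepping in the low nibble, model above it, family above that. A tiny sketch of the packing done by construct_default_ISA_mptable above:

/* cpufeature layout: bits 11:8 family, 7:4 model, 3:0 stepping.
 * e.g. family 6, model 14, stepping 8 -> 0x6e8 */
static unsigned int pack_cpu_signature(unsigned int family,
                                       unsigned int model,
                                       unsigned int stepping)
{
        return (family << 8) | (model << 4) | (stepping & 0xf);
}
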
+ */ + if (!mp_irq_entries) { + struct mpc_bus bus; + + printk(KERN_ERR "BIOS bug, no explicit IRQ entries, " + "using default mptable. (tell your hw vendor)\n"); + + bus.type = MP_BUS; + bus.busid = 0; + memcpy(bus.bustype, "ISA ", 6); + MP_bus_info(&bus); + + construct_default_ioirq_mptable(0); + } +#endif + + return 0; +} + +/* + * Scan the memory blocks for an SMP configuration block. + */ +void __init default_get_smp_config(unsigned int early) +{ + struct mpf_intel *mpf = mpf_found; + + if (!mpf) + return; + + if (acpi_lapic && early) + return; + + /* + * MPS doesn't support hyperthreading, aka only have + * thread 0 apic id in MPS table + */ + if (acpi_lapic && acpi_ioapic) + return; + + printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", + mpf->specification); +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32) + if (mpf->feature2 & (1 << 7)) { + printk(KERN_INFO " IMCR and PIC compatibility mode.\n"); + pic_mode = 1; + } else { + printk(KERN_INFO " Virtual Wire compatibility mode.\n"); + pic_mode = 0; + } +#endif + /* + * Now see if we need to read further. + */ + if (mpf->feature1 != 0) { + if (early) { + /* + * local APIC has default address + */ + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; + return; + } + + printk(KERN_INFO "Default MP configuration #%d\n", + mpf->feature1); + construct_default_ISA_mptable(mpf->feature1); + + } else if (mpf->physptr) { + if (check_physptr(mpf, early)) + return; + } else + BUG(); + + if (!early) + printk(KERN_INFO "Processors: %d\n", num_processors); + /* + * Only use the first configuration found. + */ +} + +static void __init smp_reserve_memory(struct mpf_intel *mpf) +{ + memblock_reserve(mpf->physptr, get_mpc_size(mpf->physptr)); +} + +static int __init smp_scan_config(unsigned long base, unsigned long length) +{ + unsigned int *bp = phys_to_virt(base); + struct mpf_intel *mpf; + unsigned long mem; + + apic_printk(APIC_VERBOSE, "Scan for SMP in [mem %#010lx-%#010lx]\n", + base, base + length - 1); + BUILD_BUG_ON(sizeof(*mpf) != 16); + + while (length > 0) { + mpf = (struct mpf_intel *)bp; + if ((*bp == SMP_MAGIC_IDENT) && + (mpf->length == 1) && + !mpf_checksum((unsigned char *)bp, 16) && + ((mpf->specification == 1) + || (mpf->specification == 4))) { +#ifdef CONFIG_X86_LOCAL_APIC + smp_found_config = 1; +#endif + mpf_found = mpf; + + printk(KERN_INFO "found SMP MP-table at [mem %#010llx-%#010llx] mapped at [%p]\n", + (unsigned long long) virt_to_phys(mpf), + (unsigned long long) virt_to_phys(mpf) + + sizeof(*mpf) - 1, mpf); + + mem = virt_to_phys(mpf); + memblock_reserve(mem, sizeof(*mpf)); + if (mpf->physptr) + smp_reserve_memory(mpf); + + return 1; + } + bp += 4; + length -= 16; + } + return 0; +} + +void __init default_find_smp_config(void) +{ + unsigned int address; + + /* + * FIXME: Linux assumes you have 640K of base ram.. + * this continues the error... + * + * 1) Scan the bottom 1K for a signature + * 2) Scan the top 1K of base RAM + * 3) Scan the 64K of bios + */ + if (smp_scan_config(0x0, 0x400) || + smp_scan_config(639 * 0x400, 0x400) || + smp_scan_config(0xF0000, 0x10000)) + return; + /* + * If it is an SMP machine we should know now, unless the + * configuration is in an EISA bus machine with an + * extended bios data area. + * + * there is a real-mode segmented pointer pointing to the + * 4K EBDA area at 0x40E, calculate and scan it here. + * + * NOTE! 
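
get_bios_ebda, used at the end of this scan, amounts to reading the real-mode segment word the BIOS leaves at physical 0x40E and scaling it by 16 to get the EBDA's physical base. A sketch of what the helper in asm/bios_ebda.h boils down to (an assumption about its definition, shown for illustration):

/* EBDA base: segment word at phys 0x40E, shifted left by 4.
 * e.g. segment 0x9fc0 -> EBDA at 0x9fc00, just under 640K. */
static unsigned int ebda_base(void)
{
        unsigned int seg = *(unsigned short *)phys_to_virt(0x40E);

        return seg << 4;
}
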
There are Linux loaders that will corrupt the EBDA + * area, and as such this kind of SMP config may be less + * trustworthy, simply because the SMP table may have been + * stomped on during early boot. These loaders are buggy and + * should be fixed. + * + * MP1.4 SPEC states to only scan first 1K of 4K EBDA. + */ + + address = get_bios_ebda(); + if (address) + smp_scan_config(address, 0x400); +} + +#ifdef CONFIG_X86_IO_APIC +static u8 __initdata irq_used[MAX_IRQ_SOURCES]; + +static int __init get_MP_intsrc_index(struct mpc_intsrc *m) +{ + int i; + + if (m->irqtype != mp_INT) + return 0; + + if (m->irqflag != 0x0f) + return 0; + + /* not legacy */ + + for (i = 0; i < mp_irq_entries; i++) { + if (mp_irqs[i].irqtype != mp_INT) + continue; + + if (mp_irqs[i].irqflag != 0x0f) + continue; + + if (mp_irqs[i].srcbus != m->srcbus) + continue; + if (mp_irqs[i].srcbusirq != m->srcbusirq) + continue; + if (irq_used[i]) { + /* already claimed */ + return -2; + } + irq_used[i] = 1; + return i; + } + + /* not found */ + return -1; +} + +#define SPARE_SLOT_NUM 20 + +static struct mpc_intsrc __initdata *m_spare[SPARE_SLOT_NUM]; + +static void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) +{ + int i; + + apic_printk(APIC_VERBOSE, "OLD "); + print_mp_irq_info(m); + + i = get_MP_intsrc_index(m); + if (i > 0) { + memcpy(m, &mp_irqs[i], sizeof(*m)); + apic_printk(APIC_VERBOSE, "NEW "); + print_mp_irq_info(&mp_irqs[i]); + return; + } + if (!i) { + /* legacy, do nothing */ + return; + } + if (*nr_m_spare < SPARE_SLOT_NUM) { + /* + * not found (-1), or duplicated (-2) are invalid entries, + * we need to use the slot later + */ + m_spare[*nr_m_spare] = m; + *nr_m_spare += 1; + } +} + +static int __init +check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count) +{ + if (!mpc_new_phys || count <= mpc_new_length) { + WARN(1, "update_mptable: No spare slots (length: %x)\n", count); + return -1; + } + + return 0; +} +#else /* CONFIG_X86_IO_APIC */ +static +inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {} +#endif /* CONFIG_X86_IO_APIC */ + +static int __init replace_intsrc_all(struct mpc_table *mpc, + unsigned long mpc_new_phys, + unsigned long mpc_new_length) +{ +#ifdef CONFIG_X86_IO_APIC + int i; +#endif + int count = sizeof(*mpc); + int nr_m_spare = 0; + unsigned char *mpt = ((unsigned char *)mpc) + count; + + printk(KERN_INFO "mpc_length %x\n", mpc->length); + while (count < mpc->length) { + switch (*mpt) { + case MP_PROCESSOR: + skip_entry(&mpt, &count, sizeof(struct mpc_cpu)); + break; + case MP_BUS: + skip_entry(&mpt, &count, sizeof(struct mpc_bus)); + break; + case MP_IOAPIC: + skip_entry(&mpt, &count, sizeof(struct mpc_ioapic)); + break; + case MP_INTSRC: + check_irq_src((struct mpc_intsrc *)mpt, &nr_m_spare); + skip_entry(&mpt, &count, sizeof(struct mpc_intsrc)); + break; + case MP_LINTSRC: + skip_entry(&mpt, &count, sizeof(struct mpc_lintsrc)); + break; + default: + /* wrong mptable */ + smp_dump_mptable(mpc, mpt); + goto out; + } + } + +#ifdef CONFIG_X86_IO_APIC + for (i = 0; i < mp_irq_entries; i++) { + if (irq_used[i]) + continue; + + if (mp_irqs[i].irqtype != mp_INT) + continue; + + if (mp_irqs[i].irqflag != 0x0f) + continue; + + if (nr_m_spare > 0) { + apic_printk(APIC_VERBOSE, "*NEW* found\n"); + nr_m_spare--; + memcpy(m_spare[nr_m_spare], &mp_irqs[i], sizeof(mp_irqs[i])); + m_spare[nr_m_spare] = NULL; + } else { + struct mpc_intsrc *m = (struct mpc_intsrc *)mpt; + count += sizeof(struct mpc_intsrc); + if (check_slot(mpc_new_phys, 
mpc_new_length, count) < 0) + goto out; + memcpy(m, &mp_irqs[i], sizeof(*m)); + mpc->length = count; + mpt += sizeof(struct mpc_intsrc); + } + print_mp_irq_info(&mp_irqs[i]); + } +#endif +out: + /* update checksum */ + mpc->checksum = 0; + mpc->checksum -= mpf_checksum((unsigned char *)mpc, mpc->length); + + return 0; +} + +int enable_update_mptable; + +static int __init update_mptable_setup(char *str) +{ + enable_update_mptable = 1; +#ifdef CONFIG_PCI + pci_routeirq = 1; +#endif + return 0; +} +early_param("update_mptable", update_mptable_setup); + +static unsigned long __initdata mpc_new_phys; +static unsigned long mpc_new_length __initdata = 4096; + +/* alloc_mptable or alloc_mptable=4k */ +static int __initdata alloc_mptable; +static int __init parse_alloc_mptable_opt(char *p) +{ + enable_update_mptable = 1; +#ifdef CONFIG_PCI + pci_routeirq = 1; +#endif + alloc_mptable = 1; + if (!p) + return 0; + mpc_new_length = memparse(p, &p); + return 0; +} +early_param("alloc_mptable", parse_alloc_mptable_opt); + +void __init early_reserve_e820_mpc_new(void) +{ + if (enable_update_mptable && alloc_mptable) + mpc_new_phys = early_reserve_e820(mpc_new_length, 4); +} + +static int __init update_mp_table(void) +{ + char str[16]; + char oem[10]; + struct mpf_intel *mpf; + struct mpc_table *mpc, *mpc_new; + + if (!enable_update_mptable) + return 0; + + mpf = mpf_found; + if (!mpf) + return 0; + + /* + * Now see if we need to go further. + */ + if (mpf->feature1 != 0) + return 0; + + if (!mpf->physptr) + return 0; + + mpc = phys_to_virt(mpf->physptr); + + if (!smp_check_mpc(mpc, oem, str)) + return 0; + + printk(KERN_INFO "mpf: %llx\n", (u64)virt_to_phys(mpf)); + printk(KERN_INFO "physptr: %x\n", mpf->physptr); + + if (mpc_new_phys && mpc->length > mpc_new_length) { + mpc_new_phys = 0; + printk(KERN_INFO "mpc_new_length is %ld, please use alloc_mptable=8k\n", + mpc_new_length); + } + + if (!mpc_new_phys) { + unsigned char old, new; + /* check if we can change the position */ + mpc->checksum = 0; + old = mpf_checksum((unsigned char *)mpc, mpc->length); + mpc->checksum = 0xff; + new = mpf_checksum((unsigned char *)mpc, mpc->length); + if (old == new) { + printk(KERN_INFO "mpc is readonly, please try alloc_mptable instead\n"); + return 0; + } + printk(KERN_INFO "use in-position replacing\n"); + } else { + mpf->physptr = mpc_new_phys; + mpc_new = phys_to_virt(mpc_new_phys); + memcpy(mpc_new, mpc, mpc->length); + mpc = mpc_new; + /* check if we can modify that */ + if (mpc_new_phys - mpf->physptr) { + struct mpf_intel *mpf_new; + /* steal 16 bytes from [0, 1k) */ + printk(KERN_INFO "mpf new: %x\n", 0x400 - 16); + mpf_new = phys_to_virt(0x400 - 16); + memcpy(mpf_new, mpf, 16); + mpf = mpf_new; + mpf->physptr = mpc_new_phys; + } + mpf->checksum = 0; + mpf->checksum -= mpf_checksum((unsigned char *)mpf, 16); + printk(KERN_INFO "physptr new: %x\n", mpf->physptr); + } + + /* + * only replace the one with mp_INT and + * MP_IRQ_TRIGGER_LEVEL|MP_IRQ_POLARITY_LOW, + * already in mp_irqs , stored by ... and mp_config_acpi_gsi, + * may need pci=routeirq for all coverage + */ + replace_intsrc_all(mpc, mpc_new_phys, mpc_new_length); + + return 0; +} + +late_initcall(update_mp_table); diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c new file mode 100644 index 00000000000..c9603ac80de --- /dev/null +++ b/arch/x86/kernel/msr.c @@ -0,0 +1,305 @@ +/* ----------------------------------------------------------------------- * + * + * Copyright 2000-2008 H. 
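
update_mp_table's writability probe is worth spelling out: it stores two different values into the checksum byte and recomputes the sum both times; if the sums agree, the writes never landed, so the table lives in ROM and in-place editing is abandoned. As a sketch:

/* Sketch of the read-only test used above: if poking the checksum
 * byte doesn't change the computed sum, the table is not writable. */
static int mp_table_is_readonly(struct mpc_table *mpc)
{
        unsigned char a, b;

        mpc->checksum = 0;
        a = mpf_checksum((unsigned char *)mpc, mpc->length);
        mpc->checksum = 0xff;
        b = mpf_checksum((unsigned char *)mpc, mpc->length);
        return a == b;  /* the 0xff never stuck -> ROM */
}
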
Peter Anvin - All Rights Reserved + * Copyright 2009 Intel Corporation; author: H. Peter Anvin + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139, + * USA; either version 2 of the License, or (at your option) any later + * version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * x86 MSR access device + * + * This device is accessed by lseek() to the appropriate register number + * and then read/write in chunks of 8 bytes. A larger size means multiple + * reads or writes of the same register. + * + * This driver uses /dev/cpu/%d/msr where %d is the minor number, and on + * an SMP box will direct the access to CPU %d. + */ + +#include <linux/module.h> + +#include <linux/types.h> +#include <linux/errno.h> +#include <linux/fcntl.h> +#include <linux/init.h> +#include <linux/poll.h> +#include <linux/smp.h> +#include <linux/major.h> +#include <linux/fs.h> +#include <linux/device.h> +#include <linux/cpu.h> +#include <linux/notifier.h> +#include <linux/uaccess.h> +#include <linux/gfp.h> + +#include <asm/processor.h> +#include <asm/msr.h> + +static struct class *msr_class; + +static loff_t msr_seek(struct file *file, loff_t offset, int orig) +{ + loff_t ret; + struct inode *inode = file_inode(file); + + mutex_lock(&inode->i_mutex); + switch (orig) { + case 0: + file->f_pos = offset; + ret = file->f_pos; + break; + case 1: + file->f_pos += offset; + ret = file->f_pos; + break; + default: + ret = -EINVAL; + } + mutex_unlock(&inode->i_mutex); + return ret; +} + +static ssize_t msr_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + u32 __user *tmp = (u32 __user *) buf; + u32 data[2]; + u32 reg = *ppos; + int cpu = iminor(file_inode(file)); + int err = 0; + ssize_t bytes = 0; + + if (count % 8) + return -EINVAL; /* Invalid chunk size */ + + for (; count; count -= 8) { + err = rdmsr_safe_on_cpu(cpu, reg, &data[0], &data[1]); + if (err) + break; + if (copy_to_user(tmp, &data, 8)) { + err = -EFAULT; + break; + } + tmp += 2; + bytes += 8; + } + + return bytes ? bytes : err; +} + +static ssize_t msr_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + const u32 __user *tmp = (const u32 __user *)buf; + u32 data[2]; + u32 reg = *ppos; + int cpu = iminor(file_inode(file)); + int err = 0; + ssize_t bytes = 0; + + if (count % 8) + return -EINVAL; /* Invalid chunk size */ + + for (; count; count -= 8) { + if (copy_from_user(&data, tmp, 8)) { + err = -EFAULT; + break; + } + err = wrmsr_safe_on_cpu(cpu, reg, data[0], data[1]); + if (err) + break; + tmp += 2; + bytes += 8; + } + + return bytes ? 
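
Because the file offset selects the register and transfers go in 8-byte units, reading an MSR from userspace is a single pread(). A userspace sketch reading IA32_TSC (MSR 0x10) on CPU 0; it needs CAP_SYS_RAWIO and the msr driver loaded:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        uint64_t val;
        int fd = open("/dev/cpu/0/msr", O_RDONLY);

        /* offset = MSR number; reads must be a multiple of 8 bytes */
        if (fd < 0 || pread(fd, &val, 8, 0x10) != 8) {
                perror("rdmsr");
                return 1;
        }
        printf("IA32_TSC = %#llx\n", (unsigned long long)val);
        close(fd);
        return 0;
}
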
bytes : err; +} + +static long msr_ioctl(struct file *file, unsigned int ioc, unsigned long arg) +{ + u32 __user *uregs = (u32 __user *)arg; + u32 regs[8]; + int cpu = iminor(file_inode(file)); + int err; + + switch (ioc) { + case X86_IOC_RDMSR_REGS: + if (!(file->f_mode & FMODE_READ)) { + err = -EBADF; + break; + } + if (copy_from_user(®s, uregs, sizeof regs)) { + err = -EFAULT; + break; + } + err = rdmsr_safe_regs_on_cpu(cpu, regs); + if (err) + break; + if (copy_to_user(uregs, ®s, sizeof regs)) + err = -EFAULT; + break; + + case X86_IOC_WRMSR_REGS: + if (!(file->f_mode & FMODE_WRITE)) { + err = -EBADF; + break; + } + if (copy_from_user(®s, uregs, sizeof regs)) { + err = -EFAULT; + break; + } + err = wrmsr_safe_regs_on_cpu(cpu, regs); + if (err) + break; + if (copy_to_user(uregs, ®s, sizeof regs)) + err = -EFAULT; + break; + + default: + err = -ENOTTY; + break; + } + + return err; +} + +static int msr_open(struct inode *inode, struct file *file) +{ + unsigned int cpu = iminor(file_inode(file)); + struct cpuinfo_x86 *c; + + if (!capable(CAP_SYS_RAWIO)) + return -EPERM; + + if (cpu >= nr_cpu_ids || !cpu_online(cpu)) + return -ENXIO; /* No such CPU */ + + c = &cpu_data(cpu); + if (!cpu_has(c, X86_FEATURE_MSR)) + return -EIO; /* MSR not supported */ + + return 0; +} + +/* + * File operations we support + */ +static const struct file_operations msr_fops = { + .owner = THIS_MODULE, + .llseek = msr_seek, + .read = msr_read, + .write = msr_write, + .open = msr_open, + .unlocked_ioctl = msr_ioctl, + .compat_ioctl = msr_ioctl, +}; + +static int msr_device_create(int cpu) +{ + struct device *dev; + + dev = device_create(msr_class, NULL, MKDEV(MSR_MAJOR, cpu), NULL, + "msr%d", cpu); + return IS_ERR(dev) ? PTR_ERR(dev) : 0; +} + +static void msr_device_destroy(int cpu) +{ + device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu)); +} + +static int msr_class_cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + int err = 0; + + switch (action) { + case CPU_UP_PREPARE: + err = msr_device_create(cpu); + break; + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + case CPU_DEAD: + msr_device_destroy(cpu); + break; + } + return notifier_from_errno(err); +} + +static struct notifier_block __refdata msr_class_cpu_notifier = { + .notifier_call = msr_class_cpu_callback, +}; + +static char *msr_devnode(struct device *dev, umode_t *mode) +{ + return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt)); +} + +static int __init msr_init(void) +{ + int i, err = 0; + i = 0; + + if (__register_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr", &msr_fops)) { + printk(KERN_ERR "msr: unable to get major %d for msr\n", + MSR_MAJOR); + err = -EBUSY; + goto out; + } + msr_class = class_create(THIS_MODULE, "msr"); + if (IS_ERR(msr_class)) { + err = PTR_ERR(msr_class); + goto out_chrdev; + } + msr_class->devnode = msr_devnode; + + cpu_notifier_register_begin(); + for_each_online_cpu(i) { + err = msr_device_create(i); + if (err != 0) + goto out_class; + } + __register_hotcpu_notifier(&msr_class_cpu_notifier); + cpu_notifier_register_done(); + + err = 0; + goto out; + +out_class: + i = 0; + for_each_online_cpu(i) + msr_device_destroy(i); + cpu_notifier_register_done(); + class_destroy(msr_class); +out_chrdev: + __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr"); +out: + return err; +} + +static void __exit msr_exit(void) +{ + int cpu = 0; + + cpu_notifier_register_begin(); + for_each_online_cpu(cpu) + msr_device_destroy(cpu); + class_destroy(msr_class); + 
__unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr"); + __unregister_hotcpu_notifier(&msr_class_cpu_notifier); + cpu_notifier_register_done(); +} + +module_init(msr_init); +module_exit(msr_exit) + +MODULE_AUTHOR("H. Peter Anvin <hpa@zytor.com>"); +MODULE_DESCRIPTION("x86 generic MSR driver"); +MODULE_LICENSE("GPL"); diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c new file mode 100644 index 00000000000..c3e985d1751 --- /dev/null +++ b/arch/x86/kernel/nmi.c @@ -0,0 +1,562 @@ +/* + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs + * Copyright (C) 2011 Don Zickus Red Hat, Inc. + * + * Pentium III FXSR, SSE support + * Gareth Hughes <gareth@valinux.com>, May 2000 + */ + +/* + * Handle hardware traps and faults. + */ +#include <linux/spinlock.h> +#include <linux/kprobes.h> +#include <linux/kdebug.h> +#include <linux/nmi.h> +#include <linux/debugfs.h> +#include <linux/delay.h> +#include <linux/hardirq.h> +#include <linux/slab.h> +#include <linux/export.h> + +#if defined(CONFIG_EDAC) +#include <linux/edac.h> +#endif + +#include <linux/atomic.h> +#include <asm/traps.h> +#include <asm/mach_traps.h> +#include <asm/nmi.h> +#include <asm/x86_init.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/nmi.h> + +struct nmi_desc { + spinlock_t lock; + struct list_head head; +}; + +static struct nmi_desc nmi_desc[NMI_MAX] = +{ + { + .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[0].lock), + .head = LIST_HEAD_INIT(nmi_desc[0].head), + }, + { + .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[1].lock), + .head = LIST_HEAD_INIT(nmi_desc[1].head), + }, + { + .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[2].lock), + .head = LIST_HEAD_INIT(nmi_desc[2].head), + }, + { + .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[3].lock), + .head = LIST_HEAD_INIT(nmi_desc[3].head), + }, + +}; + +struct nmi_stats { + unsigned int normal; + unsigned int unknown; + unsigned int external; + unsigned int swallow; +}; + +static DEFINE_PER_CPU(struct nmi_stats, nmi_stats); + +static int ignore_nmis; + +int unknown_nmi_panic; +/* + * Prevent NMI reason port (0x61) being accessed simultaneously, can + * only be used in NMI handler. + */ +static DEFINE_RAW_SPINLOCK(nmi_reason_lock); + +static int __init setup_unknown_nmi_panic(char *str) +{ + unknown_nmi_panic = 1; + return 1; +} +__setup("unknown_nmi_panic", setup_unknown_nmi_panic); + +#define nmi_to_desc(type) (&nmi_desc[type]) + +static u64 nmi_longest_ns = 1 * NSEC_PER_MSEC; + +static int __init nmi_warning_debugfs(void) +{ + debugfs_create_u64("nmi_longest_ns", 0644, + arch_debugfs_dir, &nmi_longest_ns); + return 0; +} +fs_initcall(nmi_warning_debugfs); + +static void nmi_max_handler(struct irq_work *w) +{ + struct nmiaction *a = container_of(w, struct nmiaction, irq_work); + int remainder_ns, decimal_msecs; + u64 whole_msecs = ACCESS_ONCE(a->max_duration); + + remainder_ns = do_div(whole_msecs, (1000 * 1000)); + decimal_msecs = remainder_ns / 1000; + + printk_ratelimited(KERN_INFO + "INFO: NMI handler (%ps) took too long to run: %lld.%03d msecs\n", + a->handler, whole_msecs, decimal_msecs); +} + +static int nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b) +{ + struct nmi_desc *desc = nmi_to_desc(type); + struct nmiaction *a; + int handled=0; + + rcu_read_lock(); + + /* + * NMIs are edge-triggered, which means if you have enough + * of them concurrently, you can lose some because only one + * can be latched at any given time. Walk the whole list + * to handle those situations. 
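
nmi_handle walks every registered nmiaction of the given type, so a handler must decide quickly whether the NMI is its own. The register_nmi_handler() wrapper macro from asm/nmi.h supplies the struct nmiaction (the selftest later in this series uses it the same way). A hedged sketch of a client; the device check is hypothetical:

#include <asm/nmi.h>

/* NMI_DONE passes the NMI to the next handler on the list;
 * NMI_HANDLED claims it and feeds the 'handled' count below. */
static int my_nmi_handler(unsigned int type, struct pt_regs *regs)
{
        if (!my_device_asserted_nmi())          /* hypothetical, NMI-safe */
                return NMI_DONE;
        /* quiesce the source; take no locks a non-NMI path could hold */
        return NMI_HANDLED;
}

/* setup:    register_nmi_handler(NMI_LOCAL, my_nmi_handler, 0, "mydev"); */
/* teardown: unregister_nmi_handler(NMI_LOCAL, "mydev");                  */
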
+ */ + list_for_each_entry_rcu(a, &desc->head, list) { + int thishandled; + u64 delta; + + delta = sched_clock(); + thishandled = a->handler(type, regs); + handled += thishandled; + delta = sched_clock() - delta; + trace_nmi_handler(a->handler, (int)delta, thishandled); + + if (delta < nmi_longest_ns || delta < a->max_duration) + continue; + + a->max_duration = delta; + irq_work_queue(&a->irq_work); + } + + rcu_read_unlock(); + + /* return total number of NMI events handled */ + return handled; +} +NOKPROBE_SYMBOL(nmi_handle); + +int __register_nmi_handler(unsigned int type, struct nmiaction *action) +{ + struct nmi_desc *desc = nmi_to_desc(type); + unsigned long flags; + + if (!action->handler) + return -EINVAL; + + init_irq_work(&action->irq_work, nmi_max_handler); + + spin_lock_irqsave(&desc->lock, flags); + + /* + * most handlers of type NMI_UNKNOWN never return because + * they just assume the NMI is theirs. Just a sanity check + * to manage expectations + */ + WARN_ON_ONCE(type == NMI_UNKNOWN && !list_empty(&desc->head)); + WARN_ON_ONCE(type == NMI_SERR && !list_empty(&desc->head)); + WARN_ON_ONCE(type == NMI_IO_CHECK && !list_empty(&desc->head)); + + /* + * some handlers need to be executed first otherwise a fake + * event confuses some handlers (kdump uses this flag) + */ + if (action->flags & NMI_FLAG_FIRST) + list_add_rcu(&action->list, &desc->head); + else + list_add_tail_rcu(&action->list, &desc->head); + + spin_unlock_irqrestore(&desc->lock, flags); + return 0; +} +EXPORT_SYMBOL(__register_nmi_handler); + +void unregister_nmi_handler(unsigned int type, const char *name) +{ + struct nmi_desc *desc = nmi_to_desc(type); + struct nmiaction *n; + unsigned long flags; + + spin_lock_irqsave(&desc->lock, flags); + + list_for_each_entry_rcu(n, &desc->head, list) { + /* + * the name passed in to describe the nmi handler + * is used as the lookup key + */ + if (!strcmp(n->name, name)) { + WARN(in_nmi(), + "Trying to free NMI (%s) from NMI context!\n", n->name); + list_del_rcu(&n->list); + break; + } + } + + spin_unlock_irqrestore(&desc->lock, flags); + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(unregister_nmi_handler); + +static void +pci_serr_error(unsigned char reason, struct pt_regs *regs) +{ + /* check to see if anyone registered against these types of errors */ + if (nmi_handle(NMI_SERR, regs, false)) + return; + + pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n", + reason, smp_processor_id()); + + /* + * On some machines, PCI SERR line is used to report memory + * errors. EDAC makes use of it. + */ +#if defined(CONFIG_EDAC) + if (edac_handler_set()) { + edac_atomic_assert_error(); + return; + } +#endif + + if (panic_on_unrecovered_nmi) + panic("NMI: Not continuing"); + + pr_emerg("Dazed and confused, but trying to continue\n"); + + /* Clear and disable the PCI SERR error line. */ + reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_SERR; + outb(reason, NMI_REASON_PORT); +} +NOKPROBE_SYMBOL(pci_serr_error); + +static void +io_check_error(unsigned char reason, struct pt_regs *regs) +{ + unsigned long i; + + /* check to see if anyone registered against these types of errors */ + if (nmi_handle(NMI_IO_CHECK, regs, false)) + return; + + pr_emerg( + "NMI: IOCK error (debug interrupt?) 
for reason %02x on CPU %d.\n",
+ reason, smp_processor_id());
+ show_regs(regs);
+
+ if (panic_on_io_nmi)
+ panic("NMI IOCK error: Not continuing");
+
+ /* Re-enable the IOCK line, wait for a few seconds */
+ reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_IOCHK;
+ outb(reason, NMI_REASON_PORT);
+
+ i = 20000;
+ while (--i) {
+ touch_nmi_watchdog();
+ udelay(100);
+ }
+
+ reason &= ~NMI_REASON_CLEAR_IOCHK;
+ outb(reason, NMI_REASON_PORT);
+}
+NOKPROBE_SYMBOL(io_check_error);
+
+static void
+unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
+{
+ int handled;
+
+ /*
+ * Use 'false' as back-to-back NMIs are dealt with one level up.
+ * Of course this makes having multiple 'unknown' handlers useless,
+ * as only the first one is ever run (unless it can actually determine
+ * if it caused the NMI).
+ */
+ handled = nmi_handle(NMI_UNKNOWN, regs, false);
+ if (handled) {
+ __this_cpu_add(nmi_stats.unknown, handled);
+ return;
+ }
+
+ __this_cpu_add(nmi_stats.unknown, 1);
+
+ pr_emerg("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
+ reason, smp_processor_id());
+
+ pr_emerg("Do you have a strange power saving mode enabled?\n");
+ if (unknown_nmi_panic || panic_on_unrecovered_nmi)
+ panic("NMI: Not continuing");
+
+ pr_emerg("Dazed and confused, but trying to continue\n");
+}
+NOKPROBE_SYMBOL(unknown_nmi_error);
+
+static DEFINE_PER_CPU(bool, swallow_nmi);
+static DEFINE_PER_CPU(unsigned long, last_nmi_rip);
+
+static void default_do_nmi(struct pt_regs *regs)
+{
+ unsigned char reason = 0;
+ int handled;
+ bool b2b = false;
+
+ /*
+ * CPU-specific NMI must be processed before non-CPU-specific
+ * NMI, otherwise we may lose it, because the CPU-specific
+ * NMI cannot be detected/processed on other CPUs.
+ */
+
+ /*
+ * Back-to-back NMIs are interesting because they can be either
+ * two NMIs or more than two NMIs (anything over two is dropped
+ * due to NMI being edge-triggered). If this is the second half
+ * of the back-to-back NMI, assume we dropped things and process
+ * more handlers. Otherwise, reset the 'swallow' NMI behaviour.
+ */
+ if (regs->ip == __this_cpu_read(last_nmi_rip))
+ b2b = true;
+ else
+ __this_cpu_write(swallow_nmi, false);
+
+ __this_cpu_write(last_nmi_rip, regs->ip);
+
+ handled = nmi_handle(NMI_LOCAL, regs, b2b);
+ __this_cpu_add(nmi_stats.normal, handled);
+ if (handled) {
+ /*
+ * There are cases when an NMI handler handles multiple
+ * events in the current NMI. One of these events may
+ * be queued to fire in the next NMI. Because the event is
+ * already handled, the next NMI will result in an unknown
+ * NMI. Instead, let's flag this for a potential NMI to
+ * swallow.
+ */
+ if (handled > 1)
+ __this_cpu_write(swallow_nmi, true);
+ return;
+ }
+
+ /* Non-CPU-specific NMI: NMI sources can be processed on any CPU */
+ raw_spin_lock(&nmi_reason_lock);
+ reason = x86_platform.get_nmi_reason();
+
+ if (reason & NMI_REASON_MASK) {
+ if (reason & NMI_REASON_SERR)
+ pci_serr_error(reason, regs);
+ else if (reason & NMI_REASON_IOCHK)
+ io_check_error(reason, regs);
+#ifdef CONFIG_X86_32
+ /*
+ * Reassert NMI in case it became active
+ * meanwhile as it's edge-triggered:
+ */
+ reassert_nmi();
+#endif
+ __this_cpu_add(nmi_stats.external, 1);
+ raw_spin_unlock(&nmi_reason_lock);
+ return;
+ }
+ raw_spin_unlock(&nmi_reason_lock);
+
+ /*
+ * Only one NMI can be latched at a time. To handle
+ * this we may process multiple nmi handlers at once to
+ * cover the case where an NMI is dropped.
+ * The downside to this approach is that we may process an NMI prematurely,
+ * while its real NMI is sitting latched. This will cause
+ * an unknown NMI on the next run of the NMI processing.
+ *
+ * We tried to flag that condition above, by setting the
+ * swallow_nmi flag when we process more than one event.
+ * This condition is also only present on the second half
+ * of a back-to-back NMI, so we flag that condition too.
+ *
+ * If both are true, we assume we already processed this
+ * NMI previously and we swallow it. Otherwise we reset
+ * the logic.
+ *
+ * There are scenarios where we may accidentally swallow
+ * a 'real' unknown NMI. For example, while processing
+ * a perf NMI another perf NMI comes in along with a
+ * 'real' unknown NMI. These two NMIs get combined into
+ * one (as described above). When the next NMI gets
+ * processed, it will be flagged by perf as handled, but
+ * no one will know that there was a 'real' unknown NMI sent
+ * as well. As a result it gets swallowed. Or if the first
+ * perf NMI returns two events handled then the second
+ * NMI will get eaten by the logic below, again losing a
+ * 'real' unknown NMI. But this is the best we can do
+ * for now.
+ */
+ if (b2b && __this_cpu_read(swallow_nmi))
+ __this_cpu_add(nmi_stats.swallow, 1);
+ else
+ unknown_nmi_error(reason, regs);
+}
+NOKPROBE_SYMBOL(default_do_nmi);
+
+/*
+ * NMIs can hit breakpoints which will cause the CPU to lose its
+ * NMI context when the breakpoint does an iret.
+ */
+#ifdef CONFIG_X86_32
+/*
+ * For i386, NMIs use the same stack as the kernel, and we can
+ * add a workaround to the iret problem in C (preventing nested
+ * NMIs if an NMI takes a trap). Simply have 3 states the NMI
+ * can be in:
+ *
+ * 1) not running
+ * 2) executing
+ * 3) latched
+ *
+ * When no NMI is in progress, it is in the "not running" state.
+ * When an NMI comes in, it goes into the "executing" state.
+ * Normally, if another NMI is triggered, it does not interrupt
+ * the running NMI and the HW will simply latch it so that when
+ * the first NMI finishes, it will restart the second NMI.
+ * (Note, the latch is binary, thus multiple NMIs triggering,
+ * when one is running, are ignored. Only one NMI is restarted.)
+ *
+ * If an NMI hits a breakpoint that executes an iret, another
+ * NMI can preempt it. We do not want to allow this new NMI
+ * to run, but we want to execute it when the first one finishes.
+ * We set the state to "latched", and the exit of the first NMI will
+ * perform a dec_return; if the result is zero (NOT_RUNNING), then
+ * it will simply exit the NMI handler. If not, the dec_return
+ * would have set the state to NMI_EXECUTING (what we want it to
+ * be when we are running). In this case, we simply jump back
+ * to rerun the NMI handler again, and restart the 'latched' NMI.
+ *
+ * No trap (breakpoint or page fault) should be hit before nmi_restart,
+ * thus there is no race between the first check of state for NOT_RUNNING
+ * and setting it to NMI_EXECUTING. The HW will prevent nested NMIs
+ * at this point.
+ *
+ * In case the NMI takes a page fault, we need to save off the CR2
+ * because the NMI could have preempted another page fault and corrupted
+ * the CR2 that is about to be read. As nested NMIs must be restarted
+ * and they cannot take breakpoints or page faults, the update of the
+ * CR2 must be done before converting the nmi state back to NOT_RUNNING.
+ * Otherwise, there would be a race of another nested NMI coming in + * after setting state to NOT_RUNNING but before updating the nmi_cr2. + */ +enum nmi_states { + NMI_NOT_RUNNING = 0, + NMI_EXECUTING, + NMI_LATCHED, +}; +static DEFINE_PER_CPU(enum nmi_states, nmi_state); +static DEFINE_PER_CPU(unsigned long, nmi_cr2); + +#define nmi_nesting_preprocess(regs) \ + do { \ + if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) { \ + this_cpu_write(nmi_state, NMI_LATCHED); \ + return; \ + } \ + this_cpu_write(nmi_state, NMI_EXECUTING); \ + this_cpu_write(nmi_cr2, read_cr2()); \ + } while (0); \ + nmi_restart: + +#define nmi_nesting_postprocess() \ + do { \ + if (unlikely(this_cpu_read(nmi_cr2) != read_cr2())) \ + write_cr2(this_cpu_read(nmi_cr2)); \ + if (this_cpu_dec_return(nmi_state)) \ + goto nmi_restart; \ + } while (0) +#else /* x86_64 */ +/* + * In x86_64 things are a bit more difficult. This has the same problem + * where an NMI hitting a breakpoint that calls iret will remove the + * NMI context, allowing a nested NMI to enter. What makes this more + * difficult is that both NMIs and breakpoints have their own stack. + * When a new NMI or breakpoint is executed, the stack is set to a fixed + * point. If an NMI is nested, it will have its stack set at that same + * fixed address that the first NMI had, and will start corrupting the + * stack. This is handled in entry_64.S, but the same problem exists with + * the breakpoint stack. + * + * If a breakpoint is being processed, and the debug stack is being used, + * if an NMI comes in and also hits a breakpoint, the stack pointer + * will be set to the same fixed address as the breakpoint that was + * interrupted, causing that stack to be corrupted. To handle this case, + * check if the stack that was interrupted is the debug stack, and if + * so, change the IDT so that new breakpoints will use the current stack + * and not switch to the fixed address. On return of the NMI, switch back + * to the original IDT. + */ +static DEFINE_PER_CPU(int, update_debug_stack); + +static inline void nmi_nesting_preprocess(struct pt_regs *regs) +{ + /* + * If we interrupted a breakpoint, it is possible that + * the nmi handler will have breakpoints too. We need to + * change the IDT such that breakpoints that happen here + * continue to use the NMI stack. 
+ */ + if (unlikely(is_debug_stack(regs->sp))) { + debug_stack_set_zero(); + this_cpu_write(update_debug_stack, 1); + } +} + +static inline void nmi_nesting_postprocess(void) +{ + if (unlikely(this_cpu_read(update_debug_stack))) { + debug_stack_reset(); + this_cpu_write(update_debug_stack, 0); + } +} +#endif + +dotraplinkage notrace void +do_nmi(struct pt_regs *regs, long error_code) +{ + nmi_nesting_preprocess(regs); + + nmi_enter(); + + inc_irq_stat(__nmi_count); + + if (!ignore_nmis) + default_do_nmi(regs); + + nmi_exit(); + + /* On i386, may loop back to preprocess */ + nmi_nesting_postprocess(); +} +NOKPROBE_SYMBOL(do_nmi); + +void stop_nmi(void) +{ + ignore_nmis++; +} + +void restart_nmi(void) +{ + ignore_nmis--; +} + +/* reset the back-to-back NMI logic */ +void local_touch_nmi(void) +{ + __this_cpu_write(last_nmi_rip, 0); +} +EXPORT_SYMBOL_GPL(local_touch_nmi); diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c new file mode 100644 index 00000000000..6d9582ec032 --- /dev/null +++ b/arch/x86/kernel/nmi_selftest.c @@ -0,0 +1,183 @@ +/* + * arch/x86/kernel/nmi-selftest.c + * + * Testsuite for NMI: IPIs + * + * Started by Don Zickus: + * (using lib/locking-selftest.c as a guide) + * + * Copyright (C) 2011 Red Hat, Inc., Don Zickus <dzickus@redhat.com> + */ + +#include <linux/smp.h> +#include <linux/cpumask.h> +#include <linux/delay.h> +#include <linux/init.h> +#include <linux/percpu.h> + +#include <asm/apic.h> +#include <asm/nmi.h> + +#define SUCCESS 0 +#define FAILURE 1 +#define TIMEOUT 2 + +static int __initdata nmi_fail; + +/* check to see if NMI IPIs work on this machine */ +static DECLARE_BITMAP(nmi_ipi_mask, NR_CPUS) __initdata; + +static int __initdata testcase_total; +static int __initdata testcase_successes; +static int __initdata expected_testcase_failures; +static int __initdata unexpected_testcase_failures; +static int __initdata unexpected_testcase_unknowns; + +static int __init nmi_unk_cb(unsigned int val, struct pt_regs *regs) +{ + unexpected_testcase_unknowns++; + return NMI_HANDLED; +} + +static void __init init_nmi_testsuite(void) +{ + /* trap all the unknown NMIs we may generate */ + register_nmi_handler(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk", + __initdata); +} + +static void __init cleanup_nmi_testsuite(void) +{ + unregister_nmi_handler(NMI_UNKNOWN, "nmi_selftest_unk"); +} + +static int __init test_nmi_ipi_callback(unsigned int val, struct pt_regs *regs) +{ + int cpu = raw_smp_processor_id(); + + if (cpumask_test_and_clear_cpu(cpu, to_cpumask(nmi_ipi_mask))) + return NMI_HANDLED; + + return NMI_DONE; +} + +static void __init test_nmi_ipi(struct cpumask *mask) +{ + unsigned long timeout; + + if (register_nmi_handler(NMI_LOCAL, test_nmi_ipi_callback, + NMI_FLAG_FIRST, "nmi_selftest", __initdata)) { + nmi_fail = FAILURE; + return; + } + + /* sync above data before sending NMI */ + wmb(); + + apic->send_IPI_mask(mask, NMI_VECTOR); + + /* Don't wait longer than a second */ + timeout = USEC_PER_SEC; + while (!cpumask_empty(mask) && timeout--) + udelay(1); + + /* What happens if we timeout, do we still unregister?? 
*/ + unregister_nmi_handler(NMI_LOCAL, "nmi_selftest"); + + if (!timeout) + nmi_fail = TIMEOUT; + return; +} + +static void __init remote_ipi(void) +{ + cpumask_copy(to_cpumask(nmi_ipi_mask), cpu_online_mask); + cpumask_clear_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask)); + if (!cpumask_empty(to_cpumask(nmi_ipi_mask))) + test_nmi_ipi(to_cpumask(nmi_ipi_mask)); +} + +static void __init local_ipi(void) +{ + cpumask_clear(to_cpumask(nmi_ipi_mask)); + cpumask_set_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask)); + test_nmi_ipi(to_cpumask(nmi_ipi_mask)); +} + +static void __init reset_nmi(void) +{ + nmi_fail = 0; +} + +static void __init dotest(void (*testcase_fn)(void), int expected) +{ + testcase_fn(); + /* + * Filter out expected failures: + */ + if (nmi_fail != expected) { + unexpected_testcase_failures++; + + if (nmi_fail == FAILURE) + printk(KERN_CONT "FAILED |"); + else if (nmi_fail == TIMEOUT) + printk(KERN_CONT "TIMEOUT|"); + else + printk(KERN_CONT "ERROR |"); + dump_stack(); + } else { + testcase_successes++; + printk(KERN_CONT " ok |"); + } + testcase_total++; + + reset_nmi(); +} + +static inline void __init print_testname(const char *testname) +{ + printk("%12s:", testname); +} + +void __init nmi_selftest(void) +{ + init_nmi_testsuite(); + + /* + * Run the testsuite: + */ + printk("----------------\n"); + printk("| NMI testsuite:\n"); + printk("--------------------\n"); + + print_testname("remote IPI"); + dotest(remote_ipi, SUCCESS); + printk(KERN_CONT "\n"); + print_testname("local IPI"); + dotest(local_ipi, SUCCESS); + printk(KERN_CONT "\n"); + + cleanup_nmi_testsuite(); + + if (unexpected_testcase_failures) { + printk("--------------------\n"); + printk("BUG: %3d unexpected failures (out of %3d) - debugging disabled! |\n", + unexpected_testcase_failures, testcase_total); + printk("-----------------------------------------------------------------\n"); + } else if (expected_testcase_failures && testcase_successes) { + printk("--------------------\n"); + printk("%3d out of %3d testcases failed, as expected. |\n", + expected_testcase_failures, testcase_total); + printk("----------------------------------------------------\n"); + } else if (expected_testcase_failures && !testcase_successes) { + printk("--------------------\n"); + printk("All %3d testcases failed, as expected. |\n", + expected_testcase_failures); + printk("----------------------------------------\n"); + } else { + printk("--------------------\n"); + printk("Good, all %3d testcases passed! |\n", + testcase_successes); + printk("---------------------------------\n"); + } +} diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c new file mode 100644 index 00000000000..bbb6c731634 --- /dev/null +++ b/arch/x86/kernel/paravirt-spinlocks.c @@ -0,0 +1,20 @@ +/* + * Split spinlock implementation out into its own file, so it can be + * compiled in a FTRACE-compatible way. 
+ */ +#include <linux/spinlock.h> +#include <linux/module.h> +#include <linux/jump_label.h> + +#include <asm/paravirt.h> + +struct pv_lock_ops pv_lock_ops = { +#ifdef CONFIG_SMP + .lock_spinning = __PV_IS_CALLEE_SAVE(paravirt_nop), + .unlock_kick = paravirt_nop, +#endif +}; +EXPORT_SYMBOL(pv_lock_ops); + +struct static_key paravirt_ticketlocks_enabled = STATIC_KEY_INIT_FALSE; +EXPORT_SYMBOL(paravirt_ticketlocks_enabled); diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c new file mode 100644 index 00000000000..548d25f00c9 --- /dev/null +++ b/arch/x86/kernel/paravirt.c @@ -0,0 +1,489 @@ +/* Paravirtualization interfaces + Copyright (C) 2006 Rusty Russell IBM Corporation + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + + 2007 - x86_64 support added by Glauber de Oliveira Costa, Red Hat Inc +*/ + +#include <linux/errno.h> +#include <linux/module.h> +#include <linux/efi.h> +#include <linux/bcd.h> +#include <linux/highmem.h> +#include <linux/kprobes.h> + +#include <asm/bug.h> +#include <asm/paravirt.h> +#include <asm/debugreg.h> +#include <asm/desc.h> +#include <asm/setup.h> +#include <asm/pgtable.h> +#include <asm/time.h> +#include <asm/pgalloc.h> +#include <asm/irq.h> +#include <asm/delay.h> +#include <asm/fixmap.h> +#include <asm/apic.h> +#include <asm/tlbflush.h> +#include <asm/timer.h> +#include <asm/special_insns.h> + +/* nop stub */ +void _paravirt_nop(void) +{ +} + +/* identity function, which can be inlined */ +u32 _paravirt_ident_32(u32 x) +{ + return x; +} + +u64 _paravirt_ident_64(u64 x) +{ + return x; +} + +void __init default_banner(void) +{ + printk(KERN_INFO "Booting paravirtualized kernel on %s\n", + pv_info.name); +} + +/* Undefined instruction for dealing with missing ops pointers. 
 */ +static const unsigned char ud2a[] = { 0x0f, 0x0b }; + +unsigned paravirt_patch_nop(void) +{ + return 0; +} + +unsigned paravirt_patch_ignore(unsigned len) +{ + return len; +} + +struct branch { + unsigned char opcode; + u32 delta; +} __attribute__((packed)); + +unsigned paravirt_patch_call(void *insnbuf, + const void *target, u16 tgt_clobbers, + unsigned long addr, u16 site_clobbers, + unsigned len) +{ + struct branch *b = insnbuf; + unsigned long delta = (unsigned long)target - (addr+5); + + if (tgt_clobbers & ~site_clobbers) + return len; /* target would clobber too much for this site */ + if (len < 5) + return len; /* call too long for patch site */ + + b->opcode = 0xe8; /* call */ + b->delta = delta; + BUILD_BUG_ON(sizeof(*b) != 5); + + return 5; +} + +unsigned paravirt_patch_jmp(void *insnbuf, const void *target, + unsigned long addr, unsigned len) +{ + struct branch *b = insnbuf; + unsigned long delta = (unsigned long)target - (addr+5); + + if (len < 5) + return len; /* jmp too long for patch site */ + + b->opcode = 0xe9; /* jmp */ + b->delta = delta; + + return 5; +} + +/* Neat trick to map patch type back to the call within the + * corresponding structure. */ +static void *get_call_destination(u8 type) +{ + struct paravirt_patch_template tmpl = { + .pv_init_ops = pv_init_ops, + .pv_time_ops = pv_time_ops, + .pv_cpu_ops = pv_cpu_ops, + .pv_irq_ops = pv_irq_ops, + .pv_apic_ops = pv_apic_ops, + .pv_mmu_ops = pv_mmu_ops, +#ifdef CONFIG_PARAVIRT_SPINLOCKS + .pv_lock_ops = pv_lock_ops, +#endif + }; + return *((void **)&tmpl + type); +} + +unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf, + unsigned long addr, unsigned len) +{ + void *opfunc = get_call_destination(type); + unsigned ret; + + if (opfunc == NULL) + /* If there's no function, patch it with a ud2a (BUG) */ + ret = paravirt_patch_insns(insnbuf, len, ud2a, ud2a+sizeof(ud2a)); + else if (opfunc == _paravirt_nop) + /* If the operation is a nop, then nop the callsite */ + ret = paravirt_patch_nop(); + + /* identity functions just return their single argument */ + else if (opfunc == _paravirt_ident_32) + ret = paravirt_patch_ident_32(insnbuf, len); + else if (opfunc == _paravirt_ident_64) + ret = paravirt_patch_ident_64(insnbuf, len); + + else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) || + type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit) || + type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret32) || + type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret64)) + /* If operation requires a jmp, then jmp */ + ret = paravirt_patch_jmp(insnbuf, opfunc, addr, len); + else + /* Otherwise call the function; assume target could + clobber any caller-save reg */ + ret = paravirt_patch_call(insnbuf, opfunc, CLBR_ANY, + addr, clobbers, len); + + return ret; +} + +unsigned paravirt_patch_insns(void *insnbuf, unsigned len, + const char *start, const char *end) +{ + unsigned insn_len = end - start; + + if (insn_len > len || start == NULL) + insn_len = len; + else + memcpy(insnbuf, start, insn_len); + + return insn_len; +} + +static void native_flush_tlb(void) +{ + __native_flush_tlb(); +} + +/* + * Global pages have to be flushed a bit differently. Not a real + * performance problem because this does not happen often.
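For reference, the delta stored by paravirt_patch_call()/paravirt_patch_jmp() above is the ordinary x86 rel32 displacement, measured from the first byte after the 5-byte instruction. A standalone sketch of the same encoding, with made-up addresses:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Encode a 5-byte "call rel32" at 'addr' that lands on 'target'. */
static void encode_call(uint8_t buf[5], uint32_t addr, uint32_t target)
{
	uint32_t delta = target - (addr + 5);	/* relative to the next instruction */

	buf[0] = 0xe8;				/* call opcode; 0xe9 would be jmp */
	memcpy(&buf[1], &delta, sizeof(delta));	/* x86 stores it little-endian */
}

int main(void)
{
	uint8_t buf[5];

	encode_call(buf, 0x1000, 0x2000);	/* delta = 0x2000 - 0x1005 = 0xffb */
	printf("%02x %02x %02x %02x %02x\n",	/* prints: e8 fb 0f 00 00 */
	       buf[0], buf[1], buf[2], buf[3], buf[4]);
	return 0;
}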
+ */ +static void native_flush_tlb_global(void) +{ + __native_flush_tlb_global(); +} + +static void native_flush_tlb_single(unsigned long addr) +{ + __native_flush_tlb_single(addr); +} + +struct static_key paravirt_steal_enabled; +struct static_key paravirt_steal_rq_enabled; + +static u64 native_steal_clock(int cpu) +{ + return 0; +} + +/* These are in entry.S */ +extern void native_iret(void); +extern void native_irq_enable_sysexit(void); +extern void native_usergs_sysret32(void); +extern void native_usergs_sysret64(void); + +static struct resource reserve_ioports = { + .start = 0, + .end = IO_SPACE_LIMIT, + .name = "paravirt-ioport", + .flags = IORESOURCE_IO | IORESOURCE_BUSY, +}; + +/* + * Reserve the whole legacy IO space to prevent any legacy drivers + * from wasting time probing for their hardware. This is a fairly + * brute-force approach to disabling all non-virtual drivers. + * + * Note that this must be called very early to have any effect. + */ +int paravirt_disable_iospace(void) +{ + return request_resource(&ioport_resource, &reserve_ioports); +} + +static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LAZY_NONE; + +static inline void enter_lazy(enum paravirt_lazy_mode mode) +{ + BUG_ON(this_cpu_read(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE); + + this_cpu_write(paravirt_lazy_mode, mode); +} + +static void leave_lazy(enum paravirt_lazy_mode mode) +{ + BUG_ON(this_cpu_read(paravirt_lazy_mode) != mode); + + this_cpu_write(paravirt_lazy_mode, PARAVIRT_LAZY_NONE); +} + +void paravirt_enter_lazy_mmu(void) +{ + enter_lazy(PARAVIRT_LAZY_MMU); +} + +void paravirt_leave_lazy_mmu(void) +{ + leave_lazy(PARAVIRT_LAZY_MMU); +} + +void paravirt_flush_lazy_mmu(void) +{ + preempt_disable(); + + if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { + arch_leave_lazy_mmu_mode(); + arch_enter_lazy_mmu_mode(); + } + + preempt_enable(); +} + +void paravirt_start_context_switch(struct task_struct *prev) +{ + BUG_ON(preemptible()); + + if (this_cpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) { + arch_leave_lazy_mmu_mode(); + set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES); + } + enter_lazy(PARAVIRT_LAZY_CPU); +} + +void paravirt_end_context_switch(struct task_struct *next) +{ + BUG_ON(preemptible()); + + leave_lazy(PARAVIRT_LAZY_CPU); + + if (test_and_clear_ti_thread_flag(task_thread_info(next), TIF_LAZY_MMU_UPDATES)) + arch_enter_lazy_mmu_mode(); +} + +enum paravirt_lazy_mode paravirt_get_lazy_mode(void) +{ + if (in_interrupt()) + return PARAVIRT_LAZY_NONE; + + return this_cpu_read(paravirt_lazy_mode); +} + +struct pv_info pv_info = { + .name = "bare hardware", + .paravirt_enabled = 0, + .kernel_rpl = 0, + .shared_kernel_pmd = 1, /* Only used when CONFIG_X86_PAE is set */ + +#ifdef CONFIG_X86_64 + .extra_user_64bit_cs = __USER_CS, +#endif +}; + +struct pv_init_ops pv_init_ops = { + .patch = native_patch, +}; + +struct pv_time_ops pv_time_ops = { + .sched_clock = native_sched_clock, + .steal_clock = native_steal_clock, +}; + +__visible struct pv_irq_ops pv_irq_ops = { + .save_fl = __PV_IS_CALLEE_SAVE(native_save_fl), + .restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl), + .irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable), + .irq_enable = __PV_IS_CALLEE_SAVE(native_irq_enable), + .safe_halt = native_safe_halt, + .halt = native_halt, +#ifdef CONFIG_X86_64 + .adjust_exception_frame = paravirt_nop, +#endif +}; + +__visible struct pv_cpu_ops pv_cpu_ops = { + .cpuid = native_cpuid, + .get_debugreg = native_get_debugreg, + .set_debugreg = 
native_set_debugreg, + .clts = native_clts, + .read_cr0 = native_read_cr0, + .write_cr0 = native_write_cr0, + .read_cr4 = native_read_cr4, + .read_cr4_safe = native_read_cr4_safe, + .write_cr4 = native_write_cr4, +#ifdef CONFIG_X86_64 + .read_cr8 = native_read_cr8, + .write_cr8 = native_write_cr8, +#endif + .wbinvd = native_wbinvd, + .read_msr = native_read_msr_safe, + .write_msr = native_write_msr_safe, + .read_tsc = native_read_tsc, + .read_pmc = native_read_pmc, + .read_tscp = native_read_tscp, + .load_tr_desc = native_load_tr_desc, + .set_ldt = native_set_ldt, + .load_gdt = native_load_gdt, + .load_idt = native_load_idt, + .store_idt = native_store_idt, + .store_tr = native_store_tr, + .load_tls = native_load_tls, +#ifdef CONFIG_X86_64 + .load_gs_index = native_load_gs_index, +#endif + .write_ldt_entry = native_write_ldt_entry, + .write_gdt_entry = native_write_gdt_entry, + .write_idt_entry = native_write_idt_entry, + + .alloc_ldt = paravirt_nop, + .free_ldt = paravirt_nop, + + .load_sp0 = native_load_sp0, + +#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) + .irq_enable_sysexit = native_irq_enable_sysexit, +#endif +#ifdef CONFIG_X86_64 +#ifdef CONFIG_IA32_EMULATION + .usergs_sysret32 = native_usergs_sysret32, +#endif + .usergs_sysret64 = native_usergs_sysret64, +#endif + .iret = native_iret, + .swapgs = native_swapgs, + + .set_iopl_mask = native_set_iopl_mask, + .io_delay = native_io_delay, + + .start_context_switch = paravirt_nop, + .end_context_switch = paravirt_nop, +}; + +/* At this point, native_get/set_debugreg has real function entries */ +NOKPROBE_SYMBOL(native_get_debugreg); +NOKPROBE_SYMBOL(native_set_debugreg); +NOKPROBE_SYMBOL(native_load_idt); + +struct pv_apic_ops pv_apic_ops = { +#ifdef CONFIG_X86_LOCAL_APIC + .startup_ipi_hook = paravirt_nop, +#endif +}; + +#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_PAE) +/* 32-bit pagetable entries */ +#define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_32) +#else +/* 64-bit pagetable entries */ +#define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_64) +#endif + +struct pv_mmu_ops pv_mmu_ops = { + + .read_cr2 = native_read_cr2, + .write_cr2 = native_write_cr2, + .read_cr3 = native_read_cr3, + .write_cr3 = native_write_cr3, + + .flush_tlb_user = native_flush_tlb, + .flush_tlb_kernel = native_flush_tlb_global, + .flush_tlb_single = native_flush_tlb_single, + .flush_tlb_others = native_flush_tlb_others, + + .pgd_alloc = __paravirt_pgd_alloc, + .pgd_free = paravirt_nop, + + .alloc_pte = paravirt_nop, + .alloc_pmd = paravirt_nop, + .alloc_pud = paravirt_nop, + .release_pte = paravirt_nop, + .release_pmd = paravirt_nop, + .release_pud = paravirt_nop, + + .set_pte = native_set_pte, + .set_pte_at = native_set_pte_at, + .set_pmd = native_set_pmd, + .set_pmd_at = native_set_pmd_at, + .pte_update = paravirt_nop, + .pte_update_defer = paravirt_nop, + .pmd_update = paravirt_nop, + .pmd_update_defer = paravirt_nop, + + .ptep_modify_prot_start = __ptep_modify_prot_start, + .ptep_modify_prot_commit = __ptep_modify_prot_commit, + +#if PAGETABLE_LEVELS >= 3 +#ifdef CONFIG_X86_PAE + .set_pte_atomic = native_set_pte_atomic, + .pte_clear = native_pte_clear, + .pmd_clear = native_pmd_clear, +#endif + .set_pud = native_set_pud, + + .pmd_val = PTE_IDENT, + .make_pmd = PTE_IDENT, + +#if PAGETABLE_LEVELS == 4 + .pud_val = PTE_IDENT, + .make_pud = PTE_IDENT, + + .set_pgd = native_set_pgd, +#endif +#endif /* PAGETABLE_LEVELS >= 3 */ + + .pte_val = PTE_IDENT, + .pgd_val = PTE_IDENT, + + .make_pte = PTE_IDENT, + .make_pgd = PTE_IDENT, + + 
.dup_mmap = paravirt_nop, + .exit_mmap = paravirt_nop, + .activate_mm = paravirt_nop, + + .lazy_mode = { + .enter = paravirt_nop, + .leave = paravirt_nop, + .flush = paravirt_nop, + }, + + .set_fixmap = native_set_fixmap, +}; + +EXPORT_SYMBOL_GPL(pv_time_ops); +EXPORT_SYMBOL (pv_cpu_ops); +EXPORT_SYMBOL (pv_mmu_ops); +EXPORT_SYMBOL_GPL(pv_apic_ops); +EXPORT_SYMBOL_GPL(pv_info); +EXPORT_SYMBOL (pv_irq_ops); diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c new file mode 100644 index 00000000000..d9f32e6d6ab --- /dev/null +++ b/arch/x86/kernel/paravirt_patch_32.c @@ -0,0 +1,61 @@ +#include <asm/paravirt.h> + +DEF_NATIVE(pv_irq_ops, irq_disable, "cli"); +DEF_NATIVE(pv_irq_ops, irq_enable, "sti"); +DEF_NATIVE(pv_irq_ops, restore_fl, "push %eax; popf"); +DEF_NATIVE(pv_irq_ops, save_fl, "pushf; pop %eax"); +DEF_NATIVE(pv_cpu_ops, iret, "iret"); +DEF_NATIVE(pv_cpu_ops, irq_enable_sysexit, "sti; sysexit"); +DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax"); +DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3"); +DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax"); +DEF_NATIVE(pv_cpu_ops, clts, "clts"); +DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc"); + +unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len) +{ + /* arg in %eax, return in %eax */ + return 0; +} + +unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len) +{ + /* arg in %edx:%eax, return in %edx:%eax */ + return 0; +} + +unsigned native_patch(u8 type, u16 clobbers, void *ibuf, + unsigned long addr, unsigned len) +{ + const unsigned char *start, *end; + unsigned ret; + +#define PATCH_SITE(ops, x) \ + case PARAVIRT_PATCH(ops.x): \ + start = start_##ops##_##x; \ + end = end_##ops##_##x; \ + goto patch_site + switch (type) { + PATCH_SITE(pv_irq_ops, irq_disable); + PATCH_SITE(pv_irq_ops, irq_enable); + PATCH_SITE(pv_irq_ops, restore_fl); + PATCH_SITE(pv_irq_ops, save_fl); + PATCH_SITE(pv_cpu_ops, iret); + PATCH_SITE(pv_cpu_ops, irq_enable_sysexit); + PATCH_SITE(pv_mmu_ops, read_cr2); + PATCH_SITE(pv_mmu_ops, read_cr3); + PATCH_SITE(pv_mmu_ops, write_cr3); + PATCH_SITE(pv_cpu_ops, clts); + PATCH_SITE(pv_cpu_ops, read_tsc); + + patch_site: + ret = paravirt_patch_insns(ibuf, len, start, end); + break; + + default: + ret = paravirt_patch_default(type, clobbers, ibuf, addr, len); + break; + } +#undef PATCH_SITE + return ret; +} diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c new file mode 100644 index 00000000000..a1da6737ba5 --- /dev/null +++ b/arch/x86/kernel/paravirt_patch_64.c @@ -0,0 +1,73 @@ +#include <asm/paravirt.h> +#include <asm/asm-offsets.h> +#include <linux/stringify.h> + +DEF_NATIVE(pv_irq_ops, irq_disable, "cli"); +DEF_NATIVE(pv_irq_ops, irq_enable, "sti"); +DEF_NATIVE(pv_irq_ops, restore_fl, "pushq %rdi; popfq"); +DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax"); +DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax"); +DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax"); +DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3"); +DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)"); +DEF_NATIVE(pv_cpu_ops, clts, "clts"); +DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd"); + +DEF_NATIVE(pv_cpu_ops, irq_enable_sysexit, "swapgs; sti; sysexit"); +DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq"); +DEF_NATIVE(pv_cpu_ops, usergs_sysret32, "swapgs; sysretl"); +DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs"); + +DEF_NATIVE(, mov32, "mov %edi, %eax"); +DEF_NATIVE(, mov64, "mov %rdi, %rax"); + +unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len) +{ + 
return paravirt_patch_insns(insnbuf, len, + start__mov32, end__mov32); +} + +unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len) +{ + return paravirt_patch_insns(insnbuf, len, + start__mov64, end__mov64); +} + +unsigned native_patch(u8 type, u16 clobbers, void *ibuf, + unsigned long addr, unsigned len) +{ + const unsigned char *start, *end; + unsigned ret; + +#define PATCH_SITE(ops, x) \ + case PARAVIRT_PATCH(ops.x): \ + start = start_##ops##_##x; \ + end = end_##ops##_##x; \ + goto patch_site + switch(type) { + PATCH_SITE(pv_irq_ops, restore_fl); + PATCH_SITE(pv_irq_ops, save_fl); + PATCH_SITE(pv_irq_ops, irq_enable); + PATCH_SITE(pv_irq_ops, irq_disable); + PATCH_SITE(pv_cpu_ops, irq_enable_sysexit); + PATCH_SITE(pv_cpu_ops, usergs_sysret32); + PATCH_SITE(pv_cpu_ops, usergs_sysret64); + PATCH_SITE(pv_cpu_ops, swapgs); + PATCH_SITE(pv_mmu_ops, read_cr2); + PATCH_SITE(pv_mmu_ops, read_cr3); + PATCH_SITE(pv_mmu_ops, write_cr3); + PATCH_SITE(pv_cpu_ops, clts); + PATCH_SITE(pv_mmu_ops, flush_tlb_single); + PATCH_SITE(pv_cpu_ops, wbinvd); + + patch_site: + ret = paravirt_patch_insns(ibuf, len, start, end); + break; + + default: + ret = paravirt_patch_default(type, clobbers, ibuf, addr, len); + break; + } +#undef PATCH_SITE + return ret; +} diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c new file mode 100644 index 00000000000..0497f719977 --- /dev/null +++ b/arch/x86/kernel/pci-calgary_64.c @@ -0,0 +1,1607 @@ +/* + * Derived from arch/powerpc/kernel/iommu.c + * + * Copyright IBM Corporation, 2006-2007 + * Copyright (C) 2006 Jon Mason <jdmason@kudzu.us> + * + * Author: Jon Mason <jdmason@kudzu.us> + * Author: Muli Ben-Yehuda <muli@il.ibm.com> + + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
 + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#define pr_fmt(fmt) "Calgary: " fmt + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/mm.h> +#include <linux/spinlock.h> +#include <linux/string.h> +#include <linux/crash_dump.h> +#include <linux/dma-mapping.h> +#include <linux/bitmap.h> +#include <linux/pci_ids.h> +#include <linux/pci.h> +#include <linux/delay.h> +#include <linux/scatterlist.h> +#include <linux/iommu-helper.h> + +#include <asm/iommu.h> +#include <asm/calgary.h> +#include <asm/tce.h> +#include <asm/pci-direct.h> +#include <asm/dma.h> +#include <asm/rio.h> +#include <asm/bios_ebda.h> +#include <asm/x86_init.h> +#include <asm/iommu_table.h> + +#ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT +int use_calgary __read_mostly = 1; +#else +int use_calgary __read_mostly = 0; +#endif /* CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT */ + +#define PCI_DEVICE_ID_IBM_CALGARY 0x02a1 +#define PCI_DEVICE_ID_IBM_CALIOC2 0x0308 + +/* register offsets inside the host bridge space */ +#define CALGARY_CONFIG_REG 0x0108 +#define PHB_CSR_OFFSET 0x0110 /* Channel Status */ +#define PHB_PLSSR_OFFSET 0x0120 +#define PHB_CONFIG_RW_OFFSET 0x0160 +#define PHB_IOBASE_BAR_LOW 0x0170 +#define PHB_IOBASE_BAR_HIGH 0x0180 +#define PHB_MEM_1_LOW 0x0190 +#define PHB_MEM_1_HIGH 0x01A0 +#define PHB_IO_ADDR_SIZE 0x01B0 +#define PHB_MEM_1_SIZE 0x01C0 +#define PHB_MEM_ST_OFFSET 0x01D0 +#define PHB_AER_OFFSET 0x0200 +#define PHB_CONFIG_0_HIGH 0x0220 +#define PHB_CONFIG_0_LOW 0x0230 +#define PHB_CONFIG_0_END 0x0240 +#define PHB_MEM_2_LOW 0x02B0 +#define PHB_MEM_2_HIGH 0x02C0 +#define PHB_MEM_2_SIZE_HIGH 0x02D0 +#define PHB_MEM_2_SIZE_LOW 0x02E0 +#define PHB_DOSHOLE_OFFSET 0x08E0 + +/* CalIOC2 specific */ +#define PHB_SAVIOR_L2 0x0DB0 +#define PHB_PAGE_MIG_CTRL 0x0DA8 +#define PHB_PAGE_MIG_DEBUG 0x0DA0 +#define PHB_ROOT_COMPLEX_STATUS 0x0CB0 + +/* PHB_CONFIG_RW */ +#define PHB_TCE_ENABLE 0x20000000 +#define PHB_SLOT_DISABLE 0x1C000000 +#define PHB_DAC_DISABLE 0x01000000 +#define PHB_MEM2_ENABLE 0x00400000 +#define PHB_MCSR_ENABLE 0x00100000 +/* TAR (Table Address Register) */ +#define TAR_SW_BITS 0x0000ffffffff800fUL +#define TAR_VALID 0x0000000000000008UL +/* CSR (Channel/DMA Status Register) */ +#define CSR_AGENT_MASK 0xffe0ffff +/* CCR (Calgary Configuration Register) */ +#define CCR_2SEC_TIMEOUT 0x000000000000000EUL +/* PMCR/PMDR (Page Migration Control/Debug Registers) */ +#define PMR_SOFTSTOP 0x80000000 +#define PMR_SOFTSTOPFAULT 0x40000000 +#define PMR_HARDSTOP 0x20000000 + +/* + * The maximum PHB bus number.
+ * x3950M2 (rare): 8 chassis, 48 PHBs per chassis = 384 + * x3950M2: 4 chassis, 48 PHBs per chassis = 192 + * x3950 (PCIE): 8 chassis, 32 PHBs per chassis = 256 + * x3950 (PCIX): 8 chassis, 16 PHBs per chassis = 128 + */ +#define MAX_PHB_BUS_NUM 256 + +#define PHBS_PER_CALGARY 4 + +/* register offsets in Calgary's internal register space */ +static const unsigned long tar_offsets[] = { + 0x0580 /* TAR0 */, + 0x0588 /* TAR1 */, + 0x0590 /* TAR2 */, + 0x0598 /* TAR3 */ +}; + +static const unsigned long split_queue_offsets[] = { + 0x4870 /* SPLIT QUEUE 0 */, + 0x5870 /* SPLIT QUEUE 1 */, + 0x6870 /* SPLIT QUEUE 2 */, + 0x7870 /* SPLIT QUEUE 3 */ +}; + +static const unsigned long phb_offsets[] = { + 0x8000 /* PHB0 */, + 0x9000 /* PHB1 */, + 0xA000 /* PHB2 */, + 0xB000 /* PHB3 */ +}; + +/* PHB debug registers */ + +static const unsigned long phb_debug_offsets[] = { + 0x4000 /* PHB 0 DEBUG */, + 0x5000 /* PHB 1 DEBUG */, + 0x6000 /* PHB 2 DEBUG */, + 0x7000 /* PHB 3 DEBUG */ +}; + +/* + * STUFF register for each debug PHB, + * byte 1 = start bus number, byte 2 = end bus number + */ + +#define PHB_DEBUG_STUFF_OFFSET 0x0020 + +#define EMERGENCY_PAGES 32 /* = 128KB */ + +unsigned int specified_table_size = TCE_TABLE_SIZE_UNSPECIFIED; +static int translate_empty_slots __read_mostly = 0; +static int calgary_detected __read_mostly = 0; + +static struct rio_table_hdr *rio_table_hdr __initdata; +static struct scal_detail *scal_devs[MAX_NUMNODES] __initdata; +static struct rio_detail *rio_devs[MAX_NUMNODES * 4] __initdata; + +struct calgary_bus_info { + void *tce_space; + unsigned char translation_disabled; + signed char phbid; + void __iomem *bbar; +}; + +static void calgary_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev); +static void calgary_tce_cache_blast(struct iommu_table *tbl); +static void calgary_dump_error_regs(struct iommu_table *tbl); +static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev); +static void calioc2_tce_cache_blast(struct iommu_table *tbl); +static void calioc2_dump_error_regs(struct iommu_table *tbl); +static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl); +static void get_tce_space_from_tar(void); + +static struct cal_chipset_ops calgary_chip_ops = { + .handle_quirks = calgary_handle_quirks, + .tce_cache_blast = calgary_tce_cache_blast, + .dump_error_regs = calgary_dump_error_regs +}; + +static struct cal_chipset_ops calioc2_chip_ops = { + .handle_quirks = calioc2_handle_quirks, + .tce_cache_blast = calioc2_tce_cache_blast, + .dump_error_regs = calioc2_dump_error_regs +}; + +static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, }; + +static inline int translation_enabled(struct iommu_table *tbl) +{ + /* only PHBs with translation enabled have an IOMMU table */ + return (tbl != NULL); +} + +static void iommu_range_reserve(struct iommu_table *tbl, + unsigned long start_addr, unsigned int npages) +{ + unsigned long index; + unsigned long end; + unsigned long flags; + + index = start_addr >> PAGE_SHIFT; + + /* bail out if we're asked to reserve a region we don't cover */ + if (index >= tbl->it_size) + return; + + end = index + npages; + if (end > tbl->it_size) /* don't go off the table */ + end = tbl->it_size; + + spin_lock_irqsave(&tbl->it_lock, flags); + + bitmap_set(tbl->it_map, index, npages); + + spin_unlock_irqrestore(&tbl->it_lock, flags); +} + +static unsigned long iommu_range_alloc(struct device *dev, + struct iommu_table *tbl, + unsigned int npages) +{ + unsigned long flags; + unsigned long 
offset; + unsigned long boundary_size; + + boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, + PAGE_SIZE) >> PAGE_SHIFT; + + BUG_ON(npages == 0); + + spin_lock_irqsave(&tbl->it_lock, flags); + + offset = iommu_area_alloc(tbl->it_map, tbl->it_size, tbl->it_hint, + npages, 0, boundary_size, 0); + if (offset == ~0UL) { + tbl->chip_ops->tce_cache_blast(tbl); + + offset = iommu_area_alloc(tbl->it_map, tbl->it_size, 0, + npages, 0, boundary_size, 0); + if (offset == ~0UL) { + pr_warn("IOMMU full\n"); + spin_unlock_irqrestore(&tbl->it_lock, flags); + if (panic_on_overflow) + panic("Calgary: fix the allocator.\n"); + else + return DMA_ERROR_CODE; + } + } + + tbl->it_hint = offset + npages; + BUG_ON(tbl->it_hint > tbl->it_size); + + spin_unlock_irqrestore(&tbl->it_lock, flags); + + return offset; +} + +static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl, + void *vaddr, unsigned int npages, int direction) +{ + unsigned long entry; + dma_addr_t ret; + + entry = iommu_range_alloc(dev, tbl, npages); + + if (unlikely(entry == DMA_ERROR_CODE)) { + pr_warn("failed to allocate %u pages in iommu %p\n", + npages, tbl); + return DMA_ERROR_CODE; + } + + /* set the return dma address */ + ret = (entry << PAGE_SHIFT) | ((unsigned long)vaddr & ~PAGE_MASK); + + /* put the TCEs in the HW table */ + tce_build(tbl, entry, npages, (unsigned long)vaddr & PAGE_MASK, + direction); + return ret; +} + +static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, + unsigned int npages) +{ + unsigned long entry; + unsigned long badend; + unsigned long flags; + + /* were we called with bad_dma_address? */ + badend = DMA_ERROR_CODE + (EMERGENCY_PAGES * PAGE_SIZE); + if (unlikely((dma_addr >= DMA_ERROR_CODE) && (dma_addr < badend))) { + WARN(1, KERN_ERR "Calgary: driver tried unmapping bad DMA " + "address 0x%Lx\n", dma_addr); + return; + } + + entry = dma_addr >> PAGE_SHIFT; + + BUG_ON(entry + npages > tbl->it_size); + + tce_free(tbl, entry, npages); + + spin_lock_irqsave(&tbl->it_lock, flags); + + bitmap_clear(tbl->it_map, entry, npages); + + spin_unlock_irqrestore(&tbl->it_lock, flags); +} + +static inline struct iommu_table *find_iommu_table(struct device *dev) +{ + struct pci_dev *pdev; + struct pci_bus *pbus; + struct iommu_table *tbl; + + pdev = to_pci_dev(dev); + + /* search up the device tree for an iommu */ + pbus = pdev->bus; + do { + tbl = pci_iommu(pbus); + if (tbl && tbl->it_busno == pbus->number) + break; + tbl = NULL; + pbus = pbus->parent; + } while (pbus); + + BUG_ON(tbl && (tbl->it_busno != pbus->number)); + + return tbl; +} + +static void calgary_unmap_sg(struct device *dev, struct scatterlist *sglist, + int nelems,enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + struct iommu_table *tbl = find_iommu_table(dev); + struct scatterlist *s; + int i; + + if (!translation_enabled(tbl)) + return; + + for_each_sg(sglist, s, nelems, i) { + unsigned int npages; + dma_addr_t dma = s->dma_address; + unsigned int dmalen = s->dma_length; + + if (dmalen == 0) + break; + + npages = iommu_num_pages(dma, dmalen, PAGE_SIZE); + iommu_free(tbl, dma, npages); + } +} + +static int calgary_map_sg(struct device *dev, struct scatterlist *sg, + int nelems, enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + struct iommu_table *tbl = find_iommu_table(dev); + struct scatterlist *s; + unsigned long vaddr; + unsigned int npages; + unsigned long entry; + int i; + + for_each_sg(sg, s, nelems, i) { + BUG_ON(!sg_page(s)); + + vaddr = (unsigned long) sg_virt(s); + npages = 
iommu_num_pages(vaddr, s->length, PAGE_SIZE); + + entry = iommu_range_alloc(dev, tbl, npages); + if (entry == DMA_ERROR_CODE) { + /* makes sure unmap knows to stop */ + s->dma_length = 0; + goto error; + } + + s->dma_address = (entry << PAGE_SHIFT) | s->offset; + + /* insert into HW table */ + tce_build(tbl, entry, npages, vaddr & PAGE_MASK, dir); + + s->dma_length = s->length; + } + + return nelems; +error: + calgary_unmap_sg(dev, sg, nelems, dir, NULL); + /* 's' is the iteration cursor; clear every entry, not just the head */ + for_each_sg(sg, s, nelems, i) { + s->dma_address = DMA_ERROR_CODE; + s->dma_length = 0; + } + return 0; +} + +static dma_addr_t calgary_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + void *vaddr = page_address(page) + offset; + unsigned long uaddr; + unsigned int npages; + struct iommu_table *tbl = find_iommu_table(dev); + + uaddr = (unsigned long)vaddr; + npages = iommu_num_pages(uaddr, size, PAGE_SIZE); + + return iommu_alloc(dev, tbl, vaddr, npages, dir); +} + +static void calgary_unmap_page(struct device *dev, dma_addr_t dma_addr, + size_t size, enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + struct iommu_table *tbl = find_iommu_table(dev); + unsigned int npages; + + npages = iommu_num_pages(dma_addr, size, PAGE_SIZE); + iommu_free(tbl, dma_addr, npages); +} + +static void* calgary_alloc_coherent(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t flag, struct dma_attrs *attrs) +{ + void *ret = NULL; + dma_addr_t mapping; + unsigned int npages, order; + struct iommu_table *tbl = find_iommu_table(dev); + + size = PAGE_ALIGN(size); /* size rounded up to full pages */ + npages = size >> PAGE_SHIFT; + order = get_order(size); + + flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); + + /* alloc enough pages (and possibly more) */ + ret = (void *)__get_free_pages(flag, order); + if (!ret) + goto error; + memset(ret, 0, size); + + /* set up tces to cover the allocated range */ + mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL); + if (mapping == DMA_ERROR_CODE) + goto free; + *dma_handle = mapping; + return ret; +free: + free_pages((unsigned long)ret, get_order(size)); + ret = NULL; +error: + return ret; +} + +static void calgary_free_coherent(struct device *dev, size_t size, + void *vaddr, dma_addr_t dma_handle, + struct dma_attrs *attrs) +{ + unsigned int npages; + struct iommu_table *tbl = find_iommu_table(dev); + + size = PAGE_ALIGN(size); + npages = size >> PAGE_SHIFT; + + iommu_free(tbl, dma_handle, npages); + free_pages((unsigned long)vaddr, get_order(size)); +} + +static struct dma_map_ops calgary_dma_ops = { + .alloc = calgary_alloc_coherent, + .free = calgary_free_coherent, + .map_sg = calgary_map_sg, + .unmap_sg = calgary_unmap_sg, + .map_page = calgary_map_page, + .unmap_page = calgary_unmap_page, +}; + +static inline void __iomem * busno_to_bbar(unsigned char num) +{ + return bus_info[num].bbar; +} + +static inline int busno_to_phbid(unsigned char num) +{ + return bus_info[num].phbid; +} + +static inline unsigned long split_queue_offset(unsigned char num) +{ + size_t idx = busno_to_phbid(num); + + return split_queue_offsets[idx]; +} + +static inline unsigned long tar_offset(unsigned char num) +{ + size_t idx = busno_to_phbid(num); + + return tar_offsets[idx]; +} + +static inline unsigned long phb_offset(unsigned char num) +{ + size_t idx = busno_to_phbid(num); + + return phb_offsets[idx]; +} + +static inline void __iomem* calgary_reg(void __iomem *bar, unsigned long offset) +{ +
unsigned long target = ((unsigned long)bar) | offset; + return (void __iomem*)target; +} + +static inline int is_calioc2(unsigned short device) +{ + return (device == PCI_DEVICE_ID_IBM_CALIOC2); +} + +static inline int is_calgary(unsigned short device) +{ + return (device == PCI_DEVICE_ID_IBM_CALGARY); +} + +static inline int is_cal_pci_dev(unsigned short device) +{ + return (is_calgary(device) || is_calioc2(device)); +} + +static void calgary_tce_cache_blast(struct iommu_table *tbl) +{ + u64 val; + u32 aer; + int i = 0; + void __iomem *bbar = tbl->bbar; + void __iomem *target; + + /* disable arbitration on the bus */ + target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_AER_OFFSET); + aer = readl(target); + writel(0, target); + + /* read plssr to ensure it got there */ + target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_PLSSR_OFFSET); + val = readl(target); + + /* poll split queues until all DMA activity is done */ + target = calgary_reg(bbar, split_queue_offset(tbl->it_busno)); + do { + val = readq(target); + i++; + } while ((val & 0xff) != 0xff && i < 100); + if (i == 100) + pr_warn("PCI bus not quiesced, continuing anyway\n"); + + /* invalidate TCE cache */ + target = calgary_reg(bbar, tar_offset(tbl->it_busno)); + writeq(tbl->tar_val, target); + + /* enable arbitration */ + target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_AER_OFFSET); + writel(aer, target); + (void)readl(target); /* flush */ +} + +static void calioc2_tce_cache_blast(struct iommu_table *tbl) +{ + void __iomem *bbar = tbl->bbar; + void __iomem *target; + u64 val64; + u32 val; + int i = 0; + int count = 1; + unsigned char bus = tbl->it_busno; + +begin: + printk(KERN_DEBUG "Calgary: CalIOC2 bus 0x%x entering tce cache blast " + "sequence - count %d\n", bus, count); + + /* 1. using the Page Migration Control reg set SoftStop */ + target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL); + val = be32_to_cpu(readl(target)); + printk(KERN_DEBUG "1a. read 0x%x [LE] from %p\n", val, target); + val |= PMR_SOFTSTOP; + printk(KERN_DEBUG "1b. writing 0x%x [LE] to %p\n", val, target); + writel(cpu_to_be32(val), target); + + /* 2. poll split queues until all DMA activity is done */ + printk(KERN_DEBUG "2a. starting to poll split queues\n"); + target = calgary_reg(bbar, split_queue_offset(bus)); + do { + val64 = readq(target); + i++; + } while ((val64 & 0xff) != 0xff && i < 100); + if (i == 100) + pr_warn("CalIOC2: PCI bus not quiesced, continuing anyway\n"); + + /* 3. poll Page Migration DEBUG for SoftStopFault */ + target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_DEBUG); + val = be32_to_cpu(readl(target)); + printk(KERN_DEBUG "3. read 0x%x [LE] from %p\n", val, target); + + /* 4. if SoftStopFault - goto (1) */ + if (val & PMR_SOFTSTOPFAULT) { + if (++count < 100) + goto begin; + else { + pr_warn("CalIOC2: too many SoftStopFaults, aborting TCE cache flush sequence!\n"); + return; /* pray for the best */ + } + } + + /* 5. Slam into HardStop by reading PHB_PAGE_MIG_CTRL */ + target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL); + printk(KERN_DEBUG "5a. slamming into HardStop by reading %p\n", target); + val = be32_to_cpu(readl(target)); + printk(KERN_DEBUG "5b. read 0x%x [LE] from %p\n", val, target); + target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_DEBUG); + val = be32_to_cpu(readl(target)); + printk(KERN_DEBUG "5c. read 0x%x [LE] from %p (debug)\n", val, target); + + /* 6. invalidate TCE cache */ + printk(KERN_DEBUG "6. 
invalidating TCE cache\n"); + target = calgary_reg(bbar, tar_offset(bus)); + writeq(tbl->tar_val, target); + + /* 7. Re-read PMCR */ + printk(KERN_DEBUG "7a. Re-reading PMCR\n"); + target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL); + val = be32_to_cpu(readl(target)); + printk(KERN_DEBUG "7b. read 0x%x [LE] from %p\n", val, target); + + /* 8. Remove HardStop */ + printk(KERN_DEBUG "8a. removing HardStop from PMCR\n"); + target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL); + val = 0; + printk(KERN_DEBUG "8b. writing 0x%x [LE] to %p\n", val, target); + writel(cpu_to_be32(val), target); + val = be32_to_cpu(readl(target)); + printk(KERN_DEBUG "8c. read 0x%x [LE] from %p\n", val, target); +} + +static void __init calgary_reserve_mem_region(struct pci_dev *dev, u64 start, + u64 limit) +{ + unsigned int numpages; + + limit = limit | 0xfffff; + limit++; + + numpages = ((limit - start) >> PAGE_SHIFT); + iommu_range_reserve(pci_iommu(dev->bus), start, numpages); +} + +static void __init calgary_reserve_peripheral_mem_1(struct pci_dev *dev) +{ + void __iomem *target; + u64 low, high, sizelow; + u64 start, limit; + struct iommu_table *tbl = pci_iommu(dev->bus); + unsigned char busnum = dev->bus->number; + void __iomem *bbar = tbl->bbar; + + /* peripheral MEM_1 region */ + target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_LOW); + low = be32_to_cpu(readl(target)); + target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_HIGH); + high = be32_to_cpu(readl(target)); + target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_SIZE); + sizelow = be32_to_cpu(readl(target)); + + start = (high << 32) | low; + limit = sizelow; + + calgary_reserve_mem_region(dev, start, limit); +} + +static void __init calgary_reserve_peripheral_mem_2(struct pci_dev *dev) +{ + void __iomem *target; + u32 val32; + u64 low, high, sizelow, sizehigh; + u64 start, limit; + struct iommu_table *tbl = pci_iommu(dev->bus); + unsigned char busnum = dev->bus->number; + void __iomem *bbar = tbl->bbar; + + /* is it enabled? */ + target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET); + val32 = be32_to_cpu(readl(target)); + if (!(val32 & PHB_MEM2_ENABLE)) + return; + + target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_LOW); + low = be32_to_cpu(readl(target)); + target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_HIGH); + high = be32_to_cpu(readl(target)); + target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_SIZE_LOW); + sizelow = be32_to_cpu(readl(target)); + target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_SIZE_HIGH); + sizehigh = be32_to_cpu(readl(target)); + + start = (high << 32) | low; + limit = (sizehigh << 32) | sizelow; + + calgary_reserve_mem_region(dev, start, limit); +} + +/* + * some regions of the IO address space do not get translated, so we + * must not give devices IO addresses in those regions. The regions + * are the 640KB-1MB region and the two PCI peripheral memory holes. + * Reserve all of them in the IOMMU bitmap to avoid giving them out + * later. 
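A worked example of the reservation arithmetic in calgary_reserve_mem_region() above may help, since the PHB reports a 64-bit region as separate high/low halves plus an inclusive limit. The register values here are made up; the real code reads them from the MEM_1/MEM_2 registers via readl():

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12	/* 4 KB pages */

int main(void)
{
	uint32_t high = 0x0, low = 0x18000000;		/* sample register halves */
	uint64_t start = ((uint64_t)high << 32) | low;	/* compose 64-bit base */
	uint64_t limit = start;				/* sample raw inclusive limit */

	limit |= 0xfffff;	/* extend to the end of its 1 MB block... */
	limit++;		/* ...then make the limit exclusive */

	/* (0x18100000 - 0x18000000) >> 12 = 256 pages, i.e. exactly 1 MB */
	printf("reserve %llu pages at %#llx\n",
	       (unsigned long long)((limit - start) >> PAGE_SHIFT),
	       (unsigned long long)start);
	return 0;
}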
+ */ +static void __init calgary_reserve_regions(struct pci_dev *dev) +{ + unsigned int npages; + u64 start; + struct iommu_table *tbl = pci_iommu(dev->bus); + + /* reserve EMERGENCY_PAGES from bad_dma_address and up */ + iommu_range_reserve(tbl, DMA_ERROR_CODE, EMERGENCY_PAGES); + + /* avoid the BIOS/VGA first 640KB-1MB region */ + /* for CalIOC2 - avoid the entire first MB */ + if (is_calgary(dev->device)) { + start = (640 * 1024); + npages = ((1024 - 640) * 1024) >> PAGE_SHIFT; + } else { /* calioc2 */ + start = 0; + npages = (1 * 1024 * 1024) >> PAGE_SHIFT; + } + iommu_range_reserve(tbl, start, npages); + + /* reserve the two PCI peripheral memory regions in IO space */ + calgary_reserve_peripheral_mem_1(dev); + calgary_reserve_peripheral_mem_2(dev); +} + +static int __init calgary_setup_tar(struct pci_dev *dev, void __iomem *bbar) +{ + u64 val64; + u64 table_phys; + void __iomem *target; + int ret; + struct iommu_table *tbl; + + /* build TCE tables for each PHB */ + ret = build_tce_table(dev, bbar); + if (ret) + return ret; + + tbl = pci_iommu(dev->bus); + tbl->it_base = (unsigned long)bus_info[dev->bus->number].tce_space; + + if (is_kdump_kernel()) + calgary_init_bitmap_from_tce_table(tbl); + else + tce_free(tbl, 0, tbl->it_size); + + if (is_calgary(dev->device)) + tbl->chip_ops = &calgary_chip_ops; + else if (is_calioc2(dev->device)) + tbl->chip_ops = &calioc2_chip_ops; + else + BUG(); + + calgary_reserve_regions(dev); + + /* set TARs for each PHB */ + target = calgary_reg(bbar, tar_offset(dev->bus->number)); + val64 = be64_to_cpu(readq(target)); + + /* zero out all TAR bits under sw control */ + val64 &= ~TAR_SW_BITS; + table_phys = (u64)__pa(tbl->it_base); + + val64 |= table_phys; + + BUG_ON(specified_table_size > TCE_TABLE_SIZE_8M); + val64 |= (u64) specified_table_size; + + tbl->tar_val = cpu_to_be64(val64); + + writeq(tbl->tar_val, target); + readq(target); /* flush */ + + return 0; +} + +static void __init calgary_free_bus(struct pci_dev *dev) +{ + u64 val64; + struct iommu_table *tbl = pci_iommu(dev->bus); + void __iomem *target; + unsigned int bitmapsz; + + target = calgary_reg(tbl->bbar, tar_offset(dev->bus->number)); + val64 = be64_to_cpu(readq(target)); + val64 &= ~TAR_SW_BITS; + writeq(cpu_to_be64(val64), target); + readq(target); /* flush */ + + bitmapsz = tbl->it_size / BITS_PER_BYTE; + free_pages((unsigned long)tbl->it_map, get_order(bitmapsz)); + tbl->it_map = NULL; + + kfree(tbl); + + set_pci_iommu(dev->bus, NULL); + + /* Can't free bootmem allocated memory after system is up :-( */ + bus_info[dev->bus->number].tce_space = NULL; +} + +static void calgary_dump_error_regs(struct iommu_table *tbl) +{ + void __iomem *bbar = tbl->bbar; + void __iomem *target; + u32 csr, plssr; + + target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_CSR_OFFSET); + csr = be32_to_cpu(readl(target)); + + target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_PLSSR_OFFSET); + plssr = be32_to_cpu(readl(target)); + + /* If no error, the agent ID in the CSR is not valid */ + pr_emerg("DMA error on Calgary PHB 0x%x, 0x%08x@CSR 0x%08x@PLSSR\n", + tbl->it_busno, csr, plssr); +} + +static void calioc2_dump_error_regs(struct iommu_table *tbl) +{ + void __iomem *bbar = tbl->bbar; + u32 csr, csmr, plssr, mck, rcstat; + void __iomem *target; + unsigned long phboff = phb_offset(tbl->it_busno); + unsigned long erroff; + u32 errregs[7]; + int i; + + /* dump CSR */ + target = calgary_reg(bbar, phboff | PHB_CSR_OFFSET); + csr = be32_to_cpu(readl(target)); + /* dump PLSSR */ + target = 
calgary_reg(bbar, phboff | PHB_PLSSR_OFFSET); + plssr = be32_to_cpu(readl(target)); + /* dump CSMR */ + target = calgary_reg(bbar, phboff | 0x290); + csmr = be32_to_cpu(readl(target)); + /* dump mck */ + target = calgary_reg(bbar, phboff | 0x800); + mck = be32_to_cpu(readl(target)); + + pr_emerg("DMA error on CalIOC2 PHB 0x%x\n", tbl->it_busno); + + pr_emerg("0x%08x@CSR 0x%08x@PLSSR 0x%08x@CSMR 0x%08x@MCK\n", + csr, plssr, csmr, mck); + + /* dump rest of error regs */ + pr_emerg(""); + for (i = 0; i < ARRAY_SIZE(errregs); i++) { + /* err regs are at 0x810 - 0x870 */ + erroff = (0x810 + (i * 0x10)); + target = calgary_reg(bbar, phboff | erroff); + errregs[i] = be32_to_cpu(readl(target)); + pr_cont("0x%08x@0x%lx ", errregs[i], erroff); + } + pr_cont("\n"); + + /* root complex status */ + target = calgary_reg(bbar, phboff | PHB_ROOT_COMPLEX_STATUS); + rcstat = be32_to_cpu(readl(target)); + printk(KERN_EMERG "Calgary: 0x%08x@0x%x\n", rcstat, + PHB_ROOT_COMPLEX_STATUS); +} + +static void calgary_watchdog(unsigned long data) +{ + struct pci_dev *dev = (struct pci_dev *)data; + struct iommu_table *tbl = pci_iommu(dev->bus); + void __iomem *bbar = tbl->bbar; + u32 val32; + void __iomem *target; + + target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_CSR_OFFSET); + val32 = be32_to_cpu(readl(target)); + + /* If no error, the agent ID in the CSR is not valid */ + if (val32 & CSR_AGENT_MASK) { + tbl->chip_ops->dump_error_regs(tbl); + + /* reset error */ + writel(0, target); + + /* Disable bus that caused the error */ + target = calgary_reg(bbar, phb_offset(tbl->it_busno) | + PHB_CONFIG_RW_OFFSET); + val32 = be32_to_cpu(readl(target)); + val32 |= PHB_SLOT_DISABLE; + writel(cpu_to_be32(val32), target); + readl(target); /* flush */ + } else { + /* Reset the timer */ + mod_timer(&tbl->watchdog_timer, jiffies + 2 * HZ); + } +} + +static void __init calgary_set_split_completion_timeout(void __iomem *bbar, + unsigned char busnum, unsigned long timeout) +{ + u64 val64; + void __iomem *target; + unsigned int phb_shift = ~0; /* silence gcc */ + u64 mask; + + switch (busno_to_phbid(busnum)) { + case 0: phb_shift = (63 - 19); + break; + case 1: phb_shift = (63 - 23); + break; + case 2: phb_shift = (63 - 27); + break; + case 3: phb_shift = (63 - 35); + break; + default: + BUG_ON(busno_to_phbid(busnum)); + } + + target = calgary_reg(bbar, CALGARY_CONFIG_REG); + val64 = be64_to_cpu(readq(target)); + + /* zero out this PHB's timer bits */ + mask = ~(0xFUL << phb_shift); + val64 &= mask; + val64 |= (timeout << phb_shift); + writeq(cpu_to_be64(val64), target); + readq(target); /* flush */ +} + +static void __init calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev) +{ + unsigned char busnum = dev->bus->number; + void __iomem *bbar = tbl->bbar; + void __iomem *target; + u32 val; + + /* + * CalIOC2 designers recommend setting bit 8 in 0xnDB0 to 1 + */ + target = calgary_reg(bbar, phb_offset(busnum) | PHB_SAVIOR_L2); + val = cpu_to_be32(readl(target)); + val |= 0x00800000; + writel(cpu_to_be32(val), target); +} + +static void __init calgary_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev) +{ + unsigned char busnum = dev->bus->number; + + /* + * Give split completion a longer timeout on bus 1 for aic94xx + * http://bugzilla.kernel.org/show_bug.cgi?id=7180 + */ + if (is_calgary(dev->device) && (busnum == 1)) + calgary_set_split_completion_timeout(tbl->bbar, busnum, + CCR_2SEC_TIMEOUT); +} + +static void __init calgary_enable_translation(struct pci_dev *dev) +{ + u32 val32; + unsigned char 
busnum; + void __iomem *target; + void __iomem *bbar; + struct iommu_table *tbl; + + busnum = dev->bus->number; + tbl = pci_iommu(dev->bus); + bbar = tbl->bbar; + + /* enable TCE in PHB Config Register */ + target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET); + val32 = be32_to_cpu(readl(target)); + val32 |= PHB_TCE_ENABLE | PHB_DAC_DISABLE | PHB_MCSR_ENABLE; + + printk(KERN_INFO "Calgary: enabling translation on %s PHB %#x\n", + (dev->device == PCI_DEVICE_ID_IBM_CALGARY) ? + "Calgary" : "CalIOC2", busnum); + printk(KERN_INFO "Calgary: errant DMAs will now be prevented on this " + "bus.\n"); + + writel(cpu_to_be32(val32), target); + readl(target); /* flush */ + + init_timer(&tbl->watchdog_timer); + tbl->watchdog_timer.function = &calgary_watchdog; + tbl->watchdog_timer.data = (unsigned long)dev; + mod_timer(&tbl->watchdog_timer, jiffies); +} + +static void __init calgary_disable_translation(struct pci_dev *dev) +{ + u32 val32; + unsigned char busnum; + void __iomem *target; + void __iomem *bbar; + struct iommu_table *tbl; + + busnum = dev->bus->number; + tbl = pci_iommu(dev->bus); + bbar = tbl->bbar; + + /* disable TCE in PHB Config Register */ + target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET); + val32 = be32_to_cpu(readl(target)); + val32 &= ~(PHB_TCE_ENABLE | PHB_DAC_DISABLE | PHB_MCSR_ENABLE); + + printk(KERN_INFO "Calgary: disabling translation on PHB %#x!\n", busnum); + writel(cpu_to_be32(val32), target); + readl(target); /* flush */ + + del_timer_sync(&tbl->watchdog_timer); +} + +static void __init calgary_init_one_nontraslated(struct pci_dev *dev) +{ + pci_dev_get(dev); + set_pci_iommu(dev->bus, NULL); + + /* is the device behind a bridge? */ + if (dev->bus->parent) + dev->bus->parent->self = dev; + else + dev->bus->self = dev; +} + +static int __init calgary_init_one(struct pci_dev *dev) +{ + void __iomem *bbar; + struct iommu_table *tbl; + int ret; + + bbar = busno_to_bbar(dev->bus->number); + ret = calgary_setup_tar(dev, bbar); + if (ret) + goto done; + + pci_dev_get(dev); + + if (dev->bus->parent) { + if (dev->bus->parent->self) + printk(KERN_WARNING "Calgary: IEEEE, dev %p has " + "bus->parent->self!\n", dev); + dev->bus->parent->self = dev; + } else + dev->bus->self = dev; + + tbl = pci_iommu(dev->bus); + tbl->chip_ops->handle_quirks(tbl, dev); + + calgary_enable_translation(dev); + + return 0; + +done: + return ret; +} + +static int __init calgary_locate_bbars(void) +{ + int ret; + int rioidx, phb, bus; + void __iomem *bbar; + void __iomem *target; + unsigned long offset; + u8 start_bus, end_bus; + u32 val; + + ret = -ENODATA; + for (rioidx = 0; rioidx < rio_table_hdr->num_rio_dev; rioidx++) { + struct rio_detail *rio = rio_devs[rioidx]; + + if ((rio->type != COMPAT_CALGARY) && (rio->type != ALT_CALGARY)) + continue; + + /* map entire 1MB of Calgary config space */ + bbar = ioremap_nocache(rio->BBAR, 1024 * 1024); + if (!bbar) + goto error; + + for (phb = 0; phb < PHBS_PER_CALGARY; phb++) { + offset = phb_debug_offsets[phb] | PHB_DEBUG_STUFF_OFFSET; + target = calgary_reg(bbar, offset); + + val = be32_to_cpu(readl(target)); + + start_bus = (u8)((val & 0x00FF0000) >> 16); + end_bus = (u8)((val & 0x0000FF00) >> 8); + + if (end_bus) { + for (bus = start_bus; bus <= end_bus; bus++) { + bus_info[bus].bbar = bbar; + bus_info[bus].phbid = phb; + } + } else { + bus_info[start_bus].bbar = bbar; + bus_info[start_bus].phbid = phb; + } + } + } + + return 0; + +error: + /* scan bus_info and iounmap any bbars we previously ioremap'd */ + for (bus = 0; 
bus < ARRAY_SIZE(bus_info); bus++) + if (bus_info[bus].bbar) + iounmap(bus_info[bus].bbar); + + return ret; +} + +static int __init calgary_init(void) +{ + int ret; + struct pci_dev *dev = NULL; + struct calgary_bus_info *info; + + ret = calgary_locate_bbars(); + if (ret) + return ret; + + /* Purely for kdump kernel case */ + if (is_kdump_kernel()) + get_tce_space_from_tar(); + + do { + dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev); + if (!dev) + break; + if (!is_cal_pci_dev(dev->device)) + continue; + + info = &bus_info[dev->bus->number]; + if (info->translation_disabled) { + calgary_init_one_nontraslated(dev); + continue; + } + + if (!info->tce_space && !translate_empty_slots) + continue; + + ret = calgary_init_one(dev); + if (ret) + goto error; + } while (1); + + dev = NULL; + for_each_pci_dev(dev) { + struct iommu_table *tbl; + + tbl = find_iommu_table(&dev->dev); + + if (translation_enabled(tbl)) + dev->dev.archdata.dma_ops = &calgary_dma_ops; + } + + return ret; + +error: + do { + dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev); + if (!dev) + break; + if (!is_cal_pci_dev(dev->device)) + continue; + + info = &bus_info[dev->bus->number]; + if (info->translation_disabled) { + pci_dev_put(dev); + continue; + } + if (!info->tce_space && !translate_empty_slots) + continue; + + calgary_disable_translation(dev); + calgary_free_bus(dev); + pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */ + dev->dev.archdata.dma_ops = NULL; + } while (1); + + return ret; +} + +static inline int __init determine_tce_table_size(void) +{ + int ret; + + if (specified_table_size != TCE_TABLE_SIZE_UNSPECIFIED) + return specified_table_size; + + if (is_kdump_kernel() && saved_max_pfn) { + /* + * Table sizes are from 0 to 7 (TCE_TABLE_SIZE_64K to + * TCE_TABLE_SIZE_8M). Table size 0 has 8K entries and each + * larger table size has twice as many entries, so shift the + * max ram address by 13 to divide by 8K and then look at the + * order of the result to choose between 0-7. + */ + ret = get_order((saved_max_pfn * PAGE_SIZE) >> 13); + if (ret > TCE_TABLE_SIZE_8M) + ret = TCE_TABLE_SIZE_8M; + } else { + /* + * Use 8M by default (suggested by Muli) if it's not + * kdump kernel and saved_max_pfn isn't set. + */ + ret = TCE_TABLE_SIZE_8M; + } + + return ret; +} + +static int __init build_detail_arrays(void) +{ + unsigned long ptr; + unsigned numnodes, i; + int scal_detail_size, rio_detail_size; + + numnodes = rio_table_hdr->num_scal_dev; + if (numnodes > MAX_NUMNODES){ + printk(KERN_WARNING + "Calgary: MAX_NUMNODES too low! Defined as %d, " + "but system has %d nodes.\n", + MAX_NUMNODES, numnodes); + return -ENODEV; + } + + switch (rio_table_hdr->version){ + case 2: + scal_detail_size = 11; + rio_detail_size = 13; + break; + case 3: + scal_detail_size = 12; + rio_detail_size = 15; + break; + default: + printk(KERN_WARNING + "Calgary: Invalid Rio Grande Table Version: %d\n", + rio_table_hdr->version); + return -EPROTO; + } + + ptr = ((unsigned long)rio_table_hdr) + 3; + for (i = 0; i < numnodes; i++, ptr += scal_detail_size) + scal_devs[i] = (struct scal_detail *)ptr; + + for (i = 0; i < rio_table_hdr->num_rio_dev; + i++, ptr += rio_detail_size) + rio_devs[i] = (struct rio_detail *)ptr; + + return 0; +} + +static int __init calgary_bus_has_devices(int bus, unsigned short pci_dev) +{ + int dev; + u32 val; + + if (pci_dev == PCI_DEVICE_ID_IBM_CALIOC2) { + /* + * FIXME: properly scan for devices across the + * PCI-to-PCI bridge on every CalIOC2 port. 
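Backing up to determine_tce_table_size() above, the kdump sizing comment rewards a worked example: each 8-byte TCE maps one 4 KB page, the smallest table (index 0) holds 8K entries, and every index doubles that, so max RAM divided by 8K, expressed as a page order, is the smallest index that covers all of memory. A standalone sketch, with the kernel's get_order() re-implemented here purely for illustration:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12
#define TCE_TABLE_SIZE_8M 7

/* Stand-in for the kernel's get_order(): log2 of size in pages, rounded up. */
static int order_of(uint64_t size)
{
	int order = 0;

	size = (size - 1) >> PAGE_SHIFT;
	while (size) {
		order++;
		size >>= 1;
	}
	return order;
}

int main(void)
{
	uint64_t max_ram = 4ULL << 30;		/* sample: 4 GB of RAM */
	int idx = order_of(max_ram >> 13);	/* 4 GB / 8K = 512 KB -> order 7 */

	if (idx > TCE_TABLE_SIZE_8M)
		idx = TCE_TABLE_SIZE_8M;
	/* index 7 is the 8 MB table: 1M entries x 4 KB = the whole 4 GB */
	printf("TCE table size index: %d\n", idx);
	return 0;
}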
+ */ + return 1; + } + + for (dev = 1; dev < 8; dev++) { + val = read_pci_config(bus, dev, 0, 0); + if (val != 0xffffffff) + break; + } + return (val != 0xffffffff); +} + +/* + * calgary_init_bitmap_from_tce_table(): + * Function for kdump case. In the second/kdump kernel initialize + * the bitmap based on the tce table entries obtained from first kernel + */ +static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl) +{ + u64 *tp; + unsigned int index; + tp = ((u64 *)tbl->it_base); + for (index = 0 ; index < tbl->it_size; index++) { + if (*tp != 0x0) + set_bit(index, tbl->it_map); + tp++; + } +} + +/* + * get_tce_space_from_tar(): + * Function for kdump case. Get the tce tables from first kernel + * by reading the contents of the base address register of calgary iommu + */ +static void __init get_tce_space_from_tar(void) +{ + int bus; + void __iomem *target; + unsigned long tce_space; + + for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) { + struct calgary_bus_info *info = &bus_info[bus]; + unsigned short pci_device; + u32 val; + + val = read_pci_config(bus, 0, 0, 0); + pci_device = (val & 0xFFFF0000) >> 16; + + if (!is_cal_pci_dev(pci_device)) + continue; + if (info->translation_disabled) + continue; + + if (calgary_bus_has_devices(bus, pci_device) || + translate_empty_slots) { + target = calgary_reg(bus_info[bus].bbar, + tar_offset(bus)); + tce_space = be64_to_cpu(readq(target)); + tce_space = tce_space & TAR_SW_BITS; + + tce_space = tce_space & (~specified_table_size); + info->tce_space = (u64 *)__va(tce_space); + } + } + return; +} + +static int __init calgary_iommu_init(void) +{ + int ret; + + /* ok, we're trying to use Calgary - let's roll */ + printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n"); + + ret = calgary_init(); + if (ret) { + printk(KERN_ERR "PCI-DMA: Calgary init failed %d, " + "falling back to no_iommu\n", ret); + return ret; + } + + return 0; +} + +int __init detect_calgary(void) +{ + int bus; + void *tbl; + int calgary_found = 0; + unsigned long ptr; + unsigned int offset, prev_offset; + int ret; + + /* + * if the user specified iommu=off or iommu=soft or we found + * another HW IOMMU already, bail out. + */ + if (no_iommu || iommu_detected) + return -ENODEV; + + if (!use_calgary) + return -ENODEV; + + if (!early_pci_allowed()) + return -ENODEV; + + printk(KERN_DEBUG "Calgary: detecting Calgary via BIOS EBDA area\n"); + + ptr = (unsigned long)phys_to_virt(get_bios_ebda()); + + rio_table_hdr = NULL; + prev_offset = 0; + offset = 0x180; + /* + * The next offset is stored in the 1st word. 
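
/*
 * A minimal sketch (not from this patch) of why the scan below tests the
 * block id against 0x4752: read as a little-endian u16, the two bytes
 * 'R','G' ("Rio Grande") give exactly that value on x86.
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

int main(void)
{
	const unsigned char blk[2] = { 'R', 'G' };
	uint16_t id;

	memcpy(&id, blk, sizeof(id));	/* x86 is little-endian */
	printf("%#x\n", id);		/* prints 0x4752 */
	return 0;
}
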
+ * Only parse up until the offset increases: + */ + while (offset > prev_offset) { + /* The block id is stored in the 2nd word */ + if (*((unsigned short *)(ptr + offset + 2)) == 0x4752){ + /* set the pointer past the offset & block id */ + rio_table_hdr = (struct rio_table_hdr *)(ptr + offset + 4); + break; + } + prev_offset = offset; + offset = *((unsigned short *)(ptr + offset)); + } + if (!rio_table_hdr) { + printk(KERN_DEBUG "Calgary: Unable to locate Rio Grande table " + "in EBDA - bailing!\n"); + return -ENODEV; + } + + ret = build_detail_arrays(); + if (ret) { + printk(KERN_DEBUG "Calgary: build_detail_arrays ret %d\n", ret); + return -ENOMEM; + } + + specified_table_size = determine_tce_table_size(); + + for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) { + struct calgary_bus_info *info = &bus_info[bus]; + unsigned short pci_device; + u32 val; + + val = read_pci_config(bus, 0, 0, 0); + pci_device = (val & 0xFFFF0000) >> 16; + + if (!is_cal_pci_dev(pci_device)) + continue; + + if (info->translation_disabled) + continue; + + if (calgary_bus_has_devices(bus, pci_device) || + translate_empty_slots) { + /* + * If it is kdump kernel, find and use tce tables + * from first kernel, else allocate tce tables here + */ + if (!is_kdump_kernel()) { + tbl = alloc_tce_table(); + if (!tbl) + goto cleanup; + info->tce_space = tbl; + } + calgary_found = 1; + } + } + + printk(KERN_DEBUG "Calgary: finished detection, Calgary %s\n", + calgary_found ? "found" : "not found"); + + if (calgary_found) { + iommu_detected = 1; + calgary_detected = 1; + printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected.\n"); + printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n", + specified_table_size); + + x86_init.iommu.iommu_init = calgary_iommu_init; + } + return calgary_found; + +cleanup: + for (--bus; bus >= 0; --bus) { + struct calgary_bus_info *info = &bus_info[bus]; + + if (info->tce_space) + free_tce_table(info->tce_space); + } + return -ENOMEM; +} + +static int __init calgary_parse_options(char *p) +{ + unsigned int bridge; + unsigned long val; + size_t len; + ssize_t ret; + + while (*p) { + if (!strncmp(p, "64k", 3)) + specified_table_size = TCE_TABLE_SIZE_64K; + else if (!strncmp(p, "128k", 4)) + specified_table_size = TCE_TABLE_SIZE_128K; + else if (!strncmp(p, "256k", 4)) + specified_table_size = TCE_TABLE_SIZE_256K; + else if (!strncmp(p, "512k", 4)) + specified_table_size = TCE_TABLE_SIZE_512K; + else if (!strncmp(p, "1M", 2)) + specified_table_size = TCE_TABLE_SIZE_1M; + else if (!strncmp(p, "2M", 2)) + specified_table_size = TCE_TABLE_SIZE_2M; + else if (!strncmp(p, "4M", 2)) + specified_table_size = TCE_TABLE_SIZE_4M; + else if (!strncmp(p, "8M", 2)) + specified_table_size = TCE_TABLE_SIZE_8M; + + len = strlen("translate_empty_slots"); + if (!strncmp(p, "translate_empty_slots", len)) + translate_empty_slots = 1; + + len = strlen("disable"); + if (!strncmp(p, "disable", len)) { + p += len; + if (*p == '=') + ++p; + if (*p == '\0') + break; + ret = kstrtoul(p, 0, &val); + if (ret) + break; + + bridge = val; + if (bridge < MAX_PHB_BUS_NUM) { + printk(KERN_INFO "Calgary: disabling " + "translation for PHB %#x\n", bridge); + bus_info[bridge].translation_disabled = 1; + } + } + + p = strpbrk(p, ","); + if (!p) + break; + + p++; /* skip ',' */ + } + return 1; +} +__setup("calgary=", calgary_parse_options); + +static void __init calgary_fixup_one_tce_space(struct pci_dev *dev) +{ + struct iommu_table *tbl; + unsigned int npages; + int i; + + tbl = pci_iommu(dev->bus); + + for (i = 0; i < 4; i++) { + struct 
resource *r = &dev->resource[PCI_BRIDGE_RESOURCES + i]; + + /* Don't give out TCEs that map MEM resources */ + if (!(r->flags & IORESOURCE_MEM)) + continue; + + /* 0-based? we reserve the whole 1st MB anyway */ + if (!r->start) + continue; + + /* cover the whole region */ + npages = resource_size(r) >> PAGE_SHIFT; + npages++; + + iommu_range_reserve(tbl, r->start, npages); + } +} + +static int __init calgary_fixup_tce_spaces(void) +{ + struct pci_dev *dev = NULL; + struct calgary_bus_info *info; + + if (no_iommu || swiotlb || !calgary_detected) + return -ENODEV; + + printk(KERN_DEBUG "Calgary: fixing up tce spaces\n"); + + do { + dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev); + if (!dev) + break; + if (!is_cal_pci_dev(dev->device)) + continue; + + info = &bus_info[dev->bus->number]; + if (info->translation_disabled) + continue; + + if (!info->tce_space) + continue; + + calgary_fixup_one_tce_space(dev); + + } while (1); + + return 0; +} + +/* + * We need to be called after pcibios_assign_resources (fs_initcall level) + * and before device_initcall. + */ +rootfs_initcall(calgary_fixup_tce_spaces); + +IOMMU_INIT_POST(detect_calgary); diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c new file mode 100644 index 00000000000..a25e202bb31 --- /dev/null +++ b/arch/x86/kernel/pci-dma.c @@ -0,0 +1,284 @@ +#include <linux/dma-mapping.h> +#include <linux/dma-debug.h> +#include <linux/dmar.h> +#include <linux/export.h> +#include <linux/bootmem.h> +#include <linux/gfp.h> +#include <linux/pci.h> +#include <linux/kmemleak.h> + +#include <asm/proto.h> +#include <asm/dma.h> +#include <asm/iommu.h> +#include <asm/gart.h> +#include <asm/calgary.h> +#include <asm/x86_init.h> +#include <asm/iommu_table.h> + +static int forbid_dac __read_mostly; + +struct dma_map_ops *dma_ops = &nommu_dma_ops; +EXPORT_SYMBOL(dma_ops); + +static int iommu_sac_force __read_mostly; + +#ifdef CONFIG_IOMMU_DEBUG +int panic_on_overflow __read_mostly = 1; +int force_iommu __read_mostly = 1; +#else +int panic_on_overflow __read_mostly = 0; +int force_iommu __read_mostly = 0; +#endif + +int iommu_merge __read_mostly = 0; + +int no_iommu __read_mostly; +/* Set this to 1 if there is a HW IOMMU in the system */ +int iommu_detected __read_mostly = 0; + +/* + * This variable becomes 1 if iommu=pt is passed on the kernel command line. + * If this variable is 1, IOMMU implementations do no DMA translation for + * devices and allow every device to access the whole physical memory. This is + * useful if a user wants to use an IOMMU only for KVM device assignment to + * guests and not for driver dma translation. + */ +int iommu_pass_through __read_mostly; + +extern struct iommu_table_entry __iommu_table[], __iommu_table_end[]; + +/* Dummy device used for NULL arguments (normally ISA).
*/ +struct device x86_dma_fallback_dev = { + .init_name = "fallback device", + .coherent_dma_mask = ISA_DMA_BIT_MASK, + .dma_mask = &x86_dma_fallback_dev.coherent_dma_mask, +}; +EXPORT_SYMBOL(x86_dma_fallback_dev); + +/* Number of entries preallocated for DMA-API debugging */ +#define PREALLOC_DMA_DEBUG_ENTRIES 65536 + +int dma_set_mask(struct device *dev, u64 mask) +{ + if (!dev->dma_mask || !dma_supported(dev, mask)) + return -EIO; + + *dev->dma_mask = mask; + + return 0; +} +EXPORT_SYMBOL(dma_set_mask); + +void __init pci_iommu_alloc(void) +{ + struct iommu_table_entry *p; + + sort_iommu_table(__iommu_table, __iommu_table_end); + check_iommu_entries(__iommu_table, __iommu_table_end); + + for (p = __iommu_table; p < __iommu_table_end; p++) { + if (p && p->detect && p->detect() > 0) { + p->flags |= IOMMU_DETECTED; + if (p->early_init) + p->early_init(); + if (p->flags & IOMMU_FINISH_IF_DETECTED) + break; + } + } +} +void *dma_generic_alloc_coherent(struct device *dev, size_t size, + dma_addr_t *dma_addr, gfp_t flag, + struct dma_attrs *attrs) +{ + unsigned long dma_mask; + struct page *page; + unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT; + dma_addr_t addr; + + dma_mask = dma_alloc_coherent_mask(dev, flag); + + flag &= ~__GFP_ZERO; +again: + page = NULL; + /* CMA can be used only in the context which permits sleeping */ + if (flag & __GFP_WAIT) { + page = dma_alloc_from_contiguous(dev, count, get_order(size)); + if (page && page_to_phys(page) + size > dma_mask) { + dma_release_from_contiguous(dev, page, count); + page = NULL; + } + } + /* fallback */ + if (!page) + page = alloc_pages_node(dev_to_node(dev), flag, get_order(size)); + if (!page) + return NULL; + + addr = page_to_phys(page); + if (addr + size > dma_mask) { + __free_pages(page, get_order(size)); + + if (dma_mask < DMA_BIT_MASK(32) && !(flag & GFP_DMA)) { + flag = (flag & ~GFP_DMA32) | GFP_DMA; + goto again; + } + + return NULL; + } + memset(page_address(page), 0, size); + *dma_addr = addr; + return page_address(page); +} + +void dma_generic_free_coherent(struct device *dev, size_t size, void *vaddr, + dma_addr_t dma_addr, struct dma_attrs *attrs) +{ + unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT; + struct page *page = virt_to_page(vaddr); + + if (!dma_release_from_contiguous(dev, page, count)) + free_pages((unsigned long)vaddr, get_order(size)); +} + +/* + * See <Documentation/x86/x86_64/boot-options.txt> for the iommu kernel + * parameter documentation. 
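
/*
 * A minimal sketch (not from this patch) of the retry policy in
 * dma_generic_alloc_coherent() above, reduced to its decision logic;
 * addresses and masks are plain integers standing in for real allocations.
 */
#include <stdio.h>
#include <stdint.h>

/* 1 = drop the page and retry from the 24-bit GFP_DMA zone */
static int must_retry(uint64_t addr, uint64_t size, uint64_t dma_mask,
		      int already_gfp_dma)
{
	if (addr + size <= dma_mask)
		return 0;	/* within the device's reach, keep it */
	return dma_mask < 0xffffffffULL && !already_gfp_dma;
}

int main(void)
{
	/* a page at 5GB with a 24-bit ISA-style mask: retry lower */
	printf("%d\n", must_retry(5ULL << 30, 4096, (1ULL << 24) - 1, 0));
	/* the same page with a 64-bit mask: fine as is */
	printf("%d\n", must_retry(5ULL << 30, 4096, ~0ULL, 0));
	return 0;
}
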
+ */ +static __init int iommu_setup(char *p) +{ + iommu_merge = 1; + + if (!p) + return -EINVAL; + + while (*p) { + if (!strncmp(p, "off", 3)) + no_iommu = 1; + /* gart_parse_options has more force support */ + if (!strncmp(p, "force", 5)) + force_iommu = 1; + if (!strncmp(p, "noforce", 7)) { + iommu_merge = 0; + force_iommu = 0; + } + + if (!strncmp(p, "biomerge", 8)) { + iommu_merge = 1; + force_iommu = 1; + } + if (!strncmp(p, "panic", 5)) + panic_on_overflow = 1; + if (!strncmp(p, "nopanic", 7)) + panic_on_overflow = 0; + if (!strncmp(p, "merge", 5)) { + iommu_merge = 1; + force_iommu = 1; + } + if (!strncmp(p, "nomerge", 7)) + iommu_merge = 0; + if (!strncmp(p, "forcesac", 8)) + iommu_sac_force = 1; + if (!strncmp(p, "allowdac", 8)) + forbid_dac = 0; + if (!strncmp(p, "nodac", 5)) + forbid_dac = 1; + if (!strncmp(p, "usedac", 6)) { + forbid_dac = -1; + return 1; + } +#ifdef CONFIG_SWIOTLB + if (!strncmp(p, "soft", 4)) + swiotlb = 1; +#endif + if (!strncmp(p, "pt", 2)) + iommu_pass_through = 1; + + gart_parse_options(p); + +#ifdef CONFIG_CALGARY_IOMMU + if (!strncmp(p, "calgary", 7)) + use_calgary = 1; +#endif /* CONFIG_CALGARY_IOMMU */ + + p += strcspn(p, ","); + if (*p == ',') + ++p; + } + return 0; +} +early_param("iommu", iommu_setup); + +int dma_supported(struct device *dev, u64 mask) +{ + struct dma_map_ops *ops = get_dma_ops(dev); + +#ifdef CONFIG_PCI + if (mask > 0xffffffff && forbid_dac > 0) { + dev_info(dev, "PCI: Disallowing DAC for device\n"); + return 0; + } +#endif + + if (ops->dma_supported) + return ops->dma_supported(dev, mask); + + /* Copied from i386. Doesn't make much sense, because it will + only work for pci_alloc_coherent. + The caller just has to use GFP_DMA in this case. */ + if (mask < DMA_BIT_MASK(24)) + return 0; + + /* Tell the device to use SAC when IOMMU force is on. This + allows the driver to use cheaper accesses in some cases. + + Problem with this is that if we overflow the IOMMU area and + return DAC as fallback address the device may not handle it + correctly. + + As a special case some controllers have a 39bit address + mode that is as efficient as 32bit (aic79xx). Don't force + SAC for these. Assume all masks <= 40 bits are of this + type. Normally this doesn't make any difference, but gives + more gentle handling of IOMMU overflow. */ + if (iommu_sac_force && (mask >= DMA_BIT_MASK(40))) { + dev_info(dev, "Force SAC with mask %Lx\n", mask); + return 0; + } + + return 1; +} +EXPORT_SYMBOL(dma_supported); + +static int __init pci_iommu_init(void) +{ + struct iommu_table_entry *p; + dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES); + +#ifdef CONFIG_PCI + dma_debug_add_bus(&pci_bus_type); +#endif + x86_init.iommu.iommu_init(); + + for (p = __iommu_table; p < __iommu_table_end; p++) { + if (p && (p->flags & IOMMU_DETECTED) && p->late_init) + p->late_init(); + } + + return 0; +} +/* Must execute after PCI subsystem */ +rootfs_initcall(pci_iommu_init); + +#ifdef CONFIG_PCI +/* Many VIA bridges seem to corrupt data for DAC. 
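
/*
 * A minimal sketch (not from this patch) of the SAC-vs-DAC policy at the
 * end of dma_supported() above: refusing a >=40-bit mask when
 * iommu_sac_force is set pushes the driver to retry with a 32-bit mask,
 * i.e. cheaper single address cycles.
 */
#include <stdio.h>
#include <stdint.h>

#define DMA_BIT_MASK(n) (((n) == 64) ? ~0ULL : ((1ULL << (n)) - 1))

static int mask_supported(uint64_t mask, int sac_force, int forbid_dac)
{
	if (mask > DMA_BIT_MASK(32) && forbid_dac > 0)
		return 0;	/* DAC disallowed outright */
	if (mask < DMA_BIT_MASK(24))
		return 0;	/* below even ISA reach */
	if (sac_force && mask >= DMA_BIT_MASK(40))
		return 0;	/* force a retry with a 32-bit mask */
	return 1;
}

int main(void)
{
	printf("%d\n", mask_supported(DMA_BIT_MASK(64), 1, 0));	/* 0 */
	printf("%d\n", mask_supported(DMA_BIT_MASK(32), 1, 0));	/* 1 */
	return 0;
}
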
Disable it here */ + +static void via_no_dac(struct pci_dev *dev) +{ + if (forbid_dac == 0) { + dev_info(&dev->dev, "disabling DAC on VIA PCI bridge\n"); + forbid_dac = 1; + } +} +DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, + PCI_CLASS_BRIDGE_PCI, 8, via_no_dac); +#endif diff --git a/arch/x86/kernel/pci-iommu_table.c b/arch/x86/kernel/pci-iommu_table.c new file mode 100644 index 00000000000..35ccf75696e --- /dev/null +++ b/arch/x86/kernel/pci-iommu_table.c @@ -0,0 +1,79 @@ +#include <linux/dma-mapping.h> +#include <asm/iommu_table.h> +#include <linux/string.h> +#include <linux/kallsyms.h> + + +#define DEBUG 1 + +static struct iommu_table_entry * __init +find_dependents_of(struct iommu_table_entry *start, + struct iommu_table_entry *finish, + struct iommu_table_entry *q) +{ + struct iommu_table_entry *p; + + if (!q) + return NULL; + + for (p = start; p < finish; p++) + if (p->detect == q->depend) + return p; + + return NULL; +} + + +void __init sort_iommu_table(struct iommu_table_entry *start, + struct iommu_table_entry *finish) { + + struct iommu_table_entry *p, *q, tmp; + + for (p = start; p < finish; p++) { +again: + q = find_dependents_of(start, finish, p); + /* We are a bit sneaky here. We use the memory address to figure + * out if the node we depend on is past our point, if so, swap. + */ + if (q > p) { + tmp = *p; + memmove(p, q, sizeof(*p)); + *q = tmp; + goto again; + } + } + +} + +#ifdef DEBUG +void __init check_iommu_entries(struct iommu_table_entry *start, + struct iommu_table_entry *finish) +{ + struct iommu_table_entry *p, *q, *x; + + /* Simple cyclic dependency checker. */ + for (p = start; p < finish; p++) { + q = find_dependents_of(start, finish, p); + x = find_dependents_of(start, finish, q); + if (p == x) { + printk(KERN_ERR "CYCLIC DEPENDENCY FOUND! %pS depends on %pS and vice-versa. BREAKING IT.\n", + p->detect, q->detect); + /* Heavy handed way..*/ + x->depend = 0; + } + } + + for (p = start; p < finish; p++) { + q = find_dependents_of(p, finish, p); + if (q && q > p) { + printk(KERN_ERR "EXECUTION ORDER INVALID! %pS should be called before %pS!\n", + p->detect, q->detect); + } + } +} +#else +inline void check_iommu_entries(struct iommu_table_entry *start, + struct iommu_table_entry *finish) +{ +} +#endif diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c new file mode 100644 index 00000000000..da15918d1c8 --- /dev/null +++ b/arch/x86/kernel/pci-nommu.c @@ -0,0 +1,99 @@ +/* Fallback functions when the main IOMMU code is not compiled in. This + code is roughly equivalent to i386.
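
/*
 * A minimal sketch (not from this patch) of the swap-based dependency
 * sort in sort_iommu_table() above, on a toy entry type: an entry whose
 * `depend` hook sits later in the array is pulled in front of it, and the
 * scan restarts from the swapped slot.
 */
#include <stdio.h>
#include <string.h>

struct entry {
	const char *name;
	int (*detect)(void);
	int (*depend)(void);	/* detect routine we must run after */
};

static int det_a(void) { return 0; }
static int det_b(void) { return 0; }

static struct entry *find_dependents_of(struct entry *start,
					struct entry *finish,
					struct entry *q)
{
	struct entry *p;

	if (!q)
		return NULL;
	for (p = start; p < finish; p++)
		if (p->detect == q->depend)
			return p;
	return NULL;
}

int main(void)
{
	/* "a" depends on "b" but is listed first: the sort must swap them */
	struct entry tbl[] = {
		{ "a", det_a, det_b },
		{ "b", det_b, NULL },
	};
	struct entry *start = tbl, *finish = tbl + 2, *p, *q, tmp;

	for (p = start; p < finish; p++) {
again:
		q = find_dependents_of(start, finish, p);
		if (q > p) {		/* dependency sits after us: swap */
			tmp = *p;
			memmove(p, q, sizeof(*p));
			*q = tmp;
			goto again;
		}
	}
	printf("%s %s\n", tbl[0].name, tbl[1].name);	/* prints: b a */
	return 0;
}
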
*/ +#include <linux/dma-mapping.h> +#include <linux/scatterlist.h> +#include <linux/string.h> +#include <linux/gfp.h> +#include <linux/pci.h> +#include <linux/mm.h> + +#include <asm/processor.h> +#include <asm/iommu.h> +#include <asm/dma.h> + +static int +check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size) +{ + if (hwdev && !dma_capable(hwdev, bus, size)) { + if (*hwdev->dma_mask >= DMA_BIT_MASK(32)) + printk(KERN_ERR + "nommu_%s: overflow %Lx+%zu of device mask %Lx\n", + name, (long long)bus, size, + (long long)*hwdev->dma_mask); + return 0; + } + return 1; +} + +static dma_addr_t nommu_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + dma_addr_t bus = page_to_phys(page) + offset; + WARN_ON(size == 0); + if (!check_addr("map_single", dev, bus, size)) + return DMA_ERROR_CODE; + flush_write_buffers(); + return bus; +} + +/* Map a set of buffers described by scatterlist in streaming + * mode for DMA. This is the scatter-gather version of the + * above pci_map_single interface. Here the scatter gather list + * elements are each tagged with the appropriate dma address + * and length. They are obtained via sg_dma_{address,length}(SG). + * + * NOTE: An implementation may be able to use a smaller number of + * DMA address/length pairs than there are SG table elements. + * (for example via virtual mapping capabilities) + * The routine returns the number of addr/length pairs actually + * used, at most nents. + * + * Device ownership issues as mentioned above for pci_map_single are + * the same here. + */ +static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg, + int nents, enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + struct scatterlist *s; + int i; + + WARN_ON(nents == 0 || sg[0].length == 0); + + for_each_sg(sg, s, nents, i) { + BUG_ON(!sg_page(s)); + s->dma_address = sg_phys(s); + if (!check_addr("map_sg", hwdev, s->dma_address, s->length)) + return 0; + s->dma_length = s->length; + } + flush_write_buffers(); + return nents; +} + +static void nommu_sync_single_for_device(struct device *dev, + dma_addr_t addr, size_t size, + enum dma_data_direction dir) +{ + flush_write_buffers(); +} + + +static void nommu_sync_sg_for_device(struct device *dev, + struct scatterlist *sg, int nelems, + enum dma_data_direction dir) +{ + flush_write_buffers(); +} + +struct dma_map_ops nommu_dma_ops = { + .alloc = dma_generic_alloc_coherent, + .free = dma_generic_free_coherent, + .map_sg = nommu_map_sg, + .map_page = nommu_map_page, + .sync_single_for_device = nommu_sync_single_for_device, + .sync_sg_for_device = nommu_sync_sg_for_device, + .is_phys = 1, +}; diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c new file mode 100644 index 00000000000..77dd0ad58be --- /dev/null +++ b/arch/x86/kernel/pci-swiotlb.c @@ -0,0 +1,112 @@ +/* Glue code to lib/swiotlb.c */ + +#include <linux/pci.h> +#include <linux/cache.h> +#include <linux/module.h> +#include <linux/swiotlb.h> +#include <linux/bootmem.h> +#include <linux/dma-mapping.h> + +#include <asm/iommu.h> +#include <asm/swiotlb.h> +#include <asm/dma.h> +#include <asm/xen/swiotlb-xen.h> +#include <asm/iommu_table.h> +int swiotlb __read_mostly; + +void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, + dma_addr_t *dma_handle, gfp_t flags, + struct dma_attrs *attrs) +{ + void *vaddr; + + vaddr = dma_generic_alloc_coherent(hwdev, size, dma_handle, flags, + attrs); + if (vaddr) + return vaddr; + 
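
/*
 * A minimal sketch (not from this patch) of the reachability test behind
 * check_addr() in pci-nommu.c above; dma_capable() boils down to "does
 * [bus, bus + size) fit under the device's dma_mask?".
 */
#include <stdio.h>
#include <stdint.h>

static int reachable(uint64_t dma_mask, uint64_t bus, uint64_t size)
{
	return bus + size - 1 <= dma_mask;
}

int main(void)
{
	uint64_t mask32 = 0xffffffffULL;

	/* a buffer straddling the 4GB boundary overflows a 32-bit mask */
	printf("%d\n", reachable(mask32, 0xfffff000ULL, 8192));	/* 0 */
	printf("%d\n", reachable(mask32, 0x10000000ULL, 8192));	/* 1 */
	return 0;
}
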
+ return swiotlb_alloc_coherent(hwdev, size, dma_handle, flags); +} + +void x86_swiotlb_free_coherent(struct device *dev, size_t size, + void *vaddr, dma_addr_t dma_addr, + struct dma_attrs *attrs) +{ + if (is_swiotlb_buffer(dma_to_phys(dev, dma_addr))) + swiotlb_free_coherent(dev, size, vaddr, dma_addr); + else + dma_generic_free_coherent(dev, size, vaddr, dma_addr, attrs); +} + +static struct dma_map_ops swiotlb_dma_ops = { + .mapping_error = swiotlb_dma_mapping_error, + .alloc = x86_swiotlb_alloc_coherent, + .free = x86_swiotlb_free_coherent, + .sync_single_for_cpu = swiotlb_sync_single_for_cpu, + .sync_single_for_device = swiotlb_sync_single_for_device, + .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, + .sync_sg_for_device = swiotlb_sync_sg_for_device, + .map_sg = swiotlb_map_sg_attrs, + .unmap_sg = swiotlb_unmap_sg_attrs, + .map_page = swiotlb_map_page, + .unmap_page = swiotlb_unmap_page, + .dma_supported = NULL, +}; + +/* + * pci_swiotlb_detect_override - set swiotlb to 1 if necessary + * + * This returns non-zero if we are forced to use swiotlb (by the boot + * option). + */ +int __init pci_swiotlb_detect_override(void) +{ + int use_swiotlb = swiotlb | swiotlb_force; + + if (swiotlb_force) + swiotlb = 1; + + return use_swiotlb; +} +IOMMU_INIT_FINISH(pci_swiotlb_detect_override, + pci_xen_swiotlb_detect, + pci_swiotlb_init, + pci_swiotlb_late_init); + +/* + * if 4GB or more detected (and iommu=off not set) return 1 + * and set swiotlb to 1. + */ +int __init pci_swiotlb_detect_4gb(void) +{ + /* don't initialize swiotlb if iommu=off (no_iommu=1) */ +#ifdef CONFIG_X86_64 + if (!no_iommu && max_pfn > MAX_DMA32_PFN) + swiotlb = 1; +#endif + return swiotlb; +} +IOMMU_INIT(pci_swiotlb_detect_4gb, + pci_swiotlb_detect_override, + pci_swiotlb_init, + pci_swiotlb_late_init); + +void __init pci_swiotlb_init(void) +{ + if (swiotlb) { + swiotlb_init(0); + dma_ops = &swiotlb_dma_ops; + } +} + +void __init pci_swiotlb_late_init(void) +{ + /* An IOMMU turned us off. */ + if (!swiotlb) + swiotlb_free(); + else { + printk(KERN_INFO "PCI-DMA: " + "Using software bounce buffering for IO (SWIOTLB)\n"); + swiotlb_print_info(); + } +} diff --git a/arch/x86/kernel/pcspeaker.c b/arch/x86/kernel/pcspeaker.c new file mode 100644 index 00000000000..a311ffcaad1 --- /dev/null +++ b/arch/x86/kernel/pcspeaker.c @@ -0,0 +1,13 @@ +#include <linux/platform_device.h> +#include <linux/err.h> +#include <linux/init.h> + +static __init int add_pcspkr(void) +{ + struct platform_device *pd; + + pd = platform_device_register_simple("pcspkr", -1, NULL, 0); + + return IS_ERR(pd) ? 
PTR_ERR(pd) : 0; +} +device_initcall(add_pcspkr); diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c new file mode 100644 index 00000000000..e309cc5c276 --- /dev/null +++ b/arch/x86/kernel/perf_regs.c @@ -0,0 +1,105 @@ +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/perf_event.h> +#include <linux/bug.h> +#include <linux/stddef.h> +#include <asm/perf_regs.h> +#include <asm/ptrace.h> + +#ifdef CONFIG_X86_32 +#define PERF_REG_X86_MAX PERF_REG_X86_32_MAX +#else +#define PERF_REG_X86_MAX PERF_REG_X86_64_MAX +#endif + +#define PT_REGS_OFFSET(id, r) [id] = offsetof(struct pt_regs, r) + +static unsigned int pt_regs_offset[PERF_REG_X86_MAX] = { + PT_REGS_OFFSET(PERF_REG_X86_AX, ax), + PT_REGS_OFFSET(PERF_REG_X86_BX, bx), + PT_REGS_OFFSET(PERF_REG_X86_CX, cx), + PT_REGS_OFFSET(PERF_REG_X86_DX, dx), + PT_REGS_OFFSET(PERF_REG_X86_SI, si), + PT_REGS_OFFSET(PERF_REG_X86_DI, di), + PT_REGS_OFFSET(PERF_REG_X86_BP, bp), + PT_REGS_OFFSET(PERF_REG_X86_SP, sp), + PT_REGS_OFFSET(PERF_REG_X86_IP, ip), + PT_REGS_OFFSET(PERF_REG_X86_FLAGS, flags), + PT_REGS_OFFSET(PERF_REG_X86_CS, cs), + PT_REGS_OFFSET(PERF_REG_X86_SS, ss), +#ifdef CONFIG_X86_32 + PT_REGS_OFFSET(PERF_REG_X86_DS, ds), + PT_REGS_OFFSET(PERF_REG_X86_ES, es), + PT_REGS_OFFSET(PERF_REG_X86_FS, fs), + PT_REGS_OFFSET(PERF_REG_X86_GS, gs), +#else + /* + * The pt_regs struct does not store + * ds, es, fs, gs in 64 bit mode. + */ + (unsigned int) -1, + (unsigned int) -1, + (unsigned int) -1, + (unsigned int) -1, +#endif +#ifdef CONFIG_X86_64 + PT_REGS_OFFSET(PERF_REG_X86_R8, r8), + PT_REGS_OFFSET(PERF_REG_X86_R9, r9), + PT_REGS_OFFSET(PERF_REG_X86_R10, r10), + PT_REGS_OFFSET(PERF_REG_X86_R11, r11), + PT_REGS_OFFSET(PERF_REG_X86_R12, r12), + PT_REGS_OFFSET(PERF_REG_X86_R13, r13), + PT_REGS_OFFSET(PERF_REG_X86_R14, r14), + PT_REGS_OFFSET(PERF_REG_X86_R15, r15), +#endif +}; + +u64 perf_reg_value(struct pt_regs *regs, int idx) +{ + if (WARN_ON_ONCE(idx >= ARRAY_SIZE(pt_regs_offset))) + return 0; + + return regs_get_register(regs, pt_regs_offset[idx]); +} + +#define REG_RESERVED (~((1ULL << PERF_REG_X86_MAX) - 1ULL)) + +#ifdef CONFIG_X86_32 +int perf_reg_validate(u64 mask) +{ + if (!mask || mask & REG_RESERVED) + return -EINVAL; + + return 0; +} + +u64 perf_reg_abi(struct task_struct *task) +{ + return PERF_SAMPLE_REGS_ABI_32; +} +#else /* CONFIG_X86_64 */ +#define REG_NOSUPPORT ((1ULL << PERF_REG_X86_DS) | \ + (1ULL << PERF_REG_X86_ES) | \ + (1ULL << PERF_REG_X86_FS) | \ + (1ULL << PERF_REG_X86_GS)) + +int perf_reg_validate(u64 mask) +{ + if (!mask || mask & REG_RESERVED) + return -EINVAL; + + if (mask & REG_NOSUPPORT) + return -EINVAL; + + return 0; +} + +u64 perf_reg_abi(struct task_struct *task) +{ + if (test_tsk_thread_flag(task, TIF_IA32)) + return PERF_SAMPLE_REGS_ABI_32; + else + return PERF_SAMPLE_REGS_ABI_64; +} +#endif /* CONFIG_X86_32 */ diff --git a/arch/x86/kernel/preempt.S b/arch/x86/kernel/preempt.S new file mode 100644 index 00000000000..ca7f0d58a87 --- /dev/null +++ b/arch/x86/kernel/preempt.S @@ -0,0 +1,25 @@ + +#include <linux/linkage.h> +#include <asm/dwarf2.h> +#include <asm/asm.h> +#include <asm/calling.h> + +ENTRY(___preempt_schedule) + CFI_STARTPROC + SAVE_ALL + call preempt_schedule + RESTORE_ALL + ret + CFI_ENDPROC + +#ifdef CONFIG_CONTEXT_TRACKING + +ENTRY(___preempt_schedule_context) + CFI_STARTPROC + SAVE_ALL + call preempt_schedule_context + RESTORE_ALL + ret + CFI_ENDPROC + +#endif diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c new file 
mode 100644 index 00000000000..d5f15c3f7b2 --- /dev/null +++ b/arch/x86/kernel/probe_roms.c @@ -0,0 +1,268 @@ +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/uaccess.h> +#include <linux/mmzone.h> +#include <linux/ioport.h> +#include <linux/seq_file.h> +#include <linux/console.h> +#include <linux/init.h> +#include <linux/edd.h> +#include <linux/dmi.h> +#include <linux/pfn.h> +#include <linux/pci.h> +#include <linux/export.h> + +#include <asm/probe_roms.h> +#include <asm/pci-direct.h> +#include <asm/e820.h> +#include <asm/mmzone.h> +#include <asm/setup.h> +#include <asm/sections.h> +#include <asm/io.h> +#include <asm/setup_arch.h> + +static struct resource system_rom_resource = { + .name = "System ROM", + .start = 0xf0000, + .end = 0xfffff, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}; + +static struct resource extension_rom_resource = { + .name = "Extension ROM", + .start = 0xe0000, + .end = 0xeffff, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}; + +static struct resource adapter_rom_resources[] = { { + .name = "Adapter ROM", + .start = 0xc8000, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +} }; + +static struct resource video_rom_resource = { + .name = "Video ROM", + .start = 0xc0000, + .end = 0xc7fff, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}; + +/* does this oprom support the given pci device, or any of the devices + * that the driver supports? + */ +static bool match_id(struct pci_dev *pdev, unsigned short vendor, unsigned short device) +{ + struct pci_driver *drv = pdev->driver; + const struct pci_device_id *id; + + if (pdev->vendor == vendor && pdev->device == device) + return true; + + for (id = drv ? 
drv->id_table : NULL; id && id->vendor; id++) + if (id->vendor == vendor && id->device == device) + break; + + return id && id->vendor; +} + +static bool probe_list(struct pci_dev *pdev, unsigned short vendor, + const unsigned char *rom_list) +{ + unsigned short device; + + do { + if (probe_kernel_address(rom_list, device) != 0) + device = 0; + + if (device && match_id(pdev, vendor, device)) + break; + + rom_list += 2; + } while (device); + + return !!device; +} + +static struct resource *find_oprom(struct pci_dev *pdev) +{ + struct resource *oprom = NULL; + int i; + + for (i = 0; i < ARRAY_SIZE(adapter_rom_resources); i++) { + struct resource *res = &adapter_rom_resources[i]; + unsigned short offset, vendor, device, list, rev; + const unsigned char *rom; + + if (res->end == 0) + break; + + rom = isa_bus_to_virt(res->start); + if (probe_kernel_address(rom + 0x18, offset) != 0) + continue; + + if (probe_kernel_address(rom + offset + 0x4, vendor) != 0) + continue; + + if (probe_kernel_address(rom + offset + 0x6, device) != 0) + continue; + + if (match_id(pdev, vendor, device)) { + oprom = res; + break; + } + + if (probe_kernel_address(rom + offset + 0x8, list) == 0 && + probe_kernel_address(rom + offset + 0xc, rev) == 0 && + rev >= 3 && list && + probe_list(pdev, vendor, rom + offset + list)) { + oprom = res; + break; + } + } + + return oprom; +} + +void __iomem *pci_map_biosrom(struct pci_dev *pdev) +{ + struct resource *oprom = find_oprom(pdev); + + if (!oprom) + return NULL; + + return ioremap(oprom->start, resource_size(oprom)); +} +EXPORT_SYMBOL(pci_map_biosrom); + +void pci_unmap_biosrom(void __iomem *image) +{ + iounmap(image); +} +EXPORT_SYMBOL(pci_unmap_biosrom); + +size_t pci_biosrom_size(struct pci_dev *pdev) +{ + struct resource *oprom = find_oprom(pdev); + + return oprom ? resource_size(oprom) : 0; +} +EXPORT_SYMBOL(pci_biosrom_size); + +#define ROMSIGNATURE 0xaa55 + +static int __init romsignature(const unsigned char *rom) +{ + const unsigned short * const ptr = (const unsigned short *)rom; + unsigned short sig; + + return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE; +} + +static int __init romchecksum(const unsigned char *rom, unsigned long length) +{ + unsigned char sum, c; + + for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--) + sum += c; + return !length && !sum; +} + +void __init probe_roms(void) +{ + const unsigned char *rom; + unsigned long start, length, upper; + unsigned char c; + int i; + + /* video rom */ + upper = adapter_rom_resources[0].start; + for (start = video_rom_resource.start; start < upper; start += 2048) { + rom = isa_bus_to_virt(start); + if (!romsignature(rom)) + continue; + + video_rom_resource.start = start; + + if (probe_kernel_address(rom + 2, c) != 0) + continue; + + /* 0 < length <= 0x7f * 512, historically */ + length = c * 512; + + /* if checksum okay, trust length byte */ + if (length && romchecksum(rom, length)) + video_rom_resource.end = start + length - 1; + + request_resource(&iomem_resource, &video_rom_resource); + break; + } + + start = (video_rom_resource.end + 1 + 2047) & ~2047UL; + if (start < upper) + start = upper; + + /* system rom */ + request_resource(&iomem_resource, &system_rom_resource); + upper = system_rom_resource.start; + + /* check for extension rom (ignore length byte!) 
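
/*
 * A minimal sketch (not from this patch) of the two validity tests used
 * by the ROM scan above, applied to a hand-built 512-byte image: byte 2
 * gives the length in 512-byte units, and all bytes must sum to 0 mod 256.
 */
#include <stdio.h>
#include <string.h>

static int romsignature(const unsigned char *rom)
{
	return rom[0] == 0x55 && rom[1] == 0xaa;	/* 0xaa55, little-endian */
}

static int romchecksum(const unsigned char *rom, unsigned long length)
{
	unsigned char sum = 0;

	while (length--)
		sum += *rom++;
	return sum == 0;
}

int main(void)
{
	unsigned char rom[512];
	unsigned char sum = 0;
	int i;

	memset(rom, 0, sizeof(rom));
	rom[0] = 0x55;
	rom[1] = 0xaa;
	rom[2] = 1;			/* length: 1 * 512 bytes */
	for (i = 0; i < 511; i++)
		sum += rom[i];
	rom[511] = (unsigned char)-sum;	/* fix up the checksum byte */

	printf("sig=%d csum=%d\n", romsignature(rom),
	       romchecksum(rom, rom[2] * 512));	/* prints: sig=1 csum=1 */
	return 0;
}
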
*/ + rom = isa_bus_to_virt(extension_rom_resource.start); + if (romsignature(rom)) { + length = resource_size(&extension_rom_resource); + if (romchecksum(rom, length)) { + request_resource(&iomem_resource, &extension_rom_resource); + upper = extension_rom_resource.start; + } + } + + /* check for adapter roms on 2k boundaries */ + for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) { + rom = isa_bus_to_virt(start); + if (!romsignature(rom)) + continue; + + if (probe_kernel_address(rom + 2, c) != 0) + continue; + + /* 0 < length <= 0x7f * 512, historically */ + length = c * 512; + + /* but accept any length that fits if checksum okay */ + if (!length || start + length > upper || !romchecksum(rom, length)) + continue; + + adapter_rom_resources[i].start = start; + adapter_rom_resources[i].end = start + length - 1; + request_resource(&iomem_resource, &adapter_rom_resources[i]); + + start = adapter_rom_resources[i++].end & ~2047UL; + } +} + diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c new file mode 100644 index 00000000000..4505e2a950d --- /dev/null +++ b/arch/x86/kernel/process.c @@ -0,0 +1,468 @@ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/smp.h> +#include <linux/prctl.h> +#include <linux/slab.h> +#include <linux/sched.h> +#include <linux/module.h> +#include <linux/pm.h> +#include <linux/clockchips.h> +#include <linux/random.h> +#include <linux/user-return-notifier.h> +#include <linux/dmi.h> +#include <linux/utsname.h> +#include <linux/stackprotector.h> +#include <linux/tick.h> +#include <linux/cpuidle.h> +#include <trace/events/power.h> +#include <linux/hw_breakpoint.h> +#include <asm/cpu.h> +#include <asm/apic.h> +#include <asm/syscalls.h> +#include <asm/idle.h> +#include <asm/uaccess.h> +#include <asm/i387.h> +#include <asm/fpu-internal.h> +#include <asm/debugreg.h> +#include <asm/nmi.h> + +/* + * per-CPU TSS segments. Threads are completely 'soft' on Linux, + * no more per-task TSS's. The TSS size is kept cacheline-aligned + * so they are allowed to end up in the .data..cacheline_aligned + * section. Since TSS's are completely CPU-local, we want them + * on exact cacheline boundaries, to eliminate cacheline ping-pong. + */ +__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS; + +#ifdef CONFIG_X86_64 +static DEFINE_PER_CPU(unsigned char, is_idle); +static ATOMIC_NOTIFIER_HEAD(idle_notifier); + +void idle_notifier_register(struct notifier_block *n) +{ + atomic_notifier_chain_register(&idle_notifier, n); +} +EXPORT_SYMBOL_GPL(idle_notifier_register); + +void idle_notifier_unregister(struct notifier_block *n) +{ + atomic_notifier_chain_unregister(&idle_notifier, n); +} +EXPORT_SYMBOL_GPL(idle_notifier_unregister); +#endif + +struct kmem_cache *task_xstate_cachep; +EXPORT_SYMBOL_GPL(task_xstate_cachep); + +/* + * this gets called so that we can store lazy state into memory and copy the + * current task into the new thread. 
+ */ +int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) +{ + int ret; + + *dst = *src; + if (fpu_allocated(&src->thread.fpu)) { + memset(&dst->thread.fpu, 0, sizeof(dst->thread.fpu)); + ret = fpu_alloc(&dst->thread.fpu); + if (ret) + return ret; + fpu_copy(dst, src); + } + return 0; +} + +void free_thread_xstate(struct task_struct *tsk) +{ + fpu_free(&tsk->thread.fpu); +} + +void arch_release_task_struct(struct task_struct *tsk) +{ + free_thread_xstate(tsk); +} + +void arch_task_cache_init(void) +{ + task_xstate_cachep = + kmem_cache_create("task_xstate", xstate_size, + __alignof__(union thread_xstate), + SLAB_PANIC | SLAB_NOTRACK, NULL); +} + +/* + * Free current thread data structures etc.. + */ +void exit_thread(void) +{ + struct task_struct *me = current; + struct thread_struct *t = &me->thread; + unsigned long *bp = t->io_bitmap_ptr; + + if (bp) { + struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); + + t->io_bitmap_ptr = NULL; + clear_thread_flag(TIF_IO_BITMAP); + /* + * Careful, clear this in the TSS too: + */ + memset(tss->io_bitmap, 0xff, t->io_bitmap_max); + t->io_bitmap_max = 0; + put_cpu(); + kfree(bp); + } + + drop_fpu(me); +} + +void flush_thread(void) +{ + struct task_struct *tsk = current; + + flush_ptrace_hw_breakpoint(tsk); + memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); + drop_init_fpu(tsk); + /* + * Free the FPU state for non xsave platforms. They get reallocated + * lazily at the first use. + */ + if (!use_eager_fpu()) + free_thread_xstate(tsk); +} + +static void hard_disable_TSC(void) +{ + write_cr4(read_cr4() | X86_CR4_TSD); +} + +void disable_TSC(void) +{ + preempt_disable(); + if (!test_and_set_thread_flag(TIF_NOTSC)) + /* + * Must flip the CPU state synchronously with + * TIF_NOTSC in the current running context. + */ + hard_disable_TSC(); + preempt_enable(); +} + +static void hard_enable_TSC(void) +{ + write_cr4(read_cr4() & ~X86_CR4_TSD); +} + +static void enable_TSC(void) +{ + preempt_disable(); + if (test_and_clear_thread_flag(TIF_NOTSC)) + /* + * Must flip the CPU state synchronously with + * TIF_NOTSC in the current running context. + */ + hard_enable_TSC(); + preempt_enable(); +} + +int get_tsc_mode(unsigned long adr) +{ + unsigned int val; + + if (test_thread_flag(TIF_NOTSC)) + val = PR_TSC_SIGSEGV; + else + val = PR_TSC_ENABLE; + + return put_user(val, (unsigned int __user *)adr); +} + +int set_tsc_mode(unsigned int val) +{ + if (val == PR_TSC_SIGSEGV) + disable_TSC(); + else if (val == PR_TSC_ENABLE) + enable_TSC(); + else + return -EINVAL; + + return 0; +} + +void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, + struct tss_struct *tss) +{ + struct thread_struct *prev, *next; + + prev = &prev_p->thread; + next = &next_p->thread; + + if (test_tsk_thread_flag(prev_p, TIF_BLOCKSTEP) ^ + test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) { + unsigned long debugctl = get_debugctlmsr(); + + debugctl &= ~DEBUGCTLMSR_BTF; + if (test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) + debugctl |= DEBUGCTLMSR_BTF; + + update_debugctlmsr(debugctl); + } + + if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ + test_tsk_thread_flag(next_p, TIF_NOTSC)) { + /* prev and next are different */ + if (test_tsk_thread_flag(next_p, TIF_NOTSC)) + hard_disable_TSC(); + else + hard_enable_TSC(); + } + + if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { + /* + * Copy the relevant range of the IO bitmap. 
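
/*
 * A short usage sketch (not from this patch): get_tsc_mode() and
 * set_tsc_mode() above back the PR_GET_TSC/PR_SET_TSC prctls, so a
 * process can ask for SIGSEGV on RDTSC from userspace like this.
 */
#include <stdio.h>
#include <sys/prctl.h>

int main(void)
{
	int mode = 0;

	if (prctl(PR_SET_TSC, PR_TSC_SIGSEGV) != 0)
		perror("PR_SET_TSC");
	if (prctl(PR_GET_TSC, &mode) == 0)
		printf("tsc mode: %s\n",
		       mode == PR_TSC_SIGSEGV ? "sigsegv" : "enabled");
	/* any rdtsc instruction executed now would raise SIGSEGV */
	return 0;
}
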
+ * Normally this is 128 bytes or less: + */ + memcpy(tss->io_bitmap, next->io_bitmap_ptr, + max(prev->io_bitmap_max, next->io_bitmap_max)); + } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) { + /* + * Clear any possible leftover bits: + */ + memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); + } + propagate_user_return_notify(prev_p, next_p); +} + +/* + * Idle related variables and functions + */ +unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE; +EXPORT_SYMBOL(boot_option_idle_override); + +static void (*x86_idle)(void); + +#ifndef CONFIG_SMP +static inline void play_dead(void) +{ + BUG(); +} +#endif + +#ifdef CONFIG_X86_64 +void enter_idle(void) +{ + this_cpu_write(is_idle, 1); + atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL); +} + +static void __exit_idle(void) +{ + if (x86_test_and_clear_bit_percpu(0, is_idle) == 0) + return; + atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL); +} + +/* Called from interrupts to signify idle end */ +void exit_idle(void) +{ + /* idle loop has pid 0 */ + if (current->pid) + return; + __exit_idle(); +} +#endif + +void arch_cpu_idle_enter(void) +{ + local_touch_nmi(); + enter_idle(); +} + +void arch_cpu_idle_exit(void) +{ + __exit_idle(); +} + +void arch_cpu_idle_dead(void) +{ + play_dead(); +} + +/* + * Called from the generic idle code. + */ +void arch_cpu_idle(void) +{ + x86_idle(); +} + +/* + * We use this if we don't have any better idle routine.. + */ +void default_idle(void) +{ + trace_cpu_idle_rcuidle(1, smp_processor_id()); + safe_halt(); + trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); +} +#ifdef CONFIG_APM_MODULE +EXPORT_SYMBOL(default_idle); +#endif + +#ifdef CONFIG_XEN +bool xen_set_default_idle(void) +{ + bool ret = !!x86_idle; + + x86_idle = default_idle; + + return ret; +} +#endif +void stop_this_cpu(void *dummy) +{ + local_irq_disable(); + /* + * Remove this CPU: + */ + set_cpu_online(smp_processor_id(), false); + disable_local_APIC(); + + for (;;) + halt(); +} + +bool amd_e400_c1e_detected; +EXPORT_SYMBOL(amd_e400_c1e_detected); + +static cpumask_var_t amd_e400_c1e_mask; + +void amd_e400_remove_cpu(int cpu) +{ + if (amd_e400_c1e_mask != NULL) + cpumask_clear_cpu(cpu, amd_e400_c1e_mask); +} + +/* + * AMD Erratum 400 aware idle routine. We check for C1E active in the interrupt + * pending message MSR. If we detect C1E, then we handle it the same + * way as C3 power states (local apic timer and TSC stop) + */ +static void amd_e400_idle(void) +{ + if (!amd_e400_c1e_detected) { + u32 lo, hi; + + rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); + + if (lo & K8_INTP_C1E_ACTIVE_MASK) { + amd_e400_c1e_detected = true; + if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) + mark_tsc_unstable("TSC halt in AMD C1E"); + pr_info("System has AMD C1E enabled\n"); + } + } + + if (amd_e400_c1e_detected) { + int cpu = smp_processor_id(); + + if (!cpumask_test_cpu(cpu, amd_e400_c1e_mask)) { + cpumask_set_cpu(cpu, amd_e400_c1e_mask); + /* + * Force broadcast so ACPI can not interfere. + */ + clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE, + &cpu); + pr_info("Switch to broadcast mode on CPU%d\n", cpu); + } + clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu); + + default_idle(); + + /* + * The switch back from broadcast mode needs to be + * called with interrupts disabled. 
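
/*
 * A minimal sketch (not from this patch) of the C1E test in
 * amd_e400_idle() above, done from userspace via the msr driver. The MSR
 * number and mask are assumed to mirror the kernel's
 * MSR_K8_INT_PENDING_MSG and K8_INTP_C1E_ACTIVE_MASK definitions; needs
 * the msr module loaded and root privileges.
 */
#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>

#define MSR_K8_INT_PENDING_MSG	0xc0010055
#define K8_INTP_C1E_ACTIVE_MASK	0x18000000

int main(void)
{
	uint64_t val;
	int fd = open("/dev/cpu/0/msr", O_RDONLY);

	/* the msr driver maps the pread offset to the MSR number */
	if (fd < 0 || pread(fd, &val, 8, MSR_K8_INT_PENDING_MSG) != 8) {
		perror("msr");
		return 1;
	}
	printf("C1E %s\n", (val & K8_INTP_C1E_ACTIVE_MASK) ? "active" : "inactive");
	close(fd);
	return 0;
}
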
+ */ + local_irq_disable(); + clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu); + local_irq_enable(); + } else + default_idle(); +} + +void select_idle_routine(const struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_SMP + if (boot_option_idle_override == IDLE_POLL && smp_num_siblings > 1) + pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n"); +#endif + if (x86_idle || boot_option_idle_override == IDLE_POLL) + return; + + if (cpu_has_bug(c, X86_BUG_AMD_APIC_C1E)) { + /* E400: APIC timer interrupt does not wake up CPU from C1e */ + pr_info("using AMD E400 aware idle routine\n"); + x86_idle = amd_e400_idle; + } else + x86_idle = default_idle; +} + +void __init init_amd_e400_c1e_mask(void) +{ + /* If we're using amd_e400_idle, we need to allocate amd_e400_c1e_mask. */ + if (x86_idle == amd_e400_idle) + zalloc_cpumask_var(&amd_e400_c1e_mask, GFP_KERNEL); +} + +static int __init idle_setup(char *str) +{ + if (!str) + return -EINVAL; + + if (!strcmp(str, "poll")) { + pr_info("using polling idle threads\n"); + boot_option_idle_override = IDLE_POLL; + cpu_idle_poll_ctrl(true); + } else if (!strcmp(str, "halt")) { + /* + * When the boot option of idle=halt is added, halt is + * forced to be used for CPU idle. In such case CPU C2/C3 + * won't be used again. + * To continue to load the CPU idle driver, don't touch + * the boot_option_idle_override. + */ + x86_idle = default_idle; + boot_option_idle_override = IDLE_HALT; + } else if (!strcmp(str, "nomwait")) { + /* + * If the boot option of "idle=nomwait" is added, + * it means that mwait will be disabled for CPU C2/C3 + * states. In such case it won't touch the variable + * of boot_option_idle_override. + */ + boot_option_idle_override = IDLE_NOMWAIT; + } else + return -1; + + return 0; +} +early_param("idle", idle_setup); + +unsigned long arch_align_stack(unsigned long sp) +{ + if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) + sp -= get_random_int() % 8192; + return sp & ~0xf; +} + +unsigned long arch_randomize_brk(struct mm_struct *mm) +{ + unsigned long range_end = mm->brk + 0x02000000; + return randomize_range(mm->brk, range_end, 0) ? : mm->brk; +} + diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c new file mode 100644 index 00000000000..7bc86bbe748 --- /dev/null +++ b/arch/x86/kernel/process_32.c @@ -0,0 +1,360 @@ +/* + * Copyright (C) 1995 Linus Torvalds + * + * Pentium III FXSR, SSE support + * Gareth Hughes <gareth@valinux.com>, May 2000 + */ + +/* + * This file handles the architecture-dependent parts of process handling.. 
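
/*
 * A minimal sketch (not from this patch) of the effect of
 * arch_align_stack() above: up to 8KB of downward jitter, then 16-byte
 * alignment. rand() stands in for the kernel's get_random_int(), and the
 * personality/randomize_va_space checks are omitted.
 */
#include <stdio.h>
#include <stdlib.h>

static unsigned long align_stack(unsigned long sp)
{
	sp -= rand() % 8192;	/* up to 8KB of jitter */
	return sp & ~0xfUL;	/* keep 16-byte ABI alignment */
}

int main(void)
{
	unsigned long sp = 0x7ffffffff000UL;
	int i;

	srand(1);
	for (i = 0; i < 3; i++)
		printf("%#lx\n", align_stack(sp));
	return 0;
}
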
+ */ + +#include <linux/cpu.h> +#include <linux/errno.h> +#include <linux/sched.h> +#include <linux/fs.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/elfcore.h> +#include <linux/smp.h> +#include <linux/stddef.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/user.h> +#include <linux/interrupt.h> +#include <linux/delay.h> +#include <linux/reboot.h> +#include <linux/mc146818rtc.h> +#include <linux/module.h> +#include <linux/kallsyms.h> +#include <linux/ptrace.h> +#include <linux/personality.h> +#include <linux/percpu.h> +#include <linux/prctl.h> +#include <linux/ftrace.h> +#include <linux/uaccess.h> +#include <linux/io.h> +#include <linux/kdebug.h> + +#include <asm/pgtable.h> +#include <asm/ldt.h> +#include <asm/processor.h> +#include <asm/i387.h> +#include <asm/fpu-internal.h> +#include <asm/desc.h> +#ifdef CONFIG_MATH_EMULATION +#include <asm/math_emu.h> +#endif + +#include <linux/err.h> + +#include <asm/tlbflush.h> +#include <asm/cpu.h> +#include <asm/idle.h> +#include <asm/syscalls.h> +#include <asm/debugreg.h> +#include <asm/switch_to.h> + +asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); +asmlinkage void ret_from_kernel_thread(void) __asm__("ret_from_kernel_thread"); + +/* + * Return saved PC of a blocked thread. + */ +unsigned long thread_saved_pc(struct task_struct *tsk) +{ + return ((unsigned long *)tsk->thread.sp)[3]; +} + +void __show_regs(struct pt_regs *regs, int all) +{ + unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; + unsigned long d0, d1, d2, d3, d6, d7; + unsigned long sp; + unsigned short ss, gs; + + if (user_mode_vm(regs)) { + sp = regs->sp; + ss = regs->ss & 0xffff; + gs = get_user_gs(regs); + } else { + sp = kernel_stack_pointer(regs); + savesegment(ss, ss); + savesegment(gs, gs); + } + + printk(KERN_DEFAULT "EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n", + (u16)regs->cs, regs->ip, regs->flags, + smp_processor_id()); + print_symbol("EIP is at %s\n", regs->ip); + + printk(KERN_DEFAULT "EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", + regs->ax, regs->bx, regs->cx, regs->dx); + printk(KERN_DEFAULT "ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n", + regs->si, regs->di, regs->bp, sp); + printk(KERN_DEFAULT " DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n", + (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss); + + if (!all) + return; + + cr0 = read_cr0(); + cr2 = read_cr2(); + cr3 = read_cr3(); + cr4 = read_cr4_safe(); + printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", + cr0, cr2, cr3, cr4); + + get_debugreg(d0, 0); + get_debugreg(d1, 1); + get_debugreg(d2, 2); + get_debugreg(d3, 3); + get_debugreg(d6, 6); + get_debugreg(d7, 7); + + /* Only print out debug registers if they are in their non-default state. 
*/ + if ((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) && + (d6 == DR6_RESERVED) && (d7 == 0x400)) + return; + + printk(KERN_DEFAULT "DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n", + d0, d1, d2, d3); + printk(KERN_DEFAULT "DR6: %08lx DR7: %08lx\n", + d6, d7); +} + +void release_thread(struct task_struct *dead_task) +{ + BUG_ON(dead_task->mm); + release_vm86_irqs(dead_task); +} + +int copy_thread(unsigned long clone_flags, unsigned long sp, + unsigned long arg, struct task_struct *p) +{ + struct pt_regs *childregs = task_pt_regs(p); + struct task_struct *tsk; + int err; + + p->thread.sp = (unsigned long) childregs; + p->thread.sp0 = (unsigned long) (childregs+1); + + if (unlikely(p->flags & PF_KTHREAD)) { + /* kernel thread */ + memset(childregs, 0, sizeof(struct pt_regs)); + p->thread.ip = (unsigned long) ret_from_kernel_thread; + task_user_gs(p) = __KERNEL_STACK_CANARY; + childregs->ds = __USER_DS; + childregs->es = __USER_DS; + childregs->fs = __KERNEL_PERCPU; + childregs->bx = sp; /* function */ + childregs->bp = arg; + childregs->orig_ax = -1; + childregs->cs = __KERNEL_CS | get_kernel_rpl(); + childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED; + p->thread.fpu_counter = 0; + p->thread.io_bitmap_ptr = NULL; + memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); + return 0; + } + *childregs = *current_pt_regs(); + childregs->ax = 0; + if (sp) + childregs->sp = sp; + + p->thread.ip = (unsigned long) ret_from_fork; + task_user_gs(p) = get_user_gs(current_pt_regs()); + + p->thread.fpu_counter = 0; + p->thread.io_bitmap_ptr = NULL; + tsk = current; + err = -ENOMEM; + + memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); + + if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { + p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr, + IO_BITMAP_BYTES, GFP_KERNEL); + if (!p->thread.io_bitmap_ptr) { + p->thread.io_bitmap_max = 0; + return -ENOMEM; + } + set_tsk_thread_flag(p, TIF_IO_BITMAP); + } + + err = 0; + + /* + * Set a new TLS for the child thread? + */ + if (clone_flags & CLONE_SETTLS) + err = do_set_thread_area(p, -1, + (struct user_desc __user *)childregs->si, 0); + + if (err && p->thread.io_bitmap_ptr) { + kfree(p->thread.io_bitmap_ptr); + p->thread.io_bitmap_max = 0; + } + return err; +} + +void +start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) +{ + set_user_gs(regs, 0); + regs->fs = 0; + regs->ds = __USER_DS; + regs->es = __USER_DS; + regs->ss = __USER_DS; + regs->cs = __USER_CS; + regs->ip = new_ip; + regs->sp = new_sp; + regs->flags = X86_EFLAGS_IF; + /* + * force it to the iret return path by making it look as if there was + * some work pending. + */ + set_thread_flag(TIF_NOTIFY_RESUME); +} +EXPORT_SYMBOL_GPL(start_thread); + + +/* + * switch_to(x,y) should switch tasks from x to y. + * + * We fsave/fwait so that an exception goes off at the right time + * (as a call from the fsave or fwait in effect) rather than to + * the wrong process. Lazy FP saving no longer makes any sense + * with modern CPU's, and this simplifies a lot of things (SMP + * and UP become the same). + * + * NOTE! We used to use the x86 hardware context switching. The + * reason for not using it any more becomes apparent when you + * try to recover gracefully from saved state that is no longer + * valid (stale segment register values in particular). With the + * hardware task-switch, there is no way to fix up bad state in + * a reasonable manner. 
+ * + * The fact that Intel documents the hardware task-switching to + * be slow is a fairly red herring - this code is not noticeably + * faster. However, there _is_ some room for improvement here, + * so the performance issues may eventually be a valid point. + * More important, however, is the fact that this allows us much + * more flexibility. + * + * The return value (in %ax) will be the "prev" task after + * the task-switch, and shows up in ret_from_fork in entry.S, + * for example. + */ +__visible __notrace_funcgraph struct task_struct * +__switch_to(struct task_struct *prev_p, struct task_struct *next_p) +{ + struct thread_struct *prev = &prev_p->thread, + *next = &next_p->thread; + int cpu = smp_processor_id(); + struct tss_struct *tss = &per_cpu(init_tss, cpu); + fpu_switch_t fpu; + + /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ + + fpu = switch_fpu_prepare(prev_p, next_p, cpu); + + /* + * Reload esp0. + */ + load_sp0(tss, next); + + /* + * Save away %gs. No need to save %fs, as it was saved on the + * stack on entry. No need to save %es and %ds, as those are + * always kernel segments while inside the kernel. Doing this + * before setting the new TLS descriptors avoids the situation + * where we temporarily have non-reloadable segments in %fs + * and %gs. This could be an issue if the NMI handler ever + * used %fs or %gs (it does not today), or if the kernel is + * running inside of a hypervisor layer. + */ + lazy_save_gs(prev->gs); + + /* + * Load the per-thread Thread-Local Storage descriptor. + */ + load_TLS(next, cpu); + + /* + * Restore IOPL if needed. In normal use, the flags restore + * in the switch assembly will handle this. But if the kernel + * is running virtualized at a non-zero CPL, the popf will + * not restore flags, so it must be done in a separate step. + */ + if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl)) + set_iopl_mask(next->iopl); + + /* + * If it were not for PREEMPT_ACTIVE we could guarantee that the + * preempt_count of all tasks was equal here and this would not be + * needed. + */ + task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count); + this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count); + + /* + * Now maybe handle debug registers and/or IO bitmaps + */ + if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV || + task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT)) + __switch_to_xtra(prev_p, next_p, tss); + + /* + * Leave lazy mode, flushing any hypercalls made here. + * This must be done before restoring TLS segments so + * the GDT and LDT are properly updated, and must be + * done before math_state_restore, so the TS bit is up + * to date. 
+ */ + arch_end_context_switch(next_p); + + this_cpu_write(kernel_stack, + (unsigned long)task_stack_page(next_p) + + THREAD_SIZE - KERNEL_STACK_OFFSET); + + /* + * Restore %gs if needed (which is common) + */ + if (prev->gs | next->gs) + lazy_load_gs(next->gs); + + switch_fpu_finish(next_p, fpu); + + this_cpu_write(current_task, next_p); + + return prev_p; +} + +#define top_esp (THREAD_SIZE - sizeof(unsigned long)) +#define top_ebp (THREAD_SIZE - 2*sizeof(unsigned long)) + +unsigned long get_wchan(struct task_struct *p) +{ + unsigned long bp, sp, ip; + unsigned long stack_page; + int count = 0; + if (!p || p == current || p->state == TASK_RUNNING) + return 0; + stack_page = (unsigned long)task_stack_page(p); + sp = p->thread.sp; + if (!stack_page || sp < stack_page || sp > top_esp+stack_page) + return 0; + /* include/asm-i386/system.h:switch_to() pushes bp last. */ + bp = *(unsigned long *) sp; + do { + if (bp < stack_page || bp > top_ebp+stack_page) + return 0; + ip = *(unsigned long *) (bp+4); + if (!in_sched_functions(ip)) + return ip; + bp = *(unsigned long *) bp; + } while (count++ < 16); + return 0; +} + diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c new file mode 100644 index 00000000000..ca5b02d405c --- /dev/null +++ b/arch/x86/kernel/process_64.c @@ -0,0 +1,565 @@ +/* + * Copyright (C) 1995 Linus Torvalds + * + * Pentium III FXSR, SSE support + * Gareth Hughes <gareth@valinux.com>, May 2000 + * + * X86-64 port + * Andi Kleen. + * + * CPU hotplug support - ashok.raj@intel.com + */ + +/* + * This file handles the architecture-dependent parts of process handling.. + */ + +#include <linux/cpu.h> +#include <linux/errno.h> +#include <linux/sched.h> +#include <linux/fs.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/elfcore.h> +#include <linux/smp.h> +#include <linux/slab.h> +#include <linux/user.h> +#include <linux/interrupt.h> +#include <linux/delay.h> +#include <linux/module.h> +#include <linux/ptrace.h> +#include <linux/notifier.h> +#include <linux/kprobes.h> +#include <linux/kdebug.h> +#include <linux/prctl.h> +#include <linux/uaccess.h> +#include <linux/io.h> +#include <linux/ftrace.h> + +#include <asm/pgtable.h> +#include <asm/processor.h> +#include <asm/i387.h> +#include <asm/fpu-internal.h> +#include <asm/mmu_context.h> +#include <asm/prctl.h> +#include <asm/desc.h> +#include <asm/proto.h> +#include <asm/ia32.h> +#include <asm/idle.h> +#include <asm/syscalls.h> +#include <asm/debugreg.h> +#include <asm/switch_to.h> + +asmlinkage extern void ret_from_fork(void); + +__visible DEFINE_PER_CPU(unsigned long, old_rsp); + +/* Prints also some state that isn't saved in the pt_regs */ +void __show_regs(struct pt_regs *regs, int all) +{ + unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; + unsigned long d0, d1, d2, d3, d6, d7; + unsigned int fsindex, gsindex; + unsigned int ds, cs, es; + + printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); + printk_address(regs->ip); + printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, + regs->sp, regs->flags); + printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n", + regs->ax, regs->bx, regs->cx); + printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n", + regs->dx, regs->si, regs->di); + printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n", + regs->bp, regs->r8, regs->r9); + printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n", + regs->r10, regs->r11, regs->r12); + printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: 
%016lx\n", + regs->r13, regs->r14, regs->r15); + + asm("movl %%ds,%0" : "=r" (ds)); + asm("movl %%cs,%0" : "=r" (cs)); + asm("movl %%es,%0" : "=r" (es)); + asm("movl %%fs,%0" : "=r" (fsindex)); + asm("movl %%gs,%0" : "=r" (gsindex)); + + rdmsrl(MSR_FS_BASE, fs); + rdmsrl(MSR_GS_BASE, gs); + rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); + + if (!all) + return; + + cr0 = read_cr0(); + cr2 = read_cr2(); + cr3 = read_cr3(); + cr4 = read_cr4(); + + printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", + fs, fsindex, gs, gsindex, shadowgs); + printk(KERN_DEFAULT "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, + es, cr0); + printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, + cr4); + + get_debugreg(d0, 0); + get_debugreg(d1, 1); + get_debugreg(d2, 2); + get_debugreg(d3, 3); + get_debugreg(d6, 6); + get_debugreg(d7, 7); + + /* Only print out debug registers if they are in their non-default state. */ + if ((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) && + (d6 == DR6_RESERVED) && (d7 == 0x400)) + return; + + printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2); + printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); + +} + +void release_thread(struct task_struct *dead_task) +{ + if (dead_task->mm) { + if (dead_task->mm->context.size) { + pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n", + dead_task->comm, + dead_task->mm->context.ldt, + dead_task->mm->context.size); + BUG(); + } + } +} + +static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr) +{ + struct user_desc ud = { + .base_addr = addr, + .limit = 0xfffff, + .seg_32bit = 1, + .limit_in_pages = 1, + .useable = 1, + }; + struct desc_struct *desc = t->thread.tls_array; + desc += tls; + fill_ldt(desc, &ud); +} + +static inline u32 read_32bit_tls(struct task_struct *t, int tls) +{ + return get_desc_base(&t->thread.tls_array[tls]); +} + +int copy_thread(unsigned long clone_flags, unsigned long sp, + unsigned long arg, struct task_struct *p) +{ + int err; + struct pt_regs *childregs; + struct task_struct *me = current; + + p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE; + childregs = task_pt_regs(p); + p->thread.sp = (unsigned long) childregs; + p->thread.usersp = me->thread.usersp; + set_tsk_thread_flag(p, TIF_FORK); + p->thread.fpu_counter = 0; + p->thread.io_bitmap_ptr = NULL; + + savesegment(gs, p->thread.gsindex); + p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs; + savesegment(fs, p->thread.fsindex); + p->thread.fs = p->thread.fsindex ? 
0 : me->thread.fs; + savesegment(es, p->thread.es); + savesegment(ds, p->thread.ds); + memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); + + if (unlikely(p->flags & PF_KTHREAD)) { + /* kernel thread */ + memset(childregs, 0, sizeof(struct pt_regs)); + childregs->sp = (unsigned long)childregs; + childregs->ss = __KERNEL_DS; + childregs->bx = sp; /* function */ + childregs->bp = arg; + childregs->orig_ax = -1; + childregs->cs = __KERNEL_CS | get_kernel_rpl(); + childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED; + return 0; + } + *childregs = *current_pt_regs(); + + childregs->ax = 0; + if (sp) + childregs->sp = sp; + + err = -ENOMEM; + memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); + + if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { + p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr, + IO_BITMAP_BYTES, GFP_KERNEL); + if (!p->thread.io_bitmap_ptr) { + p->thread.io_bitmap_max = 0; + return -ENOMEM; + } + set_tsk_thread_flag(p, TIF_IO_BITMAP); + } + + /* + * Set a new TLS for the child thread? + */ + if (clone_flags & CLONE_SETTLS) { +#ifdef CONFIG_IA32_EMULATION + if (test_thread_flag(TIF_IA32)) + err = do_set_thread_area(p, -1, + (struct user_desc __user *)childregs->si, 0); + else +#endif + err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); + if (err) + goto out; + } + err = 0; +out: + if (err && p->thread.io_bitmap_ptr) { + kfree(p->thread.io_bitmap_ptr); + p->thread.io_bitmap_max = 0; + } + + return err; +} + +static void +start_thread_common(struct pt_regs *regs, unsigned long new_ip, + unsigned long new_sp, + unsigned int _cs, unsigned int _ss, unsigned int _ds) +{ + loadsegment(fs, 0); + loadsegment(es, _ds); + loadsegment(ds, _ds); + load_gs_index(0); + current->thread.usersp = new_sp; + regs->ip = new_ip; + regs->sp = new_sp; + this_cpu_write(old_rsp, new_sp); + regs->cs = _cs; + regs->ss = _ss; + regs->flags = X86_EFLAGS_IF; +} + +void +start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) +{ + start_thread_common(regs, new_ip, new_sp, + __USER_CS, __USER_DS, 0); +} + +#ifdef CONFIG_IA32_EMULATION +void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp) +{ + start_thread_common(regs, new_ip, new_sp, + test_thread_flag(TIF_X32) + ? __USER_CS : __USER32_CS, + __USER_DS, __USER_DS); +} +#endif + +/* + * switch_to(x,y) should switch tasks from x to y. + * + * This could still be optimized: + * - fold all the options into a flag word and test it with a single test. + * - could test fs/gs bitsliced + * + * Kprobes not supported here. Set the probe on schedule instead. + * Function graph tracer not supported too. + */ +__visible __notrace_funcgraph struct task_struct * +__switch_to(struct task_struct *prev_p, struct task_struct *next_p) +{ + struct thread_struct *prev = &prev_p->thread; + struct thread_struct *next = &next_p->thread; + int cpu = smp_processor_id(); + struct tss_struct *tss = &per_cpu(init_tss, cpu); + unsigned fsindex, gsindex; + fpu_switch_t fpu; + + fpu = switch_fpu_prepare(prev_p, next_p, cpu); + + /* + * Reload esp0, LDT and the page table pointer: + */ + load_sp0(tss, next); + + /* + * Switch DS and ES. + * This won't pick up thread selector changes, but I guess that is ok. + */ + savesegment(es, prev->es); + if (unlikely(next->es | prev->es)) + loadsegment(es, next->es); + + savesegment(ds, prev->ds); + if (unlikely(next->ds | prev->ds)) + loadsegment(ds, next->ds); + + + /* We must save %fs and %gs before load_TLS() because + * %fs and %gs may be cleared by load_TLS(). 
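+	 * A paravirt implementation may zero the in-use selectors first,
+	 * so it can safely rewrite the TLS descriptors they reference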
+	 *
+	 * (e.g. xen_load_tls())
+	 */
+	savesegment(fs, fsindex);
+	savesegment(gs, gsindex);
+
+	load_TLS(next, cpu);
+
+	/*
+	 * Leave lazy mode, flushing any hypercalls made here.
+	 * This must be done before restoring TLS segments so
+	 * the GDT and LDT are properly updated, and must be
+	 * done before math_state_restore, so the TS bit is up
+	 * to date.
+	 */
+	arch_end_context_switch(next_p);
+
+	/*
+	 * Switch FS and GS.
+	 *
+	 * Segment register != 0 always requires a reload. Also
+	 * reload when it has changed. When prev process used 64bit
+	 * base always reload to avoid an information leak.
+	 */
+	if (unlikely(fsindex | next->fsindex | prev->fs)) {
+		loadsegment(fs, next->fsindex);
+		/*
+		 * Check if the user used a selector != 0; if yes
+		 * clear 64bit base, since overloaded base is always
+		 * mapped to the Null selector
+		 */
+		if (fsindex)
+			prev->fs = 0;
+	}
+	/* when next process has a 64bit base use it */
+	if (next->fs)
+		wrmsrl(MSR_FS_BASE, next->fs);
+	prev->fsindex = fsindex;
+
+	if (unlikely(gsindex | next->gsindex | prev->gs)) {
+		load_gs_index(next->gsindex);
+		if (gsindex)
+			prev->gs = 0;
+	}
+	if (next->gs)
+		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
+	prev->gsindex = gsindex;
+
+	switch_fpu_finish(next_p, fpu);
+
+	/*
+	 * Switch the PDA and FPU contexts.
+	 */
+	prev->usersp = this_cpu_read(old_rsp);
+	this_cpu_write(old_rsp, next->usersp);
+	this_cpu_write(current_task, next_p);
+
+	/*
+	 * If it were not for PREEMPT_ACTIVE we could guarantee that the
+	 * preempt_count of all tasks was equal here and this would not be
+	 * needed.
+	 */
+	task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count);
+	this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count);
+
+	this_cpu_write(kernel_stack,
+		  (unsigned long)task_stack_page(next_p) +
+		  THREAD_SIZE - KERNEL_STACK_OFFSET);
+
+	/*
+	 * Now maybe reload the debug registers and handle I/O bitmaps
+	 */
+	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
+		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
+		__switch_to_xtra(prev_p, next_p, tss);
+
+	return prev_p;
+}
+
+void set_personality_64bit(void)
+{
+	/* inherit personality from parent */
+
+	/* Make sure to be in 64bit mode */
+	clear_thread_flag(TIF_IA32);
+	clear_thread_flag(TIF_ADDR32);
+	clear_thread_flag(TIF_X32);
+
+	/* Ensure the corresponding mm is not marked. */
+	if (current->mm)
+		current->mm->context.ia32_compat = 0;
+
+	/* TBD: overwrites user setup. Should have two bits.
+	   But 64bit processes have always behaved this way,
+	   so it's not too bad. The main problem is just that
+	   32bit children are affected again. */
+	current->personality &= ~READ_IMPLIES_EXEC;
+}
+
+void set_personality_ia32(bool x32)
+{
+	/* inherit personality from parent */
+
+	/* Make sure to be in 32bit mode */
+	set_thread_flag(TIF_ADDR32);
+
+	/* Mark the associated mm as containing 32-bit tasks.
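+	   (ia32_compat below records which flavor: TIF_X32 for an
+	   x32 task, TIF_IA32 for a plain i386 one.)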
*/ + if (x32) { + clear_thread_flag(TIF_IA32); + set_thread_flag(TIF_X32); + if (current->mm) + current->mm->context.ia32_compat = TIF_X32; + current->personality &= ~READ_IMPLIES_EXEC; + /* is_compat_task() uses the presence of the x32 + syscall bit flag to determine compat status */ + current_thread_info()->status &= ~TS_COMPAT; + } else { + set_thread_flag(TIF_IA32); + clear_thread_flag(TIF_X32); + if (current->mm) + current->mm->context.ia32_compat = TIF_IA32; + current->personality |= force_personality32; + /* Prepare the first "return" to user space */ + current_thread_info()->status |= TS_COMPAT; + } +} +EXPORT_SYMBOL_GPL(set_personality_ia32); + +unsigned long get_wchan(struct task_struct *p) +{ + unsigned long stack; + u64 fp, ip; + int count = 0; + + if (!p || p == current || p->state == TASK_RUNNING) + return 0; + stack = (unsigned long)task_stack_page(p); + if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE) + return 0; + fp = *(u64 *)(p->thread.sp); + do { + if (fp < (unsigned long)stack || + fp >= (unsigned long)stack+THREAD_SIZE) + return 0; + ip = *(u64 *)(fp+8); + if (!in_sched_functions(ip)) + return ip; + fp = *(u64 *)fp; + } while (count++ < 16); + return 0; +} + +long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) +{ + int ret = 0; + int doit = task == current; + int cpu; + + switch (code) { + case ARCH_SET_GS: + if (addr >= TASK_SIZE_OF(task)) + return -EPERM; + cpu = get_cpu(); + /* handle small bases via the GDT because that's faster to + switch. */ + if (addr <= 0xffffffff) { + set_32bit_tls(task, GS_TLS, addr); + if (doit) { + load_TLS(&task->thread, cpu); + load_gs_index(GS_TLS_SEL); + } + task->thread.gsindex = GS_TLS_SEL; + task->thread.gs = 0; + } else { + task->thread.gsindex = 0; + task->thread.gs = addr; + if (doit) { + load_gs_index(0); + ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, addr); + } + } + put_cpu(); + break; + case ARCH_SET_FS: + /* Not strictly needed for fs, but do it for symmetry + with gs */ + if (addr >= TASK_SIZE_OF(task)) + return -EPERM; + cpu = get_cpu(); + /* handle small bases via the GDT because that's faster to + switch. */ + if (addr <= 0xffffffff) { + set_32bit_tls(task, FS_TLS, addr); + if (doit) { + load_TLS(&task->thread, cpu); + loadsegment(fs, FS_TLS_SEL); + } + task->thread.fsindex = FS_TLS_SEL; + task->thread.fs = 0; + } else { + task->thread.fsindex = 0; + task->thread.fs = addr; + if (doit) { + /* set the selector to 0 to not confuse + __switch_to */ + loadsegment(fs, 0); + ret = wrmsrl_safe(MSR_FS_BASE, addr); + } + } + put_cpu(); + break; + case ARCH_GET_FS: { + unsigned long base; + if (task->thread.fsindex == FS_TLS_SEL) + base = read_32bit_tls(task, FS_TLS); + else if (doit) + rdmsrl(MSR_FS_BASE, base); + else + base = task->thread.fs; + ret = put_user(base, (unsigned long __user *)addr); + break; + } + case ARCH_GET_GS: { + unsigned long base; + unsigned gsindex; + if (task->thread.gsindex == GS_TLS_SEL) + base = read_32bit_tls(task, GS_TLS); + else if (doit) { + savesegment(gs, gsindex); + if (gsindex) + rdmsrl(MSR_KERNEL_GS_BASE, base); + else + base = task->thread.gs; + } else + base = task->thread.gs; + ret = put_user(base, (unsigned long __user *)addr); + break; + } + + default: + ret = -EINVAL; + break; + } + + return ret; +} + +long sys_arch_prctl(int code, unsigned long addr) +{ + return do_arch_prctl(current, code, addr); +} + +unsigned long KSTK_ESP(struct task_struct *task) +{ + return (test_tsk_thread_flag(task, TIF_IA32)) ? 
+ (task_pt_regs(task)->sp) : ((task)->thread.usersp); +} diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c new file mode 100644 index 00000000000..678c0ada3b3 --- /dev/null +++ b/arch/x86/kernel/ptrace.c @@ -0,0 +1,1535 @@ +/* By Ross Biro 1/23/92 */ +/* + * Pentium III FXSR, SSE support + * Gareth Hughes <gareth@valinux.com>, May 2000 + */ + +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/smp.h> +#include <linux/errno.h> +#include <linux/slab.h> +#include <linux/ptrace.h> +#include <linux/regset.h> +#include <linux/tracehook.h> +#include <linux/user.h> +#include <linux/elf.h> +#include <linux/security.h> +#include <linux/audit.h> +#include <linux/seccomp.h> +#include <linux/signal.h> +#include <linux/perf_event.h> +#include <linux/hw_breakpoint.h> +#include <linux/rcupdate.h> +#include <linux/export.h> +#include <linux/context_tracking.h> + +#include <asm/uaccess.h> +#include <asm/pgtable.h> +#include <asm/processor.h> +#include <asm/i387.h> +#include <asm/fpu-internal.h> +#include <asm/debugreg.h> +#include <asm/ldt.h> +#include <asm/desc.h> +#include <asm/prctl.h> +#include <asm/proto.h> +#include <asm/hw_breakpoint.h> +#include <asm/traps.h> + +#include "tls.h" + +#define CREATE_TRACE_POINTS +#include <trace/events/syscalls.h> + +enum x86_regset { + REGSET_GENERAL, + REGSET_FP, + REGSET_XFP, + REGSET_IOPERM64 = REGSET_XFP, + REGSET_XSTATE, + REGSET_TLS, + REGSET_IOPERM32, +}; + +struct pt_regs_offset { + const char *name; + int offset; +}; + +#define REG_OFFSET_NAME(r) {.name = #r, .offset = offsetof(struct pt_regs, r)} +#define REG_OFFSET_END {.name = NULL, .offset = 0} + +static const struct pt_regs_offset regoffset_table[] = { +#ifdef CONFIG_X86_64 + REG_OFFSET_NAME(r15), + REG_OFFSET_NAME(r14), + REG_OFFSET_NAME(r13), + REG_OFFSET_NAME(r12), + REG_OFFSET_NAME(r11), + REG_OFFSET_NAME(r10), + REG_OFFSET_NAME(r9), + REG_OFFSET_NAME(r8), +#endif + REG_OFFSET_NAME(bx), + REG_OFFSET_NAME(cx), + REG_OFFSET_NAME(dx), + REG_OFFSET_NAME(si), + REG_OFFSET_NAME(di), + REG_OFFSET_NAME(bp), + REG_OFFSET_NAME(ax), +#ifdef CONFIG_X86_32 + REG_OFFSET_NAME(ds), + REG_OFFSET_NAME(es), + REG_OFFSET_NAME(fs), + REG_OFFSET_NAME(gs), +#endif + REG_OFFSET_NAME(orig_ax), + REG_OFFSET_NAME(ip), + REG_OFFSET_NAME(cs), + REG_OFFSET_NAME(flags), + REG_OFFSET_NAME(sp), + REG_OFFSET_NAME(ss), + REG_OFFSET_END, +}; + +/** + * regs_query_register_offset() - query register offset from its name + * @name: the name of a register + * + * regs_query_register_offset() returns the offset of a register in struct + * pt_regs from its name. If the name is invalid, this returns -EINVAL; + */ +int regs_query_register_offset(const char *name) +{ + const struct pt_regs_offset *roff; + for (roff = regoffset_table; roff->name != NULL; roff++) + if (!strcmp(roff->name, name)) + return roff->offset; + return -EINVAL; +} + +/** + * regs_query_register_name() - query register name from its offset + * @offset: the offset of a register in struct pt_regs. + * + * regs_query_register_name() returns the name of a register from its + * offset in struct pt_regs. 
If the @offset is invalid, this returns NULL;
+ */
+const char *regs_query_register_name(unsigned int offset)
+{
+	const struct pt_regs_offset *roff;
+	for (roff = regoffset_table; roff->name != NULL; roff++)
+		if (roff->offset == offset)
+			return roff->name;
+	return NULL;
+}
+
+static const int arg_offs_table[] = {
+#ifdef CONFIG_X86_32
+	[0] = offsetof(struct pt_regs, ax),
+	[1] = offsetof(struct pt_regs, dx),
+	[2] = offsetof(struct pt_regs, cx)
+#else /* CONFIG_X86_64 */
+	[0] = offsetof(struct pt_regs, di),
+	[1] = offsetof(struct pt_regs, si),
+	[2] = offsetof(struct pt_regs, dx),
+	[3] = offsetof(struct pt_regs, cx),
+	[4] = offsetof(struct pt_regs, r8),
+	[5] = offsetof(struct pt_regs, r9)
+#endif
+};
+
+/*
+ * does not yet catch signals sent when the child dies.
+ * in exit.c or in signal.c.
+ */
+
+/*
+ * Determines which flags the user has access to [1 = access, 0 = no access].
+ */
+#define FLAG_MASK_32		((unsigned long)			\
+				 (X86_EFLAGS_CF | X86_EFLAGS_PF |	\
+				  X86_EFLAGS_AF | X86_EFLAGS_ZF |	\
+				  X86_EFLAGS_SF | X86_EFLAGS_TF |	\
+				  X86_EFLAGS_DF | X86_EFLAGS_OF |	\
+				  X86_EFLAGS_RF | X86_EFLAGS_AC))
+
+/*
+ * Determines whether a value may be installed in a segment register.
+ */
+static inline bool invalid_selector(u16 value)
+{
+	return unlikely(value != 0 && (value & SEGMENT_RPL_MASK) != USER_RPL);
+}
+
+#ifdef CONFIG_X86_32
+
+#define FLAG_MASK		FLAG_MASK_32
+
+/*
+ * X86_32 CPUs don't save ss and esp if the CPU is already in kernel mode
+ * when it traps. The previous stack will be directly underneath the saved
+ * registers, and 'sp/ss' won't even have been saved. Thus the '&regs->sp'.
+ *
+ * Now, if the stack is empty, '&regs->sp' is out of range. In this
+ * case we try to take the previous stack. To always return a non-null
+ * stack pointer we fall back to regs as stack if no previous stack
+ * exists.
+ *
+ * This is valid only for kernel mode traps.
+ */
+unsigned long kernel_stack_pointer(struct pt_regs *regs)
+{
+	unsigned long context = (unsigned long)regs & ~(THREAD_SIZE - 1);
+	unsigned long sp = (unsigned long)&regs->sp;
+	u32 *prev_esp;
+
+	if (context == (sp & ~(THREAD_SIZE - 1)))
+		return sp;
+
+	prev_esp = (u32 *)(context);
+	if (prev_esp)
+		return (unsigned long)prev_esp;
+
+	return (unsigned long)regs;
+}
+EXPORT_SYMBOL_GPL(kernel_stack_pointer);
+
+static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long regno)
+{
+	BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0);
+	return &regs->bx + (regno >> 2);
+}
+
+static u16 get_segment_reg(struct task_struct *task, unsigned long offset)
+{
+	/*
+	 * Returning the value truncates it to 16 bits.
+	 */
+	unsigned int retval;
+	if (offset != offsetof(struct user_regs_struct, gs))
+		retval = *pt_regs_access(task_pt_regs(task), offset);
+	else {
+		if (task == current)
+			retval = get_user_gs(task_pt_regs(task));
+		else
+			retval = task_user_gs(task);
+	}
+	return retval;
+}
+
+static int set_segment_reg(struct task_struct *task,
+			   unsigned long offset, u16 value)
+{
+	/*
+	 * The value argument was already truncated to 16 bits.
+	 */
+	if (invalid_selector(value))
+		return -EIO;
+
+	/*
+	 * For %cs and %ss we cannot permit a null selector.
+	 * We can permit a bogus selector as long as it has USER_RPL.
+	 * Null selectors are fine for other segment registers, but
+	 * we will never get back to user mode with invalid %cs or %ss
+	 * and will take the trap in iret instead. Much code relies
+	 * on user_mode() to distinguish a user trap frame (which can
+	 * safely use invalid selectors) from a kernel trap frame.
+	 */
+	switch (offset) {
+	case offsetof(struct user_regs_struct, cs):
+	case offsetof(struct user_regs_struct, ss):
+		if (unlikely(value == 0))
+			return -EIO;
+
+	default:
+		*pt_regs_access(task_pt_regs(task), offset) = value;
+		break;
+
+	case offsetof(struct user_regs_struct, gs):
+		if (task == current)
+			set_user_gs(task_pt_regs(task), value);
+		else
+			task_user_gs(task) = value;
+	}
+
+	return 0;
+}
+
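+/*
+ * So, for example, a debugger poking a null selector into the child:
+ *
+ *	ptrace(PTRACE_POKEUSR, pid,
+ *	       offsetof(struct user_regs_struct, cs), 0);
+ *
+ * fails with -EIO, while the same poke aimed at %es succeeds, since a
+ * null selector is only fatal for %cs and %ss.
+ */
+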
+#else /* CONFIG_X86_64 */
+
+#define FLAG_MASK		(FLAG_MASK_32 | X86_EFLAGS_NT)
+
+static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long offset)
+{
+	BUILD_BUG_ON(offsetof(struct pt_regs, r15) != 0);
+	return &regs->r15 + (offset / sizeof(regs->r15));
+}
+
+static u16 get_segment_reg(struct task_struct *task, unsigned long offset)
+{
+	/*
+	 * Returning the value truncates it to 16 bits.
+	 */
+	unsigned int seg;
+
+	switch (offset) {
+	case offsetof(struct user_regs_struct, fs):
+		if (task == current) {
+			/* Older gas can't assemble movq %?s,%r?? */
+			asm("movl %%fs,%0" : "=r" (seg));
+			return seg;
+		}
+		return task->thread.fsindex;
+	case offsetof(struct user_regs_struct, gs):
+		if (task == current) {
+			asm("movl %%gs,%0" : "=r" (seg));
+			return seg;
+		}
+		return task->thread.gsindex;
+	case offsetof(struct user_regs_struct, ds):
+		if (task == current) {
+			asm("movl %%ds,%0" : "=r" (seg));
+			return seg;
+		}
+		return task->thread.ds;
+	case offsetof(struct user_regs_struct, es):
+		if (task == current) {
+			asm("movl %%es,%0" : "=r" (seg));
+			return seg;
+		}
+		return task->thread.es;
+
+	case offsetof(struct user_regs_struct, cs):
+	case offsetof(struct user_regs_struct, ss):
+		break;
+	}
+	return *pt_regs_access(task_pt_regs(task), offset);
+}
+
+static int set_segment_reg(struct task_struct *task,
+			   unsigned long offset, u16 value)
+{
+	/*
+	 * The value argument was already truncated to 16 bits.
+	 */
+	if (invalid_selector(value))
+		return -EIO;
+
+	switch (offset) {
+	case offsetof(struct user_regs_struct,fs):
+		/*
+		 * If this is setting fs as for normal 64-bit use but
+		 * setting fs_base has implicitly changed it, leave it.
+		 */
+		if ((value == FS_TLS_SEL && task->thread.fsindex == 0 &&
+		     task->thread.fs != 0) ||
+		    (value == 0 && task->thread.fsindex == FS_TLS_SEL &&
+		     task->thread.fs == 0))
+			break;
+		task->thread.fsindex = value;
+		if (task == current)
+			loadsegment(fs, task->thread.fsindex);
+		break;
+	case offsetof(struct user_regs_struct,gs):
+		/*
+		 * If this is setting gs as for normal 64-bit use but
+		 * setting gs_base has implicitly changed it, leave it.
+		 */
+		if ((value == GS_TLS_SEL && task->thread.gsindex == 0 &&
+		     task->thread.gs != 0) ||
+		    (value == 0 && task->thread.gsindex == GS_TLS_SEL &&
+		     task->thread.gs == 0))
+			break;
+		task->thread.gsindex = value;
+		if (task == current)
+			load_gs_index(task->thread.gsindex);
+		break;
+	case offsetof(struct user_regs_struct,ds):
+		task->thread.ds = value;
+		if (task == current)
+			loadsegment(ds, task->thread.ds);
+		break;
+	case offsetof(struct user_regs_struct,es):
+		task->thread.es = value;
+		if (task == current)
+			loadsegment(es, task->thread.es);
+		break;
+
+		/*
+		 * Can't actually change these in 64-bit mode.
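+		 * For an IA32 task, though, the new selector is stored
+		 * into the saved pt_regs below, so the child resumes
+		 * with the value its 32-bit debugger wrote.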
+ */ + case offsetof(struct user_regs_struct,cs): + if (unlikely(value == 0)) + return -EIO; +#ifdef CONFIG_IA32_EMULATION + if (test_tsk_thread_flag(task, TIF_IA32)) + task_pt_regs(task)->cs = value; +#endif + break; + case offsetof(struct user_regs_struct,ss): + if (unlikely(value == 0)) + return -EIO; +#ifdef CONFIG_IA32_EMULATION + if (test_tsk_thread_flag(task, TIF_IA32)) + task_pt_regs(task)->ss = value; +#endif + break; + } + + return 0; +} + +#endif /* CONFIG_X86_32 */ + +static unsigned long get_flags(struct task_struct *task) +{ + unsigned long retval = task_pt_regs(task)->flags; + + /* + * If the debugger set TF, hide it from the readout. + */ + if (test_tsk_thread_flag(task, TIF_FORCED_TF)) + retval &= ~X86_EFLAGS_TF; + + return retval; +} + +static int set_flags(struct task_struct *task, unsigned long value) +{ + struct pt_regs *regs = task_pt_regs(task); + + /* + * If the user value contains TF, mark that + * it was not "us" (the debugger) that set it. + * If not, make sure it stays set if we had. + */ + if (value & X86_EFLAGS_TF) + clear_tsk_thread_flag(task, TIF_FORCED_TF); + else if (test_tsk_thread_flag(task, TIF_FORCED_TF)) + value |= X86_EFLAGS_TF; + + regs->flags = (regs->flags & ~FLAG_MASK) | (value & FLAG_MASK); + + return 0; +} + +static int putreg(struct task_struct *child, + unsigned long offset, unsigned long value) +{ + switch (offset) { + case offsetof(struct user_regs_struct, cs): + case offsetof(struct user_regs_struct, ds): + case offsetof(struct user_regs_struct, es): + case offsetof(struct user_regs_struct, fs): + case offsetof(struct user_regs_struct, gs): + case offsetof(struct user_regs_struct, ss): + return set_segment_reg(child, offset, value); + + case offsetof(struct user_regs_struct, flags): + return set_flags(child, value); + +#ifdef CONFIG_X86_64 + case offsetof(struct user_regs_struct,fs_base): + if (value >= TASK_SIZE_OF(child)) + return -EIO; + /* + * When changing the segment base, use do_arch_prctl + * to set either thread.fs or thread.fsindex and the + * corresponding GDT slot. + */ + if (child->thread.fs != value) + return do_arch_prctl(child, ARCH_SET_FS, value); + return 0; + case offsetof(struct user_regs_struct,gs_base): + /* + * Exactly the same here as the %fs handling above. + */ + if (value >= TASK_SIZE_OF(child)) + return -EIO; + if (child->thread.gs != value) + return do_arch_prctl(child, ARCH_SET_GS, value); + return 0; +#endif + } + + *pt_regs_access(task_pt_regs(child), offset) = value; + return 0; +} + +static unsigned long getreg(struct task_struct *task, unsigned long offset) +{ + switch (offset) { + case offsetof(struct user_regs_struct, cs): + case offsetof(struct user_regs_struct, ds): + case offsetof(struct user_regs_struct, es): + case offsetof(struct user_regs_struct, fs): + case offsetof(struct user_regs_struct, gs): + case offsetof(struct user_regs_struct, ss): + return get_segment_reg(task, offset); + + case offsetof(struct user_regs_struct, flags): + return get_flags(task); + +#ifdef CONFIG_X86_64 + case offsetof(struct user_regs_struct, fs_base): { + /* + * do_arch_prctl may have used a GDT slot instead of + * the MSR. To userland, it appears the same either + * way, except the %fs segment selector might not be 0. 
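+		 * For example, after arch_prctl(ARCH_SET_FS, addr) with
+		 * a 32-bit addr, thread.fs stays 0 and the base lives in
+		 * the FS_TLS GDT slot, so it is read back here via
+		 * get_desc_base().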
+	 */
+		unsigned int seg = task->thread.fsindex;
+		if (task->thread.fs != 0)
+			return task->thread.fs;
+		if (task == current)
+			asm("movl %%fs,%0" : "=r" (seg));
+		if (seg != FS_TLS_SEL)
+			return 0;
+		return get_desc_base(&task->thread.tls_array[FS_TLS]);
+	}
+	case offsetof(struct user_regs_struct, gs_base): {
+		/*
+		 * Exactly the same here as the %fs handling above.
+		 */
+		unsigned int seg = task->thread.gsindex;
+		if (task->thread.gs != 0)
+			return task->thread.gs;
+		if (task == current)
+			asm("movl %%gs,%0" : "=r" (seg));
+		if (seg != GS_TLS_SEL)
+			return 0;
+		return get_desc_base(&task->thread.tls_array[GS_TLS]);
+	}
+#endif
+	}
+
+	return *pt_regs_access(task_pt_regs(task), offset);
+}
+
+static int genregs_get(struct task_struct *target,
+		       const struct user_regset *regset,
+		       unsigned int pos, unsigned int count,
+		       void *kbuf, void __user *ubuf)
+{
+	if (kbuf) {
+		unsigned long *k = kbuf;
+		while (count >= sizeof(*k)) {
+			*k++ = getreg(target, pos);
+			count -= sizeof(*k);
+			pos += sizeof(*k);
+		}
+	} else {
+		unsigned long __user *u = ubuf;
+		while (count >= sizeof(*u)) {
+			if (__put_user(getreg(target, pos), u++))
+				return -EFAULT;
+			count -= sizeof(*u);
+			pos += sizeof(*u);
+		}
+	}
+
+	return 0;
+}
+
+static int genregs_set(struct task_struct *target,
+		       const struct user_regset *regset,
+		       unsigned int pos, unsigned int count,
+		       const void *kbuf, const void __user *ubuf)
+{
+	int ret = 0;
+	if (kbuf) {
+		const unsigned long *k = kbuf;
+		while (count >= sizeof(*k) && !ret) {
+			ret = putreg(target, pos, *k++);
+			count -= sizeof(*k);
+			pos += sizeof(*k);
+		}
+	} else {
+		const unsigned long __user *u = ubuf;
+		while (count >= sizeof(*u) && !ret) {
+			unsigned long word;
+			ret = __get_user(word, u++);
+			if (ret)
+				break;
+			ret = putreg(target, pos, word);
+			count -= sizeof(*u);
+			pos += sizeof(*u);
+		}
+	}
+	return ret;
+}
+
+static void ptrace_triggered(struct perf_event *bp,
+			     struct perf_sample_data *data,
+			     struct pt_regs *regs)
+{
+	int i;
+	struct thread_struct *thread = &(current->thread);
+
+	/*
+	 * Store in the virtual DR6 register the fact that the breakpoint
+	 * was hit so the thread's debugger will see it.
+	 */
+	for (i = 0; i < HBP_NUM; i++) {
+		if (thread->ptrace_bps[i] == bp)
+			break;
+	}
+
+	thread->debugreg6 |= (DR_TRAP0 << i);
+}
+
+/*
+ * Walk through all ptrace breakpoints for this thread and
+ * build the dr7 value on top of their attributes.
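+ * For example, one enabled write-watchpoint in slot 1 contributes its
+ * slot-1 enable bit plus the length/type nibble for that slot; disabled
+ * slots contribute nothing.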
+ * + */ +static unsigned long ptrace_get_dr7(struct perf_event *bp[]) +{ + int i; + int dr7 = 0; + struct arch_hw_breakpoint *info; + + for (i = 0; i < HBP_NUM; i++) { + if (bp[i] && !bp[i]->attr.disabled) { + info = counter_arch_bp(bp[i]); + dr7 |= encode_dr7(i, info->len, info->type); + } + } + + return dr7; +} + +static int ptrace_fill_bp_fields(struct perf_event_attr *attr, + int len, int type, bool disabled) +{ + int err, bp_len, bp_type; + + err = arch_bp_generic_fields(len, type, &bp_len, &bp_type); + if (!err) { + attr->bp_len = bp_len; + attr->bp_type = bp_type; + attr->disabled = disabled; + } + + return err; +} + +static struct perf_event * +ptrace_register_breakpoint(struct task_struct *tsk, int len, int type, + unsigned long addr, bool disabled) +{ + struct perf_event_attr attr; + int err; + + ptrace_breakpoint_init(&attr); + attr.bp_addr = addr; + + err = ptrace_fill_bp_fields(&attr, len, type, disabled); + if (err) + return ERR_PTR(err); + + return register_user_hw_breakpoint(&attr, ptrace_triggered, + NULL, tsk); +} + +static int ptrace_modify_breakpoint(struct perf_event *bp, int len, int type, + int disabled) +{ + struct perf_event_attr attr = bp->attr; + int err; + + err = ptrace_fill_bp_fields(&attr, len, type, disabled); + if (err) + return err; + + return modify_user_hw_breakpoint(bp, &attr); +} + +/* + * Handle ptrace writes to debug register 7. + */ +static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data) +{ + struct thread_struct *thread = &tsk->thread; + unsigned long old_dr7; + bool second_pass = false; + int i, rc, ret = 0; + + data &= ~DR_CONTROL_RESERVED; + old_dr7 = ptrace_get_dr7(thread->ptrace_bps); + +restore: + rc = 0; + for (i = 0; i < HBP_NUM; i++) { + unsigned len, type; + bool disabled = !decode_dr7(data, i, &len, &type); + struct perf_event *bp = thread->ptrace_bps[i]; + + if (!bp) { + if (disabled) + continue; + + bp = ptrace_register_breakpoint(tsk, + len, type, 0, disabled); + if (IS_ERR(bp)) { + rc = PTR_ERR(bp); + break; + } + + thread->ptrace_bps[i] = bp; + continue; + } + + rc = ptrace_modify_breakpoint(bp, len, type, disabled); + if (rc) + break; + } + + /* Restore if the first pass failed, second_pass shouldn't fail. */ + if (rc && !WARN_ON(second_pass)) { + ret = rc; + data = old_dr7; + second_pass = true; + goto restore; + } + + return ret; +} + +/* + * Handle PTRACE_PEEKUSR calls for the debug register area. + */ +static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n) +{ + struct thread_struct *thread = &tsk->thread; + unsigned long val = 0; + + if (n < HBP_NUM) { + struct perf_event *bp = thread->ptrace_bps[n]; + + if (bp) + val = bp->hw.info.address; + } else if (n == 6) { + val = thread->debugreg6; + } else if (n == 7) { + val = thread->ptrace_dr7; + } + return val; +} + +static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, + unsigned long addr) +{ + struct thread_struct *t = &tsk->thread; + struct perf_event *bp = t->ptrace_bps[nr]; + int err = 0; + + if (!bp) { + /* + * Put stub len and type to create an inactive but correct bp. + * + * CHECKME: the previous code returned -EIO if the addr wasn't + * a valid task virtual addr. The new one will return -EINVAL in + * this case. + * -EINVAL may be what we want for in-kernel breakpoints users, + * but -EIO looks better for ptrace, since we refuse a register + * writing for the user. And anyway this is the previous + * behaviour. 
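+		 * In the usual flow a debugger pokes u_debugreg[n] with
+		 * the address first (landing here and creating the
+		 * disabled stub), then enables it by writing u_debugreg[7].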
+ */ + bp = ptrace_register_breakpoint(tsk, + X86_BREAKPOINT_LEN_1, X86_BREAKPOINT_WRITE, + addr, true); + if (IS_ERR(bp)) + err = PTR_ERR(bp); + else + t->ptrace_bps[nr] = bp; + } else { + struct perf_event_attr attr = bp->attr; + + attr.bp_addr = addr; + err = modify_user_hw_breakpoint(bp, &attr); + } + + return err; +} + +/* + * Handle PTRACE_POKEUSR calls for the debug register area. + */ +static int ptrace_set_debugreg(struct task_struct *tsk, int n, + unsigned long val) +{ + struct thread_struct *thread = &tsk->thread; + /* There are no DR4 or DR5 registers */ + int rc = -EIO; + + if (n < HBP_NUM) { + rc = ptrace_set_breakpoint_addr(tsk, n, val); + } else if (n == 6) { + thread->debugreg6 = val; + rc = 0; + } else if (n == 7) { + rc = ptrace_write_dr7(tsk, val); + if (!rc) + thread->ptrace_dr7 = val; + } + return rc; +} + +/* + * These access the current or another (stopped) task's io permission + * bitmap for debugging or core dump. + */ +static int ioperm_active(struct task_struct *target, + const struct user_regset *regset) +{ + return target->thread.io_bitmap_max / regset->size; +} + +static int ioperm_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + if (!target->thread.io_bitmap_ptr) + return -ENXIO; + + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, + target->thread.io_bitmap_ptr, + 0, IO_BITMAP_BYTES); +} + +/* + * Called by kernel/ptrace.c when detaching.. + * + * Make sure the single step bit is not set. + */ +void ptrace_disable(struct task_struct *child) +{ + user_disable_single_step(child); +#ifdef TIF_SYSCALL_EMU + clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); +#endif +} + +#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION +static const struct user_regset_view user_x86_32_view; /* Initialized below. */ +#endif + +long arch_ptrace(struct task_struct *child, long request, + unsigned long addr, unsigned long data) +{ + int ret; + unsigned long __user *datap = (unsigned long __user *)data; + + switch (request) { + /* read the word at location addr in the USER area. */ + case PTRACE_PEEKUSR: { + unsigned long tmp; + + ret = -EIO; + if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user)) + break; + + tmp = 0; /* Default return condition */ + if (addr < sizeof(struct user_regs_struct)) + tmp = getreg(child, addr); + else if (addr >= offsetof(struct user, u_debugreg[0]) && + addr <= offsetof(struct user, u_debugreg[7])) { + addr -= offsetof(struct user, u_debugreg[0]); + tmp = ptrace_get_debugreg(child, addr / sizeof(data)); + } + ret = put_user(tmp, datap); + break; + } + + case PTRACE_POKEUSR: /* write the word at location addr in the USER area */ + ret = -EIO; + if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user)) + break; + + if (addr < sizeof(struct user_regs_struct)) + ret = putreg(child, addr, data); + else if (addr >= offsetof(struct user, u_debugreg[0]) && + addr <= offsetof(struct user, u_debugreg[7])) { + addr -= offsetof(struct user, u_debugreg[0]); + ret = ptrace_set_debugreg(child, + addr / sizeof(data), data); + } + break; + + case PTRACE_GETREGS: /* Get all gp regs from the child. */ + return copy_regset_to_user(child, + task_user_regset_view(current), + REGSET_GENERAL, + 0, sizeof(struct user_regs_struct), + datap); + + case PTRACE_SETREGS: /* Set all gp regs in the child. 
*/ + return copy_regset_from_user(child, + task_user_regset_view(current), + REGSET_GENERAL, + 0, sizeof(struct user_regs_struct), + datap); + + case PTRACE_GETFPREGS: /* Get the child FPU state. */ + return copy_regset_to_user(child, + task_user_regset_view(current), + REGSET_FP, + 0, sizeof(struct user_i387_struct), + datap); + + case PTRACE_SETFPREGS: /* Set the child FPU state. */ + return copy_regset_from_user(child, + task_user_regset_view(current), + REGSET_FP, + 0, sizeof(struct user_i387_struct), + datap); + +#ifdef CONFIG_X86_32 + case PTRACE_GETFPXREGS: /* Get the child extended FPU state. */ + return copy_regset_to_user(child, &user_x86_32_view, + REGSET_XFP, + 0, sizeof(struct user_fxsr_struct), + datap) ? -EIO : 0; + + case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */ + return copy_regset_from_user(child, &user_x86_32_view, + REGSET_XFP, + 0, sizeof(struct user_fxsr_struct), + datap) ? -EIO : 0; +#endif + +#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION + case PTRACE_GET_THREAD_AREA: + if ((int) addr < 0) + return -EIO; + ret = do_get_thread_area(child, addr, + (struct user_desc __user *)data); + break; + + case PTRACE_SET_THREAD_AREA: + if ((int) addr < 0) + return -EIO; + ret = do_set_thread_area(child, addr, + (struct user_desc __user *)data, 0); + break; +#endif + +#ifdef CONFIG_X86_64 + /* normal 64bit interface to access TLS data. + Works just like arch_prctl, except that the arguments + are reversed. */ + case PTRACE_ARCH_PRCTL: + ret = do_arch_prctl(child, data, addr); + break; +#endif + + default: + ret = ptrace_request(child, request, addr, data); + break; + } + + return ret; +} + +#ifdef CONFIG_IA32_EMULATION + +#include <linux/compat.h> +#include <linux/syscalls.h> +#include <asm/ia32.h> +#include <asm/user32.h> + +#define R32(l,q) \ + case offsetof(struct user32, regs.l): \ + regs->q = value; break + +#define SEG32(rs) \ + case offsetof(struct user32, regs.rs): \ + return set_segment_reg(child, \ + offsetof(struct user_regs_struct, rs), \ + value); \ + break + +static int putreg32(struct task_struct *child, unsigned regno, u32 value) +{ + struct pt_regs *regs = task_pt_regs(child); + + switch (regno) { + + SEG32(cs); + SEG32(ds); + SEG32(es); + SEG32(fs); + SEG32(gs); + SEG32(ss); + + R32(ebx, bx); + R32(ecx, cx); + R32(edx, dx); + R32(edi, di); + R32(esi, si); + R32(ebp, bp); + R32(eax, ax); + R32(eip, ip); + R32(esp, sp); + + case offsetof(struct user32, regs.orig_eax): + /* + * A 32-bit debugger setting orig_eax means to restore + * the state of the task restarting a 32-bit syscall. + * Make sure we interpret the -ERESTART* codes correctly + * in case the task is not actually still sitting at the + * exit from a 32-bit syscall with TS_COMPAT still set. + */ + regs->orig_ax = value; + if (syscall_get_nr(child, regs) >= 0) + task_thread_info(child)->status |= TS_COMPAT; + break; + + case offsetof(struct user32, regs.eflags): + return set_flags(child, value); + + case offsetof(struct user32, u_debugreg[0]) ... 
+ offsetof(struct user32, u_debugreg[7]): + regno -= offsetof(struct user32, u_debugreg[0]); + return ptrace_set_debugreg(child, regno / 4, value); + + default: + if (regno > sizeof(struct user32) || (regno & 3)) + return -EIO; + + /* + * Other dummy fields in the virtual user structure + * are ignored + */ + break; + } + return 0; +} + +#undef R32 +#undef SEG32 + +#define R32(l,q) \ + case offsetof(struct user32, regs.l): \ + *val = regs->q; break + +#define SEG32(rs) \ + case offsetof(struct user32, regs.rs): \ + *val = get_segment_reg(child, \ + offsetof(struct user_regs_struct, rs)); \ + break + +static int getreg32(struct task_struct *child, unsigned regno, u32 *val) +{ + struct pt_regs *regs = task_pt_regs(child); + + switch (regno) { + + SEG32(ds); + SEG32(es); + SEG32(fs); + SEG32(gs); + + R32(cs, cs); + R32(ss, ss); + R32(ebx, bx); + R32(ecx, cx); + R32(edx, dx); + R32(edi, di); + R32(esi, si); + R32(ebp, bp); + R32(eax, ax); + R32(orig_eax, orig_ax); + R32(eip, ip); + R32(esp, sp); + + case offsetof(struct user32, regs.eflags): + *val = get_flags(child); + break; + + case offsetof(struct user32, u_debugreg[0]) ... + offsetof(struct user32, u_debugreg[7]): + regno -= offsetof(struct user32, u_debugreg[0]); + *val = ptrace_get_debugreg(child, regno / 4); + break; + + default: + if (regno > sizeof(struct user32) || (regno & 3)) + return -EIO; + + /* + * Other dummy fields in the virtual user structure + * are ignored + */ + *val = 0; + break; + } + return 0; +} + +#undef R32 +#undef SEG32 + +static int genregs32_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + if (kbuf) { + compat_ulong_t *k = kbuf; + while (count >= sizeof(*k)) { + getreg32(target, pos, k++); + count -= sizeof(*k); + pos += sizeof(*k); + } + } else { + compat_ulong_t __user *u = ubuf; + while (count >= sizeof(*u)) { + compat_ulong_t word; + getreg32(target, pos, &word); + if (__put_user(word, u++)) + return -EFAULT; + count -= sizeof(*u); + pos += sizeof(*u); + } + } + + return 0; +} + +static int genregs32_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret = 0; + if (kbuf) { + const compat_ulong_t *k = kbuf; + while (count >= sizeof(*k) && !ret) { + ret = putreg32(target, pos, *k++); + count -= sizeof(*k); + pos += sizeof(*k); + } + } else { + const compat_ulong_t __user *u = ubuf; + while (count >= sizeof(*u) && !ret) { + compat_ulong_t word; + ret = __get_user(word, u++); + if (ret) + break; + ret = putreg32(target, pos, word); + count -= sizeof(*u); + pos += sizeof(*u); + } + } + return ret; +} + +#ifdef CONFIG_X86_X32_ABI +static long x32_arch_ptrace(struct task_struct *child, + compat_long_t request, compat_ulong_t caddr, + compat_ulong_t cdata) +{ + unsigned long addr = caddr; + unsigned long data = cdata; + void __user *datap = compat_ptr(data); + int ret; + + switch (request) { + /* Read 32bits at location addr in the USER area. Only allow + to return the lower 32bits of segment and debug registers. 
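+	   Wider registers would be truncated here, so an x32 debugger
+	   reads them with PTRACE_GETREGS instead.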
*/ + case PTRACE_PEEKUSR: { + u32 tmp; + + ret = -EIO; + if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user) || + addr < offsetof(struct user_regs_struct, cs)) + break; + + tmp = 0; /* Default return condition */ + if (addr < sizeof(struct user_regs_struct)) + tmp = getreg(child, addr); + else if (addr >= offsetof(struct user, u_debugreg[0]) && + addr <= offsetof(struct user, u_debugreg[7])) { + addr -= offsetof(struct user, u_debugreg[0]); + tmp = ptrace_get_debugreg(child, addr / sizeof(data)); + } + ret = put_user(tmp, (__u32 __user *)datap); + break; + } + + /* Write the word at location addr in the USER area. Only allow + to update segment and debug registers with the upper 32bits + zero-extended. */ + case PTRACE_POKEUSR: + ret = -EIO; + if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user) || + addr < offsetof(struct user_regs_struct, cs)) + break; + + if (addr < sizeof(struct user_regs_struct)) + ret = putreg(child, addr, data); + else if (addr >= offsetof(struct user, u_debugreg[0]) && + addr <= offsetof(struct user, u_debugreg[7])) { + addr -= offsetof(struct user, u_debugreg[0]); + ret = ptrace_set_debugreg(child, + addr / sizeof(data), data); + } + break; + + case PTRACE_GETREGS: /* Get all gp regs from the child. */ + return copy_regset_to_user(child, + task_user_regset_view(current), + REGSET_GENERAL, + 0, sizeof(struct user_regs_struct), + datap); + + case PTRACE_SETREGS: /* Set all gp regs in the child. */ + return copy_regset_from_user(child, + task_user_regset_view(current), + REGSET_GENERAL, + 0, sizeof(struct user_regs_struct), + datap); + + case PTRACE_GETFPREGS: /* Get the child FPU state. */ + return copy_regset_to_user(child, + task_user_regset_view(current), + REGSET_FP, + 0, sizeof(struct user_i387_struct), + datap); + + case PTRACE_SETFPREGS: /* Set the child FPU state. */ + return copy_regset_from_user(child, + task_user_regset_view(current), + REGSET_FP, + 0, sizeof(struct user_i387_struct), + datap); + + default: + return compat_ptrace_request(child, request, addr, data); + } + + return ret; +} +#endif + +long compat_arch_ptrace(struct task_struct *child, compat_long_t request, + compat_ulong_t caddr, compat_ulong_t cdata) +{ + unsigned long addr = caddr; + unsigned long data = cdata; + void __user *datap = compat_ptr(data); + int ret; + __u32 val; + +#ifdef CONFIG_X86_X32_ABI + if (!is_ia32_task()) + return x32_arch_ptrace(child, request, caddr, cdata); +#endif + + switch (request) { + case PTRACE_PEEKUSR: + ret = getreg32(child, addr, &val); + if (ret == 0) + ret = put_user(val, (__u32 __user *)datap); + break; + + case PTRACE_POKEUSR: + ret = putreg32(child, addr, data); + break; + + case PTRACE_GETREGS: /* Get all gp regs from the child. */ + return copy_regset_to_user(child, &user_x86_32_view, + REGSET_GENERAL, + 0, sizeof(struct user_regs_struct32), + datap); + + case PTRACE_SETREGS: /* Set all gp regs in the child. */ + return copy_regset_from_user(child, &user_x86_32_view, + REGSET_GENERAL, 0, + sizeof(struct user_regs_struct32), + datap); + + case PTRACE_GETFPREGS: /* Get the child FPU state. */ + return copy_regset_to_user(child, &user_x86_32_view, + REGSET_FP, 0, + sizeof(struct user_i387_ia32_struct), + datap); + + case PTRACE_SETFPREGS: /* Set the child FPU state. */ + return copy_regset_from_user( + child, &user_x86_32_view, REGSET_FP, + 0, sizeof(struct user_i387_ia32_struct), datap); + + case PTRACE_GETFPXREGS: /* Get the child extended FPU state. 
*/ + return copy_regset_to_user(child, &user_x86_32_view, + REGSET_XFP, 0, + sizeof(struct user32_fxsr_struct), + datap); + + case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */ + return copy_regset_from_user(child, &user_x86_32_view, + REGSET_XFP, 0, + sizeof(struct user32_fxsr_struct), + datap); + + case PTRACE_GET_THREAD_AREA: + case PTRACE_SET_THREAD_AREA: + return arch_ptrace(child, request, addr, data); + + default: + return compat_ptrace_request(child, request, addr, data); + } + + return ret; +} + +#endif /* CONFIG_IA32_EMULATION */ + +#ifdef CONFIG_X86_64 + +static struct user_regset x86_64_regsets[] __read_mostly = { + [REGSET_GENERAL] = { + .core_note_type = NT_PRSTATUS, + .n = sizeof(struct user_regs_struct) / sizeof(long), + .size = sizeof(long), .align = sizeof(long), + .get = genregs_get, .set = genregs_set + }, + [REGSET_FP] = { + .core_note_type = NT_PRFPREG, + .n = sizeof(struct user_i387_struct) / sizeof(long), + .size = sizeof(long), .align = sizeof(long), + .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set + }, + [REGSET_XSTATE] = { + .core_note_type = NT_X86_XSTATE, + .size = sizeof(u64), .align = sizeof(u64), + .active = xstateregs_active, .get = xstateregs_get, + .set = xstateregs_set + }, + [REGSET_IOPERM64] = { + .core_note_type = NT_386_IOPERM, + .n = IO_BITMAP_LONGS, + .size = sizeof(long), .align = sizeof(long), + .active = ioperm_active, .get = ioperm_get + }, +}; + +static const struct user_regset_view user_x86_64_view = { + .name = "x86_64", .e_machine = EM_X86_64, + .regsets = x86_64_regsets, .n = ARRAY_SIZE(x86_64_regsets) +}; + +#else /* CONFIG_X86_32 */ + +#define user_regs_struct32 user_regs_struct +#define genregs32_get genregs_get +#define genregs32_set genregs_set + +#endif /* CONFIG_X86_64 */ + +#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION +static struct user_regset x86_32_regsets[] __read_mostly = { + [REGSET_GENERAL] = { + .core_note_type = NT_PRSTATUS, + .n = sizeof(struct user_regs_struct32) / sizeof(u32), + .size = sizeof(u32), .align = sizeof(u32), + .get = genregs32_get, .set = genregs32_set + }, + [REGSET_FP] = { + .core_note_type = NT_PRFPREG, + .n = sizeof(struct user_i387_ia32_struct) / sizeof(u32), + .size = sizeof(u32), .align = sizeof(u32), + .active = fpregs_active, .get = fpregs_get, .set = fpregs_set + }, + [REGSET_XFP] = { + .core_note_type = NT_PRXFPREG, + .n = sizeof(struct user32_fxsr_struct) / sizeof(u32), + .size = sizeof(u32), .align = sizeof(u32), + .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set + }, + [REGSET_XSTATE] = { + .core_note_type = NT_X86_XSTATE, + .size = sizeof(u64), .align = sizeof(u64), + .active = xstateregs_active, .get = xstateregs_get, + .set = xstateregs_set + }, + [REGSET_TLS] = { + .core_note_type = NT_386_TLS, + .n = GDT_ENTRY_TLS_ENTRIES, .bias = GDT_ENTRY_TLS_MIN, + .size = sizeof(struct user_desc), + .align = sizeof(struct user_desc), + .active = regset_tls_active, + .get = regset_tls_get, .set = regset_tls_set + }, + [REGSET_IOPERM32] = { + .core_note_type = NT_386_IOPERM, + .n = IO_BITMAP_BYTES / sizeof(u32), + .size = sizeof(u32), .align = sizeof(u32), + .active = ioperm_active, .get = ioperm_get + }, +}; + +static const struct user_regset_view user_x86_32_view = { + .name = "i386", .e_machine = EM_386, + .regsets = x86_32_regsets, .n = ARRAY_SIZE(x86_32_regsets) +}; +#endif + +/* + * This represents bytes 464..511 in the memory layout exported through + * the REGSET_XSTATE interface. 
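+ * That range is the software-reserved tail of the FXSAVE area; word
+ * USER_XSTATE_XCR0_WORD of it carries the XCR0 feature mask filled in
+ * by update_regset_xstate_info() below.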
+ */ +u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; + +void update_regset_xstate_info(unsigned int size, u64 xstate_mask) +{ +#ifdef CONFIG_X86_64 + x86_64_regsets[REGSET_XSTATE].n = size / sizeof(u64); +#endif +#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION + x86_32_regsets[REGSET_XSTATE].n = size / sizeof(u64); +#endif + xstate_fx_sw_bytes[USER_XSTATE_XCR0_WORD] = xstate_mask; +} + +const struct user_regset_view *task_user_regset_view(struct task_struct *task) +{ +#ifdef CONFIG_IA32_EMULATION + if (test_tsk_thread_flag(task, TIF_IA32)) +#endif +#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION + return &user_x86_32_view; +#endif +#ifdef CONFIG_X86_64 + return &user_x86_64_view; +#endif +} + +static void fill_sigtrap_info(struct task_struct *tsk, + struct pt_regs *regs, + int error_code, int si_code, + struct siginfo *info) +{ + tsk->thread.trap_nr = X86_TRAP_DB; + tsk->thread.error_code = error_code; + + memset(info, 0, sizeof(*info)); + info->si_signo = SIGTRAP; + info->si_code = si_code; + info->si_addr = user_mode_vm(regs) ? (void __user *)regs->ip : NULL; +} + +void user_single_step_siginfo(struct task_struct *tsk, + struct pt_regs *regs, + struct siginfo *info) +{ + fill_sigtrap_info(tsk, regs, 0, TRAP_BRKPT, info); +} + +void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, + int error_code, int si_code) +{ + struct siginfo info; + + fill_sigtrap_info(tsk, regs, error_code, si_code, &info); + /* Send us the fake SIGTRAP */ + force_sig_info(SIGTRAP, &info, tsk); +} + + +#ifdef CONFIG_X86_32 +# define IS_IA32 1 +#elif defined CONFIG_IA32_EMULATION +# define IS_IA32 is_compat_task() +#else +# define IS_IA32 0 +#endif + +/* + * We must return the syscall number to actually look up in the table. + * This can be -1L to skip running any syscall at all. + */ +long syscall_trace_enter(struct pt_regs *regs) +{ + long ret = 0; + + user_exit(); + + /* + * If we stepped into a sysenter/syscall insn, it trapped in + * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP. + * If user-mode had set TF itself, then it's still clear from + * do_debug() and we need to set it again to restore the user + * state. If we entered on the slow path, TF was already set. + */ + if (test_thread_flag(TIF_SINGLESTEP)) + regs->flags |= X86_EFLAGS_TF; + + /* do the secure computing check first */ + if (secure_computing(regs->orig_ax)) { + /* seccomp failures shouldn't expose any additional code. */ + ret = -1L; + goto out; + } + + if (unlikely(test_thread_flag(TIF_SYSCALL_EMU))) + ret = -1L; + + if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) && + tracehook_report_syscall_entry(regs)) + ret = -1L; + + if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) + trace_sys_enter(regs, regs->orig_ax); + + if (IS_IA32) + audit_syscall_entry(AUDIT_ARCH_I386, + regs->orig_ax, + regs->bx, regs->cx, + regs->dx, regs->si); +#ifdef CONFIG_X86_64 + else + audit_syscall_entry(AUDIT_ARCH_X86_64, + regs->orig_ax, + regs->di, regs->si, + regs->dx, regs->r10); +#endif + +out: + return ret ?: regs->orig_ax; +} + +void syscall_trace_leave(struct pt_regs *regs) +{ + bool step; + + /* + * We may come here right after calling schedule_user() + * or do_notify_resume(), in which case we can be in RCU + * user mode. + */ + user_exit(); + + audit_syscall_exit(regs); + + if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) + trace_sys_exit(regs, regs->ax); + + /* + * If TIF_SYSCALL_EMU is set, we only get here because of + * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP). 
+	 * We already reported this syscall instruction in
+	 * syscall_trace_enter().
+	 */
+	step = unlikely(test_thread_flag(TIF_SINGLESTEP)) &&
+			!test_thread_flag(TIF_SYSCALL_EMU);
+	if (step || test_thread_flag(TIF_SYSCALL_TRACE))
+		tracehook_report_syscall_exit(regs, step);
+
+	user_enter();
+}
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
new file mode 100644
index 00000000000..2f355d229a5
--- /dev/null
+++ b/arch/x86/kernel/pvclock.c
@@ -0,0 +1,166 @@
+/*  paravirtual clock -- common code used by kvm/xen
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+*/
+
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/sched.h>
+#include <linux/gfp.h>
+#include <linux/bootmem.h>
+#include <asm/fixmap.h>
+#include <asm/pvclock.h>
+
+static u8 valid_flags __read_mostly = 0;
+
+void pvclock_set_flags(u8 flags)
+{
+	valid_flags = flags;
+}
+
+unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)
+{
+	u64 pv_tsc_khz = 1000000ULL << 32;
+
+	do_div(pv_tsc_khz, src->tsc_to_system_mul);
+	if (src->tsc_shift < 0)
+		pv_tsc_khz <<= -src->tsc_shift;
+	else
+		pv_tsc_khz >>= src->tsc_shift;
+	return pv_tsc_khz;
+}
+
+void pvclock_touch_watchdogs(void)
+{
+	touch_softlockup_watchdog_sync();
+	clocksource_touch_watchdog();
+	rcu_cpu_stall_reset();
+	reset_hung_task_detector();
+}
+
+static atomic64_t last_value = ATOMIC64_INIT(0);
+
+void pvclock_resume(void)
+{
+	atomic64_set(&last_value, 0);
+}
+
+u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src)
+{
+	unsigned version;
+	cycle_t ret;
+	u8 flags;
+
+	do {
+		version = __pvclock_read_cycles(src, &ret, &flags);
+	} while ((src->version & 1) || version != src->version);
+
+	return flags & valid_flags;
+}
+
+cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
+{
+	unsigned version;
+	cycle_t ret;
+	u64 last;
+	u8 flags;
+
+	do {
+		version = __pvclock_read_cycles(src, &ret, &flags);
+	} while ((src->version & 1) || version != src->version);
+
+	if (unlikely((flags & PVCLOCK_GUEST_STOPPED) != 0)) {
+		src->flags &= ~PVCLOCK_GUEST_STOPPED;
+		pvclock_touch_watchdogs();
+	}
+
+	if ((valid_flags & PVCLOCK_TSC_STABLE_BIT) &&
+		(flags & PVCLOCK_TSC_STABLE_BIT))
+		return ret;
+
+	/*
+	 * Assumption here is that last_value, a global accumulator, always goes
+	 * forward. If we are less than that, we should not be much smaller.
+	 * We assume there is an error margin we're inside, and then the correction
+	 * does not sacrifice accuracy.
+	 *
+	 * For reads: global may have changed between test and return,
+	 * but this means someone else updated the clock at a later time.
+	 * We just need to make sure we are not seeing a backwards event.
+	 *
+	 * For updates: last_value = ret is not enough, since two vcpus could be
+	 * updating at the same time, and one of them could be slightly behind,
+	 * making the assumption that last_value always goes forward fail to hold.
+	 */
+	last = atomic64_read(&last_value);
+	do {
+		if (ret < last)
+			return last;
+		last = atomic64_cmpxchg(&last_value, last, ret);
+	} while (unlikely(last != ret));
+
+	return ret;
+}
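+
+/*
+ * The cmpxchg loop above is the usual lock-free "advance a global
+ * maximum" pattern. For example, if last_value is 100 and this vcpu
+ * computed ret == 98, the function returns 100, so no caller ever
+ * sees time move backwards; if it computed ret == 105 and a racing
+ * vcpu published 103 first, the cmpxchg fails, 'last' is re-read and
+ * 105 still gets published on the retry.
+ */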
+
+void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
+			    struct pvclock_vcpu_time_info *vcpu_time,
+			    struct timespec *ts)
+{
+	u32 version;
+	u64 delta;
+	struct timespec now;
+
+	/* get wallclock at system boot */
+	do {
+		version = wall_clock->version;
+		rmb();		/* fetch version before time */
+		now.tv_sec  = wall_clock->sec;
+		now.tv_nsec = wall_clock->nsec;
+		rmb();		/* fetch time before checking version */
+	} while ((wall_clock->version & 1) || (version != wall_clock->version));
+
+	delta = pvclock_clocksource_read(vcpu_time);	/* time since system boot */
+	delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;
+
+	now.tv_nsec = do_div(delta, NSEC_PER_SEC);
+	now.tv_sec = delta;
+
+	set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
+}
+
+#ifdef CONFIG_X86_64
+/*
+ * Initialize the generic pvclock vsyscall state. This allocates one
+ * or more pages for the per-vcpu pvclock information and sets up a
+ * fixmap mapping for them.
+ */
+
+int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i,
+				 int size)
+{
+	int idx;
+
+	WARN_ON(size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE);
+
+	for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) {
+		__set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx,
+			     __pa(i) + (idx*PAGE_SIZE),
+			     PAGE_KERNEL_VVAR);
+	}
+
+	return 0;
+}
+#endif
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
new file mode 100644
index 00000000000..ff898bbf579
--- /dev/null
+++ b/arch/x86/kernel/quirks.c
@@ -0,0 +1,610 @@
+/*
+ * This file contains work-arounds for x86 and x86_64 platform bugs.
+ */
+#include <linux/pci.h>
+#include <linux/irq.h>
+
+#include <asm/hpet.h>
+
+#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI)
+
+static void quirk_intel_irqbalance(struct pci_dev *dev)
+{
+	u8 config;
+	u16 word;
+
+	/* BIOS may enable hardware IRQ balancing for
+	 * E7520/E7320/E7525(revision ID 0x9 and below)
+	 * based platforms.
+	 * Disable SW irqbalance/affinity on those platforms.
+	 */
+	if (dev->revision > 0x9)
+		return;
+
+	/* enable access to config space */
+	pci_read_config_byte(dev, 0xf4, &config);
+	pci_write_config_byte(dev, 0xf4, config|0x2);
+
+	/*
+	 * read xTPR register.  We may not have a pci_dev for device 8
+	 * because it might be hidden until the above write.
+ */ + pci_bus_read_config_word(dev->bus, PCI_DEVFN(8, 0), 0x4c, &word); + + if (!(word & (1 << 13))) { + dev_info(&dev->dev, "Intel E7520/7320/7525 detected; " + "disabling irq balancing and affinity\n"); + noirqdebug_setup(""); +#ifdef CONFIG_PROC_FS + no_irq_affinity = 1; +#endif + } + + /* put back the original value for config space*/ + if (!(config & 0x2)) + pci_write_config_byte(dev, 0xf4, config); +} +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, + quirk_intel_irqbalance); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, + quirk_intel_irqbalance); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, + quirk_intel_irqbalance); +#endif + +#if defined(CONFIG_HPET_TIMER) +unsigned long force_hpet_address; + +static enum { + NONE_FORCE_HPET_RESUME, + OLD_ICH_FORCE_HPET_RESUME, + ICH_FORCE_HPET_RESUME, + VT8237_FORCE_HPET_RESUME, + NVIDIA_FORCE_HPET_RESUME, + ATI_FORCE_HPET_RESUME, +} force_hpet_resume_type; + +static void __iomem *rcba_base; + +static void ich_force_hpet_resume(void) +{ + u32 val; + + if (!force_hpet_address) + return; + + BUG_ON(rcba_base == NULL); + + /* read the Function Disable register, dword mode only */ + val = readl(rcba_base + 0x3404); + if (!(val & 0x80)) { + /* HPET disabled in HPTC. Trying to enable */ + writel(val | 0x80, rcba_base + 0x3404); + } + + val = readl(rcba_base + 0x3404); + if (!(val & 0x80)) + BUG(); + else + printk(KERN_DEBUG "Force enabled HPET at resume\n"); + + return; +} + +static void ich_force_enable_hpet(struct pci_dev *dev) +{ + u32 val; + u32 uninitialized_var(rcba); + int err = 0; + + if (hpet_address || force_hpet_address) + return; + + pci_read_config_dword(dev, 0xF0, &rcba); + rcba &= 0xFFFFC000; + if (rcba == 0) { + dev_printk(KERN_DEBUG, &dev->dev, "RCBA disabled; " + "cannot force enable HPET\n"); + return; + } + + /* use bits 31:14, 16 kB aligned */ + rcba_base = ioremap_nocache(rcba, 0x4000); + if (rcba_base == NULL) { + dev_printk(KERN_DEBUG, &dev->dev, "ioremap failed; " + "cannot force enable HPET\n"); + return; + } + + /* read the Function Disable register, dword mode only */ + val = readl(rcba_base + 0x3404); + + if (val & 0x80) { + /* HPET is enabled in HPTC. Just not reported by BIOS */ + val = val & 0x3; + force_hpet_address = 0xFED00000 | (val << 12); + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at " + "0x%lx\n", force_hpet_address); + iounmap(rcba_base); + return; + } + + /* HPET disabled in HPTC. 
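+	 * (Editor's note, an assumption drawn from chipset documentation:
+	 * HPTC is the dword at RCBA+0x3404, where bit 7 gates the HPET
+	 * decode and bits 1:0 select one of four 4 KiB-aligned bases, which
+	 * is why the enabled path above computes
+	 * 0xFED00000 | ((val & 0x3) << 12).)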
Trying to enable */ + writel(val | 0x80, rcba_base + 0x3404); + + val = readl(rcba_base + 0x3404); + if (!(val & 0x80)) { + err = 1; + } else { + val = val & 0x3; + force_hpet_address = 0xFED00000 | (val << 12); + } + + if (err) { + force_hpet_address = 0; + iounmap(rcba_base); + dev_printk(KERN_DEBUG, &dev->dev, + "Failed to force enable HPET\n"); + } else { + force_hpet_resume_type = ICH_FORCE_HPET_RESUME; + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at " + "0x%lx\n", force_hpet_address); + } +} + +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB2_0, + ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH6_0, + ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH6_1, + ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_0, + ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_1, + ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_31, + ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_1, + ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_4, + ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7, + ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x3a16, /* ICH10 */ + ich_force_enable_hpet); + +static struct pci_dev *cached_dev; + +static void hpet_print_force_info(void) +{ + printk(KERN_INFO "HPET not enabled in BIOS. " + "You might try hpet=force boot option\n"); +} + +static void old_ich_force_hpet_resume(void) +{ + u32 val; + u32 uninitialized_var(gen_cntl); + + if (!force_hpet_address || !cached_dev) + return; + + pci_read_config_dword(cached_dev, 0xD0, &gen_cntl); + gen_cntl &= (~(0x7 << 15)); + gen_cntl |= (0x4 << 15); + + pci_write_config_dword(cached_dev, 0xD0, gen_cntl); + pci_read_config_dword(cached_dev, 0xD0, &gen_cntl); + val = gen_cntl >> 15; + val &= 0x7; + if (val == 0x4) + printk(KERN_DEBUG "Force enabled HPET at resume\n"); + else + BUG(); +} + +static void old_ich_force_enable_hpet(struct pci_dev *dev) +{ + u32 val; + u32 uninitialized_var(gen_cntl); + + if (hpet_address || force_hpet_address) + return; + + pci_read_config_dword(dev, 0xD0, &gen_cntl); + /* + * Bit 17 is HPET enable bit. + * Bit 16:15 control the HPET base address. + */ + val = gen_cntl >> 15; + val &= 0x7; + if (val & 0x4) { + val &= 0x3; + force_hpet_address = 0xFED00000 | (val << 12); + dev_printk(KERN_DEBUG, &dev->dev, "HPET at 0x%lx\n", + force_hpet_address); + return; + } + + /* + * HPET is disabled. Trying enabling at FED00000 and check + * whether it sticks + */ + gen_cntl &= (~(0x7 << 15)); + gen_cntl |= (0x4 << 15); + pci_write_config_dword(dev, 0xD0, gen_cntl); + + pci_read_config_dword(dev, 0xD0, &gen_cntl); + + val = gen_cntl >> 15; + val &= 0x7; + if (val & 0x4) { + /* HPET is enabled in HPTC. Just not reported by BIOS */ + val &= 0x3; + force_hpet_address = 0xFED00000 | (val << 12); + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at " + "0x%lx\n", force_hpet_address); + cached_dev = dev; + force_hpet_resume_type = OLD_ICH_FORCE_HPET_RESUME; + return; + } + + dev_printk(KERN_DEBUG, &dev->dev, "Failed to force enable HPET\n"); +} + +/* + * Undocumented chipset features. Make sure that the user enforced + * this. 
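+ * (Editor's note: "enforced" refers to booting with hpet=force, which
+ * sets hpet_force_user; the _user wrappers below apply the quirk only
+ * when that flag is set, while the plain 82801EB entries skip the gate.)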
+ */ +static void old_ich_force_enable_hpet_user(struct pci_dev *dev) +{ + if (hpet_force_user) + old_ich_force_enable_hpet(dev); +} + +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB_1, + old_ich_force_enable_hpet_user); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_0, + old_ich_force_enable_hpet_user); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_12, + old_ich_force_enable_hpet_user); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801DB_0, + old_ich_force_enable_hpet_user); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801DB_12, + old_ich_force_enable_hpet_user); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801EB_0, + old_ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801EB_12, + old_ich_force_enable_hpet); + + +static void vt8237_force_hpet_resume(void) +{ + u32 val; + + if (!force_hpet_address || !cached_dev) + return; + + val = 0xfed00000 | 0x80; + pci_write_config_dword(cached_dev, 0x68, val); + + pci_read_config_dword(cached_dev, 0x68, &val); + if (val & 0x80) + printk(KERN_DEBUG "Force enabled HPET at resume\n"); + else + BUG(); +} + +static void vt8237_force_enable_hpet(struct pci_dev *dev) +{ + u32 uninitialized_var(val); + + if (hpet_address || force_hpet_address) + return; + + if (!hpet_force_user) { + hpet_print_force_info(); + return; + } + + pci_read_config_dword(dev, 0x68, &val); + /* + * Bit 7 is HPET enable bit. + * Bit 31:10 is HPET base address (contrary to what datasheet claims) + */ + if (val & 0x80) { + force_hpet_address = (val & ~0x3ff); + dev_printk(KERN_DEBUG, &dev->dev, "HPET at 0x%lx\n", + force_hpet_address); + return; + } + + /* + * HPET is disabled. 
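+	 * (Editor's note: on the VT8237 the enable bit and the base share a
+	 * single dword at config offset 0x68, so the one write below
+	 * programs both at once: 0xfed00000 | 0x80 is base FED00000h plus
+	 * bit 7.)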
Trying enabling at FED00000 and check + * whether it sticks + */ + val = 0xfed00000 | 0x80; + pci_write_config_dword(dev, 0x68, val); + + pci_read_config_dword(dev, 0x68, &val); + if (val & 0x80) { + force_hpet_address = (val & ~0x3ff); + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at " + "0x%lx\n", force_hpet_address); + cached_dev = dev; + force_hpet_resume_type = VT8237_FORCE_HPET_RESUME; + return; + } + + dev_printk(KERN_DEBUG, &dev->dev, "Failed to force enable HPET\n"); +} + +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235, + vt8237_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237, + vt8237_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_CX700, + vt8237_force_enable_hpet); + +static void ati_force_hpet_resume(void) +{ + pci_write_config_dword(cached_dev, 0x14, 0xfed00000); + printk(KERN_DEBUG "Force enabled HPET at resume\n"); +} + +static u32 ati_ixp4x0_rev(struct pci_dev *dev) +{ + int err = 0; + u32 d = 0; + u8 b = 0; + + err = pci_read_config_byte(dev, 0xac, &b); + b &= ~(1<<5); + err |= pci_write_config_byte(dev, 0xac, b); + err |= pci_read_config_dword(dev, 0x70, &d); + d |= 1<<8; + err |= pci_write_config_dword(dev, 0x70, d); + err |= pci_read_config_dword(dev, 0x8, &d); + d &= 0xff; + dev_printk(KERN_DEBUG, &dev->dev, "SB4X0 revision 0x%x\n", d); + + WARN_ON_ONCE(err); + + return d; +} + +static void ati_force_enable_hpet(struct pci_dev *dev) +{ + u32 d, val; + u8 b; + + if (hpet_address || force_hpet_address) + return; + + if (!hpet_force_user) { + hpet_print_force_info(); + return; + } + + d = ati_ixp4x0_rev(dev); + if (d < 0x82) + return; + + /* base address */ + pci_write_config_dword(dev, 0x14, 0xfed00000); + pci_read_config_dword(dev, 0x14, &val); + + /* enable interrupt */ + outb(0x72, 0xcd6); b = inb(0xcd7); + b |= 0x1; + outb(0x72, 0xcd6); outb(b, 0xcd7); + outb(0x72, 0xcd6); b = inb(0xcd7); + if (!(b & 0x1)) + return; + pci_read_config_dword(dev, 0x64, &d); + d |= (1<<10); + pci_write_config_dword(dev, 0x64, d); + pci_read_config_dword(dev, 0x64, &d); + if (!(d & (1<<10))) + return; + + force_hpet_address = val; + force_hpet_resume_type = ATI_FORCE_HPET_RESUME; + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n", + force_hpet_address); + cached_dev = dev; +} +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS, + ati_force_enable_hpet); + +/* + * Undocumented chipset feature taken from LinuxBIOS. 
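+ * (Editor's guess, flagged as such since the feature is undocumented:
+ * bit 0 of config dword 0x44 appears to act as the enable bit, which is
+ * why the code below writes 0xfed00001 and recovers the base with
+ * val & 0xfffffffe.)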
+ */ +static void nvidia_force_hpet_resume(void) +{ + pci_write_config_dword(cached_dev, 0x44, 0xfed00001); + printk(KERN_DEBUG "Force enabled HPET at resume\n"); +} + +static void nvidia_force_enable_hpet(struct pci_dev *dev) +{ + u32 uninitialized_var(val); + + if (hpet_address || force_hpet_address) + return; + + if (!hpet_force_user) { + hpet_print_force_info(); + return; + } + + pci_write_config_dword(dev, 0x44, 0xfed00001); + pci_read_config_dword(dev, 0x44, &val); + force_hpet_address = val & 0xfffffffe; + force_hpet_resume_type = NVIDIA_FORCE_HPET_RESUME; + dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n", + force_hpet_address); + cached_dev = dev; + return; +} + +/* ISA Bridges */ +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0050, + nvidia_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0051, + nvidia_force_enable_hpet); + +/* LPC bridges */ +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0260, + nvidia_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0360, + nvidia_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0361, + nvidia_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0362, + nvidia_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0363, + nvidia_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0364, + nvidia_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0365, + nvidia_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0366, + nvidia_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NVIDIA, 0x0367, + nvidia_force_enable_hpet); + +void force_hpet_resume(void) +{ + switch (force_hpet_resume_type) { + case ICH_FORCE_HPET_RESUME: + ich_force_hpet_resume(); + return; + case OLD_ICH_FORCE_HPET_RESUME: + old_ich_force_hpet_resume(); + return; + case VT8237_FORCE_HPET_RESUME: + vt8237_force_hpet_resume(); + return; + case NVIDIA_FORCE_HPET_RESUME: + nvidia_force_hpet_resume(); + return; + case ATI_FORCE_HPET_RESUME: + ati_force_hpet_resume(); + return; + default: + break; + } +} + +/* + * HPET MSI on some boards (ATI SB700/SB800) has side effect on + * floppy DMA. Disable HPET MSI on such platforms. + * See erratum #27 (Misinterpreted MSI Requests May Result in + * Corrupted LPC DMA Data) in AMD Publication #46837, + * "SB700 Family Product Errata", Rev. 1.0, March 2010. 
+ */ +static void force_disable_hpet_msi(struct pci_dev *unused) +{ + hpet_msi_disable = 1; +} + +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS, + force_disable_hpet_msi); + +#endif + +#if defined(CONFIG_PCI) && defined(CONFIG_NUMA) +/* Set correct numa_node information for AMD NB functions */ +static void quirk_amd_nb_node(struct pci_dev *dev) +{ + struct pci_dev *nb_ht; + unsigned int devfn; + u32 node; + u32 val; + + devfn = PCI_DEVFN(PCI_SLOT(dev->devfn), 0); + nb_ht = pci_get_slot(dev->bus, devfn); + if (!nb_ht) + return; + + pci_read_config_dword(nb_ht, 0x60, &val); + node = pcibus_to_node(dev->bus) | (val & 7); + /* + * Some hardware may return an invalid node ID, + * so check it first: + */ + if (node_online(node)) + set_dev_node(&dev->dev, node); + pci_dev_put(nb_ht); +} + +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_ADDRMAP, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MEMCTL, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_HT, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MAP, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_DRAM, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_LINK, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F0, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F1, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F2, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F5, + quirk_amd_nb_node); + +#endif + +#ifdef CONFIG_PCI +/* + * Processor does not ensure DRAM scrub read/write sequence + * is atomic wrt accesses to CC6 save state area. Therefore + * if a concurrent scrub read/write access is to same address + * the entry may appear as if it is not written. This quirk + * applies to Fam16h models 00h-0Fh + * + * See "Revision Guide" for AMD F16h models 00h-0fh, + * document 51810 rev. 
3.04, Nov 2013 + */ +static void amd_disable_seq_and_redirect_scrub(struct pci_dev *dev) +{ + u32 val; + + /* + * Suggested workaround: + * set D18F3x58[4:0] = 00h and set D18F3x5C[0] = 0b + */ + pci_read_config_dword(dev, 0x58, &val); + if (val & 0x1F) { + val &= ~(0x1F); + pci_write_config_dword(dev, 0x58, val); + } + + pci_read_config_dword(dev, 0x5C, &val); + if (val & BIT(0)) { + val &= ~BIT(0); + pci_write_config_dword(dev, 0x5c, val); + } +} + +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3, + amd_disable_seq_and_redirect_scrub); + +#endif diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c new file mode 100644 index 00000000000..52b1157c53e --- /dev/null +++ b/arch/x86/kernel/reboot.c @@ -0,0 +1,775 @@ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/reboot.h> +#include <linux/init.h> +#include <linux/pm.h> +#include <linux/efi.h> +#include <linux/dmi.h> +#include <linux/sched.h> +#include <linux/tboot.h> +#include <linux/delay.h> +#include <acpi/reboot.h> +#include <asm/io.h> +#include <asm/apic.h> +#include <asm/desc.h> +#include <asm/hpet.h> +#include <asm/pgtable.h> +#include <asm/proto.h> +#include <asm/reboot_fixups.h> +#include <asm/reboot.h> +#include <asm/pci_x86.h> +#include <asm/virtext.h> +#include <asm/cpu.h> +#include <asm/nmi.h> +#include <asm/smp.h> + +#include <linux/ctype.h> +#include <linux/mc146818rtc.h> +#include <asm/realmode.h> +#include <asm/x86_init.h> + +/* + * Power off function, if any + */ +void (*pm_power_off)(void); +EXPORT_SYMBOL(pm_power_off); + +static const struct desc_ptr no_idt = {}; + +/* + * This is set if we need to go through the 'emergency' path. + * When machine_emergency_restart() is called, we may be on + * an inconsistent state and won't be able to do a clean cleanup + */ +static int reboot_emergency; + +/* This is set by the PCI code if either type 1 or type 2 PCI is detected */ +bool port_cf9_safe = false; + +/* + * Reboot options and system auto-detection code provided by + * Dell Inc. so their systems "just work". :-) + */ + +/* + * Some machines require the "reboot=b" or "reboot=k" commandline options, + * this quirk makes that automatic. + */ +static int __init set_bios_reboot(const struct dmi_system_id *d) +{ + if (reboot_type != BOOT_BIOS) { + reboot_type = BOOT_BIOS; + pr_info("%s series board detected. Selecting %s-method for reboots.\n", + d->ident, "BIOS"); + } + return 0; +} + +void __noreturn machine_real_restart(unsigned int type) +{ + local_irq_disable(); + + /* + * Write zero to CMOS register number 0x0f, which the BIOS POST + * routine will recognize as telling it to do a proper reboot. (Well + * that's what this book in front of me says -- it may only apply to + * the Phoenix BIOS though, it's not clear). At the same time, + * disable NMIs by setting the top bit in the CMOS address register, + * as we're about to do peculiar things to the CPU. I'm not sure if + * `outb_p' is needed instead of just `outb'. Use it to be on the + * safe side. (Yes, CMOS_WRITE does outb_p's. - Paul G.) + */ + spin_lock(&rtc_lock); + CMOS_WRITE(0x00, 0x8f); + spin_unlock(&rtc_lock); + + /* + * Switch back to the initial page table. 
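+	 * (Editor's note: both tables loaded below keep low memory identity
+	 * mapped; the jump that follows lands in the real-mode trampoline
+	 * at its physical address, so the 1:1 mapping is required.)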
+ */ +#ifdef CONFIG_X86_32 + load_cr3(initial_page_table); +#else + write_cr3(real_mode_header->trampoline_pgd); +#endif + + /* Jump to the identity-mapped low memory code */ +#ifdef CONFIG_X86_32 + asm volatile("jmpl *%0" : : + "rm" (real_mode_header->machine_real_restart_asm), + "a" (type)); +#else + asm volatile("ljmpl *%0" : : + "m" (real_mode_header->machine_real_restart_asm), + "D" (type)); +#endif + unreachable(); +} +#ifdef CONFIG_APM_MODULE +EXPORT_SYMBOL(machine_real_restart); +#endif + +/* + * Some Apple MacBook and MacBookPro's needs reboot=p to be able to reboot + */ +static int __init set_pci_reboot(const struct dmi_system_id *d) +{ + if (reboot_type != BOOT_CF9_FORCE) { + reboot_type = BOOT_CF9_FORCE; + pr_info("%s series board detected. Selecting %s-method for reboots.\n", + d->ident, "PCI"); + } + return 0; +} + +static int __init set_kbd_reboot(const struct dmi_system_id *d) +{ + if (reboot_type != BOOT_KBD) { + reboot_type = BOOT_KBD; + pr_info("%s series board detected. Selecting %s-method for reboot.\n", + d->ident, "KBD"); + } + return 0; +} + +/* + * This is a single dmi_table handling all reboot quirks. + */ +static struct dmi_system_id __initdata reboot_dmi_table[] = { + + /* Acer */ + { /* Handle reboot issue on Acer Aspire one */ + .callback = set_kbd_reboot, + .ident = "Acer Aspire One A110", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Acer"), + DMI_MATCH(DMI_PRODUCT_NAME, "AOA110"), + }, + }, + + /* Apple */ + { /* Handle problems with rebooting on Apple MacBook5 */ + .callback = set_pci_reboot, + .ident = "Apple MacBook5", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "MacBook5"), + }, + }, + { /* Handle problems with rebooting on Apple MacBookPro5 */ + .callback = set_pci_reboot, + .ident = "Apple MacBookPro5", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5"), + }, + }, + { /* Handle problems with rebooting on Apple Macmini3,1 */ + .callback = set_pci_reboot, + .ident = "Apple Macmini3,1", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Macmini3,1"), + }, + }, + { /* Handle problems with rebooting on the iMac9,1. */ + .callback = set_pci_reboot, + .ident = "Apple iMac9,1", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"), + }, + }, + + /* ASUS */ + { /* Handle problems with rebooting on ASUS P4S800 */ + .callback = set_bios_reboot, + .ident = "ASUS P4S800", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), + DMI_MATCH(DMI_BOARD_NAME, "P4S800"), + }, + }, + + /* Certec */ + { /* Handle problems with rebooting on Certec BPC600 */ + .callback = set_pci_reboot, + .ident = "Certec BPC600", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Certec"), + DMI_MATCH(DMI_PRODUCT_NAME, "BPC600"), + }, + }, + + /* Dell */ + { /* Handle problems with rebooting on Dell DXP061 */ + .callback = set_bios_reboot, + .ident = "Dell DXP061", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Dell DXP061"), + }, + }, + { /* Handle problems with rebooting on Dell E520's */ + .callback = set_bios_reboot, + .ident = "Dell E520", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Dell DM061"), + }, + }, + { /* Handle problems with rebooting on the Latitude E5410. 
*/ + .callback = set_pci_reboot, + .ident = "Dell Latitude E5410", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E5410"), + }, + }, + { /* Handle problems with rebooting on the Latitude E5420. */ + .callback = set_pci_reboot, + .ident = "Dell Latitude E5420", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E5420"), + }, + }, + { /* Handle problems with rebooting on the Latitude E6320. */ + .callback = set_pci_reboot, + .ident = "Dell Latitude E6320", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6320"), + }, + }, + { /* Handle problems with rebooting on the Latitude E6420. */ + .callback = set_pci_reboot, + .ident = "Dell Latitude E6420", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6420"), + }, + }, + { /* Handle problems with rebooting on Dell Optiplex 330 with 0KP561 */ + .callback = set_bios_reboot, + .ident = "Dell OptiPlex 330", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 330"), + DMI_MATCH(DMI_BOARD_NAME, "0KP561"), + }, + }, + { /* Handle problems with rebooting on Dell Optiplex 360 with 0T656F */ + .callback = set_bios_reboot, + .ident = "Dell OptiPlex 360", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 360"), + DMI_MATCH(DMI_BOARD_NAME, "0T656F"), + }, + }, + { /* Handle problems with rebooting on Dell Optiplex 745's SFF */ + .callback = set_bios_reboot, + .ident = "Dell OptiPlex 745", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"), + }, + }, + { /* Handle problems with rebooting on Dell Optiplex 745's DFF */ + .callback = set_bios_reboot, + .ident = "Dell OptiPlex 745", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"), + DMI_MATCH(DMI_BOARD_NAME, "0MM599"), + }, + }, + { /* Handle problems with rebooting on Dell Optiplex 745 with 0KW626 */ + .callback = set_bios_reboot, + .ident = "Dell OptiPlex 745", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"), + DMI_MATCH(DMI_BOARD_NAME, "0KW626"), + }, + }, + { /* Handle problems with rebooting on Dell OptiPlex 760 with 0G919G */ + .callback = set_bios_reboot, + .ident = "Dell OptiPlex 760", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 760"), + DMI_MATCH(DMI_BOARD_NAME, "0G919G"), + }, + }, + { /* Handle problems with rebooting on the OptiPlex 990. 
*/ + .callback = set_pci_reboot, + .ident = "Dell OptiPlex 990", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 990"), + }, + }, + { /* Handle problems with rebooting on Dell 300's */ + .callback = set_bios_reboot, + .ident = "Dell PowerEdge 300", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), + DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 300/"), + }, + }, + { /* Handle problems with rebooting on Dell 1300's */ + .callback = set_bios_reboot, + .ident = "Dell PowerEdge 1300", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), + DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 1300/"), + }, + }, + { /* Handle problems with rebooting on Dell 2400's */ + .callback = set_bios_reboot, + .ident = "Dell PowerEdge 2400", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), + DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2400"), + }, + }, + { /* Handle problems with rebooting on the Dell PowerEdge C6100. */ + .callback = set_pci_reboot, + .ident = "Dell PowerEdge C6100", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell"), + DMI_MATCH(DMI_PRODUCT_NAME, "C6100"), + }, + }, + { /* Handle problems with rebooting on the Precision M6600. */ + .callback = set_pci_reboot, + .ident = "Dell Precision M6600", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Precision M6600"), + }, + }, + { /* Handle problems with rebooting on Dell T5400's */ + .callback = set_bios_reboot, + .ident = "Dell Precision T5400", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation T5400"), + }, + }, + { /* Handle problems with rebooting on Dell T7400's */ + .callback = set_bios_reboot, + .ident = "Dell Precision T7400", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation T7400"), + }, + }, + { /* Handle problems with rebooting on Dell XPS710 */ + .callback = set_bios_reboot, + .ident = "Dell XPS710", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Dell XPS710"), + }, + }, + + /* Hewlett-Packard */ + { /* Handle problems with rebooting on HP laptops */ + .callback = set_bios_reboot, + .ident = "HP Compaq Laptop", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), + DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq"), + }, + }, + + /* Sony */ + { /* Handle problems with rebooting on Sony VGN-Z540N */ + .callback = set_bios_reboot, + .ident = "Sony VGN-Z540N", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Sony Corporation"), + DMI_MATCH(DMI_PRODUCT_NAME, "VGN-Z540N"), + }, + }, + + { } +}; + +static int __init reboot_init(void) +{ + /* + * Only do the DMI check if reboot_type hasn't been overridden + * on the command line + */ + if (reboot_default) + dmi_check_system(reboot_dmi_table); + return 0; +} +core_initcall(reboot_init); + +static inline void kb_wait(void) +{ + int i; + + for (i = 0; i < 0x10000; i++) { + if ((inb(0x64) & 0x02) == 0) + break; + udelay(2); + } +} + +static void vmxoff_nmi(int cpu, struct pt_regs *regs) +{ + cpu_emergency_vmxoff(); +} + +/* Use NMIs as IPIs to tell all CPUs to disable virtualization */ +static void emergency_vmx_disable_all(void) +{ + /* Just make sure we won't change CPUs while doing this */ + local_irq_disable(); + + /* + * We need to disable VMX on all CPUs before rebooting, otherwise + * we risk hanging up the machine, because the CPU ignore INIT + * signals when VMX is enabled. 
+	 *
+	 * We can't take any locks and we may be in an inconsistent
+	 * state, so we use NMIs as IPIs to tell the other CPUs to disable
+	 * VMX and halt.
+	 *
+	 * For safety, we will avoid running the nmi_shootdown_cpus()
+	 * stuff unnecessarily, but we don't have a way to check
+	 * if other CPUs have VMX enabled. So we will call it only if the
+	 * CPU we are running on has VMX enabled.
+	 *
+	 * We will miss cases where VMX is not enabled on all CPUs. This
+	 * shouldn't do much harm because KVM always enables VMX on all
+	 * CPUs anyway. But we can miss it in the small window where KVM
+	 * is still enabling VMX.
+	 */
+	if (cpu_has_vmx() && cpu_vmx_enabled()) {
+		/* Disable VMX on this CPU. */
+		cpu_vmxoff();
+
+		/* Halt and disable VMX on the other CPUs */
+		nmi_shootdown_cpus(vmxoff_nmi);
+
+	}
+}
+
+
+void __attribute__((weak)) mach_reboot_fixups(void)
+{
+}
+
+/*
+ * To the best of our knowledge Windows compatible x86 hardware expects
+ * the following on reboot:
+ *
+ * 1) If the FADT has the ACPI reboot register flag set, try it
+ * 2) If still alive, write to the keyboard controller
+ * 3) If still alive, write to the ACPI reboot register again
+ * 4) If still alive, write to the keyboard controller again
+ * 5) If still alive, call the EFI runtime service to reboot
+ * 6) If no EFI runtime service, call the BIOS to do a reboot
+ *
+ * We default to following the same pattern. We also have
+ * two other reboot methods: 'triple fault' and 'PCI', which
+ * can be triggered via the reboot= kernel boot option or
+ * via quirks.
+ *
+ * This means that this function can never return: it can only
+ * misbehave, by failing to reboot properly or by hanging.
+ */
+static void native_machine_emergency_restart(void)
+{
+	int i;
+	int attempt = 0;
+	int orig_reboot_type = reboot_type;
+	unsigned short mode;
+
+	if (reboot_emergency)
+		emergency_vmx_disable_all();
+
+	tboot_shutdown(TB_SHUTDOWN_REBOOT);
+
+	/* Tell the BIOS if we want cold or warm reboot */
+	mode = reboot_mode == REBOOT_WARM ? 0x1234 : 0;
+	*((unsigned short *)__va(0x472)) = mode;
+
+	for (;;) {
+		/* Could also try the reset bit in the Hammer NB */
+		switch (reboot_type) {
+		case BOOT_ACPI:
+			acpi_reboot();
+			reboot_type = BOOT_KBD;
+			break;
+
+		case BOOT_KBD:
+			mach_reboot_fixups(); /* For board specific fixups */
+
+			for (i = 0; i < 10; i++) {
+				kb_wait();
+				udelay(50);
+				outb(0xfe, 0x64); /* Pulse reset low */
+				udelay(50);
+			}
+			if (attempt == 0 && orig_reboot_type == BOOT_ACPI) {
+				attempt = 1;
+				reboot_type = BOOT_ACPI;
+			} else {
+				reboot_type = BOOT_EFI;
+			}
+			break;
+
+		case BOOT_EFI:
+			if (efi_enabled(EFI_RUNTIME_SERVICES))
+				efi.reset_system(reboot_mode == REBOOT_WARM ?
+						 EFI_RESET_WARM :
+						 EFI_RESET_COLD,
+						 EFI_SUCCESS, 0, NULL);
+			reboot_type = BOOT_BIOS;
+			break;
+
+		case BOOT_BIOS:
+			machine_real_restart(MRR_BIOS);
+
+			/* We're probably dead after this, but... */
+			reboot_type = BOOT_CF9_SAFE;
+			break;
+
+		case BOOT_CF9_FORCE:
+			port_cf9_safe = true;
+			/* Fall through */
+
+		case BOOT_CF9_SAFE:
+			if (port_cf9_safe) {
+				u8 reboot_code = reboot_mode == REBOOT_WARM ?
+						 0x06 : 0x0E;
+				u8 cf9 = inb(0xcf9) & ~reboot_code;
+				outb(cf9|2, 0xcf9); /* Request hard reset */
+				udelay(50);
+				/* Actually do the reset */
+				outb(cf9|reboot_code, 0xcf9);
+				udelay(50);
+			}
+			reboot_type = BOOT_TRIPLE;
+			break;
+
+		case BOOT_TRIPLE:
+			load_idt(&no_idt);
+			__asm__ __volatile__("int3");
+
+			/* We're probably dead after this, but...
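+			 * (Editor's note: the CF9 case above pokes the
+			 * chipset Reset Control Register at I/O port 0xcf9
+			 * in two steps, arming the request first and then
+			 * writing the 0x06/0x0E code, where the extra bit 3
+			 * in 0x0E asks for a full, cold reset. The triple
+			 * fault here loads a zero-limit IDT and executes
+			 * int3: neither the breakpoint nor the resulting
+			 * faults can be delivered, so the CPU triple-faults
+			 * and resets.)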
*/ + reboot_type = BOOT_KBD; + break; + } + } +} + +void native_machine_shutdown(void) +{ + /* Stop the cpus and apics */ +#ifdef CONFIG_X86_IO_APIC + /* + * Disabling IO APIC before local APIC is a workaround for + * erratum AVR31 in "Intel Atom Processor C2000 Product Family + * Specification Update". In this situation, interrupts that target + * a Logical Processor whose Local APIC is either in the process of + * being hardware disabled or software disabled are neither delivered + * nor discarded. When this erratum occurs, the processor may hang. + * + * Even without the erratum, it still makes sense to quiet IO APIC + * before disabling Local APIC. + */ + disable_IO_APIC(); +#endif + +#ifdef CONFIG_SMP + /* + * Stop all of the others. Also disable the local irq to + * not receive the per-cpu timer interrupt which may trigger + * scheduler's load balance. + */ + local_irq_disable(); + stop_other_cpus(); +#endif + + lapic_shutdown(); + +#ifdef CONFIG_HPET_TIMER + hpet_disable(); +#endif + +#ifdef CONFIG_X86_64 + x86_platform.iommu_shutdown(); +#endif +} + +static void __machine_emergency_restart(int emergency) +{ + reboot_emergency = emergency; + machine_ops.emergency_restart(); +} + +static void native_machine_restart(char *__unused) +{ + pr_notice("machine restart\n"); + + if (!reboot_force) + machine_shutdown(); + __machine_emergency_restart(0); +} + +static void native_machine_halt(void) +{ + /* Stop other cpus and apics */ + machine_shutdown(); + + tboot_shutdown(TB_SHUTDOWN_HALT); + + stop_this_cpu(NULL); +} + +static void native_machine_power_off(void) +{ + if (pm_power_off) { + if (!reboot_force) + machine_shutdown(); + pm_power_off(); + } + /* A fallback in case there is no PM info available */ + tboot_shutdown(TB_SHUTDOWN_HALT); +} + +struct machine_ops machine_ops = { + .power_off = native_machine_power_off, + .shutdown = native_machine_shutdown, + .emergency_restart = native_machine_emergency_restart, + .restart = native_machine_restart, + .halt = native_machine_halt, +#ifdef CONFIG_KEXEC + .crash_shutdown = native_machine_crash_shutdown, +#endif +}; + +void machine_power_off(void) +{ + machine_ops.power_off(); +} + +void machine_shutdown(void) +{ + machine_ops.shutdown(); +} + +void machine_emergency_restart(void) +{ + __machine_emergency_restart(1); +} + +void machine_restart(char *cmd) +{ + machine_ops.restart(cmd); +} + +void machine_halt(void) +{ + machine_ops.halt(); +} + +#ifdef CONFIG_KEXEC +void machine_crash_shutdown(struct pt_regs *regs) +{ + machine_ops.crash_shutdown(regs); +} +#endif + + +#if defined(CONFIG_SMP) + +/* This keeps a track of which one is crashing cpu. */ +static int crashing_cpu; +static nmi_shootdown_cb shootdown_callback; + +static atomic_t waiting_for_crash_ipi; + +static int crash_nmi_callback(unsigned int val, struct pt_regs *regs) +{ + int cpu; + + cpu = raw_smp_processor_id(); + + /* + * Don't do anything if this handler is invoked on crashing cpu. + * Otherwise, system will completely hang. Crashing cpu can get + * an NMI if system was initially booted with nmi_watchdog parameter. 
+ */ + if (cpu == crashing_cpu) + return NMI_HANDLED; + local_irq_disable(); + + shootdown_callback(cpu, regs); + + atomic_dec(&waiting_for_crash_ipi); + /* Assume hlt works */ + halt(); + for (;;) + cpu_relax(); + + return NMI_HANDLED; +} + +static void smp_send_nmi_allbutself(void) +{ + apic->send_IPI_allbutself(NMI_VECTOR); +} + +/* + * Halt all other CPUs, calling the specified function on each of them + * + * This function can be used to halt all other CPUs on crash + * or emergency reboot time. The function passed as parameter + * will be called inside a NMI handler on all CPUs. + */ +void nmi_shootdown_cpus(nmi_shootdown_cb callback) +{ + unsigned long msecs; + local_irq_disable(); + + /* Make a note of crashing cpu. Will be used in NMI callback. */ + crashing_cpu = safe_smp_processor_id(); + + shootdown_callback = callback; + + atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1); + /* Would it be better to replace the trap vector here? */ + if (register_nmi_handler(NMI_LOCAL, crash_nmi_callback, + NMI_FLAG_FIRST, "crash")) + return; /* Return what? */ + /* + * Ensure the new callback function is set before sending + * out the NMI + */ + wmb(); + + smp_send_nmi_allbutself(); + + msecs = 1000; /* Wait at most a second for the other cpus to stop */ + while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) { + mdelay(1); + msecs--; + } + + /* Leave the nmi callback set */ +} +#else /* !CONFIG_SMP */ +void nmi_shootdown_cpus(nmi_shootdown_cb callback) +{ + /* No other CPUs to shoot down */ +} +#endif diff --git a/arch/x86/kernel/reboot_fixups_32.c b/arch/x86/kernel/reboot_fixups_32.c new file mode 100644 index 00000000000..c8e41e90f59 --- /dev/null +++ b/arch/x86/kernel/reboot_fixups_32.c @@ -0,0 +1,102 @@ +/* + * This is a good place to put board specific reboot fixups. 
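+ * (Editor's note: a fixup is a PCI vendor/device ID pair plus a handler
+ * in fixups_table[] below; a hypothetical new entry would read
+ *
+ *	{ PCI_VENDOR_ID_FOO, PCI_DEVICE_ID_FOO_SB, foo_warm_reset },
+ *
+ * where foo_warm_reset() takes the matched struct pci_dev and pokes the
+ * board's reset register. Both FOO names are placeholders.)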
+ * + * List of supported fixups: + * geode-gx1/cs5530a - Jaya Kumar <jayalk@intworks.biz> + * geode-gx/lx/cs5536 - Andres Salomon <dilinger@debian.org> + * + */ + +#include <asm/delay.h> +#include <linux/pci.h> +#include <linux/interrupt.h> +#include <asm/reboot_fixups.h> +#include <asm/msr.h> +#include <linux/cs5535.h> + +static void cs5530a_warm_reset(struct pci_dev *dev) +{ + /* writing 1 to the reset control register, 0x44 causes the + cs5530a to perform a system warm reset */ + pci_write_config_byte(dev, 0x44, 0x1); + udelay(50); /* shouldn't get here but be safe and spin-a-while */ + return; +} + +static void cs5536_warm_reset(struct pci_dev *dev) +{ + /* writing 1 to the LSB of this MSR causes a hard reset */ + wrmsrl(MSR_DIVIL_SOFT_RESET, 1ULL); + udelay(50); /* shouldn't get here but be safe and spin a while */ +} + +static void rdc321x_reset(struct pci_dev *dev) +{ + unsigned i; + /* Voluntary reset the watchdog timer */ + outl(0x80003840, 0xCF8); + /* Generate a CPU reset on next tick */ + i = inl(0xCFC); + /* Use the minimum timer resolution */ + i |= 0x1600; + outl(i, 0xCFC); + outb(1, 0x92); +} + +static void ce4100_reset(struct pci_dev *dev) +{ + int i; + + for (i = 0; i < 10; i++) { + outb(0x2, 0xcf9); + udelay(50); + } +} + +struct device_fixup { + unsigned int vendor; + unsigned int device; + void (*reboot_fixup)(struct pci_dev *); +}; + +/* + * PCI ids solely used for fixups_table go here + */ +#define PCI_DEVICE_ID_INTEL_CE4100 0x0708 + +static const struct device_fixup fixups_table[] = { +{ PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, cs5530a_warm_reset }, +{ PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, cs5536_warm_reset }, +{ PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE, cs5530a_warm_reset }, +{ PCI_VENDOR_ID_RDC, PCI_DEVICE_ID_RDC_R6030, rdc321x_reset }, +{ PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CE4100, ce4100_reset }, +}; + +/* + * we see if any fixup is available for our current hardware. if there + * is a fixup, we call it and we expect to never return from it. if we + * do return, we keep looking and then eventually fall back to the + * standard mach_reboot on return. + */ +void mach_reboot_fixups(void) +{ + const struct device_fixup *cur; + struct pci_dev *dev; + int i; + + /* we can be called from sysrq-B code. In such a case it is + * prohibited to dig PCI */ + if (in_interrupt()) + return; + + for (i=0; i < ARRAY_SIZE(fixups_table); i++) { + cur = &(fixups_table[i]); + dev = pci_get_device(cur->vendor, cur->device, NULL); + if (!dev) + continue; + + cur->reboot_fixup(dev); + pci_dev_put(dev); + } +} + diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S new file mode 100644 index 00000000000..e13f8e7c22a --- /dev/null +++ b/arch/x86/kernel/relocate_kernel_32.S @@ -0,0 +1,277 @@ +/* + * relocate_kernel.S - put the kernel image in place to boot + * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. 
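+ * (Editor's note: PTR(x) below is (x << 2) because the incoming page
+ * list is an array of 32-bit words; the 64-bit variant later in this
+ * patch defines it as (x << 3) so the same slot indices apply there.)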
+ */ + +#include <linux/linkage.h> +#include <asm/page_types.h> +#include <asm/kexec.h> +#include <asm/processor-flags.h> + +/* + * Must be relocatable PIC code callable as a C function + */ + +#define PTR(x) (x << 2) + +/* + * control_page + KEXEC_CONTROL_CODE_MAX_SIZE + * ~ control_page + PAGE_SIZE are used as data storage and stack for + * jumping back + */ +#define DATA(offset) (KEXEC_CONTROL_CODE_MAX_SIZE+(offset)) + +/* Minimal CPU state */ +#define ESP DATA(0x0) +#define CR0 DATA(0x4) +#define CR3 DATA(0x8) +#define CR4 DATA(0xc) + +/* other data */ +#define CP_VA_CONTROL_PAGE DATA(0x10) +#define CP_PA_PGD DATA(0x14) +#define CP_PA_SWAP_PAGE DATA(0x18) +#define CP_PA_BACKUP_PAGES_MAP DATA(0x1c) + + .text + .globl relocate_kernel +relocate_kernel: + /* Save the CPU context, used for jumping back */ + + pushl %ebx + pushl %esi + pushl %edi + pushl %ebp + pushf + + movl 20+8(%esp), %ebp /* list of pages */ + movl PTR(VA_CONTROL_PAGE)(%ebp), %edi + movl %esp, ESP(%edi) + movl %cr0, %eax + movl %eax, CR0(%edi) + movl %cr3, %eax + movl %eax, CR3(%edi) + movl %cr4, %eax + movl %eax, CR4(%edi) + + /* read the arguments and say goodbye to the stack */ + movl 20+4(%esp), %ebx /* page_list */ + movl 20+8(%esp), %ebp /* list of pages */ + movl 20+12(%esp), %edx /* start address */ + movl 20+16(%esp), %ecx /* cpu_has_pae */ + movl 20+20(%esp), %esi /* preserve_context */ + + /* zero out flags, and disable interrupts */ + pushl $0 + popfl + + /* save some information for jumping back */ + movl PTR(VA_CONTROL_PAGE)(%ebp), %edi + movl %edi, CP_VA_CONTROL_PAGE(%edi) + movl PTR(PA_PGD)(%ebp), %eax + movl %eax, CP_PA_PGD(%edi) + movl PTR(PA_SWAP_PAGE)(%ebp), %eax + movl %eax, CP_PA_SWAP_PAGE(%edi) + movl %ebx, CP_PA_BACKUP_PAGES_MAP(%edi) + + /* + * get physical address of control page now + * this is impossible after page table switch + */ + movl PTR(PA_CONTROL_PAGE)(%ebp), %edi + + /* switch to new set of page tables */ + movl PTR(PA_PGD)(%ebp), %eax + movl %eax, %cr3 + + /* setup a new stack at the end of the physical control page */ + lea PAGE_SIZE(%edi), %esp + + /* jump to identity mapped page */ + movl %edi, %eax + addl $(identity_mapped - relocate_kernel), %eax + pushl %eax + ret + +identity_mapped: + /* set return address to 0 if not preserving context */ + pushl $0 + /* store the start address on the stack */ + pushl %edx + + /* + * Set cr0 to a known state: + * - Paging disabled + * - Alignment check disabled + * - Write protect disabled + * - No task switch + * - Don't do FP software emulation. + * - Proctected mode enabled + */ + movl %cr0, %eax + andl $~(X86_CR0_PG | X86_CR0_AM | X86_CR0_WP | X86_CR0_TS | X86_CR0_EM), %eax + orl $(X86_CR0_PE), %eax + movl %eax, %cr0 + + /* clear cr4 if applicable */ + testl %ecx, %ecx + jz 1f + /* + * Set cr4 to a known state: + * Setting everything to zero seems safe. + */ + xorl %eax, %eax + movl %eax, %cr4 + + jmp 1f +1: + + /* Flush the TLB (needed?) */ + xorl %eax, %eax + movl %eax, %cr3 + + movl CP_PA_SWAP_PAGE(%edi), %eax + pushl %eax + pushl %ebx + call swap_pages + addl $8, %esp + + /* + * To be certain of avoiding problems with self-modifying code + * I need to execute a serializing instruction here. + * So I flush the TLB, it's handy, and not processor dependent. 
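+ * (Editor's note: a mov to %cr3 is architecturally serializing and
+ * discards non-global TLB entries, so the reload below doubles as the
+ * barrier required after swap_pages rewrote the running code.)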
+ */ + xorl %eax, %eax + movl %eax, %cr3 + + /* + * set all of the registers to known values + * leave %esp alone + */ + + testl %esi, %esi + jnz 1f + xorl %edi, %edi + xorl %eax, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + xorl %esi, %esi + xorl %ebp, %ebp + ret +1: + popl %edx + movl CP_PA_SWAP_PAGE(%edi), %esp + addl $PAGE_SIZE, %esp +2: + call *%edx + + /* get the re-entry point of the peer system */ + movl 0(%esp), %ebp + call 1f +1: + popl %ebx + subl $(1b - relocate_kernel), %ebx + movl CP_VA_CONTROL_PAGE(%ebx), %edi + lea PAGE_SIZE(%ebx), %esp + movl CP_PA_SWAP_PAGE(%ebx), %eax + movl CP_PA_BACKUP_PAGES_MAP(%ebx), %edx + pushl %eax + pushl %edx + call swap_pages + addl $8, %esp + movl CP_PA_PGD(%ebx), %eax + movl %eax, %cr3 + movl %cr0, %eax + orl $X86_CR0_PG, %eax + movl %eax, %cr0 + lea PAGE_SIZE(%edi), %esp + movl %edi, %eax + addl $(virtual_mapped - relocate_kernel), %eax + pushl %eax + ret + +virtual_mapped: + movl CR4(%edi), %eax + movl %eax, %cr4 + movl CR3(%edi), %eax + movl %eax, %cr3 + movl CR0(%edi), %eax + movl %eax, %cr0 + movl ESP(%edi), %esp + movl %ebp, %eax + + popf + popl %ebp + popl %edi + popl %esi + popl %ebx + ret + + /* Do the copies */ +swap_pages: + movl 8(%esp), %edx + movl 4(%esp), %ecx + pushl %ebp + pushl %ebx + pushl %edi + pushl %esi + movl %ecx, %ebx + jmp 1f + +0: /* top, read another word from the indirection page */ + movl (%ebx), %ecx + addl $4, %ebx +1: + testl $0x1, %ecx /* is it a destination page */ + jz 2f + movl %ecx, %edi + andl $0xfffff000, %edi + jmp 0b +2: + testl $0x2, %ecx /* is it an indirection page */ + jz 2f + movl %ecx, %ebx + andl $0xfffff000, %ebx + jmp 0b +2: + testl $0x4, %ecx /* is it the done indicator */ + jz 2f + jmp 3f +2: + testl $0x8, %ecx /* is it the source indicator */ + jz 0b /* Ignore it otherwise */ + movl %ecx, %esi /* For every source page do a copy */ + andl $0xfffff000, %esi + + movl %edi, %eax + movl %esi, %ebp + + movl %edx, %edi + movl $1024, %ecx + rep ; movsl + + movl %ebp, %edi + movl %eax, %esi + movl $1024, %ecx + rep ; movsl + + movl %eax, %edi + movl %edx, %esi + movl $1024, %ecx + rep ; movsl + + lea PAGE_SIZE(%ebp), %esi + jmp 0b +3: + popl %esi + popl %edi + popl %ebx + popl %ebp + ret + + .globl kexec_control_code_size +.set kexec_control_code_size, . - relocate_kernel diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S new file mode 100644 index 00000000000..3fd2c693e47 --- /dev/null +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -0,0 +1,268 @@ +/* + * relocate_kernel.S - put the kernel image in place to boot + * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com> + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. 
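+ * (Editor's note: unlike the 32-bit variant above, this code keeps
+ * CR0.PG set throughout, since long mode cannot execute with paging
+ * disabled; it switches onto identity-mapped tables instead.)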
+ */ + +#include <linux/linkage.h> +#include <asm/page_types.h> +#include <asm/kexec.h> +#include <asm/processor-flags.h> +#include <asm/pgtable_types.h> + +/* + * Must be relocatable PIC code callable as a C function + */ + +#define PTR(x) (x << 3) +#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) + +/* + * control_page + KEXEC_CONTROL_CODE_MAX_SIZE + * ~ control_page + PAGE_SIZE are used as data storage and stack for + * jumping back + */ +#define DATA(offset) (KEXEC_CONTROL_CODE_MAX_SIZE+(offset)) + +/* Minimal CPU state */ +#define RSP DATA(0x0) +#define CR0 DATA(0x8) +#define CR3 DATA(0x10) +#define CR4 DATA(0x18) + +/* other data */ +#define CP_PA_TABLE_PAGE DATA(0x20) +#define CP_PA_SWAP_PAGE DATA(0x28) +#define CP_PA_BACKUP_PAGES_MAP DATA(0x30) + + .text + .align PAGE_SIZE + .code64 + .globl relocate_kernel +relocate_kernel: + /* + * %rdi indirection_page + * %rsi page_list + * %rdx start address + * %rcx preserve_context + */ + + /* Save the CPU context, used for jumping back */ + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + pushf + + movq PTR(VA_CONTROL_PAGE)(%rsi), %r11 + movq %rsp, RSP(%r11) + movq %cr0, %rax + movq %rax, CR0(%r11) + movq %cr3, %rax + movq %rax, CR3(%r11) + movq %cr4, %rax + movq %rax, CR4(%r11) + + /* zero out flags, and disable interrupts */ + pushq $0 + popfq + + /* + * get physical address of control page now + * this is impossible after page table switch + */ + movq PTR(PA_CONTROL_PAGE)(%rsi), %r8 + + /* get physical address of page table now too */ + movq PTR(PA_TABLE_PAGE)(%rsi), %r9 + + /* get physical address of swap page now */ + movq PTR(PA_SWAP_PAGE)(%rsi), %r10 + + /* save some information for jumping back */ + movq %r9, CP_PA_TABLE_PAGE(%r11) + movq %r10, CP_PA_SWAP_PAGE(%r11) + movq %rdi, CP_PA_BACKUP_PAGES_MAP(%r11) + + /* Switch to the identity mapped page tables */ + movq %r9, %cr3 + + /* setup a new stack at the end of the physical control page */ + lea PAGE_SIZE(%r8), %rsp + + /* jump to identity mapped page */ + addq $(identity_mapped - relocate_kernel), %r8 + pushq %r8 + ret + +identity_mapped: + /* set return address to 0 if not preserving context */ + pushq $0 + /* store the start address on the stack */ + pushq %rdx + + /* + * Set cr0 to a known state: + * - Paging enabled + * - Alignment check disabled + * - Write protect disabled + * - No task switch + * - Don't do FP software emulation. + * - Proctected mode enabled + */ + movq %cr0, %rax + andq $~(X86_CR0_AM | X86_CR0_WP | X86_CR0_TS | X86_CR0_EM), %rax + orl $(X86_CR0_PG | X86_CR0_PE), %eax + movq %rax, %cr0 + + /* + * Set cr4 to a known state: + * - physical address extension enabled + */ + movq $X86_CR4_PAE, %rax + movq %rax, %cr4 + + jmp 1f +1: + + /* Flush the TLB (needed?) */ + movq %r9, %cr3 + + movq %rcx, %r11 + call swap_pages + + /* + * To be certain of avoiding problems with self-modifying code + * I need to execute a serializing instruction here. + * So I flush the TLB by reloading %cr3 here, it's handy, + * and not processor dependent. 
+ */ + movq %cr3, %rax + movq %rax, %cr3 + + /* + * set all of the registers to known values + * leave %rsp alone + */ + + testq %r11, %r11 + jnz 1f + xorl %eax, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + xorl %esi, %esi + xorl %edi, %edi + xorl %ebp, %ebp + xorl %r8d, %r8d + xorl %r9d, %r9d + xorl %r10d, %r10d + xorl %r11d, %r11d + xorl %r12d, %r12d + xorl %r13d, %r13d + xorl %r14d, %r14d + xorl %r15d, %r15d + + ret + +1: + popq %rdx + leaq PAGE_SIZE(%r10), %rsp + call *%rdx + + /* get the re-entry point of the peer system */ + movq 0(%rsp), %rbp + call 1f +1: + popq %r8 + subq $(1b - relocate_kernel), %r8 + movq CP_PA_SWAP_PAGE(%r8), %r10 + movq CP_PA_BACKUP_PAGES_MAP(%r8), %rdi + movq CP_PA_TABLE_PAGE(%r8), %rax + movq %rax, %cr3 + lea PAGE_SIZE(%r8), %rsp + call swap_pages + movq $virtual_mapped, %rax + pushq %rax + ret + +virtual_mapped: + movq RSP(%r8), %rsp + movq CR4(%r8), %rax + movq %rax, %cr4 + movq CR3(%r8), %rax + movq CR0(%r8), %r8 + movq %rax, %cr3 + movq %r8, %cr0 + movq %rbp, %rax + + popf + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ret + + /* Do the copies */ +swap_pages: + movq %rdi, %rcx /* Put the page_list in %rcx */ + xorl %edi, %edi + xorl %esi, %esi + jmp 1f + +0: /* top, read another word for the indirection page */ + + movq (%rbx), %rcx + addq $8, %rbx +1: + testq $0x1, %rcx /* is it a destination page? */ + jz 2f + movq %rcx, %rdi + andq $0xfffffffffffff000, %rdi + jmp 0b +2: + testq $0x2, %rcx /* is it an indirection page? */ + jz 2f + movq %rcx, %rbx + andq $0xfffffffffffff000, %rbx + jmp 0b +2: + testq $0x4, %rcx /* is it the done indicator? */ + jz 2f + jmp 3f +2: + testq $0x8, %rcx /* is it the source indicator? */ + jz 0b /* Ignore it otherwise */ + movq %rcx, %rsi /* For ever source page do a copy */ + andq $0xfffffffffffff000, %rsi + + movq %rdi, %rdx + movq %rsi, %rax + + movq %r10, %rdi + movq $512, %rcx + rep ; movsq + + movq %rax, %rdi + movq %rdx, %rsi + movq $512, %rcx + rep ; movsq + + movq %rdx, %rdi + movq %r10, %rsi + movq $512, %rcx + rep ; movsq + + lea PAGE_SIZE(%rax), %rsi + jmp 0b +3: + ret + + .globl kexec_control_code_size +.set kexec_control_code_size, . 
- relocate_kernel diff --git a/arch/x86/kernel/resource.c b/arch/x86/kernel/resource.c new file mode 100644 index 00000000000..2a26819bb6a --- /dev/null +++ b/arch/x86/kernel/resource.c @@ -0,0 +1,48 @@ +#include <linux/ioport.h> +#include <asm/e820.h> + +static void resource_clip(struct resource *res, resource_size_t start, + resource_size_t end) +{ + resource_size_t low = 0, high = 0; + + if (res->end < start || res->start > end) + return; /* no conflict */ + + if (res->start < start) + low = start - res->start; + + if (res->end > end) + high = res->end - end; + + /* Keep the area above or below the conflict, whichever is larger */ + if (low > high) + res->end = start - 1; + else + res->start = end + 1; +} + +static void remove_e820_regions(struct resource *avail) +{ + int i; + struct e820entry *entry; + + for (i = 0; i < e820.nr_map; i++) { + entry = &e820.map[i]; + + resource_clip(avail, entry->addr, + entry->addr + entry->size - 1); + } +} + +void arch_remove_reservations(struct resource *avail) +{ + /* Trim out BIOS areas (low 1MB and high 2MB) and E820 regions */ + if (avail->flags & IORESOURCE_MEM) { + if (avail->start < BIOS_END) + avail->start = BIOS_END; + resource_clip(avail, BIOS_ROM_BASE, BIOS_ROM_END); + + remove_e820_regions(avail); + } +} diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c new file mode 100644 index 00000000000..ca9622a25e9 --- /dev/null +++ b/arch/x86/kernel/rtc.c @@ -0,0 +1,209 @@ +/* + * RTC related functions + */ +#include <linux/platform_device.h> +#include <linux/mc146818rtc.h> +#include <linux/acpi.h> +#include <linux/bcd.h> +#include <linux/export.h> +#include <linux/pnp.h> +#include <linux/of.h> + +#include <asm/vsyscall.h> +#include <asm/x86_init.h> +#include <asm/time.h> +#include <asm/intel-mid.h> +#include <asm/rtc.h> + +#ifdef CONFIG_X86_32 +/* + * This is a special lock that is owned by the CPU and holds the index + * register we are working with. It is required for NMI access to the + * CMOS/RTC registers. See include/asm-i386/mc146818rtc.h for details. + */ +volatile unsigned long cmos_lock; +EXPORT_SYMBOL(cmos_lock); +#endif /* CONFIG_X86_32 */ + +/* For two digit years assume time is always after that */ +#define CMOS_YEARS_OFFS 2000 + +DEFINE_SPINLOCK(rtc_lock); +EXPORT_SYMBOL(rtc_lock); + +/* + * In order to set the CMOS clock precisely, set_rtc_mmss has to be + * called 500 ms after the second nowtime has started, because when + * nowtime is written into the registers of the CMOS clock, it will + * jump to the next second precisely 500 ms later. Check the Motorola + * MC146818A or Dallas DS12887 data sheet for details. + */ +int mach_set_rtc_mmss(const struct timespec *now) +{ + unsigned long nowtime = now->tv_sec; + struct rtc_time tm; + int retval = 0; + + rtc_time_to_tm(nowtime, &tm); + if (!rtc_valid_tm(&tm)) { + retval = set_rtc_time(&tm); + if (retval) + printk(KERN_ERR "%s: RTC write failed with error %d\n", + __FUNCTION__, retval); + } else { + printk(KERN_ERR + "%s: Invalid RTC value: write of %lx to RTC failed\n", + __FUNCTION__, nowtime); + retval = -EINVAL; + } + return retval; +} + +void mach_get_cmos_time(struct timespec *now) +{ + unsigned int status, year, mon, day, hour, min, sec, century = 0; + unsigned long flags; + + spin_lock_irqsave(&rtc_lock, flags); + + /* + * If UIP is clear, then we have >= 244 microseconds before + * RTC registers will be updated. Spec sheet says that this + * is the reliable way to read RTC - registers. If UIP is set + * then the register access might be invalid. 
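+	 * (Editor's note: UIP is bit 7 of RTC register A, polled below as
+	 * CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP; once it reads clear, the
+	 * MC146818 guarantees at least 244 us before the next update cycle,
+	 * which bounds the busy-wait that follows.)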
+	 */
+	while ((CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP))
+		cpu_relax();
+
+	sec = CMOS_READ(RTC_SECONDS);
+	min = CMOS_READ(RTC_MINUTES);
+	hour = CMOS_READ(RTC_HOURS);
+	day = CMOS_READ(RTC_DAY_OF_MONTH);
+	mon = CMOS_READ(RTC_MONTH);
+	year = CMOS_READ(RTC_YEAR);
+
+#ifdef CONFIG_ACPI
+	if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID &&
+	    acpi_gbl_FADT.century)
+		century = CMOS_READ(acpi_gbl_FADT.century);
+#endif
+
+	status = CMOS_READ(RTC_CONTROL);
+	WARN_ON_ONCE(RTC_ALWAYS_BCD && (status & RTC_DM_BINARY));
+
+	spin_unlock_irqrestore(&rtc_lock, flags);
+
+	if (RTC_ALWAYS_BCD || !(status & RTC_DM_BINARY)) {
+		sec = bcd2bin(sec);
+		min = bcd2bin(min);
+		hour = bcd2bin(hour);
+		day = bcd2bin(day);
+		mon = bcd2bin(mon);
+		year = bcd2bin(year);
+	}
+
+	if (century) {
+		century = bcd2bin(century);
+		year += century * 100;
+	} else
+		year += CMOS_YEARS_OFFS;
+
+	now->tv_sec = mktime(year, mon, day, hour, min, sec);
+	now->tv_nsec = 0;
+}
+
+/* Routines for accessing the CMOS RAM/RTC. */
+unsigned char rtc_cmos_read(unsigned char addr)
+{
+	unsigned char val;
+
+	lock_cmos_prefix(addr);
+	outb(addr, RTC_PORT(0));
+	val = inb(RTC_PORT(1));
+	lock_cmos_suffix(addr);
+
+	return val;
+}
+EXPORT_SYMBOL(rtc_cmos_read);
+
+void rtc_cmos_write(unsigned char val, unsigned char addr)
+{
+	lock_cmos_prefix(addr);
+	outb(addr, RTC_PORT(0));
+	outb(val, RTC_PORT(1));
+	lock_cmos_suffix(addr);
+}
+EXPORT_SYMBOL(rtc_cmos_write);
+
+int update_persistent_clock(struct timespec now)
+{
+	return x86_platform.set_wallclock(&now);
+}
+
+/* not static: needed by APM */
+void read_persistent_clock(struct timespec *ts)
+{
+	x86_platform.get_wallclock(ts);
+}
+
+
+static struct resource rtc_resources[] = {
+	[0] = {
+		.start	= RTC_PORT(0),
+		.end	= RTC_PORT(1),
+		.flags	= IORESOURCE_IO,
+	},
+	[1] = {
+		.start	= RTC_IRQ,
+		.end	= RTC_IRQ,
+		.flags	= IORESOURCE_IRQ,
+	}
+};
+
+static struct platform_device rtc_device = {
+	.name		= "rtc_cmos",
+	.id		= -1,
+	.resource	= rtc_resources,
+	.num_resources	= ARRAY_SIZE(rtc_resources),
+};
+
+static __init int add_rtc_cmos(void)
+{
+#ifdef CONFIG_PNP
+	static const char * const ids[] __initconst =
+	    { "PNP0b00", "PNP0b01", "PNP0b02", };
+	struct pnp_dev *dev;
+	struct pnp_id *id;
+	int i;
+
+	pnp_for_each_dev(dev) {
+		for (id = dev->id; id; id = id->next) {
+			for (i = 0; i < ARRAY_SIZE(ids); i++) {
+				if (compare_pnp_id(id, ids[i]) != 0)
+					return 0;
+			}
+		}
+	}
+#endif
+	if (of_have_populated_dt())
+		return 0;
+
+	/* Intel MID platforms don't have ioport rtc */
+	if (intel_mid_identify_cpu())
+		return -ENODEV;
+
+#ifdef CONFIG_ACPI
+	if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_CMOS_RTC) {
+		/* This warning can likely go away again in a year or two.
*/ + pr_info("ACPI: not registering RTC platform device\n"); + return -ENODEV; + } +#endif + + platform_device_register(&rtc_device); + dev_info(&rtc_device.dev, + "registered platform RTC device (no PNP device found)\n"); + + return 0; +} +device_initcall(add_rtc_cmos); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c new file mode 100644 index 00000000000..78a0e629892 --- /dev/null +++ b/arch/x86/kernel/setup.c @@ -0,0 +1,1273 @@ +/* + * Copyright (C) 1995 Linus Torvalds + * + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 + * + * Memory region support + * David Parsons <orc@pell.chi.il.us>, July-August 1999 + * + * Added E820 sanitization routine (removes overlapping memory regions); + * Brian Moyle <bmoyle@mvista.com>, February 2001 + * + * Moved CPU detection code to cpu/${cpu}.c + * Patrick Mochel <mochel@osdl.org>, March 2002 + * + * Provisions for empty E820 memory regions (reported by certain BIOSes). + * Alex Achenbach <xela@slit.de>, December 2002. + * + */ + +/* + * This file handles the architecture-dependent parts of initialization + */ + +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/mmzone.h> +#include <linux/screen_info.h> +#include <linux/ioport.h> +#include <linux/acpi.h> +#include <linux/sfi.h> +#include <linux/apm_bios.h> +#include <linux/initrd.h> +#include <linux/bootmem.h> +#include <linux/memblock.h> +#include <linux/seq_file.h> +#include <linux/console.h> +#include <linux/root_dev.h> +#include <linux/highmem.h> +#include <linux/module.h> +#include <linux/efi.h> +#include <linux/init.h> +#include <linux/edd.h> +#include <linux/iscsi_ibft.h> +#include <linux/nodemask.h> +#include <linux/kexec.h> +#include <linux/dmi.h> +#include <linux/pfn.h> +#include <linux/pci.h> +#include <asm/pci-direct.h> +#include <linux/init_ohci1394_dma.h> +#include <linux/kvm_para.h> +#include <linux/dma-contiguous.h> + +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/stddef.h> +#include <linux/unistd.h> +#include <linux/ptrace.h> +#include <linux/user.h> +#include <linux/delay.h> + +#include <linux/kallsyms.h> +#include <linux/cpufreq.h> +#include <linux/dma-mapping.h> +#include <linux/ctype.h> +#include <linux/uaccess.h> + +#include <linux/percpu.h> +#include <linux/crash_dump.h> +#include <linux/tboot.h> +#include <linux/jiffies.h> + +#include <video/edid.h> + +#include <asm/mtrr.h> +#include <asm/apic.h> +#include <asm/realmode.h> +#include <asm/e820.h> +#include <asm/mpspec.h> +#include <asm/setup.h> +#include <asm/efi.h> +#include <asm/timer.h> +#include <asm/i8259.h> +#include <asm/sections.h> +#include <asm/io_apic.h> +#include <asm/ist.h> +#include <asm/setup_arch.h> +#include <asm/bios_ebda.h> +#include <asm/cacheflush.h> +#include <asm/processor.h> +#include <asm/bugs.h> + +#include <asm/vsyscall.h> +#include <asm/cpu.h> +#include <asm/desc.h> +#include <asm/dma.h> +#include <asm/iommu.h> +#include <asm/gart.h> +#include <asm/mmu_context.h> +#include <asm/proto.h> + +#include <asm/paravirt.h> +#include <asm/hypervisor.h> +#include <asm/olpc_ofw.h> + +#include <asm/percpu.h> +#include <asm/topology.h> +#include <asm/apicdef.h> +#include <asm/amd_nb.h> +#include <asm/mce.h> +#include <asm/alternative.h> +#include <asm/prom.h> + +/* + * max_low_pfn_mapped: highest direct mapped pfn under 4GB + * max_pfn_mapped: highest direct mapped pfn over 4GB + * + * The direct mapping only covers E820_RAM regions, so the ranges and gaps are + * represented by pfn_mapped + */ +unsigned long max_low_pfn_mapped; +unsigned 
long max_pfn_mapped; + +#ifdef CONFIG_DMI +RESERVE_BRK(dmi_alloc, 65536); +#endif + + +static __initdata unsigned long _brk_start = (unsigned long)__brk_base; +unsigned long _brk_end = (unsigned long)__brk_base; + +#ifdef CONFIG_X86_64 +int default_cpu_present_to_apicid(int mps_cpu) +{ + return __default_cpu_present_to_apicid(mps_cpu); +} + +int default_check_phys_apicid_present(int phys_apicid) +{ + return __default_check_phys_apicid_present(phys_apicid); +} +#endif + +struct boot_params boot_params; + +/* + * Machine setup.. + */ +static struct resource data_resource = { + .name = "Kernel data", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; + +static struct resource code_resource = { + .name = "Kernel code", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; + +static struct resource bss_resource = { + .name = "Kernel bss", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; + + +#ifdef CONFIG_X86_32 +/* cpu data as detected by the assembly code in head.S */ +struct cpuinfo_x86 new_cpu_data = { + .wp_works_ok = -1, +}; +/* common cpu data for all cpus */ +struct cpuinfo_x86 boot_cpu_data __read_mostly = { + .wp_works_ok = -1, +}; +EXPORT_SYMBOL(boot_cpu_data); + +unsigned int def_to_bigsmp; + +/* for MCA, but anyone else can use it if they want */ +unsigned int machine_id; +unsigned int machine_submodel_id; +unsigned int BIOS_revision; + +struct apm_info apm_info; +EXPORT_SYMBOL(apm_info); + +#if defined(CONFIG_X86_SPEEDSTEP_SMI) || \ + defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE) +struct ist_info ist_info; +EXPORT_SYMBOL(ist_info); +#else +struct ist_info ist_info; +#endif + +#else +struct cpuinfo_x86 boot_cpu_data __read_mostly = { + .x86_phys_bits = MAX_PHYSMEM_BITS, +}; +EXPORT_SYMBOL(boot_cpu_data); +#endif + + +#if !defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64) +__visible unsigned long mmu_cr4_features; +#else +__visible unsigned long mmu_cr4_features = X86_CR4_PAE; +#endif + +/* Boot loader ID and version as integers, for the benefit of proc_dointvec */ +int bootloader_type, bootloader_version; + +/* + * Setup options + */ +struct screen_info screen_info; +EXPORT_SYMBOL(screen_info); +struct edid_info edid_info; +EXPORT_SYMBOL_GPL(edid_info); + +extern int root_mountflags; + +unsigned long saved_video_mode; + +#define RAMDISK_IMAGE_START_MASK 0x07FF +#define RAMDISK_PROMPT_FLAG 0x8000 +#define RAMDISK_LOAD_FLAG 0x4000 + +static char __initdata command_line[COMMAND_LINE_SIZE]; +#ifdef CONFIG_CMDLINE_BOOL +static char __initdata builtin_cmdline[COMMAND_LINE_SIZE] = CONFIG_CMDLINE; +#endif + +#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) +struct edd edd; +#ifdef CONFIG_EDD_MODULE +EXPORT_SYMBOL(edd); +#endif +/** + * copy_edd() - Copy the BIOS EDD information + * from boot_params into a safe place. 
+ * + */ +static inline void __init copy_edd(void) +{ + memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer, + sizeof(edd.mbr_signature)); + memcpy(edd.edd_info, boot_params.eddbuf, sizeof(edd.edd_info)); + edd.mbr_signature_nr = boot_params.edd_mbr_sig_buf_entries; + edd.edd_info_nr = boot_params.eddbuf_entries; +} +#else +static inline void __init copy_edd(void) +{ +} +#endif + +void * __init extend_brk(size_t size, size_t align) +{ + size_t mask = align - 1; + void *ret; + + BUG_ON(_brk_start == 0); + BUG_ON(align & mask); + + _brk_end = (_brk_end + mask) & ~mask; + BUG_ON((char *)(_brk_end + size) > __brk_limit); + + ret = (void *)_brk_end; + _brk_end += size; + + memset(ret, 0, size); + + return ret; +} + +#ifdef CONFIG_X86_32 +static void __init cleanup_highmap(void) +{ +} +#endif + +static void __init reserve_brk(void) +{ + if (_brk_end > _brk_start) + memblock_reserve(__pa_symbol(_brk_start), + _brk_end - _brk_start); + + /* Mark brk area as locked down and no longer taking any + new allocations */ + _brk_start = 0; +} + +u64 relocated_ramdisk; + +#ifdef CONFIG_BLK_DEV_INITRD + +static u64 __init get_ramdisk_image(void) +{ + u64 ramdisk_image = boot_params.hdr.ramdisk_image; + + ramdisk_image |= (u64)boot_params.ext_ramdisk_image << 32; + + return ramdisk_image; +} +static u64 __init get_ramdisk_size(void) +{ + u64 ramdisk_size = boot_params.hdr.ramdisk_size; + + ramdisk_size |= (u64)boot_params.ext_ramdisk_size << 32; + + return ramdisk_size; +} + +#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) +static void __init relocate_initrd(void) +{ + /* Assume only end is not page aligned */ + u64 ramdisk_image = get_ramdisk_image(); + u64 ramdisk_size = get_ramdisk_size(); + u64 area_size = PAGE_ALIGN(ramdisk_size); + unsigned long slop, clen, mapaddr; + char *p, *q; + + /* We need to move the initrd down into directly mapped mem */ + relocated_ramdisk = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), + area_size, PAGE_SIZE); + + if (!relocated_ramdisk) + panic("Cannot find place for new RAMDISK of size %lld\n", + ramdisk_size); + + /* Note: this includes all the mem currently occupied by + the initrd, we rely on that fact to keep the data intact. 
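+ The copy below runs through early_memremap() in MAX_MAP_CHUNK-sized pieces, since the source area need not be covered by the direct mapping yet.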
*/ + memblock_reserve(relocated_ramdisk, area_size); + initrd_start = relocated_ramdisk + PAGE_OFFSET; + initrd_end = initrd_start + ramdisk_size; + printk(KERN_INFO "Allocated new RAMDISK: [mem %#010llx-%#010llx]\n", + relocated_ramdisk, relocated_ramdisk + ramdisk_size - 1); + + q = (char *)initrd_start; + + /* Copy the initrd */ + while (ramdisk_size) { + slop = ramdisk_image & ~PAGE_MASK; + clen = ramdisk_size; + if (clen > MAX_MAP_CHUNK-slop) + clen = MAX_MAP_CHUNK-slop; + mapaddr = ramdisk_image & PAGE_MASK; + p = early_memremap(mapaddr, clen+slop); + memcpy(q, p+slop, clen); + early_iounmap(p, clen+slop); + q += clen; + ramdisk_image += clen; + ramdisk_size -= clen; + } + + ramdisk_image = get_ramdisk_image(); + ramdisk_size = get_ramdisk_size(); + printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to" + " [mem %#010llx-%#010llx]\n", + ramdisk_image, ramdisk_image + ramdisk_size - 1, + relocated_ramdisk, relocated_ramdisk + ramdisk_size - 1); +} + +static void __init early_reserve_initrd(void) +{ + /* Assume only end is not page aligned */ + u64 ramdisk_image = get_ramdisk_image(); + u64 ramdisk_size = get_ramdisk_size(); + u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); + + if (!boot_params.hdr.type_of_loader || + !ramdisk_image || !ramdisk_size) + return; /* No initrd provided by bootloader */ + + memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image); +} +static void __init reserve_initrd(void) +{ + /* Assume only end is not page aligned */ + u64 ramdisk_image = get_ramdisk_image(); + u64 ramdisk_size = get_ramdisk_size(); + u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); + u64 mapped_size; + + if (!boot_params.hdr.type_of_loader || + !ramdisk_image || !ramdisk_size) + return; /* No initrd provided by bootloader */ + + initrd_start = 0; + + mapped_size = memblock_mem_size(max_pfn_mapped); + if (ramdisk_size >= (mapped_size>>1)) + panic("initrd too large to handle, " + "disabling initrd (%lld needed, %lld available)\n", + ramdisk_size, mapped_size>>1); + + printk(KERN_INFO "RAMDISK: [mem %#010llx-%#010llx]\n", ramdisk_image, + ramdisk_end - 1); + + if (pfn_range_is_mapped(PFN_DOWN(ramdisk_image), + PFN_DOWN(ramdisk_end))) { + /* All are mapped, easy case */ + initrd_start = ramdisk_image + PAGE_OFFSET; + initrd_end = initrd_start + ramdisk_size; + return; + } + + relocate_initrd(); + + memblock_free(ramdisk_image, ramdisk_end - ramdisk_image); +} +#else +static void __init early_reserve_initrd(void) +{ +} +static void __init reserve_initrd(void) +{ +} +#endif /* CONFIG_BLK_DEV_INITRD */ + +static void __init parse_setup_data(void) +{ + struct setup_data *data; + u64 pa_data, pa_next; + + pa_data = boot_params.hdr.setup_data; + while (pa_data) { + u32 data_len, map_len, data_type; + + map_len = max(PAGE_SIZE - (pa_data & ~PAGE_MASK), + (u64)sizeof(struct setup_data)); + data = early_memremap(pa_data, map_len); + data_len = data->len + sizeof(struct setup_data); + data_type = data->type; + pa_next = data->next; + early_iounmap(data, map_len); + + switch (data_type) { + case SETUP_E820_EXT: + parse_e820_ext(pa_data, data_len); + break; + case SETUP_DTB: + add_dtb(pa_data); + break; + case SETUP_EFI: + parse_efi_setup(pa_data, data_len); + break; + default: + break; + } + pa_data = pa_next; + } +} + +static void __init e820_reserve_setup_data(void) +{ + struct setup_data *data; + u64 pa_data; + int found = 0; + + pa_data = boot_params.hdr.setup_data; + while (pa_data) { + data = early_memremap(pa_data, sizeof(*data)); + e820_update_range(pa_data, 
sizeof(*data)+data->len, + E820_RAM, E820_RESERVED_KERN); + found = 1; + pa_data = data->next; + early_iounmap(data, sizeof(*data)); + } + if (!found) + return; + + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + memcpy(&e820_saved, &e820, sizeof(struct e820map)); + printk(KERN_INFO "extended physical RAM map:\n"); + e820_print_map("reserve setup_data"); +} + +static void __init memblock_x86_reserve_range_setup_data(void) +{ + struct setup_data *data; + u64 pa_data; + + pa_data = boot_params.hdr.setup_data; + while (pa_data) { + data = early_memremap(pa_data, sizeof(*data)); + memblock_reserve(pa_data, sizeof(*data) + data->len); + pa_data = data->next; + early_iounmap(data, sizeof(*data)); + } +} + +/* + * --------- Crashkernel reservation ------------------------------ + */ + +#ifdef CONFIG_KEXEC + +/* + * Keep the crash kernel below this limit. On 32-bit, earlier kernels + * would limit the crash kernel to the low 512 MiB due to mapping + * restrictions. On 64-bit, old kexec-tools need it to be below 896 MiB. + */ +#ifdef CONFIG_X86_32 +# define CRASH_KERNEL_ADDR_LOW_MAX (512 << 20) +# define CRASH_KERNEL_ADDR_HIGH_MAX (512 << 20) +#else +# define CRASH_KERNEL_ADDR_LOW_MAX (896UL<<20) +# define CRASH_KERNEL_ADDR_HIGH_MAX MAXMEM +#endif + +static void __init reserve_crashkernel_low(void) +{ +#ifdef CONFIG_X86_64 + const unsigned long long alignment = 16<<20; /* 16M */ + unsigned long long low_base = 0, low_size = 0; + unsigned long total_low_mem; + unsigned long long base; + bool auto_set = false; + int ret; + + total_low_mem = memblock_mem_size(1UL<<(32-PAGE_SHIFT)); + /* crashkernel=Y,low */ + ret = parse_crashkernel_low(boot_command_line, total_low_mem, + &low_size, &base); + if (ret != 0) { + /* + * two parts from lib/swiotlb.c: + * swiotlb size: user specified with swiotlb= or default. + * swiotlb overflow buffer: now is hardcoded to 32k. + * We round it to 8M for other buffers that + * may need to stay low too. + */ + low_size = swiotlb_size_or_default() + (8UL<<20); + auto_set = true; + } else { + /* passed with crashkernel=0,low ? */ + if (!low_size) + return; + } + + low_base = memblock_find_in_range(low_size, (1ULL<<32), + low_size, alignment); + + if (!low_base) { + if (!auto_set) + pr_info("crashkernel low reservation failed - No suitable area found.\n"); + + return; + } + + memblock_reserve(low_base, low_size); + pr_info("Reserving %ldMB of low memory at %ldMB for crashkernel (System low RAM: %ldMB)\n", + (unsigned long)(low_size >> 20), + (unsigned long)(low_base >> 20), + (unsigned long)(total_low_mem >> 20)); + crashk_low_res.start = low_base; + crashk_low_res.end = low_base + low_size - 1; + insert_resource(&iomem_resource, &crashk_low_res); +#endif +} + +static void __init reserve_crashkernel(void) +{ + const unsigned long long alignment = 16<<20; /* 16M */ + unsigned long long total_mem; + unsigned long long crash_size, crash_base; + bool high = false; + int ret; + + total_mem = memblock_phys_mem_size(); + + /* crashkernel=XM */ + ret = parse_crashkernel(boot_command_line, total_mem, + &crash_size, &crash_base); + if (ret != 0 || crash_size <= 0) { + /* crashkernel=X,high */ + ret = parse_crashkernel_high(boot_command_line, total_mem, + &crash_size, &crash_base); + if (ret != 0 || crash_size <= 0) + return; + high = true; + } + + /* 0 means: find the address automatically */ + if (crash_base <= 0) { + /* + * kexec wants the bzImage below CRASH_KERNEL_ADDR_MAX + */ + crash_base = memblock_find_in_range(alignment, + high ?
CRASH_KERNEL_ADDR_HIGH_MAX : + CRASH_KERNEL_ADDR_LOW_MAX, + crash_size, alignment); + + if (!crash_base) { + pr_info("crashkernel reservation failed - No suitable area found.\n"); + return; + } + + } else { + unsigned long long start; + + start = memblock_find_in_range(crash_base, + crash_base + crash_size, crash_size, 1<<20); + if (start != crash_base) { + pr_info("crashkernel reservation failed - memory is in use.\n"); + return; + } + } + memblock_reserve(crash_base, crash_size); + + printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " + "for crashkernel (System RAM: %ldMB)\n", + (unsigned long)(crash_size >> 20), + (unsigned long)(crash_base >> 20), + (unsigned long)(total_mem >> 20)); + + crashk_res.start = crash_base; + crashk_res.end = crash_base + crash_size - 1; + insert_resource(&iomem_resource, &crashk_res); + + if (crash_base >= (1ULL<<32)) + reserve_crashkernel_low(); +} +#else +static void __init reserve_crashkernel(void) +{ +} +#endif + +static struct resource standard_io_resources[] = { + { .name = "dma1", .start = 0x00, .end = 0x1f, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "pic1", .start = 0x20, .end = 0x21, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "timer0", .start = 0x40, .end = 0x43, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "timer1", .start = 0x50, .end = 0x53, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "keyboard", .start = 0x60, .end = 0x60, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "keyboard", .start = 0x64, .end = 0x64, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "dma page reg", .start = 0x80, .end = 0x8f, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "pic2", .start = 0xa0, .end = 0xa1, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "dma2", .start = 0xc0, .end = 0xdf, + .flags = IORESOURCE_BUSY | IORESOURCE_IO }, + { .name = "fpu", .start = 0xf0, .end = 0xff, + .flags = IORESOURCE_BUSY | IORESOURCE_IO } +}; + +void __init reserve_standard_io_resources(void) +{ + int i; + + /* request I/O space for devices used on all i[345]86 PCs */ + for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++) + request_resource(&ioport_resource, &standard_io_resources[i]); + +} + +static __init void reserve_ibft_region(void) +{ + unsigned long addr, size = 0; + + addr = find_ibft_region(&size); + + if (size) + memblock_reserve(addr, size); +} + +static bool __init snb_gfx_workaround_needed(void) +{ +#ifdef CONFIG_PCI + int i; + u16 vendor, devid; + static const __initconst u16 snb_ids[] = { + 0x0102, + 0x0112, + 0x0122, + 0x0106, + 0x0116, + 0x0126, + 0x010a, + }; + + /* Assume no if something weird is going on with PCI */ + if (!early_pci_allowed()) + return false; + + vendor = read_pci_config_16(0, 2, 0, PCI_VENDOR_ID); + if (vendor != 0x8086) + return false; + + devid = read_pci_config_16(0, 2, 0, PCI_DEVICE_ID); + for (i = 0; i < ARRAY_SIZE(snb_ids); i++) + if (devid == snb_ids[i]) + return true; +#endif + + return false; +} + +/* + * Sandy Bridge graphics has trouble with certain ranges, exclude + * them from allocation. + */ +static void __init trim_snb_memory(void) +{ + static const __initconst unsigned long bad_pages[] = { + 0x20050000, + 0x20110000, + 0x20130000, + 0x20138000, + 0x40004000, + }; + int i; + + if (!snb_gfx_workaround_needed()) + return; + + printk(KERN_DEBUG "reserving inaccessible SNB gfx pages\n"); + + /* + * Reserve all memory below the 1 MB mark that has not + * already been reserved. 
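+ * memblock_reserve() copes with ranges that overlap existing reservations, so blanket-reserving the whole low megabyte here is safe.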
+ */ + memblock_reserve(0, 1<<20); + + for (i = 0; i < ARRAY_SIZE(bad_pages); i++) { + if (memblock_reserve(bad_pages[i], PAGE_SIZE)) + printk(KERN_WARNING "failed to reserve 0x%08lx\n", + bad_pages[i]); + } +} + +/* + * Here we put platform-specific memory range workarounds, i.e. + * memory known to be corrupt or otherwise in need to be reserved on + * specific platforms. + * + * If this gets used more widely it could use a real dispatch mechanism. + */ +static void __init trim_platform_memory_ranges(void) +{ + trim_snb_memory(); +} + +static void __init trim_bios_range(void) +{ + /* + * A special case is the first 4Kb of memory; + * This is a BIOS owned area, not kernel ram, but generally + * not listed as such in the E820 table. + * + * This typically reserves additional memory (64KiB by default) + * since some BIOSes are known to corrupt low memory. See the + * Kconfig help text for X86_RESERVE_LOW. + */ + e820_update_range(0, PAGE_SIZE, E820_RAM, E820_RESERVED); + + /* + * special case: Some BIOSen report the PC BIOS + * area (640->1Mb) as ram even though it is not. + * take them out. + */ + e820_remove_range(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_RAM, 1); + + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); +} + +/* called before trim_bios_range() to spare extra sanitize */ +static void __init e820_add_kernel_range(void) +{ + u64 start = __pa_symbol(_text); + u64 size = __pa_symbol(_end) - start; + + /* + * Complain if .text .data and .bss are not marked as E820_RAM and + * attempt to fix it by adding the range. We may have a confused BIOS, + * or the user may have used memmap=exactmap or memmap=xxM$yyM to + * exclude kernel range. If we really are running on top non-RAM, + * we will crash later anyways. + */ + if (e820_all_mapped(start, start + size, E820_RAM)) + return; + + pr_warn(".text .data .bss are not marked as E820_RAM!\n"); + e820_remove_range(start, size, E820_RAM, 0); + e820_add_region(start, size, E820_RAM); +} + +static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10; + +static int __init parse_reservelow(char *p) +{ + unsigned long long size; + + if (!p) + return -EINVAL; + + size = memparse(p, &p); + + if (size < 4096) + size = 4096; + + if (size > 640*1024) + size = 640*1024; + + reserve_low = size; + + return 0; +} + +early_param("reservelow", parse_reservelow); + +static void __init trim_low_memory_range(void) +{ + memblock_reserve(0, ALIGN(reserve_low, PAGE_SIZE)); +} + +/* + * Dump out kernel offset information on panic. + */ +static int +dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p) +{ + pr_emerg("Kernel Offset: 0x%lx from 0x%lx " + "(relocation range: 0x%lx-0x%lx)\n", + (unsigned long)&_text - __START_KERNEL, __START_KERNEL, + __START_KERNEL_map, MODULES_VADDR-1); + + return 0; +} + +/* + * Determine if we were loaded by an EFI loader. If so, then we have also been + * passed the efi memmap, systab, etc., so we should use these data structures + * for initialization. Note, the efi init code path is determined by the + * global efi_enabled. This allows the same kernel image to be used on existing + * systems (with a traditional BIOS) as well as on EFI systems. + */ +/* + * setup_arch - architecture-specific boot-time initializations + * + * Note: On x86_64, fixmaps are ready for use even before this is called. 
+ */ + +void __init setup_arch(char **cmdline_p) +{ + memblock_reserve(__pa_symbol(_text), + (unsigned long)__bss_stop - (unsigned long)_text); + + early_reserve_initrd(); + + /* + * At this point everything still needed from the boot loader + * or BIOS or kernel text should be early reserved or marked not + * RAM in e820. All other memory is free game. + */ + +#ifdef CONFIG_X86_32 + memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); + + /* + * copy kernel address range established so far and switch + * to the proper swapper page table + */ + clone_pgd_range(swapper_pg_dir + KERNEL_PGD_BOUNDARY, + initial_page_table + KERNEL_PGD_BOUNDARY, + KERNEL_PGD_PTRS); + + load_cr3(swapper_pg_dir); + __flush_tlb_all(); +#else + printk(KERN_INFO "Command line: %s\n", boot_command_line); +#endif + + /* + * If we have OLPC OFW, we might end up relocating the fixmap due to + * reserve_top(), so do this before touching the ioremap area. + */ + olpc_ofw_detect(); + + early_trap_init(); + early_cpu_init(); + early_ioremap_init(); + + setup_olpc_ofw_pgd(); + + ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev); + screen_info = boot_params.screen_info; + edid_info = boot_params.edid_info; +#ifdef CONFIG_X86_32 + apm_info.bios = boot_params.apm_bios_info; + ist_info = boot_params.ist_info; + if (boot_params.sys_desc_table.length != 0) { + machine_id = boot_params.sys_desc_table.table[0]; + machine_submodel_id = boot_params.sys_desc_table.table[1]; + BIOS_revision = boot_params.sys_desc_table.table[2]; + } +#endif + saved_video_mode = boot_params.hdr.vid_mode; + bootloader_type = boot_params.hdr.type_of_loader; + if ((bootloader_type >> 4) == 0xe) { + bootloader_type &= 0xf; + bootloader_type |= (boot_params.hdr.ext_loader_type+0x10) << 4; + } + bootloader_version = bootloader_type & 0xf; + bootloader_version |= boot_params.hdr.ext_loader_ver << 4; + +#ifdef CONFIG_BLK_DEV_RAM + rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK; + rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0); + rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0); +#endif +#ifdef CONFIG_EFI + if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, + "EL32", 4)) { + set_bit(EFI_BOOT, &efi.flags); + } else if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, + "EL64", 4)) { + set_bit(EFI_BOOT, &efi.flags); + set_bit(EFI_64BIT, &efi.flags); + } + + if (efi_enabled(EFI_BOOT)) + efi_memblock_x86_reserve_range(); +#endif + + x86_init.oem.arch_setup(); + + iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1; + setup_memory_map(); + parse_setup_data(); + + copy_edd(); + + if (!boot_params.hdr.root_flags) + root_mountflags &= ~MS_RDONLY; + init_mm.start_code = (unsigned long) _text; + init_mm.end_code = (unsigned long) _etext; + init_mm.end_data = (unsigned long) _edata; + init_mm.brk = _brk_end; + + code_resource.start = __pa_symbol(_text); + code_resource.end = __pa_symbol(_etext)-1; + data_resource.start = __pa_symbol(_etext); + data_resource.end = __pa_symbol(_edata)-1; + bss_resource.start = __pa_symbol(__bss_start); + bss_resource.end = __pa_symbol(__bss_stop)-1; + +#ifdef CONFIG_CMDLINE_BOOL +#ifdef CONFIG_CMDLINE_OVERRIDE + strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE); +#else + if (builtin_cmdline[0]) { + /* append boot loader cmdline to builtin */ + strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE); + strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE); + strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE); + 
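+ /* builtin_cmdline is only scratch space here; the merged string is copied back so boot_command_line stays authoritative. */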
} +#endif +#endif + + strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); + *cmdline_p = command_line; + + /* + * x86_configure_nx() is called before parse_early_param() to detect + * whether hardware doesn't support NX (so that the early EHCI debug + * console setup can safely call set_fixmap()). It may then be called + * again from within noexec_setup() during parsing early parameters + * to honor the respective command line option. + */ + x86_configure_nx(); + + parse_early_param(); + + x86_report_nx(); + + /* after early param, so could get panic from serial */ + memblock_x86_reserve_range_setup_data(); + + if (acpi_mps_check()) { +#ifdef CONFIG_X86_LOCAL_APIC + disable_apic = 1; +#endif + setup_clear_cpu_cap(X86_FEATURE_APIC); + } + +#ifdef CONFIG_PCI + if (pci_early_dump_regs) + early_dump_pci_devices(); +#endif + + /* update the e820_saved too */ + e820_reserve_setup_data(); + finish_e820_parsing(); + + if (efi_enabled(EFI_BOOT)) + efi_init(); + + dmi_scan_machine(); + dmi_memdev_walk(); + dmi_set_dump_stack_arch_desc(); + + /* + * VMware detection requires dmi to be available, so this + * needs to be done after dmi_scan_machine, for the BP. + */ + init_hypervisor_platform(); + + x86_init.resources.probe_roms(); + + /* after parse_early_param, so could debug it */ + insert_resource(&iomem_resource, &code_resource); + insert_resource(&iomem_resource, &data_resource); + insert_resource(&iomem_resource, &bss_resource); + + e820_add_kernel_range(); + trim_bios_range(); +#ifdef CONFIG_X86_32 + if (ppro_with_ram_bug()) { + e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM, + E820_RESERVED); + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + printk(KERN_INFO "fixed physical RAM map:\n"); + e820_print_map("bad_ppro"); + } +#else + early_gart_iommu_check(); +#endif + + /* + * partially used pages are not usable - thus + * we are rounding upwards: + */ + max_pfn = e820_end_of_ram_pfn(); + + /* update e820 for memory not covered by WB MTRRs */ + mtrr_bp_init(); + if (mtrr_trim_uncached_memory(max_pfn)) + max_pfn = e820_end_of_ram_pfn(); + +#ifdef CONFIG_X86_32 + /* max_low_pfn get updated here */ + find_low_pfn_range(); +#else + check_x2apic(); + + /* How many end-of-memory variables you have, grandma! */ + /* need this before calling reserve_initrd */ + if (max_pfn > (1UL<<(32 - PAGE_SHIFT))) + max_low_pfn = e820_end_of_low_ram_pfn(); + else + max_low_pfn = max_pfn; + + high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; +#endif + + /* + * Find and reserve possible boot-time SMP configuration: + */ + find_smp_config(); + + reserve_ibft_region(); + + early_alloc_pgt_buf(); + + /* + * Need to conclude brk, before memblock_x86_fill() + * it could use memblock_find_in_range, could overlap with + * brk area. + */ + reserve_brk(); + + cleanup_highmap(); + + memblock_set_current_limit(ISA_END_ADDRESS); + memblock_x86_fill(); + + /* + * The EFI specification says that boot service code won't be called + * after ExitBootServices(). This is, in fact, a lie. 
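+ * Some firmware keeps calling into boot services code and data right up until SetVirtualAddressMap(), so those regions must stay reserved until the switch to virtual mode is complete.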
+ */ + if (efi_enabled(EFI_MEMMAP)) + efi_reserve_boot_services(); + + /* preallocate 4k for mptable mpc */ + early_reserve_e820_mpc_new(); + +#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION + setup_bios_corruption_check(); +#endif + +#ifdef CONFIG_X86_32 + printk(KERN_DEBUG "initial memory mapped: [mem 0x00000000-%#010lx]\n", + (max_pfn_mapped<<PAGE_SHIFT) - 1); +#endif + + reserve_real_mode(); + + trim_platform_memory_ranges(); + trim_low_memory_range(); + + init_mem_mapping(); + + early_trap_pf_init(); + + setup_real_mode(); + + memblock_set_current_limit(get_max_mapped()); + dma_contiguous_reserve(max_pfn_mapped << PAGE_SHIFT); + + /* + * NOTE: On x86-32, only from this point on, fixmaps are ready for use. + */ + +#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT + if (init_ohci1394_dma_early) + init_ohci1394_dma_on_all_controllers(); +#endif + /* Allocate bigger log buffer */ + setup_log_buf(1); + + reserve_initrd(); + +#if defined(CONFIG_ACPI) && defined(CONFIG_BLK_DEV_INITRD) + acpi_initrd_override((void *)initrd_start, initrd_end - initrd_start); +#endif + + vsmp_init(); + + io_delay_init(); + + /* + * Parse the ACPI tables for possible boot-time SMP configuration. + */ + acpi_boot_table_init(); + + early_acpi_boot_init(); + + initmem_init(); + + /* + * Reserve memory for crash kernel after SRAT is parsed so that it + * won't consume hotpluggable memory. + */ + reserve_crashkernel(); + + memblock_find_dma_reserve(); + +#ifdef CONFIG_KVM_GUEST + kvmclock_init(); +#endif + + x86_init.paging.pagetable_init(); + + if (boot_cpu_data.cpuid_level >= 0) { + /* A CPU has %cr4 if and only if it has CPUID */ + mmu_cr4_features = read_cr4(); + if (trampoline_cr4_features) + *trampoline_cr4_features = mmu_cr4_features; + } + +#ifdef CONFIG_X86_32 + /* sync back kernel address range */ + clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY, + swapper_pg_dir + KERNEL_PGD_BOUNDARY, + KERNEL_PGD_PTRS); +#endif + + tboot_probe(); + +#ifdef CONFIG_X86_64 + map_vsyscall(); +#endif + + generic_apic_probe(); + + early_quirks(); + + /* + * Read APIC and some other early information from ACPI tables. 
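+ * (This is where the MADT is parsed and the local APICs and IO-APICs it describes are registered.)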
+ */ + acpi_boot_init(); + sfi_init(); + x86_dtb_init(); + + /* + * get boot-time SMP configuration: + */ + if (smp_found_config) + get_smp_config(); + + prefill_possible_map(); + + init_cpu_to_node(); + + init_apic_mappings(); + if (x86_io_apic_ops.init) + x86_io_apic_ops.init(); + + kvm_guest_init(); + + e820_reserve_resources(); + e820_mark_nosave_regions(max_low_pfn); + + x86_init.resources.reserve_resources(); + + e820_setup_gap(); + +#ifdef CONFIG_VT +#if defined(CONFIG_VGA_CONSOLE) + if (!efi_enabled(EFI_BOOT) || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY)) + conswitchp = &vga_con; +#elif defined(CONFIG_DUMMY_CONSOLE) + conswitchp = &dummy_con; +#endif +#endif + x86_init.oem.banner(); + + x86_init.timers.wallclock_init(); + + mcheck_init(); + + arch_init_ideal_nops(); + + register_refined_jiffies(CLOCK_TICK_RATE); + +#ifdef CONFIG_EFI + if (efi_enabled(EFI_BOOT)) + efi_apply_memmap_quirks(); +#endif +} + +#ifdef CONFIG_X86_32 + +static struct resource video_ram_resource = { + .name = "Video RAM area", + .start = 0xa0000, + .end = 0xbffff, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; + +void __init i386_reserve_resources(void) +{ + request_resource(&iomem_resource, &video_ram_resource); + reserve_standard_io_resources(); +} + +#endif /* CONFIG_X86_32 */ + +static struct notifier_block kernel_offset_notifier = { + .notifier_call = dump_kernel_offset +}; + +static int __init register_kernel_offset_dumper(void) +{ + atomic_notifier_chain_register(&panic_notifier_list, + &kernel_offset_notifier); + return 0; +} +__initcall(register_kernel_offset_dumper); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c new file mode 100644 index 00000000000..5cdff035774 --- /dev/null +++ b/arch/x86/kernel/setup_percpu.c @@ -0,0 +1,287 @@ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/bootmem.h> +#include <linux/percpu.h> +#include <linux/kexec.h> +#include <linux/crash_dump.h> +#include <linux/smp.h> +#include <linux/topology.h> +#include <linux/pfn.h> +#include <asm/sections.h> +#include <asm/processor.h> +#include <asm/setup.h> +#include <asm/mpspec.h> +#include <asm/apicdef.h> +#include <asm/highmem.h> +#include <asm/proto.h> +#include <asm/cpumask.h> +#include <asm/cpu.h> +#include <asm/stackprotector.h> + +DEFINE_PER_CPU_READ_MOSTLY(int, cpu_number); +EXPORT_PER_CPU_SYMBOL(cpu_number); + +#ifdef CONFIG_X86_64 +#define BOOT_PERCPU_OFFSET ((unsigned long)__per_cpu_load) +#else +#define BOOT_PERCPU_OFFSET 0 +#endif + +DEFINE_PER_CPU(unsigned long, this_cpu_off) = BOOT_PERCPU_OFFSET; +EXPORT_PER_CPU_SYMBOL(this_cpu_off); + +unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = { + [0 ... NR_CPUS-1] = BOOT_PERCPU_OFFSET, +}; +EXPORT_SYMBOL(__per_cpu_offset); + +/* + * On x86_64 symbols referenced from code should be reachable using + * 32bit relocations. Reserve space for static percpu variables in + * modules so that they are always served from the first chunk which + * is located at the percpu segment base. On x86_32, anything can + * address anywhere. No need to reserve space in the first chunk. + */ +#ifdef CONFIG_X86_64 +#define PERCPU_FIRST_CHUNK_RESERVE PERCPU_MODULE_RESERVE +#else +#define PERCPU_FIRST_CHUNK_RESERVE 0 +#endif + +#ifdef CONFIG_X86_32 +/** + * pcpu_need_numa - determine percpu allocation needs to consider NUMA + * + * If NUMA is not configured or there is only one NUMA node available, + * there is no reason to consider NUMA. 
This function determines + * whether percpu allocation should consider NUMA or not. + * + * RETURNS: + * true if NUMA should be considered; otherwise, false. + */ +static bool __init pcpu_need_numa(void) +{ +#ifdef CONFIG_NEED_MULTIPLE_NODES + pg_data_t *last = NULL; + unsigned int cpu; + + for_each_possible_cpu(cpu) { + int node = early_cpu_to_node(cpu); + + if (node_online(node) && NODE_DATA(node) && + last && last != NODE_DATA(node)) + return true; + + last = NODE_DATA(node); + } +#endif + return false; +} +#endif + +/** + * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu + * @cpu: cpu to allocate for + * @size: size allocation in bytes + * @align: alignment + * + * Allocate @size bytes aligned at @align for cpu @cpu. This wrapper + * does the right thing for NUMA regardless of the current + * configuration. + * + * RETURNS: + * Pointer to the allocated area on success, NULL on failure. + */ +static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, + unsigned long align) +{ + const unsigned long goal = __pa(MAX_DMA_ADDRESS); +#ifdef CONFIG_NEED_MULTIPLE_NODES + int node = early_cpu_to_node(cpu); + void *ptr; + + if (!node_online(node) || !NODE_DATA(node)) { + ptr = __alloc_bootmem_nopanic(size, align, goal); + pr_info("cpu %d has no node %d or node-local memory\n", + cpu, node); + pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n", + cpu, size, __pa(ptr)); + } else { + ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node), + size, align, goal); + pr_debug("per cpu data for cpu%d %lu bytes on node%d at %016lx\n", + cpu, size, node, __pa(ptr)); + } + return ptr; +#else + return __alloc_bootmem_nopanic(size, align, goal); +#endif +} + +/* + * Helpers for first chunk memory allocation + */ +static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align) +{ + return pcpu_alloc_bootmem(cpu, size, align); +} + +static void __init pcpu_fc_free(void *ptr, size_t size) +{ + free_bootmem(__pa(ptr), size); +} + +static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) +{ +#ifdef CONFIG_NEED_MULTIPLE_NODES + if (early_cpu_to_node(from) == early_cpu_to_node(to)) + return LOCAL_DISTANCE; + else + return REMOTE_DISTANCE; +#else + return LOCAL_DISTANCE; +#endif +} + +static void __init pcpup_populate_pte(unsigned long addr) +{ + populate_extra_pte(addr); +} + +static inline void setup_percpu_segment(int cpu) +{ +#ifdef CONFIG_X86_32 + struct desc_struct gdt; + + pack_descriptor(&gdt, per_cpu_offset(cpu), 0xFFFFF, + 0x2 | DESCTYPE_S, 0x8); + gdt.s = 1; + write_gdt_entry(get_cpu_gdt_table(cpu), + GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S); +#endif +} + +void __init setup_per_cpu_areas(void) +{ + unsigned int cpu; + unsigned long delta; + int rc; + + pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n", + NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); + + /* + * Allocate percpu area. Embedding allocator is our favorite; + * however, on NUMA configurations, it can result in very + * sparse unit mapping and vmalloc area isn't spacious enough + * on 32bit. Use page in that case. + */ +#ifdef CONFIG_X86_32 + if (pcpu_chosen_fc == PCPU_FC_AUTO && pcpu_need_numa()) + pcpu_chosen_fc = PCPU_FC_PAGE; +#endif + rc = -EINVAL; + if (pcpu_chosen_fc != PCPU_FC_PAGE) { + const size_t dyn_size = PERCPU_MODULE_RESERVE + + PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE; + size_t atom_size; + + /* + * On 64bit, use PMD_SIZE for atom_size so that embedded + * percpu areas are aligned to PMD. 
This, in the future, + * can also allow using PMD mappings in vmalloc area. Use + * PAGE_SIZE on 32bit as vmalloc space is highly contended + * and large vmalloc area allocs can easily fail. + */ +#ifdef CONFIG_X86_64 + atom_size = PMD_SIZE; +#else + atom_size = PAGE_SIZE; +#endif + rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, + dyn_size, atom_size, + pcpu_cpu_distance, + pcpu_fc_alloc, pcpu_fc_free); + if (rc < 0) + pr_warning("%s allocator failed (%d), falling back to page size\n", + pcpu_fc_names[pcpu_chosen_fc], rc); + } + if (rc < 0) + rc = pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, + pcpu_fc_alloc, pcpu_fc_free, + pcpup_populate_pte); + if (rc < 0) + panic("cannot initialize percpu area (err=%d)", rc); + + /* alrighty, percpu areas up and running */ + delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; + for_each_possible_cpu(cpu) { + per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu]; + per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); + per_cpu(cpu_number, cpu) = cpu; + setup_percpu_segment(cpu); + setup_stack_canary_segment(cpu); + /* + * Copy data used in early init routines from the + * initial arrays to the per cpu data areas. These + * arrays then become expendable and the *_early_ptr's + * are zeroed indicating that the static arrays are + * gone. + */ +#ifdef CONFIG_X86_LOCAL_APIC + per_cpu(x86_cpu_to_apicid, cpu) = + early_per_cpu_map(x86_cpu_to_apicid, cpu); + per_cpu(x86_bios_cpu_apicid, cpu) = + early_per_cpu_map(x86_bios_cpu_apicid, cpu); +#endif +#ifdef CONFIG_X86_32 + per_cpu(x86_cpu_to_logical_apicid, cpu) = + early_per_cpu_map(x86_cpu_to_logical_apicid, cpu); +#endif +#ifdef CONFIG_X86_64 + per_cpu(irq_stack_ptr, cpu) = + per_cpu(irq_stack_union.irq_stack, cpu) + + IRQ_STACK_SIZE - 64; +#endif +#ifdef CONFIG_NUMA + per_cpu(x86_cpu_to_node_map, cpu) = + early_per_cpu_map(x86_cpu_to_node_map, cpu); + /* + * Ensure that the boot cpu numa_node is correct when the boot + * cpu is on a node that doesn't have memory installed. + * Also cpu_up() will call cpu_to_node() for APs when + * MEMORY_HOTPLUG is defined, before per_cpu(numa_node) is set + * up later with c_init aka intel_init/amd_init. + * So set them all (boot cpu and all APs). + */ + set_cpu_numa_node(cpu, early_cpu_to_node(cpu)); +#endif + /* + * Up to this point, the boot CPU has been using .init.data + * area. Reload any changed state for the boot CPU. 
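+ * switch_to_new_gdt() reloads the GDT and the per-cpu segment base so that neither still points into .init.data once that section is freed.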
+ */ + if (!cpu) + switch_to_new_gdt(cpu); + } + + /* indicate the early static arrays will soon be gone */ +#ifdef CONFIG_X86_LOCAL_APIC + early_per_cpu_ptr(x86_cpu_to_apicid) = NULL; + early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL; +#endif +#ifdef CONFIG_X86_32 + early_per_cpu_ptr(x86_cpu_to_logical_apicid) = NULL; +#endif +#ifdef CONFIG_NUMA + early_per_cpu_ptr(x86_cpu_to_node_map) = NULL; +#endif + + /* Setup node to cpumask map */ + setup_node_to_cpumask_map(); + + /* Setup cpu initialized, callin, callout masks */ + setup_cpu_local_masks(); +} diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c new file mode 100644 index 00000000000..2851d63c120 --- /dev/null +++ b/arch/x86/kernel/signal.c @@ -0,0 +1,807 @@ +/* + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs + * + * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson + * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes + * 2000-2002 x86-64 support by Andi Kleen + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/smp.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/wait.h> +#include <linux/tracehook.h> +#include <linux/unistd.h> +#include <linux/stddef.h> +#include <linux/personality.h> +#include <linux/uaccess.h> +#include <linux/user-return-notifier.h> +#include <linux/uprobes.h> +#include <linux/context_tracking.h> + +#include <asm/processor.h> +#include <asm/ucontext.h> +#include <asm/i387.h> +#include <asm/fpu-internal.h> +#include <asm/vdso.h> +#include <asm/mce.h> +#include <asm/sighandling.h> + +#ifdef CONFIG_X86_64 +#include <asm/proto.h> +#include <asm/ia32_unistd.h> +#include <asm/sys_ia32.h> +#endif /* CONFIG_X86_64 */ + +#include <asm/syscall.h> +#include <asm/syscalls.h> + +#include <asm/sigframe.h> + +#define COPY(x) do { \ + get_user_ex(regs->x, &sc->x); \ +} while (0) + +#define GET_SEG(seg) ({ \ + unsigned short tmp; \ + get_user_ex(tmp, &sc->seg); \ + tmp; \ +}) + +#define COPY_SEG(seg) do { \ + regs->seg = GET_SEG(seg); \ +} while (0) + +#define COPY_SEG_CPL3(seg) do { \ + regs->seg = GET_SEG(seg) | 3; \ +} while (0) + +int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, + unsigned long *pax) +{ + void __user *buf; + unsigned int tmpflags; + unsigned int err = 0; + + /* Always make any pending restarted system calls return -EINTR */ + current_thread_info()->restart_block.fn = do_no_restart_syscall; + + get_user_try { + +#ifdef CONFIG_X86_32 + set_user_gs(regs, GET_SEG(gs)); + COPY_SEG(fs); + COPY_SEG(es); + COPY_SEG(ds); +#endif /* CONFIG_X86_32 */ + + COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); + COPY(dx); COPY(cx); COPY(ip); + +#ifdef CONFIG_X86_64 + COPY(r8); + COPY(r9); + COPY(r10); + COPY(r11); + COPY(r12); + COPY(r13); + COPY(r14); + COPY(r15); +#endif /* CONFIG_X86_64 */ + +#ifdef CONFIG_X86_32 + COPY_SEG_CPL3(cs); + COPY_SEG_CPL3(ss); +#else /* !CONFIG_X86_32 */ + /* Kernel saves and restores only the CS segment register on signals, + * which is the bare minimum needed to allow mixed 32/64-bit code. + * App's signal handler can save/restore other segments if needed. 
*/ + COPY_SEG_CPL3(cs); +#endif /* CONFIG_X86_32 */ + + get_user_ex(tmpflags, &sc->flags); + regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); + regs->orig_ax = -1; /* disable syscall checks */ + + get_user_ex(buf, &sc->fpstate); + + get_user_ex(*pax, &sc->ax); + } get_user_catch(err); + + err |= restore_xstate_sig(buf, config_enabled(CONFIG_X86_32)); + + return err; +} + +int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, + struct pt_regs *regs, unsigned long mask) +{ + int err = 0; + + put_user_try { + +#ifdef CONFIG_X86_32 + put_user_ex(get_user_gs(regs), (unsigned int __user *)&sc->gs); + put_user_ex(regs->fs, (unsigned int __user *)&sc->fs); + put_user_ex(regs->es, (unsigned int __user *)&sc->es); + put_user_ex(regs->ds, (unsigned int __user *)&sc->ds); +#endif /* CONFIG_X86_32 */ + + put_user_ex(regs->di, &sc->di); + put_user_ex(regs->si, &sc->si); + put_user_ex(regs->bp, &sc->bp); + put_user_ex(regs->sp, &sc->sp); + put_user_ex(regs->bx, &sc->bx); + put_user_ex(regs->dx, &sc->dx); + put_user_ex(regs->cx, &sc->cx); + put_user_ex(regs->ax, &sc->ax); +#ifdef CONFIG_X86_64 + put_user_ex(regs->r8, &sc->r8); + put_user_ex(regs->r9, &sc->r9); + put_user_ex(regs->r10, &sc->r10); + put_user_ex(regs->r11, &sc->r11); + put_user_ex(regs->r12, &sc->r12); + put_user_ex(regs->r13, &sc->r13); + put_user_ex(regs->r14, &sc->r14); + put_user_ex(regs->r15, &sc->r15); +#endif /* CONFIG_X86_64 */ + + put_user_ex(current->thread.trap_nr, &sc->trapno); + put_user_ex(current->thread.error_code, &sc->err); + put_user_ex(regs->ip, &sc->ip); +#ifdef CONFIG_X86_32 + put_user_ex(regs->cs, (unsigned int __user *)&sc->cs); + put_user_ex(regs->flags, &sc->flags); + put_user_ex(regs->sp, &sc->sp_at_signal); + put_user_ex(regs->ss, (unsigned int __user *)&sc->ss); +#else /* !CONFIG_X86_32 */ + put_user_ex(regs->flags, &sc->flags); + put_user_ex(regs->cs, &sc->cs); + put_user_ex(0, &sc->gs); + put_user_ex(0, &sc->fs); +#endif /* CONFIG_X86_32 */ + + put_user_ex(fpstate, &sc->fpstate); + + /* non-iBCS2 extensions.. */ + put_user_ex(mask, &sc->oldmask); + put_user_ex(current->thread.cr2, &sc->cr2); + } put_user_catch(err); + + return err; +} + +/* + * Set up a signal frame. + */ + +/* + * Determine which stack to use.. + */ +static unsigned long align_sigframe(unsigned long sp) +{ +#ifdef CONFIG_X86_32 + /* + * Align the stack pointer according to the i386 ABI, + * i.e. so that on function entry ((sp + 4) & 15) == 0. + */ + sp = ((sp + 4) & -16ul) - 4; +#else /* !CONFIG_X86_32 */ + sp = round_down(sp, 16) - 8; +#endif + return sp; +} + +static inline void __user * +get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, + void __user **fpstate) +{ + /* Default to using normal stack */ + unsigned long math_size = 0; + unsigned long sp = regs->sp; + unsigned long buf_fx = 0; + int onsigstack = on_sig_stack(sp); + + /* redzone */ + if (config_enabled(CONFIG_X86_64)) + sp -= 128; + + if (!onsigstack) { + /* This is the X/Open sanctioned signal stack switching. */ + if (ka->sa.sa_flags & SA_ONSTACK) { + if (current->sas_ss_size) + sp = current->sas_ss_sp + current->sas_ss_size; + } else if (config_enabled(CONFIG_X86_32) && + (regs->ss & 0xffff) != __USER_DS && + !(ka->sa.sa_flags & SA_RESTORER) && + ka->sa.sa_restorer) { + /* This is the legacy signal stack switching. 
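+ here sa_restorer is (ab)used as the alternate stack pointer rather than as a return trampoline, which is why SA_RESTORER must not be set.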
*/ + sp = (unsigned long) ka->sa.sa_restorer; + } + } + + if (used_math()) { + sp = alloc_mathframe(sp, config_enabled(CONFIG_X86_32), + &buf_fx, &math_size); + *fpstate = (void __user *)sp; + } + + sp = align_sigframe(sp - frame_size); + + /* + * If we are on the alternate signal stack and would overflow it, don't. + * Return an always-bogus address instead so we will die with SIGSEGV. + */ + if (onsigstack && !likely(on_sig_stack(sp))) + return (void __user *)-1L; + + /* save i387 and extended state */ + if (used_math() && + save_xstate_sig(*fpstate, (void __user *)buf_fx, math_size) < 0) + return (void __user *)-1L; + + return (void __user *)sp; +} + +#ifdef CONFIG_X86_32 +static const struct { + u16 poplmovl; + u32 val; + u16 int80; +} __attribute__((packed)) retcode = { + 0xb858, /* popl %eax; movl $..., %eax */ + __NR_sigreturn, + 0x80cd, /* int $0x80 */ +}; + +static const struct { + u8 movl; + u32 val; + u16 int80; + u8 pad; +} __attribute__((packed)) rt_retcode = { + 0xb8, /* movl $..., %eax */ + __NR_rt_sigreturn, + 0x80cd, /* int $0x80 */ + 0 +}; + +static int +__setup_frame(int sig, struct ksignal *ksig, sigset_t *set, + struct pt_regs *regs) +{ + struct sigframe __user *frame; + void __user *restorer; + int err = 0; + void __user *fpstate = NULL; + + frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fpstate); + + if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) + return -EFAULT; + + if (__put_user(sig, &frame->sig)) + return -EFAULT; + + if (setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0])) + return -EFAULT; + + if (_NSIG_WORDS > 1) { + if (__copy_to_user(&frame->extramask, &set->sig[1], + sizeof(frame->extramask))) + return -EFAULT; + } + + if (current->mm->context.vdso) + restorer = current->mm->context.vdso + + selected_vdso32->sym___kernel_sigreturn; + else + restorer = &frame->retcode; + if (ksig->ka.sa.sa_flags & SA_RESTORER) + restorer = ksig->ka.sa.sa_restorer; + + /* Set up to return from userspace. */ + err |= __put_user(restorer, &frame->pretcode); + + /* + * This is popl %eax ; movl $__NR_sigreturn, %eax ; int $0x80 + * + * WE DO NOT USE IT ANY MORE! It's only left here for historical + * reasons and because gdb uses it as a signature to notice + * signal handler stack frames. + */ + err |= __put_user(*((u64 *)&retcode), (u64 *)frame->retcode); + + if (err) + return -EFAULT; + + /* Set up registers for signal handler */ + regs->sp = (unsigned long)frame; + regs->ip = (unsigned long)ksig->ka.sa.sa_handler; + regs->ax = (unsigned long)sig; + regs->dx = 0; + regs->cx = 0; + + regs->ds = __USER_DS; + regs->es = __USER_DS; + regs->ss = __USER_DS; + regs->cs = __USER_CS; + + return 0; +} + +static int __setup_rt_frame(int sig, struct ksignal *ksig, + sigset_t *set, struct pt_regs *regs) +{ + struct rt_sigframe __user *frame; + void __user *restorer; + int err = 0; + void __user *fpstate = NULL; + + frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fpstate); + + if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) + return -EFAULT; + + put_user_try { + put_user_ex(sig, &frame->sig); + put_user_ex(&frame->info, &frame->pinfo); + put_user_ex(&frame->uc, &frame->puc); + + /* Create the ucontext. */ + if (cpu_has_xsave) + put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags); + else + put_user_ex(0, &frame->uc.uc_flags); + put_user_ex(0, &frame->uc.uc_link); + save_altstack_ex(&frame->uc.uc_stack, regs->sp); + + /* Set up to return from userspace. 
*/ + restorer = current->mm->context.vdso + + selected_vdso32->sym___kernel_rt_sigreturn; + if (ksig->ka.sa.sa_flags & SA_RESTORER) + restorer = ksig->ka.sa.sa_restorer; + put_user_ex(restorer, &frame->pretcode); + + /* + * This is movl $__NR_rt_sigreturn, %ax ; int $0x80 + * + * WE DO NOT USE IT ANY MORE! It's only left here for historical + * reasons and because gdb uses it as a signature to notice + * signal handler stack frames. + */ + put_user_ex(*((u64 *)&rt_retcode), (u64 *)frame->retcode); + } put_user_catch(err); + + err |= copy_siginfo_to_user(&frame->info, &ksig->info); + err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate, + regs, set->sig[0]); + err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); + + if (err) + return -EFAULT; + + /* Set up registers for signal handler */ + regs->sp = (unsigned long)frame; + regs->ip = (unsigned long)ksig->ka.sa.sa_handler; + regs->ax = (unsigned long)sig; + regs->dx = (unsigned long)&frame->info; + regs->cx = (unsigned long)&frame->uc; + + regs->ds = __USER_DS; + regs->es = __USER_DS; + regs->ss = __USER_DS; + regs->cs = __USER_CS; + + return 0; +} +#else /* !CONFIG_X86_32 */ +static int __setup_rt_frame(int sig, struct ksignal *ksig, + sigset_t *set, struct pt_regs *regs) +{ + struct rt_sigframe __user *frame; + void __user *fp = NULL; + int err = 0; + + frame = get_sigframe(&ksig->ka, regs, sizeof(struct rt_sigframe), &fp); + + if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) + return -EFAULT; + + if (ksig->ka.sa.sa_flags & SA_SIGINFO) { + if (copy_siginfo_to_user(&frame->info, &ksig->info)) + return -EFAULT; + } + + put_user_try { + /* Create the ucontext. */ + if (cpu_has_xsave) + put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags); + else + put_user_ex(0, &frame->uc.uc_flags); + put_user_ex(0, &frame->uc.uc_link); + save_altstack_ex(&frame->uc.uc_stack, regs->sp); + + /* Set up to return from userspace. If provided, use a stub + already in userspace. */ + /* x86-64 should always use SA_RESTORER. */ + if (ksig->ka.sa.sa_flags & SA_RESTORER) { + put_user_ex(ksig->ka.sa.sa_restorer, &frame->pretcode); + } else { + /* could use a vstub here */ + err |= -EFAULT; + } + } put_user_catch(err); + + err |= setup_sigcontext(&frame->uc.uc_mcontext, fp, regs, set->sig[0]); + err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); + + if (err) + return -EFAULT; + + /* Set up registers for signal handler */ + regs->di = sig; + /* In case the signal handler was declared without prototypes */ + regs->ax = 0; + + /* This also works for non SA_SIGINFO handlers because they expect the + next argument after the signal number on the stack. */ + regs->si = (unsigned long)&frame->info; + regs->dx = (unsigned long)&frame->uc; + regs->ip = (unsigned long) ksig->ka.sa.sa_handler; + + regs->sp = (unsigned long)frame; + + /* Set up the CS register to run signal handlers in 64-bit mode, + even if the handler happens to be interrupting 32-bit code. 
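+ The handler itself is 64-bit code, so it must run with a 64-bit CS regardless of the mode of the interrupted context.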
*/ + regs->cs = __USER_CS; + + return 0; +} +#endif /* CONFIG_X86_32 */ + +static int x32_setup_rt_frame(struct ksignal *ksig, + compat_sigset_t *set, + struct pt_regs *regs) +{ +#ifdef CONFIG_X86_X32_ABI + struct rt_sigframe_x32 __user *frame; + void __user *restorer; + int err = 0; + void __user *fpstate = NULL; + + frame = get_sigframe(&ksig->ka, regs, sizeof(*frame), &fpstate); + + if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) + return -EFAULT; + + if (ksig->ka.sa.sa_flags & SA_SIGINFO) { + if (copy_siginfo_to_user32(&frame->info, &ksig->info)) + return -EFAULT; + } + + put_user_try { + /* Create the ucontext. */ + if (cpu_has_xsave) + put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags); + else + put_user_ex(0, &frame->uc.uc_flags); + put_user_ex(0, &frame->uc.uc_link); + compat_save_altstack_ex(&frame->uc.uc_stack, regs->sp); + put_user_ex(0, &frame->uc.uc__pad0); + + if (ksig->ka.sa.sa_flags & SA_RESTORER) { + restorer = ksig->ka.sa.sa_restorer; + } else { + /* could use a vstub here */ + restorer = NULL; + err |= -EFAULT; + } + put_user_ex(restorer, &frame->pretcode); + } put_user_catch(err); + + err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate, + regs, set->sig[0]); + err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); + + if (err) + return -EFAULT; + + /* Set up registers for signal handler */ + regs->sp = (unsigned long) frame; + regs->ip = (unsigned long) ksig->ka.sa.sa_handler; + + /* We use the x32 calling convention here... */ + regs->di = ksig->sig; + regs->si = (unsigned long) &frame->info; + regs->dx = (unsigned long) &frame->uc; + + loadsegment(ds, __USER_DS); + loadsegment(es, __USER_DS); + + regs->cs = __USER_CS; + regs->ss = __USER_DS; +#endif /* CONFIG_X86_X32_ABI */ + + return 0; +} + +/* + * Do a signal return; undo the signal stack. 
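+ * i.e. restore the signal mask and the register state that setup_rt_frame() stashed in the user-space frame.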
+ */ +#ifdef CONFIG_X86_32 +asmlinkage unsigned long sys_sigreturn(void) +{ + struct pt_regs *regs = current_pt_regs(); + struct sigframe __user *frame; + unsigned long ax; + sigset_t set; + + frame = (struct sigframe __user *)(regs->sp - 8); + + if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) + goto badframe; + if (__get_user(set.sig[0], &frame->sc.oldmask) || (_NSIG_WORDS > 1 + && __copy_from_user(&set.sig[1], &frame->extramask, + sizeof(frame->extramask)))) + goto badframe; + + set_current_blocked(&set); + + if (restore_sigcontext(regs, &frame->sc, &ax)) + goto badframe; + return ax; + +badframe: + signal_fault(regs, frame, "sigreturn"); + + return 0; +} +#endif /* CONFIG_X86_32 */ + +asmlinkage long sys_rt_sigreturn(void) +{ + struct pt_regs *regs = current_pt_regs(); + struct rt_sigframe __user *frame; + unsigned long ax; + sigset_t set; + + frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); + if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) + goto badframe; + if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) + goto badframe; + + set_current_blocked(&set); + + if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) + goto badframe; + + if (restore_altstack(&frame->uc.uc_stack)) + goto badframe; + + return ax; + +badframe: + signal_fault(regs, frame, "rt_sigreturn"); + return 0; +} + +/* + * OK, we're invoking a handler: + */ +static int signr_convert(int sig) +{ +#ifdef CONFIG_X86_32 + struct thread_info *info = current_thread_info(); + + if (info->exec_domain && info->exec_domain->signal_invmap && sig < 32) + return info->exec_domain->signal_invmap[sig]; +#endif /* CONFIG_X86_32 */ + return sig; +} + +static int +setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) +{ + int usig = signr_convert(ksig->sig); + sigset_t *set = sigmask_to_save(); + compat_sigset_t *cset = (compat_sigset_t *) set; + + /* Set up the stack frame */ + if (is_ia32_frame()) { + if (ksig->ka.sa.sa_flags & SA_SIGINFO) + return ia32_setup_rt_frame(usig, ksig, cset, regs); + else + return ia32_setup_frame(usig, ksig, cset, regs); + } else if (is_x32_frame()) { + return x32_setup_rt_frame(ksig, cset, regs); + } else { + return __setup_rt_frame(ksig->sig, ksig, set, regs); + } +} + +static void +handle_signal(struct ksignal *ksig, struct pt_regs *regs) +{ + bool failed; + /* Are we from a system call? */ + if (syscall_get_nr(current, regs) >= 0) { + /* If so, check system call restarting.. */ + switch (syscall_get_error(current, regs)) { + case -ERESTART_RESTARTBLOCK: + case -ERESTARTNOHAND: + regs->ax = -EINTR; + break; + + case -ERESTARTSYS: + if (!(ksig->ka.sa.sa_flags & SA_RESTART)) { + regs->ax = -EINTR; + break; + } + /* fallthrough */ + case -ERESTARTNOINTR: + regs->ax = regs->orig_ax; + regs->ip -= 2; + break; + } + } + + /* + * If TF is set due to a debugger (TIF_FORCED_TF), clear the TF + * flag so that register information in the sigcontext is correct. + */ + if (unlikely(regs->flags & X86_EFLAGS_TF) && + likely(test_and_clear_thread_flag(TIF_FORCED_TF))) + regs->flags &= ~X86_EFLAGS_TF; + + failed = (setup_rt_frame(ksig, regs) < 0); + if (!failed) { + /* + * Clear the direction flag as per the ABI for function entry. + * + * Clear RF when entering the signal handler, because + * it might disable possible debug exception from the + * signal handler. + * + * Clear TF when entering the signal handler, but + * notify any tracer that was single-stepping it. + * The tracer may want to single-step inside the + * handler too. 
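+ * (signal_setup_done() is told below whether single-stepping was active, so the tracer still gets its notification.)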
*/ + regs->flags &= ~(X86_EFLAGS_DF|X86_EFLAGS_RF|X86_EFLAGS_TF); + } + signal_setup_done(failed, ksig, test_thread_flag(TIF_SINGLESTEP)); +} + +#ifdef CONFIG_X86_32 +#define NR_restart_syscall __NR_restart_syscall +#else /* !CONFIG_X86_32 */ +#define NR_restart_syscall \ + test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : __NR_restart_syscall +#endif /* CONFIG_X86_32 */ + +/* + * Note that 'init' is a special process: it doesn't get signals it doesn't + * want to handle. Thus you cannot kill init even with a SIGKILL even by + * mistake. + */ +static void do_signal(struct pt_regs *regs) +{ + struct ksignal ksig; + + if (get_signal(&ksig)) { + /* Whee! Actually deliver the signal. */ + handle_signal(&ksig, regs); + return; + } + + /* Did we come from a system call? */ + if (syscall_get_nr(current, regs) >= 0) { + /* Restart the system call - no handlers present */ + switch (syscall_get_error(current, regs)) { + case -ERESTARTNOHAND: + case -ERESTARTSYS: + case -ERESTARTNOINTR: + regs->ax = regs->orig_ax; + regs->ip -= 2; + break; + + case -ERESTART_RESTARTBLOCK: + regs->ax = NR_restart_syscall; + regs->ip -= 2; + break; + } + } + + /* + * If there's no signal to deliver, we just put the saved sigmask + * back. + */ + restore_saved_sigmask(); +} + +/* + * notification of userspace execution resumption + * - triggered by the TIF_WORK_MASK flags + */ +__visible void +do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) +{ + user_exit(); + +#ifdef CONFIG_X86_MCE + /* notify userspace of pending MCEs */ + if (thread_info_flags & _TIF_MCE_NOTIFY) + mce_notify_process(); +#endif /* CONFIG_X86_MCE */ + + if (thread_info_flags & _TIF_UPROBE) + uprobe_notify_resume(regs); + + /* deal with pending signal delivery */ + if (thread_info_flags & _TIF_SIGPENDING) + do_signal(regs); + + if (thread_info_flags & _TIF_NOTIFY_RESUME) { + clear_thread_flag(TIF_NOTIFY_RESUME); + tracehook_notify_resume(regs); + } + if (thread_info_flags & _TIF_USER_RETURN_NOTIFY) + fire_user_return_notifiers(); + + user_enter(); +} + +void signal_fault(struct pt_regs *regs, void __user *frame, char *where) +{ + struct task_struct *me = current; + + if (show_unhandled_signals && printk_ratelimit()) { + printk("%s" + "%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx", + task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG, + me->comm, me->pid, where, frame, + regs->ip, regs->sp, regs->orig_ax); + print_vma_addr(" in ", regs->ip); + pr_cont("\n"); + } + + force_sig(SIGSEGV, me); +} + +#ifdef CONFIG_X86_X32_ABI +asmlinkage long sys32_x32_rt_sigreturn(void) +{ + struct pt_regs *regs = current_pt_regs(); + struct rt_sigframe_x32 __user *frame; + sigset_t set; + unsigned long ax; + + frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8); + + if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) + goto badframe; + if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) + goto badframe; + + set_current_blocked(&set); + + if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) + goto badframe; + + if (compat_restore_altstack(&frame->uc.uc_stack)) + goto badframe; + + return ax; + +badframe: + signal_fault(regs, frame, "x32 rt_sigreturn"); + return 0; +} +#endif diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c new file mode 100644 index 00000000000..be8e1bde07a --- /dev/null +++ b/arch/x86/kernel/smp.c @@ -0,0 +1,360 @@ +/* + * Intel SMP support routines.
+ * + * (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk> + * (c) 1998-99, 2000, 2009 Ingo Molnar <mingo@redhat.com> + * (c) 2002,2003 Andi Kleen, SuSE Labs. + * + * i386 and x86_64 integration by Glauber Costa <gcosta@redhat.com> + * + * This code is released under the GNU General Public License version 2 or + * later. + */ + +#include <linux/init.h> + +#include <linux/mm.h> +#include <linux/delay.h> +#include <linux/spinlock.h> +#include <linux/export.h> +#include <linux/kernel_stat.h> +#include <linux/mc146818rtc.h> +#include <linux/cache.h> +#include <linux/interrupt.h> +#include <linux/cpu.h> +#include <linux/gfp.h> + +#include <asm/mtrr.h> +#include <asm/tlbflush.h> +#include <asm/mmu_context.h> +#include <asm/proto.h> +#include <asm/apic.h> +#include <asm/nmi.h> +#include <asm/trace/irq_vectors.h> +/* + * Some notes on x86 processor bugs affecting SMP operation: + * + * Pentium, Pentium Pro, II, III (and all CPUs) have bugs. + * The Linux implications for SMP are handled as follows: + * + * Pentium III / [Xeon] + * None of the E1AP-E3AP errata are visible to the user. + * + * E1AP. see PII A1AP + * E2AP. see PII A2AP + * E3AP. see PII A3AP + * + * Pentium II / [Xeon] + * None of the A1AP-A3AP errata are visible to the user. + * + * A1AP. see PPro 1AP + * A2AP. see PPro 2AP + * A3AP. see PPro 7AP + * + * Pentium Pro + * None of 1AP-9AP errata are visible to the normal user, + * except occasional delivery of 'spurious interrupt' as trap #15. + * This is very rare and a non-problem. + * + * 1AP. Linux maps APIC as non-cacheable + * 2AP. worked around in hardware + * 3AP. fixed in C0 and above steppings microcode update. + * Linux does not use excessive STARTUP_IPIs. + * 4AP. worked around in hardware + * 5AP. symmetric IO mode (normal Linux operation) not affected. + * 'noapic' mode has vector 0xf filled out properly. + * 6AP. 'noapic' mode might be affected - fixed in later steppings + * 7AP. We do not assume writes to the LVT deassering IRQs + * 8AP. We do not enable low power mode (deep sleep) during MP bootup + * 9AP. We do not use mixed mode + * + * Pentium + * There is a marginal case where REP MOVS on 100MHz SMP + * machines with B stepping processors can fail. XXX should provide + * an L1cache=Writethrough or L1cache=off option. + * + * B stepping CPUs may hang. There are hardware work arounds + * for this. We warn about it in case your board doesn't have the work + * arounds. Basically that's so I can tell anyone with a B stepping + * CPU and SMP problems "tough". + * + * Specific items [From Pentium Processor Specification Update] + * + * 1AP. Linux doesn't use remote read + * 2AP. Linux doesn't trust APIC errors + * 3AP. We work around this + * 4AP. Linux never generated 3 interrupts of the same priority + * to cause a lost local interrupt. + * 5AP. Remote read is never used + * 6AP. not affected - worked around in hardware + * 7AP. not affected - worked around in hardware + * 8AP. worked around in hardware - we get explicit CS errors if not + * 9AP. only 'noapic' mode affected. Might generate spurious + * interrupts, we log only the first one and count the + * rest silently. + * 10AP. not affected - worked around in hardware + * 11AP. Linux reads the APIC between writes to avoid this, as per + * the documentation. Make sure you preserve this as it affects + * the C stepping chips too. + * 12AP. not affected - worked around in hardware + * 13AP. not affected - worked around in hardware + * 14AP. we always deassert INIT during bootup + * 15AP. 
not affected - worked around in hardware + * 16AP. not affected - worked around in hardware + * 17AP. not affected - worked around in hardware + * 18AP. not affected - worked around in hardware + * 19AP. not affected - worked around in BIOS + * + * If this sounds worrying believe me these bugs are either ___RARE___, + * or are signal timing bugs worked around in hardware and there's + * about nothing of note with C stepping upwards. + */ + +static atomic_t stopping_cpu = ATOMIC_INIT(-1); +static bool smp_no_nmi_ipi = false; + +/* + * this function sends a 'reschedule' IPI to another CPU. + * it goes straight through and wastes no time serializing + * anything. Worst case is that we lose a reschedule ... + */ +static void native_smp_send_reschedule(int cpu) +{ + if (unlikely(cpu_is_offline(cpu))) { + WARN_ON(1); + return; + } + apic->send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR); +} + +void native_send_call_func_single_ipi(int cpu) +{ + apic->send_IPI_mask(cpumask_of(cpu), CALL_FUNCTION_SINGLE_VECTOR); +} + +void native_send_call_func_ipi(const struct cpumask *mask) +{ + cpumask_var_t allbutself; + + if (!alloc_cpumask_var(&allbutself, GFP_ATOMIC)) { + apic->send_IPI_mask(mask, CALL_FUNCTION_VECTOR); + return; + } + + cpumask_copy(allbutself, cpu_online_mask); + cpumask_clear_cpu(smp_processor_id(), allbutself); + + if (cpumask_equal(mask, allbutself) && + cpumask_equal(cpu_online_mask, cpu_callout_mask)) + apic->send_IPI_allbutself(CALL_FUNCTION_VECTOR); + else + apic->send_IPI_mask(mask, CALL_FUNCTION_VECTOR); + + free_cpumask_var(allbutself); +} + +static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs) +{ + /* We are registered on stopping cpu too, avoid spurious NMI */ + if (raw_smp_processor_id() == atomic_read(&stopping_cpu)) + return NMI_HANDLED; + + stop_this_cpu(NULL); + + return NMI_HANDLED; +} + +/* + * this function calls the 'stop' function on all other CPUs in the system. + */ + +asmlinkage __visible void smp_reboot_interrupt(void) +{ + ack_APIC_irq(); + irq_enter(); + stop_this_cpu(NULL); + irq_exit(); +} + +static void native_stop_other_cpus(int wait) +{ + unsigned long flags; + unsigned long timeout; + + if (reboot_force) + return; + + /* + * Use an own vector here because smp_call_function + * does lots of things not suitable in a panic situation. + */ + + /* + * We start by using the REBOOT_VECTOR irq. + * The irq is treated as a sync point to allow critical + * regions of code on other cpus to release their spin locks + * and re-enable irqs. Jumping straight to an NMI might + * accidentally cause deadlocks with further shutdown/panic + * code. By syncing, we give the cpus up to one second to + * finish their work before we force them off with the NMI. + */ + if (num_online_cpus() > 1) { + /* did someone beat us here? */ + if (atomic_cmpxchg(&stopping_cpu, -1, safe_smp_processor_id()) != -1) + return; + + /* sync above data before sending IRQ */ + wmb(); + + apic->send_IPI_allbutself(REBOOT_VECTOR); + + /* + * Don't wait longer than a second if the caller + * didn't ask us to wait. 
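+ * + * The condition below leans on short-circuit evaluation: while 'wait' is + * set, 'wait || timeout--' never decrements the counter, so we spin until + * the other cpus really are gone; with 'wait' clear the budget is + * + *	timeout = USEC_PER_SEC, i.e. 1000000 x udelay(1) ~= 1 second + * + * (the NMI fallback below uses the same idiom with a 10 ms budget).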
+ */ + timeout = USEC_PER_SEC; + while (num_online_cpus() > 1 && (wait || timeout--)) + udelay(1); + } + + /* if the REBOOT_VECTOR didn't work, try with the NMI */ + if ((num_online_cpus() > 1) && (!smp_no_nmi_ipi)) { + if (register_nmi_handler(NMI_LOCAL, smp_stop_nmi_callback, + NMI_FLAG_FIRST, "smp_stop")) + /* Note: we ignore failures here */ + /* Hope the REBOOT_IRQ is good enough */ + goto finish; + + /* sync above data before sending IRQ */ + wmb(); + + pr_emerg("Shutting down cpus with NMI\n"); + + apic->send_IPI_allbutself(NMI_VECTOR); + + /* + * Don't wait longer than a 10 ms if the caller + * didn't ask us to wait. + */ + timeout = USEC_PER_MSEC * 10; + while (num_online_cpus() > 1 && (wait || timeout--)) + udelay(1); + } + +finish: + local_irq_save(flags); + disable_local_APIC(); + local_irq_restore(flags); +} + +/* + * Reschedule call back. + */ +static inline void __smp_reschedule_interrupt(void) +{ + inc_irq_stat(irq_resched_count); + scheduler_ipi(); +} + +__visible void smp_reschedule_interrupt(struct pt_regs *regs) +{ + ack_APIC_irq(); + __smp_reschedule_interrupt(); + /* + * KVM uses this interrupt to force a cpu out of guest mode + */ +} + +static inline void smp_entering_irq(void) +{ + ack_APIC_irq(); + irq_enter(); +} + +__visible void smp_trace_reschedule_interrupt(struct pt_regs *regs) +{ + /* + * Need to call irq_enter() before calling the trace point. + * __smp_reschedule_interrupt() calls irq_enter/exit() too (in + * scheduler_ipi(). This is OK, since those functions are allowed + * to nest. + */ + smp_entering_irq(); + trace_reschedule_entry(RESCHEDULE_VECTOR); + __smp_reschedule_interrupt(); + trace_reschedule_exit(RESCHEDULE_VECTOR); + exiting_irq(); + /* + * KVM uses this interrupt to force a cpu out of guest mode + */ +} + +static inline void __smp_call_function_interrupt(void) +{ + generic_smp_call_function_interrupt(); + inc_irq_stat(irq_call_count); +} + +__visible void smp_call_function_interrupt(struct pt_regs *regs) +{ + smp_entering_irq(); + __smp_call_function_interrupt(); + exiting_irq(); +} + +__visible void smp_trace_call_function_interrupt(struct pt_regs *regs) +{ + smp_entering_irq(); + trace_call_function_entry(CALL_FUNCTION_VECTOR); + __smp_call_function_interrupt(); + trace_call_function_exit(CALL_FUNCTION_VECTOR); + exiting_irq(); +} + +static inline void __smp_call_function_single_interrupt(void) +{ + generic_smp_call_function_single_interrupt(); + inc_irq_stat(irq_call_count); +} + +__visible void smp_call_function_single_interrupt(struct pt_regs *regs) +{ + smp_entering_irq(); + __smp_call_function_single_interrupt(); + exiting_irq(); +} + +__visible void smp_trace_call_function_single_interrupt(struct pt_regs *regs) +{ + smp_entering_irq(); + trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR); + __smp_call_function_single_interrupt(); + trace_call_function_single_exit(CALL_FUNCTION_SINGLE_VECTOR); + exiting_irq(); +} + +static int __init nonmi_ipi_setup(char *str) +{ + smp_no_nmi_ipi = true; + return 1; +} + +__setup("nonmi_ipi", nonmi_ipi_setup); + +struct smp_ops smp_ops = { + .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu, + .smp_prepare_cpus = native_smp_prepare_cpus, + .smp_cpus_done = native_smp_cpus_done, + + .stop_other_cpus = native_stop_other_cpus, + .smp_send_reschedule = native_smp_send_reschedule, + + .cpu_up = native_cpu_up, + .cpu_die = native_cpu_die, + .cpu_disable = native_cpu_disable, + .play_dead = native_play_dead, + + .send_call_func_ipi = native_send_call_func_ipi, + .send_call_func_single_ipi = 
native_send_call_func_single_ipi, +}; +EXPORT_SYMBOL_GPL(smp_ops); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c new file mode 100644 index 00000000000..5492798930e --- /dev/null +++ b/arch/x86/kernel/smpboot.c @@ -0,0 +1,1494 @@ + /* + * x86 SMP booting functions + * + * (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk> + * (c) 1998, 1999, 2000, 2009 Ingo Molnar <mingo@redhat.com> + * Copyright 2001 Andi Kleen, SuSE Labs. + * + * Much of the core SMP work is based on previous work by Thomas Radke, to + * whom a great many thanks are extended. + * + * Thanks to Intel for making available several different Pentium, + * Pentium Pro and Pentium-II/Xeon MP machines. + * Original development of Linux SMP code supported by Caldera. + * + * This code is released under the GNU General Public License version 2 or + * later. + * + * Fixes + * Felix Koop : NR_CPUS used properly + * Jose Renau : Handle single CPU case. + * Alan Cox : By repeated request 8) - Total BogoMIPS report. + * Greg Wright : Fix for kernel stacks panic. + * Erich Boleyn : MP v1.4 and additional changes. + * Matthias Sattler : Changes for 2.1 kernel map. + * Michel Lespinasse : Changes for 2.1 kernel map. + * Michael Chastain : Change trampoline.S to gnu as. + * Alan Cox : Dumb bug: 'B' step PPro's are fine + * Ingo Molnar : Added APIC timers, based on code + * from Jose Renau + * Ingo Molnar : various cleanups and rewrites + * Tigran Aivazian : fixed "0.00 in /proc/uptime on SMP" bug. + * Maciej W. Rozycki : Bits for genuine 82489DX APICs + * Andi Kleen : Changed for SMP boot into long mode. + * Martin J. Bligh : Added support for multi-quad systems + * Dave Jones : Report invalid combinations of Athlon CPUs. + * Rusty Russell : Hacked into shape for new "hotplug" boot process. + * Andi Kleen : Converted to new state machine. 
+ * Ashok Raj : CPU hotplug support + * Glauber Costa : i386 and x86_64 integration + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/init.h> +#include <linux/smp.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/percpu.h> +#include <linux/bootmem.h> +#include <linux/err.h> +#include <linux/nmi.h> +#include <linux/tboot.h> +#include <linux/stackprotector.h> +#include <linux/gfp.h> +#include <linux/cpuidle.h> + +#include <asm/acpi.h> +#include <asm/desc.h> +#include <asm/nmi.h> +#include <asm/irq.h> +#include <asm/idle.h> +#include <asm/realmode.h> +#include <asm/cpu.h> +#include <asm/numa.h> +#include <asm/pgtable.h> +#include <asm/tlbflush.h> +#include <asm/mtrr.h> +#include <asm/mwait.h> +#include <asm/apic.h> +#include <asm/io_apic.h> +#include <asm/i387.h> +#include <asm/fpu-internal.h> +#include <asm/setup.h> +#include <asm/uv/uv.h> +#include <linux/mc146818rtc.h> +#include <asm/smpboot_hooks.h> +#include <asm/i8259.h> +#include <asm/realmode.h> +#include <asm/misc.h> + +/* State of each CPU */ +DEFINE_PER_CPU(int, cpu_state) = { 0 }; + +/* Number of siblings per CPU package */ +int smp_num_siblings = 1; +EXPORT_SYMBOL(smp_num_siblings); + +/* Last level cache ID of each logical CPU */ +DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id) = BAD_APICID; + +/* representing HT siblings of each logical CPU */ +DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map); +EXPORT_PER_CPU_SYMBOL(cpu_sibling_map); + +/* representing HT and core siblings of each logical CPU */ +DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map); +EXPORT_PER_CPU_SYMBOL(cpu_core_map); + +DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); + +/* Per CPU bogomips and other parameters */ +DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info); +EXPORT_PER_CPU_SYMBOL(cpu_info); + +atomic_t init_deasserted; + +/* + * Report back to the Boot Processor during boot time or to the caller processor + * during CPU online. + */ +static void smp_callin(void) +{ + int cpuid, phys_id; + unsigned long timeout; + + /* + * If waken up by an INIT in an 82489DX configuration + * we may get here before an INIT-deassert IPI reaches + * our local APIC. We have to wait for the IPI or we'll + * lock up on an APIC access. + * + * Since CPU0 is not wakened up by INIT, it doesn't wait for the IPI. + */ + cpuid = smp_processor_id(); + if (apic->wait_for_init_deassert && cpuid) + while (!atomic_read(&init_deasserted)) + cpu_relax(); + + /* + * (This works even if the APIC is not enabled.) + */ + phys_id = read_apic_id(); + if (cpumask_test_cpu(cpuid, cpu_callin_mask)) { + panic("%s: phys CPU#%d, CPU#%d already present??\n", __func__, + phys_id, cpuid); + } + pr_debug("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id); + + /* + * STARTUP IPIs are fragile beasts as they might sometimes + * trigger some glue motherboard logic. Complete APIC bus + * silence for 1 second, this overestimates the time the + * boot CPU is spending to send the up to 2 STARTUP IPIs + * by a factor of two. This should be enough. + */ + + /* + * Waiting 2s total for startup (udelay is not yet working) + */ + timeout = jiffies + 2*HZ; + while (time_before(jiffies, timeout)) { + /* + * Has the boot CPU finished it's STARTUP sequence? 
+ */ + if (cpumask_test_cpu(cpuid, cpu_callout_mask)) + break; + cpu_relax(); + } + + if (!time_before(jiffies, timeout)) { + panic("%s: CPU%d started up but did not get a callout!\n", + __func__, cpuid); + } + + /* + * the boot CPU has finished the init stage and is spinning + * on callin_map until we finish. We are free to set up this + * CPU, first the APIC. (this is probably redundant on most + * boards) + */ + + pr_debug("CALLIN, before setup_local_APIC()\n"); + if (apic->smp_callin_clear_local_apic) + apic->smp_callin_clear_local_apic(); + setup_local_APIC(); + end_local_APIC_setup(); + + /* + * Need to setup vector mappings before we enable interrupts. + */ + setup_vector_irq(smp_processor_id()); + + /* + * Save our processor parameters. Note: this information + * is needed for clock calibration. + */ + smp_store_cpu_info(cpuid); + + /* + * Get our bogomips. + * Update loops_per_jiffy in cpu_data. Previous call to + * smp_store_cpu_info() stored a value that is close but not as + * accurate as the value just calculated. + */ + calibrate_delay(); + cpu_data(cpuid).loops_per_jiffy = loops_per_jiffy; + pr_debug("Stack at about %p\n", &cpuid); + + /* + * This must be done before setting cpu_online_mask + * or calling notify_cpu_starting. + */ + set_cpu_sibling_map(raw_smp_processor_id()); + wmb(); + + notify_cpu_starting(cpuid); + + /* + * Allow the master to continue. + */ + cpumask_set_cpu(cpuid, cpu_callin_mask); +} + +static int cpu0_logical_apicid; +static int enable_start_cpu0; +/* + * Activate a secondary processor. + */ +static void notrace start_secondary(void *unused) +{ + /* + * Don't put *anything* before cpu_init(), SMP booting is too + * fragile that we want to limit the things done here to the + * most necessary things. + */ + cpu_init(); + x86_cpuinit.early_percpu_clock_init(); + preempt_disable(); + smp_callin(); + + enable_start_cpu0 = 0; + +#ifdef CONFIG_X86_32 + /* switch away from the initial page table */ + load_cr3(swapper_pg_dir); + __flush_tlb_all(); +#endif + + /* otherwise gcc will move up smp_processor_id before the cpu_init */ + barrier(); + /* + * Check TSC synchronization with the BP: + */ + check_tsc_sync_target(); + + /* + * Enable the espfix hack for this CPU + */ +#ifdef CONFIG_X86_ESPFIX64 + init_espfix_ap(); +#endif + + /* + * We need to hold vector_lock so there the set of online cpus + * does not change while we are assigning vectors to cpus. Holding + * this lock ensures we don't half assign or remove an irq from a cpu. + */ + lock_vector_lock(); + set_cpu_online(smp_processor_id(), true); + unlock_vector_lock(); + per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; + x86_platform.nmi_init(); + + /* enable local interrupts */ + local_irq_enable(); + + /* to prevent fake stack check failure in clock setup */ + boot_init_stack_canary(); + + x86_cpuinit.setup_percpu_clockev(); + + wmb(); + cpu_startup_entry(CPUHP_ONLINE); +} + +void __init smp_store_boot_cpu_info(void) +{ + int id = 0; /* CPU 0 */ + struct cpuinfo_x86 *c = &cpu_data(id); + + *c = boot_cpu_data; + c->cpu_index = id; +} + +/* + * The bootstrap kernel entry code has set these up. Save them for + * a given CPU + */ +void smp_store_cpu_info(int id) +{ + struct cpuinfo_x86 *c = &cpu_data(id); + + *c = boot_cpu_data; + c->cpu_index = id; + /* + * During boot time, CPU0 has this setup already. Save the info when + * bringing up AP or offlined CPU0. 
+ */ + identify_secondary_cpu(c); +} + +static bool +topology_sane(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o, const char *name) +{ + int cpu1 = c->cpu_index, cpu2 = o->cpu_index; + + return !WARN_ONCE(cpu_to_node(cpu1) != cpu_to_node(cpu2), + "sched: CPU #%d's %s-sibling CPU #%d is not on the same node! " + "[node: %d != %d]. Ignoring dependency.\n", + cpu1, name, cpu2, cpu_to_node(cpu1), cpu_to_node(cpu2)); +} + +#define link_mask(_m, c1, c2) \ +do { \ + cpumask_set_cpu((c1), cpu_##_m##_mask(c2)); \ + cpumask_set_cpu((c2), cpu_##_m##_mask(c1)); \ +} while (0) + +static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) +{ + if (cpu_has_topoext) { + int cpu1 = c->cpu_index, cpu2 = o->cpu_index; + + if (c->phys_proc_id == o->phys_proc_id && + per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2) && + c->compute_unit_id == o->compute_unit_id) + return topology_sane(c, o, "smt"); + + } else if (c->phys_proc_id == o->phys_proc_id && + c->cpu_core_id == o->cpu_core_id) { + return topology_sane(c, o, "smt"); + } + + return false; +} + +static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) +{ + int cpu1 = c->cpu_index, cpu2 = o->cpu_index; + + if (per_cpu(cpu_llc_id, cpu1) != BAD_APICID && + per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2)) + return topology_sane(c, o, "llc"); + + return false; +} + +static bool match_mc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) +{ + if (c->phys_proc_id == o->phys_proc_id) { + if (cpu_has(c, X86_FEATURE_AMD_DCM)) + return true; + + return topology_sane(c, o, "mc"); + } + return false; +} + +void set_cpu_sibling_map(int cpu) +{ + bool has_smt = smp_num_siblings > 1; + bool has_mp = has_smt || boot_cpu_data.x86_max_cores > 1; + struct cpuinfo_x86 *c = &cpu_data(cpu); + struct cpuinfo_x86 *o; + int i; + + cpumask_set_cpu(cpu, cpu_sibling_setup_mask); + + if (!has_mp) { + cpumask_set_cpu(cpu, cpu_sibling_mask(cpu)); + cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu)); + cpumask_set_cpu(cpu, cpu_core_mask(cpu)); + c->booted_cores = 1; + return; + } + + for_each_cpu(i, cpu_sibling_setup_mask) { + o = &cpu_data(i); + + if ((i == cpu) || (has_smt && match_smt(c, o))) + link_mask(sibling, cpu, i); + + if ((i == cpu) || (has_mp && match_llc(c, o))) + link_mask(llc_shared, cpu, i); + + } + + /* + * This needs a separate iteration over the cpus because we rely on all + * cpu_sibling_mask links to be set-up. + */ + for_each_cpu(i, cpu_sibling_setup_mask) { + o = &cpu_data(i); + + if ((i == cpu) || (has_mp && match_mc(c, o))) { + link_mask(core, cpu, i); + + /* + * Does this new cpu bringup a new core? + */ + if (cpumask_weight(cpu_sibling_mask(cpu)) == 1) { + /* + * for each core in package, increment + * the booted_cores for this new cpu + */ + if (cpumask_first(cpu_sibling_mask(i)) == i) + c->booted_cores++; + /* + * increment the core count for all + * the other cpus in this package + */ + if (i != cpu) + cpu_data(i).booted_cores++; + } else if (i != cpu && !c->booted_cores) + c->booted_cores = cpu_data(i).booted_cores; + } + } +} + +/* maps the cpu to the sched domain representing multi-core */ +const struct cpumask *cpu_coregroup_mask(int cpu) +{ + return cpu_llc_shared_mask(cpu); +} + +static void impress_friends(void) +{ + int cpu; + unsigned long bogosum = 0; + /* + * Allow the user to impress friends. 
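+ * + * The pr_info() below prints the summed loops_per_jiffy as a fixed-point + * BogoMIPS value (lpj * HZ / 500000): bogosum/(500000/HZ) is the integer + * part and (bogosum/(5000/HZ)) % 100 the two decimals. A worked example, + * assuming HZ == 1000 and bogosum == 4477210: + * + *	4477210 / 500 == 8954, (4477210 / 5) % 100 == 42 + * + * which is reported as "8954.42 BogoMIPS".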
+ */ + pr_debug("Before bogomips\n"); + for_each_possible_cpu(cpu) + if (cpumask_test_cpu(cpu, cpu_callout_mask)) + bogosum += cpu_data(cpu).loops_per_jiffy; + pr_info("Total of %d processors activated (%lu.%02lu BogoMIPS)\n", + num_online_cpus(), + bogosum/(500000/HZ), + (bogosum/(5000/HZ))%100); + + pr_debug("Before bogocount - setting activated=1\n"); +} + +void __inquire_remote_apic(int apicid) +{ + unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 }; + const char * const names[] = { "ID", "VERSION", "SPIV" }; + int timeout; + u32 status; + + pr_info("Inquiring remote APIC 0x%x...\n", apicid); + + for (i = 0; i < ARRAY_SIZE(regs); i++) { + pr_info("... APIC 0x%x %s: ", apicid, names[i]); + + /* + * Wait for idle. + */ + status = safe_apic_wait_icr_idle(); + if (status) + pr_cont("a previous APIC delivery may have failed\n"); + + apic_icr_write(APIC_DM_REMRD | regs[i], apicid); + + timeout = 0; + do { + udelay(100); + status = apic_read(APIC_ICR) & APIC_ICR_RR_MASK; + } while (status == APIC_ICR_RR_INPROG && timeout++ < 1000); + + switch (status) { + case APIC_ICR_RR_VALID: + status = apic_read(APIC_RRR); + pr_cont("%08x\n", status); + break; + default: + pr_cont("failed\n"); + } + } +} + +/* + * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal + * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this + * won't ... remember to clear down the APIC, etc later. + */ +int +wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip) +{ + unsigned long send_status, accept_status = 0; + int maxlvt; + + /* Target chip */ + /* Boot on the stack */ + /* Kick the second */ + apic_icr_write(APIC_DM_NMI | apic->dest_logical, apicid); + + pr_debug("Waiting for send to finish...\n"); + send_status = safe_apic_wait_icr_idle(); + + /* + * Give the other CPU some time to accept the IPI. + */ + udelay(200); + if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { + maxlvt = lapic_get_maxlvt(); + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ + apic_write(APIC_ESR, 0); + accept_status = (apic_read(APIC_ESR) & 0xEF); + } + pr_debug("NMI sent\n"); + + if (send_status) + pr_err("APIC never delivered???\n"); + if (accept_status) + pr_err("APIC delivery error (%lx)\n", accept_status); + + return (send_status | accept_status); +} + +static int +wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) +{ + unsigned long send_status, accept_status = 0; + int maxlvt, num_starts, j; + + maxlvt = lapic_get_maxlvt(); + + /* + * Be paranoid about clearing APIC errors. + */ + if (APIC_INTEGRATED(apic_version[phys_apicid])) { + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); + } + + pr_debug("Asserting INIT\n"); + + /* + * Turn INIT on target chip + */ + /* + * Send IPI + */ + apic_icr_write(APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT, + phys_apicid); + + pr_debug("Waiting for send to finish...\n"); + send_status = safe_apic_wait_icr_idle(); + + mdelay(10); + + pr_debug("Deasserting INIT\n"); + + /* Target chip */ + /* Send IPI */ + apic_icr_write(APIC_INT_LEVELTRIG | APIC_DM_INIT, phys_apicid); + + pr_debug("Waiting for send to finish...\n"); + send_status = safe_apic_wait_icr_idle(); + + mb(); + atomic_set(&init_deasserted, 1); + + /* + * Should we send STARTUP IPIs ? + * + * Determine this based on the APIC version. + * If we don't have an integrated APIC, don't send the STARTUP IPIs. 
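+ * + * (APIC_INTEGRATED() keys off the local APIC version register: integrated + * APICs report a version of the form 0x1X while the external 82489DX + * reports 0x0X, so the check is roughly ((version) & 0xF0) != 0. The + * 82489DX therefore gets INIT only, num_starts == 0, while integrated + * APICs get the two STARTUP IPIs.)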
*/ + if (APIC_INTEGRATED(apic_version[phys_apicid])) + num_starts = 2; + else + num_starts = 0; + + /* + * Paravirt / VMI wants a startup IPI hook here to set up the + * target processor state. + */ + startup_ipi_hook(phys_apicid, (unsigned long) start_secondary, + stack_start); + + /* + * Run STARTUP IPI loop. + */ + pr_debug("#startup loops: %d\n", num_starts); + + for (j = 1; j <= num_starts; j++) { + pr_debug("Sending STARTUP #%d\n", j); + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); + pr_debug("After apic_write\n"); + + /* + * STARTUP IPI + */ + + /* Target chip */ + /* Boot on the stack */ + /* Kick the second */ + apic_icr_write(APIC_DM_STARTUP | (start_eip >> 12), + phys_apicid); + + /* + * Give the other CPU some time to accept the IPI. + */ + udelay(300); + + pr_debug("Startup point 1\n"); + + pr_debug("Waiting for send to finish...\n"); + send_status = safe_apic_wait_icr_idle(); + + /* + * Give the other CPU some time to accept the IPI. + */ + udelay(200); + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ + apic_write(APIC_ESR, 0); + accept_status = (apic_read(APIC_ESR) & 0xEF); + if (send_status || accept_status) + break; + } + pr_debug("After Startup\n"); + + if (send_status) + pr_err("APIC never delivered???\n"); + if (accept_status) + pr_err("APIC delivery error (%lx)\n", accept_status); + + return (send_status | accept_status); +} + +void smp_announce(void) +{ + int num_nodes = num_online_nodes(); + + printk(KERN_INFO "x86: Booted up %d node%s, %d CPUs\n", + num_nodes, (num_nodes > 1 ? "s" : ""), num_online_cpus()); +} + +/* reduce the number of lines printed when booting a large cpu count system */ +static void announce_cpu(int cpu, int apicid) +{ + static int current_node = -1; + int node = early_cpu_to_node(cpu); + static int width, node_width; + + if (!width) + width = num_digits(num_possible_cpus()) + 1; /* + '#' sign */ + + if (!node_width) + node_width = num_digits(num_possible_nodes()) + 1; /* + '#' */ + + if (cpu == 1) + printk(KERN_INFO "x86: Booting SMP configuration:\n"); + + if (system_state == SYSTEM_BOOTING) { + if (node != current_node) { + if (current_node > (-1)) + pr_cont("\n"); + current_node = node; + + printk(KERN_INFO ".... node %*s#%d, CPUs: ", + node_width - num_digits(node), " ", node); + } + + /* Add padding for the BSP */ + if (cpu == 1) + pr_cont("%*s", width + 1, " "); + + pr_cont("%*s#%d", width - num_digits(cpu), " ", cpu); + + } else + pr_info("Booting Node %d Processor %d APIC 0x%x\n", + node, cpu, apicid); +} + +static int wakeup_cpu0_nmi(unsigned int cmd, struct pt_regs *regs) +{ + int cpu; + + cpu = smp_processor_id(); + if (cpu == 0 && !cpu_online(cpu) && enable_start_cpu0) + return NMI_HANDLED; + + return NMI_DONE; +} + +/* + * Wake up AP by INIT, INIT, STARTUP sequence. + * + * Instead of waiting for STARTUP after INITs, BSP will execute the BIOS + * boot-strap code which is not a desired behavior for waking up BSP. To + * avoid the boot-strap code, wake up CPU0 by NMI instead. + * + * This works to wake up soft offlined CPU0 only. If CPU0 is hard offlined + * (i.e. physically hot removed and then hot added), NMI won't wake it up. + * We'll change this code in the future to wake up hard offlined CPU0 if + * real platform and request are available. + */ +static int +wakeup_cpu_via_init_nmi(int cpu, unsigned long start_ip, int apicid, + int *cpu0_nmi_registered) +{ + int id; + int boot_error; + + preempt_disable(); + + /* + * Wake up AP by INIT, INIT, STARTUP sequence.
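+ * + * That is: an AP (cpu != 0) takes the wakeup_secondary_cpu_via_init() + * branch just below, while the soft-offlined BSP registers + * wakeup_cpu0_nmi() and is kicked with wakeup_secondary_cpu_via_nmi() + * instead, as described above.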
*/ + if (cpu) { + boot_error = wakeup_secondary_cpu_via_init(apicid, start_ip); + goto out; + } + + /* + * Wake up BSP by NMI. + * + * Register an NMI handler to help wake up CPU0. + */ + boot_error = register_nmi_handler(NMI_LOCAL, + wakeup_cpu0_nmi, 0, "wake_cpu0"); + + if (!boot_error) { + enable_start_cpu0 = 1; + *cpu0_nmi_registered = 1; + if (apic->dest_logical == APIC_DEST_LOGICAL) + id = cpu0_logical_apicid; + else + id = apicid; + boot_error = wakeup_secondary_cpu_via_nmi(id, start_ip); + } + +out: + preempt_enable(); + + return boot_error; +} + +/* + * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad + * (ie clustered apic addressing mode), this is a LOGICAL apic ID. + * Returns zero if CPU booted OK, else error code from + * ->wakeup_secondary_cpu. + */ +static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) +{ + volatile u32 *trampoline_status = + (volatile u32 *) __va(real_mode_header->trampoline_status); + /* start_ip had better be page-aligned! */ + unsigned long start_ip = real_mode_header->trampoline_start; + + unsigned long boot_error = 0; + int timeout; + int cpu0_nmi_registered = 0; + + /* Just in case we booted with a single CPU. */ + alternatives_enable_smp(); + + idle->thread.sp = (unsigned long) (((struct pt_regs *) + (THREAD_SIZE + task_stack_page(idle))) - 1); + per_cpu(current_task, cpu) = idle; + +#ifdef CONFIG_X86_32 + /* Stack for startup_32 can be just as for start_secondary onwards */ + irq_ctx_init(cpu); +#else + clear_tsk_thread_flag(idle, TIF_FORK); + initial_gs = per_cpu_offset(cpu); +#endif + per_cpu(kernel_stack, cpu) = + (unsigned long)task_stack_page(idle) - + KERNEL_STACK_OFFSET + THREAD_SIZE; + early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); + initial_code = (unsigned long)start_secondary; + stack_start = idle->thread.sp; + + /* So we see what's up */ + announce_cpu(cpu, apicid); + + /* + * This grunge runs the startup process for + * the targeted processor. + */ + + atomic_set(&init_deasserted, 0); + + if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { + + pr_debug("Setting warm reset code and vector.\n"); + + smpboot_setup_warm_reset_vector(start_ip); + /* + * Be paranoid about clearing APIC errors. + */ + if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); + } + } + + /* + * Wake up a CPU in different cases: + * - Use the method in the APIC driver if it's defined + * Otherwise, + * - Use an INIT boot APIC message for APs or NMI for BSP. + */ + if (apic->wakeup_secondary_cpu) + boot_error = apic->wakeup_secondary_cpu(apicid, start_ip); + else + boot_error = wakeup_cpu_via_init_nmi(cpu, start_ip, apicid, + &cpu0_nmi_registered); + + if (!boot_error) { + /* + * allow APs to start initializing. + */ + pr_debug("Before Callout %d\n", cpu); + cpumask_set_cpu(cpu, cpu_callout_mask); + pr_debug("After Callout %d\n", cpu); + + /* + * Wait 5s total for a response + */ + for (timeout = 0; timeout < 50000; timeout++) { + if (cpumask_test_cpu(cpu, cpu_callin_mask)) + break; /* It has booted */ + udelay(100); + /* + * Allow other tasks to run while we wait for the + * AP to come online. This also gives a chance + * for the MTRR work (triggered by the AP coming online) + * to be completed in the stop machine context.
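+ * + * The budget here works out to 50000 iterations x udelay(100), i.e. + * 50000 * 100us == 5s, matching the 'Wait 5s total' note above, with + * schedule() keeping the boot cpu preemptible between polls.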
+ */ + schedule(); + } + + if (cpumask_test_cpu(cpu, cpu_callin_mask)) { + print_cpu_msr(&cpu_data(cpu)); + pr_debug("CPU%d: has booted.\n", cpu); + } else { + boot_error = 1; + if (*trampoline_status == 0xA5A5A5A5) + /* trampoline started but...? */ + pr_err("CPU%d: Stuck ??\n", cpu); + else + /* trampoline code not run */ + pr_err("CPU%d: Not responding\n", cpu); + if (apic->inquire_remote_apic) + apic->inquire_remote_apic(apicid); + } + } + + if (boot_error) { + /* Try to put things back the way they were before ... */ + numa_remove_cpu(cpu); /* was set by numa_add_cpu */ + + /* was set by do_boot_cpu() */ + cpumask_clear_cpu(cpu, cpu_callout_mask); + + /* was set by cpu_init() */ + cpumask_clear_cpu(cpu, cpu_initialized_mask); + } + + /* mark "stuck" area as not stuck */ + *trampoline_status = 0; + + if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { + /* + * Cleanup possible dangling ends... + */ + smpboot_restore_warm_reset_vector(); + } + /* + * Clean up the nmi handler. Do this after the callin and callout sync + * to avoid impact of possible long unregister time. + */ + if (cpu0_nmi_registered) + unregister_nmi_handler(NMI_LOCAL, "wake_cpu0"); + + return boot_error; +} + +int native_cpu_up(unsigned int cpu, struct task_struct *tidle) +{ + int apicid = apic->cpu_present_to_apicid(cpu); + unsigned long flags; + int err; + + WARN_ON(irqs_disabled()); + + pr_debug("++++++++++++++++++++=_---CPU UP %u\n", cpu); + + if (apicid == BAD_APICID || + !physid_isset(apicid, phys_cpu_present_map) || + !apic->apic_id_valid(apicid)) { + pr_err("%s: bad cpu %d\n", __func__, cpu); + return -EINVAL; + } + + /* + * Already booted CPU? + */ + if (cpumask_test_cpu(cpu, cpu_callin_mask)) { + pr_debug("do_boot_cpu %d Already started\n", cpu); + return -ENOSYS; + } + + /* + * Save current MTRR state in case it was changed since early boot + * (e.g. by the ACPI SMI) to initialize new CPUs with MTRRs in sync: + */ + mtrr_save_state(); + + per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; + + /* the FPU context is blank, nobody can own it */ + __cpu_disable_lazy_restore(cpu); + + err = do_boot_cpu(apicid, cpu, tidle); + if (err) { + pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu); + return -EIO; + } + + /* + * Check TSC synchronization with the AP (keep irqs disabled + * while doing so): + */ + local_irq_save(flags); + check_tsc_sync_source(cpu); + local_irq_restore(flags); + + while (!cpu_online(cpu)) { + cpu_relax(); + touch_nmi_watchdog(); + } + + return 0; +} + +/** + * arch_disable_smp_support() - disables SMP support for x86 at runtime + */ +void arch_disable_smp_support(void) +{ + disable_ioapic_support(); +} + +/* + * Fall back to non SMP mode after errors. + * + * RED-PEN audit/test this more. I bet there is more state messed up here. + */ +static __init void disable_smp(void) +{ + init_cpu_present(cpumask_of(0)); + init_cpu_possible(cpumask_of(0)); + smpboot_clear_io_apic_irqs(); + + if (smp_found_config) + physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); + else + physid_set_mask_of_physid(0, &phys_cpu_present_map); + cpumask_set_cpu(0, cpu_sibling_mask(0)); + cpumask_set_cpu(0, cpu_core_mask(0)); +} + +/* + * Various sanity checks. 
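+ * + * In rough order, smp_sanity_check() below: trims the present/possible + * masks to 8 cpus on non-bigsmp 32-bit builds, makes sure the boot cpu + * appears in phys_cpu_present_map, falls back to uniprocessor mode when + * neither an MP table nor an ACPI LAPIC was found, bails out when no + * usable local APIC is detected, and honours max_cpus == 0 by + * deactivating SMP.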
+ */ +static int __init smp_sanity_check(unsigned max_cpus) +{ + preempt_disable(); + +#if !defined(CONFIG_X86_BIGSMP) && defined(CONFIG_X86_32) + if (def_to_bigsmp && nr_cpu_ids > 8) { + unsigned int cpu; + unsigned nr; + + pr_warn("More than 8 CPUs detected - skipping them\n" + "Use CONFIG_X86_BIGSMP\n"); + + nr = 0; + for_each_present_cpu(cpu) { + if (nr >= 8) + set_cpu_present(cpu, false); + nr++; + } + + nr = 0; + for_each_possible_cpu(cpu) { + if (nr >= 8) + set_cpu_possible(cpu, false); + nr++; + } + + nr_cpu_ids = 8; + } +#endif + + if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) { + pr_warn("weird, boot CPU (#%d) not listed by the BIOS\n", + hard_smp_processor_id()); + + physid_set(hard_smp_processor_id(), phys_cpu_present_map); + } + + /* + * If we couldn't find an SMP configuration at boot time, + * get out of here now! + */ + if (!smp_found_config && !acpi_lapic) { + preempt_enable(); + pr_notice("SMP motherboard not detected\n"); + disable_smp(); + if (APIC_init_uniprocessor()) + pr_notice("Local APIC not detected. Using dummy APIC emulation.\n"); + return -1; + } + + /* + * Should not be necessary because the MP table should list the boot + * CPU too, but we do it for the sake of robustness anyway. + */ + if (!apic->check_phys_apicid_present(boot_cpu_physical_apicid)) { + pr_notice("weird, boot CPU (#%d) not listed by the BIOS\n", + boot_cpu_physical_apicid); + physid_set(hard_smp_processor_id(), phys_cpu_present_map); + } + preempt_enable(); + + /* + * If we couldn't find a local APIC, then get out of here now! + */ + if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) && + !cpu_has_apic) { + if (!disable_apic) { + pr_err("BIOS bug, local APIC #%d not detected!...\n", + boot_cpu_physical_apicid); + pr_err("... forcing use of dummy APIC emulation (tell your hw vendor)\n"); + } + smpboot_clear_io_apic(); + disable_ioapic_support(); + return -1; + } + + verify_local_APIC(); + + /* + * If SMP should be disabled, then really disable it! + */ + if (!max_cpus) { + pr_info("SMP mode deactivated\n"); + smpboot_clear_io_apic(); + + connect_bsp_APIC(); + setup_local_APIC(); + bsp_end_local_APIC_setup(); + return -1; + } + + return 0; +} + +static void __init smp_cpu_index_default(void) +{ + int i; + struct cpuinfo_x86 *c; + + for_each_possible_cpu(i) { + c = &cpu_data(i); + /* mark all to hotplug */ + c->cpu_index = nr_cpu_ids; + } +} + +/* + * Prepare for SMP bootup. The MP table or ACPI has been read + * earlier. Just do some sanity checking here and enable APIC mode. + */ +void __init native_smp_prepare_cpus(unsigned int max_cpus) +{ + unsigned int i; + + preempt_disable(); + smp_cpu_index_default(); + + /* + * Setup boot CPU information + */ + smp_store_boot_cpu_info(); /* Final full version of the data */ + cpumask_copy(cpu_callin_mask, cpumask_of(0)); + mb(); + + current_thread_info()->cpu = 0; /* needed? */ + for_each_possible_cpu(i) { + zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); + zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); + zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL); + } + set_cpu_sibling_map(0); + + + if (smp_sanity_check(max_cpus) < 0) { + pr_info("SMP disabled\n"); + disable_smp(); + goto out; + } + + default_setup_apic_routing(); + + preempt_disable(); + if (read_apic_id() != boot_cpu_physical_apicid) { + panic("Boot APIC ID in local APIC unexpected (%d vs %d)", + read_apic_id(), boot_cpu_physical_apicid); + /* Or can we switch back to PIC here? 
*/ + } + preempt_enable(); + + connect_bsp_APIC(); + + /* + * Switch from PIC to APIC mode. + */ + setup_local_APIC(); + + if (x2apic_mode) + cpu0_logical_apicid = apic_read(APIC_LDR); + else + cpu0_logical_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR)); + + /* + * Enable IO APIC before setting up error vector + */ + if (!skip_ioapic_setup && nr_ioapics) + enable_IO_APIC(); + + bsp_end_local_APIC_setup(); + + if (apic->setup_portio_remap) + apic->setup_portio_remap(); + + smpboot_setup_io_apic(); + /* + * Set up local APIC timer on boot CPU. + */ + + pr_info("CPU%d: ", 0); + print_cpu_info(&cpu_data(0)); + x86_init.timers.setup_percpu_clockev(); + + if (is_uv_system()) + uv_system_init(); + + set_mtrr_aps_delayed_init(); +out: + preempt_enable(); +} + +void arch_enable_nonboot_cpus_begin(void) +{ + set_mtrr_aps_delayed_init(); +} + +void arch_enable_nonboot_cpus_end(void) +{ + mtrr_aps_init(); +} + +/* + * Early setup to make printk work. + */ +void __init native_smp_prepare_boot_cpu(void) +{ + int me = smp_processor_id(); + switch_to_new_gdt(me); + /* already set me in cpu_online_mask in boot_cpu_init() */ + cpumask_set_cpu(me, cpu_callout_mask); + per_cpu(cpu_state, me) = CPU_ONLINE; +} + +void __init native_smp_cpus_done(unsigned int max_cpus) +{ + pr_debug("Boot done\n"); + + nmi_selftest(); + impress_friends(); +#ifdef CONFIG_X86_IO_APIC + setup_ioapic_dest(); +#endif + mtrr_aps_init(); +} + +static int __initdata setup_possible_cpus = -1; +static int __init _setup_possible_cpus(char *str) +{ + get_option(&str, &setup_possible_cpus); + return 0; +} +early_param("possible_cpus", _setup_possible_cpus); + + +/* + * cpu_possible_mask should be static, it cannot change as cpu's + * are onlined, or offlined. The reason is per-cpu data-structures + * are allocated by some modules at init time, and dont expect to + * do this dynamically on cpu arrival/departure. + * cpu_present_mask on the other hand can change dynamically. + * In case when cpu_hotplug is not compiled, then we resort to current + * behaviour, which is cpu_possible == cpu_present. + * - Ashok Raj + * + * Three ways to find out the number of additional hotplug CPUs: + * - If the BIOS specified disabled CPUs in ACPI/mptables use that. + * - The user can overwrite it with possible_cpus=NUM + * - Otherwise don't reserve additional CPUs. + * We do this because additional CPUs waste a lot of memory. 
+ * -AK + */ +__init void prefill_possible_map(void) +{ + int i, possible; + + /* no processor from mptable or madt */ + if (!num_processors) + num_processors = 1; + + i = setup_max_cpus ?: 1; + if (setup_possible_cpus == -1) { + possible = num_processors; +#ifdef CONFIG_HOTPLUG_CPU + if (setup_max_cpus) + possible += disabled_cpus; +#else + if (possible > i) + possible = i; +#endif + } else + possible = setup_possible_cpus; + + total_cpus = max_t(int, possible, num_processors + disabled_cpus); + + /* nr_cpu_ids could be reduced via nr_cpus= */ + if (possible > nr_cpu_ids) { + pr_warn("%d Processors exceeds NR_CPUS limit of %d\n", + possible, nr_cpu_ids); + possible = nr_cpu_ids; + } + +#ifdef CONFIG_HOTPLUG_CPU + if (!setup_max_cpus) +#endif + if (possible > i) { + pr_warn("%d Processors exceeds max_cpus limit of %u\n", + possible, setup_max_cpus); + possible = i; + } + + pr_info("Allowing %d CPUs, %d hotplug CPUs\n", + possible, max_t(int, possible - num_processors, 0)); + + for (i = 0; i < possible; i++) + set_cpu_possible(i, true); + for (; i < NR_CPUS; i++) + set_cpu_possible(i, false); + + nr_cpu_ids = possible; +} + +#ifdef CONFIG_HOTPLUG_CPU + +static void remove_siblinginfo(int cpu) +{ + int sibling; + struct cpuinfo_x86 *c = &cpu_data(cpu); + + for_each_cpu(sibling, cpu_core_mask(cpu)) { + cpumask_clear_cpu(cpu, cpu_core_mask(sibling)); + /* + * last thread sibling in this cpu core going down + */ + if (cpumask_weight(cpu_sibling_mask(cpu)) == 1) + cpu_data(sibling).booted_cores--; + } + + for_each_cpu(sibling, cpu_sibling_mask(cpu)) + cpumask_clear_cpu(cpu, cpu_sibling_mask(sibling)); + cpumask_clear(cpu_sibling_mask(cpu)); + cpumask_clear(cpu_core_mask(cpu)); + c->phys_proc_id = 0; + c->cpu_core_id = 0; + cpumask_clear_cpu(cpu, cpu_sibling_setup_mask); +} + +static void __ref remove_cpu_from_maps(int cpu) +{ + set_cpu_online(cpu, false); + cpumask_clear_cpu(cpu, cpu_callout_mask); + cpumask_clear_cpu(cpu, cpu_callin_mask); + /* was set by cpu_init() */ + cpumask_clear_cpu(cpu, cpu_initialized_mask); + numa_remove_cpu(cpu); +} + +void cpu_disable_common(void) +{ + int cpu = smp_processor_id(); + + remove_siblinginfo(cpu); + + /* It's now safe to remove this processor from the online map */ + lock_vector_lock(); + remove_cpu_from_maps(cpu); + unlock_vector_lock(); + fixup_irqs(); +} + +int native_cpu_disable(void) +{ + int ret; + + ret = check_irq_vectors_for_cpu_disable(); + if (ret) + return ret; + + clear_local_APIC(); + + cpu_disable_common(); + return 0; +} + +void native_cpu_die(unsigned int cpu) +{ + /* We don't do anything here: idle task is faking death itself. */ + unsigned int i; + + for (i = 0; i < 10; i++) { + /* They ack this in play_dead by setting CPU_DEAD */ + if (per_cpu(cpu_state, cpu) == CPU_DEAD) { + if (system_state == SYSTEM_RUNNING) + pr_info("CPU %u is now offline\n", cpu); + return; + } + msleep(100); + } + pr_err("CPU %u didn't die...\n", cpu); +} + +void play_dead_common(void) +{ + idle_task_exit(); + reset_lazy_tlbstate(); + amd_e400_remove_cpu(raw_smp_processor_id()); + + mb(); + /* Ack it */ + __this_cpu_write(cpu_state, CPU_DEAD); + + /* + * With physical CPU hotplug, we should halt the cpu + */ + local_irq_disable(); +} + +static bool wakeup_cpu0(void) +{ + if (smp_processor_id() == 0 && enable_start_cpu0) + return true; + + return false; +} + +/* + * We need to flush the caches before going to sleep, lest we have + * dirty data in our caches when we come back up.
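+ * + * mwait_play_dead() below derives its sleep hint from CPUID leaf 5 + * (CPUID_MWAIT_LEAF): EDX carries a 4-bit sub-state count per C-state, + * the loop records the deepest nonzero nibble, and the hint is packed as + * (cstate << MWAIT_SUBSTATE_SIZE) | (substates - 1). A worked example, + * assuming the deepest advertised nibble is the second one scanned and + * has two sub-states: eax == (1 << 4) | (2 - 1) == 0x11.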
*/ +static inline void mwait_play_dead(void) +{ + unsigned int eax, ebx, ecx, edx; + unsigned int highest_cstate = 0; + unsigned int highest_subcstate = 0; + void *mwait_ptr; + int i; + + if (!this_cpu_has(X86_FEATURE_MWAIT)) + return; + if (!this_cpu_has(X86_FEATURE_CLFLUSH)) + return; + if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF) + return; + + eax = CPUID_MWAIT_LEAF; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + + /* + * eax will be 0 if EDX enumeration is not valid. + * Initialized below to cstate, sub_cstate value when EDX is valid. + */ + if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED)) { + eax = 0; + } else { + edx >>= MWAIT_SUBSTATE_SIZE; + for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) { + if (edx & MWAIT_SUBSTATE_MASK) { + highest_cstate = i; + highest_subcstate = edx & MWAIT_SUBSTATE_MASK; + } + } + eax = (highest_cstate << MWAIT_SUBSTATE_SIZE) | + (highest_subcstate - 1); + } + + /* + * This should be a memory location in a cache line which is + * unlikely to be touched by other processors. The actual + * content is immaterial as it is not actually modified in any way. + */ + mwait_ptr = &current_thread_info()->flags; + + wbinvd(); + + while (1) { + /* + * The CLFLUSH is a workaround for erratum AAI65 for + * the Xeon 7400 series. It's not clear it is actually + * needed, but it should be harmless in either case. + * The WBINVD is insufficient due to the spurious-wakeup + * case where we return around the loop. + */ + mb(); + clflush(mwait_ptr); + mb(); + __monitor(mwait_ptr, 0, 0); + mb(); + __mwait(eax, 0); + /* + * If NMI wants to wake up CPU0, start CPU0. + */ + if (wakeup_cpu0()) + start_cpu0(); + } +} + +static inline void hlt_play_dead(void) +{ + if (__this_cpu_read(cpu_info.x86) >= 4) + wbinvd(); + + while (1) { + native_halt(); + /* + * If NMI wants to wake up CPU0, start CPU0. + */ + if (wakeup_cpu0()) + start_cpu0(); + } +} + +void native_play_dead(void) +{ + play_dead_common(); + tboot_shutdown(TB_SHUTDOWN_WFS); + + mwait_play_dead(); /* Only returns on failure */ + if (cpuidle_play_dead()) + hlt_play_dead(); +} + +#else /* ...
!CONFIG_HOTPLUG_CPU */ +int native_cpu_disable(void) +{ + return -ENOSYS; +} + +void native_cpu_die(unsigned int cpu) +{ + /* We said "no" in __cpu_disable */ + BUG(); +} + +void native_play_dead(void) +{ + BUG(); +} + +#endif diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c new file mode 100644 index 00000000000..fdd0c6430e5 --- /dev/null +++ b/arch/x86/kernel/stacktrace.c @@ -0,0 +1,146 @@ +/* + * Stack trace management functions + * + * Copyright (C) 2006-2009 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> + */ +#include <linux/sched.h> +#include <linux/stacktrace.h> +#include <linux/module.h> +#include <linux/uaccess.h> +#include <asm/stacktrace.h> + +static int save_stack_stack(void *data, char *name) +{ + return 0; +} + +static void +__save_stack_address(void *data, unsigned long addr, bool reliable, bool nosched) +{ + struct stack_trace *trace = data; +#ifdef CONFIG_FRAME_POINTER + if (!reliable) + return; +#endif + if (nosched && in_sched_functions(addr)) + return; + if (trace->skip > 0) { + trace->skip--; + return; + } + if (trace->nr_entries < trace->max_entries) + trace->entries[trace->nr_entries++] = addr; +} + +static void save_stack_address(void *data, unsigned long addr, int reliable) +{ + return __save_stack_address(data, addr, reliable, false); +} + +static void +save_stack_address_nosched(void *data, unsigned long addr, int reliable) +{ + return __save_stack_address(data, addr, reliable, true); +} + +static const struct stacktrace_ops save_stack_ops = { + .stack = save_stack_stack, + .address = save_stack_address, + .walk_stack = print_context_stack, +}; + +static const struct stacktrace_ops save_stack_ops_nosched = { + .stack = save_stack_stack, + .address = save_stack_address_nosched, + .walk_stack = print_context_stack, +}; + +/* + * Save stack-backtrace addresses into a stack_trace buffer. 
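+ * + * A minimal sketch of a caller, with sizes picked for illustration + * (skip == 1 drops the caller's own frame): + * + *	unsigned long entries[16]; + *	struct stack_trace trace = { + *		.entries = entries, + *		.max_entries = 16, + *		.skip = 1, + *	}; + *	save_stack_trace(&trace); + * + * trace.nr_entries then says how many slots were filled; an ULONG_MAX + * terminator is appended when room remains.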
+ */ +void save_stack_trace(struct stack_trace *trace) +{ + dump_trace(current, NULL, NULL, 0, &save_stack_ops, trace); + if (trace->nr_entries < trace->max_entries) + trace->entries[trace->nr_entries++] = ULONG_MAX; +} +EXPORT_SYMBOL_GPL(save_stack_trace); + +void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace) +{ + dump_trace(current, regs, NULL, 0, &save_stack_ops, trace); + if (trace->nr_entries < trace->max_entries) + trace->entries[trace->nr_entries++] = ULONG_MAX; +} + +void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) +{ + dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace); + if (trace->nr_entries < trace->max_entries) + trace->entries[trace->nr_entries++] = ULONG_MAX; +} +EXPORT_SYMBOL_GPL(save_stack_trace_tsk); + +/* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */ + +struct stack_frame_user { + const void __user *next_fp; + unsigned long ret_addr; +}; + +static int +copy_stack_frame(const void __user *fp, struct stack_frame_user *frame) +{ + int ret; + + if (!access_ok(VERIFY_READ, fp, sizeof(*frame))) + return 0; + + ret = 1; + pagefault_disable(); + if (__copy_from_user_inatomic(frame, fp, sizeof(*frame))) + ret = 0; + pagefault_enable(); + + return ret; +} + +static inline void __save_stack_trace_user(struct stack_trace *trace) +{ + const struct pt_regs *regs = task_pt_regs(current); + const void __user *fp = (const void __user *)regs->bp; + + if (trace->nr_entries < trace->max_entries) + trace->entries[trace->nr_entries++] = regs->ip; + + while (trace->nr_entries < trace->max_entries) { + struct stack_frame_user frame; + + frame.next_fp = NULL; + frame.ret_addr = 0; + if (!copy_stack_frame(fp, &frame)) + break; + if ((unsigned long)fp < regs->sp) + break; + if (frame.ret_addr) { + trace->entries[trace->nr_entries++] = + frame.ret_addr; + } + if (fp == frame.next_fp) + break; + fp = frame.next_fp; + } +} + +void save_stack_trace_user(struct stack_trace *trace) +{ + /* + * Trace user stack if we are not a kernel thread + */ + if (current->mm) { + __save_stack_trace_user(trace); + } + if (trace->nr_entries < trace->max_entries) + trace->entries[trace->nr_entries++] = ULONG_MAX; +} + diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c new file mode 100644 index 00000000000..9b4d51d0c0d --- /dev/null +++ b/arch/x86/kernel/step.c @@ -0,0 +1,230 @@ +/* + * x86 single-step support code, common to 32-bit and 64-bit. + */ +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/ptrace.h> +#include <asm/desc.h> + +unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs) +{ + unsigned long addr, seg; + + addr = regs->ip; + seg = regs->cs & 0xffff; + if (v8086_mode(regs)) { + addr = (addr & 0xffff) + (seg << 4); + return addr; + } + + /* + * We'll assume that the code segments in the GDT + * are all zero-based. That is largely true: the + * TLS segments are used for data, and the PNPBIOS + * and APM bios ones we just ignore here. + */ + if ((seg & SEGMENT_TI_MASK) == SEGMENT_LDT) { + struct desc_struct *desc; + unsigned long base; + + seg &= ~7UL; + + mutex_lock(&child->mm->context.lock); + if (unlikely((seg >> 3) >= child->mm->context.size)) + addr = -1L; /* bogus selector, access would fault */ + else { + desc = child->mm->context.ldt + seg; + base = get_desc_base(desc); + + /* 16-bit code segment? 
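+ * + * A selector is (index << 3) | TI | RPL, so the 'seg &= ~7UL' above + * strips TI/RPL and leaves the byte offset of the 8-byte descriptor; + * e.g. selector 0x000f is index 1, TI set (LDT), RPL 3, offset 8. The + * D bit tested below gives the default operand size; for a 16-bit code + * segment the offset wraps at 64K, hence the truncation: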
*/ + if (!desc->d) + addr &= 0xffff; + addr += base; + } + mutex_unlock(&child->mm->context.lock); + } + + return addr; +} + +static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs) +{ + int i, copied; + unsigned char opcode[15]; + unsigned long addr = convert_ip_to_linear(child, regs); + + copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0); + for (i = 0; i < copied; i++) { + switch (opcode[i]) { + /* popf and iret */ + case 0x9d: case 0xcf: + return 1; + + /* CHECKME: 64 65 */ + + /* opcode and address size prefixes */ + case 0x66: case 0x67: + continue; + /* irrelevant prefixes (segment overrides and repeats) */ + case 0x26: case 0x2e: + case 0x36: case 0x3e: + case 0x64: case 0x65: + case 0xf0: case 0xf2: case 0xf3: + continue; + +#ifdef CONFIG_X86_64 + case 0x40 ... 0x4f: + if (!user_64bit_mode(regs)) + /* 32-bit mode: register increment */ + return 0; + /* 64-bit mode: REX prefix */ + continue; +#endif + + /* CHECKME: f2, f3 */ + + /* + * pushf: NOTE! We should probably not let + * the user see the TF bit being set. But + * it's more pain than it's worth to avoid + * it, and a debugger could emulate this + * all in user space if it _really_ cares. + */ + case 0x9c: + default: + return 0; + } + } + return 0; +} + +/* + * Enable single-stepping. Return nonzero if user mode is not using TF itself. + */ +static int enable_single_step(struct task_struct *child) +{ + struct pt_regs *regs = task_pt_regs(child); + unsigned long oflags; + + /* + * If we stepped into a sysenter/syscall insn, it trapped in + * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP. + * If user-mode had set TF itself, then it's still clear from + * do_debug() and we need to set it again to restore the user + * state so we don't wrongly set TIF_FORCED_TF below. + * If enable_single_step() was used last and that is what + * set TIF_SINGLESTEP, then both TF and TIF_FORCED_TF are + * already set and our bookkeeping is fine. + */ + if (unlikely(test_tsk_thread_flag(child, TIF_SINGLESTEP))) + regs->flags |= X86_EFLAGS_TF; + + /* + * Always set TIF_SINGLESTEP - this guarantees that + * we single-step system calls etc.. This will also + * cause us to set TF when returning to user mode. + */ + set_tsk_thread_flag(child, TIF_SINGLESTEP); + + oflags = regs->flags; + + /* Set TF on the kernel stack.. */ + regs->flags |= X86_EFLAGS_TF; + + /* + * ..but if TF is changed by the instruction we will trace, + * don't mark it as being "us" that set it, so that we + * won't clear it by hand later. + * + * Note that if we don't actually execute the popf because + * of a signal arriving right now or suchlike, we will lose + * track of the fact that it really was "us" that set it. + */ + if (is_setting_trap_flag(child, regs)) { + clear_tsk_thread_flag(child, TIF_FORCED_TF); + return 0; + } + + /* + * If TF was already set, check whether it was us who set it. + * If not, we should never attempt a block step. + */ + if (oflags & X86_EFLAGS_TF) + return test_tsk_thread_flag(child, TIF_FORCED_TF); + + set_tsk_thread_flag(child, TIF_FORCED_TF); + + return 1; +} + +void set_task_blockstep(struct task_struct *task, bool on) +{ + unsigned long debugctl; + + /* + * Ensure irq/preemption can't change debugctl in between. + * Note also that both TIF_BLOCKSTEP and debugctl should + * be changed atomically wrt preemption. + * + * NOTE: this means that set/clear TIF_BLOCKSTEP is only safe if + * task is current or it can't be running, otherwise we can race + * with __switch_to_xtra(). 
We rely on ptrace_freeze_traced() but + * PTRACE_KILL is not safe. + */ + local_irq_disable(); + debugctl = get_debugctlmsr(); + if (on) { + debugctl |= DEBUGCTLMSR_BTF; + set_tsk_thread_flag(task, TIF_BLOCKSTEP); + } else { + debugctl &= ~DEBUGCTLMSR_BTF; + clear_tsk_thread_flag(task, TIF_BLOCKSTEP); + } + if (task == current) + update_debugctlmsr(debugctl); + local_irq_enable(); +} + +/* + * Enable single or block step. + */ +static void enable_step(struct task_struct *child, bool block) +{ + /* + * Make sure block stepping (BTF) is not enabled unless it should be. + * Note that we don't try to worry about any is_setting_trap_flag() + * instructions after the first when using block stepping. + * So no one should try to use debugger block stepping in a program + * that uses user-mode single stepping itself. + */ + if (enable_single_step(child) && block) + set_task_blockstep(child, true); + else if (test_tsk_thread_flag(child, TIF_BLOCKSTEP)) + set_task_blockstep(child, false); +} + +void user_enable_single_step(struct task_struct *child) +{ + enable_step(child, 0); +} + +void user_enable_block_step(struct task_struct *child) +{ + enable_step(child, 1); +} + +void user_disable_single_step(struct task_struct *child) +{ + /* + * Make sure block stepping (BTF) is disabled. + */ + if (test_tsk_thread_flag(child, TIF_BLOCKSTEP)) + set_task_blockstep(child, false); + + /* Always clear TIF_SINGLESTEP... */ + clear_tsk_thread_flag(child, TIF_SINGLESTEP); + + /* But touch TF only if it was set by us.. */ + if (test_and_clear_tsk_thread_flag(child, TIF_FORCED_TF)) + task_pt_regs(child)->flags &= ~X86_EFLAGS_TF; +} diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c new file mode 100644 index 00000000000..30277e27431 --- /dev/null +++ b/arch/x86/kernel/sys_x86_64.c @@ -0,0 +1,192 @@ +#include <linux/errno.h> +#include <linux/sched.h> +#include <linux/syscalls.h> +#include <linux/mm.h> +#include <linux/fs.h> +#include <linux/smp.h> +#include <linux/sem.h> +#include <linux/msg.h> +#include <linux/shm.h> +#include <linux/stat.h> +#include <linux/mman.h> +#include <linux/file.h> +#include <linux/utsname.h> +#include <linux/personality.h> +#include <linux/random.h> +#include <linux/uaccess.h> +#include <linux/elf.h> + +#include <asm/ia32.h> +#include <asm/syscalls.h> + +/* + * Align a virtual address to avoid aliasing in the I$ on AMD F15h. 
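+ * The mask comes from va_align, which is set up at boot from the CPU's
+ * cache geometry. As an illustrative sketch (the exact numbers depend on
+ * the detected I$ size and associativity): a 64 KiB, 2-way instruction
+ * cache is indexed by address bits above the 4 KiB page offset, so two
+ * mappings of the same code that are 8 pages apart can alias; aligning
+ * executable mmap()s to a 32 KiB boundary avoids that.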
+ */
+static unsigned long get_align_mask(void)
+{
+	/* handle 32- and 64-bit case with a single conditional */
+	if (va_align.flags < 0 || !(va_align.flags & (2 - mmap_is_ia32())))
+		return 0;
+
+	if (!(current->flags & PF_RANDOMIZE))
+		return 0;
+
+	return va_align.mask;
+}
+
+unsigned long align_vdso_addr(unsigned long addr)
+{
+	unsigned long align_mask = get_align_mask();
+	return (addr + align_mask) & ~align_mask;
+}
+
+static int __init control_va_addr_alignment(char *str)
+{
+	/* guard against enabling this on other CPU families */
+	if (va_align.flags < 0)
+		return 1;
+
+	if (*str == 0)
+		return 1;
+
+	if (*str == '=')
+		str++;
+
+	if (!strcmp(str, "32"))
+		va_align.flags = ALIGN_VA_32;
+	else if (!strcmp(str, "64"))
+		va_align.flags = ALIGN_VA_64;
+	else if (!strcmp(str, "off"))
+		va_align.flags = 0;
+	else if (!strcmp(str, "on"))
+		va_align.flags = ALIGN_VA_32 | ALIGN_VA_64;
+	else
+		return 0;
+
+	return 1;
+}
+__setup("align_va_addr", control_va_addr_alignment);
+
+SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
+		unsigned long, prot, unsigned long, flags,
+		unsigned long, fd, unsigned long, off)
+{
+	long error;
+	error = -EINVAL;
+	if (off & ~PAGE_MASK)
+		goto out;
+
+	error = sys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT);
+out:
+	return error;
+}
+
+static void find_start_end(unsigned long flags, unsigned long *begin,
+			   unsigned long *end)
+{
+	if (!test_thread_flag(TIF_ADDR32) && (flags & MAP_32BIT)) {
+		unsigned long new_begin;
+		/* This is usually needed to map code in the small
+		   model, so it has to be inside the first 31 bits of
+		   the address space.  Limit it to that.  This means
+		   we need to move the unmapped base down for this
+		   case.  This can give conflicts with the heap, but
+		   we assume that glibc malloc knows how to fall back
+		   to mmap.  Give it 1GB of playground for now. -AK */
+		*begin = 0x40000000;
+		*end = 0x80000000;
+		if (current->flags & PF_RANDOMIZE) {
+			new_begin = randomize_range(*begin, *begin + 0x02000000, 0);
+			if (new_begin)
+				*begin = new_begin;
+		}
+	} else {
+		*begin = current->mm->mmap_legacy_base;
+		*end = TASK_SIZE;
+	}
+}
+
+unsigned long
+arch_get_unmapped_area(struct file *filp, unsigned long addr,
+		unsigned long len, unsigned long pgoff, unsigned long flags)
+{
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+	struct vm_unmapped_area_info info;
+	unsigned long begin, end;
+
+	if (flags & MAP_FIXED)
+		return addr;
+
+	find_start_end(flags, &begin, &end);
+
+	if (len > end)
+		return -ENOMEM;
+
+	if (addr) {
+		addr = PAGE_ALIGN(addr);
+		vma = find_vma(mm, addr);
+		if (end - len >= addr &&
+		    (!vma || addr + len <= vma->vm_start))
+			return addr;
+	}
+
+	info.flags = 0;
+	info.length = len;
+	info.low_limit = begin;
+	info.high_limit = end;
+	info.align_mask = filp ?
get_align_mask() : 0; + info.align_offset = pgoff << PAGE_SHIFT; + return vm_unmapped_area(&info); +} + +unsigned long +arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, + const unsigned long len, const unsigned long pgoff, + const unsigned long flags) +{ + struct vm_area_struct *vma; + struct mm_struct *mm = current->mm; + unsigned long addr = addr0; + struct vm_unmapped_area_info info; + + /* requested length too big for entire address space */ + if (len > TASK_SIZE) + return -ENOMEM; + + if (flags & MAP_FIXED) + return addr; + + /* for MAP_32BIT mappings we force the legacy mmap base */ + if (!test_thread_flag(TIF_ADDR32) && (flags & MAP_32BIT)) + goto bottomup; + + /* requesting a specific address */ + if (addr) { + addr = PAGE_ALIGN(addr); + vma = find_vma(mm, addr); + if (TASK_SIZE - len >= addr && + (!vma || addr + len <= vma->vm_start)) + return addr; + } + + info.flags = VM_UNMAPPED_AREA_TOPDOWN; + info.length = len; + info.low_limit = PAGE_SIZE; + info.high_limit = mm->mmap_base; + info.align_mask = filp ? get_align_mask() : 0; + info.align_offset = pgoff << PAGE_SHIFT; + addr = vm_unmapped_area(&info); + if (!(addr & ~PAGE_MASK)) + return addr; + VM_BUG_ON(addr != -ENOMEM); + +bottomup: + /* + * A failed mmap() very likely causes application failure, + * so fall back to the bottom-up function here. This scenario + * can happen with large stack limits and large mmap() + * allocations. + */ + return arch_get_unmapped_area(filp, addr0, len, pgoff, flags); +} diff --git a/arch/x86/kernel/syscall_32.c b/arch/x86/kernel/syscall_32.c new file mode 100644 index 00000000000..e9bcd57d8a9 --- /dev/null +++ b/arch/x86/kernel/syscall_32.c @@ -0,0 +1,25 @@ +/* System call table for i386. */ + +#include <linux/linkage.h> +#include <linux/sys.h> +#include <linux/cache.h> +#include <asm/asm-offsets.h> + +#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void sym(void) ; +#include <asm/syscalls_32.h> +#undef __SYSCALL_I386 + +#define __SYSCALL_I386(nr, sym, compat) [nr] = sym, + +typedef asmlinkage void (*sys_call_ptr_t)(void); + +extern asmlinkage void sys_ni_syscall(void); + +__visible const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { + /* + * Smells like a compiler bug -- it doesn't work + * when the & below is removed. + */ + [0 ... __NR_syscall_max] = &sys_ni_syscall, +#include <asm/syscalls_32.h> +}; diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c new file mode 100644 index 00000000000..4ac730b37f0 --- /dev/null +++ b/arch/x86/kernel/syscall_64.c @@ -0,0 +1,32 @@ +/* System call table for x86-64. */ + +#include <linux/linkage.h> +#include <linux/sys.h> +#include <linux/cache.h> +#include <asm/asm-offsets.h> +#include <asm/syscall.h> + +#define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat) + +#ifdef CONFIG_X86_X32_ABI +# define __SYSCALL_X32(nr, sym, compat) __SYSCALL_64(nr, sym, compat) +#else +# define __SYSCALL_X32(nr, sym, compat) /* nothing */ +#endif + +#define __SYSCALL_64(nr, sym, compat) extern asmlinkage void sym(void) ; +#include <asm/syscalls_64.h> +#undef __SYSCALL_64 + +#define __SYSCALL_64(nr, sym, compat) [nr] = sym, + +extern void sys_ni_syscall(void); + +asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { + /* + * Smells like a compiler bug -- it doesn't work + * when the & below is removed. + */ + [0 ... 
__NR_syscall_max] = &sys_ni_syscall, +#include <asm/syscalls_64.h> +}; diff --git a/arch/x86/kernel/sysfb.c b/arch/x86/kernel/sysfb.c new file mode 100644 index 00000000000..193ec2ce46c --- /dev/null +++ b/arch/x86/kernel/sysfb.c @@ -0,0 +1,74 @@ +/* + * Generic System Framebuffers on x86 + * Copyright (c) 2012-2013 David Herrmann <dh.herrmann@gmail.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + */ + +/* + * Simple-Framebuffer support for x86 systems + * Create a platform-device for any available boot framebuffer. The + * simple-framebuffer platform device is already available on DT systems, so + * this module parses the global "screen_info" object and creates a suitable + * platform device compatible with the "simple-framebuffer" DT object. If + * the framebuffer is incompatible, we instead create a legacy + * "vesa-framebuffer", "efi-framebuffer" or "platform-framebuffer" device and + * pass the screen_info as platform_data. This allows legacy drivers + * to pick these devices up without messing with simple-framebuffer drivers. + * The global "screen_info" is still valid at all times. + * + * If CONFIG_X86_SYSFB is not selected, we never register "simple-framebuffer" + * platform devices, but only use legacy framebuffer devices for + * backwards compatibility. + * + * TODO: We set the dev_id field of all platform-devices to 0. This allows + * other x86 OF/DT parsers to create such devices, too. However, they must + * start at offset 1 for this to work. + */ + +#include <linux/err.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/platform_data/simplefb.h> +#include <linux/platform_device.h> +#include <linux/screen_info.h> +#include <asm/sysfb.h> + +static __init int sysfb_init(void) +{ + struct screen_info *si = &screen_info; + struct simplefb_platform_data mode; + struct platform_device *pd; + const char *name; + bool compatible; + int ret; + + sysfb_apply_efi_quirks(); + + /* try to create a simple-framebuffer device */ + compatible = parse_mode(si, &mode); + if (compatible) { + ret = create_simplefb(si, &mode); + if (!ret) + return 0; + } + + /* if the FB is incompatible, create a legacy framebuffer device */ + if (si->orig_video_isVGA == VIDEO_TYPE_EFI) + name = "efi-framebuffer"; + else if (si->orig_video_isVGA == VIDEO_TYPE_VLFB) + name = "vesa-framebuffer"; + else + name = "platform-framebuffer"; + + pd = platform_device_register_resndata(NULL, name, 0, + NULL, 0, si, sizeof(*si)); + return IS_ERR(pd) ? PTR_ERR(pd) : 0; +} + +/* must execute after PCI subsystem for EFI quirks */ +device_initcall(sysfb_init); diff --git a/arch/x86/kernel/sysfb_efi.c b/arch/x86/kernel/sysfb_efi.c new file mode 100644 index 00000000000..b285d4e8c68 --- /dev/null +++ b/arch/x86/kernel/sysfb_efi.c @@ -0,0 +1,214 @@ +/* + * Generic System Framebuffers on x86 + * Copyright (c) 2012-2013 David Herrmann <dh.herrmann@gmail.com> + * + * EFI Quirks Copyright (c) 2006 Edgar Hucek <gimli@dark-green.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. 
+ */
+
+/*
+ * EFI Quirks
+ * Several EFI systems do not correctly advertise their boot framebuffers.
+ * Hence, we use this static table of known broken machines and fix up the
+ * information so framebuffer drivers can load correctly.
+ */
+
+#include <linux/dmi.h>
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/pci.h>
+#include <linux/screen_info.h>
+#include <video/vga.h>
+#include <asm/sysfb.h>
+
+enum {
+	OVERRIDE_NONE = 0x0,
+	OVERRIDE_BASE = 0x1,
+	OVERRIDE_STRIDE = 0x2,
+	OVERRIDE_HEIGHT = 0x4,
+	OVERRIDE_WIDTH = 0x8,
+};
+
+struct efifb_dmi_info efifb_dmi_list[] = {
+	[M_I17] = { "i17", 0x80010000, 1472 * 4, 1440, 900, OVERRIDE_NONE },
+	[M_I20] = { "i20", 0x80010000, 1728 * 4, 1680, 1050, OVERRIDE_NONE }, /* guess */
+	[M_I20_SR] = { "imac7", 0x40010000, 1728 * 4, 1680, 1050, OVERRIDE_NONE },
+	[M_I24] = { "i24", 0x80010000, 2048 * 4, 1920, 1200, OVERRIDE_NONE }, /* guess */
+	[M_I24_8_1] = { "imac8", 0xc0060000, 2048 * 4, 1920, 1200, OVERRIDE_NONE },
+	[M_I24_10_1] = { "imac10", 0xc0010000, 2048 * 4, 1920, 1080, OVERRIDE_NONE },
+	[M_I27_11_1] = { "imac11", 0xc0010000, 2560 * 4, 2560, 1440, OVERRIDE_NONE },
+	[M_MINI] = { "mini", 0x80000000, 2048 * 4, 1024, 768, OVERRIDE_NONE },
+	[M_MINI_3_1] = { "mini31", 0x40010000, 1024 * 4, 1024, 768, OVERRIDE_NONE },
+	[M_MINI_4_1] = { "mini41", 0xc0010000, 2048 * 4, 1920, 1200, OVERRIDE_NONE },
+	[M_MB] = { "macbook", 0x80000000, 2048 * 4, 1280, 800, OVERRIDE_NONE },
+	[M_MB_5_1] = { "macbook51", 0x80010000, 2048 * 4, 1280, 800, OVERRIDE_NONE },
+	[M_MB_6_1] = { "macbook61", 0x80010000, 2048 * 4, 1280, 800, OVERRIDE_NONE },
+	[M_MB_7_1] = { "macbook71", 0x80010000, 2048 * 4, 1280, 800, OVERRIDE_NONE },
+	[M_MBA] = { "mba", 0x80000000, 2048 * 4, 1280, 800, OVERRIDE_NONE },
+	/* 11" MacBook Air 3,1 passes the wrong stride */
+	[M_MBA_3] = { "mba3", 0, 2048 * 4, 0, 0, OVERRIDE_STRIDE },
+	[M_MBP] = { "mbp", 0x80010000, 1472 * 4, 1440, 900, OVERRIDE_NONE },
+	[M_MBP_2] = { "mbp2", 0, 0, 0, 0, OVERRIDE_NONE }, /* placeholder */
+	[M_MBP_2_2] = { "mbp22", 0x80010000, 1472 * 4, 1440, 900, OVERRIDE_NONE },
+	[M_MBP_SR] = { "mbp3", 0x80030000, 2048 * 4, 1440, 900, OVERRIDE_NONE },
+	[M_MBP_4] = { "mbp4", 0xc0060000, 2048 * 4, 1920, 1200, OVERRIDE_NONE },
+	[M_MBP_5_1] = { "mbp51", 0xc0010000, 2048 * 4, 1440, 900, OVERRIDE_NONE },
+	[M_MBP_5_2] = { "mbp52", 0xc0010000, 2048 * 4, 1920, 1200, OVERRIDE_NONE },
+	[M_MBP_5_3] = { "mbp53", 0xd0010000, 2048 * 4, 1440, 900, OVERRIDE_NONE },
+	[M_MBP_6_1] = { "mbp61", 0x90030000, 2048 * 4, 1920, 1200, OVERRIDE_NONE },
+	[M_MBP_6_2] = { "mbp62", 0x90030000, 2048 * 4, 1680, 1050, OVERRIDE_NONE },
+	[M_MBP_7_1] = { "mbp71", 0xc0010000, 2048 * 4, 1280, 800, OVERRIDE_NONE },
+	[M_MBP_8_2] = { "mbp82", 0x90010000, 1472 * 4, 1440, 900, OVERRIDE_NONE },
+	[M_UNKNOWN] = { NULL, 0, 0, 0, 0, OVERRIDE_NONE }
+};
+
+#define choose_value(dmivalue, fwvalue, field, flags) ({	\
+		typeof(fwvalue) _ret_ = fwvalue;		\
+		if ((flags) & (field))				\
+			_ret_ = dmivalue;			\
+		else if ((fwvalue) == 0)			\
+			_ret_ = dmivalue;			\
+		_ret_;						\
+	})
+
+static int __init efifb_set_system(const struct dmi_system_id *id)
+{
+	struct efifb_dmi_info *info = id->driver_data;
+
+	if (info->base == 0 && info->height == 0 && info->width == 0 &&
+	    info->stride == 0)
+		return 0;
+
+	/* Trust the bootloader over the DMI tables */
+	if (screen_info.lfb_base == 0) {
+#if defined(CONFIG_PCI)
+		struct pci_dev *dev = NULL;
+		int found_bar = 0;
+#endif
+		if (info->base) {
+			screen_info.lfb_base =
choose_value(info->base, + screen_info.lfb_base, OVERRIDE_BASE, + info->flags); + +#if defined(CONFIG_PCI) + /* make sure that the address in the table is actually + * on a VGA device's PCI BAR */ + + for_each_pci_dev(dev) { + int i; + if ((dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) + continue; + for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) { + resource_size_t start, end; + + start = pci_resource_start(dev, i); + if (start == 0) + break; + end = pci_resource_end(dev, i); + if (screen_info.lfb_base >= start && + screen_info.lfb_base < end) { + found_bar = 1; + } + } + } + if (!found_bar) + screen_info.lfb_base = 0; +#endif + } + } + if (screen_info.lfb_base) { + screen_info.lfb_linelength = choose_value(info->stride, + screen_info.lfb_linelength, OVERRIDE_STRIDE, + info->flags); + screen_info.lfb_width = choose_value(info->width, + screen_info.lfb_width, OVERRIDE_WIDTH, + info->flags); + screen_info.lfb_height = choose_value(info->height, + screen_info.lfb_height, OVERRIDE_HEIGHT, + info->flags); + if (screen_info.orig_video_isVGA == 0) + screen_info.orig_video_isVGA = VIDEO_TYPE_EFI; + } else { + screen_info.lfb_linelength = 0; + screen_info.lfb_width = 0; + screen_info.lfb_height = 0; + screen_info.orig_video_isVGA = 0; + return 0; + } + + printk(KERN_INFO "efifb: dmi detected %s - framebuffer at 0x%08x " + "(%dx%d, stride %d)\n", id->ident, + screen_info.lfb_base, screen_info.lfb_width, + screen_info.lfb_height, screen_info.lfb_linelength); + + return 1; +} + +#define EFIFB_DMI_SYSTEM_ID(vendor, name, enumid) \ + { \ + efifb_set_system, \ + name, \ + { \ + DMI_MATCH(DMI_BIOS_VENDOR, vendor), \ + DMI_MATCH(DMI_PRODUCT_NAME, name) \ + }, \ + &efifb_dmi_list[enumid] \ + } + +static const struct dmi_system_id efifb_dmi_system_table[] __initconst = { + EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "iMac4,1", M_I17), + /* At least one of these two will be right; maybe both? */ + EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "iMac5,1", M_I20), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac5,1", M_I20), + /* At least one of these two will be right; maybe both? */ + EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "iMac6,1", M_I24), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac6,1", M_I24), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac7,1", M_I20_SR), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac8,1", M_I24_8_1), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac10,1", M_I24_10_1), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac11,1", M_I27_11_1), + EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "Macmini1,1", M_MINI), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "Macmini3,1", M_MINI_3_1), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "Macmini4,1", M_MINI_4_1), + EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBook1,1", M_MB), + /* At least one of these two will be right; maybe both? */ + EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBook2,1", M_MB), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook2,1", M_MB), + /* At least one of these two will be right; maybe both? 
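+	 * (Apple renamed its DMI vendor string from "Apple Computer, Inc."
+	 * to "Apple Inc." around 2007, so both spellings are matched.)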
*/ + EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBook3,1", M_MB), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook3,1", M_MB), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook4,1", M_MB), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook5,1", M_MB_5_1), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook6,1", M_MB_6_1), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook7,1", M_MB_7_1), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookAir1,1", M_MBA), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookAir3,1", M_MBA_3), + EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBookPro1,1", M_MBP), + EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBookPro2,1", M_MBP_2), + EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBookPro2,2", M_MBP_2_2), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro2,1", M_MBP_2), + EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBookPro3,1", M_MBP_SR), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro3,1", M_MBP_SR), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro4,1", M_MBP_4), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro5,1", M_MBP_5_1), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro5,2", M_MBP_5_2), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro5,3", M_MBP_5_3), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro6,1", M_MBP_6_1), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro6,2", M_MBP_6_2), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro7,1", M_MBP_7_1), + EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro8,2", M_MBP_8_2), + {}, +}; + +__init void sysfb_apply_efi_quirks(void) +{ + if (screen_info.orig_video_isVGA != VIDEO_TYPE_EFI || + !(screen_info.capabilities & VIDEO_CAPABILITY_SKIP_QUIRKS)) + dmi_check_system(efifb_dmi_system_table); +} diff --git a/arch/x86/kernel/sysfb_simplefb.c b/arch/x86/kernel/sysfb_simplefb.c new file mode 100644 index 00000000000..86179d40989 --- /dev/null +++ b/arch/x86/kernel/sysfb_simplefb.c @@ -0,0 +1,95 @@ +/* + * Generic System Framebuffers on x86 + * Copyright (c) 2012-2013 David Herrmann <dh.herrmann@gmail.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + */ + +/* + * simple-framebuffer probing + * Try to convert "screen_info" into a "simple-framebuffer" compatible mode. + * If the mode is incompatible, we return "false" and let the caller create + * legacy nodes instead. 
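+ * For example, a 32 bpp boot framebuffer with 8-bit red/green/blue fields
+ * at bit offsets 16/8/0 matches the "x8r8g8b8" entry of SIMPLEFB_FORMATS
+ * and is handed to the simple-framebuffer driver, while a paletted or
+ * otherwise exotic mode matches nothing and takes the legacy path.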
+ */ + +#include <linux/err.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/platform_data/simplefb.h> +#include <linux/platform_device.h> +#include <linux/screen_info.h> +#include <asm/sysfb.h> + +static const char simplefb_resname[] = "BOOTFB"; +static const struct simplefb_format formats[] = SIMPLEFB_FORMATS; + +/* try parsing x86 screen_info into a simple-framebuffer mode struct */ +__init bool parse_mode(const struct screen_info *si, + struct simplefb_platform_data *mode) +{ + const struct simplefb_format *f; + __u8 type; + unsigned int i; + + type = si->orig_video_isVGA; + if (type != VIDEO_TYPE_VLFB && type != VIDEO_TYPE_EFI) + return false; + + for (i = 0; i < ARRAY_SIZE(formats); ++i) { + f = &formats[i]; + if (si->lfb_depth == f->bits_per_pixel && + si->red_size == f->red.length && + si->red_pos == f->red.offset && + si->green_size == f->green.length && + si->green_pos == f->green.offset && + si->blue_size == f->blue.length && + si->blue_pos == f->blue.offset && + si->rsvd_size == f->transp.length && + si->rsvd_pos == f->transp.offset) { + mode->format = f->name; + mode->width = si->lfb_width; + mode->height = si->lfb_height; + mode->stride = si->lfb_linelength; + return true; + } + } + + return false; +} + +__init int create_simplefb(const struct screen_info *si, + const struct simplefb_platform_data *mode) +{ + struct platform_device *pd; + struct resource res; + unsigned long len; + + /* don't use lfb_size as it may contain the whole VMEM instead of only + * the part that is occupied by the framebuffer */ + len = mode->height * mode->stride; + len = PAGE_ALIGN(len); + if (len > (u64)si->lfb_size << 16) { + printk(KERN_WARNING "sysfb: VRAM smaller than advertised\n"); + return -EINVAL; + } + + /* setup IORESOURCE_MEM as framebuffer memory */ + memset(&res, 0, sizeof(res)); + res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; + res.name = simplefb_resname; + res.start = si->lfb_base; + res.end = si->lfb_base + len - 1; + if (res.end <= res.start) + return -EINVAL; + + pd = platform_device_register_resndata(NULL, "simple-framebuffer", 0, + &res, 1, mode, sizeof(*mode)); + if (IS_ERR(pd)) + return PTR_ERR(pd); + + return 0; +} diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c new file mode 100644 index 00000000000..91a4496db43 --- /dev/null +++ b/arch/x86/kernel/tboot.c @@ -0,0 +1,538 @@ +/* + * tboot.c: main implementation of helper functions used by kernel for + * runtime support of Intel(R) Trusted Execution Technology + * + * Copyright (c) 2006-2009, Intel Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 
+ * + */ + +#include <linux/dma_remapping.h> +#include <linux/init_task.h> +#include <linux/spinlock.h> +#include <linux/export.h> +#include <linux/delay.h> +#include <linux/sched.h> +#include <linux/init.h> +#include <linux/dmar.h> +#include <linux/cpu.h> +#include <linux/pfn.h> +#include <linux/mm.h> +#include <linux/tboot.h> +#include <linux/debugfs.h> + +#include <asm/realmode.h> +#include <asm/processor.h> +#include <asm/bootparam.h> +#include <asm/pgtable.h> +#include <asm/pgalloc.h> +#include <asm/swiotlb.h> +#include <asm/fixmap.h> +#include <asm/proto.h> +#include <asm/setup.h> +#include <asm/e820.h> +#include <asm/io.h> + +#include "../realmode/rm/wakeup.h" + +/* Global pointer to shared data; NULL means no measured launch. */ +struct tboot *tboot __read_mostly; +EXPORT_SYMBOL(tboot); + +/* timeout for APs (in secs) to enter wait-for-SIPI state during shutdown */ +#define AP_WAIT_TIMEOUT 1 + +#undef pr_fmt +#define pr_fmt(fmt) "tboot: " fmt + +static u8 tboot_uuid[16] __initdata = TBOOT_UUID; + +void __init tboot_probe(void) +{ + /* Look for valid page-aligned address for shared page. */ + if (!boot_params.tboot_addr) + return; + /* + * also verify that it is mapped as we expect it before calling + * set_fixmap(), to reduce chance of garbage value causing crash + */ + if (!e820_any_mapped(boot_params.tboot_addr, + boot_params.tboot_addr, E820_RESERVED)) { + pr_warning("non-0 tboot_addr but it is not of type E820_RESERVED\n"); + return; + } + + /* only a natively booted kernel should be using TXT */ + if (paravirt_enabled()) { + pr_warning("non-0 tboot_addr but pv_ops is enabled\n"); + return; + } + + /* Map and check for tboot UUID. */ + set_fixmap(FIX_TBOOT_BASE, boot_params.tboot_addr); + tboot = (struct tboot *)fix_to_virt(FIX_TBOOT_BASE); + if (memcmp(&tboot_uuid, &tboot->uuid, sizeof(tboot->uuid))) { + pr_warning("tboot at 0x%llx is invalid\n", + boot_params.tboot_addr); + tboot = NULL; + return; + } + if (tboot->version < 5) { + pr_warning("tboot version is invalid: %u\n", tboot->version); + tboot = NULL; + return; + } + + pr_info("found shared page at phys addr 0x%llx:\n", + boot_params.tboot_addr); + pr_debug("version: %d\n", tboot->version); + pr_debug("log_addr: 0x%08x\n", tboot->log_addr); + pr_debug("shutdown_entry: 0x%x\n", tboot->shutdown_entry); + pr_debug("tboot_base: 0x%08x\n", tboot->tboot_base); + pr_debug("tboot_size: 0x%x\n", tboot->tboot_size); +} + +static pgd_t *tboot_pg_dir; +static struct mm_struct tboot_mm = { + .mm_rb = RB_ROOT, + .pgd = swapper_pg_dir, + .mm_users = ATOMIC_INIT(2), + .mm_count = ATOMIC_INIT(1), + .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem), + .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), + .mmlist = LIST_HEAD_INIT(init_mm.mmlist), +}; + +static inline void switch_to_tboot_pt(void) +{ + write_cr3(virt_to_phys(tboot_pg_dir)); +} + +static int map_tboot_page(unsigned long vaddr, unsigned long pfn, + pgprot_t prot) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd = pgd_offset(&tboot_mm, vaddr); + pud = pud_alloc(&tboot_mm, pgd, vaddr); + if (!pud) + return -1; + pmd = pmd_alloc(&tboot_mm, pud, vaddr); + if (!pmd) + return -1; + pte = pte_alloc_map(&tboot_mm, NULL, pmd, vaddr); + if (!pte) + return -1; + set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot)); + pte_unmap(pte); + return 0; +} + +static int map_tboot_pages(unsigned long vaddr, unsigned long start_pfn, + unsigned long nr) +{ + /* Reuse the original kernel mapping */ + tboot_pg_dir = pgd_alloc(&tboot_mm); + if (!tboot_pg_dir) + return 
-1; + + for (; nr > 0; nr--, vaddr += PAGE_SIZE, start_pfn++) { + if (map_tboot_page(vaddr, start_pfn, PAGE_KERNEL_EXEC)) + return -1; + } + + return 0; +} + +static void tboot_create_trampoline(void) +{ + u32 map_base, map_size; + + /* Create identity map for tboot shutdown code. */ + map_base = PFN_DOWN(tboot->tboot_base); + map_size = PFN_UP(tboot->tboot_size); + if (map_tboot_pages(map_base << PAGE_SHIFT, map_base, map_size)) + panic("tboot: Error mapping tboot pages (mfns) @ 0x%x, 0x%x\n", + map_base, map_size); +} + +#ifdef CONFIG_ACPI_SLEEP + +static void add_mac_region(phys_addr_t start, unsigned long size) +{ + struct tboot_mac_region *mr; + phys_addr_t end = start + size; + + if (tboot->num_mac_regions >= MAX_TB_MAC_REGIONS) + panic("tboot: Too many MAC regions\n"); + + if (start && size) { + mr = &tboot->mac_regions[tboot->num_mac_regions++]; + mr->start = round_down(start, PAGE_SIZE); + mr->size = round_up(end, PAGE_SIZE) - mr->start; + } +} + +static int tboot_setup_sleep(void) +{ + int i; + + tboot->num_mac_regions = 0; + + for (i = 0; i < e820.nr_map; i++) { + if ((e820.map[i].type != E820_RAM) + && (e820.map[i].type != E820_RESERVED_KERN)) + continue; + + add_mac_region(e820.map[i].addr, e820.map[i].size); + } + + tboot->acpi_sinfo.kernel_s3_resume_vector = + real_mode_header->wakeup_start; + + return 0; +} + +#else /* no CONFIG_ACPI_SLEEP */ + +static int tboot_setup_sleep(void) +{ + /* S3 shutdown requested, but S3 not supported by the kernel... */ + BUG(); + return -1; +} + +#endif + +void tboot_shutdown(u32 shutdown_type) +{ + void (*shutdown)(void); + + if (!tboot_enabled()) + return; + + /* + * if we're being called before the 1:1 mapping is set up then just + * return and let the normal shutdown happen; this should only be + * due to very early panic() + */ + if (!tboot_pg_dir) + return; + + /* if this is S3 then set regions to MAC */ + if (shutdown_type == TB_SHUTDOWN_S3) + if (tboot_setup_sleep()) + return; + + tboot->shutdown_type = shutdown_type; + + switch_to_tboot_pt(); + + shutdown = (void(*)(void))(unsigned long)tboot->shutdown_entry; + shutdown(); + + /* should not reach here */ + while (1) + halt(); +} + +static void tboot_copy_fadt(const struct acpi_table_fadt *fadt) +{ +#define TB_COPY_GAS(tbg, g) \ + tbg.space_id = g.space_id; \ + tbg.bit_width = g.bit_width; \ + tbg.bit_offset = g.bit_offset; \ + tbg.access_width = g.access_width; \ + tbg.address = g.address; + + TB_COPY_GAS(tboot->acpi_sinfo.pm1a_cnt_blk, fadt->xpm1a_control_block); + TB_COPY_GAS(tboot->acpi_sinfo.pm1b_cnt_blk, fadt->xpm1b_control_block); + TB_COPY_GAS(tboot->acpi_sinfo.pm1a_evt_blk, fadt->xpm1a_event_block); + TB_COPY_GAS(tboot->acpi_sinfo.pm1b_evt_blk, fadt->xpm1b_event_block); + + /* + * We need phys addr of waking vector, but can't use virt_to_phys() on + * &acpi_gbl_FACS because it is ioremap'ed, so calc from FACS phys + * addr. 
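+	 * The FACS layout is fixed by the ACPI spec, so the physical
+	 * address of the 32-bit firmware_waking_vector can be formed by
+	 * adding offsetof(struct acpi_table_facs, firmware_waking_vector)
+	 * to the FACS physical address, which is what the code below does.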
+ */
+	tboot->acpi_sinfo.wakeup_vector = fadt->facs +
+		offsetof(struct acpi_table_facs, firmware_waking_vector);
+}
+
+static int tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control)
+{
+	static u32 acpi_shutdown_map[ACPI_S_STATE_COUNT] = {
+		/* S0,1,2: */ -1, -1, -1,
+		/* S3: */ TB_SHUTDOWN_S3,
+		/* S4: */ TB_SHUTDOWN_S4,
+		/* S5: */ TB_SHUTDOWN_S5 };
+
+	if (!tboot_enabled())
+		return 0;
+
+	tboot_copy_fadt(&acpi_gbl_FADT);
+	tboot->acpi_sinfo.pm1a_cnt_val = pm1a_control;
+	tboot->acpi_sinfo.pm1b_cnt_val = pm1b_control;
+	/* we always use the 32b wakeup vector */
+	tboot->acpi_sinfo.vector_width = 32;
+
+	if (sleep_state >= ACPI_S_STATE_COUNT ||
+	    acpi_shutdown_map[sleep_state] == -1) {
+		pr_warning("unsupported sleep state 0x%x\n", sleep_state);
+		return -1;
+	}
+
+	tboot_shutdown(acpi_shutdown_map[sleep_state]);
+	return 0;
+}
+
+static int tboot_extended_sleep(u8 sleep_state, u32 val_a, u32 val_b)
+{
+	if (!tboot_enabled())
+		return 0;
+
+	pr_warning("tboot is not able to suspend on platforms with reduced hardware sleep (ACPIv5)\n");
+	return -ENODEV;
+}
+
+static atomic_t ap_wfs_count;
+
+static int tboot_wait_for_aps(int num_aps)
+{
+	unsigned long timeout;
+
+	timeout = AP_WAIT_TIMEOUT*HZ;
+	while (atomic_read((atomic_t *)&tboot->num_in_wfs) != num_aps &&
+	       timeout) {
+		mdelay(1);
+		timeout--;
+	}
+
+	/* warn only when the wait actually timed out */
+	if (!timeout)
+		pr_warning("tboot wait for APs timeout\n");
+
+	return !(atomic_read((atomic_t *)&tboot->num_in_wfs) == num_aps);
+}
+
+static int tboot_cpu_callback(struct notifier_block *nfb, unsigned long action,
+			void *hcpu)
+{
+	switch (action) {
+	case CPU_DYING:
+		atomic_inc(&ap_wfs_count);
+		if (num_online_cpus() == 1)
+			if (tboot_wait_for_aps(atomic_read(&ap_wfs_count)))
+				return NOTIFY_BAD;
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block tboot_cpu_notifier =
+{
+	.notifier_call = tboot_cpu_callback,
+};
+
+#ifdef CONFIG_DEBUG_FS
+
+#define TBOOT_LOG_UUID	{ 0x26, 0x25, 0x19, 0xc0, 0x30, 0x6b, 0xb4, 0x4d, \
+			  0x4c, 0x84, 0xa3, 0xe9, 0x53, 0xb8, 0x81, 0x74 }
+
+#define TBOOT_SERIAL_LOG_ADDR	0x60000
+#define TBOOT_SERIAL_LOG_SIZE	0x08000
+#define LOG_MAX_SIZE_OFF	16
+#define LOG_BUF_OFF		24
+
+static uint8_t tboot_log_uuid[16] = TBOOT_LOG_UUID;
+
+static ssize_t tboot_log_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos)
+{
+	void __iomem *log_base;
+	u8 log_uuid[16];
+	u32 max_size;
+	void *kbuf;
+	int ret = -EFAULT;
+
+	log_base = ioremap_nocache(TBOOT_SERIAL_LOG_ADDR, TBOOT_SERIAL_LOG_SIZE);
+	if (!log_base)
+		return ret;
+
+	memcpy_fromio(log_uuid, log_base, sizeof(log_uuid));
+	if (memcmp(&tboot_log_uuid, log_uuid, sizeof(log_uuid)))
+		goto err_iounmap;
+
+	max_size = readl(log_base + LOG_MAX_SIZE_OFF);
+	if (*ppos >= max_size) {
+		ret = 0;
+		goto err_iounmap;
+	}
+
+	if (*ppos + count > max_size)
+		count = max_size - *ppos;
+
+	kbuf = kmalloc(count, GFP_KERNEL);
+	if (!kbuf) {
+		ret = -ENOMEM;
+		goto err_iounmap;
+	}
+
+	memcpy_fromio(kbuf, log_base + LOG_BUF_OFF + *ppos, count);
+	if (copy_to_user(user_buf, kbuf, count))
+		goto err_kfree;
+
+	*ppos += count;
+
+	ret = count;
+
+err_kfree:
+	kfree(kbuf);
+
+err_iounmap:
+	iounmap(log_base);
+
+	return ret;
+}
+
+static const struct file_operations tboot_log_fops = {
+	.read	= tboot_log_read,
+	.llseek	= default_llseek,
+};
+
+#endif /* CONFIG_DEBUG_FS */
+
+static __init int tboot_late_init(void)
+{
+	if (!tboot_enabled())
+		return 0;
+
+	tboot_create_trampoline();
+
+	atomic_set(&ap_wfs_count, 0);
+	register_hotcpu_notifier(&tboot_cpu_notifier);
+
+#ifdef 
CONFIG_DEBUG_FS + debugfs_create_file("tboot_log", S_IRUSR, + arch_debugfs_dir, NULL, &tboot_log_fops); +#endif + + acpi_os_set_prepare_sleep(&tboot_sleep); + acpi_os_set_prepare_extended_sleep(&tboot_extended_sleep); + return 0; +} + +late_initcall(tboot_late_init); + +/* + * TXT configuration registers (offsets from TXT_{PUB, PRIV}_CONFIG_REGS_BASE) + */ + +#define TXT_PUB_CONFIG_REGS_BASE 0xfed30000 +#define TXT_PRIV_CONFIG_REGS_BASE 0xfed20000 + +/* # pages for each config regs space - used by fixmap */ +#define NR_TXT_CONFIG_PAGES ((TXT_PUB_CONFIG_REGS_BASE - \ + TXT_PRIV_CONFIG_REGS_BASE) >> PAGE_SHIFT) + +/* offsets from pub/priv config space */ +#define TXTCR_HEAP_BASE 0x0300 +#define TXTCR_HEAP_SIZE 0x0308 + +#define SHA1_SIZE 20 + +struct sha1_hash { + u8 hash[SHA1_SIZE]; +}; + +struct sinit_mle_data { + u32 version; /* currently 6 */ + struct sha1_hash bios_acm_id; + u32 edx_senter_flags; + u64 mseg_valid; + struct sha1_hash sinit_hash; + struct sha1_hash mle_hash; + struct sha1_hash stm_hash; + struct sha1_hash lcp_policy_hash; + u32 lcp_policy_control; + u32 rlp_wakeup_addr; + u32 reserved; + u32 num_mdrs; + u32 mdrs_off; + u32 num_vtd_dmars; + u32 vtd_dmars_off; +} __packed; + +struct acpi_table_header *tboot_get_dmar_table(struct acpi_table_header *dmar_tbl) +{ + void *heap_base, *heap_ptr, *config; + + if (!tboot_enabled()) + return dmar_tbl; + + /* + * ACPI tables may not be DMA protected by tboot, so use DMAR copy + * SINIT saved in SinitMleData in TXT heap (which is DMA protected) + */ + + /* map config space in order to get heap addr */ + config = ioremap(TXT_PUB_CONFIG_REGS_BASE, NR_TXT_CONFIG_PAGES * + PAGE_SIZE); + if (!config) + return NULL; + + /* now map TXT heap */ + heap_base = ioremap(*(u64 *)(config + TXTCR_HEAP_BASE), + *(u64 *)(config + TXTCR_HEAP_SIZE)); + iounmap(config); + if (!heap_base) + return NULL; + + /* walk heap to SinitMleData */ + /* skip BiosData */ + heap_ptr = heap_base + *(u64 *)heap_base; + /* skip OsMleData */ + heap_ptr += *(u64 *)heap_ptr; + /* skip OsSinitData */ + heap_ptr += *(u64 *)heap_ptr; + /* now points to SinitMleDataSize; set to SinitMleData */ + heap_ptr += sizeof(u64); + /* get addr of DMAR table */ + dmar_tbl = (struct acpi_table_header *)(heap_ptr + + ((struct sinit_mle_data *)heap_ptr)->vtd_dmars_off - + sizeof(u64)); + + /* don't unmap heap because dmar.c needs access to this */ + + return dmar_tbl; +} + +int tboot_force_iommu(void) +{ + if (!tboot_enabled()) + return 0; + + if (no_iommu || swiotlb || dmar_disabled) + pr_warning("Forcing Intel-IOMMU to enabled\n"); + + dmar_disabled = 0; +#ifdef CONFIG_SWIOTLB + swiotlb = 0; +#endif + no_iommu = 0; + + return 1; +} diff --git a/arch/x86/kernel/tce_64.c b/arch/x86/kernel/tce_64.c new file mode 100644 index 00000000000..ab40954e113 --- /dev/null +++ b/arch/x86/kernel/tce_64.c @@ -0,0 +1,190 @@ +/* + * This file manages the translation entries for the IBM Calgary IOMMU. + * + * Derived from arch/powerpc/platforms/pseries/iommu.c + * + * Copyright (C) IBM Corporation, 2006 + * + * Author: Jon Mason <jdmason@us.ibm.com> + * Author: Muli Ben-Yehuda <muli@il.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/mm.h> +#include <linux/spinlock.h> +#include <linux/string.h> +#include <linux/pci.h> +#include <linux/dma-mapping.h> +#include <linux/bootmem.h> +#include <asm/tce.h> +#include <asm/calgary.h> +#include <asm/proto.h> +#include <asm/cacheflush.h> + +/* flush a tce at 'tceaddr' to main memory */ +static inline void flush_tce(void* tceaddr) +{ + /* a single tce can't cross a cache line */ + if (cpu_has_clflush) + clflush(tceaddr); + else + wbinvd(); +} + +void tce_build(struct iommu_table *tbl, unsigned long index, + unsigned int npages, unsigned long uaddr, int direction) +{ + u64* tp; + u64 t; + u64 rpn; + + t = (1 << TCE_READ_SHIFT); + if (direction != DMA_TO_DEVICE) + t |= (1 << TCE_WRITE_SHIFT); + + tp = ((u64*)tbl->it_base) + index; + + while (npages--) { + rpn = (virt_to_bus((void*)uaddr)) >> PAGE_SHIFT; + t &= ~TCE_RPN_MASK; + t |= (rpn << TCE_RPN_SHIFT); + + *tp = cpu_to_be64(t); + flush_tce(tp); + + uaddr += PAGE_SIZE; + tp++; + } +} + +void tce_free(struct iommu_table *tbl, long index, unsigned int npages) +{ + u64* tp; + + tp = ((u64*)tbl->it_base) + index; + + while (npages--) { + *tp = cpu_to_be64(0); + flush_tce(tp); + tp++; + } +} + +static inline unsigned int table_size_to_number_of_entries(unsigned char size) +{ + /* + * size is the order of the table, 0-7 + * smallest table is 8K entries, so shift result by 13 to + * multiply by 8K + */ + return (1 << size) << 13; +} + +static int tce_table_setparms(struct pci_dev *dev, struct iommu_table *tbl) +{ + unsigned int bitmapsz; + unsigned long bmppages; + int ret; + + tbl->it_busno = dev->bus->number; + + /* set the tce table size - measured in entries */ + tbl->it_size = table_size_to_number_of_entries(specified_table_size); + + /* + * number of bytes needed for the bitmap size in number of + * entries; we need one bit per entry + */ + bitmapsz = tbl->it_size / BITS_PER_BYTE; + bmppages = __get_free_pages(GFP_KERNEL, get_order(bitmapsz)); + if (!bmppages) { + printk(KERN_ERR "Calgary: cannot allocate bitmap\n"); + ret = -ENOMEM; + goto done; + } + + tbl->it_map = (unsigned long*)bmppages; + + memset(tbl->it_map, 0, bitmapsz); + + tbl->it_hint = 0; + + spin_lock_init(&tbl->it_lock); + + return 0; + +done: + return ret; +} + +int __init build_tce_table(struct pci_dev *dev, void __iomem *bbar) +{ + struct iommu_table *tbl; + int ret; + + if (pci_iommu(dev->bus)) { + printk(KERN_ERR "Calgary: dev %p has sysdata->iommu %p\n", + dev, pci_iommu(dev->bus)); + BUG(); + } + + tbl = kzalloc(sizeof(struct iommu_table), GFP_KERNEL); + if (!tbl) { + printk(KERN_ERR "Calgary: error allocating iommu_table\n"); + ret = -ENOMEM; + goto done; + } + + ret = tce_table_setparms(dev, tbl); + if (ret) + goto free_tbl; + + tbl->bbar = bbar; + + set_pci_iommu(dev->bus, tbl); + + return 0; + +free_tbl: + kfree(tbl); +done: + return ret; +} + +void * __init alloc_tce_table(void) +{ + unsigned int size; + + size = table_size_to_number_of_entries(specified_table_size); + size *= TCE_ENTRY_SIZE; + + return 
__alloc_bootmem_low(size, size, 0); +} + +void __init free_tce_table(void *tbl) +{ + unsigned int size; + + if (!tbl) + return; + + size = table_size_to_number_of_entries(specified_table_size); + size *= TCE_ENTRY_SIZE; + + free_bootmem(__pa(tbl), size); +} diff --git a/arch/x86/kernel/test_nx.c b/arch/x86/kernel/test_nx.c new file mode 100644 index 00000000000..3f92ce07e52 --- /dev/null +++ b/arch/x86/kernel/test_nx.c @@ -0,0 +1,175 @@ +/* + * test_nx.c: functional test for NX functionality + * + * (C) Copyright 2008 Intel Corporation + * Author: Arjan van de Ven <arjan@linux.intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ +#include <linux/module.h> +#include <linux/sort.h> +#include <linux/slab.h> + +#include <asm/uaccess.h> +#include <asm/asm.h> + +extern int rodata_test_data; + +/* + * This file checks 4 things: + * 1) Check if the stack is not executable + * 2) Check if kmalloc memory is not executable + * 3) Check if the .rodata section is not executable + * 4) Check if the .data section of a module is not executable + * + * To do this, the test code tries to execute memory in stack/kmalloc/etc, + * and then checks if the expected trap happens. + * + * Sadly, this implies having a dynamic exception handling table entry. + * ... which can be done (and will make Rusty cry)... but it can only + * be done in a stand-alone module with only 1 entry total. + * (otherwise we'd have to sort and that's just too messy) + */ + + + +/* + * We want to set up an exception handling point on our stack, + * which means a variable value. This function is rather dirty + * and walks the exception table of the module, looking for a magic + * marker and replaces it with a specific function. + */ +static void fudze_exception_table(void *marker, void *new) +{ + struct module *mod = THIS_MODULE; + struct exception_table_entry *extable; + + /* + * Note: This module has only 1 exception table entry, + * so searching and sorting is not needed. If that changes, + * this would be the place to search and re-sort the exception + * table. + */ + if (mod->num_exentries > 1) { + printk(KERN_ERR "test_nx: too many exception table entries!\n"); + printk(KERN_ERR "test_nx: test results are not reliable.\n"); + return; + } + extable = (struct exception_table_entry *)mod->extable; + extable[0].insn = (unsigned long)new; +} + + +/* + * exception tables get their symbols translated so we need + * to use a fake function to put in there, which we can then + * replace at runtime. + */ +void foo_label(void); + +/* + * returns 0 for not-executable, negative for executable + * + * Note: we cannot allow this function to be inlined, because + * that would give us more than 1 exception table entry. + * This in turn would break the assumptions above. 
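+ * How the probe works: the indirect call below jumps to the byte under
+ * test. If the page is executable, the 0xC3 ("ret") placed there simply
+ * returns and result stays nonzero; if the page is NX-protected, the
+ * fault is routed through the exception table to the 2: fixup label,
+ * which clears result.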
+ */ +static noinline int test_address(void *address) +{ + unsigned long result; + + /* Set up an exception table entry for our address */ + fudze_exception_table(&foo_label, address); + result = 1; + asm volatile( + "foo_label:\n" + "0: call *%[fake_code]\n" + "1:\n" + ".section .fixup,\"ax\"\n" + "2: mov %[zero], %[rslt]\n" + " ret\n" + ".previous\n" + _ASM_EXTABLE(0b,2b) + : [rslt] "=r" (result) + : [fake_code] "r" (address), [zero] "r" (0UL), "0" (result) + ); + /* change the exception table back for the next round */ + fudze_exception_table(address, &foo_label); + + if (result) + return -ENODEV; + return 0; +} + +static unsigned char test_data = 0xC3; /* 0xC3 is the opcode for "ret" */ + +static int test_NX(void) +{ + int ret = 0; + /* 0xC3 is the opcode for "ret" */ + char stackcode[] = {0xC3, 0x90, 0 }; + char *heap; + + test_data = 0xC3; + + printk(KERN_INFO "Testing NX protection\n"); + + /* Test 1: check if the stack is not executable */ + if (test_address(&stackcode)) { + printk(KERN_ERR "test_nx: stack was executable\n"); + ret = -ENODEV; + } + + + /* Test 2: Check if the heap is executable */ + heap = kmalloc(64, GFP_KERNEL); + if (!heap) + return -ENOMEM; + heap[0] = 0xC3; /* opcode for "ret" */ + + if (test_address(heap)) { + printk(KERN_ERR "test_nx: heap was executable\n"); + ret = -ENODEV; + } + kfree(heap); + + /* + * The following 2 tests currently fail, this needs to get fixed + * Until then, don't run them to avoid too many people getting scared + * by the error message + */ + +#ifdef CONFIG_DEBUG_RODATA + /* Test 3: Check if the .rodata section is executable */ + if (rodata_test_data != 0xC3) { + printk(KERN_ERR "test_nx: .rodata marker has invalid value\n"); + ret = -ENODEV; + } else if (test_address(&rodata_test_data)) { + printk(KERN_ERR "test_nx: .rodata section is executable\n"); + ret = -ENODEV; + } +#endif + +#if 0 + /* Test 4: Check if the .data section of a module is executable */ + if (test_address(&test_data)) { + printk(KERN_ERR "test_nx: .data section is executable\n"); + ret = -ENODEV; + } + +#endif + return ret; +} + +static void test_exit(void) +{ +} + +module_init(test_NX); +module_exit(test_exit); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Testcase for the NX infrastructure"); +MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>"); diff --git a/arch/x86/kernel/test_rodata.c b/arch/x86/kernel/test_rodata.c new file mode 100644 index 00000000000..b79133abda4 --- /dev/null +++ b/arch/x86/kernel/test_rodata.c @@ -0,0 +1,80 @@ +/* + * test_rodata.c: functional test for mark_rodata_ro function + * + * (C) Copyright 2008 Intel Corporation + * Author: Arjan van de Ven <arjan@linux.intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. 
+ */
+#include <linux/module.h>
+#include <asm/cacheflush.h>
+#include <asm/sections.h>
+#include <asm/asm.h>
+
+int rodata_test(void)
+{
+	unsigned long result;
+	unsigned long start, end;
+
+	/* test 1: read the value */
+	/* If this test fails, some previous testrun has clobbered the state */
+	if (!rodata_test_data) {
+		printk(KERN_ERR "rodata_test: test 1 fails (start data)\n");
+		return -ENODEV;
+	}
+
+	/* test 2: write to the variable; this should fault */
+	/*
+	 * If this test fails, we managed to overwrite the data
+	 *
+	 * This is written in assembly to be able to catch the
+	 * exception that is supposed to happen in the correct
+	 * case
+	 */
+
+	result = 1;
+	asm volatile(
+		"0:	mov %[zero],(%[rodata_test])\n"
+		"	mov %[zero], %[rslt]\n"
+		"1:\n"
+		".section .fixup,\"ax\"\n"
+		"2:	jmp 1b\n"
+		".previous\n"
+		_ASM_EXTABLE(0b,2b)
+		: [rslt] "=r" (result)
+		: [rodata_test] "r" (&rodata_test_data), [zero] "r" (0UL)
+	);
+
+	if (!result) {
+		printk(KERN_ERR "rodata_test: test data was not read only\n");
+		return -ENODEV;
+	}
+
+	/* test 3: check the value hasn't changed */
+	/* If this test fails, we managed to overwrite the data */
+	if (!rodata_test_data) {
+		printk(KERN_ERR "rodata_test: test 3 fails (end data)\n");
+		return -ENODEV;
+	}
+	/* test 4: check if the rodata section is 4Kb aligned */
+	start = (unsigned long)__start_rodata;
+	end = (unsigned long)__end_rodata;
+	if (start & (PAGE_SIZE - 1)) {
+		printk(KERN_ERR "rodata_test: .rodata is not 4k aligned\n");
+		return -ENODEV;
+	}
+	if (end & (PAGE_SIZE - 1)) {
+		printk(KERN_ERR "rodata_test: .rodata end is not 4k aligned\n");
+		return -ENODEV;
+	}
+
+	return 0;
+}
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Testcase for the DEBUG_RODATA infrastructure");
+MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
new file mode 100644
index 00000000000..bf7ef5ce29d
--- /dev/null
+++ b/arch/x86/kernel/time.c
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 1991,1992,1995  Linus Torvalds
+ * Copyright (c) 1994  Alan Modra
+ * Copyright (c) 1995  Markus Kuhn
+ * Copyright (c) 1996  Ingo Molnar
+ * Copyright (c) 1998  Andrea Arcangeli
+ * Copyright (c) 2002,2006  Vojtech Pavlik
+ * Copyright (c) 2003  Andi Kleen
+ *
+ */
+
+#include <linux/clockchips.h>
+#include <linux/interrupt.h>
+#include <linux/i8253.h>
+#include <linux/time.h>
+#include <linux/export.h>
+
+#include <asm/vsyscall.h>
+#include <asm/x86_init.h>
+#include <asm/i8259.h>
+#include <asm/timer.h>
+#include <asm/hpet.h>
+#include <asm/time.h>
+
+#ifdef CONFIG_X86_64
+__visible DEFINE_VVAR(volatile unsigned long, jiffies) = INITIAL_JIFFIES;
+#endif
+
+unsigned long profile_pc(struct pt_regs *regs)
+{
+	unsigned long pc = instruction_pointer(regs);
+
+	if (!user_mode_vm(regs) && in_lock_functions(pc)) {
+#ifdef CONFIG_FRAME_POINTER
+		return *(unsigned long *)(regs->bp + sizeof(long));
+#else
+		unsigned long *sp =
+			(unsigned long *)kernel_stack_pointer(regs);
+		/*
+		 * Return address is either directly at the stack pointer
+		 * or above a saved flags word. EFLAGS has bits 22-31
+		 * zero, kernel addresses don't.
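+		 * For example, a saved EFLAGS word such as 0x00000246
+		 * has bits 22-31 clear (the highest architecturally
+		 * defined flag is ID, bit 21), while kernel text
+		 * addresses like 0xc1xxxxxx or 0xffffffff81xxxxxx
+		 * always have bits set in that range.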
+ */ + if (sp[0] >> 22) + return sp[0]; + if (sp[1] >> 22) + return sp[1]; +#endif + } + return pc; +} +EXPORT_SYMBOL(profile_pc); + +/* + * Default timer interrupt handler for PIT/HPET + */ +static irqreturn_t timer_interrupt(int irq, void *dev_id) +{ + global_clock_event->event_handler(global_clock_event); + return IRQ_HANDLED; +} + +static struct irqaction irq0 = { + .handler = timer_interrupt, + .flags = IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER, + .name = "timer" +}; + +void __init setup_default_timer_irq(void) +{ + setup_irq(0, &irq0); +} + +/* Default timer init function */ +void __init hpet_time_init(void) +{ + if (!hpet_enable()) + setup_pit_timer(); + setup_default_timer_irq(); +} + +static __init void x86_late_time_init(void) +{ + x86_init.timers.timer_init(); + tsc_init(); +} + +/* + * Initialize TSC and delay the periodic timer init to + * late x86_late_time_init() so ioremap works. + */ +void __init time_init(void) +{ + late_time_init = x86_late_time_init; +} diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c new file mode 100644 index 00000000000..f7fec09e3e3 --- /dev/null +++ b/arch/x86/kernel/tls.c @@ -0,0 +1,213 @@ +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/sched.h> +#include <linux/user.h> +#include <linux/regset.h> +#include <linux/syscalls.h> + +#include <asm/uaccess.h> +#include <asm/desc.h> +#include <asm/ldt.h> +#include <asm/processor.h> +#include <asm/proto.h> + +#include "tls.h" + +/* + * sys_alloc_thread_area: get a yet unused TLS descriptor index. + */ +static int get_free_idx(void) +{ + struct thread_struct *t = ¤t->thread; + int idx; + + for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++) + if (desc_empty(&t->tls_array[idx])) + return idx + GDT_ENTRY_TLS_MIN; + return -ESRCH; +} + +static void set_tls_desc(struct task_struct *p, int idx, + const struct user_desc *info, int n) +{ + struct thread_struct *t = &p->thread; + struct desc_struct *desc = &t->tls_array[idx - GDT_ENTRY_TLS_MIN]; + int cpu; + + /* + * We must not get preempted while modifying the TLS. 
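+	 * load_TLS() copies tls_array into the GDT of the CPU returned by
+	 * get_cpu(); if we could be preempted and migrated in between, we
+	 * would patch the old CPU's GDT while running on a different one.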
+ */ + cpu = get_cpu(); + + while (n-- > 0) { + if (LDT_empty(info)) + desc->a = desc->b = 0; + else + fill_ldt(desc, info); + ++info; + ++desc; + } + + if (t == ¤t->thread) + load_TLS(t, cpu); + + put_cpu(); +} + +/* + * Set a given TLS descriptor: + */ +int do_set_thread_area(struct task_struct *p, int idx, + struct user_desc __user *u_info, + int can_allocate) +{ + struct user_desc info; + + if (copy_from_user(&info, u_info, sizeof(info))) + return -EFAULT; + + if (idx == -1) + idx = info.entry_number; + + /* + * index -1 means the kernel should try to find and + * allocate an empty descriptor: + */ + if (idx == -1 && can_allocate) { + idx = get_free_idx(); + if (idx < 0) + return idx; + if (put_user(idx, &u_info->entry_number)) + return -EFAULT; + } + + if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) + return -EINVAL; + + set_tls_desc(p, idx, &info, 1); + + return 0; +} + +SYSCALL_DEFINE1(set_thread_area, struct user_desc __user *, u_info) +{ + return do_set_thread_area(current, -1, u_info, 1); +} + + +/* + * Get the current Thread-Local Storage area: + */ + +static void fill_user_desc(struct user_desc *info, int idx, + const struct desc_struct *desc) + +{ + memset(info, 0, sizeof(*info)); + info->entry_number = idx; + info->base_addr = get_desc_base(desc); + info->limit = get_desc_limit(desc); + info->seg_32bit = desc->d; + info->contents = desc->type >> 2; + info->read_exec_only = !(desc->type & 2); + info->limit_in_pages = desc->g; + info->seg_not_present = !desc->p; + info->useable = desc->avl; +#ifdef CONFIG_X86_64 + info->lm = desc->l; +#endif +} + +int do_get_thread_area(struct task_struct *p, int idx, + struct user_desc __user *u_info) +{ + struct user_desc info; + + if (idx == -1 && get_user(idx, &u_info->entry_number)) + return -EFAULT; + + if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) + return -EINVAL; + + fill_user_desc(&info, idx, + &p->thread.tls_array[idx - GDT_ENTRY_TLS_MIN]); + + if (copy_to_user(u_info, &info, sizeof(info))) + return -EFAULT; + return 0; +} + +SYSCALL_DEFINE1(get_thread_area, struct user_desc __user *, u_info) +{ + return do_get_thread_area(current, -1, u_info); +} + +int regset_tls_active(struct task_struct *target, + const struct user_regset *regset) +{ + struct thread_struct *t = &target->thread; + int n = GDT_ENTRY_TLS_ENTRIES; + while (n > 0 && desc_empty(&t->tls_array[n - 1])) + --n; + return n; +} + +int regset_tls_get(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + const struct desc_struct *tls; + + if (pos >= GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) || + (pos % sizeof(struct user_desc)) != 0 || + (count % sizeof(struct user_desc)) != 0) + return -EINVAL; + + pos /= sizeof(struct user_desc); + count /= sizeof(struct user_desc); + + tls = &target->thread.tls_array[pos]; + + if (kbuf) { + struct user_desc *info = kbuf; + while (count-- > 0) + fill_user_desc(info++, GDT_ENTRY_TLS_MIN + pos++, + tls++); + } else { + struct user_desc __user *u_info = ubuf; + while (count-- > 0) { + struct user_desc info; + fill_user_desc(&info, GDT_ENTRY_TLS_MIN + pos++, tls++); + if (__copy_to_user(u_info++, &info, sizeof(info))) + return -EFAULT; + } + } + + return 0; +} + +int regset_tls_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + struct user_desc infobuf[GDT_ENTRY_TLS_ENTRIES]; + const struct user_desc *info; + + if (pos >= 
GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) ||
+	    (pos % sizeof(struct user_desc)) != 0 ||
+	    (count % sizeof(struct user_desc)) != 0)
+		return -EINVAL;
+
+	if (kbuf)
+		info = kbuf;
+	else if (__copy_from_user(infobuf, ubuf, count))
+		return -EFAULT;
+	else
+		info = infobuf;
+
+	set_tls_desc(target,
+		     GDT_ENTRY_TLS_MIN + (pos / sizeof(struct user_desc)),
+		     info, count / sizeof(struct user_desc));
+
+	return 0;
+}
diff --git a/arch/x86/kernel/tls.h b/arch/x86/kernel/tls.h
new file mode 100644
index 00000000000..2f083a2fe21
--- /dev/null
+++ b/arch/x86/kernel/tls.h
@@ -0,0 +1,22 @@
+/*
+ * Internal declarations for x86 TLS implementation functions.
+ *
+ * Copyright (C) 2007 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License v.2.
+ *
+ * Red Hat Author: Roland McGrath.
+ */
+
+#ifndef _ARCH_X86_KERNEL_TLS_H
+#define _ARCH_X86_KERNEL_TLS_H
+
+#include <linux/regset.h>
+
+extern user_regset_active_fn regset_tls_active;
+extern user_regset_get_fn regset_tls_get;
+extern user_regset_set_fn regset_tls_set;
+
+#endif /* _ARCH_X86_KERNEL_TLS_H */
diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c
new file mode 100644
index 00000000000..649b010da00
--- /dev/null
+++ b/arch/x86/kernel/topology.c
@@ -0,0 +1,173 @@
+/*
+ * Populate sysfs with topology information
+ *
+ * Written by: Matthew Dobson, IBM Corporation
+ * Original Code: Paul Dorwin, IBM Corporation, Patrick Mochel, OSDL
+ *
+ * Copyright (C) 2002, IBM Corp.
+ *
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Send feedback to <colpatch@us.ibm.com>
+ */
+#include <linux/nodemask.h>
+#include <linux/export.h>
+#include <linux/mmzone.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+#include <linux/irq.h>
+#include <asm/cpu.h>
+
+static DEFINE_PER_CPU(struct x86_cpu, cpu_devices);
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+#ifdef CONFIG_BOOTPARAM_HOTPLUG_CPU0
+static int cpu0_hotpluggable = 1;
+#else
+static int cpu0_hotpluggable;
+static int __init enable_cpu0_hotplug(char *str)
+{
+	cpu0_hotpluggable = 1;
+	return 1;
+}
+
+__setup("cpu0_hotplug", enable_cpu0_hotplug);
+#endif
+
+#ifdef CONFIG_DEBUG_HOTPLUG_CPU0
+/*
+ * This function offlines a CPU as early as possible and allows userspace to
+ * boot up without the CPU. The CPU can be onlined back by the user after
+ * boot.
+ *
+ * This is only called for debugging the CPU offline/online feature.
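+ * Usage sketch (assuming the standard CPU sysfs layout): CPU0 is taken
+ * down by the late_initcall below, and can be brought back from userspace
+ * with:
+ *
+ *	echo 1 > /sys/devices/system/cpu/cpu0/online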
+ */ +int __ref _debug_hotplug_cpu(int cpu, int action) +{ + struct device *dev = get_cpu_device(cpu); + int ret; + + if (!cpu_is_hotpluggable(cpu)) + return -EINVAL; + + lock_device_hotplug(); + + switch (action) { + case 0: + ret = cpu_down(cpu); + if (!ret) { + pr_info("CPU %u is now offline\n", cpu); + dev->offline = true; + kobject_uevent(&dev->kobj, KOBJ_OFFLINE); + } else + pr_debug("Can't offline CPU%d.\n", cpu); + break; + case 1: + ret = cpu_up(cpu); + if (!ret) { + dev->offline = false; + kobject_uevent(&dev->kobj, KOBJ_ONLINE); + } else { + pr_debug("Can't online CPU%d.\n", cpu); + } + break; + default: + ret = -EINVAL; + } + + unlock_device_hotplug(); + + return ret; +} + +static int __init debug_hotplug_cpu(void) +{ + _debug_hotplug_cpu(0, 0); + return 0; +} + +late_initcall_sync(debug_hotplug_cpu); +#endif /* CONFIG_DEBUG_HOTPLUG_CPU0 */ + +int __ref arch_register_cpu(int num) +{ + struct cpuinfo_x86 *c = &cpu_data(num); + + /* + * Currently CPU0 is only hotpluggable on Intel platforms. Other + * vendors can add hotplug support later. + */ + if (c->x86_vendor != X86_VENDOR_INTEL) + cpu0_hotpluggable = 0; + + /* + * Two known BSP/CPU0 dependencies: Resume from suspend/hibernate + * depends on BSP. PIC interrupts depend on BSP. + * + * If the BSP depencies are under control, one can tell kernel to + * enable BSP hotplug. This basically adds a control file and + * one can attempt to offline BSP. + */ + if (num == 0 && cpu0_hotpluggable) { + unsigned int irq; + /* + * We won't take down the boot processor on i386 if some + * interrupts only are able to be serviced by the BSP in PIC. + */ + for_each_active_irq(irq) { + if (!IO_APIC_IRQ(irq) && irq_has_action(irq)) { + cpu0_hotpluggable = 0; + break; + } + } + } + if (num || cpu0_hotpluggable) + per_cpu(cpu_devices, num).cpu.hotpluggable = 1; + + return register_cpu(&per_cpu(cpu_devices, num).cpu, num); +} +EXPORT_SYMBOL(arch_register_cpu); + +void arch_unregister_cpu(int num) +{ + unregister_cpu(&per_cpu(cpu_devices, num).cpu); +} +EXPORT_SYMBOL(arch_unregister_cpu); +#else /* CONFIG_HOTPLUG_CPU */ + +static int __init arch_register_cpu(int num) +{ + return register_cpu(&per_cpu(cpu_devices, num).cpu, num); +} +#endif /* CONFIG_HOTPLUG_CPU */ + +static int __init topology_init(void) +{ + int i; + +#ifdef CONFIG_NUMA + for_each_online_node(i) + register_one_node(i); +#endif + + for_each_present_cpu(i) + arch_register_cpu(i); + + return 0; +} +subsys_initcall(topology_init); diff --git a/arch/x86/kernel/trace_clock.c b/arch/x86/kernel/trace_clock.c new file mode 100644 index 00000000000..25b993729f9 --- /dev/null +++ b/arch/x86/kernel/trace_clock.c @@ -0,0 +1,21 @@ +/* + * X86 trace clocks + */ +#include <asm/trace_clock.h> +#include <asm/barrier.h> +#include <asm/msr.h> + +/* + * trace_clock_x86_tsc(): A clock that is just the cycle counter. + * + * Unlike the other clocks, this is not in nanoseconds. + */ +u64 notrace trace_clock_x86_tsc(void) +{ + u64 ret; + + rdtsc_barrier(); + rdtscll(ret); + + return ret; +} diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c new file mode 100644 index 00000000000..1c113db9ed5 --- /dev/null +++ b/arch/x86/kernel/tracepoint.c @@ -0,0 +1,59 @@ +/* + * Code for supporting irq vector tracepoints. 
+ * + * Copyright (C) 2013 Seiji Aguchi <seiji.aguchi@hds.com> + * + */ +#include <asm/hw_irq.h> +#include <asm/desc.h> +#include <linux/atomic.h> + +atomic_t trace_idt_ctr = ATOMIC_INIT(0); +struct desc_ptr trace_idt_descr = { NR_VECTORS * 16 - 1, + (unsigned long) trace_idt_table }; + +/* No need to be aligned, but done to keep all IDTs defined the same way. */ +gate_desc trace_idt_table[NR_VECTORS] __page_aligned_bss; + +static int trace_irq_vector_refcount; +static DEFINE_MUTEX(irq_vector_mutex); + +static void set_trace_idt_ctr(int val) +{ + atomic_set(&trace_idt_ctr, val); + /* Ensure the trace_idt_ctr is set before sending IPI */ + wmb(); +} + +static void switch_idt(void *arg) +{ + unsigned long flags; + + local_irq_save(flags); + load_current_idt(); + local_irq_restore(flags); +} + +void trace_irq_vector_regfunc(void) +{ + mutex_lock(&irq_vector_mutex); + if (!trace_irq_vector_refcount) { + set_trace_idt_ctr(1); + smp_call_function(switch_idt, NULL, 0); + switch_idt(NULL); + } + trace_irq_vector_refcount++; + mutex_unlock(&irq_vector_mutex); +} + +void trace_irq_vector_unregfunc(void) +{ + mutex_lock(&irq_vector_mutex); + trace_irq_vector_refcount--; + if (!trace_irq_vector_refcount) { + set_trace_idt_ctr(0); + smp_call_function(switch_idt, NULL, 0); + switch_idt(NULL); + } + mutex_unlock(&irq_vector_mutex); +} diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c new file mode 100644 index 00000000000..0d0e922fafc --- /dev/null +++ b/arch/x86/kernel/traps.c @@ -0,0 +1,825 @@ +/* + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs + * + * Pentium III FXSR, SSE support + * Gareth Hughes <gareth@valinux.com>, May 2000 + */ + +/* + * Handle hardware traps and faults. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/context_tracking.h> +#include <linux/interrupt.h> +#include <linux/kallsyms.h> +#include <linux/spinlock.h> +#include <linux/kprobes.h> +#include <linux/uaccess.h> +#include <linux/kdebug.h> +#include <linux/kgdb.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/ptrace.h> +#include <linux/uprobes.h> +#include <linux/string.h> +#include <linux/delay.h> +#include <linux/errno.h> +#include <linux/kexec.h> +#include <linux/sched.h> +#include <linux/timer.h> +#include <linux/init.h> +#include <linux/bug.h> +#include <linux/nmi.h> +#include <linux/mm.h> +#include <linux/smp.h> +#include <linux/io.h> + +#ifdef CONFIG_EISA +#include <linux/ioport.h> +#include <linux/eisa.h> +#endif + +#if defined(CONFIG_EDAC) +#include <linux/edac.h> +#endif + +#include <asm/kmemcheck.h> +#include <asm/stacktrace.h> +#include <asm/processor.h> +#include <asm/debugreg.h> +#include <linux/atomic.h> +#include <asm/ftrace.h> +#include <asm/traps.h> +#include <asm/desc.h> +#include <asm/i387.h> +#include <asm/fpu-internal.h> +#include <asm/mce.h> +#include <asm/fixmap.h> +#include <asm/mach_traps.h> +#include <asm/alternative.h> + +#ifdef CONFIG_X86_64 +#include <asm/x86_init.h> +#include <asm/pgalloc.h> +#include <asm/proto.h> + +/* No need to be aligned, but done to keep all IDTs defined the same way. */ +gate_desc debug_idt_table[NR_VECTORS] __page_aligned_bss; +#else +#include <asm/processor-flags.h> +#include <asm/setup.h> + +asmlinkage int system_call(void); +#endif + +/* Must be page-aligned because the real IDT is used in a fixmap. 
*/ +gate_desc idt_table[NR_VECTORS] __page_aligned_bss; + +DECLARE_BITMAP(used_vectors, NR_VECTORS); +EXPORT_SYMBOL_GPL(used_vectors); + +static inline void conditional_sti(struct pt_regs *regs) +{ + if (regs->flags & X86_EFLAGS_IF) + local_irq_enable(); +} + +static inline void preempt_conditional_sti(struct pt_regs *regs) +{ + preempt_count_inc(); + if (regs->flags & X86_EFLAGS_IF) + local_irq_enable(); +} + +static inline void conditional_cli(struct pt_regs *regs) +{ + if (regs->flags & X86_EFLAGS_IF) + local_irq_disable(); +} + +static inline void preempt_conditional_cli(struct pt_regs *regs) +{ + if (regs->flags & X86_EFLAGS_IF) + local_irq_disable(); + preempt_count_dec(); +} + +static nokprobe_inline int +do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str, + struct pt_regs *regs, long error_code) +{ +#ifdef CONFIG_X86_32 + if (regs->flags & X86_VM_MASK) { + /* + * Traps 0, 1, 3, 4, and 5 should be forwarded to vm86. + * On nmi (interrupt 2), do_trap should not be called. + */ + if (trapnr < X86_TRAP_UD) { + if (!handle_vm86_trap((struct kernel_vm86_regs *) regs, + error_code, trapnr)) + return 0; + } + return -1; + } +#endif + if (!user_mode(regs)) { + if (!fixup_exception(regs)) { + tsk->thread.error_code = error_code; + tsk->thread.trap_nr = trapnr; + die(str, regs, error_code); + } + return 0; + } + + return -1; +} + +static siginfo_t *fill_trap_info(struct pt_regs *regs, int signr, int trapnr, + siginfo_t *info) +{ + unsigned long siaddr; + int sicode; + + switch (trapnr) { + default: + return SEND_SIG_PRIV; + + case X86_TRAP_DE: + sicode = FPE_INTDIV; + siaddr = uprobe_get_trap_addr(regs); + break; + case X86_TRAP_UD: + sicode = ILL_ILLOPN; + siaddr = uprobe_get_trap_addr(regs); + break; + case X86_TRAP_AC: + sicode = BUS_ADRALN; + siaddr = 0; + break; + } + + info->si_signo = signr; + info->si_errno = 0; + info->si_code = sicode; + info->si_addr = (void __user *)siaddr; + return info; +} + +static void +do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, + long error_code, siginfo_t *info) +{ + struct task_struct *tsk = current; + + + if (!do_trap_no_signal(tsk, trapnr, str, regs, error_code)) + return; + /* + * We want error_code and trap_nr set for userspace faults and + * kernelspace faults which result in die(), but not + * kernelspace faults which are fixed up. die() gives the + * process no chance to handle the signal and notice the + * kernel fault information, so that won't result in polluting + * the information about previously queued, but not yet + * delivered, faults. See also do_general_protection below. 
+ */ + tsk->thread.error_code = error_code; + tsk->thread.trap_nr = trapnr; + +#ifdef CONFIG_X86_64 + if (show_unhandled_signals && unhandled_signal(tsk, signr) && + printk_ratelimit()) { + pr_info("%s[%d] trap %s ip:%lx sp:%lx error:%lx", + tsk->comm, tsk->pid, str, + regs->ip, regs->sp, error_code); + print_vma_addr(" in ", regs->ip); + pr_cont("\n"); + } +#endif + + force_sig_info(signr, info ?: SEND_SIG_PRIV, tsk); +} +NOKPROBE_SYMBOL(do_trap); + +static void do_error_trap(struct pt_regs *regs, long error_code, char *str, + unsigned long trapnr, int signr) +{ + enum ctx_state prev_state = exception_enter(); + siginfo_t info; + + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) != + NOTIFY_STOP) { + conditional_sti(regs); + do_trap(trapnr, signr, str, regs, error_code, + fill_trap_info(regs, signr, trapnr, &info)); + } + + exception_exit(prev_state); +} + +#define DO_ERROR(trapnr, signr, str, name) \ +dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ +{ \ + do_error_trap(regs, error_code, str, trapnr, signr); \ +} + +DO_ERROR(X86_TRAP_DE, SIGFPE, "divide error", divide_error) +DO_ERROR(X86_TRAP_OF, SIGSEGV, "overflow", overflow) +DO_ERROR(X86_TRAP_BR, SIGSEGV, "bounds", bounds) +DO_ERROR(X86_TRAP_UD, SIGILL, "invalid opcode", invalid_op) +DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun",coprocessor_segment_overrun) +DO_ERROR(X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS) +DO_ERROR(X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present) +#ifdef CONFIG_X86_32 +DO_ERROR(X86_TRAP_SS, SIGBUS, "stack segment", stack_segment) +#endif +DO_ERROR(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check) + +#ifdef CONFIG_X86_64 +/* Runs on IST stack */ +dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code) +{ + enum ctx_state prev_state; + + prev_state = exception_enter(); + if (notify_die(DIE_TRAP, "stack segment", regs, error_code, + X86_TRAP_SS, SIGBUS) != NOTIFY_STOP) { + preempt_conditional_sti(regs); + do_trap(X86_TRAP_SS, SIGBUS, "stack segment", regs, error_code, NULL); + preempt_conditional_cli(regs); + } + exception_exit(prev_state); +} + +dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) +{ + static const char str[] = "double fault"; + struct task_struct *tsk = current; + + exception_enter(); + /* Return not checked because double check cannot be ignored */ + notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV); + + tsk->thread.error_code = error_code; + tsk->thread.trap_nr = X86_TRAP_DF; + +#ifdef CONFIG_DOUBLEFAULT + df_debug(regs, error_code); +#endif + /* + * This is always a kernel trap and never fixable (and thus must + * never return). 
+ */ + for (;;) + die(str, regs, error_code); +} +#endif + +dotraplinkage void +do_general_protection(struct pt_regs *regs, long error_code) +{ + struct task_struct *tsk; + enum ctx_state prev_state; + + prev_state = exception_enter(); + conditional_sti(regs); + +#ifdef CONFIG_X86_32 + if (regs->flags & X86_VM_MASK) { + local_irq_enable(); + handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); + goto exit; + } +#endif + + tsk = current; + if (!user_mode(regs)) { + if (fixup_exception(regs)) + goto exit; + + tsk->thread.error_code = error_code; + tsk->thread.trap_nr = X86_TRAP_GP; + if (notify_die(DIE_GPF, "general protection fault", regs, error_code, + X86_TRAP_GP, SIGSEGV) != NOTIFY_STOP) + die("general protection fault", regs, error_code); + goto exit; + } + + tsk->thread.error_code = error_code; + tsk->thread.trap_nr = X86_TRAP_GP; + + if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && + printk_ratelimit()) { + pr_info("%s[%d] general protection ip:%lx sp:%lx error:%lx", + tsk->comm, task_pid_nr(tsk), + regs->ip, regs->sp, error_code); + print_vma_addr(" in ", regs->ip); + pr_cont("\n"); + } + + force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk); +exit: + exception_exit(prev_state); +} +NOKPROBE_SYMBOL(do_general_protection); + +/* May run on IST stack. */ +dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) +{ + enum ctx_state prev_state; + +#ifdef CONFIG_DYNAMIC_FTRACE + /* + * ftrace must be first, everything else may cause a recursive crash. + * See note by declaration of modifying_ftrace_code in ftrace.c + */ + if (unlikely(atomic_read(&modifying_ftrace_code)) && + ftrace_int3_handler(regs)) + return; +#endif + if (poke_int3_handler(regs)) + return; + + prev_state = exception_enter(); +#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP + if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, + SIGTRAP) == NOTIFY_STOP) + goto exit; +#endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */ + +#ifdef CONFIG_KPROBES + if (kprobe_int3_handler(regs)) + goto exit; +#endif + + if (notify_die(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, + SIGTRAP) == NOTIFY_STOP) + goto exit; + + /* + * Let others (NMI) know that the debug stack is in use + * as we may switch to the interrupt stack. + */ + debug_stack_usage_inc(); + preempt_conditional_sti(regs); + do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, NULL); + preempt_conditional_cli(regs); + debug_stack_usage_dec(); +exit: + exception_exit(prev_state); +} +NOKPROBE_SYMBOL(do_int3); + +#ifdef CONFIG_X86_64 +/* + * Help handler running on IST stack to switch back to user stack + * for scheduling or signal handling. The actual stack switch is done in + * entry.S + */ +asmlinkage __visible struct pt_regs *sync_regs(struct pt_regs *eregs) +{ + struct pt_regs *regs = eregs; + /* Did already sync */ + if (eregs == (struct pt_regs *)eregs->sp) + ; + /* Exception from user space */ + else if (user_mode(eregs)) + regs = task_pt_regs(current); + /* + * Exception from kernel and interrupts are enabled. Move to + * kernel process stack. + */ + else if (eregs->flags & X86_EFLAGS_IF) + regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs)); + if (eregs != regs) + *regs = *eregs; + return regs; +} +NOKPROBE_SYMBOL(sync_regs); +#endif + +/* + * Our handling of the processor debug registers is non-trivial. + * We do not clear them on entry and exit from the kernel. Therefore + * it is possible to get a watchpoint trap here from inside the kernel. 
+ * However, the code in ./ptrace.c has ensured that the user can + * only set watchpoints on userspace addresses. Therefore the in-kernel + * watchpoint trap can only occur in code which is reading/writing + * from user space. Such code must not hold kernel locks (since it + * can equally take a page fault), therefore it is safe to call + * force_sig_info even though that claims and releases locks. + * + * Code in ./signal.c ensures that the debug control register + * is restored before we deliver any signal, and therefore that + * user code runs with the correct debug control register even though + * we clear it here. + * + * Being careful here means that we don't have to be as careful in a + * lot of more complicated places (task switching can be a bit lazy + * about restoring all the debug state, and ptrace doesn't have to + * find every occurrence of the TF bit that could be saved away even + * by user code) + * + * May run on IST stack. + */ +dotraplinkage void do_debug(struct pt_regs *regs, long error_code) +{ + struct task_struct *tsk = current; + enum ctx_state prev_state; + int user_icebp = 0; + unsigned long dr6; + int si_code; + + prev_state = exception_enter(); + + get_debugreg(dr6, 6); + + /* Filter out all the reserved bits which are preset to 1 */ + dr6 &= ~DR6_RESERVED; + + /* + * If dr6 has no reason to give us about the origin of this trap, + * then it's very likely the result of an icebp/int01 trap. + * User wants a sigtrap for that. + */ + if (!dr6 && user_mode(regs)) + user_icebp = 1; + + /* Catch kmemcheck conditions first of all! */ + if ((dr6 & DR_STEP) && kmemcheck_trap(regs)) + goto exit; + + /* DR6 may or may not be cleared by the CPU */ + set_debugreg(0, 6); + + /* + * The processor cleared BTF, so don't mark that we need it set. + */ + clear_tsk_thread_flag(tsk, TIF_BLOCKSTEP); + + /* Store the virtualized DR6 value */ + tsk->thread.debugreg6 = dr6; + +#ifdef CONFIG_KPROBES + if (kprobe_debug_handler(regs)) + goto exit; +#endif + + if (notify_die(DIE_DEBUG, "debug", regs, (long)&dr6, error_code, + SIGTRAP) == NOTIFY_STOP) + goto exit; + + /* + * Let others (NMI) know that the debug stack is in use + * as we may switch to the interrupt stack. + */ + debug_stack_usage_inc(); + + /* It's safe to allow irq's after DR6 has been saved */ + preempt_conditional_sti(regs); + + if (regs->flags & X86_VM_MASK) { + handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, + X86_TRAP_DB); + preempt_conditional_cli(regs); + debug_stack_usage_dec(); + goto exit; + } + + /* + * Single-stepping through system calls: ignore any exceptions in + * kernel space, but re-enable TF when returning to user mode. + * + * We already checked v86 mode above, so we can check for kernel mode + * by just checking the CPL of CS. 
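+ *
+ * For illustration only (not part of this file): the usual way user space
+ * reaches this path is by single-stepping a ptrace tracee, which sets TF
+ * on the tracee and lands here with DR_STEP set in DR6. A minimal tracer
+ * sketch, assuming the child has already called PTRACE_TRACEME and stopped:
+ *
+ *	#include <sys/types.h>
+ *	#include <sys/ptrace.h>
+ *	#include <sys/wait.h>
+ *
+ *	static void step_until_exit(pid_t pid)
+ *	{
+ *		int status;
+ *
+ *		waitpid(pid, &status, 0);
+ *		while (WIFSTOPPED(status)) {
+ *			ptrace(PTRACE_SINGLESTEP, pid, 0, 0);
+ *			waitpid(pid, &status, 0);
+ *		}
+ *	}
+ *
+ * Each step stops the tracee with the SIGTRAP queued by send_sigtrap()
+ * further down in this function.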
+ */ + if ((dr6 & DR_STEP) && !user_mode(regs)) { + tsk->thread.debugreg6 &= ~DR_STEP; + set_tsk_thread_flag(tsk, TIF_SINGLESTEP); + regs->flags &= ~X86_EFLAGS_TF; + } + si_code = get_si_code(tsk->thread.debugreg6); + if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp) + send_sigtrap(tsk, regs, error_code, si_code); + preempt_conditional_cli(regs); + debug_stack_usage_dec(); + +exit: + exception_exit(prev_state); +} +NOKPROBE_SYMBOL(do_debug); + +/* + * Note that we play around with the 'TS' bit in an attempt to get + * the correct behaviour even in the presence of the asynchronous + * IRQ13 behaviour + */ +static void math_error(struct pt_regs *regs, int error_code, int trapnr) +{ + struct task_struct *task = current; + siginfo_t info; + unsigned short err; + char *str = (trapnr == X86_TRAP_MF) ? "fpu exception" : + "simd exception"; + + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, SIGFPE) == NOTIFY_STOP) + return; + conditional_sti(regs); + + if (!user_mode_vm(regs)) + { + if (!fixup_exception(regs)) { + task->thread.error_code = error_code; + task->thread.trap_nr = trapnr; + die(str, regs, error_code); + } + return; + } + + /* + * Save the info for the exception handler and clear the error. + */ + save_init_fpu(task); + task->thread.trap_nr = trapnr; + task->thread.error_code = error_code; + info.si_signo = SIGFPE; + info.si_errno = 0; + info.si_addr = (void __user *)uprobe_get_trap_addr(regs); + if (trapnr == X86_TRAP_MF) { + unsigned short cwd, swd; + /* + * (~cwd & swd) will mask out exceptions that are not set to unmasked + * status. 0x3f is the exception bits in these regs, 0x200 is the + * C1 reg you need in case of a stack fault, 0x040 is the stack + * fault bit. We should only be taking one exception at a time, + * so if this combination doesn't produce any single exception, + * then we have a bad program that isn't synchronizing its FPU usage + * and it will suffer the consequences since we won't be able to + * fully reproduce the context of the exception + */ + cwd = get_fpu_cwd(task); + swd = get_fpu_swd(task); + + err = swd & ~cwd; + } else { + /* + * The SIMD FPU exceptions are handled a little differently, as there + * is only a single status/control register. Thus, to determine which + * unmasked exception was caught we must mask the exception mask bits + * at 0x1f80, and then use these to mask the exception bits at 0x3f. + */ + unsigned short mxcsr = get_fpu_mxcsr(task); + err = ~(mxcsr >> 7) & mxcsr; + } + + if (err & 0x001) { /* Invalid op */ + /* + * swd & 0x240 == 0x040: Stack Underflow + * swd & 0x240 == 0x240: Stack Overflow + * User must clear the SF bit (0x40) if set + */ + info.si_code = FPE_FLTINV; + } else if (err & 0x004) { /* Divide by Zero */ + info.si_code = FPE_FLTDIV; + } else if (err & 0x008) { /* Overflow */ + info.si_code = FPE_FLTOVF; + } else if (err & 0x012) { /* Denormal, Underflow */ + info.si_code = FPE_FLTUND; + } else if (err & 0x020) { /* Precision */ + info.si_code = FPE_FLTRES; + } else { + /* + * If we're using IRQ 13, or supposedly even some trap + * X86_TRAP_MF implementations, it's possible + * we get a spurious trap, which is not an error. 
+ */ + return; + } + force_sig_info(SIGFPE, &info, task); +} + +dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) +{ + enum ctx_state prev_state; + + prev_state = exception_enter(); + math_error(regs, error_code, X86_TRAP_MF); + exception_exit(prev_state); +} + +dotraplinkage void +do_simd_coprocessor_error(struct pt_regs *regs, long error_code) +{ + enum ctx_state prev_state; + + prev_state = exception_enter(); + math_error(regs, error_code, X86_TRAP_XF); + exception_exit(prev_state); +} + +dotraplinkage void +do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) +{ + conditional_sti(regs); +#if 0 + /* No need to warn about this any longer. */ + pr_info("Ignoring P6 Local APIC Spurious Interrupt Bug...\n"); +#endif +} + +asmlinkage __visible void __attribute__((weak)) smp_thermal_interrupt(void) +{ +} + +asmlinkage __visible void __attribute__((weak)) smp_threshold_interrupt(void) +{ +} + +/* + * 'math_state_restore()' saves the current math information in the + * old math state array, and gets the new ones from the current task + * + * Careful.. There are problems with IBM-designed IRQ13 behaviour. + * Don't touch unless you *really* know how it works. + * + * Must be called with kernel preemption disabled (eg with local + * local interrupts as in the case of do_device_not_available). + */ +void math_state_restore(void) +{ + struct task_struct *tsk = current; + + if (!tsk_used_math(tsk)) { + local_irq_enable(); + /* + * does a slab alloc which can sleep + */ + if (init_fpu(tsk)) { + /* + * ran out of memory! + */ + do_group_exit(SIGKILL); + return; + } + local_irq_disable(); + } + + __thread_fpu_begin(tsk); + + /* + * Paranoid restore. send a SIGSEGV if we fail to restore the state. + */ + if (unlikely(restore_fpu_checking(tsk))) { + drop_init_fpu(tsk); + force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk); + return; + } + + tsk->thread.fpu_counter++; +} +EXPORT_SYMBOL_GPL(math_state_restore); + +dotraplinkage void +do_device_not_available(struct pt_regs *regs, long error_code) +{ + enum ctx_state prev_state; + + prev_state = exception_enter(); + BUG_ON(use_eager_fpu()); + +#ifdef CONFIG_MATH_EMULATION + if (read_cr0() & X86_CR0_EM) { + struct math_emu_info info = { }; + + conditional_sti(regs); + + info.regs = regs; + math_emulate(&info); + exception_exit(prev_state); + return; + } +#endif + math_state_restore(); /* interrupts still off */ +#ifdef CONFIG_X86_32 + conditional_sti(regs); +#endif + exception_exit(prev_state); +} +NOKPROBE_SYMBOL(do_device_not_available); + +#ifdef CONFIG_X86_32 +dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) +{ + siginfo_t info; + enum ctx_state prev_state; + + prev_state = exception_enter(); + local_irq_enable(); + + info.si_signo = SIGILL; + info.si_errno = 0; + info.si_code = ILL_BADSTK; + info.si_addr = NULL; + if (notify_die(DIE_TRAP, "iret exception", regs, error_code, + X86_TRAP_IRET, SIGILL) != NOTIFY_STOP) { + do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code, + &info); + } + exception_exit(prev_state); +} +#endif + +/* Set of traps needed for early debugging. 
*/ +void __init early_trap_init(void) +{ + set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK); + /* int3 can be called from all */ + set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK); +#ifdef CONFIG_X86_32 + set_intr_gate(X86_TRAP_PF, page_fault); +#endif + load_idt(&idt_descr); +} + +void __init early_trap_pf_init(void) +{ +#ifdef CONFIG_X86_64 + set_intr_gate(X86_TRAP_PF, page_fault); +#endif +} + +void __init trap_init(void) +{ + int i; + +#ifdef CONFIG_EISA + void __iomem *p = early_ioremap(0x0FFFD9, 4); + + if (readl(p) == 'E' + ('I'<<8) + ('S'<<16) + ('A'<<24)) + EISA_bus = 1; + early_iounmap(p, 4); +#endif + + set_intr_gate(X86_TRAP_DE, divide_error); + set_intr_gate_ist(X86_TRAP_NMI, &nmi, NMI_STACK); + /* int4 can be called from all */ + set_system_intr_gate(X86_TRAP_OF, &overflow); + set_intr_gate(X86_TRAP_BR, bounds); + set_intr_gate(X86_TRAP_UD, invalid_op); + set_intr_gate(X86_TRAP_NM, device_not_available); +#ifdef CONFIG_X86_32 + set_task_gate(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS); +#else + set_intr_gate_ist(X86_TRAP_DF, &double_fault, DOUBLEFAULT_STACK); +#endif + set_intr_gate(X86_TRAP_OLD_MF, coprocessor_segment_overrun); + set_intr_gate(X86_TRAP_TS, invalid_TSS); + set_intr_gate(X86_TRAP_NP, segment_not_present); + set_intr_gate_ist(X86_TRAP_SS, &stack_segment, STACKFAULT_STACK); + set_intr_gate(X86_TRAP_GP, general_protection); + set_intr_gate(X86_TRAP_SPURIOUS, spurious_interrupt_bug); + set_intr_gate(X86_TRAP_MF, coprocessor_error); + set_intr_gate(X86_TRAP_AC, alignment_check); +#ifdef CONFIG_X86_MCE + set_intr_gate_ist(X86_TRAP_MC, &machine_check, MCE_STACK); +#endif + set_intr_gate(X86_TRAP_XF, simd_coprocessor_error); + + /* Reserve all the builtin and the syscall vector: */ + for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) + set_bit(i, used_vectors); + +#ifdef CONFIG_IA32_EMULATION + set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall); + set_bit(IA32_SYSCALL_VECTOR, used_vectors); +#endif + +#ifdef CONFIG_X86_32 + set_system_trap_gate(SYSCALL_VECTOR, &system_call); + set_bit(SYSCALL_VECTOR, used_vectors); +#endif + + /* + * Set the IDT descriptor to a fixed read-only location, so that the + * "sidt" instruction will not leak the location of the kernel, and + * to defend the IDT against arbitrary memory write vulnerabilities. 
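+ *
+ * For illustration only (not part of this file): "sidt" is not a
+ * privileged instruction, so without this fixmap any user could recover
+ * the IDT's kernel address. A minimal user-space sketch of the leak
+ * being defended against:
+ *
+ *	#include <stdint.h>
+ *	#include <stdio.h>
+ *
+ *	struct __attribute__((packed)) idtr {
+ *		uint16_t limit;
+ *		uint64_t base;
+ *	};
+ *
+ *	int main(void)
+ *	{
+ *		struct idtr idtr;
+ *
+ *		asm volatile("sidt %0" : "=m" (idtr));
+ *		printf("IDT base: %#llx\n", (unsigned long long)idtr.base);
+ *		return 0;
+ *	}
+ *
+ * With the read-only fixmap in place, the base reported by "sidt" is the
+ * fixed FIX_RO_IDT address rather than the kernel image address of
+ * idt_table.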
+ * It will be reloaded in cpu_init() */ + __set_fixmap(FIX_RO_IDT, __pa_symbol(idt_table), PAGE_KERNEL_RO); + idt_descr.address = fix_to_virt(FIX_RO_IDT); + + /* + * Should be a barrier for any external CPU state: + */ + cpu_init(); + + x86_init.irqs.trap_init(); + +#ifdef CONFIG_X86_64 + memcpy(&debug_idt_table, &idt_table, IDT_ENTRIES * 16); + set_nmi_gate(X86_TRAP_DB, &debug); + set_nmi_gate(X86_TRAP_BP, &int3); +#endif +} diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c new file mode 100644 index 00000000000..ea030319b32 --- /dev/null +++ b/arch/x86/kernel/tsc.c @@ -0,0 +1,1241 @@ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/timer.h> +#include <linux/acpi_pmtmr.h> +#include <linux/cpufreq.h> +#include <linux/delay.h> +#include <linux/clocksource.h> +#include <linux/percpu.h> +#include <linux/timex.h> +#include <linux/static_key.h> + +#include <asm/hpet.h> +#include <asm/timer.h> +#include <asm/vgtod.h> +#include <asm/time.h> +#include <asm/delay.h> +#include <asm/hypervisor.h> +#include <asm/nmi.h> +#include <asm/x86_init.h> + +unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */ +EXPORT_SYMBOL(cpu_khz); + +unsigned int __read_mostly tsc_khz; +EXPORT_SYMBOL(tsc_khz); + +/* + * TSC can be unstable due to cpufreq or due to unsynced TSCs + */ +static int __read_mostly tsc_unstable; + +/* native_sched_clock() is called before tsc_init(), so + we must start with the TSC soft disabled to prevent + erroneous rdtsc usage on !cpu_has_tsc processors */ +static int __read_mostly tsc_disabled = -1; + +static struct static_key __use_tsc = STATIC_KEY_INIT; + +int tsc_clocksource_reliable; + +/* + * Use a ring-buffer like data structure, where a writer advances the head by + * writing a new data entry and a reader advances the tail when it observes a + * new entry. + * + * Writers are made to wait on readers until there's space to write a new + * entry. + * + * This means that we can always use an {offset, mul} pair to compute a ns + * value that is 'roughly' in the right direction, even if we're writing a new + * {offset, mul} pair during the clock read. + * + * The down-side is that we can no longer guarantee strict monotonicity anymore + * (assuming the TSC was that to begin with), because while we compute the + * intersection point of the two clock slopes and make sure the time is + * continuous at the point of switching; we can no longer guarantee a reader is + * strictly before or after the switch point. + * + * It does mean a reader no longer needs to disable IRQs in order to avoid + * CPU-Freq updates messing with his times, and similarly an NMI reader will + * no longer run the risk of hitting half-written state. + */ + +struct cyc2ns { + struct cyc2ns_data data[2]; /* 0 + 2*24 = 48 */ + struct cyc2ns_data *head; /* 48 + 8 = 56 */ + struct cyc2ns_data *tail; /* 56 + 8 = 64 */ +}; /* exactly fits one cacheline */ + +static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns); + +struct cyc2ns_data *cyc2ns_read_begin(void) +{ + struct cyc2ns_data *head; + + preempt_disable(); + + head = this_cpu_read(cyc2ns.head); + /* + * Ensure we observe the entry when we observe the pointer to it. + * matches the wmb from cyc2ns_write_end(). 
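+ *
+ * For reference, a reader brackets its use of the entry like this
+ * (cycles_2_ns() below open-codes the same pattern to save a few
+ * instructions):
+ *
+ *	struct cyc2ns_data *data = cyc2ns_read_begin();
+ *
+ *	ns = data->cyc2ns_offset +
+ *	     mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
+ *
+ *	cyc2ns_read_end(data);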
+ */ + smp_read_barrier_depends(); + head->__count++; + barrier(); + + return head; +} + +void cyc2ns_read_end(struct cyc2ns_data *head) +{ + barrier(); + /* + * If we're the outer most nested read; update the tail pointer + * when we're done. This notifies possible pending writers + * that we've observed the head pointer and that the other + * entry is now free. + */ + if (!--head->__count) { + /* + * x86-TSO does not reorder writes with older reads; + * therefore once this write becomes visible to another + * cpu, we must be finished reading the cyc2ns_data. + * + * matches with cyc2ns_write_begin(). + */ + this_cpu_write(cyc2ns.tail, head); + } + preempt_enable(); +} + +/* + * Begin writing a new @data entry for @cpu. + * + * Assumes some sort of write side lock; currently 'provided' by the assumption + * that cpufreq will call its notifiers sequentially. + */ +static struct cyc2ns_data *cyc2ns_write_begin(int cpu) +{ + struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu); + struct cyc2ns_data *data = c2n->data; + + if (data == c2n->head) + data++; + + /* XXX send an IPI to @cpu in order to guarantee a read? */ + + /* + * When we observe the tail write from cyc2ns_read_end(), + * the cpu must be done with that entry and its safe + * to start writing to it. + */ + while (c2n->tail == data) + cpu_relax(); + + return data; +} + +static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data) +{ + struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu); + + /* + * Ensure the @data writes are visible before we publish the + * entry. Matches the data-depencency in cyc2ns_read_begin(). + */ + smp_wmb(); + + ACCESS_ONCE(c2n->head) = data; +} + +/* + * Accelerators for sched_clock() + * convert from cycles(64bits) => nanoseconds (64bits) + * basic equation: + * ns = cycles / (freq / ns_per_sec) + * ns = cycles * (ns_per_sec / freq) + * ns = cycles * (10^9 / (cpu_khz * 10^3)) + * ns = cycles * (10^6 / cpu_khz) + * + * Then we use scaling math (suggested by george@mvista.com) to get: + * ns = cycles * (10^6 * SC / cpu_khz) / SC + * ns = cycles * cyc2ns_scale / SC + * + * And since SC is a constant power of two, we can convert the div + * into a shift. + * + * We can use khz divisor instead of mhz to keep a better precision, since + * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. + * (mathieu.desnoyers@polymtl.ca) + * + * -johnstul@us.ibm.com "math is hard, lets go shopping!" + */ + +#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ + +static void cyc2ns_data_init(struct cyc2ns_data *data) +{ + data->cyc2ns_mul = 0; + data->cyc2ns_shift = CYC2NS_SCALE_FACTOR; + data->cyc2ns_offset = 0; + data->__count = 0; +} + +static void cyc2ns_init(int cpu) +{ + struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu); + + cyc2ns_data_init(&c2n->data[0]); + cyc2ns_data_init(&c2n->data[1]); + + c2n->head = c2n->data; + c2n->tail = c2n->data; +} + +static inline unsigned long long cycles_2_ns(unsigned long long cyc) +{ + struct cyc2ns_data *data, *tail; + unsigned long long ns; + + /* + * See cyc2ns_read_*() for details; replicated in order to avoid + * an extra few instructions that came with the abstraction. + * Notable, it allows us to only do the __count and tail update + * dance when its actually needed. 
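+ *
+ * As a worked instance of the scaling math above (illustrative numbers):
+ * for a 2000000 kHz (2 GHz) TSC,
+ *
+ *	cyc2ns_mul = (10^6 << 10) / 2000000 = 512
+ *	ns         = (cycles * 512) >> 10  = cycles / 2
+ *
+ * i.e. exactly 0.5 ns per cycle, computed with one multiply and one shift.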
+ */ + + preempt_disable_notrace(); + data = this_cpu_read(cyc2ns.head); + tail = this_cpu_read(cyc2ns.tail); + + if (likely(data == tail)) { + ns = data->cyc2ns_offset; + ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR); + } else { + data->__count++; + + barrier(); + + ns = data->cyc2ns_offset; + ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR); + + barrier(); + + if (!--data->__count) + this_cpu_write(cyc2ns.tail, data); + } + preempt_enable_notrace(); + + return ns; +} + +/* XXX surely we already have this someplace in the kernel?! */ +#define DIV_ROUND(n, d) (((n) + ((d) / 2)) / (d)) + +static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) +{ + unsigned long long tsc_now, ns_now; + struct cyc2ns_data *data; + unsigned long flags; + + local_irq_save(flags); + sched_clock_idle_sleep_event(); + + if (!cpu_khz) + goto done; + + data = cyc2ns_write_begin(cpu); + + rdtscll(tsc_now); + ns_now = cycles_2_ns(tsc_now); + + /* + * Compute a new multiplier as per the above comment and ensure our + * time function is continuous; see the comment near struct + * cyc2ns_data. + */ + data->cyc2ns_mul = DIV_ROUND(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR, cpu_khz); + data->cyc2ns_shift = CYC2NS_SCALE_FACTOR; + data->cyc2ns_offset = ns_now - + mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR); + + cyc2ns_write_end(cpu, data); + +done: + sched_clock_idle_wakeup_event(0); + local_irq_restore(flags); +} +/* + * Scheduler clock - returns current time in nanosec units. + */ +u64 native_sched_clock(void) +{ + u64 tsc_now; + + /* + * Fall back to jiffies if there's no TSC available: + * ( But note that we still use it if the TSC is marked + * unstable. We do this because unlike Time Of Day, + * the scheduler clock tolerates small errors and it's + * very important for it to be as fast as the platform + * can achieve it. ) + */ + if (!static_key_false(&__use_tsc)) { + /* No locking but a rare wrong value is not a big deal: */ + return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); + } + + /* read the Time Stamp Counter: */ + rdtscll(tsc_now); + + /* return the value in ns */ + return cycles_2_ns(tsc_now); +} + +/* We need to define a real function for sched_clock, to override the + weak default version */ +#ifdef CONFIG_PARAVIRT +unsigned long long sched_clock(void) +{ + return paravirt_sched_clock(); +} +#else +unsigned long long +sched_clock(void) __attribute__((alias("native_sched_clock"))); +#endif + +unsigned long long native_read_tsc(void) +{ + return __native_read_tsc(); +} +EXPORT_SYMBOL(native_read_tsc); + +int check_tsc_unstable(void) +{ + return tsc_unstable; +} +EXPORT_SYMBOL_GPL(check_tsc_unstable); + +int check_tsc_disabled(void) +{ + return tsc_disabled; +} +EXPORT_SYMBOL_GPL(check_tsc_disabled); + +#ifdef CONFIG_X86_TSC +int __init notsc_setup(char *str) +{ + pr_warn("Kernel compiled with CONFIG_X86_TSC, cannot disable TSC completely\n"); + tsc_disabled = 1; + return 1; +} +#else +/* + * disable flag for tsc. 
Takes effect by clearing the TSC cpu flag + * in cpu/common.c + */ +int __init notsc_setup(char *str) +{ + setup_clear_cpu_cap(X86_FEATURE_TSC); + return 1; +} +#endif + +__setup("notsc", notsc_setup); + +static int no_sched_irq_time; + +static int __init tsc_setup(char *str) +{ + if (!strcmp(str, "reliable")) + tsc_clocksource_reliable = 1; + if (!strncmp(str, "noirqtime", 9)) + no_sched_irq_time = 1; + return 1; +} + +__setup("tsc=", tsc_setup); + +#define MAX_RETRIES 5 +#define SMI_TRESHOLD 50000 + +/* + * Read TSC and the reference counters. Take care of SMI disturbance + */ +static u64 tsc_read_refs(u64 *p, int hpet) +{ + u64 t1, t2; + int i; + + for (i = 0; i < MAX_RETRIES; i++) { + t1 = get_cycles(); + if (hpet) + *p = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF; + else + *p = acpi_pm_read_early(); + t2 = get_cycles(); + if ((t2 - t1) < SMI_TRESHOLD) + return t2; + } + return ULLONG_MAX; +} + +/* + * Calculate the TSC frequency from HPET reference + */ +static unsigned long calc_hpet_ref(u64 deltatsc, u64 hpet1, u64 hpet2) +{ + u64 tmp; + + if (hpet2 < hpet1) + hpet2 += 0x100000000ULL; + hpet2 -= hpet1; + tmp = ((u64)hpet2 * hpet_readl(HPET_PERIOD)); + do_div(tmp, 1000000); + do_div(deltatsc, tmp); + + return (unsigned long) deltatsc; +} + +/* + * Calculate the TSC frequency from PMTimer reference + */ +static unsigned long calc_pmtimer_ref(u64 deltatsc, u64 pm1, u64 pm2) +{ + u64 tmp; + + if (!pm1 && !pm2) + return ULONG_MAX; + + if (pm2 < pm1) + pm2 += (u64)ACPI_PM_OVRRUN; + pm2 -= pm1; + tmp = pm2 * 1000000000LL; + do_div(tmp, PMTMR_TICKS_PER_SEC); + do_div(deltatsc, tmp); + + return (unsigned long) deltatsc; +} + +#define CAL_MS 10 +#define CAL_LATCH (PIT_TICK_RATE / (1000 / CAL_MS)) +#define CAL_PIT_LOOPS 1000 + +#define CAL2_MS 50 +#define CAL2_LATCH (PIT_TICK_RATE / (1000 / CAL2_MS)) +#define CAL2_PIT_LOOPS 5000 + + +/* + * Try to calibrate the TSC against the Programmable + * Interrupt Timer and return the frequency of the TSC + * in kHz. + * + * Return ULONG_MAX on failure to calibrate. + */ +static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin) +{ + u64 tsc, t1, t2, delta; + unsigned long tscmin, tscmax; + int pitcnt; + + /* Set the Gate high, disable speaker */ + outb((inb(0x61) & ~0x02) | 0x01, 0x61); + + /* + * Setup CTC channel 2* for mode 0, (interrupt on terminal + * count mode), binary count. Set the latch register to 50ms + * (LSB then MSB) to begin countdown. + */ + outb(0xb0, 0x43); + outb(latch & 0xff, 0x42); + outb(latch >> 8, 0x42); + + tsc = t1 = t2 = get_cycles(); + + pitcnt = 0; + tscmax = 0; + tscmin = ULONG_MAX; + while ((inb(0x61) & 0x20) == 0) { + t2 = get_cycles(); + delta = t2 - tsc; + tsc = t2; + if ((unsigned long) delta < tscmin) + tscmin = (unsigned int) delta; + if ((unsigned long) delta > tscmax) + tscmax = (unsigned int) delta; + pitcnt++; + } + + /* + * Sanity checks: + * + * If we were not able to read the PIT more than loopmin + * times, then we have been hit by a massive SMI + * + * If the maximum is 10 times larger than the minimum, + * then we got hit by an SMI as well. + */ + if (pitcnt < loopmin || tscmax > 10 * tscmin) + return ULONG_MAX; + + /* Calculate the PIT value */ + delta = t2 - t1; + do_div(delta, ms); + return delta; +} + +/* + * This reads the current MSB of the PIT counter, and + * checks if we are running on sufficiently fast and + * non-virtualized hardware. 
+ *
+ * Our expectations are:
+ *
+ *  - the PIT is running at roughly 1.19MHz
+ *
+ *  - each IO is going to take about 1us on real hardware,
+ *    but we allow it to be much faster (by a factor of 10) or
+ *    _slightly_ slower (i.e. we allow up to a 2us read+counter
+ *    update) - anything else implies an unacceptably slow CPU
+ *    or PIT for the fast calibration to work.
+ *
+ *  - with 256 PIT ticks to read the value, we have 214us to
+ *    see the same MSB (and overhead like doing a single TSC
+ *    read per MSB value etc).
+ *
+ *  - We're doing 2 reads per loop (LSB, MSB), and we expect
+ *    them each to take about a microsecond on real hardware.
+ *    So we expect a count value of around 100. But we'll be
+ *    generous, and accept anything over 50.
+ *
+ *  - if the PIT is stuck, and we see *many* more reads, we
+ *    return early (and the next caller of pit_expect_msb()
+ *    will then consider it a failure when they don't see the
+ *    next expected value).
+ *
+ * These expectations mean that we know that we have seen the
+ * transition from one expected value to another with a fairly
+ * high accuracy, and we didn't miss any events. We can thus
+ * use the TSC value at the transitions to calculate a pretty
+ * good value for the TSC frequency.
+ */
+static inline int pit_verify_msb(unsigned char val)
+{
+	/* Ignore LSB */
+	inb(0x42);
+	return inb(0x42) == val;
+}
+
+static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap)
+{
+	int count;
+	u64 tsc = 0, prev_tsc = 0;
+
+	for (count = 0; count < 50000; count++) {
+		if (!pit_verify_msb(val))
+			break;
+		prev_tsc = tsc;
+		tsc = get_cycles();
+	}
+	*deltap = get_cycles() - prev_tsc;
+	*tscp = tsc;
+
+	/*
+	 * We require _some_ success, but the quality control
+	 * will be based on the error terms on the TSC values.
+	 */
+	return count > 5;
+}
+
+/*
+ * How many MSB values do we want to see? We aim for
+ * a maximum error rate of 500ppm (in practice the
+ * real error is much smaller), but refuse to spend
+ * more than 50ms on it.
+ */
+#define MAX_QUICK_PIT_MS 50
+#define MAX_QUICK_PIT_ITERATIONS (MAX_QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256)
+
+static unsigned long quick_pit_calibrate(void)
+{
+	int i;
+	u64 tsc, delta;
+	unsigned long d1, d2;
+
+	/* Set the Gate high, disable speaker */
+	outb((inb(0x61) & ~0x02) | 0x01, 0x61);
+
+	/*
+	 * Counter 2, mode 0 (one-shot), binary count
+	 *
+	 * NOTE! Mode 2 decrements by two (and then the
+	 * output is flipped each time, giving the same
+	 * final output frequency as a decrement-by-one),
+	 * so mode 0 is much better when looking at the
+	 * individual counts.
+	 */
+	outb(0xb0, 0x43);
+
+	/* Start at 0xffff */
+	outb(0xff, 0x42);
+	outb(0xff, 0x42);
+
+	/*
+	 * The PIT starts counting at the next edge, so we
+	 * need to delay for a microsecond. The easiest way
+	 * to do that is to just read back the 16-bit counter
+	 * once from the PIT.
+	 */
+	pit_verify_msb(0);
+
+	if (pit_expect_msb(0xff, &tsc, &d1)) {
+		for (i = 1; i <= MAX_QUICK_PIT_ITERATIONS; i++) {
+			if (!pit_expect_msb(0xff-i, &delta, &d2))
+				break;
+
+			/*
+			 * Iterate until the error is less than 500 ppm
+			 */
+			delta -= tsc;
+			if (d1+d2 >= delta >> 11)
+				continue;
+
+			/*
+			 * Check the PIT one more time to verify that
+			 * all TSC reads were stable wrt the PIT.
+			 *
+			 * This also guarantees serialization of the
+			 * last cycle read ('d2') in pit_expect_msb.
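+			 *
+			 * (For reference, the "delta >> 11" test above
+			 * encodes the 500 ppm target: the accumulated
+			 * read error d1+d2 is accepted only once
+			 * (d1+d2)/delta < 1/2048, i.e. roughly 488 ppm.)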
+ */ + if (!pit_verify_msb(0xfe - i)) + break; + goto success; + } + } + pr_err("Fast TSC calibration failed\n"); + return 0; + +success: + /* + * Ok, if we get here, then we've seen the + * MSB of the PIT decrement 'i' times, and the + * error has shrunk to less than 500 ppm. + * + * As a result, we can depend on there not being + * any odd delays anywhere, and the TSC reads are + * reliable (within the error). + * + * kHz = ticks / time-in-seconds / 1000; + * kHz = (t2 - t1) / (I * 256 / PIT_TICK_RATE) / 1000 + * kHz = ((t2 - t1) * PIT_TICK_RATE) / (I * 256 * 1000) + */ + delta *= PIT_TICK_RATE; + do_div(delta, i*256*1000); + pr_info("Fast TSC calibration using PIT\n"); + return delta; +} + +/** + * native_calibrate_tsc - calibrate the tsc on boot + */ +unsigned long native_calibrate_tsc(void) +{ + u64 tsc1, tsc2, delta, ref1, ref2; + unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; + unsigned long flags, latch, ms, fast_calibrate; + int hpet = is_hpet_enabled(), i, loopmin; + + /* Calibrate TSC using MSR for Intel Atom SoCs */ + local_irq_save(flags); + fast_calibrate = try_msr_calibrate_tsc(); + local_irq_restore(flags); + if (fast_calibrate) + return fast_calibrate; + + local_irq_save(flags); + fast_calibrate = quick_pit_calibrate(); + local_irq_restore(flags); + if (fast_calibrate) + return fast_calibrate; + + /* + * Run 5 calibration loops to get the lowest frequency value + * (the best estimate). We use two different calibration modes + * here: + * + * 1) PIT loop. We set the PIT Channel 2 to oneshot mode and + * load a timeout of 50ms. We read the time right after we + * started the timer and wait until the PIT count down reaches + * zero. In each wait loop iteration we read the TSC and check + * the delta to the previous read. We keep track of the min + * and max values of that delta. The delta is mostly defined + * by the IO time of the PIT access, so we can detect when a + * SMI/SMM disturbance happened between the two reads. If the + * maximum time is significantly larger than the minimum time, + * then we discard the result and have another try. + * + * 2) Reference counter. If available we use the HPET or the + * PMTIMER as a reference to check the sanity of that value. + * We use separate TSC readouts and check inside of the + * reference read for a SMI/SMM disturbance. We dicard + * disturbed values here as well. We do that around the PIT + * calibration delay loop as we have to wait for a certain + * amount of time anyway. + */ + + /* Preset PIT loop values */ + latch = CAL_LATCH; + ms = CAL_MS; + loopmin = CAL_PIT_LOOPS; + + for (i = 0; i < 3; i++) { + unsigned long tsc_pit_khz; + + /* + * Read the start value and the reference count of + * hpet/pmtimer when available. Then do the PIT + * calibration, which will take at least 50ms, and + * read the end value. + */ + local_irq_save(flags); + tsc1 = tsc_read_refs(&ref1, hpet); + tsc_pit_khz = pit_calibrate_tsc(latch, ms, loopmin); + tsc2 = tsc_read_refs(&ref2, hpet); + local_irq_restore(flags); + + /* Pick the lowest PIT TSC calibration so far */ + tsc_pit_min = min(tsc_pit_min, tsc_pit_khz); + + /* hpet or pmtimer available ? 
*/ + if (ref1 == ref2) + continue; + + /* Check, whether the sampling was disturbed by an SMI */ + if (tsc1 == ULLONG_MAX || tsc2 == ULLONG_MAX) + continue; + + tsc2 = (tsc2 - tsc1) * 1000000LL; + if (hpet) + tsc2 = calc_hpet_ref(tsc2, ref1, ref2); + else + tsc2 = calc_pmtimer_ref(tsc2, ref1, ref2); + + tsc_ref_min = min(tsc_ref_min, (unsigned long) tsc2); + + /* Check the reference deviation */ + delta = ((u64) tsc_pit_min) * 100; + do_div(delta, tsc_ref_min); + + /* + * If both calibration results are inside a 10% window + * then we can be sure, that the calibration + * succeeded. We break out of the loop right away. We + * use the reference value, as it is more precise. + */ + if (delta >= 90 && delta <= 110) { + pr_info("PIT calibration matches %s. %d loops\n", + hpet ? "HPET" : "PMTIMER", i + 1); + return tsc_ref_min; + } + + /* + * Check whether PIT failed more than once. This + * happens in virtualized environments. We need to + * give the virtual PC a slightly longer timeframe for + * the HPET/PMTIMER to make the result precise. + */ + if (i == 1 && tsc_pit_min == ULONG_MAX) { + latch = CAL2_LATCH; + ms = CAL2_MS; + loopmin = CAL2_PIT_LOOPS; + } + } + + /* + * Now check the results. + */ + if (tsc_pit_min == ULONG_MAX) { + /* PIT gave no useful value */ + pr_warn("Unable to calibrate against PIT\n"); + + /* We don't have an alternative source, disable TSC */ + if (!hpet && !ref1 && !ref2) { + pr_notice("No reference (HPET/PMTIMER) available\n"); + return 0; + } + + /* The alternative source failed as well, disable TSC */ + if (tsc_ref_min == ULONG_MAX) { + pr_warn("HPET/PMTIMER calibration failed\n"); + return 0; + } + + /* Use the alternative source */ + pr_info("using %s reference calibration\n", + hpet ? "HPET" : "PMTIMER"); + + return tsc_ref_min; + } + + /* We don't have an alternative source, use the PIT calibration value */ + if (!hpet && !ref1 && !ref2) { + pr_info("Using PIT calibration value\n"); + return tsc_pit_min; + } + + /* The alternative source failed, use the PIT calibration value */ + if (tsc_ref_min == ULONG_MAX) { + pr_warn("HPET/PMTIMER calibration failed. Using PIT calibration.\n"); + return tsc_pit_min; + } + + /* + * The calibration values differ too much. In doubt, we use + * the PIT value as we know that there are PMTIMERs around + * running at double speed. At least we let the user know: + */ + pr_warn("PIT calibration deviates from %s: %lu %lu\n", + hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min); + pr_info("Using PIT calibration value\n"); + return tsc_pit_min; +} + +int recalibrate_cpu_khz(void) +{ +#ifndef CONFIG_SMP + unsigned long cpu_khz_old = cpu_khz; + + if (cpu_has_tsc) { + tsc_khz = x86_platform.calibrate_tsc(); + cpu_khz = tsc_khz; + cpu_data(0).loops_per_jiffy = + cpufreq_scale(cpu_data(0).loops_per_jiffy, + cpu_khz_old, cpu_khz); + return 0; + } else + return -ENODEV; +#else + return -ENODEV; +#endif +} + +EXPORT_SYMBOL(recalibrate_cpu_khz); + + +static unsigned long long cyc2ns_suspend; + +void tsc_save_sched_clock_state(void) +{ + if (!sched_clock_stable()) + return; + + cyc2ns_suspend = sched_clock(); +} + +/* + * Even on processors with invariant TSC, TSC gets reset in some the + * ACPI system sleep states. And in some systems BIOS seem to reinit TSC to + * arbitrary value (still sync'd across cpu's) during resume from such sleep + * states. To cope up with this, recompute the cyc2ns_offset for each cpu so + * that sched_clock() continues from the point where it was left off during + * suspend. 
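+ *
+ * Concretely: cyc2ns_suspend holds the sched_clock() value saved at
+ * suspend time. After the local offsets are zeroed below, sched_clock()
+ * returns the raw post-resume TSC scaled to ns, so writing
+ * offset = cyc2ns_suspend - sched_clock() into every CPU's cyc2ns_offset
+ * makes sched_clock() read exactly cyc2ns_suspend at this instant.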
+ */ +void tsc_restore_sched_clock_state(void) +{ + unsigned long long offset; + unsigned long flags; + int cpu; + + if (!sched_clock_stable()) + return; + + local_irq_save(flags); + + /* + * We're comming out of suspend, there's no concurrency yet; don't + * bother being nice about the RCU stuff, just write to both + * data fields. + */ + + this_cpu_write(cyc2ns.data[0].cyc2ns_offset, 0); + this_cpu_write(cyc2ns.data[1].cyc2ns_offset, 0); + + offset = cyc2ns_suspend - sched_clock(); + + for_each_possible_cpu(cpu) { + per_cpu(cyc2ns.data[0].cyc2ns_offset, cpu) = offset; + per_cpu(cyc2ns.data[1].cyc2ns_offset, cpu) = offset; + } + + local_irq_restore(flags); +} + +#ifdef CONFIG_CPU_FREQ + +/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency + * changes. + * + * RED-PEN: On SMP we assume all CPUs run with the same frequency. It's + * not that important because current Opteron setups do not support + * scaling on SMP anyroads. + * + * Should fix up last_tsc too. Currently gettimeofday in the + * first tick after the change will be slightly wrong. + */ + +static unsigned int ref_freq; +static unsigned long loops_per_jiffy_ref; +static unsigned long tsc_khz_ref; + +static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct cpufreq_freqs *freq = data; + unsigned long *lpj; + + if (cpu_has(&cpu_data(freq->cpu), X86_FEATURE_CONSTANT_TSC)) + return 0; + + lpj = &boot_cpu_data.loops_per_jiffy; +#ifdef CONFIG_SMP + if (!(freq->flags & CPUFREQ_CONST_LOOPS)) + lpj = &cpu_data(freq->cpu).loops_per_jiffy; +#endif + + if (!ref_freq) { + ref_freq = freq->old; + loops_per_jiffy_ref = *lpj; + tsc_khz_ref = tsc_khz; + } + if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || + (val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) { + *lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); + + tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); + if (!(freq->flags & CPUFREQ_CONST_LOOPS)) + mark_tsc_unstable("cpufreq changes"); + + set_cyc2ns_scale(tsc_khz, freq->cpu); + } + + return 0; +} + +static struct notifier_block time_cpufreq_notifier_block = { + .notifier_call = time_cpufreq_notifier +}; + +static int __init cpufreq_tsc(void) +{ + if (!cpu_has_tsc) + return 0; + if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) + return 0; + cpufreq_register_notifier(&time_cpufreq_notifier_block, + CPUFREQ_TRANSITION_NOTIFIER); + return 0; +} + +core_initcall(cpufreq_tsc); + +#endif /* CONFIG_CPU_FREQ */ + +/* clocksource code */ + +static struct clocksource clocksource_tsc; + +/* + * We compare the TSC to the cycle_last value in the clocksource + * structure to avoid a nasty time-warp. This can be observed in a + * very small window right after one CPU updated cycle_last under + * xtime/vsyscall_gtod lock and the other CPU reads a TSC value which + * is smaller than the cycle_last reference value due to a TSC which + * is slighty behind. This delta is nowhere else observable, but in + * that case it results in a forward time jump in the range of hours + * due to the unsigned delta calculation of the time keeping core + * code, which is necessary to support wrapping clocksources like pm + * timer. + */ +static cycle_t read_tsc(struct clocksource *cs) +{ + cycle_t ret = (cycle_t)get_cycles(); + + return ret >= clocksource_tsc.cycle_last ? 
+ ret : clocksource_tsc.cycle_last; +} + +static void resume_tsc(struct clocksource *cs) +{ + if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3)) + clocksource_tsc.cycle_last = 0; +} + +static struct clocksource clocksource_tsc = { + .name = "tsc", + .rating = 300, + .read = read_tsc, + .resume = resume_tsc, + .mask = CLOCKSOURCE_MASK(64), + .flags = CLOCK_SOURCE_IS_CONTINUOUS | + CLOCK_SOURCE_MUST_VERIFY, + .archdata = { .vclock_mode = VCLOCK_TSC }, +}; + +void mark_tsc_unstable(char *reason) +{ + if (!tsc_unstable) { + tsc_unstable = 1; + clear_sched_clock_stable(); + disable_sched_clock_irqtime(); + pr_info("Marking TSC unstable due to %s\n", reason); + /* Change only the rating, when not registered */ + if (clocksource_tsc.mult) + clocksource_mark_unstable(&clocksource_tsc); + else { + clocksource_tsc.flags |= CLOCK_SOURCE_UNSTABLE; + clocksource_tsc.rating = 0; + } + } +} + +EXPORT_SYMBOL_GPL(mark_tsc_unstable); + +static void __init check_system_tsc_reliable(void) +{ +#ifdef CONFIG_MGEODE_LX + /* RTSC counts during suspend */ +#define RTSC_SUSP 0x100 + unsigned long res_low, res_high; + + rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high); + /* Geode_LX - the OLPC CPU has a very reliable TSC */ + if (res_low & RTSC_SUSP) + tsc_clocksource_reliable = 1; +#endif + if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) + tsc_clocksource_reliable = 1; +} + +/* + * Make an educated guess if the TSC is trustworthy and synchronized + * over all CPUs. + */ +int unsynchronized_tsc(void) +{ + if (!cpu_has_tsc || tsc_unstable) + return 1; + +#ifdef CONFIG_SMP + if (apic_is_clustered_box()) + return 1; +#endif + + if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) + return 0; + + if (tsc_clocksource_reliable) + return 0; + /* + * Intel systems are normally all synchronized. + * Exceptions must mark TSC as unstable: + */ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { + /* assume multi socket systems are not synchronized: */ + if (num_possible_cpus() > 1) + return 1; + } + + return 0; +} + + +static void tsc_refine_calibration_work(struct work_struct *work); +static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work); +/** + * tsc_refine_calibration_work - Further refine tsc freq calibration + * @work - ignored. + * + * This functions uses delayed work over a period of a + * second to further refine the TSC freq value. Since this is + * timer based, instead of loop based, we don't block the boot + * process while this longer calibration is done. + * + * If there are any calibration anomalies (too many SMIs, etc), + * or the refined calibration is off by 1% of the fast early + * calibration, we throw out the new calibration and use the + * early calibration. + */ +static void tsc_refine_calibration_work(struct work_struct *work) +{ + static u64 tsc_start = -1, ref_start; + static int hpet; + u64 tsc_stop, ref_stop, delta; + unsigned long freq; + + /* Don't bother refining TSC on unstable systems */ + if (check_tsc_unstable()) + goto out; + + /* + * Since the work is started early in boot, we may be + * delayed the first time we expire. So set the workqueue + * again once we know timers are working. + */ + if (tsc_start == -1) { + /* + * Only set hpet once, to avoid mixing hardware + * if the hpet becomes enabled later. + */ + hpet = is_hpet_enabled(); + schedule_delayed_work(&tsc_irqwork, HZ); + tsc_start = tsc_read_refs(&ref_start, hpet); + return; + } + + tsc_stop = tsc_read_refs(&ref_stop, hpet); + + /* hpet or pmtimer available ? 
*/ + if (ref_start == ref_stop) + goto out; + + /* Check, whether the sampling was disturbed by an SMI */ + if (tsc_start == ULLONG_MAX || tsc_stop == ULLONG_MAX) + goto out; + + delta = tsc_stop - tsc_start; + delta *= 1000000LL; + if (hpet) + freq = calc_hpet_ref(delta, ref_start, ref_stop); + else + freq = calc_pmtimer_ref(delta, ref_start, ref_stop); + + /* Make sure we're within 1% */ + if (abs(tsc_khz - freq) > tsc_khz/100) + goto out; + + tsc_khz = freq; + pr_info("Refined TSC clocksource calibration: %lu.%03lu MHz\n", + (unsigned long)tsc_khz / 1000, + (unsigned long)tsc_khz % 1000); + +out: + clocksource_register_khz(&clocksource_tsc, tsc_khz); +} + + +static int __init init_tsc_clocksource(void) +{ + if (!cpu_has_tsc || tsc_disabled > 0 || !tsc_khz) + return 0; + + if (tsc_clocksource_reliable) + clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; + /* lower the rating if we already know its unstable: */ + if (check_tsc_unstable()) { + clocksource_tsc.rating = 0; + clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS; + } + + if (boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3)) + clocksource_tsc.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP; + + /* + * Trust the results of the earlier calibration on systems + * exporting a reliable TSC. + */ + if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { + clocksource_register_khz(&clocksource_tsc, tsc_khz); + return 0; + } + + schedule_delayed_work(&tsc_irqwork, 0); + return 0; +} +/* + * We use device_initcall here, to ensure we run after the hpet + * is fully initialized, which may occur at fs_initcall time. + */ +device_initcall(init_tsc_clocksource); + +void __init tsc_init(void) +{ + u64 lpj; + int cpu; + + x86_init.timers.tsc_pre_init(); + + if (!cpu_has_tsc) + return; + + tsc_khz = x86_platform.calibrate_tsc(); + cpu_khz = tsc_khz; + + if (!tsc_khz) { + mark_tsc_unstable("could not calculate TSC khz"); + return; + } + + pr_info("Detected %lu.%03lu MHz processor\n", + (unsigned long)cpu_khz / 1000, + (unsigned long)cpu_khz % 1000); + + /* + * Secondary CPUs do not run through tsc_init(), so set up + * all the scale factors for all CPUs, assuming the same + * speed as the bootup CPU. (cpufreq notifiers will fix this + * up if their speed diverges) + */ + for_each_possible_cpu(cpu) { + cyc2ns_init(cpu); + set_cyc2ns_scale(cpu_khz, cpu); + } + + if (tsc_disabled > 0) + return; + + /* now allow native_sched_clock() to use rdtsc */ + + tsc_disabled = 0; + static_key_slow_inc(&__use_tsc); + + if (!no_sched_irq_time) + enable_sched_clock_irqtime(); + + lpj = ((u64)tsc_khz * 1000); + do_div(lpj, HZ); + lpj_fine = lpj; + + use_tsc_delay(); + + if (unsynchronized_tsc()) + mark_tsc_unstable("TSCs unsynchronized"); + + check_system_tsc_reliable(); +} + +#ifdef CONFIG_SMP +/* + * If we have a constant TSC and are using the TSC for the delay loop, + * we can skip clock calibration if another cpu in the same socket has already + * been calibrated. This assumes that CONSTANT_TSC applies to all + * cpus in the socket - this should be a safe assumption. 
+ */
+unsigned long calibrate_delay_is_known(void)
+{
+ int i, cpu = smp_processor_id();
+
+ if (!tsc_disabled && !cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC))
+ return 0;
+
+ for_each_online_cpu(i)
+ if (cpu_data(i).phys_proc_id == cpu_data(cpu).phys_proc_id)
+ return cpu_data(i).loops_per_jiffy;
+ return 0;
+}
+#endif
diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c
new file mode 100644
index 00000000000..92ae6acac8a
--- /dev/null
+++ b/arch/x86/kernel/tsc_msr.c
@@ -0,0 +1,127 @@
+/*
+ * tsc_msr.c - MSR based TSC calibration on Intel Atom SoC platforms.
+ *
+ * The TSC in an Intel Atom SoC runs at a constant rate which can be derived
+ * from this formula:
+ * <maximum core-clock to bus-clock ratio> * <maximum resolved frequency>
+ * See the Intel 64 and IA-32 System Programming Guide, sections 16.12 and
+ * 30.11.5, for details.
+ * Some Intel Atom SoCs in particular don't have a PIT (i8254) or HPET, so
+ * MSR based calibration is the only option.
+ *
+ * Copyright (C) 2013 Intel Corporation
+ * Author: Bin Gao <bin.gao@intel.com>
+ *
+ * This file is released under the GPLv2.
+ */
+
+#include <linux/kernel.h>
+#include <asm/processor.h>
+#include <asm/setup.h>
+#include <asm/apic.h>
+#include <asm/param.h>
+
+/* CPU reference clock frequency: in KHz */
+#define FREQ_83 83200
+#define FREQ_100 99840
+#define FREQ_133 133200
+#define FREQ_166 166400
+
+#define MAX_NUM_FREQS 8
+
+/*
+ * According to the Intel 64 and IA-32 System Programming Guide,
+ * if MSR_PERF_STAT[31] is set, the maximum resolved bus ratio can be
+ * read from MSR_PLATFORM_ID[12:8], otherwise from MSR_PERF_STAT[44:40].
+ * Unfortunately some Intel Atom SoCs aren't quite compliant with this,
+ * so we need to manually differentiate SoC families. This is what the
+ * msr_plat field does.
+ */
+struct freq_desc {
+ u8 x86_family; /* CPU family */
+ u8 x86_model; /* model */
+ u8 msr_plat; /* 1: use MSR_PLATFORM_INFO, 0: MSR_IA32_PERF_STATUS */
+ u32 freqs[MAX_NUM_FREQS];
+};
+
+static struct freq_desc freq_desc_tables[] = {
+ /* PNW */
+ { 6, 0x27, 0, { 0, 0, 0, 0, 0, FREQ_100, 0, FREQ_83 } },
+ /* CLV+ */
+ { 6, 0x35, 0, { 0, FREQ_133, 0, 0, 0, FREQ_100, 0, FREQ_83 } },
+ /* TNG */
+ { 6, 0x4a, 1, { 0, FREQ_100, FREQ_133, 0, 0, 0, 0, 0 } },
+ /* VLV2 */
+ { 6, 0x37, 1, { FREQ_83, FREQ_100, FREQ_133, FREQ_166, 0, 0, 0, 0 } },
+ /* ANN */
+ { 6, 0x5a, 1, { FREQ_83, FREQ_100, FREQ_133, FREQ_100, 0, 0, 0, 0 } },
+};
+
+static int match_cpu(u8 family, u8 model)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(freq_desc_tables); i++) {
+ if ((family == freq_desc_tables[i].x86_family) &&
+ (model == freq_desc_tables[i].x86_model))
+ return i;
+ }
+
+ return -1;
+}
+
+/* Map a CPU reference clock freq ID (0-7) to the reference clock freq (KHz) */
+#define id_to_freq(cpu_index, freq_id) \
+ (freq_desc_tables[cpu_index].freqs[freq_id])
+
+/*
+ * Do MSR calibration only for known/supported CPUs.
+ *
+ * Returns the calibration value or 0 if MSR calibration failed.
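+ *
+ * Worked example of the formula, using the VLV2 row of freq_desc_tables
+ * with made-up register readings (a user-space sketch, not kernel code):
+ *
+ *    #include <stdio.h>
+ *
+ *    int main(void)
+ *    {
+ *        unsigned int ratio = 16;     // MSR_PLATFORM_INFO[12:8]
+ *        unsigned int freq = 166400;  // FREQ_166: MSR_FSB_FREQ[2:0] == 3
+ *        unsigned long res = (unsigned long)freq * ratio;
+ *
+ *        printf("TSC runs at %lu KHz\n", res); // 2662400 KHz, ~2.66 GHz
+ *        return 0;
+ *    }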
+ */ +unsigned long try_msr_calibrate_tsc(void) +{ + u32 lo, hi, ratio, freq_id, freq; + unsigned long res; + int cpu_index; + + cpu_index = match_cpu(boot_cpu_data.x86, boot_cpu_data.x86_model); + if (cpu_index < 0) + return 0; + + if (freq_desc_tables[cpu_index].msr_plat) { + rdmsr(MSR_PLATFORM_INFO, lo, hi); + ratio = (lo >> 8) & 0x1f; + } else { + rdmsr(MSR_IA32_PERF_STATUS, lo, hi); + ratio = (hi >> 8) & 0x1f; + } + pr_info("Maximum core-clock to bus-clock ratio: 0x%x\n", ratio); + + if (!ratio) + goto fail; + + /* Get FSB FREQ ID */ + rdmsr(MSR_FSB_FREQ, lo, hi); + freq_id = lo & 0x7; + freq = id_to_freq(cpu_index, freq_id); + pr_info("Resolved frequency ID: %u, frequency: %u KHz\n", + freq_id, freq); + if (!freq) + goto fail; + + /* TSC frequency = maximum resolved freq * maximum resolved bus ratio */ + res = freq * ratio; + pr_info("TSC runs at %lu KHz\n", res); + +#ifdef CONFIG_X86_LOCAL_APIC + lapic_timer_frequency = (freq * 1000) / HZ; + pr_info("lapic_timer_frequency = %d\n", lapic_timer_frequency); +#endif + return res; + +fail: + pr_warn("Fast TSC calibration using MSR failed\n"); + return 0; +} diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c new file mode 100644 index 00000000000..26488487bc6 --- /dev/null +++ b/arch/x86/kernel/tsc_sync.c @@ -0,0 +1,216 @@ +/* + * check TSC synchronization. + * + * Copyright (C) 2006, Red Hat, Inc., Ingo Molnar + * + * We check whether all boot CPUs have their TSC's synchronized, + * print a warning if not and turn off the TSC clock-source. + * + * The warp-check is point-to-point between two CPUs, the CPU + * initiating the bootup is the 'source CPU', the freshly booting + * CPU is the 'target CPU'. + * + * Only two CPUs may participate - they can enter in any order. + * ( The serial nature of the boot logic and the CPU hotplug lock + * protects against more than 2 CPUs entering this code. ) + */ +#include <linux/spinlock.h> +#include <linux/kernel.h> +#include <linux/smp.h> +#include <linux/nmi.h> +#include <asm/tsc.h> + +/* + * Entry/exit counters that make sure that both CPUs + * run the measurement code at once: + */ +static atomic_t start_count; +static atomic_t stop_count; + +/* + * We use a raw spinlock in this exceptional case, because + * we want to have the fastest, inlined, non-debug version + * of a critical section, to be able to prove TSC time-warps: + */ +static arch_spinlock_t sync_lock = __ARCH_SPIN_LOCK_UNLOCKED; + +static cycles_t last_tsc; +static cycles_t max_warp; +static int nr_warps; + +/* + * TSC-warp measurement loop running on both CPUs: + */ +static void check_tsc_warp(unsigned int timeout) +{ + cycles_t start, now, prev, end; + int i; + + rdtsc_barrier(); + start = get_cycles(); + rdtsc_barrier(); + /* + * The measurement runs for 'timeout' msecs: + */ + end = start + (cycles_t) tsc_khz * timeout; + now = start; + + for (i = 0; ; i++) { + /* + * We take the global lock, measure TSC, save the + * previous TSC that was measured (possibly on + * another CPU) and update the previous TSC timestamp. 
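+ *
+ * The same warp check can be demonstrated in user space with two threads
+ * and CLOCK_MONOTONIC standing in for RDTSC (a sketch; compile with
+ * -pthread):
+ *
+ *    #include <pthread.h>
+ *    #include <stdio.h>
+ *    #include <time.h>
+ *
+ *    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
+ *    static long long last_ns;
+ *
+ *    static long long now_ns(void)
+ *    {
+ *        struct timespec ts;
+ *        clock_gettime(CLOCK_MONOTONIC, &ts);
+ *        return ts.tv_sec * 1000000000LL + ts.tv_nsec;
+ *    }
+ *
+ *    static void *warp_check(void *arg)
+ *    {
+ *        for (int i = 0; i < 1000000; i++) {
+ *            pthread_mutex_lock(&lock);
+ *            long long prev = last_ns;  // last timestamp, any thread
+ *            long long now = now_ns();
+ *            last_ns = now;
+ *            pthread_mutex_unlock(&lock);
+ *            if (prev > now)            // time went backwards
+ *                printf("warp: %lld ns\n", prev - now);
+ *        }
+ *        return NULL;
+ *    }
+ *
+ *    int main(void)
+ *    {
+ *        pthread_t t;
+ *        pthread_create(&t, NULL, warp_check, NULL);
+ *        warp_check(NULL);
+ *        pthread_join(t, NULL);
+ *        return 0;
+ *    }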
+ */ + arch_spin_lock(&sync_lock); + prev = last_tsc; + rdtsc_barrier(); + now = get_cycles(); + rdtsc_barrier(); + last_tsc = now; + arch_spin_unlock(&sync_lock); + + /* + * Be nice every now and then (and also check whether + * measurement is done [we also insert a 10 million + * loops safety exit, so we dont lock up in case the + * TSC readout is totally broken]): + */ + if (unlikely(!(i & 7))) { + if (now > end || i > 10000000) + break; + cpu_relax(); + touch_nmi_watchdog(); + } + /* + * Outside the critical section we can now see whether + * we saw a time-warp of the TSC going backwards: + */ + if (unlikely(prev > now)) { + arch_spin_lock(&sync_lock); + max_warp = max(max_warp, prev - now); + nr_warps++; + arch_spin_unlock(&sync_lock); + } + } + WARN(!(now-start), + "Warning: zero tsc calibration delta: %Ld [max: %Ld]\n", + now-start, end-start); +} + +/* + * If the target CPU coming online doesn't have any of its core-siblings + * online, a timeout of 20msec will be used for the TSC-warp measurement + * loop. Otherwise a smaller timeout of 2msec will be used, as we have some + * information about this socket already (and this information grows as we + * have more and more logical-siblings in that socket). + * + * Ideally we should be able to skip the TSC sync check on the other + * core-siblings, if the first logical CPU in a socket passed the sync test. + * But as the TSC is per-logical CPU and can potentially be modified wrongly + * by the bios, TSC sync test for smaller duration should be able + * to catch such errors. Also this will catch the condition where all the + * cores in the socket doesn't get reset at the same time. + */ +static inline unsigned int loop_timeout(int cpu) +{ + return (cpumask_weight(cpu_core_mask(cpu)) > 1) ? 2 : 20; +} + +/* + * Source CPU calls into this - it waits for the freshly booted + * target CPU to arrive and then starts the measurement: + */ +void check_tsc_sync_source(int cpu) +{ + int cpus = 2; + + /* + * No need to check if we already know that the TSC is not + * synchronized: + */ + if (unsynchronized_tsc()) + return; + + if (tsc_clocksource_reliable) { + if (cpu == (nr_cpu_ids-1) || system_state != SYSTEM_BOOTING) + pr_info( + "Skipped synchronization checks as TSC is reliable.\n"); + return; + } + + /* + * Reset it - in case this is a second bootup: + */ + atomic_set(&stop_count, 0); + + /* + * Wait for the target to arrive: + */ + while (atomic_read(&start_count) != cpus-1) + cpu_relax(); + /* + * Trigger the target to continue into the measurement too: + */ + atomic_inc(&start_count); + + check_tsc_warp(loop_timeout(cpu)); + + while (atomic_read(&stop_count) != cpus-1) + cpu_relax(); + + if (nr_warps) { + pr_warning("TSC synchronization [CPU#%d -> CPU#%d]:\n", + smp_processor_id(), cpu); + pr_warning("Measured %Ld cycles TSC warp between CPUs, " + "turning off TSC clock.\n", max_warp); + mark_tsc_unstable("check_tsc_sync_source failed"); + } else { + pr_debug("TSC synchronization [CPU#%d -> CPU#%d]: passed\n", + smp_processor_id(), cpu); + } + + /* + * Reset it - just in case we boot another CPU later: + */ + atomic_set(&start_count, 0); + nr_warps = 0; + max_warp = 0; + last_tsc = 0; + + /* + * Let the target continue with the bootup: + */ + atomic_inc(&stop_count); +} + +/* + * Freshly booted CPUs call into this: + */ +void check_tsc_sync_target(void) +{ + int cpus = 2; + + if (unsynchronized_tsc() || tsc_clocksource_reliable) + return; + + /* + * Register this CPU's participation and wait for the + * source CPU to start the 
measurement: + */ + atomic_inc(&start_count); + while (atomic_read(&start_count) != cpus) + cpu_relax(); + + check_tsc_warp(loop_timeout(smp_processor_id())); + + /* + * Ok, we are done: + */ + atomic_inc(&stop_count); + + /* + * Wait for the source CPU to print stuff: + */ + while (atomic_read(&stop_count) != cpus) + cpu_relax(); +} diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c new file mode 100644 index 00000000000..5d1cbfe4ae5 --- /dev/null +++ b/arch/x86/kernel/uprobes.c @@ -0,0 +1,928 @@ +/* + * User-space Probes (UProbes) for x86 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2008-2011 + * Authors: + * Srikar Dronamraju + * Jim Keniston + */ +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/ptrace.h> +#include <linux/uprobes.h> +#include <linux/uaccess.h> + +#include <linux/kdebug.h> +#include <asm/processor.h> +#include <asm/insn.h> + +/* Post-execution fixups. */ + +/* Adjust IP back to vicinity of actual insn */ +#define UPROBE_FIX_IP 0x01 + +/* Adjust the return address of a call insn */ +#define UPROBE_FIX_CALL 0x02 + +/* Instruction will modify TF, don't change it */ +#define UPROBE_FIX_SETF 0x04 + +#define UPROBE_FIX_RIP_SI 0x08 +#define UPROBE_FIX_RIP_DI 0x10 +#define UPROBE_FIX_RIP_BX 0x20 +#define UPROBE_FIX_RIP_MASK \ + (UPROBE_FIX_RIP_SI | UPROBE_FIX_RIP_DI | UPROBE_FIX_RIP_BX) + +#define UPROBE_TRAP_NR UINT_MAX + +/* Adaptations for mhiramat x86 decoder v14. */ +#define OPCODE1(insn) ((insn)->opcode.bytes[0]) +#define OPCODE2(insn) ((insn)->opcode.bytes[1]) +#define OPCODE3(insn) ((insn)->opcode.bytes[2]) +#define MODRM_REG(insn) X86_MODRM_REG((insn)->modrm.value) + +#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\ + (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ + (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \ + (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \ + (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \ + << (row % 32)) + +/* + * Good-instruction tables for 32-bit apps. This is non-const and volatile + * to keep gcc from statically optimizing it out, as variable_test_bit makes + * some versions of gcc to think only *(unsigned long*) is used. 
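+ *
+ * Each W() row packs 16 one-bit flags at bit offset (row % 32), and pairs
+ * of rows are OR-ed into a single u32, so a lookup is one bit test. The
+ * same indexing in a standalone sketch (plain shifts instead of the
+ * kernel's test_bit()):
+ *
+ *    #include <stdio.h>
+ *    #include <stdint.h>
+ *
+ *    static int insn_is_good(const uint32_t *table, uint8_t opcode)
+ *    {
+ *        return (table[opcode / 32] >> (opcode % 32)) & 1;
+ *    }
+ *
+ *    int main(void)
+ *    {
+ *        uint32_t demo[256 / 32] = { 0 };
+ *
+ *        demo[0x90 / 32] |= 1u << (0x90 % 32);       // mark NOP (0x90) good
+ *        printf("%d %d\n", insn_is_good(demo, 0x90), // 1
+ *                          insn_is_good(demo, 0xcc)); // 0
+ *        return 0;
+ *    }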
+ */ +#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) +static volatile u32 good_insns_32[256 / 32] = { + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* ---------------------------------------------- */ + W(0x00, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) | /* 00 */ + W(0x10, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) , /* 10 */ + W(0x20, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1) | /* 20 */ + W(0x30, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1) , /* 30 */ + W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */ + W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */ + W(0x60, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */ + W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */ + W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */ + W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */ + W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */ + W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */ + W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */ + W(0xd0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ + W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* e0 */ + W(0xf0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1) /* f0 */ + /* ---------------------------------------------- */ + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ +}; +#else +#define good_insns_32 NULL +#endif + +/* Good-instruction tables for 64-bit apps */ +#if defined(CONFIG_X86_64) +static volatile u32 good_insns_64[256 / 32] = { + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* ---------------------------------------------- */ + W(0x00, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) | /* 00 */ + W(0x10, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) , /* 10 */ + W(0x20, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) | /* 20 */ + W(0x30, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) , /* 30 */ + W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 40 */ + W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */ + W(0x60, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */ + W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */ + W(0x80, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */ + W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */ + W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */ + W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */ + W(0xc0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */ + W(0xd0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ + W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* e0 */ + W(0xf0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1) /* f0 */ + /* ---------------------------------------------- */ + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ +}; +#else +#define good_insns_64 NULL +#endif + +/* Using this for both 64-bit and 32-bit apps */ +static volatile u32 good_2byte_insns[256 / 32] = { + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* ---------------------------------------------- */ + W(0x00, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1) | /* 00 */ + W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* 10 */ + W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */ + W(0x30, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */ + W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */ + W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */ + W(0x60, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 60 */ + W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) , /* 70 */ + W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */ + W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */ + W(0xa0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1) | /* a0 */ + W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* b0 */ + W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */ + W(0xd0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ + W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* e0 */ + W(0xf0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0) /* f0 */ + /* ---------------------------------------------- */ + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ +}; +#undef W + +/* + * opcodes we'll probably never support: + * + * 6c-6d, e4-e5, ec-ed - in + * 6e-6f, e6-e7, ee-ef - out + * cc, cd - int3, int + * cf - iret + * d6 - illegal instruction + * f1 - int1/icebp + * f4 - hlt + * fa, fb - cli, sti + * 0f - lar, lsl, syscall, clts, sysret, sysenter, sysexit, invd, wbinvd, ud2 + * + * invalid opcodes in 64-bit mode: + * + * 06, 0e, 16, 1e, 27, 2f, 37, 3f, 60-62, 82, c4-c5, d4-d5 + * 63 - we support this opcode in x86_64 but not in i386. + * + * opcodes we may need to refine support for: + * + * 0f - 2-byte instructions: For many of these instructions, the validity + * depends on the prefix and/or the reg field. On such instructions, we + * just consider the opcode combination valid if it corresponds to any + * valid instruction. + * + * 8f - Group 1 - only reg = 0 is OK + * c6-c7 - Group 11 - only reg = 0 is OK + * d9-df - fpu insns with some illegal encodings + * f2, f3 - repnz, repz prefixes. These are also the first byte for + * certain floating-point instructions, such as addsd. + * + * fe - Group 4 - only reg = 0 or 1 is OK + * ff - Group 5 - only reg = 0-6 is OK + * + * others -- Do we need to support these? + * + * 0f - (floating-point?) prefetch instructions + * 07, 17, 1f - pop es, pop ss, pop ds + * 26, 2e, 36, 3e - es:, cs:, ss:, ds: segment prefixes -- + * but 64 and 65 (fs: and gs:) seem to be used, so we support them + * 67 - addr16 prefix + * ce - into + * f0 - lock prefix + */ + +/* + * TODO: + * - Where necessary, examine the modrm byte and allow only valid instructions + * in the different Groups and fpu instructions. 
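+ *
+ * A sketch of the kind of modrm screening this TODO asks for, e.g. for
+ * opcode 0xff (Group 5), where only reg = 0..6 encodes a defined insn:
+ *
+ *    #include <stdint.h>
+ *
+ *    // modrm is mod(2) | reg(3) | rm(3); reg selects the group member
+ *    static inline uint8_t modrm_reg(uint8_t modrm)
+ *    {
+ *        return (modrm >> 3) & 0x7;
+ *    }
+ *
+ *    static int group5_valid(uint8_t modrm)
+ *    {
+ *        return modrm_reg(modrm) != 7;  // ff /7 is undefined
+ *    }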
+ */ + +static bool is_prefix_bad(struct insn *insn) +{ + int i; + + for (i = 0; i < insn->prefixes.nbytes; i++) { + switch (insn->prefixes.bytes[i]) { + case 0x26: /* INAT_PFX_ES */ + case 0x2E: /* INAT_PFX_CS */ + case 0x36: /* INAT_PFX_DS */ + case 0x3E: /* INAT_PFX_SS */ + case 0xF0: /* INAT_PFX_LOCK */ + return true; + } + } + return false; +} + +static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool x86_64) +{ + u32 volatile *good_insns; + + insn_init(insn, auprobe->insn, x86_64); + /* has the side-effect of processing the entire instruction */ + insn_get_length(insn); + if (WARN_ON_ONCE(!insn_complete(insn))) + return -ENOEXEC; + + if (is_prefix_bad(insn)) + return -ENOTSUPP; + + if (x86_64) + good_insns = good_insns_64; + else + good_insns = good_insns_32; + + if (test_bit(OPCODE1(insn), (unsigned long *)good_insns)) + return 0; + + if (insn->opcode.nbytes == 2) { + if (test_bit(OPCODE2(insn), (unsigned long *)good_2byte_insns)) + return 0; + } + + return -ENOTSUPP; +} + +#ifdef CONFIG_X86_64 +static inline bool is_64bit_mm(struct mm_struct *mm) +{ + return !config_enabled(CONFIG_IA32_EMULATION) || + !(mm->context.ia32_compat == TIF_IA32); +} +/* + * If arch_uprobe->insn doesn't use rip-relative addressing, return + * immediately. Otherwise, rewrite the instruction so that it accesses + * its memory operand indirectly through a scratch register. Set + * defparam->fixups accordingly. (The contents of the scratch register + * will be saved before we single-step the modified instruction, + * and restored afterward). + * + * We do this because a rip-relative instruction can access only a + * relatively small area (+/- 2 GB from the instruction), and the XOL + * area typically lies beyond that area. At least for instructions + * that store to memory, we can't execute the original instruction + * and "fix things up" later, because the misdirected store could be + * disastrous. + * + * Some useful facts about rip-relative instructions: + * + * - There's always a modrm byte with bit layout "00 reg 101". + * - There's never a SIB byte. + * - The displacement is always 4 bytes. + * - REX.B=1 bit in REX prefix, which normally extends r/m field, + * has no effect on rip-relative mode. It doesn't make modrm byte + * with r/m=101 refer to register 1101 = R13. + */ +static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn) +{ + u8 *cursor; + u8 reg; + u8 reg2; + + if (!insn_rip_relative(insn)) + return; + + /* + * insn_rip_relative() would have decoded rex_prefix, vex_prefix, modrm. + * Clear REX.b bit (extension of MODRM.rm field): + * we want to encode low numbered reg, not r8+. + */ + if (insn->rex_prefix.nbytes) { + cursor = auprobe->insn + insn_offset_rex_prefix(insn); + /* REX byte has 0100wrxb layout, clearing REX.b bit */ + *cursor &= 0xfe; + } + /* + * Similar treatment for VEX3 prefix. + * TODO: add XOP/EVEX treatment when insn decoder supports them + */ + if (insn->vex_prefix.nbytes == 3) { + /* + * vex2: c5 rvvvvLpp (has no b bit) + * vex3/xop: c4/8f rxbmmmmm wvvvvLpp + * evex: 62 rxbR00mm wvvvv1pp zllBVaaa + * (evex will need setting of both b and x since + * in non-sib encoding evex.x is 4th bit of MODRM.rm) + * Setting VEX3.b (setting because it has inverted meaning): + */ + cursor = auprobe->insn + insn_offset_vex_prefix(insn) + 1; + *cursor |= 0x20; + } + + /* + * Convert from rip-relative addressing to register-relative addressing + * via a scratch register. 
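+ *
+ * In byte terms, the REX fixup above is a single masked store: REX is
+ * 0100WRXB, so clearing bit 0 drops the B extension. A sketch, assuming
+ * the offset of a known-present REX prefix is already in hand:
+ *
+ *    #include <stdint.h>
+ *
+ *    static void rex_clear_b(uint8_t *insn, int rex_off)
+ *    {
+ *        insn[rex_off] &= 0xfe;  // 0100WRXB -> 0100WRX0
+ *    }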
+ * + * This is tricky since there are insns with modrm byte + * which also use registers not encoded in modrm byte: + * [i]div/[i]mul: implicitly use dx:ax + * shift ops: implicitly use cx + * cmpxchg: implicitly uses ax + * cmpxchg8/16b: implicitly uses dx:ax and bx:cx + * Encoding: 0f c7/1 modrm + * The code below thinks that reg=1 (cx), chooses si as scratch. + * mulx: implicitly uses dx: mulx r/m,r1,r2 does r1:r2 = dx * r/m. + * First appeared in Haswell (BMI2 insn). It is vex-encoded. + * Example where none of bx,cx,dx can be used as scratch reg: + * c4 e2 63 f6 0d disp32 mulx disp32(%rip),%ebx,%ecx + * [v]pcmpistri: implicitly uses cx, xmm0 + * [v]pcmpistrm: implicitly uses xmm0 + * [v]pcmpestri: implicitly uses ax, dx, cx, xmm0 + * [v]pcmpestrm: implicitly uses ax, dx, xmm0 + * Evil SSE4.2 string comparison ops from hell. + * maskmovq/[v]maskmovdqu: implicitly uses (ds:rdi) as destination. + * Encoding: 0f f7 modrm, 66 0f f7 modrm, vex-encoded: c5 f9 f7 modrm. + * Store op1, byte-masked by op2 msb's in each byte, to (ds:rdi). + * AMD says it has no 3-operand form (vex.vvvv must be 1111) + * and that it can have only register operands, not mem + * (its modrm byte must have mode=11). + * If these restrictions will ever be lifted, + * we'll need code to prevent selection of di as scratch reg! + * + * Summary: I don't know any insns with modrm byte which + * use SI register implicitly. DI register is used only + * by one insn (maskmovq) and BX register is used + * only by one too (cmpxchg8b). + * BP is stack-segment based (may be a problem?). + * AX, DX, CX are off-limits (many implicit users). + * SP is unusable (it's stack pointer - think about "pop mem"; + * also, rsp+disp32 needs sib encoding -> insn length change). + */ + + reg = MODRM_REG(insn); /* Fetch modrm.reg */ + reg2 = 0xff; /* Fetch vex.vvvv */ + if (insn->vex_prefix.nbytes == 2) + reg2 = insn->vex_prefix.bytes[1]; + else if (insn->vex_prefix.nbytes == 3) + reg2 = insn->vex_prefix.bytes[2]; + /* + * TODO: add XOP, EXEV vvvv reading. + * + * vex.vvvv field is in bits 6-3, bits are inverted. + * But in 32-bit mode, high-order bit may be ignored. + * Therefore, let's consider only 3 low-order bits. + */ + reg2 = ((reg2 >> 3) & 0x7) ^ 0x7; + /* + * Register numbering is ax,cx,dx,bx, sp,bp,si,di, r8..r15. + * + * Choose scratch reg. Order is important: must not select bx + * if we can use si (cmpxchg8b case!) + */ + if (reg != 6 && reg2 != 6) { + reg2 = 6; + auprobe->defparam.fixups |= UPROBE_FIX_RIP_SI; + } else if (reg != 7 && reg2 != 7) { + reg2 = 7; + auprobe->defparam.fixups |= UPROBE_FIX_RIP_DI; + /* TODO (paranoia): force maskmovq to not use di */ + } else { + reg2 = 3; + auprobe->defparam.fixups |= UPROBE_FIX_RIP_BX; + } + /* + * Point cursor at the modrm byte. The next 4 bytes are the + * displacement. Beyond the displacement, for some instructions, + * is the immediate operand. + */ + cursor = auprobe->insn + insn_offset_modrm(insn); + /* + * Change modrm from "00 reg 101" to "10 reg reg2". 
Example: + * 89 05 disp32 mov %eax,disp32(%rip) becomes + * 89 86 disp32 mov %eax,disp32(%rsi) + */ + *cursor = 0x80 | (reg << 3) | reg2; +} + +static inline unsigned long * +scratch_reg(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + if (auprobe->defparam.fixups & UPROBE_FIX_RIP_SI) + return ®s->si; + if (auprobe->defparam.fixups & UPROBE_FIX_RIP_DI) + return ®s->di; + return ®s->bx; +} + +/* + * If we're emulating a rip-relative instruction, save the contents + * of the scratch register and store the target address in that register. + */ +static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + if (auprobe->defparam.fixups & UPROBE_FIX_RIP_MASK) { + struct uprobe_task *utask = current->utask; + unsigned long *sr = scratch_reg(auprobe, regs); + + utask->autask.saved_scratch_register = *sr; + *sr = utask->vaddr + auprobe->defparam.ilen; + } +} + +static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + if (auprobe->defparam.fixups & UPROBE_FIX_RIP_MASK) { + struct uprobe_task *utask = current->utask; + unsigned long *sr = scratch_reg(auprobe, regs); + + *sr = utask->autask.saved_scratch_register; + } +} +#else /* 32-bit: */ +static inline bool is_64bit_mm(struct mm_struct *mm) +{ + return false; +} +/* + * No RIP-relative addressing on 32-bit + */ +static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn) +{ +} +static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ +} +static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ +} +#endif /* CONFIG_X86_64 */ + +struct uprobe_xol_ops { + bool (*emulate)(struct arch_uprobe *, struct pt_regs *); + int (*pre_xol)(struct arch_uprobe *, struct pt_regs *); + int (*post_xol)(struct arch_uprobe *, struct pt_regs *); + void (*abort)(struct arch_uprobe *, struct pt_regs *); +}; + +static inline int sizeof_long(void) +{ + return is_ia32_task() ? 4 : 8; +} + +static int default_pre_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + riprel_pre_xol(auprobe, regs); + return 0; +} + +static int push_ret_address(struct pt_regs *regs, unsigned long ip) +{ + unsigned long new_sp = regs->sp - sizeof_long(); + + if (copy_to_user((void __user *)new_sp, &ip, sizeof_long())) + return -EFAULT; + + regs->sp = new_sp; + return 0; +} + +/* + * We have to fix things up as follows: + * + * Typically, the new ip is relative to the copied instruction. We need + * to make it relative to the original instruction (FIX_IP). Exceptions + * are return instructions and absolute or indirect jump or call instructions. + * + * If the single-stepped instruction was a call, the return address that + * is atop the stack is the address following the copied instruction. We + * need to make it the address following the original instruction (FIX_CALL). + * + * If the original instruction was a rip-relative instruction such as + * "movl %edx,0xnnnn(%rip)", we have instead executed an equivalent + * instruction using a scratch register -- e.g., "movl %edx,0xnnnn(%rsi)". + * We need to restore the contents of the scratch register + * (FIX_RIP_reg). 
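+ *
+ * The FIX_IP correction in default_post_xol_op() below, in isolation
+ * (a user-space sketch; vaddr is the probed address, xol_vaddr the slot
+ * the copy was stepped from, all values made up):
+ *
+ *    #include <stdio.h>
+ *
+ *    int main(void)
+ *    {
+ *        unsigned long vaddr = 0x400512;           // original insn
+ *        unsigned long xol_vaddr = 0x7f0000001000; // XOL slot
+ *        unsigned long ip = 0x7f0000001002;        // ip after single-step
+ *
+ *        ip += vaddr - xol_vaddr;                  // UPROBE_FIX_IP
+ *        printf("resumed ip: %#lx\n", ip);         // 0x400514
+ *        return 0;
+ *    }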
+ */ +static int default_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + struct uprobe_task *utask = current->utask; + + riprel_post_xol(auprobe, regs); + if (auprobe->defparam.fixups & UPROBE_FIX_IP) { + long correction = utask->vaddr - utask->xol_vaddr; + regs->ip += correction; + } else if (auprobe->defparam.fixups & UPROBE_FIX_CALL) { + regs->sp += sizeof_long(); /* Pop incorrect return address */ + if (push_ret_address(regs, utask->vaddr + auprobe->defparam.ilen)) + return -ERESTART; + } + /* popf; tell the caller to not touch TF */ + if (auprobe->defparam.fixups & UPROBE_FIX_SETF) + utask->autask.saved_tf = true; + + return 0; +} + +static void default_abort_op(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + riprel_post_xol(auprobe, regs); +} + +static struct uprobe_xol_ops default_xol_ops = { + .pre_xol = default_pre_xol_op, + .post_xol = default_post_xol_op, + .abort = default_abort_op, +}; + +static bool branch_is_call(struct arch_uprobe *auprobe) +{ + return auprobe->branch.opc1 == 0xe8; +} + +#define CASE_COND \ + COND(70, 71, XF(OF)) \ + COND(72, 73, XF(CF)) \ + COND(74, 75, XF(ZF)) \ + COND(78, 79, XF(SF)) \ + COND(7a, 7b, XF(PF)) \ + COND(76, 77, XF(CF) || XF(ZF)) \ + COND(7c, 7d, XF(SF) != XF(OF)) \ + COND(7e, 7f, XF(ZF) || XF(SF) != XF(OF)) + +#define COND(op_y, op_n, expr) \ + case 0x ## op_y: DO((expr) != 0) \ + case 0x ## op_n: DO((expr) == 0) + +#define XF(xf) (!!(flags & X86_EFLAGS_ ## xf)) + +static bool is_cond_jmp_opcode(u8 opcode) +{ + switch (opcode) { + #define DO(expr) \ + return true; + CASE_COND + #undef DO + + default: + return false; + } +} + +static bool check_jmp_cond(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + unsigned long flags = regs->flags; + + switch (auprobe->branch.opc1) { + #define DO(expr) \ + return expr; + CASE_COND + #undef DO + + default: /* not a conditional jmp */ + return true; + } +} + +#undef XF +#undef COND +#undef CASE_COND + +static bool branch_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + unsigned long new_ip = regs->ip += auprobe->branch.ilen; + unsigned long offs = (long)auprobe->branch.offs; + + if (branch_is_call(auprobe)) { + /* + * If it fails we execute this (mangled, see the comment in + * branch_clear_offset) insn out-of-line. In the likely case + * this should trigger the trap, and the probed application + * should die or restart the same insn after it handles the + * signal, arch_uprobe_post_xol() won't be even called. + * + * But there is corner case, see the comment in ->post_xol(). + */ + if (push_ret_address(regs, new_ip)) + return false; + } else if (!check_jmp_cond(auprobe, regs)) { + offs = 0; + } + + regs->ip = new_ip + offs; + return true; +} + +static int branch_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + BUG_ON(!branch_is_call(auprobe)); + /* + * We can only get here if branch_emulate_op() failed to push the ret + * address _and_ another thread expanded our stack before the (mangled) + * "call" insn was executed out-of-line. Just restore ->sp and restart. + * We could also restore ->ip and try to call branch_emulate_op() again. + */ + regs->sp += sizeof_long(); + return -ERESTART; +} + +static void branch_clear_offset(struct arch_uprobe *auprobe, struct insn *insn) +{ + /* + * Turn this insn into "call 1f; 1:", this is what we will execute + * out-of-line if ->emulate() fails. We only need this to generate + * a trap, so that the probed task receives the correct signal with + * the properly filled siginfo. 
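+ *
+ * In byte terms (a sketch): "e8 d3 00 00 00" (call +0xd3) has its four
+ * immediate bytes cleared, leaving "e8 00 00 00 00" -- a call to the
+ * very next instruction:
+ *
+ *    #include <string.h>
+ *    #include <stdint.h>
+ *
+ *    static void clear_call_offset(uint8_t *insn, int imm_off, int imm_len)
+ *    {
+ *        memset(insn + imm_off, 0, imm_len);  // e8 imm32 -> e8 00000000
+ *    }
+ *
+ * That mangled call is what may get executed out-of-line.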
+ * + * But see the comment in ->post_xol(), in the unlikely case it can + * succeed. So we need to ensure that the new ->ip can not fall into + * the non-canonical area and trigger #GP. + * + * We could turn it into (say) "pushf", but then we would need to + * divorce ->insn[] and ->ixol[]. We need to preserve the 1st byte + * of ->insn[] for set_orig_insn(). + */ + memset(auprobe->insn + insn_offset_immediate(insn), + 0, insn->immediate.nbytes); +} + +static struct uprobe_xol_ops branch_xol_ops = { + .emulate = branch_emulate_op, + .post_xol = branch_post_xol_op, +}; + +/* Returns -ENOSYS if branch_xol_ops doesn't handle this insn */ +static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) +{ + u8 opc1 = OPCODE1(insn); + int i; + + switch (opc1) { + case 0xeb: /* jmp 8 */ + case 0xe9: /* jmp 32 */ + case 0x90: /* prefix* + nop; same as jmp with .offs = 0 */ + break; + + case 0xe8: /* call relative */ + branch_clear_offset(auprobe, insn); + break; + + case 0x0f: + if (insn->opcode.nbytes != 2) + return -ENOSYS; + /* + * If it is a "near" conditional jmp, OPCODE2() - 0x10 matches + * OPCODE1() of the "short" jmp which checks the same condition. + */ + opc1 = OPCODE2(insn) - 0x10; + default: + if (!is_cond_jmp_opcode(opc1)) + return -ENOSYS; + } + + /* + * 16-bit overrides such as CALLW (66 e8 nn nn) are not supported. + * Intel and AMD behavior differ in 64-bit mode: Intel ignores 66 prefix. + * No one uses these insns, reject any branch insns with such prefix. + */ + for (i = 0; i < insn->prefixes.nbytes; i++) { + if (insn->prefixes.bytes[i] == 0x66) + return -ENOTSUPP; + } + + auprobe->branch.opc1 = opc1; + auprobe->branch.ilen = insn->length; + auprobe->branch.offs = insn->immediate.value; + + auprobe->ops = &branch_xol_ops; + return 0; +} + +/** + * arch_uprobe_analyze_insn - instruction analysis including validity and fixups. + * @mm: the probed address space. + * @arch_uprobe: the probepoint information. + * @addr: virtual address at which to install the probepoint + * Return 0 on success or a -ve number on error. + */ +int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr) +{ + struct insn insn; + u8 fix_ip_or_call = UPROBE_FIX_IP; + int ret; + + ret = uprobe_init_insn(auprobe, &insn, is_64bit_mm(mm)); + if (ret) + return ret; + + ret = branch_setup_xol_ops(auprobe, &insn); + if (ret != -ENOSYS) + return ret; + + /* + * Figure out which fixups default_post_xol_op() will need to perform, + * and annotate defparam->fixups accordingly. + */ + switch (OPCODE1(&insn)) { + case 0x9d: /* popf */ + auprobe->defparam.fixups |= UPROBE_FIX_SETF; + break; + case 0xc3: /* ret or lret -- ip is correct */ + case 0xcb: + case 0xc2: + case 0xca: + case 0xea: /* jmp absolute -- ip is correct */ + fix_ip_or_call = 0; + break; + case 0x9a: /* call absolute - Fix return addr, not ip */ + fix_ip_or_call = UPROBE_FIX_CALL; + break; + case 0xff: + switch (MODRM_REG(&insn)) { + case 2: case 3: /* call or lcall, indirect */ + fix_ip_or_call = UPROBE_FIX_CALL; + break; + case 4: case 5: /* jmp or ljmp, indirect */ + fix_ip_or_call = 0; + break; + } + /* fall through */ + default: + riprel_analyze(auprobe, &insn); + } + + auprobe->defparam.ilen = insn.length; + auprobe->defparam.fixups |= fix_ip_or_call; + + auprobe->ops = &default_xol_ops; + return 0; +} + +/* + * arch_uprobe_pre_xol - prepare to execute out of line. + * @auprobe: the probepoint information. + * @regs: reflects the saved user state of current task. 
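+ *
+ * (For orientation: X86_EFLAGS_TF is bit 8 of EFLAGS; setting it makes
+ * the CPU raise a debug trap after the next instruction retires, which
+ * is how the copied instruction gets stepped exactly once. A one-line
+ * sketch of the arming step:)
+ *
+ *    #define X86_EFLAGS_TF (1UL << 8)
+ *
+ *    static void arm_single_step(unsigned long *flags)
+ *    {
+ *        *flags |= X86_EFLAGS_TF;  // trap after one instruction
+ *    }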
+ */ +int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + struct uprobe_task *utask = current->utask; + + if (auprobe->ops->pre_xol) { + int err = auprobe->ops->pre_xol(auprobe, regs); + if (err) + return err; + } + + regs->ip = utask->xol_vaddr; + utask->autask.saved_trap_nr = current->thread.trap_nr; + current->thread.trap_nr = UPROBE_TRAP_NR; + + utask->autask.saved_tf = !!(regs->flags & X86_EFLAGS_TF); + regs->flags |= X86_EFLAGS_TF; + if (test_tsk_thread_flag(current, TIF_BLOCKSTEP)) + set_task_blockstep(current, false); + + return 0; +} + +/* + * If xol insn itself traps and generates a signal(Say, + * SIGILL/SIGSEGV/etc), then detect the case where a singlestepped + * instruction jumps back to its own address. It is assumed that anything + * like do_page_fault/do_trap/etc sets thread.trap_nr != -1. + * + * arch_uprobe_pre_xol/arch_uprobe_post_xol save/restore thread.trap_nr, + * arch_uprobe_xol_was_trapped() simply checks that ->trap_nr is not equal to + * UPROBE_TRAP_NR == -1 set by arch_uprobe_pre_xol(). + */ +bool arch_uprobe_xol_was_trapped(struct task_struct *t) +{ + if (t->thread.trap_nr != UPROBE_TRAP_NR) + return true; + + return false; +} + +/* + * Called after single-stepping. To avoid the SMP problems that can + * occur when we temporarily put back the original opcode to + * single-step, we single-stepped a copy of the instruction. + * + * This function prepares to resume execution after the single-step. + */ +int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + struct uprobe_task *utask = current->utask; + bool send_sigtrap = utask->autask.saved_tf; + int err = 0; + + WARN_ON_ONCE(current->thread.trap_nr != UPROBE_TRAP_NR); + current->thread.trap_nr = utask->autask.saved_trap_nr; + + if (auprobe->ops->post_xol) { + err = auprobe->ops->post_xol(auprobe, regs); + if (err) { + /* + * Restore ->ip for restart or post mortem analysis. + * ->post_xol() must not return -ERESTART unless this + * is really possible. + */ + regs->ip = utask->vaddr; + if (err == -ERESTART) + err = 0; + send_sigtrap = false; + } + } + /* + * arch_uprobe_pre_xol() doesn't save the state of TIF_BLOCKSTEP + * so we can get an extra SIGTRAP if we do not clear TF. We need + * to examine the opcode to make it right. + */ + if (send_sigtrap) + send_sig(SIGTRAP, current, 0); + + if (!utask->autask.saved_tf) + regs->flags &= ~X86_EFLAGS_TF; + + return err; +} + +/* callback routine for handling exceptions. */ +int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, void *data) +{ + struct die_args *args = data; + struct pt_regs *regs = args->regs; + int ret = NOTIFY_DONE; + + /* We are only interested in userspace traps */ + if (regs && !user_mode_vm(regs)) + return NOTIFY_DONE; + + switch (val) { + case DIE_INT3: + if (uprobe_pre_sstep_notifier(regs)) + ret = NOTIFY_STOP; + + break; + + case DIE_DEBUG: + if (uprobe_post_sstep_notifier(regs)) + ret = NOTIFY_STOP; + + default: + break; + } + + return ret; +} + +/* + * This function gets called when XOL instruction either gets trapped or + * the thread has a fatal signal. Reset the instruction pointer to its + * probed address for the potential restart or for post mortem analysis. 
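+ *
+ * Related: arch_uretprobe_hijack_return_addr() further down swaps the
+ * return address sitting on top of the user stack for the trampoline.
+ * The core pointer swap, as a plain user-space sketch with made-up
+ * addresses:
+ *
+ *    #include <stdio.h>
+ *
+ *    int main(void)
+ *    {
+ *        unsigned long stack[1] = { 0x400a10 }; // slot 0: return addr
+ *        unsigned long trampoline = 0x7fff1000;
+ *        unsigned long orig = stack[0];
+ *
+ *        if (orig != trampoline)                // not hijacked yet
+ *            stack[0] = trampoline;
+ *        printf("orig %#lx, now %#lx\n", orig, stack[0]);
+ *        return 0;
+ *    }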
+ */ +void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + struct uprobe_task *utask = current->utask; + + if (auprobe->ops->abort) + auprobe->ops->abort(auprobe, regs); + + current->thread.trap_nr = utask->autask.saved_trap_nr; + regs->ip = utask->vaddr; + /* clear TF if it was set by us in arch_uprobe_pre_xol() */ + if (!utask->autask.saved_tf) + regs->flags &= ~X86_EFLAGS_TF; +} + +static bool __skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + if (auprobe->ops->emulate) + return auprobe->ops->emulate(auprobe, regs); + return false; +} + +bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + bool ret = __skip_sstep(auprobe, regs); + if (ret && (regs->flags & X86_EFLAGS_TF)) + send_sig(SIGTRAP, current, 0); + return ret; +} + +unsigned long +arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs) +{ + int rasize = sizeof_long(), nleft; + unsigned long orig_ret_vaddr = 0; /* clear high bits for 32-bit apps */ + + if (copy_from_user(&orig_ret_vaddr, (void __user *)regs->sp, rasize)) + return -1; + + /* check whether address has been already hijacked */ + if (orig_ret_vaddr == trampoline_vaddr) + return orig_ret_vaddr; + + nleft = copy_to_user((void __user *)regs->sp, &trampoline_vaddr, rasize); + if (likely(!nleft)) + return orig_ret_vaddr; + + if (nleft != rasize) { + pr_err("uprobe: return address clobbered: pid=%d, %%sp=%#lx, " + "%%ip=%#lx\n", current->pid, regs->sp, regs->ip); + + force_sig_info(SIGSEGV, SEND_SIG_FORCED, current); + } + + return -1; +} diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S new file mode 100644 index 00000000000..b9242bacbe5 --- /dev/null +++ b/arch/x86/kernel/verify_cpu.S @@ -0,0 +1,139 @@ +/* + * + * verify_cpu.S - Code for cpu long mode and SSE verification. This + * code has been borrowed from boot/setup.S and was introduced by + * Andi Kleen. + * + * Copyright (c) 2007 Andi Kleen (ak@suse.de) + * Copyright (c) 2007 Eric Biederman (ebiederm@xmission.com) + * Copyright (c) 2007 Vivek Goyal (vgoyal@in.ibm.com) + * Copyright (c) 2010 Kees Cook (kees.cook@canonical.com) + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + * + * This is a common code for verification whether CPU supports + * long mode and SSE or not. It is not called directly instead this + * file is included at various places and compiled in that context. + * This file is expected to run in 32bit code. Currently: + * + * arch/x86/boot/compressed/head_64.S: Boot cpu verification + * arch/x86/kernel/trampoline_64.S: secondary processor verification + * arch/x86/kernel/head_32.S: processor startup + * + * verify_cpu, returns the status of longmode and SSE in register %eax. + * 0: Success 1: Failure + * + * On Intel, the XD_DISABLE flag will be cleared as a side-effect. + * + * The caller needs to check for the error code and take the action + * appropriately. Either display a message or halt. 
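+ *
+ * The same long-mode test, expressed in C for reference (GCC's cpuid.h;
+ * the LM flag is EDX bit 29 of leaf 0x80000001):
+ *
+ *    #include <cpuid.h>
+ *    #include <stdio.h>
+ *
+ *    int main(void)
+ *    {
+ *        unsigned int a, b, c, d;
+ *
+ *        if (!__get_cpuid(0x80000001, &a, &b, &c, &d))
+ *            return 1;  // extended leaf not implemented
+ *        printf("long mode: %s\n", (d & (1u << 29)) ? "yes" : "no");
+ *        return 0;
+ *    }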
+ */ + +#include <asm/cpufeature.h> +#include <asm/msr-index.h> + +verify_cpu: + pushfl # Save caller passed flags + pushl $0 # Kill any dangerous flags + popfl + + pushfl # standard way to check for cpuid + popl %eax + movl %eax,%ebx + xorl $0x200000,%eax + pushl %eax + popfl + pushfl + popl %eax + cmpl %eax,%ebx + jz verify_cpu_no_longmode # cpu has no cpuid + + movl $0x0,%eax # See if cpuid 1 is implemented + cpuid + cmpl $0x1,%eax + jb verify_cpu_no_longmode # no cpuid 1 + + xor %di,%di + cmpl $0x68747541,%ebx # AuthenticAMD + jnz verify_cpu_noamd + cmpl $0x69746e65,%edx + jnz verify_cpu_noamd + cmpl $0x444d4163,%ecx + jnz verify_cpu_noamd + mov $1,%di # cpu is from AMD + jmp verify_cpu_check + +verify_cpu_noamd: + cmpl $0x756e6547,%ebx # GenuineIntel? + jnz verify_cpu_check + cmpl $0x49656e69,%edx + jnz verify_cpu_check + cmpl $0x6c65746e,%ecx + jnz verify_cpu_check + + # only call IA32_MISC_ENABLE when: + # family > 6 || (family == 6 && model >= 0xd) + movl $0x1, %eax # check CPU family and model + cpuid + movl %eax, %ecx + + andl $0x0ff00f00, %eax # mask family and extended family + shrl $8, %eax + cmpl $6, %eax + ja verify_cpu_clear_xd # family > 6, ok + jb verify_cpu_check # family < 6, skip + + andl $0x000f00f0, %ecx # mask model and extended model + shrl $4, %ecx + cmpl $0xd, %ecx + jb verify_cpu_check # family == 6, model < 0xd, skip + +verify_cpu_clear_xd: + movl $MSR_IA32_MISC_ENABLE, %ecx + rdmsr + btrl $2, %edx # clear MSR_IA32_MISC_ENABLE_XD_DISABLE + jnc verify_cpu_check # only write MSR if bit was changed + wrmsr + +verify_cpu_check: + movl $0x1,%eax # Does the cpu have what it takes + cpuid + andl $REQUIRED_MASK0,%edx + xorl $REQUIRED_MASK0,%edx + jnz verify_cpu_no_longmode + + movl $0x80000000,%eax # See if extended cpuid is implemented + cpuid + cmpl $0x80000001,%eax + jb verify_cpu_no_longmode # no extended cpuid + + movl $0x80000001,%eax # Does the cpu have what it takes + cpuid + andl $REQUIRED_MASK1,%edx + xorl $REQUIRED_MASK1,%edx + jnz verify_cpu_no_longmode + +verify_cpu_sse_test: + movl $1,%eax + cpuid + andl $SSE_MASK,%edx + cmpl $SSE_MASK,%edx + je verify_cpu_sse_ok + test %di,%di + jz verify_cpu_no_longmode # only try to force SSE on AMD + movl $MSR_K7_HWCR,%ecx + rdmsr + btr $15,%eax # enable SSE + wrmsr + xor %di,%di # don't loop + jmp verify_cpu_sse_test # try again + +verify_cpu_no_longmode: + popfl # Restore caller passed flags + movl $1,%eax + ret +verify_cpu_sse_ok: + popfl # Restore caller passed flags + xorl %eax, %eax + ret diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c new file mode 100644 index 00000000000..e8edcf52e06 --- /dev/null +++ b/arch/x86/kernel/vm86_32.c @@ -0,0 +1,841 @@ +/* + * Copyright (C) 1994 Linus Torvalds + * + * 29 dec 2001 - Fixed oopses caused by unchecked access to the vm86 + * stack - Manfred Spraul <manfred@colorfullife.com> + * + * 22 mar 2002 - Manfred detected the stackfaults, but didn't handle + * them correctly. Now the emulation will be in a + * consistent state after stackfaults - Kasper Dupont + * <kasperd@daimi.au.dk> + * + * 22 mar 2002 - Added missing clear_IF in set_vflags_* Kasper Dupont + * <kasperd@daimi.au.dk> + * + * ?? ??? 2002 - Fixed premature returns from handle_vm86_fault + * caused by Kasper Dupont's changes - Stas Sergeev + * + * 4 apr 2002 - Fixed CHECK_IF_IN_TRAP broken by Stas' changes. + * Kasper Dupont <kasperd@daimi.au.dk> + * + * 9 apr 2002 - Changed syntax of macros in handle_vm86_fault. 
+ * Kasper Dupont <kasperd@daimi.au.dk> + * + * 9 apr 2002 - Changed stack access macros to jump to a label + * instead of returning to userspace. This simplifies + * do_int, and is needed by handle_vm6_fault. Kasper + * Dupont <kasperd@daimi.au.dk> + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/capability.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/syscalls.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/signal.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/smp.h> +#include <linux/highmem.h> +#include <linux/ptrace.h> +#include <linux/audit.h> +#include <linux/stddef.h> + +#include <asm/uaccess.h> +#include <asm/io.h> +#include <asm/tlbflush.h> +#include <asm/irq.h> + +/* + * Known problems: + * + * Interrupt handling is not guaranteed: + * - a real x86 will disable all interrupts for one instruction + * after a "mov ss,xx" to make stack handling atomic even without + * the 'lss' instruction. We can't guarantee this in v86 mode, + * as the next instruction might result in a page fault or similar. + * - a real x86 will have interrupts disabled for one instruction + * past the 'sti' that enables them. We don't bother with all the + * details yet. + * + * Let's hope these problems do not actually matter for anything. + */ + + +#define KVM86 ((struct kernel_vm86_struct *)regs) +#define VMPI KVM86->vm86plus + + +/* + * 8- and 16-bit register defines.. + */ +#define AL(regs) (((unsigned char *)&((regs)->pt.ax))[0]) +#define AH(regs) (((unsigned char *)&((regs)->pt.ax))[1]) +#define IP(regs) (*(unsigned short *)&((regs)->pt.ip)) +#define SP(regs) (*(unsigned short *)&((regs)->pt.sp)) + +/* + * virtual flags (16 and 32-bit versions) + */ +#define VFLAGS (*(unsigned short *)&(current->thread.v86flags)) +#define VEFLAGS (current->thread.v86flags) + +#define set_flags(X, new, mask) \ +((X) = ((X) & ~(mask)) | ((new) & (mask))) + +#define SAFE_MASK (0xDD5) +#define RETURN_MASK (0xDFF) + +/* convert kernel_vm86_regs to vm86_regs */ +static int copy_vm86_regs_to_user(struct vm86_regs __user *user, + const struct kernel_vm86_regs *regs) +{ + int ret = 0; + + /* + * kernel_vm86_regs is missing gs, so copy everything up to + * (but not including) orig_eax, and then rest including orig_eax. + */ + ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.orig_ax)); + ret += copy_to_user(&user->orig_eax, ®s->pt.orig_ax, + sizeof(struct kernel_vm86_regs) - + offsetof(struct kernel_vm86_regs, pt.orig_ax)); + + return ret; +} + +/* convert vm86_regs to kernel_vm86_regs */ +static int copy_vm86_regs_from_user(struct kernel_vm86_regs *regs, + const struct vm86_regs __user *user, + unsigned extra) +{ + int ret = 0; + + /* copy ax-fs inclusive */ + ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.orig_ax)); + /* copy orig_ax-__gsh+extra */ + ret += copy_from_user(®s->pt.orig_ax, &user->orig_eax, + sizeof(struct kernel_vm86_regs) - + offsetof(struct kernel_vm86_regs, pt.orig_ax) + + extra); + return ret; +} + +struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs) +{ + struct tss_struct *tss; + struct pt_regs *ret; + unsigned long tmp; + + /* + * This gets called from entry.S with interrupts disabled, but + * from process context. Enable interrupts here, before trying + * to access user space. 
+ */ + local_irq_enable(); + + if (!current->thread.vm86_info) { + pr_alert("no vm86_info: BAD\n"); + do_exit(SIGSEGV); + } + set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | current->thread.v86mask); + tmp = copy_vm86_regs_to_user(¤t->thread.vm86_info->regs, regs); + tmp += put_user(current->thread.screen_bitmap, ¤t->thread.vm86_info->screen_bitmap); + if (tmp) { + pr_alert("could not access userspace vm86_info\n"); + do_exit(SIGSEGV); + } + + tss = &per_cpu(init_tss, get_cpu()); + current->thread.sp0 = current->thread.saved_sp0; + current->thread.sysenter_cs = __KERNEL_CS; + load_sp0(tss, ¤t->thread); + current->thread.saved_sp0 = 0; + put_cpu(); + + ret = KVM86->regs32; + + ret->fs = current->thread.saved_fs; + set_user_gs(ret, current->thread.saved_gs); + + return ret; +} + +static void mark_screen_rdonly(struct mm_struct *mm) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + spinlock_t *ptl; + int i; + + down_write(&mm->mmap_sem); + pgd = pgd_offset(mm, 0xA0000); + if (pgd_none_or_clear_bad(pgd)) + goto out; + pud = pud_offset(pgd, 0xA0000); + if (pud_none_or_clear_bad(pud)) + goto out; + pmd = pmd_offset(pud, 0xA0000); + split_huge_page_pmd_mm(mm, 0xA0000, pmd); + if (pmd_none_or_clear_bad(pmd)) + goto out; + pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl); + for (i = 0; i < 32; i++) { + if (pte_present(*pte)) + set_pte(pte, pte_wrprotect(*pte)); + pte++; + } + pte_unmap_unlock(pte, ptl); +out: + up_write(&mm->mmap_sem); + flush_tlb(); +} + + + +static int do_vm86_irq_handling(int subfunction, int irqnumber); +static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk); + +SYSCALL_DEFINE1(vm86old, struct vm86_struct __user *, v86) +{ + struct kernel_vm86_struct info; /* declare this _on top_, + * this avoids wasting of stack space. + * This remains on the stack until we + * return to 32 bit user space. + */ + struct task_struct *tsk = current; + int tmp; + + if (tsk->thread.saved_sp0) + return -EPERM; + tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs, + offsetof(struct kernel_vm86_struct, vm86plus) - + sizeof(info.regs)); + if (tmp) + return -EFAULT; + memset(&info.vm86plus, 0, (int)&info.regs32 - (int)&info.vm86plus); + info.regs32 = current_pt_regs(); + tsk->thread.vm86_info = v86; + do_sys_vm86(&info, tsk); + return 0; /* we never return here */ +} + + +SYSCALL_DEFINE2(vm86, unsigned long, cmd, unsigned long, arg) +{ + struct kernel_vm86_struct info; /* declare this _on top_, + * this avoids wasting of stack space. + * This remains on the stack until we + * return to 32 bit user space. + */ + struct task_struct *tsk; + int tmp; + struct vm86plus_struct __user *v86; + + tsk = current; + switch (cmd) { + case VM86_REQUEST_IRQ: + case VM86_FREE_IRQ: + case VM86_GET_IRQ_BITS: + case VM86_GET_AND_RESET_IRQ: + return do_vm86_irq_handling(cmd, (int)arg); + case VM86_PLUS_INSTALL_CHECK: + /* + * NOTE: on old vm86 stuff this will return the error + * from access_ok(), because the subfunction is + * interpreted as (invalid) address to vm86_struct. + * So the installation check works. 
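+ *
+ * From user space this check is reachable roughly as follows (32-bit x86
+ * only; a sketch using the raw syscall so no libc wrapper is assumed):
+ *
+ *    #include <stdio.h>
+ *    #include <unistd.h>
+ *    #include <sys/syscall.h>
+ *    #include <asm/vm86.h>
+ *
+ *    int main(void)
+ *    {
+ *        long ret = syscall(SYS_vm86, VM86_PLUS_INSTALL_CHECK, 0);
+ *
+ *        printf("vm86plus %s\n", ret == 0 ? "available" : "unavailable");
+ *        return 0;
+ *    }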
+ */ + return 0; + } + + /* we come here only for functions VM86_ENTER, VM86_ENTER_NO_BYPASS */ + if (tsk->thread.saved_sp0) + return -EPERM; + v86 = (struct vm86plus_struct __user *)arg; + tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs, + offsetof(struct kernel_vm86_struct, regs32) - + sizeof(info.regs)); + if (tmp) + return -EFAULT; + info.regs32 = current_pt_regs(); + info.vm86plus.is_vm86pus = 1; + tsk->thread.vm86_info = (struct vm86_struct __user *)v86; + do_sys_vm86(&info, tsk); + return 0; /* we never return here */ +} + + +static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk) +{ + struct tss_struct *tss; +/* + * make sure the vm86() system call doesn't try to do anything silly + */ + info->regs.pt.ds = 0; + info->regs.pt.es = 0; + info->regs.pt.fs = 0; +#ifndef CONFIG_X86_32_LAZY_GS + info->regs.pt.gs = 0; +#endif + +/* + * The flags register is also special: we cannot trust that the user + * has set it up safely, so this makes sure interrupt etc flags are + * inherited from protected mode. + */ + VEFLAGS = info->regs.pt.flags; + info->regs.pt.flags &= SAFE_MASK; + info->regs.pt.flags |= info->regs32->flags & ~SAFE_MASK; + info->regs.pt.flags |= X86_VM_MASK; + + switch (info->cpu_type) { + case CPU_286: + tsk->thread.v86mask = 0; + break; + case CPU_386: + tsk->thread.v86mask = X86_EFLAGS_NT | X86_EFLAGS_IOPL; + break; + case CPU_486: + tsk->thread.v86mask = X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL; + break; + default: + tsk->thread.v86mask = X86_EFLAGS_ID | X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL; + break; + } + +/* + * Save old state, set default return value (%ax) to 0 (VM86_SIGNAL) + */ + info->regs32->ax = VM86_SIGNAL; + tsk->thread.saved_sp0 = tsk->thread.sp0; + tsk->thread.saved_fs = info->regs32->fs; + tsk->thread.saved_gs = get_user_gs(info->regs32); + + tss = &per_cpu(init_tss, get_cpu()); + tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0; + if (cpu_has_sep) + tsk->thread.sysenter_cs = 0; + load_sp0(tss, &tsk->thread); + put_cpu(); + + tsk->thread.screen_bitmap = info->screen_bitmap; + if (info->flags & VM86_SCREEN_BITMAP) + mark_screen_rdonly(tsk->mm); + + /*call __audit_syscall_exit since we do not exit via the normal paths */ +#ifdef CONFIG_AUDITSYSCALL + if (unlikely(current->audit_context)) + __audit_syscall_exit(1, 0); +#endif + + __asm__ __volatile__( + "movl %0,%%esp\n\t" + "movl %1,%%ebp\n\t" +#ifdef CONFIG_X86_32_LAZY_GS + "mov %2, %%gs\n\t" +#endif + "jmp resume_userspace" + : /* no outputs */ + :"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0)); + /* we never return here */ +} + +static inline void return_to_32bit(struct kernel_vm86_regs *regs16, int retval) +{ + struct pt_regs *regs32; + + regs32 = save_v86_state(regs16); + regs32->ax = retval; + __asm__ __volatile__("movl %0,%%esp\n\t" + "movl %1,%%ebp\n\t" + "jmp resume_userspace" + : : "r" (regs32), "r" (current_thread_info())); +} + +static inline void set_IF(struct kernel_vm86_regs *regs) +{ + VEFLAGS |= X86_EFLAGS_VIF; + if (VEFLAGS & X86_EFLAGS_VIP) + return_to_32bit(regs, VM86_STI); +} + +static inline void clear_IF(struct kernel_vm86_regs *regs) +{ + VEFLAGS &= ~X86_EFLAGS_VIF; +} + +static inline void clear_TF(struct kernel_vm86_regs *regs) +{ + regs->pt.flags &= ~X86_EFLAGS_TF; +} + +static inline void clear_AC(struct kernel_vm86_regs *regs) +{ + regs->pt.flags &= ~X86_EFLAGS_AC; +} + +/* + * It is correct to call set_IF(regs) from the set_vflags_* + * functions. However someone forgot to call clear_IF(regs) + * in the opposite case. 
+ * After the command sequence CLI PUSHF STI POPF you should + * end up with interrupts disabled, but you ended up with + * interrupts enabled. + * ( I was testing my own changes, but the only bug I + * could find was in a function I had not changed. ) + * [KD] + */ + +static inline void set_vflags_long(unsigned long flags, struct kernel_vm86_regs *regs) +{ + set_flags(VEFLAGS, flags, current->thread.v86mask); + set_flags(regs->pt.flags, flags, SAFE_MASK); + if (flags & X86_EFLAGS_IF) + set_IF(regs); + else + clear_IF(regs); +} + +static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_regs *regs) +{ + set_flags(VFLAGS, flags, current->thread.v86mask); + set_flags(regs->pt.flags, flags, SAFE_MASK); + if (flags & X86_EFLAGS_IF) + set_IF(regs); + else + clear_IF(regs); +} + +static inline unsigned long get_vflags(struct kernel_vm86_regs *regs) +{ + unsigned long flags = regs->pt.flags & RETURN_MASK; + + if (VEFLAGS & X86_EFLAGS_VIF) + flags |= X86_EFLAGS_IF; + flags |= X86_EFLAGS_IOPL; + return flags | (VEFLAGS & current->thread.v86mask); +} + +static inline int is_revectored(int nr, struct revectored_struct *bitmap) +{ + __asm__ __volatile__("btl %2,%1\n\tsbbl %0,%0" + :"=r" (nr) + :"m" (*bitmap), "r" (nr)); + return nr; +} + +#define val_byte(val, n) (((__u8 *)&val)[n]) + +#define pushb(base, ptr, val, err_label) \ + do { \ + __u8 __val = val; \ + ptr--; \ + if (put_user(__val, base + ptr) < 0) \ + goto err_label; \ + } while (0) + +#define pushw(base, ptr, val, err_label) \ + do { \ + __u16 __val = val; \ + ptr--; \ + if (put_user(val_byte(__val, 1), base + ptr) < 0) \ + goto err_label; \ + ptr--; \ + if (put_user(val_byte(__val, 0), base + ptr) < 0) \ + goto err_label; \ + } while (0) + +#define pushl(base, ptr, val, err_label) \ + do { \ + __u32 __val = val; \ + ptr--; \ + if (put_user(val_byte(__val, 3), base + ptr) < 0) \ + goto err_label; \ + ptr--; \ + if (put_user(val_byte(__val, 2), base + ptr) < 0) \ + goto err_label; \ + ptr--; \ + if (put_user(val_byte(__val, 1), base + ptr) < 0) \ + goto err_label; \ + ptr--; \ + if (put_user(val_byte(__val, 0), base + ptr) < 0) \ + goto err_label; \ + } while (0) + +#define popb(base, ptr, err_label) \ + ({ \ + __u8 __res; \ + if (get_user(__res, base + ptr) < 0) \ + goto err_label; \ + ptr++; \ + __res; \ + }) + +#define popw(base, ptr, err_label) \ + ({ \ + __u16 __res; \ + if (get_user(val_byte(__res, 0), base + ptr) < 0) \ + goto err_label; \ + ptr++; \ + if (get_user(val_byte(__res, 1), base + ptr) < 0) \ + goto err_label; \ + ptr++; \ + __res; \ + }) + +#define popl(base, ptr, err_label) \ + ({ \ + __u32 __res; \ + if (get_user(val_byte(__res, 0), base + ptr) < 0) \ + goto err_label; \ + ptr++; \ + if (get_user(val_byte(__res, 1), base + ptr) < 0) \ + goto err_label; \ + ptr++; \ + if (get_user(val_byte(__res, 2), base + ptr) < 0) \ + goto err_label; \ + ptr++; \ + if (get_user(val_byte(__res, 3), base + ptr) < 0) \ + goto err_label; \ + ptr++; \ + __res; \ + }) + +/* There are so many possible reasons for this function to return + * VM86_INTx, so adding another doesn't bother me. We can expect + * userspace programs to be able to handle it. (Getting a problem + * in userspace is always better than an Oops anyway.) 
[KD] + */ +static void do_int(struct kernel_vm86_regs *regs, int i, + unsigned char __user *ssp, unsigned short sp) +{ + unsigned long __user *intr_ptr; + unsigned long segoffs; + + if (regs->pt.cs == BIOSSEG) + goto cannot_handle; + if (is_revectored(i, &KVM86->int_revectored)) + goto cannot_handle; + if (i == 0x21 && is_revectored(AH(regs), &KVM86->int21_revectored)) + goto cannot_handle; + intr_ptr = (unsigned long __user *) (i << 2); + if (get_user(segoffs, intr_ptr)) + goto cannot_handle; + if ((segoffs >> 16) == BIOSSEG) + goto cannot_handle; + pushw(ssp, sp, get_vflags(regs), cannot_handle); + pushw(ssp, sp, regs->pt.cs, cannot_handle); + pushw(ssp, sp, IP(regs), cannot_handle); + regs->pt.cs = segoffs >> 16; + SP(regs) -= 6; + IP(regs) = segoffs & 0xffff; + clear_TF(regs); + clear_IF(regs); + clear_AC(regs); + return; + +cannot_handle: + return_to_32bit(regs, VM86_INTx + (i << 8)); +} + +int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno) +{ + if (VMPI.is_vm86pus) { + if ((trapno == 3) || (trapno == 1)) { + KVM86->regs32->ax = VM86_TRAP + (trapno << 8); + /* setting this flag forces the code in entry_32.S to + the path where we call save_v86_state() and change + the stack pointer to KVM86->regs32 */ + set_thread_flag(TIF_NOTIFY_RESUME); + return 0; + } + do_int(regs, trapno, (unsigned char __user *) (regs->pt.ss << 4), SP(regs)); + return 0; + } + if (trapno != 1) + return 1; /* we let this handle by the calling routine */ + current->thread.trap_nr = trapno; + current->thread.error_code = error_code; + force_sig(SIGTRAP, current); + return 0; +} + +void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code) +{ + unsigned char opcode; + unsigned char __user *csp; + unsigned char __user *ssp; + unsigned short ip, sp, orig_flags; + int data32, pref_done; + +#define CHECK_IF_IN_TRAP \ + if (VMPI.vm86dbg_active && VMPI.vm86dbg_TFpendig) \ + newflags |= X86_EFLAGS_TF +#define VM86_FAULT_RETURN do { \ + if (VMPI.force_return_for_pic && (VEFLAGS & (X86_EFLAGS_IF | X86_EFLAGS_VIF))) \ + return_to_32bit(regs, VM86_PICRETURN); \ + if (orig_flags & X86_EFLAGS_TF) \ + handle_vm86_trap(regs, 0, 1); \ + return; } while (0) + + orig_flags = *(unsigned short *)®s->pt.flags; + + csp = (unsigned char __user *) (regs->pt.cs << 4); + ssp = (unsigned char __user *) (regs->pt.ss << 4); + sp = SP(regs); + ip = IP(regs); + + data32 = 0; + pref_done = 0; + do { + switch (opcode = popb(csp, ip, simulate_sigsegv)) { + case 0x66: /* 32-bit data */ data32 = 1; break; + case 0x67: /* 32-bit address */ break; + case 0x2e: /* CS */ break; + case 0x3e: /* DS */ break; + case 0x26: /* ES */ break; + case 0x36: /* SS */ break; + case 0x65: /* GS */ break; + case 0x64: /* FS */ break; + case 0xf2: /* repnz */ break; + case 0xf3: /* rep */ break; + default: pref_done = 1; + } + } while (!pref_done); + + switch (opcode) { + + /* pushf */ + case 0x9c: + if (data32) { + pushl(ssp, sp, get_vflags(regs), simulate_sigsegv); + SP(regs) -= 4; + } else { + pushw(ssp, sp, get_vflags(regs), simulate_sigsegv); + SP(regs) -= 2; + } + IP(regs) = ip; + VM86_FAULT_RETURN; + + /* popf */ + case 0x9d: + { + unsigned long newflags; + if (data32) { + newflags = popl(ssp, sp, simulate_sigsegv); + SP(regs) += 4; + } else { + newflags = popw(ssp, sp, simulate_sigsegv); + SP(regs) += 2; + } + IP(regs) = ip; + CHECK_IF_IN_TRAP; + if (data32) + set_vflags_long(newflags, regs); + else + set_vflags_short(newflags, regs); + + VM86_FAULT_RETURN; + } + + /* int xx */ + case 0xcd: { + int intno = popb(csp, ip, 
simulate_sigsegv); + IP(regs) = ip; + if (VMPI.vm86dbg_active) { + if ((1 << (intno & 7)) & VMPI.vm86dbg_intxxtab[intno >> 3]) + return_to_32bit(regs, VM86_INTx + (intno << 8)); + } + do_int(regs, intno, ssp, sp); + return; + } + + /* iret */ + case 0xcf: + { + unsigned long newip; + unsigned long newcs; + unsigned long newflags; + if (data32) { + newip = popl(ssp, sp, simulate_sigsegv); + newcs = popl(ssp, sp, simulate_sigsegv); + newflags = popl(ssp, sp, simulate_sigsegv); + SP(regs) += 12; + } else { + newip = popw(ssp, sp, simulate_sigsegv); + newcs = popw(ssp, sp, simulate_sigsegv); + newflags = popw(ssp, sp, simulate_sigsegv); + SP(regs) += 6; + } + IP(regs) = newip; + regs->pt.cs = newcs; + CHECK_IF_IN_TRAP; + if (data32) { + set_vflags_long(newflags, regs); + } else { + set_vflags_short(newflags, regs); + } + VM86_FAULT_RETURN; + } + + /* cli */ + case 0xfa: + IP(regs) = ip; + clear_IF(regs); + VM86_FAULT_RETURN; + + /* sti */ + /* + * Damn. This is incorrect: the 'sti' instruction should actually + * enable interrupts after the /next/ instruction. Not good. + * + * Probably needs some horsing around with the TF flag. Aiee.. + */ + case 0xfb: + IP(regs) = ip; + set_IF(regs); + VM86_FAULT_RETURN; + + default: + return_to_32bit(regs, VM86_UNKNOWN); + } + + return; + +simulate_sigsegv: + /* FIXME: After a long discussion with Stas we finally + * agreed, that this is wrong. Here we should + * really send a SIGSEGV to the user program. + * But how do we create the correct context? We + * are inside a general protection fault handler + * and has just returned from a page fault handler. + * The correct context for the signal handler + * should be a mixture of the two, but how do we + * get the information? [KD] + */ + return_to_32bit(regs, VM86_UNKNOWN); +} + +/* ---------------- vm86 special IRQ passing stuff ----------------- */ + +#define VM86_IRQNAME "vm86irq" + +static struct vm86_irqs { + struct task_struct *tsk; + int sig; +} vm86_irqs[16]; + +static DEFINE_SPINLOCK(irqbits_lock); +static int irqbits; + +#define ALLOWED_SIGS (1 /* 0 = don't send a signal */ \ + | (1 << SIGUSR1) | (1 << SIGUSR2) | (1 << SIGIO) | (1 << SIGURG) \ + | (1 << SIGUNUSED)) + +static irqreturn_t irq_handler(int intno, void *dev_id) +{ + int irq_bit; + unsigned long flags; + + spin_lock_irqsave(&irqbits_lock, flags); + irq_bit = 1 << intno; + if ((irqbits & irq_bit) || !vm86_irqs[intno].tsk) + goto out; + irqbits |= irq_bit; + if (vm86_irqs[intno].sig) + send_sig(vm86_irqs[intno].sig, vm86_irqs[intno].tsk, 1); + /* + * IRQ will be re-enabled when user asks for the irq (whether + * polling or as a result of the signal) + */ + disable_irq_nosync(intno); + spin_unlock_irqrestore(&irqbits_lock, flags); + return IRQ_HANDLED; + +out: + spin_unlock_irqrestore(&irqbits_lock, flags); + return IRQ_NONE; +} + +static inline void free_vm86_irq(int irqnumber) +{ + unsigned long flags; + + free_irq(irqnumber, NULL); + vm86_irqs[irqnumber].tsk = NULL; + + spin_lock_irqsave(&irqbits_lock, flags); + irqbits &= ~(1 << irqnumber); + spin_unlock_irqrestore(&irqbits_lock, flags); +} + +void release_vm86_irqs(struct task_struct *task) +{ + int i; + for (i = FIRST_VM86_IRQ ; i <= LAST_VM86_IRQ; i++) + if (vm86_irqs[i].tsk == task) + free_vm86_irq(i); +} + +static inline int get_and_reset_irq(int irqnumber) +{ + int bit; + unsigned long flags; + int ret = 0; + + if (invalid_vm86_irq(irqnumber)) return 0; + if (vm86_irqs[irqnumber].tsk != current) return 0; + spin_lock_irqsave(&irqbits_lock, flags); + bit = irqbits & (1 << 
irqnumber); + irqbits &= ~bit; + if (bit) { + enable_irq(irqnumber); + ret = 1; + } + + spin_unlock_irqrestore(&irqbits_lock, flags); + return ret; +} + + +static int do_vm86_irq_handling(int subfunction, int irqnumber) +{ + int ret; + switch (subfunction) { + case VM86_GET_AND_RESET_IRQ: { + return get_and_reset_irq(irqnumber); + } + case VM86_GET_IRQ_BITS: { + return irqbits; + } + case VM86_REQUEST_IRQ: { + int sig = irqnumber >> 8; + int irq = irqnumber & 255; + if (!capable(CAP_SYS_ADMIN)) return -EPERM; + if (!((1 << sig) & ALLOWED_SIGS)) return -EPERM; + if (invalid_vm86_irq(irq)) return -EPERM; + if (vm86_irqs[irq].tsk) return -EPERM; + ret = request_irq(irq, &irq_handler, 0, VM86_IRQNAME, NULL); + if (ret) return ret; + vm86_irqs[irq].sig = sig; + vm86_irqs[irq].tsk = current; + return irq; + } + case VM86_FREE_IRQ: { + if (invalid_vm86_irq(irqnumber)) return -EPERM; + if (!vm86_irqs[irqnumber].tsk) return 0; + if (vm86_irqs[irqnumber].tsk != current) return -EPERM; + free_vm86_irq(irqnumber); + return 0; + } + } + return -EINVAL; +} + diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S new file mode 100644 index 00000000000..49edf2dd361 --- /dev/null +++ b/arch/x86/kernel/vmlinux.lds.S @@ -0,0 +1,371 @@ +/* + * ld script for the x86 kernel + * + * Historic 32-bit version written by Martin Mares <mj@atrey.karlin.mff.cuni.cz> + * + * Modernisation, unification and other changes and fixes: + * Copyright (C) 2007-2009 Sam Ravnborg <sam@ravnborg.org> + * + * + * Don't define absolute symbols until and unless you know that symbol + * value is should remain constant even if kernel image is relocated + * at run time. Absolute symbols are not relocated. If symbol value should + * change if kernel is relocated, make the symbol section relative and + * put it inside the section definition. + */ + +#ifdef CONFIG_X86_32 +#define LOAD_OFFSET __PAGE_OFFSET +#else +#define LOAD_OFFSET __START_KERNEL_map +#endif + +#include <asm-generic/vmlinux.lds.h> +#include <asm/asm-offsets.h> +#include <asm/thread_info.h> +#include <asm/page_types.h> +#include <asm/cache.h> +#include <asm/boot.h> + +#undef i386 /* in case the preprocessor is a 32bit one */ + +OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT) + +#ifdef CONFIG_X86_32 +OUTPUT_ARCH(i386) +ENTRY(phys_startup_32) +jiffies = jiffies_64; +#else +OUTPUT_ARCH(i386:x86-64) +ENTRY(phys_startup_64) +jiffies_64 = jiffies; +#endif + +#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) +/* + * On 64-bit, align RODATA to 2MB so that even with CONFIG_DEBUG_RODATA + * we retain large page mappings for boundaries spanning kernel text, rodata + * and data sections. + * + * However, kernel identity mappings will have different RWX permissions + * to the pages mapping to text and to the pages padding (which are freed) the + * text section. Hence kernel identity mappings will be broken to smaller + * pages. For 64-bit, kernel text and kernel identity mappings are different, + * so we can enable protection checks that come with CONFIG_DEBUG_RODATA, + * as well as retain 2MB large page mappings for kernel text. + */ +#define X64_ALIGN_DEBUG_RODATA_BEGIN . = ALIGN(HPAGE_SIZE); + +#define X64_ALIGN_DEBUG_RODATA_END \ + . 
= ALIGN(HPAGE_SIZE); \ + __end_rodata_hpage_align = .; + +#else + +#define X64_ALIGN_DEBUG_RODATA_BEGIN +#define X64_ALIGN_DEBUG_RODATA_END + +#endif + +PHDRS { + text PT_LOAD FLAGS(5); /* R_E */ + data PT_LOAD FLAGS(6); /* RW_ */ +#ifdef CONFIG_X86_64 +#ifdef CONFIG_SMP + percpu PT_LOAD FLAGS(6); /* RW_ */ +#endif + init PT_LOAD FLAGS(7); /* RWE */ +#endif + note PT_NOTE FLAGS(0); /* ___ */ +} + +SECTIONS +{ +#ifdef CONFIG_X86_32 + . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR; + phys_startup_32 = startup_32 - LOAD_OFFSET; +#else + . = __START_KERNEL; + phys_startup_64 = startup_64 - LOAD_OFFSET; +#endif + + /* Text and read-only data */ + .text : AT(ADDR(.text) - LOAD_OFFSET) { + _text = .; + /* bootstrapping code */ + HEAD_TEXT + . = ALIGN(8); + _stext = .; + TEXT_TEXT + SCHED_TEXT + LOCK_TEXT + KPROBES_TEXT + ENTRY_TEXT + IRQENTRY_TEXT + *(.fixup) + *(.gnu.warning) + /* End of text section */ + _etext = .; + } :text = 0x9090 + + NOTES :text :note + + EXCEPTION_TABLE(16) :text = 0x9090 + +#if defined(CONFIG_DEBUG_RODATA) + /* .text should occupy whole number of pages */ + . = ALIGN(PAGE_SIZE); +#endif + X64_ALIGN_DEBUG_RODATA_BEGIN + RO_DATA(PAGE_SIZE) + X64_ALIGN_DEBUG_RODATA_END + + /* Data */ + .data : AT(ADDR(.data) - LOAD_OFFSET) { + /* Start of data section */ + _sdata = .; + + /* init_task */ + INIT_TASK_DATA(THREAD_SIZE) + +#ifdef CONFIG_X86_32 + /* 32 bit has nosave before _edata */ + NOSAVE_DATA +#endif + + PAGE_ALIGNED_DATA(PAGE_SIZE) + + CACHELINE_ALIGNED_DATA(L1_CACHE_BYTES) + + DATA_DATA + CONSTRUCTORS + + /* rarely changed data like cpu maps */ + READ_MOSTLY_DATA(INTERNODE_CACHE_BYTES) + + /* End of data section */ + _edata = .; + } :data + + + . = ALIGN(PAGE_SIZE); + __vvar_page = .; + + .vvar : AT(ADDR(.vvar) - LOAD_OFFSET) { + /* work around gold bug 13023 */ + __vvar_beginning_hack = .; + + /* Place all vvars at the offsets in asm/vvar.h. */ +#define EMIT_VVAR(name, offset) \ + . = __vvar_beginning_hack + offset; \ + *(.vvar_ ## name) +#define __VVAR_KERNEL_LDS +#include <asm/vvar.h> +#undef __VVAR_KERNEL_LDS +#undef EMIT_VVAR + + /* + * Pad the rest of the page with zeros. Otherwise the loader + * can leave garbage here. + */ + . = __vvar_beginning_hack + PAGE_SIZE; + } :data + + . = ALIGN(__vvar_page + PAGE_SIZE, PAGE_SIZE); + + /* Init code and data - will be freed after init */ + . = ALIGN(PAGE_SIZE); + .init.begin : AT(ADDR(.init.begin) - LOAD_OFFSET) { + __init_begin = .; /* paired with __init_end */ + } + +#if defined(CONFIG_X86_64) && defined(CONFIG_SMP) + /* + * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the + * output PHDR, so the next output section - .init.text - should + * start another segment - init. + */ + PERCPU_VADDR(INTERNODE_CACHE_BYTES, 0, :percpu) +#endif + + INIT_TEXT_SECTION(PAGE_SIZE) +#ifdef CONFIG_X86_64 + :init +#endif + + INIT_DATA_SECTION(16) + + .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { + __x86_cpu_dev_start = .; + *(.x86_cpu_dev.init) + __x86_cpu_dev_end = .; + } + +#ifdef CONFIG_X86_INTEL_MID + .x86_intel_mid_dev.init : AT(ADDR(.x86_intel_mid_dev.init) - \ + LOAD_OFFSET) { + __x86_intel_mid_dev_start = .; + *(.x86_intel_mid_dev.init) + __x86_intel_mid_dev_end = .; + } +#endif + + /* + * start address and size of operations which during runtime + * can be patched with virtualization friendly instructions or + * baremetal native ones. Think page table operations. + * Details in paravirt_types.h + */ + . 
= ALIGN(8); + .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { + __parainstructions = .; + *(.parainstructions) + __parainstructions_end = .; + } + + /* + * struct alt_inst entries. From the header (alternative.h): + * "Alternative instructions for different CPU types or capabilities" + * Think locking instructions on spinlocks. + */ + . = ALIGN(8); + .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { + __alt_instructions = .; + *(.altinstructions) + __alt_instructions_end = .; + } + + /* + * And here are the replacement instructions. The linker sticks + * them as binary blobs. The .altinstructions has enough data to + * get the address and the length of them to patch the kernel safely. + */ + .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { + *(.altinstr_replacement) + } + + /* + * struct iommu_table_entry entries are injected in this section. + * It is an array of IOMMUs which during run time gets sorted depending + * on its dependency order. After rootfs_initcall is complete + * this section can be safely removed. + */ + .iommu_table : AT(ADDR(.iommu_table) - LOAD_OFFSET) { + __iommu_table = .; + *(.iommu_table) + __iommu_table_end = .; + } + + . = ALIGN(8); + .apicdrivers : AT(ADDR(.apicdrivers) - LOAD_OFFSET) { + __apicdrivers = .; + *(.apicdrivers); + __apicdrivers_end = .; + } + + . = ALIGN(8); + /* + * .exit.text is discard at runtime, not link time, to deal with + * references from .altinstructions and .eh_frame + */ + .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { + EXIT_TEXT + } + + .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { + EXIT_DATA + } + +#if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP) + PERCPU_SECTION(INTERNODE_CACHE_BYTES) +#endif + + . = ALIGN(PAGE_SIZE); + + /* freed after init ends here */ + .init.end : AT(ADDR(.init.end) - LOAD_OFFSET) { + __init_end = .; + } + + /* + * smp_locks might be freed after init + * start/end must be page aligned + */ + . = ALIGN(PAGE_SIZE); + .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { + __smp_locks = .; + *(.smp_locks) + . = ALIGN(PAGE_SIZE); + __smp_locks_end = .; + } + +#ifdef CONFIG_X86_64 + .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { + NOSAVE_DATA + } +#endif + + /* BSS */ + . = ALIGN(PAGE_SIZE); + .bss : AT(ADDR(.bss) - LOAD_OFFSET) { + __bss_start = .; + *(.bss..page_aligned) + *(.bss) + . = ALIGN(PAGE_SIZE); + __bss_stop = .; + } + + . = ALIGN(PAGE_SIZE); + .brk : AT(ADDR(.brk) - LOAD_OFFSET) { + __brk_base = .; + . += 64 * 1024; /* 64k alignment slop space */ + *(.brk_reservation) /* areas brk users have reserved */ + __brk_limit = .; + } + + _end = .; + + STABS_DEBUG + DWARF_DEBUG + + /* Sections to be discarded */ + DISCARDS + /DISCARD/ : { *(.eh_frame) } +} + + +#ifdef CONFIG_X86_32 +/* + * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility: + */ +. = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), + "kernel image bigger than KERNEL_IMAGE_SIZE"); +#else +/* + * Per-cpu symbols which need to be offset from __per_cpu_load + * for the boot processor. + */ +#define INIT_PER_CPU(x) init_per_cpu__##x = x + __per_cpu_load +INIT_PER_CPU(gdt_page); +INIT_PER_CPU(irq_stack_union); + +/* + * Build-time check on the image size: + */ +. = ASSERT((_end - _text <= KERNEL_IMAGE_SIZE), + "kernel image bigger than KERNEL_IMAGE_SIZE"); + +#ifdef CONFIG_SMP +. = ASSERT((irq_stack_union == 0), + "irq_stack_union is not at start of per-cpu area"); +#endif + +#endif /* CONFIG_X86_32 */ + +#ifdef CONFIG_KEXEC +#include <asm/kexec.h> + +. 
= ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, + "kexec control code size is too big"); +#endif + diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c new file mode 100644 index 00000000000..b99b9ad8540 --- /dev/null +++ b/arch/x86/kernel/vsmp_64.c @@ -0,0 +1,245 @@ +/* + * vSMPowered(tm) systems specific initialization + * Copyright (C) 2005 ScaleMP Inc. + * + * Use of this code is subject to the terms and conditions of the + * GNU general public license version 2. See "COPYING" or + * http://www.gnu.org/licenses/gpl.html + * + * Ravikiran Thirumalai <kiran@scalemp.com>, + * Shai Fultheim <shai@scalemp.com> + * Paravirt ops integration: Glauber de Oliveira Costa <gcosta@redhat.com>, + * Ravikiran Thirumalai <kiran@scalemp.com> + */ + +#include <linux/init.h> +#include <linux/pci_ids.h> +#include <linux/pci_regs.h> +#include <linux/smp.h> +#include <linux/irq.h> + +#include <asm/apic.h> +#include <asm/pci-direct.h> +#include <asm/io.h> +#include <asm/paravirt.h> +#include <asm/setup.h> + +#define TOPOLOGY_REGISTER_OFFSET 0x10 + +/* Flag below is initialized once during vSMP PCI initialization. */ +static int irq_routing_comply = 1; + +#if defined CONFIG_PCI && defined CONFIG_PARAVIRT +/* + * Interrupt control on vSMPowered systems: + * ~AC is a shadow of IF. If IF is 'on' AC should be 'off' + * and vice versa. + */ + +asmlinkage __visible unsigned long vsmp_save_fl(void) +{ + unsigned long flags = native_save_fl(); + + if (!(flags & X86_EFLAGS_IF) || (flags & X86_EFLAGS_AC)) + flags &= ~X86_EFLAGS_IF; + return flags; +} +PV_CALLEE_SAVE_REGS_THUNK(vsmp_save_fl); + +__visible void vsmp_restore_fl(unsigned long flags) +{ + if (flags & X86_EFLAGS_IF) + flags &= ~X86_EFLAGS_AC; + else + flags |= X86_EFLAGS_AC; + native_restore_fl(flags); +} +PV_CALLEE_SAVE_REGS_THUNK(vsmp_restore_fl); + +asmlinkage __visible void vsmp_irq_disable(void) +{ + unsigned long flags = native_save_fl(); + + native_restore_fl((flags & ~X86_EFLAGS_IF) | X86_EFLAGS_AC); +} +PV_CALLEE_SAVE_REGS_THUNK(vsmp_irq_disable); + +asmlinkage __visible void vsmp_irq_enable(void) +{ + unsigned long flags = native_save_fl(); + + native_restore_fl((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC)); +} +PV_CALLEE_SAVE_REGS_THUNK(vsmp_irq_enable); + +static unsigned __init_or_module vsmp_patch(u8 type, u16 clobbers, void *ibuf, + unsigned long addr, unsigned len) +{ + switch (type) { + case PARAVIRT_PATCH(pv_irq_ops.irq_enable): + case PARAVIRT_PATCH(pv_irq_ops.irq_disable): + case PARAVIRT_PATCH(pv_irq_ops.save_fl): + case PARAVIRT_PATCH(pv_irq_ops.restore_fl): + return paravirt_patch_default(type, clobbers, ibuf, addr, len); + default: + return native_patch(type, clobbers, ibuf, addr, len); + } + +} + +static void __init set_vsmp_pv_ops(void) +{ + void __iomem *address; + unsigned int cap, ctl, cfg; + + /* set vSMP magic bits to indicate vSMP capable kernel */ + cfg = read_pci_config(0, 0x1f, 0, PCI_BASE_ADDRESS_0); + address = early_ioremap(cfg, 8); + cap = readl(address); + ctl = readl(address + 4); + printk(KERN_INFO "vSMP CTL: capabilities:0x%08x control:0x%08x\n", + cap, ctl); + + /* If possible, let the vSMP foundation route the interrupt optimally */ +#ifdef CONFIG_SMP + if (cap & ctl & BIT(8)) { + ctl &= ~BIT(8); + + /* Interrupt routing set to ignore */ + irq_routing_comply = 0; + +#ifdef CONFIG_PROC_FS + /* Don't let users change irq affinity via procfs */ + no_irq_affinity = 1; +#endif + } +#endif + + if (cap & ctl & (1 << 4)) { + /* Setup irq ops and turn on vSMP IRQ fastpath handling */ + 
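+		/*
+		 * A rough truth table for the shadow described at the
+		 * top of this file:
+		 *
+		 *	state		EFLAGS.IF	EFLAGS.AC
+		 *	enabled		1		0
+		 *	disabled	0		1
+		 *
+		 * vsmp_save_fl() folds AC back into the IF bit it
+		 * returns, so callers of the pv ops keep the usual
+		 * semantics.
+		 */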
pv_irq_ops.irq_disable = PV_CALLEE_SAVE(vsmp_irq_disable); + pv_irq_ops.irq_enable = PV_CALLEE_SAVE(vsmp_irq_enable); + pv_irq_ops.save_fl = PV_CALLEE_SAVE(vsmp_save_fl); + pv_irq_ops.restore_fl = PV_CALLEE_SAVE(vsmp_restore_fl); + pv_init_ops.patch = vsmp_patch; + ctl &= ~(1 << 4); + } + writel(ctl, address + 4); + ctl = readl(address + 4); + pr_info("vSMP CTL: control set to:0x%08x\n", ctl); + + early_iounmap(address, 8); +} +#else +static void __init set_vsmp_pv_ops(void) +{ +} +#endif + +#ifdef CONFIG_PCI +static int is_vsmp = -1; + +static void __init detect_vsmp_box(void) +{ + is_vsmp = 0; + + if (!early_pci_allowed()) + return; + + /* Check if we are running on a ScaleMP vSMPowered box */ + if (read_pci_config(0, 0x1f, 0, PCI_VENDOR_ID) == + (PCI_VENDOR_ID_SCALEMP | (PCI_DEVICE_ID_SCALEMP_VSMP_CTL << 16))) + is_vsmp = 1; +} + +int is_vsmp_box(void) +{ + if (is_vsmp != -1) + return is_vsmp; + else { + WARN_ON_ONCE(1); + return 0; + } +} + +#else +static void __init detect_vsmp_box(void) +{ +} +int is_vsmp_box(void) +{ + return 0; +} +#endif + +static void __init vsmp_cap_cpus(void) +{ +#if !defined(CONFIG_X86_VSMP) && defined(CONFIG_SMP) + void __iomem *address; + unsigned int cfg, topology, node_shift, maxcpus; + + /* + * CONFIG_X86_VSMP is not configured, so limit the number CPUs to the + * ones present in the first board, unless explicitly overridden by + * setup_max_cpus + */ + if (setup_max_cpus != NR_CPUS) + return; + + /* Read the vSMP Foundation topology register */ + cfg = read_pci_config(0, 0x1f, 0, PCI_BASE_ADDRESS_0); + address = early_ioremap(cfg + TOPOLOGY_REGISTER_OFFSET, 4); + if (WARN_ON(!address)) + return; + + topology = readl(address); + node_shift = (topology >> 16) & 0x7; + if (!node_shift) + /* The value 0 should be decoded as 8 */ + node_shift = 8; + maxcpus = (topology & ((1 << node_shift) - 1)) + 1; + + pr_info("vSMP CTL: Capping CPUs to %d (CONFIG_X86_VSMP is unset)\n", + maxcpus); + setup_max_cpus = maxcpus; + early_iounmap(address, 4); +#endif +} + +static int apicid_phys_pkg_id(int initial_apic_id, int index_msb) +{ + return hard_smp_processor_id() >> index_msb; +} + +/* + * In vSMP, all cpus should be capable of handling interrupts, regardless of + * the APIC used. + */ +static void fill_vector_allocation_domain(int cpu, struct cpumask *retmask, + const struct cpumask *mask) +{ + cpumask_setall(retmask); +} + +static void vsmp_apic_post_init(void) +{ + /* need to update phys_pkg_id */ + apic->phys_pkg_id = apicid_phys_pkg_id; + + if (!irq_routing_comply) + apic->vector_allocation_domain = fill_vector_allocation_domain; +} + +void __init vsmp_init(void) +{ + detect_vsmp_box(); + if (!is_vsmp_box()) + return; + + x86_platform.apic_post_init = vsmp_apic_post_init; + + vsmp_cap_cpus(); + + set_vsmp_pv_ops(); + return; +} diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c new file mode 100644 index 00000000000..ea5b5709aa7 --- /dev/null +++ b/arch/x86/kernel/vsyscall_64.c @@ -0,0 +1,354 @@ +/* + * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE + * Copyright 2003 Andi Kleen, SuSE Labs. + * + * [ NOTE: this mechanism is now deprecated in favor of the vDSO. ] + * + * Thanks to hpa@transmeta.com for some useful hint. + * Special thanks to Ingo Molnar for his early experience with + * a different vsyscall implementation for Linux/IA32 and for the name. + * + * vsyscall 1 is located at -10Mbyte, vsyscall 2 is located + * at virtual address -10Mbyte+1024bytes etc... There are at max 4 + * vsyscalls. 
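+ * (In the numbering used by the code below: slot 0, gettimeofday,
+ * sits at 0xffffffffff600000 = -10Mbyte; slot 1, time, at +0x400;
+ * slot 2, getcpu, at +0x800 -- which is exactly what
+ * addr_to_vsyscall_nr() decodes.)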
One vsyscall can reserve more than 1 slot to avoid + * jumping out of line if necessary. We cannot add more with this + * mechanism because older kernels won't return -ENOSYS. + * + * Note: the concept clashes with user mode linux. UML users should + * use the vDSO. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/time.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/timer.h> +#include <linux/seqlock.h> +#include <linux/jiffies.h> +#include <linux/sysctl.h> +#include <linux/topology.h> +#include <linux/timekeeper_internal.h> +#include <linux/getcpu.h> +#include <linux/cpu.h> +#include <linux/smp.h> +#include <linux/notifier.h> +#include <linux/syscalls.h> +#include <linux/ratelimit.h> + +#include <asm/vsyscall.h> +#include <asm/pgtable.h> +#include <asm/compat.h> +#include <asm/page.h> +#include <asm/unistd.h> +#include <asm/fixmap.h> +#include <asm/errno.h> +#include <asm/io.h> +#include <asm/segment.h> +#include <asm/desc.h> +#include <asm/topology.h> +#include <asm/traps.h> + +#define CREATE_TRACE_POINTS +#include "vsyscall_trace.h" + +DEFINE_VVAR(int, vgetcpu_mode); + +static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE; + +static int __init vsyscall_setup(char *str) +{ + if (str) { + if (!strcmp("emulate", str)) + vsyscall_mode = EMULATE; + else if (!strcmp("native", str)) + vsyscall_mode = NATIVE; + else if (!strcmp("none", str)) + vsyscall_mode = NONE; + else + return -EINVAL; + + return 0; + } + + return -EINVAL; +} +early_param("vsyscall", vsyscall_setup); + +static void warn_bad_vsyscall(const char *level, struct pt_regs *regs, + const char *message) +{ + if (!show_unhandled_signals) + return; + + pr_notice_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n", + level, current->comm, task_pid_nr(current), + message, regs->ip, regs->cs, + regs->sp, regs->ax, regs->si, regs->di); +} + +static int addr_to_vsyscall_nr(unsigned long addr) +{ + int nr; + + if ((addr & ~0xC00UL) != VSYSCALL_ADDR) + return -EINVAL; + + nr = (addr & 0xC00UL) >> 10; + if (nr >= 3) + return -EINVAL; + + return nr; +} + +static bool write_ok_or_segv(unsigned long ptr, size_t size) +{ + /* + * XXX: if access_ok, get_user, and put_user handled + * sig_on_uaccess_error, this could go away. + */ + + if (!access_ok(VERIFY_WRITE, (void __user *)ptr, size)) { + siginfo_t info; + struct thread_struct *thread = ¤t->thread; + + thread->error_code = 6; /* user fault, no page, write */ + thread->cr2 = ptr; + thread->trap_nr = X86_TRAP_PF; + + memset(&info, 0, sizeof(info)); + info.si_signo = SIGSEGV; + info.si_errno = 0; + info.si_code = SEGV_MAPERR; + info.si_addr = (void __user *)ptr; + + force_sig_info(SIGSEGV, &info, current); + return false; + } else { + return true; + } +} + +bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) +{ + struct task_struct *tsk; + unsigned long caller; + int vsyscall_nr, syscall_nr, tmp; + int prev_sig_on_uaccess_error; + long ret; + + /* + * No point in checking CS -- the only way to get here is a user mode + * trap to a high address, which means that we're in 64-bit user code. 
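+ *
+ * Worked example of the path below, assuming a legacy binary in
+ * EMULATE mode: it calls the stub at 0xffffffffff600000, the
+ * instruction fetch faults because the page is not executable from
+ * userspace, we land here with regs->ip == 0xffffffffff600000, and
+ * addr_to_vsyscall_nr() returns 0 (gettimeofday).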
+ */ + + WARN_ON_ONCE(address != regs->ip); + + if (vsyscall_mode == NONE) { + warn_bad_vsyscall(KERN_INFO, regs, + "vsyscall attempted with vsyscall=none"); + return false; + } + + vsyscall_nr = addr_to_vsyscall_nr(address); + + trace_emulate_vsyscall(vsyscall_nr); + + if (vsyscall_nr < 0) { + warn_bad_vsyscall(KERN_WARNING, regs, + "misaligned vsyscall (exploit attempt or buggy program) -- look up the vsyscall kernel parameter if you need a workaround"); + goto sigsegv; + } + + if (get_user(caller, (unsigned long __user *)regs->sp) != 0) { + warn_bad_vsyscall(KERN_WARNING, regs, + "vsyscall with bad stack (exploit attempt?)"); + goto sigsegv; + } + + tsk = current; + + /* + * Check for access_ok violations and find the syscall nr. + * + * NULL is a valid user pointer (in the access_ok sense) on 32-bit and + * 64-bit, so we don't need to special-case it here. For all the + * vsyscalls, NULL means "don't write anything" not "write it at + * address 0". + */ + switch (vsyscall_nr) { + case 0: + if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) || + !write_ok_or_segv(regs->si, sizeof(struct timezone))) { + ret = -EFAULT; + goto check_fault; + } + + syscall_nr = __NR_gettimeofday; + break; + + case 1: + if (!write_ok_or_segv(regs->di, sizeof(time_t))) { + ret = -EFAULT; + goto check_fault; + } + + syscall_nr = __NR_time; + break; + + case 2: + if (!write_ok_or_segv(regs->di, sizeof(unsigned)) || + !write_ok_or_segv(regs->si, sizeof(unsigned))) { + ret = -EFAULT; + goto check_fault; + } + + syscall_nr = __NR_getcpu; + break; + } + + /* + * Handle seccomp. regs->ip must be the original value. + * See seccomp_send_sigsys and Documentation/prctl/seccomp_filter.txt. + * + * We could optimize the seccomp disabled case, but performance + * here doesn't matter. + */ + regs->orig_ax = syscall_nr; + regs->ax = -ENOSYS; + tmp = secure_computing(syscall_nr); + if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) { + warn_bad_vsyscall(KERN_DEBUG, regs, + "seccomp tried to change syscall nr or ip"); + do_exit(SIGSYS); + } + if (tmp) + goto do_ret; /* skip requested */ + + /* + * With a real vsyscall, page faults cause SIGSEGV. We want to + * preserve that behavior to make writing exploits harder. + */ + prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error; + current_thread_info()->sig_on_uaccess_error = 1; + + ret = -EFAULT; + switch (vsyscall_nr) { + case 0: + ret = sys_gettimeofday( + (struct timeval __user *)regs->di, + (struct timezone __user *)regs->si); + break; + + case 1: + ret = sys_time((time_t __user *)regs->di); + break; + + case 2: + ret = sys_getcpu((unsigned __user *)regs->di, + (unsigned __user *)regs->si, + NULL); + break; + } + + current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error; + +check_fault: + if (ret == -EFAULT) { + /* Bad news -- userspace fed a bad pointer to a vsyscall. */ + warn_bad_vsyscall(KERN_INFO, regs, + "vsyscall fault (exploit attempt?)"); + + /* + * If we failed to generate a signal for any reason, + * generate one here. (This should be impossible.) + */ + if (WARN_ON_ONCE(!sigismember(&tsk->pending.signal, SIGBUS) && + !sigismember(&tsk->pending.signal, SIGSEGV))) + goto sigsegv; + + return true; /* Don't emulate the ret. */ + } + + regs->ax = ret; + +do_ret: + /* Emulate a ret instruction. */ + regs->ip = caller; + regs->sp += 8; + return true; + +sigsegv: + force_sig(SIGSEGV, current); + return true; +} + +/* + * Assume __initcall executes before all user space. Hopefully kmod + * doesn't violate that. 
We'll find out if it does. + */ +static void vsyscall_set_cpu(int cpu) +{ + unsigned long d; + unsigned long node = 0; +#ifdef CONFIG_NUMA + node = cpu_to_node(cpu); +#endif + if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP)) + write_rdtscp_aux((node << 12) | cpu); + + /* + * Store cpu number in limit so that it can be loaded quickly + * in user space in vgetcpu. (12 bits for the CPU and 8 bits for the node) + */ + d = 0x0f40000000000ULL; + d |= cpu; + d |= (node & 0xf) << 12; + d |= (node >> 4) << 48; + + write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S); +} + +static void cpu_vsyscall_init(void *arg) +{ + /* preemption should be already off */ + vsyscall_set_cpu(raw_smp_processor_id()); +} + +static int +cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg) +{ + long cpu = (long)arg; + + if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) + smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1); + + return NOTIFY_DONE; +} + +void __init map_vsyscall(void) +{ + extern char __vsyscall_page; + unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page); + + __set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall, + vsyscall_mode == NATIVE + ? PAGE_KERNEL_VSYSCALL + : PAGE_KERNEL_VVAR); + BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) != + (unsigned long)VSYSCALL_ADDR); +} + +static int __init vsyscall_init(void) +{ + cpu_notifier_register_begin(); + + on_each_cpu(cpu_vsyscall_init, NULL, 1); + /* notifier priority > KVM */ + __hotcpu_notifier(cpu_vsyscall_notifier, 30); + + cpu_notifier_register_done(); + + return 0; +} +__initcall(vsyscall_init); diff --git a/arch/x86/kernel/vsyscall_emu_64.S b/arch/x86/kernel/vsyscall_emu_64.S new file mode 100644 index 00000000000..c9596a9af15 --- /dev/null +++ b/arch/x86/kernel/vsyscall_emu_64.S @@ -0,0 +1,37 @@ +/* + * vsyscall_emu_64.S: Vsyscall emulation page + * + * Copyright (c) 2011 Andy Lutomirski + * + * Subject to the GNU General Public License, version 2 + */ + +#include <linux/linkage.h> + +#include <asm/irq_vectors.h> +#include <asm/page_types.h> +#include <asm/unistd_64.h> + +__PAGE_ALIGNED_DATA + .globl __vsyscall_page + .balign PAGE_SIZE, 0xcc + .type __vsyscall_page, @object +__vsyscall_page: + + mov $__NR_gettimeofday, %rax + syscall + ret + + .balign 1024, 0xcc + mov $__NR_time, %rax + syscall + ret + + .balign 1024, 0xcc + mov $__NR_getcpu, %rax + syscall + ret + + .balign 4096, 0xcc + + .size __vsyscall_page, 4096 diff --git a/arch/x86/kernel/vsyscall_gtod.c b/arch/x86/kernel/vsyscall_gtod.c new file mode 100644 index 00000000000..9531fbb123b --- /dev/null +++ b/arch/x86/kernel/vsyscall_gtod.c @@ -0,0 +1,69 @@ +/* + * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE + * Copyright 2003 Andi Kleen, SuSE Labs. + * + * Modified for x86 32 bit architecture by + * Stefani Seibold <stefani@seibold.net> + * sponsored by Rohde & Schwarz GmbH & Co. KG Munich/Germany + * + * Thanks to hpa@transmeta.com for some useful hint. + * Special thanks to Ingo Molnar for his early experience with + * a different vsyscall implementation for Linux/IA32 and for the name. 
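+ *
+ * The writer side below pairs with lock-free readers in the vDSO; a
+ * reader sketch (abridged -- gtod_read_begin()/gtod_read_retry() are
+ * the read-side mates of gtod_write_begin()/end() in asm/vgtod.h):
+ *
+ *	do {
+ *		seq = gtod_read_begin(gtod);
+ *		sec = gtod->wall_time_sec;
+ *		ns  = gtod->wall_time_snsec + clocksource delta;
+ *		ns >>= gtod->shift;
+ *	} while (unlikely(gtod_read_retry(gtod, seq)));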
+ * + */ + +#include <linux/timekeeper_internal.h> +#include <asm/vgtod.h> +#include <asm/vvar.h> + +DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data); + +void update_vsyscall_tz(void) +{ + vsyscall_gtod_data.tz_minuteswest = sys_tz.tz_minuteswest; + vsyscall_gtod_data.tz_dsttime = sys_tz.tz_dsttime; +} + +void update_vsyscall(struct timekeeper *tk) +{ + struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data; + + gtod_write_begin(vdata); + + /* copy vsyscall data */ + vdata->vclock_mode = tk->clock->archdata.vclock_mode; + vdata->cycle_last = tk->clock->cycle_last; + vdata->mask = tk->clock->mask; + vdata->mult = tk->mult; + vdata->shift = tk->shift; + + vdata->wall_time_sec = tk->xtime_sec; + vdata->wall_time_snsec = tk->xtime_nsec; + + vdata->monotonic_time_sec = tk->xtime_sec + + tk->wall_to_monotonic.tv_sec; + vdata->monotonic_time_snsec = tk->xtime_nsec + + ((u64)tk->wall_to_monotonic.tv_nsec + << tk->shift); + while (vdata->monotonic_time_snsec >= + (((u64)NSEC_PER_SEC) << tk->shift)) { + vdata->monotonic_time_snsec -= + ((u64)NSEC_PER_SEC) << tk->shift; + vdata->monotonic_time_sec++; + } + + vdata->wall_time_coarse_sec = tk->xtime_sec; + vdata->wall_time_coarse_nsec = (long)(tk->xtime_nsec >> tk->shift); + + vdata->monotonic_time_coarse_sec = + vdata->wall_time_coarse_sec + tk->wall_to_monotonic.tv_sec; + vdata->monotonic_time_coarse_nsec = + vdata->wall_time_coarse_nsec + tk->wall_to_monotonic.tv_nsec; + + while (vdata->monotonic_time_coarse_nsec >= NSEC_PER_SEC) { + vdata->monotonic_time_coarse_nsec -= NSEC_PER_SEC; + vdata->monotonic_time_coarse_sec++; + } + + gtod_write_end(vdata); +} diff --git a/arch/x86/kernel/vsyscall_trace.h b/arch/x86/kernel/vsyscall_trace.h new file mode 100644 index 00000000000..a8b2edec54f --- /dev/null +++ b/arch/x86/kernel/vsyscall_trace.h @@ -0,0 +1,29 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM vsyscall + +#if !defined(__VSYSCALL_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) +#define __VSYSCALL_TRACE_H + +#include <linux/tracepoint.h> + +TRACE_EVENT(emulate_vsyscall, + + TP_PROTO(int nr), + + TP_ARGS(nr), + + TP_STRUCT__entry(__field(int, nr)), + + TP_fast_assign( + __entry->nr = nr; + ), + + TP_printk("nr = %d", __entry->nr) +); + +#endif + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH ../../arch/x86/kernel +#define TRACE_INCLUDE_FILE vsyscall_trace +#include <trace/define_trace.h> diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c new file mode 100644 index 00000000000..040681928e9 --- /dev/null +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -0,0 +1,75 @@ +/* Exports for assembly files. + All C exports should go in the respective C files. 
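+
+   E.g. a module doing get_user() on a u32 ends up in __get_user_4,
+   which lives in arch/x86/lib/getuser.S; there is no C definition to
+   export next to, so the export has to live here.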
*/ + +#include <linux/module.h> +#include <linux/smp.h> + +#include <net/checksum.h> + +#include <asm/processor.h> +#include <asm/pgtable.h> +#include <asm/uaccess.h> +#include <asm/desc.h> +#include <asm/ftrace.h> + +#ifdef CONFIG_FUNCTION_TRACER +/* mcount and __fentry__ are defined in assembly */ +#ifdef CC_USING_FENTRY +EXPORT_SYMBOL(__fentry__); +#else +EXPORT_SYMBOL(mcount); +#endif +#endif + +EXPORT_SYMBOL(__get_user_1); +EXPORT_SYMBOL(__get_user_2); +EXPORT_SYMBOL(__get_user_4); +EXPORT_SYMBOL(__get_user_8); +EXPORT_SYMBOL(__put_user_1); +EXPORT_SYMBOL(__put_user_2); +EXPORT_SYMBOL(__put_user_4); +EXPORT_SYMBOL(__put_user_8); + +EXPORT_SYMBOL(copy_user_generic_string); +EXPORT_SYMBOL(copy_user_generic_unrolled); +EXPORT_SYMBOL(copy_user_enhanced_fast_string); +EXPORT_SYMBOL(__copy_user_nocache); +EXPORT_SYMBOL(_copy_from_user); +EXPORT_SYMBOL(_copy_to_user); + +EXPORT_SYMBOL(copy_page); +EXPORT_SYMBOL(clear_page); + +EXPORT_SYMBOL(csum_partial); + +/* + * Export string functions. We normally rely on gcc builtin for most of these, + * but gcc sometimes decides not to inline them. + */ +#undef memcpy +#undef memset +#undef memmove + +extern void *memset(void *, int, __kernel_size_t); +extern void *memcpy(void *, const void *, __kernel_size_t); +extern void *__memcpy(void *, const void *, __kernel_size_t); + +EXPORT_SYMBOL(memset); +EXPORT_SYMBOL(memcpy); +EXPORT_SYMBOL(__memcpy); +EXPORT_SYMBOL(memmove); + +#ifndef CONFIG_DEBUG_VIRTUAL +EXPORT_SYMBOL(phys_base); +#endif +EXPORT_SYMBOL(empty_zero_page); +#ifndef CONFIG_PARAVIRT +EXPORT_SYMBOL(native_load_gs_index); +#endif + +#ifdef CONFIG_PREEMPT +EXPORT_SYMBOL(___preempt_schedule); +#ifdef CONFIG_CONTEXT_TRACKING +EXPORT_SYMBOL(___preempt_schedule_context); +#endif +#endif diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c new file mode 100644 index 00000000000..e48b674639c --- /dev/null +++ b/arch/x86/kernel/x86_init.c @@ -0,0 +1,163 @@ +/* + * Copyright (C) 2009 Thomas Gleixner <tglx@linutronix.de> + * + * For licencing details see kernel-base/COPYING + */ +#include <linux/init.h> +#include <linux/ioport.h> +#include <linux/module.h> +#include <linux/pci.h> + +#include <asm/bios_ebda.h> +#include <asm/paravirt.h> +#include <asm/pci_x86.h> +#include <asm/pci.h> +#include <asm/mpspec.h> +#include <asm/setup.h> +#include <asm/apic.h> +#include <asm/e820.h> +#include <asm/time.h> +#include <asm/irq.h> +#include <asm/io_apic.h> +#include <asm/hpet.h> +#include <asm/pat.h> +#include <asm/tsc.h> +#include <asm/iommu.h> +#include <asm/mach_traps.h> + +void x86_init_noop(void) { } +void __init x86_init_uint_noop(unsigned int unused) { } +int __init iommu_init_noop(void) { return 0; } +void iommu_shutdown_noop(void) { } + +/* + * The platform setup functions are preset with the default functions + * for standard PC hardware. 
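+ *
+ * Platform code overrides individual hooks rather than patching the
+ * callers; e.g. vsmp_init() earlier in this series installs its own
+ * APIC hook with
+ *
+ *	x86_platform.apic_post_init = vsmp_apic_post_init;
+ *
+ * and every later caller of the hook transparently gets the vSMP
+ * version.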
+ */ +struct x86_init_ops x86_init __initdata = { + + .resources = { + .probe_roms = probe_roms, + .reserve_resources = reserve_standard_io_resources, + .memory_setup = default_machine_specific_memory_setup, + }, + + .mpparse = { + .mpc_record = x86_init_uint_noop, + .setup_ioapic_ids = x86_init_noop, + .mpc_apic_id = default_mpc_apic_id, + .smp_read_mpc_oem = default_smp_read_mpc_oem, + .mpc_oem_bus_info = default_mpc_oem_bus_info, + .find_smp_config = default_find_smp_config, + .get_smp_config = default_get_smp_config, + }, + + .irqs = { + .pre_vector_init = init_ISA_irqs, + .intr_init = native_init_IRQ, + .trap_init = x86_init_noop, + }, + + .oem = { + .arch_setup = x86_init_noop, + .banner = default_banner, + }, + + .paging = { + .pagetable_init = native_pagetable_init, + }, + + .timers = { + .setup_percpu_clockev = setup_boot_APIC_clock, + .tsc_pre_init = x86_init_noop, + .timer_init = hpet_time_init, + .wallclock_init = x86_init_noop, + }, + + .iommu = { + .iommu_init = iommu_init_noop, + }, + + .pci = { + .init = x86_default_pci_init, + .init_irq = x86_default_pci_init_irq, + .fixup_irqs = x86_default_pci_fixup_irqs, + }, +}; + +struct x86_cpuinit_ops x86_cpuinit = { + .early_percpu_clock_init = x86_init_noop, + .setup_percpu_clockev = setup_secondary_APIC_clock, +}; + +static void default_nmi_init(void) { }; +static int default_i8042_detect(void) { return 1; }; + +struct x86_platform_ops x86_platform = { + .calibrate_tsc = native_calibrate_tsc, + .get_wallclock = mach_get_cmos_time, + .set_wallclock = mach_set_rtc_mmss, + .iommu_shutdown = iommu_shutdown_noop, + .is_untracked_pat_range = is_ISA_range, + .nmi_init = default_nmi_init, + .get_nmi_reason = default_get_nmi_reason, + .i8042_detect = default_i8042_detect, + .save_sched_clock_state = tsc_save_sched_clock_state, + .restore_sched_clock_state = tsc_restore_sched_clock_state, +}; + +EXPORT_SYMBOL_GPL(x86_platform); + +#if defined(CONFIG_PCI_MSI) +struct x86_msi_ops x86_msi = { + .setup_msi_irqs = native_setup_msi_irqs, + .compose_msi_msg = native_compose_msi_msg, + .teardown_msi_irq = native_teardown_msi_irq, + .teardown_msi_irqs = default_teardown_msi_irqs, + .restore_msi_irqs = default_restore_msi_irqs, + .setup_hpet_msi = default_setup_hpet_msi, + .msi_mask_irq = default_msi_mask_irq, + .msix_mask_irq = default_msix_mask_irq, +}; + +/* MSI arch specific hooks */ +int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) +{ + return x86_msi.setup_msi_irqs(dev, nvec, type); +} + +void arch_teardown_msi_irqs(struct pci_dev *dev) +{ + x86_msi.teardown_msi_irqs(dev); +} + +void arch_teardown_msi_irq(unsigned int irq) +{ + x86_msi.teardown_msi_irq(irq); +} + +void arch_restore_msi_irqs(struct pci_dev *dev) +{ + x86_msi.restore_msi_irqs(dev); +} +u32 arch_msi_mask_irq(struct msi_desc *desc, u32 mask, u32 flag) +{ + return x86_msi.msi_mask_irq(desc, mask, flag); +} +u32 arch_msix_mask_irq(struct msi_desc *desc, u32 flag) +{ + return x86_msi.msix_mask_irq(desc, flag); +} +#endif + +struct x86_io_apic_ops x86_io_apic_ops = { + .init = native_io_apic_init_mappings, + .read = native_io_apic_read, + .write = native_io_apic_write, + .modify = native_io_apic_modify, + .disable = native_disable_io_apic, + .print_entries = native_io_apic_print_entries, + .set_affinity = native_ioapic_set_affinity, + .setup_entry = native_setup_ioapic_entry, + .eoi_ioapic_pin = native_eoi_ioapic_pin, +}; diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c new file mode 100644 index 00000000000..a4b451c6add --- /dev/null +++ 
b/arch/x86/kernel/xsave.c @@ -0,0 +1,637 @@ +/* + * xsave/xrstor support. + * + * Author: Suresh Siddha <suresh.b.siddha@intel.com> + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/bootmem.h> +#include <linux/compat.h> +#include <asm/i387.h> +#include <asm/fpu-internal.h> +#include <asm/sigframe.h> +#include <asm/xcr.h> + +/* + * Supported feature mask by the CPU and the kernel. + */ +u64 pcntxt_mask; + +/* + * Represents init state for the supported extended state. + */ +struct xsave_struct *init_xstate_buf; + +static struct _fpx_sw_bytes fx_sw_reserved, fx_sw_reserved_ia32; +static unsigned int *xstate_offsets, *xstate_sizes, xstate_features; + +/* + * If a processor implementation discern that a processor state component is + * in its initialized state it may modify the corresponding bit in the + * xsave_hdr.xstate_bv as '0', with out modifying the corresponding memory + * layout in the case of xsaveopt. While presenting the xstate information to + * the user, we always ensure that the memory layout of a feature will be in + * the init state if the corresponding header bit is zero. This is to ensure + * that the user doesn't see some stale state in the memory layout during + * signal handling, debugging etc. + */ +void __sanitize_i387_state(struct task_struct *tsk) +{ + struct i387_fxsave_struct *fx = &tsk->thread.fpu.state->fxsave; + int feature_bit = 0x2; + u64 xstate_bv; + + if (!fx) + return; + + xstate_bv = tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv; + + /* + * None of the feature bits are in init state. So nothing else + * to do for us, as the memory layout is up to date. + */ + if ((xstate_bv & pcntxt_mask) == pcntxt_mask) + return; + + /* + * FP is in init state + */ + if (!(xstate_bv & XSTATE_FP)) { + fx->cwd = 0x37f; + fx->swd = 0; + fx->twd = 0; + fx->fop = 0; + fx->rip = 0; + fx->rdp = 0; + memset(&fx->st_space[0], 0, 128); + } + + /* + * SSE is in init state + */ + if (!(xstate_bv & XSTATE_SSE)) + memset(&fx->xmm_space[0], 0, 256); + + xstate_bv = (pcntxt_mask & ~xstate_bv) >> 2; + + /* + * Update all the other memory layouts for which the corresponding + * header bit is in the init state. + */ + while (xstate_bv) { + if (xstate_bv & 0x1) { + int offset = xstate_offsets[feature_bit]; + int size = xstate_sizes[feature_bit]; + + memcpy(((void *) fx) + offset, + ((void *) init_xstate_buf) + offset, + size); + } + + xstate_bv >>= 1; + feature_bit++; + } +} + +/* + * Check for the presence of extended state information in the + * user fpstate pointer in the sigcontext. + */ +static inline int check_for_xstate(struct i387_fxsave_struct __user *buf, + void __user *fpstate, + struct _fpx_sw_bytes *fx_sw) +{ + int min_xstate_size = sizeof(struct i387_fxsave_struct) + + sizeof(struct xsave_hdr_struct); + unsigned int magic2; + + if (__copy_from_user(fx_sw, &buf->sw_reserved[0], sizeof(*fx_sw))) + return -1; + + /* Check for the first magic field and other error scenarios. */ + if (fx_sw->magic1 != FP_XSTATE_MAGIC1 || + fx_sw->xstate_size < min_xstate_size || + fx_sw->xstate_size > xstate_size || + fx_sw->xstate_size > fx_sw->extended_size) + return -1; + + /* + * Check for the presence of second magic word at the end of memory + * layout. This detects the case where the user just copied the legacy + * fpstate layout with out copying the extended state information + * in the memory layout. 
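+ *
+ * Frame layout being validated, assuming a 64-bit signal frame:
+ *
+ *	fpstate + 0		legacy i387_fxsave_struct (512 bytes;
+ *				its sw_reserved area carries fx_sw)
+ *	fpstate + 512		xsave_hdr_struct (xstate_bv, ...)
+ *	fpstate + xstate_size	FP_XSTATE_MAGIC2 (u32), checked below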
+ */ + if (__get_user(magic2, (__u32 __user *)(fpstate + fx_sw->xstate_size)) + || magic2 != FP_XSTATE_MAGIC2) + return -1; + + return 0; +} + +/* + * Signal frame handlers. + */ +static inline int save_fsave_header(struct task_struct *tsk, void __user *buf) +{ + if (use_fxsr()) { + struct xsave_struct *xsave = &tsk->thread.fpu.state->xsave; + struct user_i387_ia32_struct env; + struct _fpstate_ia32 __user *fp = buf; + + convert_from_fxsr(&env, tsk); + + if (__copy_to_user(buf, &env, sizeof(env)) || + __put_user(xsave->i387.swd, &fp->status) || + __put_user(X86_FXSR_MAGIC, &fp->magic)) + return -1; + } else { + struct i387_fsave_struct __user *fp = buf; + u32 swd; + if (__get_user(swd, &fp->swd) || __put_user(swd, &fp->status)) + return -1; + } + + return 0; +} + +static inline int save_xstate_epilog(void __user *buf, int ia32_frame) +{ + struct xsave_struct __user *x = buf; + struct _fpx_sw_bytes *sw_bytes; + u32 xstate_bv; + int err; + + /* Setup the bytes not touched by the [f]xsave and reserved for SW. */ + sw_bytes = ia32_frame ? &fx_sw_reserved_ia32 : &fx_sw_reserved; + err = __copy_to_user(&x->i387.sw_reserved, sw_bytes, sizeof(*sw_bytes)); + + if (!use_xsave()) + return err; + + err |= __put_user(FP_XSTATE_MAGIC2, (__u32 *)(buf + xstate_size)); + + /* + * Read the xstate_bv which we copied (directly from the cpu or + * from the state in task struct) to the user buffers. + */ + err |= __get_user(xstate_bv, (__u32 *)&x->xsave_hdr.xstate_bv); + + /* + * For legacy compatible, we always set FP/SSE bits in the bit + * vector while saving the state to the user context. This will + * enable us capturing any changes(during sigreturn) to + * the FP/SSE bits by the legacy applications which don't touch + * xstate_bv in the xsave header. + * + * xsave aware apps can change the xstate_bv in the xsave + * header as well as change any contents in the memory layout. + * xrestore as part of sigreturn will capture all the changes. + */ + xstate_bv |= XSTATE_FPSSE; + + err |= __put_user(xstate_bv, (__u32 *)&x->xsave_hdr.xstate_bv); + + return err; +} + +static inline int save_user_xstate(struct xsave_struct __user *buf) +{ + int err; + + if (use_xsave()) + err = xsave_user(buf); + else if (use_fxsr()) + err = fxsave_user((struct i387_fxsave_struct __user *) buf); + else + err = fsave_user((struct i387_fsave_struct __user *) buf); + + if (unlikely(err) && __clear_user(buf, xstate_size)) + err = -EFAULT; + return err; +} + +/* + * Save the fpu, extended register state to the user signal frame. + * + * 'buf_fx' is the 64-byte aligned pointer at which the [f|fx|x]save + * state is copied. + * 'buf' points to the 'buf_fx' or to the fsave header followed by 'buf_fx'. + * + * buf == buf_fx for 64-bit frames and 32-bit fsave frame. + * buf != buf_fx for 32-bit frames with fxstate. + * + * If the fpu, extended register state is live, save the state directly + * to the user frame pointed by the aligned pointer 'buf_fx'. Otherwise, + * copy the thread's fpu state to the user frame starting at 'buf_fx'. + * + * If this is a 32-bit frame with fxstate, put a fsave header before + * the aligned state at 'buf_fx'. + * + * For [f]xsave state, update the SW reserved fields in the [f]xsave frame + * indicating the absence/presence of the extended state to the user. 
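+ *
+ * E.g. for a 32-bit app with fxstate the signal setup code lays out
+ *
+ *	buf	-> fsave header (struct _fpstate_ia32)
+ *	buf_fx	-> the 64-byte aligned [f]xsave image after it
+ *
+ * so buf != buf_fx and ia32_fxstate below evaluates true.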
+ */ +int save_xstate_sig(void __user *buf, void __user *buf_fx, int size) +{ + struct xsave_struct *xsave = ¤t->thread.fpu.state->xsave; + struct task_struct *tsk = current; + int ia32_fxstate = (buf != buf_fx); + + ia32_fxstate &= (config_enabled(CONFIG_X86_32) || + config_enabled(CONFIG_IA32_EMULATION)); + + if (!access_ok(VERIFY_WRITE, buf, size)) + return -EACCES; + + if (!static_cpu_has(X86_FEATURE_FPU)) + return fpregs_soft_get(current, NULL, 0, + sizeof(struct user_i387_ia32_struct), NULL, + (struct _fpstate_ia32 __user *) buf) ? -1 : 1; + + if (user_has_fpu()) { + /* Save the live register state to the user directly. */ + if (save_user_xstate(buf_fx)) + return -1; + /* Update the thread's fxstate to save the fsave header. */ + if (ia32_fxstate) + fpu_fxsave(&tsk->thread.fpu); + } else { + sanitize_i387_state(tsk); + if (__copy_to_user(buf_fx, xsave, xstate_size)) + return -1; + } + + /* Save the fsave header for the 32-bit frames. */ + if ((ia32_fxstate || !use_fxsr()) && save_fsave_header(tsk, buf)) + return -1; + + if (use_fxsr() && save_xstate_epilog(buf_fx, ia32_fxstate)) + return -1; + + drop_init_fpu(tsk); /* trigger finit */ + + return 0; +} + +static inline void +sanitize_restored_xstate(struct task_struct *tsk, + struct user_i387_ia32_struct *ia32_env, + u64 xstate_bv, int fx_only) +{ + struct xsave_struct *xsave = &tsk->thread.fpu.state->xsave; + struct xsave_hdr_struct *xsave_hdr = &xsave->xsave_hdr; + + if (use_xsave()) { + /* These bits must be zero. */ + xsave_hdr->reserved1[0] = xsave_hdr->reserved1[1] = 0; + + /* + * Init the state that is not present in the memory + * layout and not enabled by the OS. + */ + if (fx_only) + xsave_hdr->xstate_bv = XSTATE_FPSSE; + else + xsave_hdr->xstate_bv &= (pcntxt_mask & xstate_bv); + } + + if (use_fxsr()) { + /* + * mscsr reserved bits must be masked to zero for security + * reasons. + */ + xsave->i387.mxcsr &= mxcsr_feature_mask; + + convert_to_fxsr(tsk, ia32_env); + } +} + +/* + * Restore the extended state if present. Otherwise, restore the FP/SSE state. + */ +static inline int restore_user_xstate(void __user *buf, u64 xbv, int fx_only) +{ + if (use_xsave()) { + if ((unsigned long)buf % 64 || fx_only) { + u64 init_bv = pcntxt_mask & ~XSTATE_FPSSE; + xrstor_state(init_xstate_buf, init_bv); + return fxrstor_user(buf); + } else { + u64 init_bv = pcntxt_mask & ~xbv; + if (unlikely(init_bv)) + xrstor_state(init_xstate_buf, init_bv); + return xrestore_user(buf, xbv); + } + } else if (use_fxsr()) { + return fxrstor_user(buf); + } else + return frstor_user(buf); +} + +int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size) +{ + int ia32_fxstate = (buf != buf_fx); + struct task_struct *tsk = current; + int state_size = xstate_size; + u64 xstate_bv = 0; + int fx_only = 0; + + ia32_fxstate &= (config_enabled(CONFIG_X86_32) || + config_enabled(CONFIG_IA32_EMULATION)); + + if (!buf) { + drop_init_fpu(tsk); + return 0; + } + + if (!access_ok(VERIFY_READ, buf, size)) + return -EACCES; + + if (!used_math() && init_fpu(tsk)) + return -1; + + if (!static_cpu_has(X86_FEATURE_FPU)) + return fpregs_soft_set(current, NULL, + 0, sizeof(struct user_i387_ia32_struct), + NULL, buf) != 0; + + if (use_xsave()) { + struct _fpx_sw_bytes fx_sw_user; + if (unlikely(check_for_xstate(buf_fx, buf_fx, &fx_sw_user))) { + /* + * Couldn't find the extended state information in the + * memory layout. Restore just the FP/SSE and init all + * the other extended state. 
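+ * (This is the path taken e.g. for a frame written by an old
+ * fxsave-only kernel or memcpy'd from one: magic1/magic2 will not
+ * match, so we fall back to the 512-byte legacy size below.)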
+ */ + state_size = sizeof(struct i387_fxsave_struct); + fx_only = 1; + } else { + state_size = fx_sw_user.xstate_size; + xstate_bv = fx_sw_user.xstate_bv; + } + } + + if (ia32_fxstate) { + /* + * For 32-bit frames with fxstate, copy the user state to the + * thread's fpu state, reconstruct fxstate from the fsave + * header. Sanitize the copied state etc. + */ + struct xsave_struct *xsave = &tsk->thread.fpu.state->xsave; + struct user_i387_ia32_struct env; + int err = 0; + + /* + * Drop the current fpu which clears used_math(). This ensures + * that any context-switch during the copy of the new state, + * avoids the intermediate state from getting restored/saved. + * Thus avoiding the new restored state from getting corrupted. + * We will be ready to restore/save the state only after + * set_used_math() is again set. + */ + drop_fpu(tsk); + + if (__copy_from_user(xsave, buf_fx, state_size) || + __copy_from_user(&env, buf, sizeof(env))) { + err = -1; + } else { + sanitize_restored_xstate(tsk, &env, xstate_bv, fx_only); + set_used_math(); + } + + if (use_eager_fpu()) + math_state_restore(); + + return err; + } else { + /* + * For 64-bit frames and 32-bit fsave frames, restore the user + * state to the registers directly (with exceptions handled). + */ + user_fpu_begin(); + if (restore_user_xstate(buf_fx, xstate_bv, fx_only)) { + drop_init_fpu(tsk); + return -1; + } + } + + return 0; +} + +/* + * Prepare the SW reserved portion of the fxsave memory layout, indicating + * the presence of the extended state information in the memory layout + * pointed by the fpstate pointer in the sigcontext. + * This will be saved when ever the FP and extended state context is + * saved on the user stack during the signal handler delivery to the user. + */ +static void prepare_fx_sw_frame(void) +{ + int fsave_header_size = sizeof(struct i387_fsave_struct); + int size = xstate_size + FP_XSTATE_MAGIC2_SIZE; + + if (config_enabled(CONFIG_X86_32)) + size += fsave_header_size; + + fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1; + fx_sw_reserved.extended_size = size; + fx_sw_reserved.xstate_bv = pcntxt_mask; + fx_sw_reserved.xstate_size = xstate_size; + + if (config_enabled(CONFIG_IA32_EMULATION)) { + fx_sw_reserved_ia32 = fx_sw_reserved; + fx_sw_reserved_ia32.extended_size += fsave_header_size; + } +} + +/* + * Enable the extended processor state save/restore feature + */ +static inline void xstate_enable(void) +{ + set_in_cr4(X86_CR4_OSXSAVE); + xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask); +} + +/* + * Record the offsets and sizes of different state managed by the xsave + * memory layout. 
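+ *
+ * Each feature bit i >= 2 is described by CPUID leaf 0xD, subleaf i:
+ * EAX is the size and EBX the offset of that state in the xsave
+ * image. E.g. a typical AVX-capable CPU reports subleaf 2 as size 256
+ * at offset 576 (right after the 512-byte legacy area and the 64-byte
+ * xsave header).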
+ */ +static void __init setup_xstate_features(void) +{ + int eax, ebx, ecx, edx, leaf = 0x2; + + xstate_features = fls64(pcntxt_mask); + xstate_offsets = alloc_bootmem(xstate_features * sizeof(int)); + xstate_sizes = alloc_bootmem(xstate_features * sizeof(int)); + + do { + cpuid_count(XSTATE_CPUID, leaf, &eax, &ebx, &ecx, &edx); + + if (eax == 0) + break; + + xstate_offsets[leaf] = ebx; + xstate_sizes[leaf] = eax; + + leaf++; + } while (1); +} + +/* + * setup the xstate image representing the init state + */ +static void __init setup_init_fpu_buf(void) +{ + /* + * Setup init_xstate_buf to represent the init state of + * all the features managed by the xsave + */ + init_xstate_buf = alloc_bootmem_align(xstate_size, + __alignof__(struct xsave_struct)); + fx_finit(&init_xstate_buf->i387); + + if (!cpu_has_xsave) + return; + + setup_xstate_features(); + + /* + * Init all the features state with header_bv being 0x0 + */ + xrstor_state(init_xstate_buf, -1); + /* + * Dump the init state again. This is to identify the init state + * of any feature which is not represented by all zero's. + */ + xsave_state(init_xstate_buf, -1); +} + +static enum { AUTO, ENABLE, DISABLE } eagerfpu = AUTO; +static int __init eager_fpu_setup(char *s) +{ + if (!strcmp(s, "on")) + eagerfpu = ENABLE; + else if (!strcmp(s, "off")) + eagerfpu = DISABLE; + else if (!strcmp(s, "auto")) + eagerfpu = AUTO; + return 1; +} +__setup("eagerfpu=", eager_fpu_setup); + +/* + * Enable and initialize the xsave feature. + */ +static void __init xstate_enable_boot_cpu(void) +{ + unsigned int eax, ebx, ecx, edx; + + if (boot_cpu_data.cpuid_level < XSTATE_CPUID) { + WARN(1, KERN_ERR "XSTATE_CPUID missing\n"); + return; + } + + cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); + pcntxt_mask = eax + ((u64)edx << 32); + + if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) { + pr_err("FP/SSE not shown under xsave features 0x%llx\n", + pcntxt_mask); + BUG(); + } + + /* + * Support only the state known to OS. + */ + pcntxt_mask = pcntxt_mask & XCNTXT_MASK; + + xstate_enable(); + + /* + * Recompute the context size for enabled features + */ + cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); + xstate_size = ebx; + + update_regset_xstate_info(xstate_size, pcntxt_mask); + prepare_fx_sw_frame(); + setup_init_fpu_buf(); + + /* Auto enable eagerfpu for xsaveopt */ + if (cpu_has_xsaveopt && eagerfpu != DISABLE) + eagerfpu = ENABLE; + + if (pcntxt_mask & XSTATE_EAGER) { + if (eagerfpu == DISABLE) { + pr_err("eagerfpu not present, disabling some xstate features: 0x%llx\n", + pcntxt_mask & XSTATE_EAGER); + pcntxt_mask &= ~XSTATE_EAGER; + } else { + eagerfpu = ENABLE; + } + } + + pr_info("enabled xstate_bv 0x%llx, cntxt size 0x%x\n", + pcntxt_mask, xstate_size); +} + +/* + * For the very first instance, this calls xstate_enable_boot_cpu(); + * for all subsequent instances, this calls xstate_enable(). + * + * This is somewhat obfuscated due to the lack of powerful enough + * overrides for the section checks. 
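+ *
+ * I.e. the first call goes through the __refdata pointer into the
+ * __init function xstate_enable_boot_cpu(), which is then swapped
+ * for xstate_enable(), so CPUs brought up after init memory has been
+ * freed never jump into discarded code.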
+ */ +void xsave_init(void) +{ + static __refdata void (*next_func)(void) = xstate_enable_boot_cpu; + void (*this_func)(void); + + if (!cpu_has_xsave) + return; + + this_func = next_func; + next_func = xstate_enable; + this_func(); +} + +static inline void __init eager_fpu_init_bp(void) +{ + current->thread.fpu.state = + alloc_bootmem_align(xstate_size, __alignof__(struct xsave_struct)); + if (!init_xstate_buf) + setup_init_fpu_buf(); +} + +void eager_fpu_init(void) +{ + static __refdata void (*boot_func)(void) = eager_fpu_init_bp; + + clear_used_math(); + current_thread_info()->status = 0; + + if (eagerfpu == ENABLE) + setup_force_cpu_cap(X86_FEATURE_EAGER_FPU); + + if (!cpu_has_eager_fpu) { + stts(); + return; + } + + if (boot_func) { + boot_func(); + boot_func = NULL; + } + + /* + * This is same as math_state_restore(). But use_xsave() is + * not yet patched to use math_state_restore(). + */ + init_fpu(current); + __thread_fpu_begin(current); + if (cpu_has_xsave) + xrstor_state(init_xstate_buf, -1); + else + fxrstor_checking(&init_xstate_buf->i387); +} |
