aboutsummaryrefslogtreecommitdiff
path: root/arch/x86/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/Makefile19
-rw-r--r--arch/x86/kernel/acpi/boot.c121
-rw-r--r--arch/x86/kernel/acpi/cstate.c27
-rw-r--r--arch/x86/kernel/acpi/sleep.c11
-rw-r--r--arch/x86/kernel/acpi/sleep.h2
-rw-r--r--arch/x86/kernel/acpi/wakeup_32.S2
-rw-r--r--arch/x86/kernel/acpi/wakeup_64.S2
-rw-r--r--arch/x86/kernel/alternative.c14
-rw-r--r--arch/x86/kernel/amd_gart_64.c2
-rw-r--r--arch/x86/kernel/amd_nb.c4
-rw-r--r--arch/x86/kernel/apb_timer.c10
-rw-r--r--arch/x86/kernel/aperture_64.c79
-rw-r--r--arch/x86/kernel/apic/Makefile3
-rw-r--r--arch/x86/kernel/apic/apic.c90
-rw-r--r--arch/x86/kernel/apic/apic_flat_64.c9
-rw-r--r--arch/x86/kernel/apic/apic_noop.c4
-rw-r--r--arch/x86/kernel/apic/apic_numachip.c2
-rw-r--r--arch/x86/kernel/apic/bigsmp_32.c3
-rw-r--r--arch/x86/kernel/apic/es7000_32.c746
-rw-r--r--arch/x86/kernel/apic/hw_nmi.c23
-rw-r--r--arch/x86/kernel/apic/io_apic.c182
-rw-r--r--arch/x86/kernel/apic/ipi.c1
-rw-r--r--arch/x86/kernel/apic/numaq_32.c525
-rw-r--r--arch/x86/kernel/apic/probe_32.c3
-rw-r--r--arch/x86/kernel/apic/summit_32.c552
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c3
-rw-r--r--arch/x86/kernel/apic/x2apic_phys.c3
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c101
-rw-r--r--arch/x86/kernel/apm_32.c12
-rw-r--r--arch/x86/kernel/asm-offsets.c1
-rw-r--r--arch/x86/kernel/check.c2
-rw-r--r--arch/x86/kernel/cpu/Makefile3
-rw-r--r--arch/x86/kernel/cpu/amd.c70
-rw-r--r--arch/x86/kernel/cpu/centaur.c281
-rw-r--r--arch/x86/kernel/cpu/common.c79
-rw-r--r--arch/x86/kernel/cpu/cpu.h20
-rw-r--r--arch/x86/kernel/cpu/cyrix.c1
-rw-r--r--arch/x86/kernel/cpu/intel.c133
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c27
-rw-r--r--arch/x86/kernel/cpu/match.c42
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-apei.c15
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c91
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel.c34
-rw-r--r--arch/x86/kernel/cpu/mcheck/p5.c1
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c22
-rw-r--r--arch/x86/kernel/cpu/mcheck/threshold.c4
-rw-r--r--arch/x86/kernel/cpu/mcheck/winchip.c1
-rw-r--r--arch/x86/kernel/cpu/microcode/Makefile7
-rw-r--r--arch/x86/kernel/cpu/microcode/amd.c (renamed from arch/x86/kernel/microcode_amd.c)17
-rw-r--r--arch/x86/kernel/cpu/microcode/amd_early.c (renamed from arch/x86/kernel/microcode_amd_early.c)266
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c (renamed from arch/x86/kernel/microcode_core.c)6
-rw-r--r--arch/x86/kernel/cpu/microcode/core_early.c (renamed from arch/x86/kernel/microcode_core_early.c)37
-rw-r--r--arch/x86/kernel/cpu/microcode/intel.c (renamed from arch/x86/kernel/microcode_intel.c)2
-rw-r--r--arch/x86/kernel/cpu/microcode/intel_early.c (renamed from arch/x86/kernel/microcode_intel_early.c)10
-rw-r--r--arch/x86/kernel/cpu/microcode/intel_lib.c (renamed from arch/x86/kernel/microcode_intel_lib.c)0
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c102
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c4
-rw-r--r--arch/x86/kernel/cpu/perf_event.c110
-rw-r--r--arch/x86/kernel/cpu/perf_event.h42
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd_ibs.c62
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd_uncore.c7
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel.c168
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_ds.c231
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_lbr.c36
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_rapl.c714
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore.c702
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore.h5
-rw-r--r--arch/x86/kernel/cpu/perf_event_p4.c34
-rw-r--r--arch/x86/kernel/cpu/perf_event_p6.c48
-rw-r--r--arch/x86/kernel/cpu/proc.c15
-rw-r--r--arch/x86/kernel/cpu/rdrand.c15
-rw-r--r--arch/x86/kernel/cpu/scattered.c2
-rw-r--r--arch/x86/kernel/cpu/transmeta.c1
-rw-r--r--arch/x86/kernel/cpu/umc.c5
-rw-r--r--arch/x86/kernel/cpuid.c15
-rw-r--r--arch/x86/kernel/crash.c5
-rw-r--r--arch/x86/kernel/devicetree.c53
-rw-r--r--arch/x86/kernel/doublefault.c1
-rw-r--r--arch/x86/kernel/dumpstack.c20
-rw-r--r--arch/x86/kernel/dumpstack_32.c46
-rw-r--r--arch/x86/kernel/dumpstack_64.c118
-rw-r--r--arch/x86/kernel/e820.c2
-rw-r--r--arch/x86/kernel/early-quirks.c280
-rw-r--r--arch/x86/kernel/early_printk.c9
-rw-r--r--arch/x86/kernel/entry_32.S82
-rw-r--r--arch/x86/kernel/entry_64.S488
-rw-r--r--arch/x86/kernel/espfix_64.c208
-rw-r--r--arch/x86/kernel/ftrace.c204
-rw-r--r--arch/x86/kernel/head32.c6
-rw-r--r--arch/x86/kernel/head64.c6
-rw-r--r--arch/x86/kernel/head_32.S7
-rw-r--r--arch/x86/kernel/head_64.S6
-rw-r--r--arch/x86/kernel/hpet.c20
-rw-r--r--arch/x86/kernel/hw_breakpoint.c6
-rw-r--r--arch/x86/kernel/i386_ksyms_32.c7
-rw-r--r--arch/x86/kernel/i387.c17
-rw-r--r--arch/x86/kernel/i8259.c23
-rw-r--r--arch/x86/kernel/iosf_mbi.c237
-rw-r--r--arch/x86/kernel/irq.c123
-rw-r--r--arch/x86/kernel/irq_32.c103
-rw-r--r--arch/x86/kernel/irq_64.c21
-rw-r--r--arch/x86/kernel/irqinit.c4
-rw-r--r--arch/x86/kernel/jump_label.c25
-rw-r--r--arch/x86/kernel/kgdb.c1
-rw-r--r--arch/x86/kernel/kprobes/core.c147
-rw-r--r--arch/x86/kernel/kprobes/ftrace.c17
-rw-r--r--arch/x86/kernel/kprobes/opt.c32
-rw-r--r--arch/x86/kernel/ksysfs.c340
-rw-r--r--arch/x86/kernel/kvm.c62
-rw-r--r--arch/x86/kernel/kvmclock.c3
-rw-r--r--arch/x86/kernel/ldt.c5
-rw-r--r--arch/x86/kernel/machine_kexec_32.c1
-rw-r--r--arch/x86/kernel/machine_kexec_64.c2
-rw-r--r--arch/x86/kernel/mcount_64.S217
-rw-r--r--arch/x86/kernel/module.c46
-rw-r--r--arch/x86/kernel/msr.c18
-rw-r--r--arch/x86/kernel/nmi.c55
-rw-r--r--arch/x86/kernel/paravirt.c6
-rw-r--r--arch/x86/kernel/paravirt_patch_64.c2
-rw-r--r--arch/x86/kernel/pci-calgary_64.c31
-rw-r--r--arch/x86/kernel/pci-dma.c13
-rw-r--r--arch/x86/kernel/pci-nommu.c1
-rw-r--r--arch/x86/kernel/pci-swiotlb.c9
-rw-r--r--arch/x86/kernel/preempt.S25
-rw-r--r--arch/x86/kernel/process.c11
-rw-r--r--arch/x86/kernel/process_32.c17
-rw-r--r--arch/x86/kernel/process_64.c21
-rw-r--r--arch/x86/kernel/ptrace.c8
-rw-r--r--arch/x86/kernel/pvclock.c13
-rw-r--r--arch/x86/kernel/quirks.c39
-rw-r--r--arch/x86/kernel/reboot.c377
-rw-r--r--arch/x86/kernel/rtc.c12
-rw-r--r--arch/x86/kernel/setup.c81
-rw-r--r--arch/x86/kernel/signal.c6
-rw-r--r--arch/x86/kernel/smp.c2
-rw-r--r--arch/x86/kernel/smpboot.c98
-rw-r--r--arch/x86/kernel/sysfb_simplefb.c4
-rw-r--r--arch/x86/kernel/time.c4
-rw-r--r--arch/x86/kernel/topology.c11
-rw-r--r--arch/x86/kernel/traps.c182
-rw-r--r--arch/x86/kernel/tsc.c334
-rw-r--r--arch/x86/kernel/tsc_msr.c127
-rw-r--r--arch/x86/kernel/tsc_sync.c1
-rw-r--r--arch/x86/kernel/uprobes.c842
-rw-r--r--arch/x86/kernel/vmlinux.lds.S17
-rw-r--r--arch/x86/kernel/vsmp_64.c19
-rw-r--r--arch/x86/kernel/vsyscall_64.c64
-rw-r--r--arch/x86/kernel/vsyscall_gtod.c69
-rw-r--r--arch/x86/kernel/x8664_ksyms_64.c7
-rw-r--r--arch/x86/kernel/x86_init.c14
-rw-r--r--arch/x86/kernel/xsave.c10
151 files changed, 6637 insertions, 4958 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index a5408b965c9..047f9ff2e36 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -26,16 +26,21 @@ obj-$(CONFIG_IRQ_WORK) += irq_work.o
obj-y += probe_roms.o
obj-$(CONFIG_X86_32) += i386_ksyms_32.o
obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
-obj-y += syscall_$(BITS).o
+obj-$(CONFIG_X86_64) += mcount_64.o
+obj-y += syscall_$(BITS).o vsyscall_gtod.o
obj-$(CONFIG_X86_64) += vsyscall_64.o
obj-$(CONFIG_X86_64) += vsyscall_emu_64.o
+obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o
+obj-$(CONFIG_SYSFS) += ksysfs.o
obj-y += bootflag.o e820.o
obj-y += pci-dma.o quirks.o topology.o kdebugfs.o
obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o
-obj-y += tsc.o io_delay.o rtc.o
+obj-y += tsc.o tsc_msr.o io_delay.o rtc.o
obj-y += pci-iommu_table.o
obj-y += resource.o
+obj-$(CONFIG_PREEMPT) += preempt.o
+
obj-y += process.o
obj-y += i387.o xsave.o
obj-y += ptrace.o
@@ -89,15 +94,6 @@ obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o
-obj-$(CONFIG_MICROCODE_EARLY) += microcode_core_early.o
-obj-$(CONFIG_MICROCODE_INTEL_EARLY) += microcode_intel_early.o
-obj-$(CONFIG_MICROCODE_INTEL_LIB) += microcode_intel_lib.o
-microcode-y := microcode_core.o
-microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o
-microcode-$(CONFIG_MICROCODE_AMD) += microcode_amd.o
-obj-$(CONFIG_MICROCODE_AMD_EARLY) += microcode_amd_early.o
-obj-$(CONFIG_MICROCODE) += microcode.o
-
obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o
@@ -109,6 +105,7 @@ obj-$(CONFIG_EFI) += sysfb_efi.o
obj-$(CONFIG_PERF_EVENTS) += perf_regs.o
obj-$(CONFIG_TRACING) += tracepoint.o
+obj-$(CONFIG_IOSF_MBI) += iosf_mbi.o
###
# 64 bit specific files
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 40c76604199..86281ffb96d 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -46,7 +46,6 @@
#include "sleep.h" /* To include x86_acpi_suspend_lowlevel */
static int __initdata acpi_force = 0;
-u32 acpi_rsdt_forced;
int acpi_disabled;
EXPORT_SYMBOL(acpi_disabled);
@@ -54,10 +53,6 @@ EXPORT_SYMBOL(acpi_disabled);
# include <asm/proto.h>
#endif /* X86 */
-#define BAD_MADT_ENTRY(entry, end) ( \
- (!entry) || (unsigned long)entry + sizeof(*entry) > end || \
- ((struct acpi_subtable_header *)entry)->length < sizeof(*entry))
-
#define PREFIX "ACPI: "
int acpi_noirq; /* skip ACPI IRQ initialization */
@@ -189,24 +184,31 @@ static int __init acpi_parse_madt(struct acpi_table_header *table)
return 0;
}
-static void acpi_register_lapic(int id, u8 enabled)
+/**
+ * acpi_register_lapic - register a local apic and generates a logic cpu number
+ * @id: local apic id to register
+ * @enabled: this cpu is enabled or not
+ *
+ * Returns the logic cpu number which maps to the local apic
+ */
+static int acpi_register_lapic(int id, u8 enabled)
{
unsigned int ver = 0;
if (id >= MAX_LOCAL_APIC) {
printk(KERN_INFO PREFIX "skipped apicid that is too big\n");
- return;
+ return -EINVAL;
}
if (!enabled) {
++disabled_cpus;
- return;
+ return -EINVAL;
}
if (boot_cpu_physical_apicid != -1U)
ver = apic_version[boot_cpu_physical_apicid];
- generic_processor_info(id, ver);
+ return generic_processor_info(id, ver);
}
static int __init
@@ -607,91 +609,34 @@ static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
int nid;
nid = acpi_get_node(handle);
- if (nid == -1 || !node_online(nid))
- return;
- set_apicid_to_node(physid, nid);
- numa_set_node(cpu, nid);
+ if (nid != -1) {
+ set_apicid_to_node(physid, nid);
+ numa_set_node(cpu, nid);
+ }
#endif
}
-static int _acpi_map_lsapic(acpi_handle handle, int *pcpu)
+static int _acpi_map_lsapic(acpi_handle handle, int physid, int *pcpu)
{
- struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
- union acpi_object *obj;
- struct acpi_madt_local_apic *lapic;
- cpumask_var_t tmp_map, new_map;
- u8 physid;
int cpu;
- int retval = -ENOMEM;
- if (ACPI_FAILURE(acpi_evaluate_object(handle, "_MAT", NULL, &buffer)))
- return -EINVAL;
-
- if (!buffer.length || !buffer.pointer)
- return -EINVAL;
-
- obj = buffer.pointer;
- if (obj->type != ACPI_TYPE_BUFFER ||
- obj->buffer.length < sizeof(*lapic)) {
- kfree(buffer.pointer);
- return -EINVAL;
- }
-
- lapic = (struct acpi_madt_local_apic *)obj->buffer.pointer;
-
- if (lapic->header.type != ACPI_MADT_TYPE_LOCAL_APIC ||
- !(lapic->lapic_flags & ACPI_MADT_ENABLED)) {
- kfree(buffer.pointer);
- return -EINVAL;
- }
-
- physid = lapic->id;
-
- kfree(buffer.pointer);
- buffer.length = ACPI_ALLOCATE_BUFFER;
- buffer.pointer = NULL;
- lapic = NULL;
-
- if (!alloc_cpumask_var(&tmp_map, GFP_KERNEL))
- goto out;
-
- if (!alloc_cpumask_var(&new_map, GFP_KERNEL))
- goto free_tmp_map;
-
- cpumask_copy(tmp_map, cpu_present_mask);
- acpi_register_lapic(physid, ACPI_MADT_ENABLED);
-
- /*
- * If acpi_register_lapic successfully generates a new logical cpu
- * number, then the following will get us exactly what was mapped
- */
- cpumask_andnot(new_map, cpu_present_mask, tmp_map);
- if (cpumask_empty(new_map)) {
- printk ("Unable to map lapic to logical cpu number\n");
- retval = -EINVAL;
- goto free_new_map;
+ cpu = acpi_register_lapic(physid, ACPI_MADT_ENABLED);
+ if (cpu < 0) {
+ pr_info(PREFIX "Unable to map lapic to logical cpu number\n");
+ return cpu;
}
acpi_processor_set_pdc(handle);
-
- cpu = cpumask_first(new_map);
acpi_map_cpu2node(handle, cpu, physid);
*pcpu = cpu;
- retval = 0;
-
-free_new_map:
- free_cpumask_var(new_map);
-free_tmp_map:
- free_cpumask_var(tmp_map);
-out:
- return retval;
+ return 0;
}
/* wrapper to silence section mismatch warning */
-int __ref acpi_map_lsapic(acpi_handle handle, int *pcpu)
+int __ref acpi_map_lsapic(acpi_handle handle, int physid, int *pcpu)
{
- return _acpi_map_lsapic(handle, pcpu);
+ return _acpi_map_lsapic(handle, physid, pcpu);
}
EXPORT_SYMBOL(acpi_map_lsapic);
@@ -745,7 +690,7 @@ static int __init acpi_parse_sbf(struct acpi_table_header *table)
#ifdef CONFIG_HPET_TIMER
#include <asm/hpet.h>
-static struct __initdata resource *hpet_res;
+static struct resource *hpet_res __initdata;
static int __init acpi_parse_hpet(struct acpi_table_header *table)
{
@@ -958,10 +903,6 @@ static int __init acpi_parse_madt_lapic_entries(void)
#ifdef CONFIG_X86_IO_APIC
#define MP_ISA_BUS 0
-#ifdef CONFIG_X86_ES7000
-extern int es7000_plat;
-#endif
-
void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
{
int ioapic;
@@ -1011,14 +952,6 @@ void __init mp_config_acpi_legacy_irqs(void)
set_bit(MP_ISA_BUS, mp_bus_not_pci);
pr_debug("Bus #%d is ISA\n", MP_ISA_BUS);
-#ifdef CONFIG_X86_ES7000
- /*
- * Older generations of ES7000 have no legacy identity mappings
- */
- if (es7000_plat == 1)
- return;
-#endif
-
/*
* Use the default configuration for the IRQs 0-15. Unless
* overridden by (MADT) interrupt source override entries.
@@ -1084,9 +1017,7 @@ static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger,
if (!acpi_ioapic)
return 0;
- if (!dev)
- return 0;
- if (dev->bus != &pci_bus_type)
+ if (!dev || !dev_is_pci(dev))
return 0;
pdev = to_pci_dev(dev);
@@ -1614,7 +1545,7 @@ static int __init parse_acpi(char *arg)
}
/* acpi=rsdt use RSDT instead of XSDT */
else if (strcmp(arg, "rsdt") == 0) {
- acpi_rsdt_forced = 1;
+ acpi_gbl_do_not_use_xsdt = TRUE;
}
/* "acpi=noirq" disables ACPI interrupt routing */
else if (strcmp(arg, "noirq") == 0) {
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index d2b7f27781b..4b28159e042 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -87,7 +87,9 @@ static long acpi_processor_ffh_cstate_probe_cpu(void *_cx)
num_cstate_subtype = edx_part & MWAIT_SUBSTATE_MASK;
retval = 0;
- if (num_cstate_subtype < (cx->address & MWAIT_SUBSTATE_MASK)) {
+ /* If the HW does not support any sub-states in this C-state */
+ if (num_cstate_subtype == 0) {
+ pr_warn(FW_BUG "ACPI MWAIT C-state 0x%x not supported by HW (0x%x)\n", cx->address, edx_part);
retval = -1;
goto out;
}
@@ -150,29 +152,6 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu,
}
EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe);
-/*
- * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
- * which can obviate IPI to trigger checking of need_resched.
- * We execute MONITOR against need_resched and enter optimized wait state
- * through MWAIT. Whenever someone changes need_resched, we would be woken
- * up from MWAIT (without an IPI).
- *
- * New with Core Duo processors, MWAIT can take some hints based on CPU
- * capability.
- */
-void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
-{
- if (!need_resched()) {
- if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
- clflush((void *)&current_thread_info()->flags);
-
- __monitor((void *)&current_thread_info()->flags, 0, 0);
- smp_mb();
- if (!need_resched())
- __mwait(ax, cx);
- }
-}
-
void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)
{
unsigned int cpu = smp_processor_id();
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 33120100ff5..31368207837 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -26,6 +26,17 @@ static char temp_stack[4096];
#endif
/**
+ * x86_acpi_enter_sleep_state - enter sleep state
+ * @state: Sleep state to enter.
+ *
+ * Wrapper around acpi_enter_sleep_state() to be called by assmebly.
+ */
+acpi_status asmlinkage __visible x86_acpi_enter_sleep_state(u8 state)
+{
+ return acpi_enter_sleep_state(state);
+}
+
+/**
* x86_acpi_suspend_lowlevel - save kernel state
*
* Create an identity mapped page table and copy the wakeup routine to
diff --git a/arch/x86/kernel/acpi/sleep.h b/arch/x86/kernel/acpi/sleep.h
index c9c2c982d5e..65c7b606b60 100644
--- a/arch/x86/kernel/acpi/sleep.h
+++ b/arch/x86/kernel/acpi/sleep.h
@@ -17,3 +17,5 @@ extern void wakeup_long64(void);
extern void do_suspend_lowlevel(void);
extern int x86_acpi_suspend_lowlevel(void);
+
+acpi_status asmlinkage x86_acpi_enter_sleep_state(u8 state);
diff --git a/arch/x86/kernel/acpi/wakeup_32.S b/arch/x86/kernel/acpi/wakeup_32.S
index d1daa66ab16..665c6b7d2ea 100644
--- a/arch/x86/kernel/acpi/wakeup_32.S
+++ b/arch/x86/kernel/acpi/wakeup_32.S
@@ -73,7 +73,7 @@ ENTRY(do_suspend_lowlevel)
call save_processor_state
call save_registers
pushl $3
- call acpi_enter_sleep_state
+ call x86_acpi_enter_sleep_state
addl $4, %esp
# In case of S3 failure, we'll emerge here. Jump
diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S
index 8ea5164cbd0..ae693b51ed8 100644
--- a/arch/x86/kernel/acpi/wakeup_64.S
+++ b/arch/x86/kernel/acpi/wakeup_64.S
@@ -73,7 +73,7 @@ ENTRY(do_suspend_lowlevel)
addq $8, %rsp
movl $3, %edi
xorl %eax, %eax
- call acpi_enter_sleep_state
+ call x86_acpi_enter_sleep_state
/* in case something went wrong, restore the machine status and go on */
jmp resume_point
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 15e8563e5c2..703130f469e 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -5,7 +5,6 @@
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/stringify.h>
-#include <linux/kprobes.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/memory.h>
@@ -402,17 +401,6 @@ void alternatives_enable_smp(void)
{
struct smp_alt_module *mod;
-#ifdef CONFIG_LOCKDEP
- /*
- * Older binutils section handling bug prevented
- * alternatives-replacement from working reliably.
- *
- * If this still occurs then you should see a hang
- * or crash shortly after this line:
- */
- pr_info("lockdep: fixing up alternatives\n");
-#endif
-
/* Why bother if there are no other CPUs? */
BUG_ON(num_possible_cpus() == 1);
@@ -562,7 +550,7 @@ void *__init_or_module text_poke_early(void *addr, const void *opcode,
*
* Note: Must be called under text_mutex.
*/
-void *__kprobes text_poke(void *addr, const void *opcode, size_t len)
+void *text_poke(void *addr, const void *opcode, size_t len)
{
unsigned long flags;
char *vaddr;
diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
index b574b295a2f..8e3842fc8be 100644
--- a/arch/x86/kernel/amd_gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -512,7 +512,7 @@ gart_free_coherent(struct device *dev, size_t size, void *vaddr,
dma_addr_t dma_addr, struct dma_attrs *attrs)
{
gart_unmap_page(dev, dma_addr, size, DMA_BIDIRECTIONAL, NULL);
- free_pages((unsigned long)vaddr, get_order(size));
+ dma_generic_free_coherent(dev, size, vaddr, dma_addr, attrs);
}
static int gart_mapping_error(struct device *dev, dma_addr_t dma_addr)
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index 59554dca96e..f04dbb3069b 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -22,6 +22,7 @@ const struct pci_device_id amd_nb_misc_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M10H_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F3) },
{}
};
EXPORT_SYMBOL(amd_nb_misc_ids);
@@ -30,6 +31,7 @@ static const struct pci_device_id amd_nb_link_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F4) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F4) },
{}
};
@@ -179,7 +181,7 @@ int amd_get_subcaches(int cpu)
return (mask >> (4 * cuid)) & 0xf;
}
-int amd_set_subcaches(int cpu, int mask)
+int amd_set_subcaches(int cpu, unsigned long mask)
{
static unsigned int reset, ban;
struct amd_northbridge *nb = node_to_amd_nb(amd_get_nb_id(cpu));
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index c9876efecaf..af5b08ab3b7 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -40,7 +40,7 @@
#include <asm/fixmap.h>
#include <asm/apb_timer.h>
-#include <asm/mrst.h>
+#include <asm/intel-mid.h>
#include <asm/time.h>
#define APBT_CLOCKEVENT_RATING 110
@@ -157,13 +157,13 @@ static int __init apbt_clockevent_register(void)
adev->num = smp_processor_id();
adev->timer = dw_apb_clockevent_init(smp_processor_id(), "apbt0",
- mrst_timer_options == MRST_TIMER_LAPIC_APBT ?
+ intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT ?
APBT_CLOCKEVENT_RATING - 100 : APBT_CLOCKEVENT_RATING,
adev_virt_addr(adev), 0, apbt_freq);
/* Firmware does EOI handling for us. */
adev->timer->eoi = NULL;
- if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) {
+ if (intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT) {
global_clock_event = &adev->timer->ced;
printk(KERN_DEBUG "%s clockevent registered as global\n",
global_clock_event->name);
@@ -253,7 +253,7 @@ static int apbt_cpuhp_notify(struct notifier_block *n,
static __init int apbt_late_init(void)
{
- if (mrst_timer_options == MRST_TIMER_LAPIC_APBT ||
+ if (intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT ||
!apb_timer_block_enabled)
return 0;
/* This notifier should be called after workqueue is ready */
@@ -340,7 +340,7 @@ void __init apbt_time_init(void)
}
#ifdef CONFIG_SMP
/* kernel cmdline disable apb timer, so we will use lapic timers */
- if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) {
+ if (intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT) {
printk(KERN_INFO "apbt: disabled per cpu timer\n");
return;
}
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index fd972a3e4cb..76164e173a2 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -10,6 +10,8 @@
*
* Copyright 2002 Andi Kleen, SuSE Labs.
*/
+#define pr_fmt(fmt) "AGP: " fmt
+
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/init.h>
@@ -18,7 +20,6 @@
#include <linux/pci_ids.h>
#include <linux/pci.h>
#include <linux/bitops.h>
-#include <linux/ioport.h>
#include <linux/suspend.h>
#include <asm/e820.h>
#include <asm/io.h>
@@ -54,18 +55,6 @@ int fallback_aper_force __initdata;
int fix_aperture __initdata = 1;
-static struct resource gart_resource = {
- .name = "GART",
- .flags = IORESOURCE_MEM,
-};
-
-static void __init insert_aperture_resource(u32 aper_base, u32 aper_size)
-{
- gart_resource.start = aper_base;
- gart_resource.end = aper_base + aper_size - 1;
- insert_resource(&iomem_resource, &gart_resource);
-}
-
/* This code runs before the PCI subsystem is initialized, so just
access the northbridge directly. */
@@ -88,15 +77,13 @@ static u32 __init allocate_aperture(void)
addr = memblock_find_in_range(GART_MIN_ADDR, GART_MAX_ADDR,
aper_size, aper_size);
if (!addr) {
- printk(KERN_ERR
- "Cannot allocate aperture memory hole (%lx,%uK)\n",
- addr, aper_size>>10);
+ pr_err("Cannot allocate aperture memory hole [mem %#010lx-%#010lx] (%uKB)\n",
+ addr, addr + aper_size - 1, aper_size >> 10);
return 0;
}
memblock_reserve(addr, aper_size);
- printk(KERN_INFO "Mapping aperture over %d KB of RAM @ %lx\n",
- aper_size >> 10, addr);
- insert_aperture_resource((u32)addr, aper_size);
+ pr_info("Mapping aperture over RAM [mem %#010lx-%#010lx] (%uKB)\n",
+ addr, addr + aper_size - 1, aper_size >> 10);
register_nosave_region(addr >> PAGE_SHIFT,
(addr+aper_size) >> PAGE_SHIFT);
@@ -140,10 +127,11 @@ static u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order)
u64 aper;
u32 old_order;
- printk(KERN_INFO "AGP bridge at %02x:%02x:%02x\n", bus, slot, func);
+ pr_info("pci 0000:%02x:%02x:%02x: AGP bridge\n", bus, slot, func);
apsizereg = read_pci_config_16(bus, slot, func, cap + 0x14);
if (apsizereg == 0xffffffff) {
- printk(KERN_ERR "APSIZE in AGP bridge unreadable\n");
+ pr_err("pci 0000:%02x:%02x.%d: APSIZE unreadable\n",
+ bus, slot, func);
return 0;
}
@@ -167,16 +155,18 @@ static u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order)
* On some sick chips, APSIZE is 0. It means it wants 4G
* so let double check that order, and lets trust AMD NB settings:
*/
- printk(KERN_INFO "Aperture from AGP @ %Lx old size %u MB\n",
- aper, 32 << old_order);
+ pr_info("pci 0000:%02x:%02x.%d: AGP aperture [bus addr %#010Lx-%#010Lx] (old size %uMB)\n",
+ bus, slot, func, aper, aper + (32ULL << (old_order + 20)) - 1,
+ 32 << old_order);
if (aper + (32ULL<<(20 + *order)) > 0x100000000ULL) {
- printk(KERN_INFO "Aperture size %u MB (APSIZE %x) is not right, using settings from NB\n",
- 32 << *order, apsizereg);
+ pr_info("pci 0000:%02x:%02x.%d: AGP aperture size %uMB (APSIZE %#x) is not right, using settings from NB\n",
+ bus, slot, func, 32 << *order, apsizereg);
*order = old_order;
}
- printk(KERN_INFO "Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n",
- aper, 32 << *order, apsizereg);
+ pr_info("pci 0000:%02x:%02x.%d: AGP aperture [bus addr %#010Lx-%#010Lx] (%uMB, APSIZE %#x)\n",
+ bus, slot, func, aper, aper + (32ULL << (*order + 20)) - 1,
+ 32 << *order, apsizereg);
if (!aperture_valid(aper, (32*1024*1024) << *order, 32<<20))
return 0;
@@ -232,7 +222,7 @@ static u32 __init search_agp_bridge(u32 *order, int *valid_agp)
}
}
}
- printk(KERN_INFO "No AGP bridge found\n");
+ pr_info("No AGP bridge found\n");
return 0;
}
@@ -324,7 +314,8 @@ void __init early_gart_iommu_check(void)
if (e820_any_mapped(aper_base, aper_base + aper_size,
E820_RAM)) {
/* reserve it, so we can reuse it in second kernel */
- printk(KERN_INFO "update e820 for GART\n");
+ pr_info("e820: reserve [mem %#010Lx-%#010Lx] for GART\n",
+ aper_base, aper_base + aper_size - 1);
e820_add_region(aper_base, aper_size, E820_RESERVED);
update_e820();
}
@@ -368,7 +359,7 @@ int __init gart_iommu_hole_init(void)
!early_pci_allowed())
return -ENODEV;
- printk(KERN_INFO "Checking aperture...\n");
+ pr_info("Checking aperture...\n");
if (!fallback_aper_force)
agp_aper_base = search_agp_bridge(&agp_aper_order, &valid_agp);
@@ -409,8 +400,9 @@ int __init gart_iommu_hole_init(void)
aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff;
aper_base <<= 25;
- printk(KERN_INFO "Node %d: aperture @ %Lx size %u MB\n",
- node, aper_base, aper_size >> 20);
+ pr_info("Node %d: aperture [bus addr %#010Lx-%#010Lx] (%uMB)\n",
+ node, aper_base, aper_base + aper_size - 1,
+ aper_size >> 20);
node++;
if (!aperture_valid(aper_base, aper_size, 64<<20)) {
@@ -421,9 +413,9 @@ int __init gart_iommu_hole_init(void)
if (!no_iommu &&
max_pfn > MAX_DMA32_PFN &&
!printed_gart_size_msg) {
- printk(KERN_ERR "you are using iommu with agp, but GART size is less than 64M\n");
- printk(KERN_ERR "please increase GART size in your BIOS setup\n");
- printk(KERN_ERR "if BIOS doesn't have that option, contact your HW vendor!\n");
+ pr_err("you are using iommu with agp, but GART size is less than 64MB\n");
+ pr_err("please increase GART size in your BIOS setup\n");
+ pr_err("if BIOS doesn't have that option, contact your HW vendor!\n");
printed_gart_size_msg = 1;
}
} else {
@@ -444,12 +436,8 @@ int __init gart_iommu_hole_init(void)
out:
if (!fix && !fallback_aper_force) {
- if (last_aper_base) {
- unsigned long n = (32 * 1024 * 1024) << last_aper_order;
-
- insert_aperture_resource((u32)last_aper_base, n);
+ if (last_aper_base)
return 1;
- }
return 0;
}
@@ -464,13 +452,10 @@ out:
force_iommu ||
valid_agp ||
fallback_aper_force) {
- printk(KERN_INFO
- "Your BIOS doesn't leave a aperture memory hole\n");
- printk(KERN_INFO
- "Please enable the IOMMU option in the BIOS setup\n");
- printk(KERN_INFO
- "This costs you %d MB of RAM\n",
- 32 << fallback_aper_order);
+ pr_info("Your BIOS doesn't leave a aperture memory hole\n");
+ pr_info("Please enable the IOMMU option in the BIOS setup\n");
+ pr_info("This costs you %dMB of RAM\n",
+ 32 << fallback_aper_order);
aper_order = fallback_aper_order;
aper_alloc = allocate_aperture();
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index 0ae0323b1f9..dcb5b15401c 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -18,10 +18,7 @@ obj-y += apic_flat_64.o
endif
# APIC probe will depend on the listing order here
-obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
-obj-$(CONFIG_X86_SUMMIT) += summit_32.o
obj-$(CONFIG_X86_BIGSMP) += bigsmp_32.o
-obj-$(CONFIG_X86_ES7000) += es7000_32.o
# For 32bit, probe_32 need to be listed last
obj-$(CONFIG_X86_LOCAL_APIC) += probe_$(BITS).o
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index a7eb82d9b01..ad28db7e6bd 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -62,6 +62,7 @@ unsigned disabled_cpus;
/* Processor that is doing the boot up */
unsigned int boot_cpu_physical_apicid = -1U;
+EXPORT_SYMBOL_GPL(boot_cpu_physical_apicid);
/*
* The highest APIC ID seen during enumeration.
@@ -74,6 +75,13 @@ unsigned int max_physical_apicid;
physid_mask_t phys_cpu_present_map;
/*
+ * Processor to be disabled specified by kernel parameter
+ * disable_cpu_apicid=<int>, mostly used for the kdump 2nd kernel to
+ * avoid undefined behaviour caused by sending INIT from AP to BSP.
+ */
+static unsigned int disabled_cpu_apicid __read_mostly = BAD_APICID;
+
+/*
* Map cpu index to physical APIC ID
*/
DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid, BAD_APICID);
@@ -125,6 +133,10 @@ static inline void imcr_apic_to_pic(void)
* +1=force-enable
*/
static int force_enable_local_apic __initdata;
+
+/* Control whether x2APIC mode is enabled or not */
+static bool nox2apic __initdata;
+
/*
* APIC command line parameters
*/
@@ -154,8 +166,7 @@ int x2apic_mode;
/* x2apic enabled before OS handover */
int x2apic_preenabled;
static int x2apic_disabled;
-static int nox2apic;
-static __init int setup_nox2apic(char *str)
+static int __init setup_nox2apic(char *str)
{
if (x2apic_enabled()) {
int apicid = native_apic_msr_read(APIC_ID);
@@ -170,7 +181,7 @@ static __init int setup_nox2apic(char *str)
} else
setup_clear_cpu_cap(X86_FEATURE_X2APIC);
- nox2apic = 1;
+ nox2apic = true;
return 0;
}
@@ -275,8 +286,12 @@ u32 native_safe_apic_wait_icr_idle(void)
void native_apic_icr_write(u32 low, u32 id)
{
+ unsigned long flags;
+
+ local_irq_save(flags);
apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id));
apic_write(APIC_ICR, low);
+ local_irq_restore(flags);
}
u64 native_apic_icr_read(void)
@@ -1967,7 +1982,7 @@ __visible void smp_trace_spurious_interrupt(struct pt_regs *regs)
*/
static inline void __smp_error_interrupt(struct pt_regs *regs)
{
- u32 v0, v1;
+ u32 v;
u32 i = 0;
static const char * const error_interrupt_reason[] = {
"Send CS error", /* APIC Error Bit 0 */
@@ -1981,21 +1996,21 @@ static inline void __smp_error_interrupt(struct pt_regs *regs)
};
/* First tickle the hardware, only then report what went on. -- REW */
- v0 = apic_read(APIC_ESR);
- apic_write(APIC_ESR, 0);
- v1 = apic_read(APIC_ESR);
+ if (lapic_get_maxlvt() > 3) /* Due to the Pentium erratum 3AP. */
+ apic_write(APIC_ESR, 0);
+ v = apic_read(APIC_ESR);
ack_APIC_irq();
atomic_inc(&irq_err_count);
- apic_printk(APIC_DEBUG, KERN_DEBUG "APIC error on CPU%d: %02x(%02x)",
- smp_processor_id(), v0 , v1);
+ apic_printk(APIC_DEBUG, KERN_DEBUG "APIC error on CPU%d: %02x",
+ smp_processor_id(), v);
- v1 = v1 & 0xff;
- while (v1) {
- if (v1 & 0x1)
+ v &= 0xff;
+ while (v) {
+ if (v & 0x1)
apic_printk(APIC_DEBUG, KERN_CONT " : %s", error_interrupt_reason[i]);
i++;
- v1 >>= 1;
+ v >>= 1;
}
apic_printk(APIC_DEBUG, KERN_CONT "\n");
@@ -2107,13 +2122,45 @@ void disconnect_bsp_APIC(int virt_wire_setup)
apic_write(APIC_LVT1, value);
}
-void generic_processor_info(int apicid, int version)
+int generic_processor_info(int apicid, int version)
{
int cpu, max = nr_cpu_ids;
bool boot_cpu_detected = physid_isset(boot_cpu_physical_apicid,
phys_cpu_present_map);
/*
+ * boot_cpu_physical_apicid is designed to have the apicid
+ * returned by read_apic_id(), i.e, the apicid of the
+ * currently booting-up processor. However, on some platforms,
+ * it is temporarily modified by the apicid reported as BSP
+ * through MP table. Concretely:
+ *
+ * - arch/x86/kernel/mpparse.c: MP_processor_info()
+ * - arch/x86/mm/amdtopology.c: amd_numa_init()
+ *
+ * This function is executed with the modified
+ * boot_cpu_physical_apicid. So, disabled_cpu_apicid kernel
+ * parameter doesn't work to disable APs on kdump 2nd kernel.
+ *
+ * Since fixing handling of boot_cpu_physical_apicid requires
+ * another discussion and tests on each platform, we leave it
+ * for now and here we use read_apic_id() directly in this
+ * function, generic_processor_info().
+ */
+ if (disabled_cpu_apicid != BAD_APICID &&
+ disabled_cpu_apicid != read_apic_id() &&
+ disabled_cpu_apicid == apicid) {
+ int thiscpu = num_processors + disabled_cpus;
+
+ pr_warning("APIC: Disabling requested cpu."
+ " Processor %d/0x%x ignored.\n",
+ thiscpu, apicid);
+
+ disabled_cpus++;
+ return -ENODEV;
+ }
+
+ /*
* If boot cpu has not been detected yet, then only allow upto
* nr_cpu_ids - 1 processors and keep one slot free for boot cpu
*/
@@ -2127,7 +2174,7 @@ void generic_processor_info(int apicid, int version)
" Processor %d/0x%x ignored.\n", max, thiscpu, apicid);
disabled_cpus++;
- return;
+ return -ENODEV;
}
if (num_processors >= nr_cpu_ids) {
@@ -2138,7 +2185,7 @@ void generic_processor_info(int apicid, int version)
" Processor %d/0x%x ignored.\n", max, thiscpu, apicid);
disabled_cpus++;
- return;
+ return -EINVAL;
}
num_processors++;
@@ -2183,6 +2230,8 @@ void generic_processor_info(int apicid, int version)
#endif
set_cpu_possible(cpu, true);
set_cpu_present(cpu, true);
+
+ return cpu;
}
int hard_smp_processor_id(void)
@@ -2589,3 +2638,12 @@ static int __init lapic_insert_resource(void)
* that is using request_resource
*/
late_initcall(lapic_insert_resource);
+
+static int __init apic_set_disabled_cpu_apicid(char *arg)
+{
+ if (!arg || !get_option(&arg, &disabled_cpu_apicid))
+ return -EINVAL;
+
+ return 0;
+}
+early_param("disable_cpu_apicid", apic_set_disabled_cpu_apicid);
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 00c77cf78e9..7c1b2947951 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -14,16 +14,13 @@
#include <linux/string.h>
#include <linux/kernel.h>
#include <linux/ctype.h>
-#include <linux/init.h>
#include <linux/hardirq.h>
#include <linux/module.h>
#include <asm/smp.h>
#include <asm/apic.h>
#include <asm/ipi.h>
-#ifdef CONFIG_ACPI
-#include <acpi/acpi_bus.h>
-#endif
+#include <linux/acpi.h>
static struct apic apic_physflat;
static struct apic apic_flat;
@@ -201,7 +198,7 @@ static struct apic apic_flat = {
.trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
.trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
- .wait_for_init_deassert = NULL,
+ .wait_for_init_deassert = false,
.smp_callin_clear_local_apic = NULL,
.inquire_remote_apic = default_inquire_remote_apic,
@@ -317,7 +314,7 @@ static struct apic apic_physflat = {
.trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
.trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
- .wait_for_init_deassert = NULL,
+ .wait_for_init_deassert = false,
.smp_callin_clear_local_apic = NULL,
.inquire_remote_apic = default_inquire_remote_apic,
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index e145f28b409..8c7c98249c2 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -15,7 +15,6 @@
#include <linux/string.h>
#include <linux/kernel.h>
#include <linux/ctype.h>
-#include <linux/init.h>
#include <linux/errno.h>
#include <asm/fixmap.h>
#include <asm/mpspec.h>
@@ -173,8 +172,7 @@ struct apic apic_noop = {
.trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
.trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
- .wait_for_init_deassert = NULL,
-
+ .wait_for_init_deassert = false,
.smp_callin_clear_local_apic = NULL,
.inquire_remote_apic = NULL,
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index 3e67f9e3d7e..a5b45df8bc8 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -248,7 +248,7 @@ static const struct apic apic_numachip __refconst = {
.wakeup_secondary_cpu = numachip_wakeup_secondary,
.trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
.trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
- .wait_for_init_deassert = NULL,
+ .wait_for_init_deassert = false,
.smp_callin_clear_local_apic = NULL,
.inquire_remote_apic = NULL, /* REMRD not supported */
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index d50e3640d5a..e4840aa7a25 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -199,8 +199,7 @@ static struct apic apic_bigsmp = {
.trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
.trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
- .wait_for_init_deassert = default_wait_for_init_deassert,
-
+ .wait_for_init_deassert = true,
.smp_callin_clear_local_apic = NULL,
.inquire_remote_apic = default_inquire_remote_apic,
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
deleted file mode 100644
index c55224731b2..00000000000
--- a/arch/x86/kernel/apic/es7000_32.c
+++ /dev/null
@@ -1,746 +0,0 @@
-/*
- * Written by: Garry Forsgren, Unisys Corporation
- * Natalie Protasevich, Unisys Corporation
- *
- * This file contains the code to configure and interface
- * with Unisys ES7000 series hardware system manager.
- *
- * Copyright (c) 2003 Unisys Corporation.
- * Copyright (C) 2009, Red Hat, Inc., Ingo Molnar
- *
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write the Free Software Foundation, Inc., 59
- * Temple Place - Suite 330, Boston MA 02111-1307, USA.
- *
- * Contact information: Unisys Corporation, Township Line & Union Meeting
- * Roads-A, Unisys Way, Blue Bell, Pennsylvania, 19424, or:
- *
- * http://www.unisys.com
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/notifier.h>
-#include <linux/spinlock.h>
-#include <linux/cpumask.h>
-#include <linux/threads.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/reboot.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/errno.h>
-#include <linux/acpi.h>
-#include <linux/init.h>
-#include <linux/gfp.h>
-#include <linux/nmi.h>
-#include <linux/smp.h>
-#include <linux/io.h>
-
-#include <asm/apicdef.h>
-#include <linux/atomic.h>
-#include <asm/fixmap.h>
-#include <asm/mpspec.h>
-#include <asm/setup.h>
-#include <asm/apic.h>
-#include <asm/ipi.h>
-
-/*
- * ES7000 chipsets
- */
-
-#define NON_UNISYS 0
-#define ES7000_CLASSIC 1
-#define ES7000_ZORRO 2
-
-#define MIP_REG 1
-#define MIP_PSAI_REG 4
-
-#define MIP_BUSY 1
-#define MIP_SPIN 0xf0000
-#define MIP_VALID 0x0100000000000000ULL
-#define MIP_SW_APIC 0x1020b
-
-#define MIP_PORT(val) ((val >> 32) & 0xffff)
-
-#define MIP_RD_LO(val) (val & 0xffffffff)
-
-struct mip_reg {
- unsigned long long off_0x00;
- unsigned long long off_0x08;
- unsigned long long off_0x10;
- unsigned long long off_0x18;
- unsigned long long off_0x20;
- unsigned long long off_0x28;
- unsigned long long off_0x30;
- unsigned long long off_0x38;
-};
-
-struct mip_reg_info {
- unsigned long long mip_info;
- unsigned long long delivery_info;
- unsigned long long host_reg;
- unsigned long long mip_reg;
-};
-
-struct psai {
- unsigned long long entry_type;
- unsigned long long addr;
- unsigned long long bep_addr;
-};
-
-#ifdef CONFIG_ACPI
-
-struct es7000_oem_table {
- struct acpi_table_header Header;
- u32 OEMTableAddr;
- u32 OEMTableSize;
-};
-
-static unsigned long oem_addrX;
-static unsigned long oem_size;
-
-#endif
-
-/*
- * ES7000 Globals
- */
-
-static volatile unsigned long *psai;
-static struct mip_reg *mip_reg;
-static struct mip_reg *host_reg;
-static int mip_port;
-static unsigned long mip_addr;
-static unsigned long host_addr;
-
-int es7000_plat;
-
-/*
- * GSI override for ES7000 platforms.
- */
-
-
-static int wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip)
-{
- unsigned long vect = 0, psaival = 0;
-
- if (psai == NULL)
- return -1;
-
- vect = ((unsigned long)__pa(eip)/0x1000) << 16;
- psaival = (0x1000000 | vect | cpu);
-
- while (*psai & 0x1000000)
- ;
-
- *psai = psaival;
-
- return 0;
-}
-
-static int es7000_apic_is_cluster(void)
-{
- /* MPENTIUMIII */
- if (boot_cpu_data.x86 == 6 &&
- (boot_cpu_data.x86_model >= 7 && boot_cpu_data.x86_model <= 11))
- return 1;
-
- return 0;
-}
-
-static void setup_unisys(void)
-{
- /*
- * Determine the generation of the ES7000 currently running.
- *
- * es7000_plat = 1 if the machine is a 5xx ES7000 box
- * es7000_plat = 2 if the machine is a x86_64 ES7000 box
- *
- */
- if (!(boot_cpu_data.x86 <= 15 && boot_cpu_data.x86_model <= 2))
- es7000_plat = ES7000_ZORRO;
- else
- es7000_plat = ES7000_CLASSIC;
-}
-
-/*
- * Parse the OEM Table:
- */
-static int parse_unisys_oem(char *oemptr)
-{
- int i;
- int success = 0;
- unsigned char type, size;
- unsigned long val;
- char *tp = NULL;
- struct psai *psaip = NULL;
- struct mip_reg_info *mi;
- struct mip_reg *host, *mip;
-
- tp = oemptr;
-
- tp += 8;
-
- for (i = 0; i <= 6; i++) {
- type = *tp++;
- size = *tp++;
- tp -= 2;
- switch (type) {
- case MIP_REG:
- mi = (struct mip_reg_info *)tp;
- val = MIP_RD_LO(mi->host_reg);
- host_addr = val;
- host = (struct mip_reg *)val;
- host_reg = __va(host);
- val = MIP_RD_LO(mi->mip_reg);
- mip_port = MIP_PORT(mi->mip_info);
- mip_addr = val;
- mip = (struct mip_reg *)val;
- mip_reg = __va(mip);
- pr_debug("host_reg = 0x%lx\n",
- (unsigned long)host_reg);
- pr_debug("mip_reg = 0x%lx\n",
- (unsigned long)mip_reg);
- success++;
- break;
- case MIP_PSAI_REG:
- psaip = (struct psai *)tp;
- if (tp != NULL) {
- if (psaip->addr)
- psai = __va(psaip->addr);
- else
- psai = NULL;
- success++;
- }
- break;
- default:
- break;
- }
- tp += size;
- }
-
- if (success < 2)
- es7000_plat = NON_UNISYS;
- else
- setup_unisys();
-
- return es7000_plat;
-}
-
-#ifdef CONFIG_ACPI
-static int __init find_unisys_acpi_oem_table(unsigned long *oem_addr)
-{
- struct acpi_table_header *header = NULL;
- struct es7000_oem_table *table;
- acpi_size tbl_size;
- acpi_status ret;
- int i = 0;
-
- for (;;) {
- ret = acpi_get_table_with_size("OEM1", i++, &header, &tbl_size);
- if (!ACPI_SUCCESS(ret))
- return -1;
-
- if (!memcmp((char *) &header->oem_id, "UNISYS", 6))
- break;
-
- early_acpi_os_unmap_memory(header, tbl_size);
- }
-
- table = (void *)header;
-
- oem_addrX = table->OEMTableAddr;
- oem_size = table->OEMTableSize;
-
- early_acpi_os_unmap_memory(header, tbl_size);
-
- *oem_addr = (unsigned long)__acpi_map_table(oem_addrX, oem_size);
-
- return 0;
-}
-
-static void __init unmap_unisys_acpi_oem_table(unsigned long oem_addr)
-{
- if (!oem_addr)
- return;
-
- __acpi_unmap_table((char *)oem_addr, oem_size);
-}
-
-static int es7000_check_dsdt(void)
-{
- struct acpi_table_header header;
-
- if (ACPI_SUCCESS(acpi_get_table_header(ACPI_SIG_DSDT, 0, &header)) &&
- !strncmp(header.oem_id, "UNISYS", 6))
- return 1;
- return 0;
-}
-
-static int es7000_acpi_ret;
-
-/* Hook from generic ACPI tables.c */
-static int __init es7000_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
-{
- unsigned long oem_addr = 0;
- int check_dsdt;
- int ret = 0;
-
- /* check dsdt at first to avoid clear fix_map for oem_addr */
- check_dsdt = es7000_check_dsdt();
-
- if (!find_unisys_acpi_oem_table(&oem_addr)) {
- if (check_dsdt) {
- ret = parse_unisys_oem((char *)oem_addr);
- } else {
- setup_unisys();
- ret = 1;
- }
- /*
- * we need to unmap it
- */
- unmap_unisys_acpi_oem_table(oem_addr);
- }
-
- es7000_acpi_ret = ret;
-
- return ret && !es7000_apic_is_cluster();
-}
-
-static int es7000_acpi_madt_oem_check_cluster(char *oem_id, char *oem_table_id)
-{
- int ret = es7000_acpi_ret;
-
- return ret && es7000_apic_is_cluster();
-}
-
-#else /* !CONFIG_ACPI: */
-static int es7000_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
-{
- return 0;
-}
-
-static int es7000_acpi_madt_oem_check_cluster(char *oem_id, char *oem_table_id)
-{
- return 0;
-}
-#endif /* !CONFIG_ACPI */
-
-static void es7000_spin(int n)
-{
- int i = 0;
-
- while (i++ < n)
- rep_nop();
-}
-
-static int es7000_mip_write(struct mip_reg *mip_reg)
-{
- int status = 0;
- int spin;
-
- spin = MIP_SPIN;
- while ((host_reg->off_0x38 & MIP_VALID) != 0) {
- if (--spin <= 0) {
- WARN(1, "Timeout waiting for Host Valid Flag\n");
- return -1;
- }
- es7000_spin(MIP_SPIN);
- }
-
- memcpy(host_reg, mip_reg, sizeof(struct mip_reg));
- outb(1, mip_port);
-
- spin = MIP_SPIN;
-
- while ((mip_reg->off_0x38 & MIP_VALID) == 0) {
- if (--spin <= 0) {
- WARN(1, "Timeout waiting for MIP Valid Flag\n");
- return -1;
- }
- es7000_spin(MIP_SPIN);
- }
-
- status = (mip_reg->off_0x00 & 0xffff0000000000ULL) >> 48;
- mip_reg->off_0x38 &= ~MIP_VALID;
-
- return status;
-}
-
-static void es7000_enable_apic_mode(void)
-{
- struct mip_reg es7000_mip_reg;
- int mip_status;
-
- if (!es7000_plat)
- return;
-
- pr_info("Enabling APIC mode.\n");
- memset(&es7000_mip_reg, 0, sizeof(struct mip_reg));
- es7000_mip_reg.off_0x00 = MIP_SW_APIC;
- es7000_mip_reg.off_0x38 = MIP_VALID;
-
- while ((mip_status = es7000_mip_write(&es7000_mip_reg)) != 0)
- WARN(1, "Command failed, status = %x\n", mip_status);
-}
-
-static void es7000_wait_for_init_deassert(atomic_t *deassert)
-{
- while (!atomic_read(deassert))
- cpu_relax();
-}
-
-static unsigned int es7000_get_apic_id(unsigned long x)
-{
- return (x >> 24) & 0xFF;
-}
-
-static void es7000_send_IPI_mask(const struct cpumask *mask, int vector)
-{
- default_send_IPI_mask_sequence_phys(mask, vector);
-}
-
-static void es7000_send_IPI_allbutself(int vector)
-{
- default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector);
-}
-
-static void es7000_send_IPI_all(int vector)
-{
- es7000_send_IPI_mask(cpu_online_mask, vector);
-}
-
-static int es7000_apic_id_registered(void)
-{
- return 1;
-}
-
-static const struct cpumask *target_cpus_cluster(void)
-{
- return cpu_all_mask;
-}
-
-static const struct cpumask *es7000_target_cpus(void)
-{
- return cpumask_of(smp_processor_id());
-}
-
-static unsigned long es7000_check_apicid_used(physid_mask_t *map, int apicid)
-{
- return 0;
-}
-
-static unsigned long es7000_check_apicid_present(int bit)
-{
- return physid_isset(bit, phys_cpu_present_map);
-}
-
-static int es7000_early_logical_apicid(int cpu)
-{
- /* on es7000, logical apicid is the same as physical */
- return early_per_cpu(x86_bios_cpu_apicid, cpu);
-}
-
-static unsigned long calculate_ldr(int cpu)
-{
- unsigned long id = per_cpu(x86_bios_cpu_apicid, cpu);
-
- return SET_APIC_LOGICAL_ID(id);
-}
-
-/*
- * Set up the logical destination ID.
- *
- * Intel recommends to set DFR, LdR and TPR before enabling
- * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
- * document number 292116). So here it goes...
- */
-static void es7000_init_apic_ldr_cluster(void)
-{
- unsigned long val;
- int cpu = smp_processor_id();
-
- apic_write(APIC_DFR, APIC_DFR_CLUSTER);
- val = calculate_ldr(cpu);
- apic_write(APIC_LDR, val);
-}
-
-static void es7000_init_apic_ldr(void)
-{
- unsigned long val;
- int cpu = smp_processor_id();
-
- apic_write(APIC_DFR, APIC_DFR_FLAT);
- val = calculate_ldr(cpu);
- apic_write(APIC_LDR, val);
-}
-
-static void es7000_setup_apic_routing(void)
-{
- int apic = per_cpu(x86_bios_cpu_apicid, smp_processor_id());
-
- pr_info("Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n",
- (apic_version[apic] == 0x14) ?
- "Physical Cluster" : "Logical Cluster",
- nr_ioapics, cpumask_bits(es7000_target_cpus())[0]);
-}
-
-static int es7000_cpu_present_to_apicid(int mps_cpu)
-{
- if (!mps_cpu)
- return boot_cpu_physical_apicid;
- else if (mps_cpu < nr_cpu_ids)
- return per_cpu(x86_bios_cpu_apicid, mps_cpu);
- else
- return BAD_APICID;
-}
-
-static int cpu_id;
-
-static void es7000_apicid_to_cpu_present(int phys_apicid, physid_mask_t *retmap)
-{
- physid_set_mask_of_physid(cpu_id, retmap);
- ++cpu_id;
-}
-
-static void es7000_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
-{
- /* For clustered we don't have a good way to do this yet - hack */
- physids_promote(0xFFL, retmap);
-}
-
-static int es7000_check_phys_apicid_present(int cpu_physical_apicid)
-{
- boot_cpu_physical_apicid = read_apic_id();
- return 1;
-}
-
-static inline int
-es7000_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *dest_id)
-{
- unsigned int round = 0;
- unsigned int cpu, uninitialized_var(apicid);
-
- /*
- * The cpus in the mask must all be on the apic cluster.
- */
- for_each_cpu_and(cpu, cpumask, cpu_online_mask) {
- int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
-
- if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) {
- WARN(1, "Not a valid mask!");
-
- return -EINVAL;
- }
- apicid |= new_apicid;
- round++;
- }
- if (!round)
- return -EINVAL;
- *dest_id = apicid;
- return 0;
-}
-
-static int
-es7000_cpu_mask_to_apicid_and(const struct cpumask *inmask,
- const struct cpumask *andmask,
- unsigned int *apicid)
-{
- cpumask_var_t cpumask;
- *apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0);
-
- if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC))
- return 0;
-
- cpumask_and(cpumask, inmask, andmask);
- es7000_cpu_mask_to_apicid(cpumask, apicid);
-
- free_cpumask_var(cpumask);
-
- return 0;
-}
-
-static int es7000_phys_pkg_id(int cpuid_apic, int index_msb)
-{
- return cpuid_apic >> index_msb;
-}
-
-static int probe_es7000(void)
-{
- /* probed later in mptable/ACPI hooks */
- return 0;
-}
-
-static int es7000_mps_ret;
-static int es7000_mps_oem_check(struct mpc_table *mpc, char *oem,
- char *productid)
-{
- int ret = 0;
-
- if (mpc->oemptr) {
- struct mpc_oemtable *oem_table =
- (struct mpc_oemtable *)mpc->oemptr;
-
- if (!strncmp(oem, "UNISYS", 6))
- ret = parse_unisys_oem((char *)oem_table);
- }
-
- es7000_mps_ret = ret;
-
- return ret && !es7000_apic_is_cluster();
-}
-
-static int es7000_mps_oem_check_cluster(struct mpc_table *mpc, char *oem,
- char *productid)
-{
- int ret = es7000_mps_ret;
-
- return ret && es7000_apic_is_cluster();
-}
-
-/* We've been warned by a false positive warning.Use __refdata to keep calm. */
-static struct apic __refdata apic_es7000_cluster = {
-
- .name = "es7000",
- .probe = probe_es7000,
- .acpi_madt_oem_check = es7000_acpi_madt_oem_check_cluster,
- .apic_id_valid = default_apic_id_valid,
- .apic_id_registered = es7000_apic_id_registered,
-
- .irq_delivery_mode = dest_LowestPrio,
- /* logical delivery broadcast to all procs: */
- .irq_dest_mode = 1,
-
- .target_cpus = target_cpus_cluster,
- .disable_esr = 1,
- .dest_logical = 0,
- .check_apicid_used = es7000_check_apicid_used,
- .check_apicid_present = es7000_check_apicid_present,
-
- .vector_allocation_domain = flat_vector_allocation_domain,
- .init_apic_ldr = es7000_init_apic_ldr_cluster,
-
- .ioapic_phys_id_map = es7000_ioapic_phys_id_map,
- .setup_apic_routing = es7000_setup_apic_routing,
- .multi_timer_check = NULL,
- .cpu_present_to_apicid = es7000_cpu_present_to_apicid,
- .apicid_to_cpu_present = es7000_apicid_to_cpu_present,
- .setup_portio_remap = NULL,
- .check_phys_apicid_present = es7000_check_phys_apicid_present,
- .enable_apic_mode = es7000_enable_apic_mode,
- .phys_pkg_id = es7000_phys_pkg_id,
- .mps_oem_check = es7000_mps_oem_check_cluster,
-
- .get_apic_id = es7000_get_apic_id,
- .set_apic_id = NULL,
- .apic_id_mask = 0xFF << 24,
-
- .cpu_mask_to_apicid_and = es7000_cpu_mask_to_apicid_and,
-
- .send_IPI_mask = es7000_send_IPI_mask,
- .send_IPI_mask_allbutself = NULL,
- .send_IPI_allbutself = es7000_send_IPI_allbutself,
- .send_IPI_all = es7000_send_IPI_all,
- .send_IPI_self = default_send_IPI_self,
-
- .wakeup_secondary_cpu = wakeup_secondary_cpu_via_mip,
-
- .trampoline_phys_low = 0x467,
- .trampoline_phys_high = 0x469,
-
- .wait_for_init_deassert = NULL,
-
- /* Nothing to do for most platforms, since cleared by the INIT cycle: */
- .smp_callin_clear_local_apic = NULL,
- .inquire_remote_apic = default_inquire_remote_apic,
-
- .read = native_apic_mem_read,
- .write = native_apic_mem_write,
- .eoi_write = native_apic_mem_write,
- .icr_read = native_apic_icr_read,
- .icr_write = native_apic_icr_write,
- .wait_icr_idle = native_apic_wait_icr_idle,
- .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
-
- .x86_32_early_logical_apicid = es7000_early_logical_apicid,
-};
-
-static struct apic __refdata apic_es7000 = {
-
- .name = "es7000",
- .probe = probe_es7000,
- .acpi_madt_oem_check = es7000_acpi_madt_oem_check,
- .apic_id_valid = default_apic_id_valid,
- .apic_id_registered = es7000_apic_id_registered,
-
- .irq_delivery_mode = dest_Fixed,
- /* phys delivery to target CPUs: */
- .irq_dest_mode = 0,
-
- .target_cpus = es7000_target_cpus,
- .disable_esr = 1,
- .dest_logical = 0,
- .check_apicid_used = es7000_check_apicid_used,
- .check_apicid_present = es7000_check_apicid_present,
-
- .vector_allocation_domain = flat_vector_allocation_domain,
- .init_apic_ldr = es7000_init_apic_ldr,
-
- .ioapic_phys_id_map = es7000_ioapic_phys_id_map,
- .setup_apic_routing = es7000_setup_apic_routing,
- .multi_timer_check = NULL,
- .cpu_present_to_apicid = es7000_cpu_present_to_apicid,
- .apicid_to_cpu_present = es7000_apicid_to_cpu_present,
- .setup_portio_remap = NULL,
- .check_phys_apicid_present = es7000_check_phys_apicid_present,
- .enable_apic_mode = es7000_enable_apic_mode,
- .phys_pkg_id = es7000_phys_pkg_id,
- .mps_oem_check = es7000_mps_oem_check,
-
- .get_apic_id = es7000_get_apic_id,
- .set_apic_id = NULL,
- .apic_id_mask = 0xFF << 24,
-
- .cpu_mask_to_apicid_and = es7000_cpu_mask_to_apicid_and,
-
- .send_IPI_mask = es7000_send_IPI_mask,
- .send_IPI_mask_allbutself = NULL,
- .send_IPI_allbutself = es7000_send_IPI_allbutself,
- .send_IPI_all = es7000_send_IPI_all,
- .send_IPI_self = default_send_IPI_self,
-
- .trampoline_phys_low = 0x467,
- .trampoline_phys_high = 0x469,
-
- .wait_for_init_deassert = es7000_wait_for_init_deassert,
-
- /* Nothing to do for most platforms, since cleared by the INIT cycle: */
- .smp_callin_clear_local_apic = NULL,
- .inquire_remote_apic = default_inquire_remote_apic,
-
- .read = native_apic_mem_read,
- .write = native_apic_mem_write,
- .eoi_write = native_apic_mem_write,
- .icr_read = native_apic_icr_read,
- .icr_write = native_apic_icr_write,
- .wait_icr_idle = native_apic_wait_icr_idle,
- .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
-
- .x86_32_early_logical_apicid = es7000_early_logical_apicid,
-};
-
-/*
- * Need to check for es7000 followed by es7000_cluster, so this order
- * in apic_drivers is important.
- */
-apic_drivers(apic_es7000, apic_es7000_cluster);
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index a698d7165c9..6a1e71bde32 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -33,34 +33,44 @@ static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
/* "in progress" flag of arch_trigger_all_cpu_backtrace */
static unsigned long backtrace_flag;
-void arch_trigger_all_cpu_backtrace(void)
+void arch_trigger_all_cpu_backtrace(bool include_self)
{
int i;
+ int cpu = get_cpu();
- if (test_and_set_bit(0, &backtrace_flag))
+ if (test_and_set_bit(0, &backtrace_flag)) {
/*
* If there is already a trigger_all_cpu_backtrace() in progress
* (backtrace_flag == 1), don't output double cpu dump infos.
*/
+ put_cpu();
return;
+ }
cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
+ if (!include_self)
+ cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
- printk(KERN_INFO "sending NMI to all CPUs:\n");
- apic->send_IPI_all(NMI_VECTOR);
+ if (!cpumask_empty(to_cpumask(backtrace_mask))) {
+ pr_info("sending NMI to %s CPUs:\n",
+ (include_self ? "all" : "other"));
+ apic->send_IPI_mask(to_cpumask(backtrace_mask), NMI_VECTOR);
+ }
/* Wait for up to 10 seconds for all CPUs to do the backtrace */
for (i = 0; i < 10 * 1000; i++) {
if (cpumask_empty(to_cpumask(backtrace_mask)))
break;
mdelay(1);
+ touch_softlockup_watchdog();
}
clear_bit(0, &backtrace_flag);
- smp_mb__after_clear_bit();
+ smp_mb__after_atomic();
+ put_cpu();
}
-static int __kprobes
+static int
arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs)
{
int cpu;
@@ -80,6 +90,7 @@ arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs)
return NMI_DONE;
}
+NOKPROBE_SYMBOL(arch_trigger_all_cpu_backtrace_handler);
static int __init register_trigger_all_cpu_backtrace(void)
{
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index e63a5bd2a78..81e08eff05e 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -37,9 +37,6 @@
#include <linux/kthread.h>
#include <linux/jiffies.h> /* time_after() */
#include <linux/slab.h>
-#ifdef CONFIG_ACPI
-#include <acpi/acpi_bus.h>
-#endif
#include <linux/bootmem.h>
#include <linux/dmar.h>
#include <linux/hpet.h>
@@ -209,9 +206,6 @@ int __init arch_early_irq_init(void)
count = ARRAY_SIZE(irq_cfgx);
node = cpu_to_node(0);
- /* Make sure the legacy interrupts are marked in the bitmap */
- irq_reserve_irqs(0, legacy_pic->nr_legacy_irqs);
-
for (i = 0; i < count; i++) {
irq_set_chip_data(i, &cfg[i]);
zalloc_cpumask_var_node(&cfg[i].domain, GFP_KERNEL, node);
@@ -284,18 +278,6 @@ static struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node)
return cfg;
}
-static int alloc_irqs_from(unsigned int from, unsigned int count, int node)
-{
- return irq_alloc_descs_from(from, count, node);
-}
-
-static void free_irq_at(unsigned int at, struct irq_cfg *cfg)
-{
- free_irq_cfg(at, cfg);
- irq_free_desc(at);
-}
-
-
struct io_apic {
unsigned int index;
unsigned int unused[3];
@@ -1142,9 +1124,10 @@ next:
if (test_bit(vector, used_vectors))
goto next;
- for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
- if (per_cpu(vector_irq, new_cpu)[vector] != -1)
+ for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) {
+ if (per_cpu(vector_irq, new_cpu)[vector] > VECTOR_UNDEFINED)
goto next;
+ }
/* Found one! */
current_vector = vector;
current_offset = offset;
@@ -1183,7 +1166,7 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
vector = cfg->vector;
for_each_cpu_and(cpu, cfg->domain, cpu_online_mask)
- per_cpu(vector_irq, cpu)[vector] = -1;
+ per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED;
cfg->vector = 0;
cpumask_clear(cfg->domain);
@@ -1191,11 +1174,10 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
if (likely(!cfg->move_in_progress))
return;
for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) {
- for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS;
- vector++) {
+ for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
if (per_cpu(vector_irq, cpu)[vector] != irq)
continue;
- per_cpu(vector_irq, cpu)[vector] = -1;
+ per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED;
break;
}
}
@@ -1228,12 +1210,12 @@ void __setup_vector_irq(int cpu)
/* Mark the free vectors */
for (vector = 0; vector < NR_VECTORS; ++vector) {
irq = per_cpu(vector_irq, cpu)[vector];
- if (irq < 0)
+ if (irq <= VECTOR_UNDEFINED)
continue;
cfg = irq_cfg(irq);
if (!cpumask_test_cpu(cpu, cfg->domain))
- per_cpu(vector_irq, cpu)[vector] = -1;
+ per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED;
}
raw_spin_unlock(&vector_lock);
}
@@ -2192,7 +2174,7 @@ void send_cleanup_vector(struct irq_cfg *cfg)
cfg->move_in_progress = 0;
}
-asmlinkage void smp_irq_move_cleanup_interrupt(void)
+asmlinkage __visible void smp_irq_move_cleanup_interrupt(void)
{
unsigned vector, me;
@@ -2202,13 +2184,13 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
me = smp_processor_id();
for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
- unsigned int irq;
+ int irq;
unsigned int irr;
struct irq_desc *desc;
struct irq_cfg *cfg;
irq = __this_cpu_read(vector_irq[vector]);
- if (irq == -1)
+ if (irq <= VECTOR_UNDEFINED)
continue;
desc = irq_to_desc(irq);
@@ -2315,7 +2297,7 @@ int __ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
int err;
if (!config_enabled(CONFIG_SMP))
- return -1;
+ return -EPERM;
if (!cpumask_intersects(mask, cpu_online_mask))
return -EINVAL;
@@ -2346,7 +2328,7 @@ int native_ioapic_set_affinity(struct irq_data *data,
int ret;
if (!config_enabled(CONFIG_SMP))
- return -1;
+ return -EPERM;
raw_spin_lock_irqsave(&ioapic_lock, flags);
ret = __ioapic_set_affinity(data, mask, &dest);
@@ -2919,98 +2901,39 @@ static int __init ioapic_init_ops(void)
device_initcall(ioapic_init_ops);
/*
- * Dynamic irq allocate and deallocation
+ * Dynamic irq allocate and deallocation. Should be replaced by irq domains!
*/
-unsigned int __create_irqs(unsigned int from, unsigned int count, int node)
+int arch_setup_hwirq(unsigned int irq, int node)
{
- struct irq_cfg **cfg;
+ struct irq_cfg *cfg;
unsigned long flags;
- int irq, i;
-
- if (from < nr_irqs_gsi)
- from = nr_irqs_gsi;
+ int ret;
- cfg = kzalloc_node(count * sizeof(cfg[0]), GFP_KERNEL, node);
+ cfg = alloc_irq_cfg(irq, node);
if (!cfg)
- return 0;
-
- irq = alloc_irqs_from(from, count, node);
- if (irq < 0)
- goto out_cfgs;
-
- for (i = 0; i < count; i++) {
- cfg[i] = alloc_irq_cfg(irq + i, node);
- if (!cfg[i])
- goto out_irqs;
- }
+ return -ENOMEM;
raw_spin_lock_irqsave(&vector_lock, flags);
- for (i = 0; i < count; i++)
- if (__assign_irq_vector(irq + i, cfg[i], apic->target_cpus()))
- goto out_vecs;
+ ret = __assign_irq_vector(irq, cfg, apic->target_cpus());
raw_spin_unlock_irqrestore(&vector_lock, flags);
- for (i = 0; i < count; i++) {
- irq_set_chip_data(irq + i, cfg[i]);
- irq_clear_status_flags(irq + i, IRQ_NOREQUEST);
- }
-
- kfree(cfg);
- return irq;
-
-out_vecs:
- for (i--; i >= 0; i--)
- __clear_irq_vector(irq + i, cfg[i]);
- raw_spin_unlock_irqrestore(&vector_lock, flags);
-out_irqs:
- for (i = 0; i < count; i++)
- free_irq_at(irq + i, cfg[i]);
-out_cfgs:
- kfree(cfg);
- return 0;
-}
-
-unsigned int create_irq_nr(unsigned int from, int node)
-{
- return __create_irqs(from, 1, node);
-}
-
-int create_irq(void)
-{
- int node = cpu_to_node(0);
- unsigned int irq_want;
- int irq;
-
- irq_want = nr_irqs_gsi;
- irq = create_irq_nr(irq_want, node);
-
- if (irq == 0)
- irq = -1;
-
- return irq;
+ if (!ret)
+ irq_set_chip_data(irq, cfg);
+ else
+ free_irq_cfg(irq, cfg);
+ return ret;
}
-void destroy_irq(unsigned int irq)
+void arch_teardown_hwirq(unsigned int irq)
{
struct irq_cfg *cfg = irq_get_chip_data(irq);
unsigned long flags;
- irq_set_status_flags(irq, IRQ_NOREQUEST|IRQ_NOPROBE);
-
free_remapped_irq(irq);
-
raw_spin_lock_irqsave(&vector_lock, flags);
__clear_irq_vector(irq, cfg);
raw_spin_unlock_irqrestore(&vector_lock, flags);
- free_irq_at(irq, cfg);
-}
-
-void destroy_irqs(unsigned int irq, unsigned int count)
-{
- unsigned int i;
-
- for (i = 0; i < count; i++)
- destroy_irq(irq + i);
+ free_irq_cfg(irq, cfg);
}
/*
@@ -3078,9 +3001,11 @@ msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
struct irq_cfg *cfg = data->chip_data;
struct msi_msg msg;
unsigned int dest;
+ int ret;
- if (__ioapic_set_affinity(data, mask, &dest))
- return -1;
+ ret = __ioapic_set_affinity(data, mask, &dest);
+ if (ret)
+ return ret;
__get_cached_msi_msg(data->msi_desc, &msg);
@@ -3139,8 +3064,8 @@ int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc,
int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
{
- unsigned int irq, irq_want;
struct msi_desc *msidesc;
+ unsigned int irq;
int node, ret;
/* Multiple MSI vectors only supported with interrupt remapping */
@@ -3148,28 +3073,25 @@ int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
return 1;
node = dev_to_node(&dev->dev);
- irq_want = nr_irqs_gsi;
+
list_for_each_entry(msidesc, &dev->msi_list, list) {
- irq = create_irq_nr(irq_want, node);
- if (irq == 0)
+ irq = irq_alloc_hwirq(node);
+ if (!irq)
return -ENOSPC;
- irq_want = irq + 1;
-
ret = setup_msi_irq(dev, msidesc, irq, 0);
- if (ret < 0)
- goto error;
+ if (ret < 0) {
+ irq_free_hwirq(irq);
+ return ret;
+ }
+
}
return 0;
-
-error:
- destroy_irq(irq);
- return ret;
}
void native_teardown_msi_irq(unsigned int irq)
{
- destroy_irq(irq);
+ irq_free_hwirq(irq);
}
#ifdef CONFIG_DMAR_TABLE
@@ -3180,9 +3102,11 @@ dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask,
struct irq_cfg *cfg = data->chip_data;
unsigned int dest, irq = data->irq;
struct msi_msg msg;
+ int ret;
- if (__ioapic_set_affinity(data, mask, &dest))
- return -1;
+ ret = __ioapic_set_affinity(data, mask, &dest);
+ if (ret)
+ return ret;
dmar_msi_read(irq, &msg);
@@ -3229,9 +3153,11 @@ static int hpet_msi_set_affinity(struct irq_data *data,
struct irq_cfg *cfg = data->chip_data;
struct msi_msg msg;
unsigned int dest;
+ int ret;
- if (__ioapic_set_affinity(data, mask, &dest))
- return -1;
+ ret = __ioapic_set_affinity(data, mask, &dest);
+ if (ret)
+ return ret;
hpet_msi_read(data->handler_data, &msg);
@@ -3298,9 +3224,11 @@ ht_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
{
struct irq_cfg *cfg = data->chip_data;
unsigned int dest;
+ int ret;
- if (__ioapic_set_affinity(data, mask, &dest))
- return -1;
+ ret = __ioapic_set_affinity(data, mask, &dest);
+ if (ret)
+ return ret;
target_ht_irq(data->irq, dest, cfg->vector);
return IRQ_SET_MASK_OK_NOCOPY;
@@ -3423,9 +3351,9 @@ static void __init probe_nr_irqs_gsi(void)
printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi);
}
-int get_nr_irqs_gsi(void)
+unsigned int arch_dynirq_lower_bound(unsigned int from)
{
- return nr_irqs_gsi;
+ return from < nr_irqs_gsi ? nr_irqs_gsi : from;
}
int __init arch_probe_nr_irqs(void)
diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c
index 7434d8556d0..62071569bd5 100644
--- a/arch/x86/kernel/apic/ipi.c
+++ b/arch/x86/kernel/apic/ipi.c
@@ -1,6 +1,5 @@
#include <linux/cpumask.h>
#include <linux/interrupt.h>
-#include <linux/init.h>
#include <linux/mm.h>
#include <linux/delay.h>
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
deleted file mode 100644
index 1e42e8f305e..00000000000
--- a/arch/x86/kernel/apic/numaq_32.c
+++ /dev/null
@@ -1,525 +0,0 @@
-/*
- * Written by: Patricia Gaughen, IBM Corporation
- *
- * Copyright (C) 2002, IBM Corp.
- * Copyright (C) 2009, Red Hat, Inc., Ingo Molnar
- *
- * All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT. See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Send feedback to <gone@us.ibm.com>
- */
-#include <linux/nodemask.h>
-#include <linux/topology.h>
-#include <linux/bootmem.h>
-#include <linux/memblock.h>
-#include <linux/threads.h>
-#include <linux/cpumask.h>
-#include <linux/kernel.h>
-#include <linux/mmzone.h>
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/init.h>
-#include <linux/numa.h>
-#include <linux/smp.h>
-#include <linux/io.h>
-#include <linux/mm.h>
-
-#include <asm/processor.h>
-#include <asm/fixmap.h>
-#include <asm/mpspec.h>
-#include <asm/numaq.h>
-#include <asm/setup.h>
-#include <asm/apic.h>
-#include <asm/e820.h>
-#include <asm/ipi.h>
-
-int found_numaq;
-
-/*
- * Have to match translation table entries to main table entries by counter
- * hence the mpc_record variable .... can't see a less disgusting way of
- * doing this ....
- */
-struct mpc_trans {
- unsigned char mpc_type;
- unsigned char trans_len;
- unsigned char trans_type;
- unsigned char trans_quad;
- unsigned char trans_global;
- unsigned char trans_local;
- unsigned short trans_reserved;
-};
-
-static int mpc_record;
-
-static struct mpc_trans *translation_table[MAX_MPC_ENTRY];
-
-int mp_bus_id_to_node[MAX_MP_BUSSES];
-int mp_bus_id_to_local[MAX_MP_BUSSES];
-int quad_local_to_mp_bus_id[NR_CPUS/4][4];
-
-
-static inline void numaq_register_node(int node, struct sys_cfg_data *scd)
-{
- struct eachquadmem *eq = scd->eq + node;
- u64 start = (u64)(eq->hi_shrd_mem_start - eq->priv_mem_size) << 20;
- u64 end = (u64)(eq->hi_shrd_mem_start + eq->hi_shrd_mem_size) << 20;
- int ret;
-
- node_set(node, numa_nodes_parsed);
- ret = numa_add_memblk(node, start, end);
- BUG_ON(ret < 0);
-}
-
-/*
- * Function: smp_dump_qct()
- *
- * Description: gets memory layout from the quad config table. This
- * function also updates numa_nodes_parsed with the nodes (quads) present.
- */
-static void __init smp_dump_qct(void)
-{
- struct sys_cfg_data *scd;
- int node;
-
- scd = (void *)__va(SYS_CFG_DATA_PRIV_ADDR);
-
- for_each_node(node) {
- if (scd->quads_present31_0 & (1 << node))
- numaq_register_node(node, scd);
- }
-}
-
-void numaq_tsc_disable(void)
-{
- if (!found_numaq)
- return;
-
- if (num_online_nodes() > 1) {
- printk(KERN_DEBUG "NUMAQ: disabling TSC\n");
- setup_clear_cpu_cap(X86_FEATURE_TSC);
- }
-}
-
-static void __init numaq_tsc_init(void)
-{
- numaq_tsc_disable();
-}
-
-static inline int generate_logical_apicid(int quad, int phys_apicid)
-{
- return (quad << 4) + (phys_apicid ? phys_apicid << 1 : 1);
-}
-
-/* x86_quirks member */
-static int mpc_apic_id(struct mpc_cpu *m)
-{
- int quad = translation_table[mpc_record]->trans_quad;
- int logical_apicid = generate_logical_apicid(quad, m->apicid);
-
- printk(KERN_DEBUG
- "Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n",
- m->apicid, (m->cpufeature & CPU_FAMILY_MASK) >> 8,
- (m->cpufeature & CPU_MODEL_MASK) >> 4,
- m->apicver, quad, logical_apicid);
-
- return logical_apicid;
-}
-
-/* x86_quirks member */
-static void mpc_oem_bus_info(struct mpc_bus *m, char *name)
-{
- int quad = translation_table[mpc_record]->trans_quad;
- int local = translation_table[mpc_record]->trans_local;
-
- mp_bus_id_to_node[m->busid] = quad;
- mp_bus_id_to_local[m->busid] = local;
-
- printk(KERN_INFO "Bus #%d is %s (node %d)\n", m->busid, name, quad);
-}
-
-/* x86_quirks member */
-static void mpc_oem_pci_bus(struct mpc_bus *m)
-{
- int quad = translation_table[mpc_record]->trans_quad;
- int local = translation_table[mpc_record]->trans_local;
-
- quad_local_to_mp_bus_id[quad][local] = m->busid;
-}
-
-/*
- * Called from mpparse code.
- * mode = 0: prescan
- * mode = 1: one mpc entry scanned
- */
-static void numaq_mpc_record(unsigned int mode)
-{
- if (!mode)
- mpc_record = 0;
- else
- mpc_record++;
-}
-
-static void __init MP_translation_info(struct mpc_trans *m)
-{
- printk(KERN_INFO
- "Translation: record %d, type %d, quad %d, global %d, local %d\n",
- mpc_record, m->trans_type, m->trans_quad, m->trans_global,
- m->trans_local);
-
- if (mpc_record >= MAX_MPC_ENTRY)
- printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
- else
- translation_table[mpc_record] = m; /* stash this for later */
-
- if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
- node_set_online(m->trans_quad);
-}
-
-static int __init mpf_checksum(unsigned char *mp, int len)
-{
- int sum = 0;
-
- while (len--)
- sum += *mp++;
-
- return sum & 0xFF;
-}
-
-/*
- * Read/parse the MPC oem tables
- */
-static void __init smp_read_mpc_oem(struct mpc_table *mpc)
-{
- struct mpc_oemtable *oemtable = (void *)(long)mpc->oemptr;
- int count = sizeof(*oemtable); /* the header size */
- unsigned char *oemptr = ((unsigned char *)oemtable) + count;
-
- mpc_record = 0;
- printk(KERN_INFO
- "Found an OEM MPC table at %8p - parsing it...\n", oemtable);
-
- if (memcmp(oemtable->signature, MPC_OEM_SIGNATURE, 4)) {
- printk(KERN_WARNING
- "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
- oemtable->signature[0], oemtable->signature[1],
- oemtable->signature[2], oemtable->signature[3]);
- return;
- }
-
- if (mpf_checksum((unsigned char *)oemtable, oemtable->length)) {
- printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
- return;
- }
-
- while (count < oemtable->length) {
- switch (*oemptr) {
- case MP_TRANSLATION:
- {
- struct mpc_trans *m = (void *)oemptr;
-
- MP_translation_info(m);
- oemptr += sizeof(*m);
- count += sizeof(*m);
- ++mpc_record;
- break;
- }
- default:
- printk(KERN_WARNING
- "Unrecognised OEM table entry type! - %d\n",
- (int)*oemptr);
- return;
- }
- }
-}
-
-static __init void early_check_numaq(void)
-{
- /*
- * get boot-time SMP configuration:
- */
- if (smp_found_config)
- early_get_smp_config();
-
- if (found_numaq) {
- x86_init.mpparse.mpc_record = numaq_mpc_record;
- x86_init.mpparse.setup_ioapic_ids = x86_init_noop;
- x86_init.mpparse.mpc_apic_id = mpc_apic_id;
- x86_init.mpparse.smp_read_mpc_oem = smp_read_mpc_oem;
- x86_init.mpparse.mpc_oem_pci_bus = mpc_oem_pci_bus;
- x86_init.mpparse.mpc_oem_bus_info = mpc_oem_bus_info;
- x86_init.timers.tsc_pre_init = numaq_tsc_init;
- x86_init.pci.init = pci_numaq_init;
- }
-}
-
-int __init numaq_numa_init(void)
-{
- early_check_numaq();
- if (!found_numaq)
- return -ENOENT;
- smp_dump_qct();
-
- return 0;
-}
-
-#define NUMAQ_APIC_DFR_VALUE (APIC_DFR_CLUSTER)
-
-static inline unsigned int numaq_get_apic_id(unsigned long x)
-{
- return (x >> 24) & 0x0F;
-}
-
-static inline void numaq_send_IPI_mask(const struct cpumask *mask, int vector)
-{
- default_send_IPI_mask_sequence_logical(mask, vector);
-}
-
-static inline void numaq_send_IPI_allbutself(int vector)
-{
- default_send_IPI_mask_allbutself_logical(cpu_online_mask, vector);
-}
-
-static inline void numaq_send_IPI_all(int vector)
-{
- numaq_send_IPI_mask(cpu_online_mask, vector);
-}
-
-#define NUMAQ_TRAMPOLINE_PHYS_LOW (0x8)
-#define NUMAQ_TRAMPOLINE_PHYS_HIGH (0xa)
-
-/*
- * Because we use NMIs rather than the INIT-STARTUP sequence to
- * bootstrap the CPUs, the APIC may be in a weird state. Kick it:
- */
-static inline void numaq_smp_callin_clear_local_apic(void)
-{
- clear_local_APIC();
-}
-
-static inline const struct cpumask *numaq_target_cpus(void)
-{
- return cpu_all_mask;
-}
-
-static unsigned long numaq_check_apicid_used(physid_mask_t *map, int apicid)
-{
- return physid_isset(apicid, *map);
-}
-
-static inline unsigned long numaq_check_apicid_present(int bit)
-{
- return physid_isset(bit, phys_cpu_present_map);
-}
-
-static inline int numaq_apic_id_registered(void)
-{
- return 1;
-}
-
-static inline void numaq_init_apic_ldr(void)
-{
- /* Already done in NUMA-Q firmware */
-}
-
-static inline void numaq_setup_apic_routing(void)
-{
- printk(KERN_INFO
- "Enabling APIC mode: NUMA-Q. Using %d I/O APICs\n",
- nr_ioapics);
-}
-
-/*
- * Skip adding the timer int on secondary nodes, which causes
- * a small but painful rift in the time-space continuum.
- */
-static inline int numaq_multi_timer_check(int apic, int irq)
-{
- return apic != 0 && irq == 0;
-}
-
-static inline void numaq_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
-{
- /* We don't have a good way to do this yet - hack */
- return physids_promote(0xFUL, retmap);
-}
-
-/*
- * Supporting over 60 cpus on NUMA-Q requires a locality-dependent
- * cpu to APIC ID relation to properly interact with the intelligent
- * mode of the cluster controller.
- */
-static inline int numaq_cpu_present_to_apicid(int mps_cpu)
-{
- if (mps_cpu < 60)
- return ((mps_cpu >> 2) << 4) | (1 << (mps_cpu & 0x3));
- else
- return BAD_APICID;
-}
-
-static inline int numaq_apicid_to_node(int logical_apicid)
-{
- return logical_apicid >> 4;
-}
-
-static int numaq_numa_cpu_node(int cpu)
-{
- int logical_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
-
- if (logical_apicid != BAD_APICID)
- return numaq_apicid_to_node(logical_apicid);
- return NUMA_NO_NODE;
-}
-
-static void numaq_apicid_to_cpu_present(int logical_apicid, physid_mask_t *retmap)
-{
- int node = numaq_apicid_to_node(logical_apicid);
- int cpu = __ffs(logical_apicid & 0xf);
-
- physid_set_mask_of_physid(cpu + 4*node, retmap);
-}
-
-/* Where the IO area was mapped on multiquad, always 0 otherwise */
-void *xquad_portio;
-
-static inline int numaq_check_phys_apicid_present(int phys_apicid)
-{
- return 1;
-}
-
-/*
- * We use physical apicids here, not logical, so just return the default
- * physical broadcast to stop people from breaking us
- */
-static int
-numaq_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
- const struct cpumask *andmask,
- unsigned int *apicid)
-{
- *apicid = 0x0F;
- return 0;
-}
-
-/* No NUMA-Q box has a HT CPU, but it can't hurt to use the default code. */
-static inline int numaq_phys_pkg_id(int cpuid_apic, int index_msb)
-{
- return cpuid_apic >> index_msb;
-}
-
-static int
-numaq_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid)
-{
- if (strncmp(oem, "IBM NUMA", 8))
- printk(KERN_ERR "Warning! Not a NUMA-Q system!\n");
- else
- found_numaq = 1;
-
- return found_numaq;
-}
-
-static int probe_numaq(void)
-{
- /* already know from get_memcfg_numaq() */
- return found_numaq;
-}
-
-static void numaq_setup_portio_remap(void)
-{
- int num_quads = num_online_nodes();
-
- if (num_quads <= 1)
- return;
-
- printk(KERN_INFO
- "Remapping cross-quad port I/O for %d quads\n", num_quads);
-
- xquad_portio = ioremap(XQUAD_PORTIO_BASE, num_quads*XQUAD_PORTIO_QUAD);
-
- printk(KERN_INFO
- "xquad_portio vaddr 0x%08lx, len %08lx\n",
- (u_long) xquad_portio, (u_long) num_quads*XQUAD_PORTIO_QUAD);
-}
-
-/* Use __refdata to keep false positive warning calm. */
-static struct apic __refdata apic_numaq = {
-
- .name = "NUMAQ",
- .probe = probe_numaq,
- .acpi_madt_oem_check = NULL,
- .apic_id_valid = default_apic_id_valid,
- .apic_id_registered = numaq_apic_id_registered,
-
- .irq_delivery_mode = dest_LowestPrio,
- /* physical delivery on LOCAL quad: */
- .irq_dest_mode = 0,
-
- .target_cpus = numaq_target_cpus,
- .disable_esr = 1,
- .dest_logical = APIC_DEST_LOGICAL,
- .check_apicid_used = numaq_check_apicid_used,
- .check_apicid_present = numaq_check_apicid_present,
-
- .vector_allocation_domain = flat_vector_allocation_domain,
- .init_apic_ldr = numaq_init_apic_ldr,
-
- .ioapic_phys_id_map = numaq_ioapic_phys_id_map,
- .setup_apic_routing = numaq_setup_apic_routing,
- .multi_timer_check = numaq_multi_timer_check,
- .cpu_present_to_apicid = numaq_cpu_present_to_apicid,
- .apicid_to_cpu_present = numaq_apicid_to_cpu_present,
- .setup_portio_remap = numaq_setup_portio_remap,
- .check_phys_apicid_present = numaq_check_phys_apicid_present,
- .enable_apic_mode = NULL,
- .phys_pkg_id = numaq_phys_pkg_id,
- .mps_oem_check = numaq_mps_oem_check,
-
- .get_apic_id = numaq_get_apic_id,
- .set_apic_id = NULL,
- .apic_id_mask = 0x0F << 24,
-
- .cpu_mask_to_apicid_and = numaq_cpu_mask_to_apicid_and,
-
- .send_IPI_mask = numaq_send_IPI_mask,
- .send_IPI_mask_allbutself = NULL,
- .send_IPI_allbutself = numaq_send_IPI_allbutself,
- .send_IPI_all = numaq_send_IPI_all,
- .send_IPI_self = default_send_IPI_self,
-
- .wakeup_secondary_cpu = wakeup_secondary_cpu_via_nmi,
- .trampoline_phys_low = NUMAQ_TRAMPOLINE_PHYS_LOW,
- .trampoline_phys_high = NUMAQ_TRAMPOLINE_PHYS_HIGH,
-
- /* We don't do anything here because we use NMI's to boot instead */
- .wait_for_init_deassert = NULL,
-
- .smp_callin_clear_local_apic = numaq_smp_callin_clear_local_apic,
- .inquire_remote_apic = NULL,
-
- .read = native_apic_mem_read,
- .write = native_apic_mem_write,
- .eoi_write = native_apic_mem_write,
- .icr_read = native_apic_icr_read,
- .icr_write = native_apic_icr_write,
- .wait_icr_idle = native_apic_wait_icr_idle,
- .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
-
- .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid,
- .x86_32_numa_cpu_node = numaq_numa_cpu_node,
-};
-
-apic_driver(apic_numaq);
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index eb35ef9ee63..cceb352c968 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -119,8 +119,7 @@ static struct apic apic_default = {
.trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
.trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
- .wait_for_init_deassert = default_wait_for_init_deassert,
-
+ .wait_for_init_deassert = true,
.smp_callin_clear_local_apic = NULL,
.inquire_remote_apic = default_inquire_remote_apic,
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
deleted file mode 100644
index 77c95c0e1bf..00000000000
--- a/arch/x86/kernel/apic/summit_32.c
+++ /dev/null
@@ -1,552 +0,0 @@
-/*
- * IBM Summit-Specific Code
- *
- * Written By: Matthew Dobson, IBM Corporation
- *
- * Copyright (c) 2003 IBM Corp.
- *
- * All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or (at
- * your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT. See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Send feedback to <colpatch@us.ibm.com>
- *
- */
-
-#define pr_fmt(fmt) "summit: %s: " fmt, __func__
-
-#include <linux/mm.h>
-#include <linux/init.h>
-#include <asm/io.h>
-#include <asm/bios_ebda.h>
-
-/*
- * APIC driver for the IBM "Summit" chipset.
- */
-#include <linux/threads.h>
-#include <linux/cpumask.h>
-#include <asm/mpspec.h>
-#include <asm/apic.h>
-#include <asm/smp.h>
-#include <asm/fixmap.h>
-#include <asm/apicdef.h>
-#include <asm/ipi.h>
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/gfp.h>
-#include <linux/smp.h>
-
-static unsigned summit_get_apic_id(unsigned long x)
-{
- return (x >> 24) & 0xFF;
-}
-
-static inline void summit_send_IPI_mask(const struct cpumask *mask, int vector)
-{
- default_send_IPI_mask_sequence_logical(mask, vector);
-}
-
-static void summit_send_IPI_allbutself(int vector)
-{
- default_send_IPI_mask_allbutself_logical(cpu_online_mask, vector);
-}
-
-static void summit_send_IPI_all(int vector)
-{
- summit_send_IPI_mask(cpu_online_mask, vector);
-}
-
-#include <asm/tsc.h>
-
-extern int use_cyclone;
-
-#ifdef CONFIG_X86_SUMMIT_NUMA
-static void setup_summit(void);
-#else
-static inline void setup_summit(void) {}
-#endif
-
-static int summit_mps_oem_check(struct mpc_table *mpc, char *oem,
- char *productid)
-{
- if (!strncmp(oem, "IBM ENSW", 8) &&
- (!strncmp(productid, "VIGIL SMP", 9)
- || !strncmp(productid, "EXA", 3)
- || !strncmp(productid, "RUTHLESS SMP", 12))){
- mark_tsc_unstable("Summit based system");
- use_cyclone = 1; /*enable cyclone-timer*/
- setup_summit();
- return 1;
- }
- return 0;
-}
-
-/* Hook from generic ACPI tables.c */
-static int summit_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
-{
- if (!strncmp(oem_id, "IBM", 3) &&
- (!strncmp(oem_table_id, "SERVIGIL", 8)
- || !strncmp(oem_table_id, "EXA", 3))){
- mark_tsc_unstable("Summit based system");
- use_cyclone = 1; /*enable cyclone-timer*/
- setup_summit();
- return 1;
- }
- return 0;
-}
-
-struct rio_table_hdr {
- unsigned char version; /* Version number of this data structure */
- /* Version 3 adds chassis_num & WP_index */
- unsigned char num_scal_dev; /* # of Scalability devices (Twisters for Vigil) */
- unsigned char num_rio_dev; /* # of RIO I/O devices (Cyclones and Winnipegs) */
-} __attribute__((packed));
-
-struct scal_detail {
- unsigned char node_id; /* Scalability Node ID */
- unsigned long CBAR; /* Address of 1MB register space */
- unsigned char port0node; /* Node ID port connected to: 0xFF=None */
- unsigned char port0port; /* Port num port connected to: 0,1,2, or 0xFF=None */
- unsigned char port1node; /* Node ID port connected to: 0xFF = None */
- unsigned char port1port; /* Port num port connected to: 0,1,2, or 0xFF=None */
- unsigned char port2node; /* Node ID port connected to: 0xFF = None */
- unsigned char port2port; /* Port num port connected to: 0,1,2, or 0xFF=None */
- unsigned char chassis_num; /* 1 based Chassis number (1 = boot node) */
-} __attribute__((packed));
-
-struct rio_detail {
- unsigned char node_id; /* RIO Node ID */
- unsigned long BBAR; /* Address of 1MB register space */
- unsigned char type; /* Type of device */
- unsigned char owner_id; /* For WPEG: Node ID of Cyclone that owns this WPEG*/
- /* For CYC: Node ID of Twister that owns this CYC */
- unsigned char port0node; /* Node ID port connected to: 0xFF=None */
- unsigned char port0port; /* Port num port connected to: 0,1,2, or 0xFF=None */
- unsigned char port1node; /* Node ID port connected to: 0xFF=None */
- unsigned char port1port; /* Port num port connected to: 0,1,2, or 0xFF=None */
- unsigned char first_slot; /* For WPEG: Lowest slot number below this WPEG */
- /* For CYC: 0 */
- unsigned char status; /* For WPEG: Bit 0 = 1 : the XAPIC is used */
- /* = 0 : the XAPIC is not used, ie:*/
- /* ints fwded to another XAPIC */
- /* Bits1:7 Reserved */
- /* For CYC: Bits0:7 Reserved */
- unsigned char WP_index; /* For WPEG: WPEG instance index - lower ones have */
- /* lower slot numbers/PCI bus numbers */
- /* For CYC: No meaning */
- unsigned char chassis_num; /* 1 based Chassis number */
- /* For LookOut WPEGs this field indicates the */
- /* Expansion Chassis #, enumerated from Boot */
- /* Node WPEG external port, then Boot Node CYC */
- /* external port, then Next Vigil chassis WPEG */
- /* external port, etc. */
- /* Shared Lookouts have only 1 chassis number (the */
- /* first one assigned) */
-} __attribute__((packed));
-
-
-typedef enum {
- CompatTwister = 0, /* Compatibility Twister */
- AltTwister = 1, /* Alternate Twister of internal 8-way */
- CompatCyclone = 2, /* Compatibility Cyclone */
- AltCyclone = 3, /* Alternate Cyclone of internal 8-way */
- CompatWPEG = 4, /* Compatibility WPEG */
- AltWPEG = 5, /* Second Planar WPEG */
- LookOutAWPEG = 6, /* LookOut WPEG */
- LookOutBWPEG = 7, /* LookOut WPEG */
-} node_type;
-
-static inline int is_WPEG(struct rio_detail *rio){
- return (rio->type == CompatWPEG || rio->type == AltWPEG ||
- rio->type == LookOutAWPEG || rio->type == LookOutBWPEG);
-}
-
-#define SUMMIT_APIC_DFR_VALUE (APIC_DFR_CLUSTER)
-
-static const struct cpumask *summit_target_cpus(void)
-{
- /* CPU_MASK_ALL (0xff) has undefined behaviour with
- * dest_LowestPrio mode logical clustered apic interrupt routing
- * Just start on cpu 0. IRQ balancing will spread load
- */
- return cpumask_of(0);
-}
-
-static unsigned long summit_check_apicid_used(physid_mask_t *map, int apicid)
-{
- return 0;
-}
-
-/* we don't use the phys_cpu_present_map to indicate apicid presence */
-static unsigned long summit_check_apicid_present(int bit)
-{
- return 1;
-}
-
-static int summit_early_logical_apicid(int cpu)
-{
- int count = 0;
- u8 my_id = early_per_cpu(x86_cpu_to_apicid, cpu);
- u8 my_cluster = APIC_CLUSTER(my_id);
-#ifdef CONFIG_SMP
- u8 lid;
- int i;
-
- /* Create logical APIC IDs by counting CPUs already in cluster. */
- for (count = 0, i = nr_cpu_ids; --i >= 0; ) {
- lid = early_per_cpu(x86_cpu_to_logical_apicid, i);
- if (lid != BAD_APICID && APIC_CLUSTER(lid) == my_cluster)
- ++count;
- }
-#endif
- /* We only have a 4 wide bitmap in cluster mode. If a deranged
- * BIOS puts 5 CPUs in one APIC cluster, we're hosed. */
- BUG_ON(count >= XAPIC_DEST_CPUS_SHIFT);
- return my_cluster | (1UL << count);
-}
-
-static void summit_init_apic_ldr(void)
-{
- int cpu = smp_processor_id();
- unsigned long id = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
- unsigned long val;
-
- apic_write(APIC_DFR, SUMMIT_APIC_DFR_VALUE);
- val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
- val |= SET_APIC_LOGICAL_ID(id);
- apic_write(APIC_LDR, val);
-}
-
-static int summit_apic_id_registered(void)
-{
- return 1;
-}
-
-static void summit_setup_apic_routing(void)
-{
- pr_info("Enabling APIC mode: Summit. Using %d I/O APICs\n",
- nr_ioapics);
-}
-
-static int summit_cpu_present_to_apicid(int mps_cpu)
-{
- if (mps_cpu < nr_cpu_ids)
- return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu);
- else
- return BAD_APICID;
-}
-
-static void summit_ioapic_phys_id_map(physid_mask_t *phys_id_map, physid_mask_t *retmap)
-{
- /* For clustered we don't have a good way to do this yet - hack */
- physids_promote(0x0FL, retmap);
-}
-
-static void summit_apicid_to_cpu_present(int apicid, physid_mask_t *retmap)
-{
- physid_set_mask_of_physid(0, retmap);
-}
-
-static int summit_check_phys_apicid_present(int physical_apicid)
-{
- return 1;
-}
-
-static inline int
-summit_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *dest_id)
-{
- unsigned int round = 0;
- unsigned int cpu, apicid = 0;
-
- /*
- * The cpus in the mask must all be on the apic cluster.
- */
- for_each_cpu_and(cpu, cpumask, cpu_online_mask) {
- int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
-
- if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) {
- pr_err("Not a valid mask!\n");
- return -EINVAL;
- }
- apicid |= new_apicid;
- round++;
- }
- if (!round)
- return -EINVAL;
- *dest_id = apicid;
- return 0;
-}
-
-static int
-summit_cpu_mask_to_apicid_and(const struct cpumask *inmask,
- const struct cpumask *andmask,
- unsigned int *apicid)
-{
- cpumask_var_t cpumask;
- *apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0);
-
- if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC))
- return 0;
-
- cpumask_and(cpumask, inmask, andmask);
- summit_cpu_mask_to_apicid(cpumask, apicid);
-
- free_cpumask_var(cpumask);
-
- return 0;
-}
-
-/*
- * cpuid returns the value latched in the HW at reset, not the APIC ID
- * register's value. For any box whose BIOS changes APIC IDs, like
- * clustered APIC systems, we must use hard_smp_processor_id.
- *
- * See Intel's IA-32 SW Dev's Manual Vol2 under CPUID.
- */
-static int summit_phys_pkg_id(int cpuid_apic, int index_msb)
-{
- return hard_smp_processor_id() >> index_msb;
-}
-
-static int probe_summit(void)
-{
- /* probed later in mptable/ACPI hooks */
- return 0;
-}
-
-#ifdef CONFIG_X86_SUMMIT_NUMA
-static struct rio_table_hdr *rio_table_hdr;
-static struct scal_detail *scal_devs[MAX_NUMNODES];
-static struct rio_detail *rio_devs[MAX_NUMNODES*4];
-
-#ifndef CONFIG_X86_NUMAQ
-static int mp_bus_id_to_node[MAX_MP_BUSSES];
-#endif
-
-static int setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus)
-{
- int twister = 0, node = 0;
- int i, bus, num_buses;
-
- for (i = 0; i < rio_table_hdr->num_rio_dev; i++) {
- if (rio_devs[i]->node_id == rio_devs[wpeg_num]->owner_id) {
- twister = rio_devs[i]->owner_id;
- break;
- }
- }
- if (i == rio_table_hdr->num_rio_dev) {
- pr_err("Couldn't find owner Cyclone for Winnipeg!\n");
- return last_bus;
- }
-
- for (i = 0; i < rio_table_hdr->num_scal_dev; i++) {
- if (scal_devs[i]->node_id == twister) {
- node = scal_devs[i]->node_id;
- break;
- }
- }
- if (i == rio_table_hdr->num_scal_dev) {
- pr_err("Couldn't find owner Twister for Cyclone!\n");
- return last_bus;
- }
-
- switch (rio_devs[wpeg_num]->type) {
- case CompatWPEG:
- /*
- * The Compatibility Winnipeg controls the 2 legacy buses,
- * the 66MHz PCI bus [2 slots] and the 2 "extra" buses in case
- * a PCI-PCI bridge card is used in either slot: total 5 buses.
- */
- num_buses = 5;
- break;
- case AltWPEG:
- /*
- * The Alternate Winnipeg controls the 2 133MHz buses [1 slot
- * each], their 2 "extra" buses, the 100MHz bus [2 slots] and
- * the "extra" buses for each of those slots: total 7 buses.
- */
- num_buses = 7;
- break;
- case LookOutAWPEG:
- case LookOutBWPEG:
- /*
- * A Lookout Winnipeg controls 3 100MHz buses [2 slots each]
- * & the "extra" buses for each of those slots: total 9 buses.
- */
- num_buses = 9;
- break;
- default:
- pr_info("Unsupported Winnipeg type!\n");
- return last_bus;
- }
-
- for (bus = last_bus; bus < last_bus + num_buses; bus++)
- mp_bus_id_to_node[bus] = node;
- return bus;
-}
-
-static int build_detail_arrays(void)
-{
- unsigned long ptr;
- int i, scal_detail_size, rio_detail_size;
-
- if (rio_table_hdr->num_scal_dev > MAX_NUMNODES) {
- pr_warn("MAX_NUMNODES too low! Defined as %d, but system has %d nodes\n",
- MAX_NUMNODES, rio_table_hdr->num_scal_dev);
- return 0;
- }
-
- switch (rio_table_hdr->version) {
- default:
- pr_warn("Invalid Rio Grande Table Version: %d\n",
- rio_table_hdr->version);
- return 0;
- case 2:
- scal_detail_size = 11;
- rio_detail_size = 13;
- break;
- case 3:
- scal_detail_size = 12;
- rio_detail_size = 15;
- break;
- }
-
- ptr = (unsigned long)rio_table_hdr + 3;
- for (i = 0; i < rio_table_hdr->num_scal_dev; i++, ptr += scal_detail_size)
- scal_devs[i] = (struct scal_detail *)ptr;
-
- for (i = 0; i < rio_table_hdr->num_rio_dev; i++, ptr += rio_detail_size)
- rio_devs[i] = (struct rio_detail *)ptr;
-
- return 1;
-}
-
-void setup_summit(void)
-{
- unsigned long ptr;
- unsigned short offset;
- int i, next_wpeg, next_bus = 0;
-
- /* The pointer to the EBDA is stored in the word @ phys 0x40E(40:0E) */
- ptr = get_bios_ebda();
- ptr = (unsigned long)phys_to_virt(ptr);
-
- rio_table_hdr = NULL;
- offset = 0x180;
- while (offset) {
- /* The block id is stored in the 2nd word */
- if (*((unsigned short *)(ptr + offset + 2)) == 0x4752) {
- /* set the pointer past the offset & block id */
- rio_table_hdr = (struct rio_table_hdr *)(ptr + offset + 4);
- break;
- }
- /* The next offset is stored in the 1st word. 0 means no more */
- offset = *((unsigned short *)(ptr + offset));
- }
- if (!rio_table_hdr) {
- pr_err("Unable to locate Rio Grande Table in EBDA - bailing!\n");
- return;
- }
-
- if (!build_detail_arrays())
- return;
-
- /* The first Winnipeg we're looking for has an index of 0 */
- next_wpeg = 0;
- do {
- for (i = 0; i < rio_table_hdr->num_rio_dev; i++) {
- if (is_WPEG(rio_devs[i]) && rio_devs[i]->WP_index == next_wpeg) {
- /* It's the Winnipeg we're looking for! */
- next_bus = setup_pci_node_map_for_wpeg(i, next_bus);
- next_wpeg++;
- break;
- }
- }
- /*
- * If we go through all Rio devices and don't find one with
- * the next index, it means we've found all the Winnipegs,
- * and thus all the PCI buses.
- */
- if (i == rio_table_hdr->num_rio_dev)
- next_wpeg = 0;
- } while (next_wpeg != 0);
-}
-#endif
-
-static struct apic apic_summit = {
-
- .name = "summit",
- .probe = probe_summit,
- .acpi_madt_oem_check = summit_acpi_madt_oem_check,
- .apic_id_valid = default_apic_id_valid,
- .apic_id_registered = summit_apic_id_registered,
-
- .irq_delivery_mode = dest_LowestPrio,
- /* logical delivery broadcast to all CPUs: */
- .irq_dest_mode = 1,
-
- .target_cpus = summit_target_cpus,
- .disable_esr = 1,
- .dest_logical = APIC_DEST_LOGICAL,
- .check_apicid_used = summit_check_apicid_used,
- .check_apicid_present = summit_check_apicid_present,
-
- .vector_allocation_domain = flat_vector_allocation_domain,
- .init_apic_ldr = summit_init_apic_ldr,
-
- .ioapic_phys_id_map = summit_ioapic_phys_id_map,
- .setup_apic_routing = summit_setup_apic_routing,
- .multi_timer_check = NULL,
- .cpu_present_to_apicid = summit_cpu_present_to_apicid,
- .apicid_to_cpu_present = summit_apicid_to_cpu_present,
- .setup_portio_remap = NULL,
- .check_phys_apicid_present = summit_check_phys_apicid_present,
- .enable_apic_mode = NULL,
- .phys_pkg_id = summit_phys_pkg_id,
- .mps_oem_check = summit_mps_oem_check,
-
- .get_apic_id = summit_get_apic_id,
- .set_apic_id = NULL,
- .apic_id_mask = 0xFF << 24,
-
- .cpu_mask_to_apicid_and = summit_cpu_mask_to_apicid_and,
-
- .send_IPI_mask = summit_send_IPI_mask,
- .send_IPI_mask_allbutself = NULL,
- .send_IPI_allbutself = summit_send_IPI_allbutself,
- .send_IPI_all = summit_send_IPI_all,
- .send_IPI_self = default_send_IPI_self,
-
- .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
- .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
-
- .wait_for_init_deassert = default_wait_for_init_deassert,
-
- .smp_callin_clear_local_apic = NULL,
- .inquire_remote_apic = default_inquire_remote_apic,
-
- .read = native_apic_mem_read,
- .write = native_apic_mem_write,
- .eoi_write = native_apic_mem_write,
- .icr_read = native_apic_icr_read,
- .icr_write = native_apic_icr_write,
- .wait_icr_idle = native_apic_wait_icr_idle,
- .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
-
- .x86_32_early_logical_apicid = summit_early_logical_apicid,
-};
-
-apic_driver(apic_summit);
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 140e29db478..e66766bf164 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -3,7 +3,6 @@
#include <linux/string.h>
#include <linux/kernel.h>
#include <linux/ctype.h>
-#include <linux/init.h>
#include <linux/dmar.h>
#include <linux/cpu.h>
@@ -280,7 +279,7 @@ static struct apic apic_x2apic_cluster = {
.trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
.trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
- .wait_for_init_deassert = NULL,
+ .wait_for_init_deassert = false,
.smp_callin_clear_local_apic = NULL,
.inquire_remote_apic = NULL,
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index 562a76d433c..6d600ebf6c1 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -3,7 +3,6 @@
#include <linux/string.h>
#include <linux/kernel.h>
#include <linux/ctype.h>
-#include <linux/init.h>
#include <linux/dmar.h>
#include <asm/smp.h>
@@ -134,7 +133,7 @@ static struct apic apic_x2apic_phys = {
.trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
.trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
- .wait_for_init_deassert = NULL,
+ .wait_for_init_deassert = false,
.smp_callin_clear_local_apic = NULL,
.inquire_remote_apic = NULL,
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 1191ac1c9d2..293b41df54e 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -5,7 +5,7 @@
*
* SGI UV APIC functions (note: not an Intel compatible APIC)
*
- * Copyright (C) 2007-2013 Silicon Graphics, Inc. All rights reserved.
+ * Copyright (C) 2007-2014 Silicon Graphics, Inc. All rights reserved.
*/
#include <linux/cpumask.h>
#include <linux/hardirq.h>
@@ -39,12 +39,6 @@
#include <asm/x86_init.h>
#include <asm/nmi.h>
-/* BMC sets a bit this MMR non-zero before sending an NMI */
-#define UVH_NMI_MMR UVH_SCRATCH5
-#define UVH_NMI_MMR_CLEAR (UVH_NMI_MMR + 8)
-#define UV_NMI_PENDING_MASK (1UL << 63)
-DEFINE_PER_CPU(unsigned long, cpu_last_nmi_count);
-
DEFINE_PER_CPU(int, x2apic_extra_bits);
#define PR_DEVEL(fmt, args...) pr_devel("%s: " fmt, __func__, args)
@@ -58,7 +52,6 @@ int uv_min_hub_revision_id;
EXPORT_SYMBOL_GPL(uv_min_hub_revision_id);
unsigned int uv_apicid_hibits;
EXPORT_SYMBOL_GPL(uv_apicid_hibits);
-static DEFINE_SPINLOCK(uv_nmi_lock);
static struct apic apic_x2apic_uv_x;
@@ -113,7 +106,7 @@ static int __init early_get_pnodeid(void)
break;
case UV3_HUB_PART_NUMBER:
case UV3_HUB_PART_NUMBER_X:
- uv_min_hub_revision_id += UV3_HUB_REVISION_BASE - 1;
+ uv_min_hub_revision_id += UV3_HUB_REVISION_BASE;
break;
}
@@ -403,7 +396,7 @@ static struct apic __refdata apic_x2apic_uv_x = {
.wakeup_secondary_cpu = uv_wakeup_secondary,
.trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
.trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
- .wait_for_init_deassert = NULL,
+ .wait_for_init_deassert = false,
.smp_callin_clear_local_apic = NULL,
.inquire_remote_apic = NULL,
@@ -447,6 +440,20 @@ static __initdata struct redir_addr redir_addrs[] = {
{UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR, UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR},
};
+static unsigned char get_n_lshift(int m_val)
+{
+ union uv3h_gr0_gam_gr_config_u m_gr_config;
+
+ if (is_uv1_hub())
+ return m_val;
+
+ if (is_uv2_hub())
+ return m_val == 40 ? 40 : 39;
+
+ m_gr_config.v = uv_read_local_mmr(UV3H_GR0_GAM_GR_CONFIG);
+ return m_gr_config.s3.m_skt;
+}
+
static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size)
{
union uvh_rh_gam_alias210_overlay_config_2_mmr_u alias;
@@ -847,68 +854,6 @@ void uv_cpu_init(void)
set_x2apic_extra_bits(uv_hub_info->pnode);
}
-/*
- * When NMI is received, print a stack trace.
- */
-int uv_handle_nmi(unsigned int reason, struct pt_regs *regs)
-{
- unsigned long real_uv_nmi;
- int bid;
-
- /*
- * Each blade has an MMR that indicates when an NMI has been sent
- * to cpus on the blade. If an NMI is detected, atomically
- * clear the MMR and update a per-blade NMI count used to
- * cause each cpu on the blade to notice a new NMI.
- */
- bid = uv_numa_blade_id();
- real_uv_nmi = (uv_read_local_mmr(UVH_NMI_MMR) & UV_NMI_PENDING_MASK);
-
- if (unlikely(real_uv_nmi)) {
- spin_lock(&uv_blade_info[bid].nmi_lock);
- real_uv_nmi = (uv_read_local_mmr(UVH_NMI_MMR) & UV_NMI_PENDING_MASK);
- if (real_uv_nmi) {
- uv_blade_info[bid].nmi_count++;
- uv_write_local_mmr(UVH_NMI_MMR_CLEAR, UV_NMI_PENDING_MASK);
- }
- spin_unlock(&uv_blade_info[bid].nmi_lock);
- }
-
- if (likely(__get_cpu_var(cpu_last_nmi_count) == uv_blade_info[bid].nmi_count))
- return NMI_DONE;
-
- __get_cpu_var(cpu_last_nmi_count) = uv_blade_info[bid].nmi_count;
-
- /*
- * Use a lock so only one cpu prints at a time.
- * This prevents intermixed output.
- */
- spin_lock(&uv_nmi_lock);
- pr_info("UV NMI stack dump cpu %u:\n", smp_processor_id());
- dump_stack();
- spin_unlock(&uv_nmi_lock);
-
- return NMI_HANDLED;
-}
-
-void uv_register_nmi_notifier(void)
-{
- if (register_nmi_handler(NMI_UNKNOWN, uv_handle_nmi, 0, "uv"))
- printk(KERN_WARNING "UV NMI handler failed to register\n");
-}
-
-void uv_nmi_init(void)
-{
- unsigned int value;
-
- /*
- * Unmask NMI on all cpus
- */
- value = apic_read(APIC_LVT1) | APIC_DM_NMI;
- value &= ~APIC_LVT_MASKED;
- apic_write(APIC_LVT1, value);
-}
-
void __init uv_system_init(void)
{
union uvh_rh_gam_config_mmr_u m_n_config;
@@ -918,6 +863,7 @@ void __init uv_system_init(void)
int gnode_extra, min_pnode = 999999, max_pnode = -1;
unsigned long mmr_base, present, paddr;
unsigned short pnode_mask;
+ unsigned char n_lshift;
char *hub = (is_uv1_hub() ? "UV1" :
(is_uv2_hub() ? "UV2" :
"UV3"));
@@ -929,6 +875,7 @@ void __init uv_system_init(void)
m_val = m_n_config.s.m_skt;
n_val = m_n_config.s.n_skt;
pnode_mask = (1 << n_val) - 1;
+ n_lshift = get_n_lshift(m_val);
mmr_base =
uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) &
~UV_MMR_ENABLE;
@@ -936,8 +883,9 @@ void __init uv_system_init(void)
node_id.v = uv_read_local_mmr(UVH_NODE_ID);
gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1;
gnode_upper = ((unsigned long)gnode_extra << m_val);
- pr_info("UV: N:%d M:%d pnode_mask:0x%x gnode_upper/extra:0x%lx/0x%x\n",
- n_val, m_val, pnode_mask, gnode_upper, gnode_extra);
+ pr_info("UV: N:%d M:%d pnode_mask:0x%x gnode_upper/extra:0x%lx/0x%x n_lshift 0x%x\n",
+ n_val, m_val, pnode_mask, gnode_upper, gnode_extra,
+ n_lshift);
pr_info("UV: global MMR base 0x%lx\n", mmr_base);
@@ -1004,8 +952,7 @@ void __init uv_system_init(void)
uv_cpu_hub_info(cpu)->hub_revision = uv_hub_info->hub_revision;
uv_cpu_hub_info(cpu)->m_shift = 64 - m_val;
- uv_cpu_hub_info(cpu)->n_lshift = is_uv2_1_hub() ?
- (m_val == 40 ? 40 : 39) : m_val;
+ uv_cpu_hub_info(cpu)->n_lshift = n_lshift;
pnode = uv_apicid_to_pnode(apicid);
blade = boot_pnode_to_blade(pnode);
@@ -1046,9 +993,9 @@ void __init uv_system_init(void)
map_mmr_high(max_pnode);
map_mmioh_high(min_pnode, max_pnode);
+ uv_nmi_setup();
uv_cpu_init();
uv_scir_register_cpu_notifier();
- uv_register_nmi_notifier();
proc_mkdir("sgi_uv", NULL);
/* register Legacy VGA I/O redirection handler */
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 3ab03430211..58487445141 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -841,24 +841,12 @@ static int apm_do_idle(void)
u32 eax;
u8 ret = 0;
int idled = 0;
- int polling;
int err = 0;
- polling = !!(current_thread_info()->status & TS_POLLING);
- if (polling) {
- current_thread_info()->status &= ~TS_POLLING;
- /*
- * TS_POLLING-cleared state must be visible before we
- * test NEED_RESCHED:
- */
- smp_mb();
- }
if (!need_resched()) {
idled = 1;
ret = apm_bios_call_simple(APM_FUNC_IDLE, 0, 0, &eax, &err);
}
- if (polling)
- current_thread_info()->status |= TS_POLLING;
if (!idled)
return 0;
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 28610822fb3..9f6b9341950 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -32,7 +32,6 @@ void common(void) {
OFFSET(TI_flags, thread_info, flags);
OFFSET(TI_status, thread_info, status);
OFFSET(TI_addr_limit, thread_info, addr_limit);
- OFFSET(TI_preempt_count, thread_info, preempt_count);
BLANK();
OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c
index e2dbcb7dabd..83a7995625a 100644
--- a/arch/x86/kernel/check.c
+++ b/arch/x86/kernel/check.c
@@ -91,7 +91,7 @@ void __init setup_bios_corruption_check(void)
corruption_check_size = round_up(corruption_check_size, PAGE_SIZE);
- for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) {
+ for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL) {
start = clamp_t(phys_addr_t, round_up(start, PAGE_SIZE),
PAGE_SIZE, corruption_check_size);
end = clamp_t(phys_addr_t, round_down(end, PAGE_SIZE),
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 47b56a7e99c..7fd54f09b01 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -36,12 +36,13 @@ obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd_iommu.o
endif
obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o
obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o
-obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_uncore.o
+obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_uncore.o perf_event_intel_rapl.o
endif
obj-$(CONFIG_X86_MCE) += mcheck/
obj-$(CONFIG_MTRR) += mtrr/
+obj-$(CONFIG_MICROCODE) += microcode/
obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o perf_event_amd_ibs.o
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 903a264af98..ce8b8ff0e0e 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -1,5 +1,4 @@
#include <linux/export.h>
-#include <linux/init.h>
#include <linux/bitops.h>
#include <linux/elf.h>
#include <linux/mm.h>
@@ -219,7 +218,7 @@ static void amd_k7_smp_check(struct cpuinfo_x86 *c)
*/
WARN_ONCE(1, "WARNING: This combination of AMD"
" processors is not suitable for SMP.\n");
- add_taint(TAINT_UNSAFE_SMP, LOCKDEP_NOW_UNRELIABLE);
+ add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_NOW_UNRELIABLE);
}
static void init_amd_k7(struct cpuinfo_x86 *c)
@@ -234,9 +233,7 @@ static void init_amd_k7(struct cpuinfo_x86 *c)
if (c->x86_model >= 6 && c->x86_model <= 10) {
if (!cpu_has(c, X86_FEATURE_XMM)) {
printk(KERN_INFO "Enabling disabled K7/SSE Support.\n");
- rdmsr(MSR_K7_HWCR, l, h);
- l &= ~0x00008000;
- wrmsr(MSR_K7_HWCR, l, h);
+ msr_clear_bit(MSR_K7_HWCR, 15);
set_cpu_cap(c, X86_FEATURE_XMM);
}
}
@@ -339,7 +336,7 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
#endif
/*
- * On a AMD dual core setup the lower bits of the APIC id distingush the cores.
+ * On a AMD dual core setup the lower bits of the APIC id distinguish the cores.
* Assumes number of cores is a power of two.
*/
static void amd_detect_cmp(struct cpuinfo_x86 *c)
@@ -487,7 +484,7 @@ static void early_init_amd(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
if (!check_tsc_unstable())
- sched_clock_stable = 1;
+ set_sched_clock_stable();
}
#ifdef CONFIG_X86_64
@@ -508,6 +505,10 @@ static void early_init_amd(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_EXTD_APICID);
}
#endif
+
+ /* F16h erratum 793, CVE-2013-6885 */
+ if (c->x86 == 0x16 && c->x86_model <= 0xf)
+ msr_set_bit(MSR_AMD64_LS_CFG, 15);
}
static const int amd_erratum_383[];
@@ -527,11 +528,8 @@ static void init_amd(struct cpuinfo_x86 *c)
* Errata 63 for SH-B3 steppings
* Errata 122 for all steppings (F+ have it disabled by default)
*/
- if (c->x86 == 0xf) {
- rdmsrl(MSR_K7_HWCR, value);
- value |= 1 << 6;
- wrmsrl(MSR_K7_HWCR, value);
- }
+ if (c->x86 == 0xf)
+ msr_set_bit(MSR_K7_HWCR, 6);
#endif
early_init_amd(c);
@@ -614,14 +612,11 @@ static void init_amd(struct cpuinfo_x86 *c)
(c->x86_model >= 0x10) && (c->x86_model <= 0x1f) &&
!cpu_has(c, X86_FEATURE_TOPOEXT)) {
- if (!rdmsrl_safe(0xc0011005, &value)) {
- value |= 1ULL << 54;
- wrmsrl_safe(0xc0011005, value);
+ if (msr_set_bit(0xc0011005, 54) > 0) {
rdmsrl(0xc0011005, value);
- if (value & (1ULL << 54)) {
+ if (value & BIT_64(54)) {
set_cpu_cap(c, X86_FEATURE_TOPOEXT);
- printk(KERN_INFO FW_INFO "CPU: Re-enabling "
- "disabled Topology Extensions Support\n");
+ pr_info(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n");
}
}
}
@@ -700,19 +695,12 @@ static void init_amd(struct cpuinfo_x86 *c)
* Disable GART TLB Walk Errors on Fam10h. We do this here
* because this is always needed when GART is enabled, even in a
* kernel which has no MCE support built in.
- * BIOS should disable GartTlbWlk Errors themself. If
- * it doesn't do it here as suggested by the BKDG.
+ * BIOS should disable GartTlbWlk Errors already. If
+ * it doesn't, do it here as suggested by the BKDG.
*
* Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=33012
*/
- u64 mask;
- int err;
-
- err = rdmsrl_safe(MSR_AMD64_MCx_MASK(4), &mask);
- if (err == 0) {
- mask |= (1 << 10);
- wrmsrl_safe(MSR_AMD64_MCx_MASK(4), mask);
- }
+ msr_set_bit(MSR_AMD64_MCx_MASK(4), 10);
/*
* On family 10h BIOS may not have properly enabled WC+ support,
@@ -724,10 +712,7 @@ static void init_amd(struct cpuinfo_x86 *c)
* NOTE: we want to use the _safe accessors so as not to #GP kvm
* guests on older kvm hosts.
*/
-
- rdmsrl_safe(MSR_AMD64_BU_CFG2, &value);
- value &= ~(1ULL << 24);
- wrmsrl_safe(MSR_AMD64_BU_CFG2, value);
+ msr_clear_bit(MSR_AMD64_BU_CFG2, 24);
if (cpu_has_amd_erratum(c, amd_erratum_383))
set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH);
@@ -758,10 +743,7 @@ static unsigned int amd_size_cache(struct cpuinfo_x86 *c, unsigned int size)
static void cpu_set_tlb_flushall_shift(struct cpuinfo_x86 *c)
{
- tlb_flushall_shift = 5;
-
- if (c->x86 <= 0x11)
- tlb_flushall_shift = 4;
+ tlb_flushall_shift = 6;
}
static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c)
@@ -790,14 +772,10 @@ static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c)
}
/* Handle DTLB 2M and 4M sizes, fall back to L1 if L2 is disabled */
- if (!((eax >> 16) & mask)) {
- u32 a, b, c, d;
-
- cpuid(0x80000005, &a, &b, &c, &d);
- tlb_lld_2m[ENTRIES] = (a >> 16) & 0xff;
- } else {
+ if (!((eax >> 16) & mask))
+ tlb_lld_2m[ENTRIES] = (cpuid_eax(0x80000005) >> 16) & 0xff;
+ else
tlb_lld_2m[ENTRIES] = (eax >> 16) & mask;
- }
/* a 4M entry uses two 2M entries */
tlb_lld_4m[ENTRIES] = tlb_lld_2m[ENTRIES] >> 1;
@@ -823,8 +801,8 @@ static const struct cpu_dev amd_cpu_dev = {
.c_vendor = "AMD",
.c_ident = { "AuthenticAMD" },
#ifdef CONFIG_X86_32
- .c_models = {
- { .vendor = X86_VENDOR_AMD, .family = 4, .model_names =
+ .legacy_models = {
+ { .family = 4, .model_names =
{
[3] = "486 DX/2",
[7] = "486 DX/2-WB",
@@ -835,7 +813,7 @@ static const struct cpu_dev amd_cpu_dev = {
}
},
},
- .c_size_cache = amd_size_cache,
+ .legacy_cache_size = amd_size_cache,
#endif
.c_early_init = early_init_amd,
.c_detect_tlb = cpu_detect_tlb_amd,
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index fbf6c3bc240..d8fba5c15fb 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -1,6 +1,5 @@
#include <linux/bitops.h>
#include <linux/kernel.h>
-#include <linux/init.h>
#include <asm/processor.h>
#include <asm/e820.h>
@@ -9,236 +8,6 @@
#include "cpu.h"
-#ifdef CONFIG_X86_OOSTORE
-
-static u32 power2(u32 x)
-{
- u32 s = 1;
-
- while (s <= x)
- s <<= 1;
-
- return s >>= 1;
-}
-
-
-/*
- * Set up an actual MCR
- */
-static void centaur_mcr_insert(int reg, u32 base, u32 size, int key)
-{
- u32 lo, hi;
-
- hi = base & ~0xFFF;
- lo = ~(size-1); /* Size is a power of 2 so this makes a mask */
- lo &= ~0xFFF; /* Remove the ctrl value bits */
- lo |= key; /* Attribute we wish to set */
- wrmsr(reg+MSR_IDT_MCR0, lo, hi);
- mtrr_centaur_report_mcr(reg, lo, hi); /* Tell the mtrr driver */
-}
-
-/*
- * Figure what we can cover with MCR's
- *
- * Shortcut: We know you can't put 4Gig of RAM on a winchip
- */
-static u32 ramtop(void)
-{
- u32 clip = 0xFFFFFFFFUL;
- u32 top = 0;
- int i;
-
- for (i = 0; i < e820.nr_map; i++) {
- unsigned long start, end;
-
- if (e820.map[i].addr > 0xFFFFFFFFUL)
- continue;
- /*
- * Don't MCR over reserved space. Ignore the ISA hole
- * we frob around that catastrophe already
- */
- if (e820.map[i].type == E820_RESERVED) {
- if (e820.map[i].addr >= 0x100000UL &&
- e820.map[i].addr < clip)
- clip = e820.map[i].addr;
- continue;
- }
- start = e820.map[i].addr;
- end = e820.map[i].addr + e820.map[i].size;
- if (start >= end)
- continue;
- if (end > top)
- top = end;
- }
- /*
- * Everything below 'top' should be RAM except for the ISA hole.
- * Because of the limited MCR's we want to map NV/ACPI into our
- * MCR range for gunk in RAM
- *
- * Clip might cause us to MCR insufficient RAM but that is an
- * acceptable failure mode and should only bite obscure boxes with
- * a VESA hole at 15Mb
- *
- * The second case Clip sometimes kicks in is when the EBDA is marked
- * as reserved. Again we fail safe with reasonable results
- */
- if (top > clip)
- top = clip;
-
- return top;
-}
-
-/*
- * Compute a set of MCR's to give maximum coverage
- */
-static int centaur_mcr_compute(int nr, int key)
-{
- u32 mem = ramtop();
- u32 root = power2(mem);
- u32 base = root;
- u32 top = root;
- u32 floor = 0;
- int ct = 0;
-
- while (ct < nr) {
- u32 fspace = 0;
- u32 high;
- u32 low;
-
- /*
- * Find the largest block we will fill going upwards
- */
- high = power2(mem-top);
-
- /*
- * Find the largest block we will fill going downwards
- */
- low = base/2;
-
- /*
- * Don't fill below 1Mb going downwards as there
- * is an ISA hole in the way.
- */
- if (base <= 1024*1024)
- low = 0;
-
- /*
- * See how much space we could cover by filling below
- * the ISA hole
- */
-
- if (floor == 0)
- fspace = 512*1024;
- else if (floor == 512*1024)
- fspace = 128*1024;
-
- /* And forget ROM space */
-
- /*
- * Now install the largest coverage we get
- */
- if (fspace > high && fspace > low) {
- centaur_mcr_insert(ct, floor, fspace, key);
- floor += fspace;
- } else if (high > low) {
- centaur_mcr_insert(ct, top, high, key);
- top += high;
- } else if (low > 0) {
- base -= low;
- centaur_mcr_insert(ct, base, low, key);
- } else
- break;
- ct++;
- }
- /*
- * We loaded ct values. We now need to set the mask. The caller
- * must do this bit.
- */
- return ct;
-}
-
-static void centaur_create_optimal_mcr(void)
-{
- int used;
- int i;
-
- /*
- * Allocate up to 6 mcrs to mark as much of ram as possible
- * as write combining and weak write ordered.
- *
- * To experiment with: Linux never uses stack operations for
- * mmio spaces so we could globally enable stack operation wc
- *
- * Load the registers with type 31 - full write combining, all
- * writes weakly ordered.
- */
- used = centaur_mcr_compute(6, 31);
-
- /*
- * Wipe unused MCRs
- */
- for (i = used; i < 8; i++)
- wrmsr(MSR_IDT_MCR0+i, 0, 0);
-}
-
-static void winchip2_create_optimal_mcr(void)
-{
- u32 lo, hi;
- int used;
- int i;
-
- /*
- * Allocate up to 6 mcrs to mark as much of ram as possible
- * as write combining, weak store ordered.
- *
- * Load the registers with type 25
- * 8 - weak write ordering
- * 16 - weak read ordering
- * 1 - write combining
- */
- used = centaur_mcr_compute(6, 25);
-
- /*
- * Mark the registers we are using.
- */
- rdmsr(MSR_IDT_MCR_CTRL, lo, hi);
- for (i = 0; i < used; i++)
- lo |= 1<<(9+i);
- wrmsr(MSR_IDT_MCR_CTRL, lo, hi);
-
- /*
- * Wipe unused MCRs
- */
-
- for (i = used; i < 8; i++)
- wrmsr(MSR_IDT_MCR0+i, 0, 0);
-}
-
-/*
- * Handle the MCR key on the Winchip 2.
- */
-static void winchip2_unprotect_mcr(void)
-{
- u32 lo, hi;
- u32 key;
-
- rdmsr(MSR_IDT_MCR_CTRL, lo, hi);
- lo &= ~0x1C0; /* blank bits 8-6 */
- key = (lo>>17) & 7;
- lo |= key<<6; /* replace with unlock key */
- wrmsr(MSR_IDT_MCR_CTRL, lo, hi);
-}
-
-static void winchip2_protect_mcr(void)
-{
- u32 lo, hi;
-
- rdmsr(MSR_IDT_MCR_CTRL, lo, hi);
- lo &= ~0x1C0; /* blank bits 8-6 */
- wrmsr(MSR_IDT_MCR_CTRL, lo, hi);
-}
-#endif /* CONFIG_X86_OOSTORE */
-
#define ACE_PRESENT (1 << 6)
#define ACE_ENABLED (1 << 7)
#define ACE_FCR (1 << 28) /* MSR_VIA_FCR */
@@ -363,20 +132,6 @@ static void init_centaur(struct cpuinfo_x86 *c)
fcr_clr = DPDC;
printk(KERN_NOTICE "Disabling bugged TSC.\n");
clear_cpu_cap(c, X86_FEATURE_TSC);
-#ifdef CONFIG_X86_OOSTORE
- centaur_create_optimal_mcr();
- /*
- * Enable:
- * write combining on non-stack, non-string
- * write combining on string, all types
- * weak write ordering
- *
- * The C6 original lacks weak read order
- *
- * Note 0x120 is write only on Winchip 1
- */
- wrmsr(MSR_IDT_MCR_CTRL, 0x01F0001F, 0);
-#endif
break;
case 8:
switch (c->x86_mask) {
@@ -393,40 +148,12 @@ static void init_centaur(struct cpuinfo_x86 *c)
fcr_set = ECX8|DSMC|DTLOCK|EMMX|EBRPRED|ERETSTK|
E2MMX|EAMD3D;
fcr_clr = DPDC;
-#ifdef CONFIG_X86_OOSTORE
- winchip2_unprotect_mcr();
- winchip2_create_optimal_mcr();
- rdmsr(MSR_IDT_MCR_CTRL, lo, hi);
- /*
- * Enable:
- * write combining on non-stack, non-string
- * write combining on string, all types
- * weak write ordering
- */
- lo |= 31;
- wrmsr(MSR_IDT_MCR_CTRL, lo, hi);
- winchip2_protect_mcr();
-#endif
break;
case 9:
name = "3";
fcr_set = ECX8|DSMC|DTLOCK|EMMX|EBRPRED|ERETSTK|
E2MMX|EAMD3D;
fcr_clr = DPDC;
-#ifdef CONFIG_X86_OOSTORE
- winchip2_unprotect_mcr();
- winchip2_create_optimal_mcr();
- rdmsr(MSR_IDT_MCR_CTRL, lo, hi);
- /*
- * Enable:
- * write combining on non-stack, non-string
- * write combining on string, all types
- * weak write ordering
- */
- lo |= 31;
- wrmsr(MSR_IDT_MCR_CTRL, lo, hi);
- winchip2_protect_mcr();
-#endif
break;
default:
name = "??";
@@ -468,10 +195,10 @@ static void init_centaur(struct cpuinfo_x86 *c)
#endif
}
+#ifdef CONFIG_X86_32
static unsigned int
centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size)
{
-#ifdef CONFIG_X86_32
/* VIA C3 CPUs (670-68F) need further shifting. */
if ((c->x86 == 6) && ((c->x86_model == 7) || (c->x86_model == 8)))
size >>= 8;
@@ -484,16 +211,18 @@ centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size)
if ((c->x86 == 6) && (c->x86_model == 9) &&
(c->x86_mask == 1) && (size == 65))
size -= 1;
-#endif
return size;
}
+#endif
static const struct cpu_dev centaur_cpu_dev = {
.c_vendor = "Centaur",
.c_ident = { "CentaurHauls" },
.c_early_init = early_init_centaur,
.c_init = init_centaur,
- .c_size_cache = centaur_size_cache,
+#ifdef CONFIG_X86_32
+ .legacy_cache_size = centaur_size_cache,
+#endif
.c_x86_vendor = X86_VENDOR_CENTAUR,
};
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 2793d1f095a..ef1b93f18ed 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -8,6 +8,7 @@
#include <linux/delay.h>
#include <linux/sched.h>
#include <linux/init.h>
+#include <linux/kprobes.h>
#include <linux/kgdb.h>
#include <linux/smp.h>
#include <linux/io.h>
@@ -20,6 +21,7 @@
#include <asm/processor.h>
#include <asm/debugreg.h>
#include <asm/sections.h>
+#include <asm/vsyscall.h>
#include <linux/topology.h>
#include <linux/cpumask.h>
#include <asm/pgtable.h>
@@ -284,8 +286,13 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c)
raw_local_save_flags(eflags);
BUG_ON(eflags & X86_EFLAGS_AC);
- if (cpu_has(c, X86_FEATURE_SMAP))
+ if (cpu_has(c, X86_FEATURE_SMAP)) {
+#ifdef CONFIG_X86_SMAP
set_in_cr4(X86_CR4_SMAP);
+#else
+ clear_in_cr4(X86_CR4_SMAP);
+#endif
+ }
}
/*
@@ -346,7 +353,8 @@ static void filter_cpuid_features(struct cpuinfo_x86 *c, bool warn)
/* Look up CPU names by table lookup. */
static const char *table_lookup_model(struct cpuinfo_x86 *c)
{
- const struct cpu_model_info *info;
+#ifdef CONFIG_X86_32
+ const struct legacy_cpu_model_info *info;
if (c->x86_model >= 16)
return NULL; /* Range check */
@@ -354,13 +362,14 @@ static const char *table_lookup_model(struct cpuinfo_x86 *c)
if (!this_cpu)
return NULL;
- info = this_cpu->c_models;
+ info = this_cpu->legacy_models;
- while (info && info->family) {
+ while (info->family) {
if (info->family == c->x86)
return info->model_names[c->x86_model];
info++;
}
+#endif
return NULL; /* Not found */
}
@@ -450,8 +459,8 @@ void cpu_detect_cache_sizes(struct cpuinfo_x86 *c)
c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
#else
/* do processor-specific cache resizing */
- if (this_cpu->c_size_cache)
- l2size = this_cpu->c_size_cache(c, l2size);
+ if (this_cpu->legacy_cache_size)
+ l2size = this_cpu->legacy_cache_size(c, l2size);
/* Allow user to override all this if necessary. */
if (cachesize_override != -1)
@@ -470,6 +479,7 @@ u16 __read_mostly tlb_lli_4m[NR_INFO];
u16 __read_mostly tlb_lld_4k[NR_INFO];
u16 __read_mostly tlb_lld_2m[NR_INFO];
u16 __read_mostly tlb_lld_4m[NR_INFO];
+u16 __read_mostly tlb_lld_1g[NR_INFO];
/*
* tlb_flushall_shift shows the balance point in replacing cr3 write
@@ -484,13 +494,13 @@ void cpu_detect_tlb(struct cpuinfo_x86 *c)
if (this_cpu->c_detect_tlb)
this_cpu->c_detect_tlb(c);
- printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n" \
- "Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d\n" \
+ printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n"
+ "Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n"
"tlb_flushall_shift: %d\n",
tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES],
tlb_lli_4m[ENTRIES], tlb_lld_4k[ENTRIES],
tlb_lld_2m[ENTRIES], tlb_lld_4m[ENTRIES],
- tlb_flushall_shift);
+ tlb_lld_1g[ENTRIES], tlb_flushall_shift);
}
void detect_ht(struct cpuinfo_x86 *c)
@@ -945,6 +955,38 @@ static void vgetcpu_set_mode(void)
else
vgetcpu_mode = VGETCPU_LSL;
}
+
+/* May not be __init: called during resume */
+static void syscall32_cpu_init(void)
+{
+ /* Load these always in case some future AMD CPU supports
+ SYSENTER from compat mode too. */
+ wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
+ wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
+ wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target);
+
+ wrmsrl(MSR_CSTAR, ia32_cstar_target);
+}
+#endif
+
+#ifdef CONFIG_X86_32
+void enable_sep_cpu(void)
+{
+ int cpu = get_cpu();
+ struct tss_struct *tss = &per_cpu(init_tss, cpu);
+
+ if (!boot_cpu_has(X86_FEATURE_SEP)) {
+ put_cpu();
+ return;
+ }
+
+ tss->x86_tss.ss1 = __KERNEL_CS;
+ tss->x86_tss.sp1 = sizeof(struct tss_struct) + (unsigned long) tss;
+ wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
+ wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.sp1, 0);
+ wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) ia32_sysenter_target, 0);
+ put_cpu();
+}
#endif
void __init identify_boot_cpu(void)
@@ -1017,7 +1059,8 @@ __setup("show_msr=", setup_show_msr);
static __init int setup_noclflush(char *arg)
{
- setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
+ setup_clear_cpu_cap(X86_FEATURE_CLFLUSH);
+ setup_clear_cpu_cap(X86_FEATURE_CLFLUSHOPT);
return 1;
}
__setup("noclflush", setup_noclflush);
@@ -1070,6 +1113,10 @@ static __init int setup_disablecpuid(char *arg)
}
__setup("clearcpuid=", setup_disablecpuid);
+DEFINE_PER_CPU(unsigned long, kernel_stack) =
+ (unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE;
+EXPORT_PER_CPU_SYMBOL(kernel_stack);
+
#ifdef CONFIG_X86_64
struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table };
struct desc_ptr debug_idt_descr = { NR_VECTORS * 16 - 1,
@@ -1086,15 +1133,14 @@ DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned =
&init_task;
EXPORT_PER_CPU_SYMBOL(current_task);
-DEFINE_PER_CPU(unsigned long, kernel_stack) =
- (unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE;
-EXPORT_PER_CPU_SYMBOL(kernel_stack);
-
DEFINE_PER_CPU(char *, irq_stack_ptr) =
init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64;
DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1;
+DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
+EXPORT_PER_CPU_SYMBOL(__preempt_count);
+
DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
/*
@@ -1148,6 +1194,7 @@ int is_debug_stack(unsigned long addr)
(addr <= __get_cpu_var(debug_stack_addr) &&
addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ));
}
+NOKPROBE_SYMBOL(is_debug_stack);
DEFINE_PER_CPU(u32, debug_idt_ctr);
@@ -1156,6 +1203,7 @@ void debug_stack_set_zero(void)
this_cpu_inc(debug_idt_ctr);
load_current_idt();
}
+NOKPROBE_SYMBOL(debug_stack_set_zero);
void debug_stack_reset(void)
{
@@ -1164,11 +1212,14 @@ void debug_stack_reset(void)
if (this_cpu_dec_return(debug_idt_ctr) == 0)
load_current_idt();
}
+NOKPROBE_SYMBOL(debug_stack_reset);
#else /* CONFIG_X86_64 */
DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
EXPORT_PER_CPU_SYMBOL(current_task);
+DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
+EXPORT_PER_CPU_SYMBOL(__preempt_count);
DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
#ifdef CONFIG_CC_STACKPROTECTOR
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 4041c24ae7d..c37dc37e831 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -1,12 +1,6 @@
#ifndef ARCH_X86_CPU_H
#define ARCH_X86_CPU_H
-struct cpu_model_info {
- int vendor;
- int family;
- const char *model_names[16];
-};
-
/* attempt to consolidate cpu attributes */
struct cpu_dev {
const char *c_vendor;
@@ -14,15 +8,23 @@ struct cpu_dev {
/* some have two possibilities for cpuid string */
const char *c_ident[2];
- struct cpu_model_info c_models[4];
-
void (*c_early_init)(struct cpuinfo_x86 *);
void (*c_bsp_init)(struct cpuinfo_x86 *);
void (*c_init)(struct cpuinfo_x86 *);
void (*c_identify)(struct cpuinfo_x86 *);
void (*c_detect_tlb)(struct cpuinfo_x86 *);
- unsigned int (*c_size_cache)(struct cpuinfo_x86 *, unsigned int);
int c_x86_vendor;
+#ifdef CONFIG_X86_32
+ /* Optional vendor specific routine to obtain the cache size. */
+ unsigned int (*legacy_cache_size)(struct cpuinfo_x86 *,
+ unsigned int);
+
+ /* Family/stepping-based lookup table for model names. */
+ struct legacy_cpu_model_info {
+ int family;
+ const char *model_names[16];
+ } legacy_models[5];
+#endif
};
struct _tlb_table {
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index d0969c75ab5..aaf152e7963 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -1,4 +1,3 @@
-#include <linux/init.h>
#include <linux/bitops.h>
#include <linux/delay.h>
#include <linux/pci.h>
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index ec7299566f7..f9e4fdd3b87 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -1,4 +1,3 @@
-#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/string.h>
@@ -32,11 +31,8 @@ static void early_init_intel(struct cpuinfo_x86 *c)
/* Unmask CPUID levels if masked: */
if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
- rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
-
- if (misc_enable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID) {
- misc_enable &= ~MSR_IA32_MISC_ENABLE_LIMIT_CPUID;
- wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
+ if (msr_clear_bit(MSR_IA32_MISC_ENABLE,
+ MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT) > 0) {
c->cpuid_level = cpuid_eax(0);
get_cpu_cap(c);
}
@@ -93,7 +89,7 @@ static void early_init_intel(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
if (!check_tsc_unstable())
- sched_clock_stable = 1;
+ set_sched_clock_stable();
}
/* Penwell and Cloverview have the TSC which doesn't sleep on S3 */
@@ -130,16 +126,10 @@ static void early_init_intel(struct cpuinfo_x86 *c)
* Ingo Molnar reported a Pentium D (model 6) and a Xeon
* (model 2) with the same problem.
*/
- if (c->x86 == 15) {
- rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
-
- if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) {
- printk(KERN_INFO "kmemcheck: Disabling fast string operations\n");
-
- misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING;
- wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
- }
- }
+ if (c->x86 == 15)
+ if (msr_clear_bit(MSR_IA32_MISC_ENABLE,
+ MSR_IA32_MISC_ENABLE_FAST_STRING_BIT) > 0)
+ pr_info("kmemcheck: Disabling fast string operations\n");
#endif
/*
@@ -196,10 +186,16 @@ static void intel_smp_check(struct cpuinfo_x86 *c)
}
}
-static void intel_workarounds(struct cpuinfo_x86 *c)
+static int forcepae;
+static int __init forcepae_setup(char *__unused)
{
- unsigned long lo, hi;
+ forcepae = 1;
+ return 1;
+}
+__setup("forcepae", forcepae_setup);
+static void intel_workarounds(struct cpuinfo_x86 *c)
+{
#ifdef CONFIG_X86_F00F_BUG
/*
* All current models of Pentium and Pentium with MMX technology CPUs
@@ -226,16 +222,26 @@ static void intel_workarounds(struct cpuinfo_x86 *c)
clear_cpu_cap(c, X86_FEATURE_SEP);
/*
+ * PAE CPUID issue: many Pentium M report no PAE but may have a
+ * functionally usable PAE implementation.
+ * Forcefully enable PAE if kernel parameter "forcepae" is present.
+ */
+ if (forcepae) {
+ printk(KERN_WARNING "PAE forced!\n");
+ set_cpu_cap(c, X86_FEATURE_PAE);
+ add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_NOW_UNRELIABLE);
+ }
+
+ /*
* P4 Xeon errata 037 workaround.
* Hardware prefetcher may cause stale data to be loaded into the cache.
*/
if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) {
- rdmsr(MSR_IA32_MISC_ENABLE, lo, hi);
- if ((lo & MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE) == 0) {
- printk (KERN_INFO "CPU: C0 stepping P4 Xeon detected.\n");
- printk (KERN_INFO "CPU: Disabling hardware prefetching (Errata 037)\n");
- lo |= MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE;
- wrmsr(MSR_IA32_MISC_ENABLE, lo, hi);
+ if (msr_set_bit(MSR_IA32_MISC_ENABLE,
+ MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE_BIT)
+ > 0) {
+ pr_info("CPU: C0 stepping P4 Xeon detected.\n");
+ pr_info("CPU: Disabling hardware prefetching (Errata 037)\n");
}
}
@@ -268,10 +274,6 @@ static void intel_workarounds(struct cpuinfo_x86 *c)
}
#endif
-#ifdef CONFIG_X86_NUMAQ
- numaq_tsc_disable();
-#endif
-
intel_smp_check(c);
}
#else
@@ -368,6 +370,17 @@ static void init_intel(struct cpuinfo_x86 *c)
*/
detect_extended_topology(c);
+ if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) {
+ /*
+ * let's use the legacy cpuid vector 0x1 and 0x4 for topology
+ * detection.
+ */
+ c->x86_max_cores = intel_num_cpu_cores(c);
+#ifdef CONFIG_X86_32
+ detect_ht(c);
+#endif
+ }
+
l2 = init_intel_cacheinfo(c);
if (c->cpuid_level > 9) {
unsigned eax = cpuid_eax(10);
@@ -387,7 +400,8 @@ static void init_intel(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_PEBS);
}
- if (c->x86 == 6 && c->x86_model == 29 && cpu_has_clflush)
+ if (c->x86 == 6 && cpu_has_clflush &&
+ (c->x86_model == 29 || c->x86_model == 46 || c->x86_model == 47))
set_cpu_cap(c, X86_FEATURE_CLFLUSH_MONITOR);
#ifdef CONFIG_X86_64
@@ -435,17 +449,6 @@ static void init_intel(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_P3);
#endif
- if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) {
- /*
- * let's use the legacy cpuid vector 0x1 and 0x4 for topology
- * detection.
- */
- c->x86_max_cores = intel_num_cpu_cores(c);
-#ifdef CONFIG_X86_32
- detect_ht(c);
-#endif
- }
-
/* Work around errata */
srat_detect_node(c);
@@ -505,6 +508,7 @@ static unsigned int intel_size_cache(struct cpuinfo_x86 *c, unsigned int size)
#define TLB_DATA0_2M_4M 0x23
#define STLB_4K 0x41
+#define STLB_4K_2M 0x42
static const struct _tlb_table intel_tlb_table[] = {
{ 0x01, TLB_INST_4K, 32, " TLB_INST 4 KByte pages, 4-way set associative" },
@@ -525,13 +529,20 @@ static const struct _tlb_table intel_tlb_table[] = {
{ 0x5b, TLB_DATA_4K_4M, 64, " TLB_DATA 4 KByte and 4 MByte pages" },
{ 0x5c, TLB_DATA_4K_4M, 128, " TLB_DATA 4 KByte and 4 MByte pages" },
{ 0x5d, TLB_DATA_4K_4M, 256, " TLB_DATA 4 KByte and 4 MByte pages" },
+ { 0x61, TLB_INST_4K, 48, " TLB_INST 4 KByte pages, full associative" },
+ { 0x63, TLB_DATA_1G, 4, " TLB_DATA 1 GByte pages, 4-way set associative" },
+ { 0x76, TLB_INST_2M_4M, 8, " TLB_INST 2-MByte or 4-MByte pages, fully associative" },
{ 0xb0, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 4-way set associative" },
{ 0xb1, TLB_INST_2M_4M, 4, " TLB_INST 2M pages, 4-way, 8 entries or 4M pages, 4-way entries" },
{ 0xb2, TLB_INST_4K, 64, " TLB_INST 4KByte pages, 4-way set associative" },
{ 0xb3, TLB_DATA_4K, 128, " TLB_DATA 4 KByte pages, 4-way set associative" },
{ 0xb4, TLB_DATA_4K, 256, " TLB_DATA 4 KByte pages, 4-way associative" },
+ { 0xb5, TLB_INST_4K, 64, " TLB_INST 4 KByte pages, 8-way set ssociative" },
+ { 0xb6, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 8-way set ssociative" },
{ 0xba, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way associative" },
{ 0xc0, TLB_DATA_4K_4M, 8, " TLB_DATA 4 KByte and 4 MByte pages, 4-way associative" },
+ { 0xc1, STLB_4K_2M, 1024, " STLB 4 KByte and 2 MByte pages, 8-way associative" },
+ { 0xc2, TLB_DATA_2M_4M, 16, " DTLB 2 MByte/4MByte pages, 4-way associative" },
{ 0xca, STLB_4K, 512, " STLB 4 KByte pages, 4-way associative" },
{ 0x00, 0, 0 }
};
@@ -557,6 +568,20 @@ static void intel_tlb_lookup(const unsigned char desc)
if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
break;
+ case STLB_4K_2M:
+ if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
+ if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
+ if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries;
+ if (tlb_lld_2m[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lld_2m[ENTRIES] = intel_tlb_table[k].entries;
+ if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
+ if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
+ break;
case TLB_INST_ALL:
if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
@@ -602,6 +627,10 @@ static void intel_tlb_lookup(const unsigned char desc)
if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
break;
+ case TLB_DATA_1G:
+ if (tlb_lld_1g[ENTRIES] < intel_tlb_table[k].entries)
+ tlb_lld_1g[ENTRIES] = intel_tlb_table[k].entries;
+ break;
}
}
@@ -614,21 +643,17 @@ static void intel_tlb_flushall_shift_set(struct cpuinfo_x86 *c)
case 0x61d: /* six-core 45 nm xeon "Dunnington" */
tlb_flushall_shift = -1;
break;
+ case 0x63a: /* Ivybridge */
+ tlb_flushall_shift = 2;
+ break;
case 0x61a: /* 45 nm nehalem, "Bloomfield" */
case 0x61e: /* 45 nm nehalem, "Lynnfield" */
case 0x625: /* 32 nm nehalem, "Clarkdale" */
case 0x62c: /* 32 nm nehalem, "Gulftown" */
case 0x62e: /* 45 nm nehalem-ex, "Beckton" */
case 0x62f: /* 32 nm Xeon E7 */
- tlb_flushall_shift = 6;
- break;
case 0x62a: /* SandyBridge */
case 0x62d: /* SandyBridge, "Romely-EP" */
- tlb_flushall_shift = 5;
- break;
- case 0x63a: /* Ivybridge */
- tlb_flushall_shift = 1;
- break;
default:
tlb_flushall_shift = 6;
}
@@ -665,8 +690,8 @@ static const struct cpu_dev intel_cpu_dev = {
.c_vendor = "Intel",
.c_ident = { "GenuineIntel" },
#ifdef CONFIG_X86_32
- .c_models = {
- { .vendor = X86_VENDOR_INTEL, .family = 4, .model_names =
+ .legacy_models = {
+ { .family = 4, .model_names =
{
[0] = "486 DX-25/33",
[1] = "486 DX-50",
@@ -679,7 +704,7 @@ static const struct cpu_dev intel_cpu_dev = {
[9] = "486 DX/4-WB"
}
},
- { .vendor = X86_VENDOR_INTEL, .family = 5, .model_names =
+ { .family = 5, .model_names =
{
[0] = "Pentium 60/66 A-step",
[1] = "Pentium 60/66",
@@ -690,7 +715,7 @@ static const struct cpu_dev intel_cpu_dev = {
[8] = "Mobile Pentium MMX"
}
},
- { .vendor = X86_VENDOR_INTEL, .family = 6, .model_names =
+ { .family = 6, .model_names =
{
[0] = "Pentium Pro A-step",
[1] = "Pentium Pro",
@@ -704,7 +729,7 @@ static const struct cpu_dev intel_cpu_dev = {
[11] = "Pentium III (Tualatin)",
}
},
- { .vendor = X86_VENDOR_INTEL, .family = 15, .model_names =
+ { .family = 15, .model_names =
{
[0] = "Pentium 4 (Unknown)",
[1] = "Pentium 4 (Willamette)",
@@ -714,7 +739,7 @@ static const struct cpu_dev intel_cpu_dev = {
}
},
},
- .c_size_cache = intel_size_cache,
+ .legacy_cache_size = intel_size_cache,
#endif
.c_detect_tlb = intel_detect_tlb,
.c_early_init = early_init_intel,
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 1414c90feab..9c8f7394c61 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -1,5 +1,5 @@
/*
- * Routines to indentify caches on Intel CPU.
+ * Routines to identify caches on Intel CPU.
*
* Changes:
* Venkatesh Pallipadi : Adding cache identification through cpuid(4)
@@ -730,6 +730,18 @@ unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c)
#endif
}
+#ifdef CONFIG_X86_HT
+ /*
+ * If cpu_llc_id is not yet set, this means cpuid_level < 4 which in
+ * turns means that the only possibility is SMT (as indicated in
+ * cpuid1). Since cpuid2 doesn't specify shared caches, and we know
+ * that SMT shares all caches, we can unconditionally set cpu_llc_id to
+ * c->phys_proc_id.
+ */
+ if (per_cpu(cpu_llc_id, cpu) == BAD_APICID)
+ per_cpu(cpu_llc_id, cpu) = c->phys_proc_id;
+#endif
+
c->x86_cache_size = l3 ? l3 : (l2 ? l2 : (l1i+l1d));
return l2;
@@ -1225,21 +1237,24 @@ static struct notifier_block cacheinfo_cpu_notifier = {
static int __init cache_sysfs_init(void)
{
- int i;
+ int i, err = 0;
if (num_cache_leaves == 0)
return 0;
+ cpu_notifier_register_begin();
for_each_online_cpu(i) {
- int err;
struct device *dev = get_cpu_device(i);
err = cache_add_dev(dev);
if (err)
- return err;
+ goto out;
}
- register_hotcpu_notifier(&cacheinfo_cpu_notifier);
- return 0;
+ __register_hotcpu_notifier(&cacheinfo_cpu_notifier);
+
+out:
+ cpu_notifier_register_done();
+ return err;
}
device_initcall(cache_sysfs_init);
diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c
index 36565373af8..afa9f0d487e 100644
--- a/arch/x86/kernel/cpu/match.c
+++ b/arch/x86/kernel/cpu/match.c
@@ -47,45 +47,3 @@ const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match)
return NULL;
}
EXPORT_SYMBOL(x86_match_cpu);
-
-ssize_t arch_print_cpu_modalias(struct device *dev,
- struct device_attribute *attr,
- char *bufptr)
-{
- int size = PAGE_SIZE;
- int i, n;
- char *buf = bufptr;
-
- n = snprintf(buf, size, "x86cpu:vendor:%04X:family:%04X:"
- "model:%04X:feature:",
- boot_cpu_data.x86_vendor,
- boot_cpu_data.x86,
- boot_cpu_data.x86_model);
- size -= n;
- buf += n;
- size -= 1;
- for (i = 0; i < NCAPINTS*32; i++) {
- if (boot_cpu_has(i)) {
- n = snprintf(buf, size, ",%04X", i);
- if (n >= size) {
- WARN(1, "x86 features overflow page\n");
- break;
- }
- size -= n;
- buf += n;
- }
- }
- *buf++ = '\n';
- return buf - bufptr;
-}
-
-int arch_cpu_uevent(struct device *dev, struct kobj_uevent_env *env)
-{
- char *buf = kzalloc(PAGE_SIZE, GFP_KERNEL);
- if (buf) {
- arch_print_cpu_modalias(NULL, NULL, buf);
- add_uevent_var(env, "MODALIAS=%s", buf);
- kfree(buf);
- }
- return 0;
-}
diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c
index cd8b166a173..a1aef953315 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-apei.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c
@@ -33,23 +33,28 @@
#include <linux/acpi.h>
#include <linux/cper.h>
#include <acpi/apei.h>
+#include <acpi/ghes.h>
#include <asm/mce.h>
#include "mce-internal.h"
-void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err)
+void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err)
{
struct mce m;
- /* Only corrected MC is reported */
- if (!corrected || !(mem_err->validation_bits &
- CPER_MEM_VALID_PHYSICAL_ADDRESS))
+ if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
return;
mce_setup(&m);
m.bank = 1;
- /* Fake a memory read corrected error with unknown channel */
+ /* Fake a memory read error with unknown channel */
m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f;
+
+ if (severity >= GHES_SEV_RECOVERABLE)
+ m.status |= MCI_STATUS_UC;
+ if (severity >= GHES_SEV_PANIC)
+ m.status |= MCI_STATUS_PCC;
+
m.addr = mem_err->physical_addr;
mce_log(&m);
mce_notify_irq();
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index b3218cdee95..9a79c8dbd8e 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -60,8 +60,6 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex);
#define SPINUNIT 100 /* 100ns */
-atomic_t mce_entry;
-
DEFINE_PER_CPU(unsigned, mce_exception_count);
struct mce_bank *mce_banks __read_mostly;
@@ -89,6 +87,9 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
static DEFINE_PER_CPU(struct mce, mces_seen);
static int cpu_missing;
+/* CMCI storm detection filter */
+static DEFINE_PER_CPU(unsigned long, mce_polled_error);
+
/*
* MCA banks polled by the period polling timer for corrected events.
* With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
@@ -614,6 +615,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
if (!(m.status & MCI_STATUS_VAL))
continue;
+ this_cpu_write(mce_polled_error, 1);
/*
* Uncorrected or signalled events are handled by the exception
* handler when it is enabled, so don't process those here.
@@ -700,8 +702,7 @@ static int mce_timed_out(u64 *t)
if (!mca_cfg.monarch_timeout)
goto out;
if ((s64)*t < SPINUNIT) {
- /* CHECKME: Make panic default for 1 too? */
- if (mca_cfg.tolerant < 1)
+ if (mca_cfg.tolerant <= 1)
mce_panic("Timeout synchronizing machine check over CPUs",
NULL, NULL);
cpu_missing = 1;
@@ -1037,8 +1038,6 @@ void do_machine_check(struct pt_regs *regs, long error_code)
DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
char *msg = "Unknown";
- atomic_inc(&mce_entry);
-
this_cpu_inc(mce_exception_count);
if (!cfg->banks)
@@ -1168,7 +1167,6 @@ void do_machine_check(struct pt_regs *regs, long error_code)
mce_report_event(regs);
mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
out:
- atomic_dec(&mce_entry);
sync_core();
}
EXPORT_SYMBOL_GPL(do_machine_check);
@@ -1278,10 +1276,18 @@ static unsigned long mce_adjust_timer_default(unsigned long interval)
static unsigned long (*mce_adjust_timer)(unsigned long interval) =
mce_adjust_timer_default;
+static int cmc_error_seen(void)
+{
+ unsigned long *v = &__get_cpu_var(mce_polled_error);
+
+ return test_and_clear_bit(0, v);
+}
+
static void mce_timer_fn(unsigned long data)
{
struct timer_list *t = &__get_cpu_var(mce_timer);
unsigned long iv;
+ int notify;
WARN_ON(smp_processor_id() != data);
@@ -1296,7 +1302,9 @@ static void mce_timer_fn(unsigned long data)
* polling interval, otherwise increase the polling interval.
*/
iv = __this_cpu_read(mce_next_interval);
- if (mce_notify_irq()) {
+ notify = mce_notify_irq();
+ notify |= cmc_error_seen();
+ if (notify) {
iv = max(iv / 2, (unsigned long) HZ/100);
} else {
iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
@@ -1638,15 +1646,15 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
static void mce_start_timer(unsigned int cpu, struct timer_list *t)
{
- unsigned long iv = mce_adjust_timer(check_interval * HZ);
-
- __this_cpu_write(mce_next_interval, iv);
+ unsigned long iv = check_interval * HZ;
if (mca_cfg.ignore_ce || !iv)
return;
+ per_cpu(mce_next_interval, cpu) = iv;
+
t->expires = round_jiffies(jiffies + iv);
- add_timer_on(t, smp_processor_id());
+ add_timer_on(t, cpu);
}
static void __mcheck_cpu_init_timer(void)
@@ -2272,8 +2280,10 @@ static int mce_device_create(unsigned int cpu)
dev->release = &mce_device_release;
err = device_register(dev);
- if (err)
+ if (err) {
+ put_device(dev);
return err;
+ }
for (i = 0; mce_device_attrs[i]; i++) {
err = device_create_file(dev, mce_device_attrs[i]);
@@ -2421,28 +2431,67 @@ static __init int mcheck_init_device(void)
int err;
int i = 0;
- if (!mce_available(&boot_cpu_data))
- return -EIO;
+ if (!mce_available(&boot_cpu_data)) {
+ err = -EIO;
+ goto err_out;
+ }
- zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL);
+ if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
+ err = -ENOMEM;
+ goto err_out;
+ }
mce_init_banks();
err = subsys_system_register(&mce_subsys, NULL);
if (err)
- return err;
+ goto err_out_mem;
+ cpu_notifier_register_begin();
for_each_online_cpu(i) {
err = mce_device_create(i);
- if (err)
- return err;
+ if (err) {
+ /*
+ * Register notifier anyway (and do not unreg it) so
+ * that we don't leave undeleted timers, see notifier
+ * callback above.
+ */
+ __register_hotcpu_notifier(&mce_cpu_notifier);
+ cpu_notifier_register_done();
+ goto err_device_create;
+ }
}
+ __register_hotcpu_notifier(&mce_cpu_notifier);
+ cpu_notifier_register_done();
+
register_syscore_ops(&mce_syscore_ops);
- register_hotcpu_notifier(&mce_cpu_notifier);
/* register character device /dev/mcelog */
- misc_register(&mce_chrdev_device);
+ err = misc_register(&mce_chrdev_device);
+ if (err)
+ goto err_register;
+
+ return 0;
+
+err_register:
+ unregister_syscore_ops(&mce_syscore_ops);
+
+err_device_create:
+ /*
+ * We didn't keep track of which devices were created above, but
+ * even if we had, the set of online cpus might have changed.
+ * Play safe and remove for every possible cpu, since
+ * mce_device_remove() will do the right thing.
+ */
+ for_each_possible_cpu(i)
+ mce_device_remove(i);
+
+err_out_mem:
+ free_cpumask_var(mce_device_initialized);
+
+err_out:
+ pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err);
return err;
}
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 4cfe0458ca6..9a316b21df8 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -6,10 +6,10 @@
*/
#include <linux/gfp.h>
-#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/percpu.h>
#include <linux/sched.h>
+#include <linux/cpumask.h>
#include <asm/apic.h>
#include <asm/processor.h>
#include <asm/msr.h>
@@ -42,7 +42,7 @@ static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
* cmci_discover_lock protects against parallel discovery attempts
* which could race against each other.
*/
-static DEFINE_RAW_SPINLOCK(cmci_discover_lock);
+static DEFINE_SPINLOCK(cmci_discover_lock);
#define CMCI_THRESHOLD 1
#define CMCI_POLL_INTERVAL (30 * HZ)
@@ -138,6 +138,22 @@ unsigned long mce_intel_adjust_timer(unsigned long interval)
}
}
+static void cmci_storm_disable_banks(void)
+{
+ unsigned long flags, *owned;
+ int bank;
+ u64 val;
+
+ spin_lock_irqsave(&cmci_discover_lock, flags);
+ owned = __get_cpu_var(mce_banks_owned);
+ for_each_set_bit(bank, owned, MAX_NR_BANKS) {
+ rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
+ val &= ~MCI_CTL2_CMCI_EN;
+ wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
+ }
+ spin_unlock_irqrestore(&cmci_discover_lock, flags);
+}
+
static bool cmci_storm_detect(void)
{
unsigned int cnt = __this_cpu_read(cmci_storm_cnt);
@@ -159,7 +175,7 @@ static bool cmci_storm_detect(void)
if (cnt <= CMCI_STORM_THRESHOLD)
return false;
- cmci_clear();
+ cmci_storm_disable_banks();
__this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE);
r = atomic_add_return(1, &cmci_storm_on_cpus);
mce_timer_kick(CMCI_POLL_INTERVAL);
@@ -195,7 +211,7 @@ static void cmci_discover(int banks)
int i;
int bios_wrong_thresh = 0;
- raw_spin_lock_irqsave(&cmci_discover_lock, flags);
+ spin_lock_irqsave(&cmci_discover_lock, flags);
for (i = 0; i < banks; i++) {
u64 val;
int bios_zero_thresh = 0;
@@ -250,7 +266,7 @@ static void cmci_discover(int banks)
WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks)));
}
}
- raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
+ spin_unlock_irqrestore(&cmci_discover_lock, flags);
if (mca_cfg.bios_cmci_threshold && bios_wrong_thresh) {
pr_info_once(
"bios_cmci_threshold: Some banks do not have valid thresholds set\n");
@@ -300,10 +316,10 @@ void cmci_clear(void)
if (!cmci_supported(&banks))
return;
- raw_spin_lock_irqsave(&cmci_discover_lock, flags);
+ spin_lock_irqsave(&cmci_discover_lock, flags);
for (i = 0; i < banks; i++)
__cmci_disable_bank(i);
- raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
+ spin_unlock_irqrestore(&cmci_discover_lock, flags);
}
static void cmci_rediscover_work_func(void *arg)
@@ -344,9 +360,9 @@ void cmci_disable_bank(int bank)
if (!cmci_supported(&banks))
return;
- raw_spin_lock_irqsave(&cmci_discover_lock, flags);
+ spin_lock_irqsave(&cmci_discover_lock, flags);
__cmci_disable_bank(bank);
- raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
+ spin_unlock_irqrestore(&cmci_discover_lock, flags);
}
static void intel_init_cmci(void)
diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c
index 1c044b1ccc5..a3042989398 100644
--- a/arch/x86/kernel/cpu/mcheck/p5.c
+++ b/arch/x86/kernel/cpu/mcheck/p5.c
@@ -5,7 +5,6 @@
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <linux/types.h>
-#include <linux/init.h>
#include <linux/smp.h>
#include <asm/processor.h>
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 3eec7de76ef..36a1bb6d1ee 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -271,9 +271,6 @@ static void thermal_throttle_remove_dev(struct device *dev)
sysfs_remove_group(&dev->kobj, &thermal_attr_group);
}
-/* Mutex protecting device creation against CPU hotplug: */
-static DEFINE_MUTEX(therm_cpu_lock);
-
/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int
thermal_throttle_cpu_callback(struct notifier_block *nfb,
@@ -289,18 +286,14 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb,
switch (action) {
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
- mutex_lock(&therm_cpu_lock);
err = thermal_throttle_add_dev(dev, cpu);
- mutex_unlock(&therm_cpu_lock);
WARN_ON(err);
break;
case CPU_UP_CANCELED:
case CPU_UP_CANCELED_FROZEN:
case CPU_DEAD:
case CPU_DEAD_FROZEN:
- mutex_lock(&therm_cpu_lock);
thermal_throttle_remove_dev(dev);
- mutex_unlock(&therm_cpu_lock);
break;
}
return notifier_from_errno(err);
@@ -319,19 +312,16 @@ static __init int thermal_throttle_init_device(void)
if (!atomic_read(&therm_throt_en))
return 0;
- register_hotcpu_notifier(&thermal_throttle_cpu_notifier);
+ cpu_notifier_register_begin();
-#ifdef CONFIG_HOTPLUG_CPU
- mutex_lock(&therm_cpu_lock);
-#endif
/* connect live CPUs to sysfs */
for_each_online_cpu(cpu) {
err = thermal_throttle_add_dev(get_cpu_device(cpu), cpu);
WARN_ON(err);
}
-#ifdef CONFIG_HOTPLUG_CPU
- mutex_unlock(&therm_cpu_lock);
-#endif
+
+ __register_hotcpu_notifier(&thermal_throttle_cpu_notifier);
+ cpu_notifier_register_done();
return 0;
}
@@ -439,14 +429,14 @@ static inline void __smp_thermal_interrupt(void)
smp_thermal_vector();
}
-asmlinkage void smp_thermal_interrupt(struct pt_regs *regs)
+asmlinkage __visible void smp_thermal_interrupt(struct pt_regs *regs)
{
entering_irq();
__smp_thermal_interrupt();
exiting_ack_irq();
}
-asmlinkage void smp_trace_thermal_interrupt(struct pt_regs *regs)
+asmlinkage __visible void smp_trace_thermal_interrupt(struct pt_regs *regs)
{
entering_irq();
trace_thermal_apic_entry(THERMAL_APIC_VECTOR);
diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c
index fe6b1c86645..7245980186e 100644
--- a/arch/x86/kernel/cpu/mcheck/threshold.c
+++ b/arch/x86/kernel/cpu/mcheck/threshold.c
@@ -24,14 +24,14 @@ static inline void __smp_threshold_interrupt(void)
mce_threshold_vector();
}
-asmlinkage void smp_threshold_interrupt(void)
+asmlinkage __visible void smp_threshold_interrupt(void)
{
entering_irq();
__smp_threshold_interrupt();
exiting_ack_irq();
}
-asmlinkage void smp_trace_threshold_interrupt(void)
+asmlinkage __visible void smp_trace_threshold_interrupt(void)
{
entering_irq();
trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR);
diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c
index e9a701aecaa..7dc5564d0cd 100644
--- a/arch/x86/kernel/cpu/mcheck/winchip.c
+++ b/arch/x86/kernel/cpu/mcheck/winchip.c
@@ -5,7 +5,6 @@
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <linux/types.h>
-#include <linux/init.h>
#include <asm/processor.h>
#include <asm/mce.h>
diff --git a/arch/x86/kernel/cpu/microcode/Makefile b/arch/x86/kernel/cpu/microcode/Makefile
new file mode 100644
index 00000000000..285c85427c3
--- /dev/null
+++ b/arch/x86/kernel/cpu/microcode/Makefile
@@ -0,0 +1,7 @@
+microcode-y := core.o
+obj-$(CONFIG_MICROCODE) += microcode.o
+microcode-$(CONFIG_MICROCODE_INTEL) += intel.o intel_lib.o
+microcode-$(CONFIG_MICROCODE_AMD) += amd.o
+obj-$(CONFIG_MICROCODE_EARLY) += core_early.o
+obj-$(CONFIG_MICROCODE_INTEL_EARLY) += intel_early.o
+obj-$(CONFIG_MICROCODE_AMD_EARLY) += amd_early.o
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index af99f71aeb7..8fffd845e22 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -182,10 +182,10 @@ int __apply_microcode_amd(struct microcode_amd *mc_amd)
{
u32 rev, dummy;
- wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code);
+ native_wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code);
/* verify patch application was successful */
- rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
+ native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
if (rev != mc_amd->hdr.patch_id)
return -1;
@@ -332,6 +332,9 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover)
patch->patch_id = mc_hdr->patch_id;
patch->equiv_cpu = proc_id;
+ pr_debug("%s: Added patch_id: 0x%08x, proc_id: 0x%04x\n",
+ __func__, patch->patch_id, proc_id);
+
/* ... and add to cache. */
update_cache(patch);
@@ -390,9 +393,9 @@ enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size)
if (cpu_data(smp_processor_id()).cpu_index == boot_cpu_data.cpu_index) {
struct ucode_patch *p = find_patch(smp_processor_id());
if (p) {
- memset(amd_bsp_mpb, 0, MPB_MAX_SIZE);
- memcpy(amd_bsp_mpb, p->data, min_t(u32, ksize(p->data),
- MPB_MAX_SIZE));
+ memset(amd_ucode_patch, 0, PATCH_MAX_SIZE);
+ memcpy(amd_ucode_patch, p->data, min_t(u32, ksize(p->data),
+ PATCH_MAX_SIZE));
}
}
#endif
@@ -430,8 +433,8 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device,
if (c->x86 >= 0x15)
snprintf(fw_name, sizeof(fw_name), "amd-ucode/microcode_amd_fam%.2xh.bin", c->x86);
- if (request_firmware(&fw, (const char *)fw_name, device)) {
- pr_err("failed to load file %s\n", fw_name);
+ if (request_firmware_direct(&fw, (const char *)fw_name, device)) {
+ pr_debug("failed to load file %s\n", fw_name);
goto out;
}
diff --git a/arch/x86/kernel/microcode_amd_early.c b/arch/x86/kernel/cpu/microcode/amd_early.c
index 6073104ccaa..617a9e28424 100644
--- a/arch/x86/kernel/microcode_amd_early.c
+++ b/arch/x86/kernel/cpu/microcode/amd_early.c
@@ -2,6 +2,7 @@
* Copyright (C) 2013 Advanced Micro Devices, Inc.
*
* Author: Jacob Shin <jacob.shin@amd.com>
+ * Fixes: Borislav Petkov <bp@suse.de>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -15,10 +16,18 @@
#include <asm/setup.h>
#include <asm/microcode_amd.h>
-static bool ucode_loaded;
+/*
+ * This points to the current valid container of microcode patches which we will
+ * save from the initrd before jettisoning its contents.
+ */
+static u8 *container;
+static size_t container_size;
+
static u32 ucode_new_rev;
-static unsigned long ucode_offset;
-static size_t ucode_size;
+u8 amd_ucode_patch[PATCH_MAX_SIZE];
+static u16 this_equiv_id;
+
+struct cpio_data ucode_cpio;
/*
* Microcode patch container file is prepended to the initrd in cpio format.
@@ -32,9 +41,6 @@ static struct cpio_data __init find_ucode_in_initrd(void)
char *path;
void *start;
size_t size;
- unsigned long *uoffset;
- size_t *usize;
- struct cpio_data cd;
#ifdef CONFIG_X86_32
struct boot_params *p;
@@ -47,30 +53,50 @@ static struct cpio_data __init find_ucode_in_initrd(void)
path = (char *)__pa_nodebug(ucode_path);
start = (void *)p->hdr.ramdisk_image;
size = p->hdr.ramdisk_size;
- uoffset = (unsigned long *)__pa_nodebug(&ucode_offset);
- usize = (size_t *)__pa_nodebug(&ucode_size);
#else
path = ucode_path;
start = (void *)(boot_params.hdr.ramdisk_image + PAGE_OFFSET);
size = boot_params.hdr.ramdisk_size;
- uoffset = &ucode_offset;
- usize = &ucode_size;
#endif
- cd = find_cpio_data(path, start, size, &offset);
- if (!cd.data)
- return cd;
+ return find_cpio_data(path, start, size, &offset);
+}
- if (*(u32 *)cd.data != UCODE_MAGIC) {
- cd.data = NULL;
- cd.size = 0;
- return cd;
- }
+static size_t compute_container_size(u8 *data, u32 total_size)
+{
+ size_t size = 0;
+ u32 *header = (u32 *)data;
+
+ if (header[0] != UCODE_MAGIC ||
+ header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */
+ header[2] == 0) /* size */
+ return size;
+
+ size = header[2] + CONTAINER_HDR_SZ;
+ total_size -= size;
+ data += size;
- *uoffset = (u8 *)cd.data - (u8 *)start;
- *usize = cd.size;
+ while (total_size) {
+ u16 patch_size;
+
+ header = (u32 *)data;
+
+ if (header[0] != UCODE_UCODE_TYPE)
+ break;
+
+ /*
+ * Sanity-check patch size.
+ */
+ patch_size = header[1];
+ if (patch_size > PATCH_MAX_SIZE)
+ break;
+
+ size += patch_size + SECTION_HDR_SIZE;
+ data += patch_size + SECTION_HDR_SIZE;
+ total_size -= patch_size + SECTION_HDR_SIZE;
+ }
- return cd;
+ return size;
}
/*
@@ -85,23 +111,22 @@ static struct cpio_data __init find_ucode_in_initrd(void)
static void apply_ucode_in_initrd(void *ucode, size_t size)
{
struct equiv_cpu_entry *eq;
+ size_t *cont_sz;
u32 *header;
- u8 *data;
+ u8 *data, **cont;
u16 eq_id = 0;
int offset, left;
- u32 rev, eax;
+ u32 rev, eax, ebx, ecx, edx;
u32 *new_rev;
- unsigned long *uoffset;
- size_t *usize;
#ifdef CONFIG_X86_32
new_rev = (u32 *)__pa_nodebug(&ucode_new_rev);
- uoffset = (unsigned long *)__pa_nodebug(&ucode_offset);
- usize = (size_t *)__pa_nodebug(&ucode_size);
+ cont_sz = (size_t *)__pa_nodebug(&container_size);
+ cont = (u8 **)__pa_nodebug(&container);
#else
new_rev = &ucode_new_rev;
- uoffset = &ucode_offset;
- usize = &ucode_size;
+ cont_sz = &container_size;
+ cont = &container;
#endif
data = ucode;
@@ -109,23 +134,37 @@ static void apply_ucode_in_initrd(void *ucode, size_t size)
header = (u32 *)data;
/* find equiv cpu table */
-
- if (header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */
+ if (header[0] != UCODE_MAGIC ||
+ header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */
header[2] == 0) /* size */
return;
- eax = cpuid_eax(0x00000001);
+ eax = 0x00000001;
+ ecx = 0;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
while (left > 0) {
eq = (struct equiv_cpu_entry *)(data + CONTAINER_HDR_SZ);
+ *cont = data;
+
+ /* Advance past the container header */
offset = header[2] + CONTAINER_HDR_SZ;
data += offset;
left -= offset;
eq_id = find_equiv_id(eq, eax);
- if (eq_id)
+ if (eq_id) {
+ this_equiv_id = eq_id;
+ *cont_sz = compute_container_size(*cont, left + offset);
+
+ /*
+ * truncate how much we need to iterate over in the
+ * ucode update loop below
+ */
+ left = *cont_sz - offset;
break;
+ }
/*
* support multiple container files appended together. if this
@@ -145,19 +184,18 @@ static void apply_ucode_in_initrd(void *ucode, size_t size)
/* mark where the next microcode container file starts */
offset = data - (u8 *)ucode;
- *uoffset += offset;
- *usize -= offset;
ucode = data;
}
if (!eq_id) {
- *usize = 0;
+ *cont = NULL;
+ *cont_sz = 0;
return;
}
/* find ucode and update if needed */
- rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax);
+ native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax);
while (left > 0) {
struct microcode_amd *mc;
@@ -168,134 +206,190 @@ static void apply_ucode_in_initrd(void *ucode, size_t size)
break;
mc = (struct microcode_amd *)(data + SECTION_HDR_SIZE);
- if (eq_id == mc->hdr.processor_rev_id && rev < mc->hdr.patch_id)
- if (__apply_microcode_amd(mc) == 0) {
+
+ if (eq_id == mc->hdr.processor_rev_id && rev < mc->hdr.patch_id) {
+
+ if (!__apply_microcode_amd(mc)) {
rev = mc->hdr.patch_id;
*new_rev = rev;
+
+ /* save ucode patch */
+ memcpy(amd_ucode_patch, mc,
+ min_t(u32, header[1], PATCH_MAX_SIZE));
}
+ }
offset = header[1] + SECTION_HDR_SIZE;
data += offset;
left -= offset;
}
-
- /* mark where this microcode container file ends */
- offset = *usize - (data - (u8 *)ucode);
- *usize -= offset;
-
- if (!(*new_rev))
- *usize = 0;
}
void __init load_ucode_amd_bsp(void)
{
- struct cpio_data cd = find_ucode_in_initrd();
- if (!cd.data)
+ struct cpio_data cp;
+ void **data;
+ size_t *size;
+
+#ifdef CONFIG_X86_32
+ data = (void **)__pa_nodebug(&ucode_cpio.data);
+ size = (size_t *)__pa_nodebug(&ucode_cpio.size);
+#else
+ data = &ucode_cpio.data;
+ size = &ucode_cpio.size;
+#endif
+
+ cp = find_ucode_in_initrd();
+ if (!cp.data)
return;
- apply_ucode_in_initrd(cd.data, cd.size);
+ *data = cp.data;
+ *size = cp.size;
+
+ apply_ucode_in_initrd(cp.data, cp.size);
}
#ifdef CONFIG_X86_32
-u8 amd_bsp_mpb[MPB_MAX_SIZE];
-
/*
* On 32-bit, since AP's early load occurs before paging is turned on, we
* cannot traverse cpu_equiv_table and pcache in kernel heap memory. So during
* cold boot, AP will apply_ucode_in_initrd() just like the BSP. During
- * save_microcode_in_initrd_amd() BSP's patch is copied to amd_bsp_mpb, which
- * is used upon resume from suspend.
+ * save_microcode_in_initrd_amd() BSP's patch is copied to amd_ucode_patch,
+ * which is used upon resume from suspend.
*/
void load_ucode_amd_ap(void)
{
struct microcode_amd *mc;
- unsigned long *initrd;
- unsigned long *uoffset;
size_t *usize;
- void *ucode;
+ void **ucode;
- mc = (struct microcode_amd *)__pa(amd_bsp_mpb);
+ mc = (struct microcode_amd *)__pa(amd_ucode_patch);
if (mc->hdr.patch_id && mc->hdr.processor_rev_id) {
__apply_microcode_amd(mc);
return;
}
- initrd = (unsigned long *)__pa(&initrd_start);
- uoffset = (unsigned long *)__pa(&ucode_offset);
- usize = (size_t *)__pa(&ucode_size);
+ ucode = (void *)__pa_nodebug(&container);
+ usize = (size_t *)__pa_nodebug(&container_size);
- if (!*usize || !*initrd)
+ if (!*ucode || !*usize)
return;
- ucode = (void *)((unsigned long)__pa(*initrd) + *uoffset);
- apply_ucode_in_initrd(ucode, *usize);
+ apply_ucode_in_initrd(*ucode, *usize);
}
static void __init collect_cpu_sig_on_bsp(void *arg)
{
unsigned int cpu = smp_processor_id();
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
uci->cpu_sig.sig = cpuid_eax(0x00000001);
}
+
+static void __init get_bsp_sig(void)
+{
+ unsigned int bsp = boot_cpu_data.cpu_index;
+ struct ucode_cpu_info *uci = ucode_cpu_info + bsp;
+
+ if (!uci->cpu_sig.sig)
+ smp_call_function_single(bsp, collect_cpu_sig_on_bsp, NULL, 1);
+}
#else
void load_ucode_amd_ap(void)
{
unsigned int cpu = smp_processor_id();
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+ struct equiv_cpu_entry *eq;
+ struct microcode_amd *mc;
u32 rev, eax;
+ u16 eq_id;
+
+ /* Exit if called on the BSP. */
+ if (!cpu)
+ return;
+
+ if (!container)
+ return;
rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax);
- eax = cpuid_eax(0x00000001);
uci->cpu_sig.rev = rev;
uci->cpu_sig.sig = eax;
- if (cpu && !ucode_loaded) {
- void *ucode;
+ eax = cpuid_eax(0x00000001);
+ eq = (struct equiv_cpu_entry *)(container + CONTAINER_HDR_SZ);
+
+ eq_id = find_equiv_id(eq, eax);
+ if (!eq_id)
+ return;
- if (!ucode_size || !initrd_start)
- return;
+ if (eq_id == this_equiv_id) {
+ mc = (struct microcode_amd *)amd_ucode_patch;
+
+ if (mc && rev < mc->hdr.patch_id) {
+ if (!__apply_microcode_amd(mc))
+ ucode_new_rev = mc->hdr.patch_id;
+ }
- ucode = (void *)(initrd_start + ucode_offset);
- eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
- if (load_microcode_amd(eax, ucode, ucode_size) != UCODE_OK)
+ } else {
+ if (!ucode_cpio.data)
return;
- ucode_loaded = true;
+ /*
+ * AP has a different equivalence ID than BSP, looks like
+ * mixed-steppings silicon so go through the ucode blob anew.
+ */
+ apply_ucode_in_initrd(ucode_cpio.data, ucode_cpio.size);
}
-
- apply_microcode_amd(cpu);
}
#endif
int __init save_microcode_in_initrd_amd(void)
{
+ unsigned long cont;
enum ucode_state ret;
- void *ucode;
u32 eax;
-#ifdef CONFIG_X86_32
- unsigned int bsp = boot_cpu_data.cpu_index;
- struct ucode_cpu_info *uci = ucode_cpu_info + bsp;
+ if (!container)
+ return -EINVAL;
- if (!uci->cpu_sig.sig)
- smp_call_function_single(bsp, collect_cpu_sig_on_bsp, NULL, 1);
+#ifdef CONFIG_X86_32
+ get_bsp_sig();
+ cont = (unsigned long)container;
+#else
+ /*
+ * We need the physical address of the container for both bitness since
+ * boot_params.hdr.ramdisk_image is a physical address.
+ */
+ cont = __pa(container);
#endif
+
+ /*
+ * Take into account the fact that the ramdisk might get relocated and
+ * therefore we need to recompute the container's position in virtual
+ * memory space.
+ */
+ if (relocated_ramdisk)
+ container = (u8 *)(__va(relocated_ramdisk) +
+ (cont - boot_params.hdr.ramdisk_image));
+
if (ucode_new_rev)
pr_info("microcode: updated early to new patch_level=0x%08x\n",
ucode_new_rev);
- if (ucode_loaded || !ucode_size || !initrd_start)
- return 0;
-
- ucode = (void *)(initrd_start + ucode_offset);
eax = cpuid_eax(0x00000001);
eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
- ret = load_microcode_amd(eax, ucode, ucode_size);
+ ret = load_microcode_amd(eax, container, container_size);
if (ret != UCODE_OK)
return -EINVAL;
- ucode_loaded = true;
+ /*
+ * This will be freed any msec now, stash patches for the current
+ * family and switch to patch cache for cpu hotplug, etc later.
+ */
+ container = NULL;
+ container_size = 0;
+
return 0;
}
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/cpu/microcode/core.c
index 15c987698b0..dd9d6190b08 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -97,6 +97,9 @@ MODULE_LICENSE("GPL");
static struct microcode_ops *microcode_ops;
+bool dis_ucode_ldr;
+module_param(dis_ucode_ldr, bool, 0);
+
/*
* Synchronization.
*
@@ -546,6 +549,9 @@ static int __init microcode_init(void)
struct cpuinfo_x86 *c = &cpu_data(0);
int error;
+ if (dis_ucode_ldr)
+ return 0;
+
if (c->x86_vendor == X86_VENDOR_INTEL)
microcode_ops = init_intel_microcode();
else if (c->x86_vendor == X86_VENDOR_AMD)
diff --git a/arch/x86/kernel/microcode_core_early.c b/arch/x86/kernel/cpu/microcode/core_early.c
index be7f8514f57..5f28a64e71e 100644
--- a/arch/x86/kernel/microcode_core_early.c
+++ b/arch/x86/kernel/cpu/microcode/core_early.c
@@ -17,9 +17,11 @@
* 2 of the License, or (at your option) any later version.
*/
#include <linux/module.h>
+#include <asm/microcode.h>
#include <asm/microcode_intel.h>
#include <asm/microcode_amd.h>
#include <asm/processor.h>
+#include <asm/cmdline.h>
#define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24))
#define CPUID_INTEL1 QCHAR('G', 'e', 'n', 'u')
@@ -72,10 +74,33 @@ static int x86_family(void)
return x86;
}
+static bool __init check_loader_disabled_bsp(void)
+{
+#ifdef CONFIG_X86_32
+ const char *cmdline = (const char *)__pa_nodebug(boot_command_line);
+ const char *opt = "dis_ucode_ldr";
+ const char *option = (const char *)__pa_nodebug(opt);
+ bool *res = (bool *)__pa_nodebug(&dis_ucode_ldr);
+
+#else /* CONFIG_X86_64 */
+ const char *cmdline = boot_command_line;
+ const char *option = "dis_ucode_ldr";
+ bool *res = &dis_ucode_ldr;
+#endif
+
+ if (cmdline_find_option_bool(cmdline, option))
+ *res = true;
+
+ return *res;
+}
+
void __init load_ucode_bsp(void)
{
int vendor, x86;
+ if (check_loader_disabled_bsp())
+ return;
+
if (!have_cpuid_p())
return;
@@ -96,10 +121,22 @@ void __init load_ucode_bsp(void)
}
}
+static bool check_loader_disabled_ap(void)
+{
+#ifdef CONFIG_X86_32
+ return __pa_nodebug(dis_ucode_ldr);
+#else
+ return dis_ucode_ldr;
+#endif
+}
+
void load_ucode_ap(void)
{
int vendor, x86;
+ if (check_loader_disabled_ap())
+ return;
+
if (!have_cpuid_p())
return;
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 5fb2cebf556..a276fa75d9b 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -278,7 +278,7 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device,
sprintf(name, "intel-ucode/%02x-%02x-%02x",
c->x86, c->x86_model, c->x86_mask);
- if (request_firmware(&firmware, name, device)) {
+ if (request_firmware_direct(&firmware, name, device)) {
pr_debug("data file %s load failed\n", name);
return UCODE_NFOUND;
}
diff --git a/arch/x86/kernel/microcode_intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c
index 1575deb2e63..18f739129e7 100644
--- a/arch/x86/kernel/microcode_intel_early.c
+++ b/arch/x86/kernel/cpu/microcode/intel_early.c
@@ -365,16 +365,6 @@ out:
return state;
}
-#define native_rdmsr(msr, val1, val2) \
-do { \
- u64 __val = native_read_msr((msr)); \
- (void)((val1) = (u32)__val); \
- (void)((val2) = (u32)(__val >> 32)); \
-} while (0)
-
-#define native_wrmsr(msr, low, high) \
- native_write_msr(msr, low, high);
-
static int collect_cpu_info_early(struct ucode_cpu_info *uci)
{
unsigned int val[2];
diff --git a/arch/x86/kernel/microcode_intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c
index ce69320d017..ce69320d017 100644
--- a/arch/x86/kernel/microcode_intel_lib.c
+++ b/arch/x86/kernel/cpu/microcode/intel_lib.c
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 71a39f3621b..a450373e8e9 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -15,7 +15,9 @@
#include <linux/clocksource.h>
#include <linux/module.h>
#include <linux/hardirq.h>
+#include <linux/efi.h>
#include <linux/interrupt.h>
+#include <linux/irq.h>
#include <asm/processor.h>
#include <asm/hypervisor.h>
#include <asm/hyperv.h>
@@ -23,10 +25,52 @@
#include <asm/desc.h>
#include <asm/idle.h>
#include <asm/irq_regs.h>
+#include <asm/i8259.h>
+#include <asm/apic.h>
+#include <asm/timer.h>
struct ms_hyperv_info ms_hyperv;
EXPORT_SYMBOL_GPL(ms_hyperv);
+#if IS_ENABLED(CONFIG_HYPERV)
+static void (*vmbus_handler)(void);
+
+void hyperv_vector_handler(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+
+ irq_enter();
+ exit_idle();
+
+ inc_irq_stat(irq_hv_callback_count);
+ if (vmbus_handler)
+ vmbus_handler();
+
+ irq_exit();
+ set_irq_regs(old_regs);
+}
+
+void hv_setup_vmbus_irq(void (*handler)(void))
+{
+ vmbus_handler = handler;
+ /*
+ * Setup the IDT for hypervisor callback. Prevent reallocation
+ * at module reload.
+ */
+ if (!test_bit(HYPERVISOR_CALLBACK_VECTOR, used_vectors))
+ alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR,
+ hyperv_callback_vector);
+}
+
+void hv_remove_vmbus_irq(void)
+{
+ /* We have no way to deallocate the interrupt gate */
+ vmbus_handler = NULL;
+}
+EXPORT_SYMBOL_GPL(hv_setup_vmbus_irq);
+EXPORT_SYMBOL_GPL(hv_remove_vmbus_irq);
+#endif
+
static uint32_t __init ms_hyperv_platform(void)
{
u32 eax;
@@ -76,8 +120,28 @@ static void __init ms_hyperv_init_platform(void)
printk(KERN_INFO "HyperV: features 0x%x, hints 0x%x\n",
ms_hyperv.features, ms_hyperv.hints);
+#ifdef CONFIG_X86_LOCAL_APIC
+ if (ms_hyperv.features & HV_X64_MSR_APIC_FREQUENCY_AVAILABLE) {
+ /*
+ * Get the APIC frequency.
+ */
+ u64 hv_lapic_frequency;
+
+ rdmsrl(HV_X64_MSR_APIC_FREQUENCY, hv_lapic_frequency);
+ hv_lapic_frequency = div_u64(hv_lapic_frequency, HZ);
+ lapic_timer_frequency = hv_lapic_frequency;
+ printk(KERN_INFO "HyperV: LAPIC Timer Frequency: %#x\n",
+ lapic_timer_frequency);
+ }
+#endif
+
if (ms_hyperv.features & HV_X64_MSR_TIME_REF_COUNT_AVAILABLE)
clocksource_register_hz(&hyperv_cs, NSEC_PER_SEC/100);
+
+#ifdef CONFIG_X86_IO_APIC
+ no_timer_check = 1;
+#endif
+
}
const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
@@ -86,41 +150,3 @@ const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
.init_platform = ms_hyperv_init_platform,
};
EXPORT_SYMBOL(x86_hyper_ms_hyperv);
-
-#if IS_ENABLED(CONFIG_HYPERV)
-static int vmbus_irq = -1;
-static irq_handler_t vmbus_isr;
-
-void hv_register_vmbus_handler(int irq, irq_handler_t handler)
-{
- /*
- * Setup the IDT for hypervisor callback.
- */
- alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, hyperv_callback_vector);
-
- vmbus_irq = irq;
- vmbus_isr = handler;
-}
-
-void hyperv_vector_handler(struct pt_regs *regs)
-{
- struct pt_regs *old_regs = set_irq_regs(regs);
- struct irq_desc *desc;
-
- irq_enter();
- exit_idle();
-
- desc = irq_to_desc(vmbus_irq);
-
- if (desc)
- generic_handle_irq_desc(vmbus_irq, desc);
-
- irq_exit();
- set_irq_regs(old_regs);
-}
-#else
-void hv_register_vmbus_handler(int irq, irq_handler_t handler)
-{
-}
-#endif
-EXPORT_SYMBOL_GPL(hv_register_vmbus_handler);
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index ce2d0a2c3e4..0e25a1bc5ab 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -683,7 +683,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
}
/* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */
- count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
__flush_tlb();
/* Save MTRR state */
@@ -697,7 +697,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
static void post_set(void) __releases(set_atomicity_lock)
{
/* Flush TLBs (no need to flush caches - they are disabled) */
- count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
__flush_tlb();
/* Intel (P6) standard MTRRs */
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 897783b3302..2879ecdaac4 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -118,6 +118,9 @@ static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
continue;
if (event->attr.config1 & ~er->valid_mask)
return -EINVAL;
+ /* Check if the extra msrs can be safely accessed*/
+ if (!er->extra_msr_access)
+ return -ENXIO;
reg->idx = er->idx;
reg->config = event->attr.config1;
@@ -303,15 +306,6 @@ int x86_setup_perfctr(struct perf_event *event)
hwc->sample_period = x86_pmu.max_period;
hwc->last_period = hwc->sample_period;
local64_set(&hwc->period_left, hwc->sample_period);
- } else {
- /*
- * If we have a PMU initialized but no APIC
- * interrupts, we cannot sample hardware
- * events (user-space has to fall back and
- * sample via a hrtimer based software event):
- */
- if (!x86_pmu.apic)
- return -EOPNOTSUPP;
}
if (attr->type == PERF_TYPE_RAW)
@@ -721,6 +715,7 @@ int perf_assign_events(struct perf_event **events, int n,
return sched.state.unassigned;
}
+EXPORT_SYMBOL_GPL(perf_assign_events);
int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
{
@@ -892,7 +887,6 @@ static void x86_pmu_enable(struct pmu *pmu)
* hw_perf_group_sched_in() or x86_pmu_enable()
*
* step1: save events moving to new counters
- * step2: reprogram moved events into new counters
*/
for (i = 0; i < n_running; i++) {
event = cpuc->event_list[i];
@@ -918,6 +912,9 @@ static void x86_pmu_enable(struct pmu *pmu)
x86_pmu_stop(event, PERF_EF_UPDATE);
}
+ /*
+ * step2: reprogram moved events into new counters
+ */
for (i = 0; i < cpuc->n_events; i++) {
event = cpuc->event_list[i];
hwc = &event->hw;
@@ -1043,7 +1040,7 @@ static int x86_pmu_add(struct perf_event *event, int flags)
/*
* If group events scheduling transaction was started,
* skip the schedulability test here, it will be performed
- * at commit time (->commit_txn) as a whole
+ * at commit time (->commit_txn) as a whole.
*/
if (cpuc->group_flag & PERF_EVENT_TXN)
goto done_collect;
@@ -1058,6 +1055,10 @@ static int x86_pmu_add(struct perf_event *event, int flags)
memcpy(cpuc->assign, assign, n*sizeof(int));
done_collect:
+ /*
+ * Commit the collect_events() state. See x86_pmu_del() and
+ * x86_pmu_*_txn().
+ */
cpuc->n_events = n;
cpuc->n_added += n - n0;
cpuc->n_txn += n - n0;
@@ -1183,25 +1184,38 @@ static void x86_pmu_del(struct perf_event *event, int flags)
* If we're called during a txn, we don't need to do anything.
* The events never got scheduled and ->cancel_txn will truncate
* the event_list.
+ *
+ * XXX assumes any ->del() called during a TXN will only be on
+ * an event added during that same TXN.
*/
if (cpuc->group_flag & PERF_EVENT_TXN)
return;
+ /*
+ * Not a TXN, therefore cleanup properly.
+ */
x86_pmu_stop(event, PERF_EF_UPDATE);
for (i = 0; i < cpuc->n_events; i++) {
- if (event == cpuc->event_list[i]) {
+ if (event == cpuc->event_list[i])
+ break;
+ }
- if (x86_pmu.put_event_constraints)
- x86_pmu.put_event_constraints(cpuc, event);
+ if (WARN_ON_ONCE(i == cpuc->n_events)) /* called ->del() without ->add() ? */
+ return;
- while (++i < cpuc->n_events)
- cpuc->event_list[i-1] = cpuc->event_list[i];
+ /* If we have a newly added event; make sure to decrease n_added. */
+ if (i >= cpuc->n_events - cpuc->n_added)
+ --cpuc->n_added;
+
+ if (x86_pmu.put_event_constraints)
+ x86_pmu.put_event_constraints(cpuc, event);
+
+ /* Delete the array entry. */
+ while (++i < cpuc->n_events)
+ cpuc->event_list[i-1] = cpuc->event_list[i];
+ --cpuc->n_events;
- --cpuc->n_events;
- break;
- }
- }
perf_event_update_userpage(event);
}
@@ -1273,24 +1287,25 @@ void perf_events_lapic_init(void)
apic_write(APIC_LVTPC, APIC_DM_NMI);
}
-static int __kprobes
+static int
perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs)
{
- int ret;
u64 start_clock;
u64 finish_clock;
+ int ret;
if (!atomic_read(&active_events))
return NMI_DONE;
- start_clock = local_clock();
+ start_clock = sched_clock();
ret = x86_pmu.handle_irq(regs);
- finish_clock = local_clock();
+ finish_clock = sched_clock();
perf_sample_event_took(finish_clock - start_clock);
return ret;
}
+NOKPROBE_SYMBOL(perf_event_nmi_handler);
struct event_constraint emptyconstraint;
struct event_constraint unconstrained;
@@ -1346,6 +1361,15 @@ static void __init pmu_check_apic(void)
x86_pmu.apic = 0;
pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
pr_info("no hardware sampling interrupt available.\n");
+
+ /*
+ * If we have a PMU initialized but no APIC
+ * interrupts, we cannot sample hardware
+ * events (user-space has to fall back and
+ * sample via a hrtimer based software event):
+ */
+ pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT;
+
}
static struct attribute_group x86_pmu_format_group = {
@@ -1521,6 +1545,8 @@ static int __init init_hw_perf_events(void)
pr_cont("%s PMU driver.\n", x86_pmu.name);
+ x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */
+
for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next)
quirk->func();
@@ -1534,7 +1560,6 @@ static int __init init_hw_perf_events(void)
__EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,
0, x86_pmu.num_counters, 0, 0);
- x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */
x86_pmu_format_group.attrs = x86_pmu.format_attrs;
if (x86_pmu.event_attrs)
@@ -1594,7 +1619,8 @@ static void x86_pmu_cancel_txn(struct pmu *pmu)
{
__this_cpu_and(cpu_hw_events.group_flag, ~PERF_EVENT_TXN);
/*
- * Truncate the collected events.
+ * Truncate collected array by the number of events added in this
+ * transaction. See x86_pmu_add() and x86_pmu_*_txn().
*/
__this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn));
__this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn));
@@ -1605,6 +1631,8 @@ static void x86_pmu_cancel_txn(struct pmu *pmu)
* Commit group events scheduling transaction
* Perform the group schedulability test as a whole
* Return 0 if success
+ *
+ * Does not cancel the transaction on failure; expects the caller to do this.
*/
static int x86_pmu_commit_txn(struct pmu *pmu)
{
@@ -1820,9 +1848,12 @@ static ssize_t set_attr_rdpmc(struct device *cdev,
if (ret)
return ret;
+ if (x86_pmu.attr_rdpmc_broken)
+ return -ENOTSUPP;
+
if (!!val != !!x86_pmu.attr_rdpmc) {
x86_pmu.attr_rdpmc = !!val;
- smp_call_function(change_rdpmc, (void *)val, 1);
+ on_each_cpu(change_rdpmc, (void *)val, 1);
}
return count;
@@ -1883,26 +1914,27 @@ static struct pmu pmu = {
void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
{
+ struct cyc2ns_data *data;
+
userpg->cap_user_time = 0;
userpg->cap_user_time_zero = 0;
userpg->cap_user_rdpmc = x86_pmu.attr_rdpmc;
userpg->pmc_width = x86_pmu.cntval_bits;
- if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
+ if (!sched_clock_stable())
return;
- if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
- return;
+ data = cyc2ns_read_begin();
userpg->cap_user_time = 1;
- userpg->time_mult = this_cpu_read(cyc2ns);
- userpg->time_shift = CYC2NS_SCALE_FACTOR;
- userpg->time_offset = this_cpu_read(cyc2ns_offset) - now;
+ userpg->time_mult = data->cyc2ns_mul;
+ userpg->time_shift = data->cyc2ns_shift;
+ userpg->time_offset = data->cyc2ns_offset - now;
- if (sched_clock_stable && !check_tsc_disabled()) {
- userpg->cap_user_time_zero = 1;
- userpg->time_zero = this_cpu_read(cyc2ns_offset);
- }
+ userpg->cap_user_time_zero = 1;
+ userpg->time_zero = data->cyc2ns_offset;
+
+ cyc2ns_read_end(data);
}
/*
@@ -1994,7 +2026,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
frame.return_address = 0;
bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
- if (bytes != sizeof(frame))
+ if (bytes != 0)
break;
if (!valid_user_frame(fp, sizeof(frame)))
@@ -2046,7 +2078,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
frame.return_address = 0;
bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
- if (bytes != sizeof(frame))
+ if (bytes != 0)
break;
if (!valid_user_frame(fp, sizeof(frame)))
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index cc16faae053..8ade93111e0 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -130,9 +130,11 @@ struct cpu_hw_events {
unsigned long running[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
int enabled;
- int n_events;
- int n_added;
- int n_txn;
+ int n_events; /* the # of events in the below arrays */
+ int n_added; /* the # last events in the below arrays;
+ they've never been enabled yet */
+ int n_txn; /* the # last events in the below arrays;
+ added in the current transaction */
int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
u64 tags[X86_PMC_IDX_MAX];
struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
@@ -164,6 +166,11 @@ struct cpu_hw_events {
struct perf_guest_switch_msr guest_switch_msrs[X86_PMC_IDX_MAX];
/*
+ * Intel checkpoint mask
+ */
+ u64 intel_cp_status;
+
+ /*
* manage shared (per-core, per-cpu) registers
* used on Intel NHM/WSM/SNB
*/
@@ -257,11 +264,20 @@ struct cpu_hw_events {
__EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \
HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW)
-#define EVENT_CONSTRAINT_END \
- EVENT_CONSTRAINT(0, 0, 0)
+/*
+ * We define the end marker as having a weight of -1
+ * to enable blacklisting of events using a counter bitmask
+ * of zero and thus a weight of zero.
+ * The end marker has a weight that cannot possibly be
+ * obtained from counting the bits in the bitmask.
+ */
+#define EVENT_CONSTRAINT_END { .weight = -1 }
+/*
+ * Check for end marker with weight == -1
+ */
#define for_each_event_constraint(e, c) \
- for ((e) = (c); (e)->weight; (e)++)
+ for ((e) = (c); (e)->weight != -1; (e)++)
/*
* Extra registers for specific events.
@@ -279,14 +295,16 @@ struct extra_reg {
u64 config_mask;
u64 valid_mask;
int idx; /* per_xxx->regs[] reg index */
+ bool extra_msr_access;
};
#define EVENT_EXTRA_REG(e, ms, m, vm, i) { \
- .event = (e), \
- .msr = (ms), \
- .config_mask = (m), \
- .valid_mask = (vm), \
- .idx = EXTRA_REG_##i, \
+ .event = (e), \
+ .msr = (ms), \
+ .config_mask = (m), \
+ .valid_mask = (vm), \
+ .idx = EXTRA_REG_##i, \
+ .extra_msr_access = true, \
}
#define INTEL_EVENT_EXTRA_REG(event, msr, vm, idx) \
@@ -395,6 +413,7 @@ struct x86_pmu {
/*
* sysfs attrs
*/
+ int attr_rdpmc_broken;
int attr_rdpmc;
struct attribute **format_attrs;
struct attribute **event_attrs;
@@ -440,6 +459,7 @@ struct x86_pmu {
int lbr_nr; /* hardware stack size */
u64 lbr_sel_mask; /* LBR_SELECT valid bits */
const int *lbr_sel_map; /* lbr_select mappings */
+ bool lbr_double_abort; /* duplicated lbr aborts */
/*
* Extra registers for events
diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
index e09f0bfb7b8..cbb1be3ed9e 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
@@ -10,6 +10,7 @@
#include <linux/module.h>
#include <linux/pci.h>
#include <linux/ptrace.h>
+#include <linux/syscore_ops.h>
#include <asm/apic.h>
@@ -592,7 +593,7 @@ out:
return 1;
}
-static int __kprobes
+static int
perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs)
{
int handled = 0;
@@ -605,6 +606,7 @@ perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs)
return handled;
}
+NOKPROBE_SYMBOL(perf_ibs_nmi_handler);
static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name)
{
@@ -816,6 +818,18 @@ out:
return ret;
}
+static void ibs_eilvt_setup(void)
+{
+ /*
+ * Force LVT offset assignment for family 10h: The offsets are
+ * not assigned by the BIOS for this family, so the OS is
+ * responsible for doing it. If the OS assignment fails, fall
+ * back to BIOS settings and try to setup this.
+ */
+ if (boot_cpu_data.x86 == 0x10)
+ force_ibs_eilvt_setup();
+}
+
static inline int get_ibs_lvt_offset(void)
{
u64 val;
@@ -851,6 +865,36 @@ static void clear_APIC_ibs(void *dummy)
setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1);
}
+#ifdef CONFIG_PM
+
+static int perf_ibs_suspend(void)
+{
+ clear_APIC_ibs(NULL);
+ return 0;
+}
+
+static void perf_ibs_resume(void)
+{
+ ibs_eilvt_setup();
+ setup_APIC_ibs(NULL);
+}
+
+static struct syscore_ops perf_ibs_syscore_ops = {
+ .resume = perf_ibs_resume,
+ .suspend = perf_ibs_suspend,
+};
+
+static void perf_ibs_pm_init(void)
+{
+ register_syscore_ops(&perf_ibs_syscore_ops);
+}
+
+#else
+
+static inline void perf_ibs_pm_init(void) { }
+
+#endif
+
static int
perf_ibs_cpu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
{
@@ -877,25 +921,19 @@ static __init int amd_ibs_init(void)
if (!caps)
return -ENODEV; /* ibs not supported by the cpu */
- /*
- * Force LVT offset assignment for family 10h: The offsets are
- * not assigned by the BIOS for this family, so the OS is
- * responsible for doing it. If the OS assignment fails, fall
- * back to BIOS settings and try to setup this.
- */
- if (boot_cpu_data.x86 == 0x10)
- force_ibs_eilvt_setup();
+ ibs_eilvt_setup();
if (!ibs_eilvt_valid())
goto out;
- get_online_cpus();
+ perf_ibs_pm_init();
+ cpu_notifier_register_begin();
ibs_caps = caps;
/* make ibs_caps visible to other cpus: */
smp_mb();
- perf_cpu_notifier(perf_ibs_cpu_notifier);
smp_call_function(setup_APIC_ibs, NULL, 1);
- put_online_cpus();
+ __perf_cpu_notifier(perf_ibs_cpu_notifier);
+ cpu_notifier_register_done();
ret = perf_event_ibs_init();
out:
diff --git a/arch/x86/kernel/cpu/perf_event_amd_uncore.c b/arch/x86/kernel/cpu/perf_event_amd_uncore.c
index 754291adec3..3bbdf4cd38b 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_uncore.c
@@ -531,15 +531,16 @@ static int __init amd_uncore_init(void)
if (ret)
return -ENODEV;
- get_online_cpus();
+ cpu_notifier_register_begin();
+
/* init cpus already online before registering for hotplug notifier */
for_each_online_cpu(cpu) {
amd_uncore_cpu_up_prepare(cpu);
smp_call_function_single(cpu, init_cpu_already_online, NULL, 1);
}
- register_cpu_notifier(&amd_uncore_cpu_notifier_block);
- put_online_cpus();
+ __register_cpu_notifier(&amd_uncore_cpu_notifier_block);
+ cpu_notifier_register_done();
return 0;
}
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index f31a1655d1f..2502d0d9d24 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -169,7 +169,6 @@ static struct event_constraint intel_slm_event_constraints[] __read_mostly =
{
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
- FIXED_EVENT_CONSTRAINT(0x013c, 2), /* CPU_CLK_UNHALTED.REF */
FIXED_EVENT_CONSTRAINT(0x0300, 2), /* pseudo CPU_CLK_UNHALTED.REF */
EVENT_CONSTRAINT_END
};
@@ -190,9 +189,9 @@ static struct extra_reg intel_snbep_extra_regs[] __read_mostly = {
EVENT_EXTRA_END
};
-EVENT_ATTR_STR(mem-loads, mem_ld_nhm, "event=0x0b,umask=0x10,ldlat=3");
-EVENT_ATTR_STR(mem-loads, mem_ld_snb, "event=0xcd,umask=0x1,ldlat=3");
-EVENT_ATTR_STR(mem-stores, mem_st_snb, "event=0xcd,umask=0x2");
+EVENT_ATTR_STR(mem-loads, mem_ld_nhm, "event=0x0b,umask=0x10,ldlat=3");
+EVENT_ATTR_STR(mem-loads, mem_ld_snb, "event=0xcd,umask=0x1,ldlat=3");
+EVENT_ATTR_STR(mem-stores, mem_st_snb, "event=0xcd,umask=0x2");
struct attribute *nhm_events_attrs[] = {
EVENT_PTR(mem_ld_nhm),
@@ -1184,6 +1183,11 @@ static void intel_pmu_disable_fixed(struct hw_perf_event *hwc)
wrmsrl(hwc->config_base, ctrl_val);
}
+static inline bool event_is_checkpointed(struct perf_event *event)
+{
+ return (event->hw.config & HSW_IN_TX_CHECKPOINTED) != 0;
+}
+
static void intel_pmu_disable_event(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
@@ -1197,6 +1201,7 @@ static void intel_pmu_disable_event(struct perf_event *event)
cpuc->intel_ctrl_guest_mask &= ~(1ull << hwc->idx);
cpuc->intel_ctrl_host_mask &= ~(1ull << hwc->idx);
+ cpuc->intel_cp_status &= ~(1ull << hwc->idx);
/*
* must disable before any actual event
@@ -1271,6 +1276,9 @@ static void intel_pmu_enable_event(struct perf_event *event)
if (event->attr.exclude_guest)
cpuc->intel_ctrl_host_mask |= (1ull << hwc->idx);
+ if (unlikely(event_is_checkpointed(event)))
+ cpuc->intel_cp_status |= (1ull << hwc->idx);
+
if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
intel_pmu_enable_fixed(hwc);
return;
@@ -1289,6 +1297,17 @@ static void intel_pmu_enable_event(struct perf_event *event)
int intel_pmu_save_and_restart(struct perf_event *event)
{
x86_perf_event_update(event);
+ /*
+ * For a checkpointed counter always reset back to 0. This
+ * avoids a situation where the counter overflows, aborts the
+ * transaction and is then set back to shortly before the
+ * overflow, and overflows and aborts again.
+ */
+ if (unlikely(event_is_checkpointed(event))) {
+ /* No race with NMIs because the counter should not be armed */
+ wrmsrl(event->hw.event_base, 0);
+ local64_set(&event->hw.prev_count, 0);
+ }
return x86_perf_event_set_period(event);
}
@@ -1341,10 +1360,8 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
intel_pmu_disable_all();
handled = intel_pmu_drain_bts_buffer();
status = intel_pmu_get_status();
- if (!status) {
- intel_pmu_enable_all(0);
- return handled;
- }
+ if (!status)
+ goto done;
loops = 0;
again:
@@ -1365,6 +1382,15 @@ again:
intel_pmu_lbr_read();
/*
+ * CondChgd bit 63 doesn't mean any overflow status. Ignore
+ * and clear the bit.
+ */
+ if (__test_and_clear_bit(63, (unsigned long *)&status)) {
+ if (!status)
+ goto done;
+ }
+
+ /*
* PEBS overflow sets bit 62 in the global status register
*/
if (__test_and_clear_bit(62, (unsigned long *)&status)) {
@@ -1372,6 +1398,13 @@ again:
x86_pmu.drain_pebs(regs);
}
+ /*
+ * Checkpointed counters can lead to 'spurious' PMIs because the
+ * rollback caused by the PMI will have cleared the overflow status
+ * bit. Therefore always force probe these counters.
+ */
+ status |= cpuc->intel_cp_status;
+
for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
struct perf_event *event = cpuc->events[bit];
@@ -1837,6 +1870,20 @@ static int hsw_hw_config(struct perf_event *event)
event->attr.precise_ip > 0))
return -EOPNOTSUPP;
+ if (event_is_checkpointed(event)) {
+ /*
+ * Sampling of checkpointed events can cause situations where
+ * the CPU constantly aborts because of a overflow, which is
+ * then checkpointed back and ignored. Forbid checkpointing
+ * for sampling.
+ *
+ * But still allow a long sampling period, so that perf stat
+ * from KVM works.
+ */
+ if (event->attr.sample_period > 0 &&
+ event->attr.sample_period < 0x7fffffff)
+ return -EOPNOTSUPP;
+ }
return 0;
}
@@ -2135,6 +2182,41 @@ static void intel_snb_check_microcode(void)
}
}
+/*
+ * Under certain circumstances, access certain MSR may cause #GP.
+ * The function tests if the input MSR can be safely accessed.
+ */
+static bool check_msr(unsigned long msr, u64 mask)
+{
+ u64 val_old, val_new, val_tmp;
+
+ /*
+ * Read the current value, change it and read it back to see if it
+ * matches, this is needed to detect certain hardware emulators
+ * (qemu/kvm) that don't trap on the MSR access and always return 0s.
+ */
+ if (rdmsrl_safe(msr, &val_old))
+ return false;
+
+ /*
+ * Only change the bits which can be updated by wrmsrl.
+ */
+ val_tmp = val_old ^ mask;
+ if (wrmsrl_safe(msr, val_tmp) ||
+ rdmsrl_safe(msr, &val_new))
+ return false;
+
+ if (val_new != val_tmp)
+ return false;
+
+ /* Here it's sure that the MSR can be safely accessed.
+ * Restore the old value and return.
+ */
+ wrmsrl(msr, val_old);
+
+ return true;
+}
+
static __init void intel_sandybridge_quirk(void)
{
x86_pmu.check_microcode = intel_snb_check_microcode;
@@ -2182,10 +2264,36 @@ static __init void intel_nehalem_quirk(void)
}
}
-EVENT_ATTR_STR(mem-loads, mem_ld_hsw, "event=0xcd,umask=0x1,ldlat=3");
-EVENT_ATTR_STR(mem-stores, mem_st_hsw, "event=0xd0,umask=0x82")
+EVENT_ATTR_STR(mem-loads, mem_ld_hsw, "event=0xcd,umask=0x1,ldlat=3");
+EVENT_ATTR_STR(mem-stores, mem_st_hsw, "event=0xd0,umask=0x82")
+
+/* Haswell special events */
+EVENT_ATTR_STR(tx-start, tx_start, "event=0xc9,umask=0x1");
+EVENT_ATTR_STR(tx-commit, tx_commit, "event=0xc9,umask=0x2");
+EVENT_ATTR_STR(tx-abort, tx_abort, "event=0xc9,umask=0x4");
+EVENT_ATTR_STR(tx-capacity, tx_capacity, "event=0x54,umask=0x2");
+EVENT_ATTR_STR(tx-conflict, tx_conflict, "event=0x54,umask=0x1");
+EVENT_ATTR_STR(el-start, el_start, "event=0xc8,umask=0x1");
+EVENT_ATTR_STR(el-commit, el_commit, "event=0xc8,umask=0x2");
+EVENT_ATTR_STR(el-abort, el_abort, "event=0xc8,umask=0x4");
+EVENT_ATTR_STR(el-capacity, el_capacity, "event=0x54,umask=0x2");
+EVENT_ATTR_STR(el-conflict, el_conflict, "event=0x54,umask=0x1");
+EVENT_ATTR_STR(cycles-t, cycles_t, "event=0x3c,in_tx=1");
+EVENT_ATTR_STR(cycles-ct, cycles_ct, "event=0x3c,in_tx=1,in_tx_cp=1");
static struct attribute *hsw_events_attrs[] = {
+ EVENT_PTR(tx_start),
+ EVENT_PTR(tx_commit),
+ EVENT_PTR(tx_abort),
+ EVENT_PTR(tx_capacity),
+ EVENT_PTR(tx_conflict),
+ EVENT_PTR(el_start),
+ EVENT_PTR(el_commit),
+ EVENT_PTR(el_abort),
+ EVENT_PTR(el_capacity),
+ EVENT_PTR(el_conflict),
+ EVENT_PTR(cycles_t),
+ EVENT_PTR(cycles_ct),
EVENT_PTR(mem_ld_hsw),
EVENT_PTR(mem_st_hsw),
NULL
@@ -2198,7 +2306,8 @@ __init int intel_pmu_init(void)
union cpuid10_ebx ebx;
struct event_constraint *c;
unsigned int unused;
- int version;
+ struct extra_reg *er;
+ int version, i;
if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
switch (boot_cpu_data.x86) {
@@ -2243,10 +2352,7 @@ __init int intel_pmu_init(void)
if (version > 1)
x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3);
- /*
- * v2 and above have a perf capabilities MSR
- */
- if (version > 1) {
+ if (boot_cpu_has(X86_FEATURE_PDCM)) {
u64 capabilities;
rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities);
@@ -2404,6 +2510,9 @@ __init int intel_pmu_init(void)
case 62: /* IvyBridge EP */
memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
+ /* dTLB-load-misses on IVB is different than SNB */
+ hw_cache_event_ids[C(DTLB)][C(OP_READ)][C(RESULT_MISS)] = 0x8108; /* DTLB_LOAD_MISSES.DEMAND_LD_MISS_CAUSES_A_WALK */
+
memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs,
sizeof(hw_cache_extra_regs));
@@ -2452,6 +2561,7 @@ __init int intel_pmu_init(void)
x86_pmu.hw_config = hsw_hw_config;
x86_pmu.get_event_constraints = hsw_get_event_constraints;
x86_pmu.cpu_events = hsw_events_attrs;
+ x86_pmu.lbr_double_abort = true;
pr_cont("Haswell events, ");
break;
@@ -2503,6 +2613,34 @@ __init int intel_pmu_init(void)
}
}
+ /*
+ * Access LBR MSR may cause #GP under certain circumstances.
+ * E.g. KVM doesn't support LBR MSR
+ * Check all LBT MSR here.
+ * Disable LBR access if any LBR MSRs can not be accessed.
+ */
+ if (x86_pmu.lbr_nr && !check_msr(x86_pmu.lbr_tos, 0x3UL))
+ x86_pmu.lbr_nr = 0;
+ for (i = 0; i < x86_pmu.lbr_nr; i++) {
+ if (!(check_msr(x86_pmu.lbr_from + i, 0xffffUL) &&
+ check_msr(x86_pmu.lbr_to + i, 0xffffUL)))
+ x86_pmu.lbr_nr = 0;
+ }
+
+ /*
+ * Access extra MSR may cause #GP under certain circumstances.
+ * E.g. KVM doesn't support offcore event
+ * Check all extra_regs here.
+ */
+ if (x86_pmu.extra_regs) {
+ for (er = x86_pmu.extra_regs; er->msr; er++) {
+ er->extra_msr_access = check_msr(er->msr, 0x1ffUL);
+ /* Disable LBR select mapping */
+ if ((er->idx == EXTRA_REG_LBR) && !er->extra_msr_access)
+ x86_pmu.lbr_sel_map = NULL;
+ }
+ }
+
/* Support full width counters using alternative MSR range */
if (x86_pmu.intel_cap.full_width_write) {
x86_pmu.max_period = x86_pmu.cntval_mask;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index ab3ba1c1b7d..696ade311de 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -12,6 +12,7 @@
#define BTS_BUFFER_SIZE (PAGE_SIZE << 4)
#define PEBS_BUFFER_SIZE PAGE_SIZE
+#define PEBS_FIXUP_SIZE PAGE_SIZE
/*
* pebs_record_32 for p4 and core not supported
@@ -107,15 +108,31 @@ static u64 precise_store_data(u64 status)
return val;
}
-static u64 precise_store_data_hsw(u64 status)
+static u64 precise_store_data_hsw(struct perf_event *event, u64 status)
{
union perf_mem_data_src dse;
+ u64 cfg = event->hw.config & INTEL_ARCH_EVENT_MASK;
dse.val = 0;
dse.mem_op = PERF_MEM_OP_STORE;
dse.mem_lvl = PERF_MEM_LVL_NA;
+
+ /*
+ * L1 info only valid for following events:
+ *
+ * MEM_UOPS_RETIRED.STLB_MISS_STORES
+ * MEM_UOPS_RETIRED.LOCK_STORES
+ * MEM_UOPS_RETIRED.SPLIT_STORES
+ * MEM_UOPS_RETIRED.ALL_STORES
+ */
+ if (cfg != 0x12d0 && cfg != 0x22d0 && cfg != 0x42d0 && cfg != 0x82d0)
+ return dse.mem_lvl;
+
if (status & 1)
- dse.mem_lvl = PERF_MEM_LVL_L1;
+ dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT;
+ else
+ dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_MISS;
+
/* Nothing else supported. Sorry. */
return dse.val;
}
@@ -182,18 +199,32 @@ struct pebs_record_nhm {
* Same as pebs_record_nhm, with two additional fields.
*/
struct pebs_record_hsw {
- struct pebs_record_nhm nhm;
- /*
- * Real IP of the event. In the Intel documentation this
- * is called eventingrip.
- */
- u64 real_ip;
- /*
- * TSX tuning information field: abort cycles and abort flags.
- */
- u64 tsx_tuning;
+ u64 flags, ip;
+ u64 ax, bx, cx, dx;
+ u64 si, di, bp, sp;
+ u64 r8, r9, r10, r11;
+ u64 r12, r13, r14, r15;
+ u64 status, dla, dse, lat;
+ u64 real_ip, tsx_tuning;
+};
+
+union hsw_tsx_tuning {
+ struct {
+ u32 cycles_last_block : 32,
+ hle_abort : 1,
+ rtm_abort : 1,
+ instruction_abort : 1,
+ non_instruction_abort : 1,
+ retry : 1,
+ data_conflict : 1,
+ capacity_writes : 1,
+ capacity_reads : 1;
+ };
+ u64 value;
};
+#define PEBS_HSW_TSX_FLAGS 0xff00000000ULL
+
void init_debug_store_on_cpu(int cpu)
{
struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
@@ -214,12 +245,14 @@ void fini_debug_store_on_cpu(int cpu)
wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
}
+static DEFINE_PER_CPU(void *, insn_buffer);
+
static int alloc_pebs_buffer(int cpu)
{
struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
int node = cpu_to_node(cpu);
int max, thresh = 1; /* always use a single PEBS record */
- void *buffer;
+ void *buffer, *ibuffer;
if (!x86_pmu.pebs)
return 0;
@@ -228,6 +261,19 @@ static int alloc_pebs_buffer(int cpu)
if (unlikely(!buffer))
return -ENOMEM;
+ /*
+ * HSW+ already provides us the eventing ip; no need to allocate this
+ * buffer then.
+ */
+ if (x86_pmu.intel_cap.pebs_format < 2) {
+ ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);
+ if (!ibuffer) {
+ kfree(buffer);
+ return -ENOMEM;
+ }
+ per_cpu(insn_buffer, cpu) = ibuffer;
+ }
+
max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size;
ds->pebs_buffer_base = (u64)(unsigned long)buffer;
@@ -248,6 +294,9 @@ static void release_pebs_buffer(int cpu)
if (!ds || !x86_pmu.pebs)
return;
+ kfree(per_cpu(insn_buffer, cpu));
+ per_cpu(insn_buffer, cpu) = NULL;
+
kfree((void *)(unsigned long)ds->pebs_buffer_base);
ds->pebs_buffer_base = 0;
}
@@ -262,9 +311,11 @@ static int alloc_bts_buffer(int cpu)
if (!x86_pmu.bts)
return 0;
- buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL, node);
- if (unlikely(!buffer))
+ buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
+ if (unlikely(!buffer)) {
+ WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__);
return -ENOMEM;
+ }
max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
thresh = max / 16;
@@ -715,6 +766,7 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
unsigned long old_to, to = cpuc->lbr_entries[0].to;
unsigned long ip = regs->ip;
int is_64bit = 0;
+ void *kaddr;
/*
* We don't need to fixup if the PEBS assist is fault like
@@ -738,7 +790,7 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
* unsigned math, either ip is before the start (impossible) or
* the basic block is larger than 1 page (sanity)
*/
- if ((ip - to) > PAGE_SIZE)
+ if ((ip - to) > PEBS_FIXUP_SIZE)
return 0;
/*
@@ -749,29 +801,33 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
return 1;
}
+ if (!kernel_ip(ip)) {
+ int size, bytes;
+ u8 *buf = this_cpu_read(insn_buffer);
+
+ size = ip - to; /* Must fit our buffer, see above */
+ bytes = copy_from_user_nmi(buf, (void __user *)to, size);
+ if (bytes != 0)
+ return 0;
+
+ kaddr = buf;
+ } else {
+ kaddr = (void *)to;
+ }
+
do {
struct insn insn;
- u8 buf[MAX_INSN_SIZE];
- void *kaddr;
old_to = to;
- if (!kernel_ip(ip)) {
- int bytes, size = MAX_INSN_SIZE;
-
- bytes = copy_from_user_nmi(buf, (void __user *)to, size);
- if (bytes != size)
- return 0;
-
- kaddr = buf;
- } else
- kaddr = (void *)to;
#ifdef CONFIG_X86_64
is_64bit = kernel_ip(to) || !test_thread_flag(TIF_IA32);
#endif
insn_init(&insn, kaddr, is_64bit);
insn_get_length(&insn);
+
to += insn.length;
+ kaddr += insn.length;
} while (to < ip);
if (to == ip) {
@@ -786,16 +842,34 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
return 0;
}
+static inline u64 intel_hsw_weight(struct pebs_record_hsw *pebs)
+{
+ if (pebs->tsx_tuning) {
+ union hsw_tsx_tuning tsx = { .value = pebs->tsx_tuning };
+ return tsx.cycles_last_block;
+ }
+ return 0;
+}
+
+static inline u64 intel_hsw_transaction(struct pebs_record_hsw *pebs)
+{
+ u64 txn = (pebs->tsx_tuning & PEBS_HSW_TSX_FLAGS) >> 32;
+
+ /* For RTM XABORTs also log the abort code from AX */
+ if ((txn & PERF_TXN_TRANSACTION) && (pebs->ax & 1))
+ txn |= ((pebs->ax >> 24) & 0xff) << PERF_TXN_ABORT_SHIFT;
+ return txn;
+}
+
static void __intel_pmu_pebs_event(struct perf_event *event,
struct pt_regs *iregs, void *__pebs)
{
/*
- * We cast to pebs_record_nhm to get the load latency data
- * if extra_reg MSR_PEBS_LD_LAT_THRESHOLD used
+ * We cast to the biggest pebs_record but are careful not to
+ * unconditionally access the 'extra' entries.
*/
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- struct pebs_record_nhm *pebs = __pebs;
- struct pebs_record_hsw *pebs_hsw = __pebs;
+ struct pebs_record_hsw *pebs = __pebs;
struct perf_sample_data data;
struct pt_regs regs;
u64 sample_type;
@@ -831,7 +905,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
data.data_src.val = load_latency_data(pebs->dse);
else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW)
data.data_src.val =
- precise_store_data_hsw(pebs->dse);
+ precise_store_data_hsw(event, pebs->dse);
else
data.data_src.val = precise_store_data(pebs->dse);
}
@@ -854,7 +928,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
regs.sp = pebs->sp;
if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format >= 2) {
- regs.ip = pebs_hsw->real_ip;
+ regs.ip = pebs->real_ip;
regs.flags |= PERF_EFLAGS_EXACT;
} else if (event->attr.precise_ip > 1 && intel_pmu_pebs_fixup_ip(&regs))
regs.flags |= PERF_EFLAGS_EXACT;
@@ -862,9 +936,18 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
regs.flags &= ~PERF_EFLAGS_EXACT;
if ((event->attr.sample_type & PERF_SAMPLE_ADDR) &&
- x86_pmu.intel_cap.pebs_format >= 1)
+ x86_pmu.intel_cap.pebs_format >= 1)
data.addr = pebs->dla;
+ if (x86_pmu.intel_cap.pebs_format >= 2) {
+ /* Only set the TSX weight when no memory weight. */
+ if ((event->attr.sample_type & PERF_SAMPLE_WEIGHT) && !fll)
+ data.weight = intel_hsw_weight(pebs);
+
+ if (event->attr.sample_type & PERF_SAMPLE_TRANSACTION)
+ data.txn = intel_hsw_transaction(pebs);
+ }
+
if (has_branch_stack(event))
data.br_stack = &cpuc->lbr_stack;
@@ -913,17 +996,34 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
__intel_pmu_pebs_event(event, iregs, at);
}
-static void __intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, void *at,
- void *top)
+static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
struct debug_store *ds = cpuc->ds;
struct perf_event *event = NULL;
+ void *at, *top;
u64 status = 0;
int bit;
+ if (!x86_pmu.pebs_active)
+ return;
+
+ at = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base;
+ top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index;
+
ds->pebs_index = ds->pebs_buffer_base;
+ if (unlikely(at > top))
+ return;
+
+ /*
+ * Should not happen, we program the threshold at 1 and do not
+ * set a reset value.
+ */
+ WARN_ONCE(top - at > x86_pmu.max_pebs_events * x86_pmu.pebs_record_size,
+ "Unexpected number of pebs records %ld\n",
+ (long)(top - at) / x86_pmu.pebs_record_size);
+
for (; at < top; at += x86_pmu.pebs_record_size) {
struct pebs_record_nhm *p = at;
@@ -951,61 +1051,6 @@ static void __intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, void *at,
}
}
-static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- struct debug_store *ds = cpuc->ds;
- struct pebs_record_nhm *at, *top;
- int n;
-
- if (!x86_pmu.pebs_active)
- return;
-
- at = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base;
- top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index;
-
- ds->pebs_index = ds->pebs_buffer_base;
-
- n = top - at;
- if (n <= 0)
- return;
-
- /*
- * Should not happen, we program the threshold at 1 and do not
- * set a reset value.
- */
- WARN_ONCE(n > x86_pmu.max_pebs_events,
- "Unexpected number of pebs records %d\n", n);
-
- return __intel_pmu_drain_pebs_nhm(iregs, at, top);
-}
-
-static void intel_pmu_drain_pebs_hsw(struct pt_regs *iregs)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- struct debug_store *ds = cpuc->ds;
- struct pebs_record_hsw *at, *top;
- int n;
-
- if (!x86_pmu.pebs_active)
- return;
-
- at = (struct pebs_record_hsw *)(unsigned long)ds->pebs_buffer_base;
- top = (struct pebs_record_hsw *)(unsigned long)ds->pebs_index;
-
- n = top - at;
- if (n <= 0)
- return;
- /*
- * Should not happen, we program the threshold at 1 and do not
- * set a reset value.
- */
- WARN_ONCE(n > x86_pmu.max_pebs_events,
- "Unexpected number of pebs records %d\n", n);
-
- return __intel_pmu_drain_pebs_nhm(iregs, at, top);
-}
-
/*
* BTS, PEBS probe and setup
*/
@@ -1040,7 +1085,7 @@ void intel_ds_init(void)
case 2:
pr_cont("PEBS fmt2%c, ", pebs_type);
x86_pmu.pebs_record_size = sizeof(struct pebs_record_hsw);
- x86_pmu.drain_pebs = intel_pmu_drain_pebs_hsw;
+ x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
break;
default:
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index d5be06a5005..9dd2459a4c7 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -284,6 +284,7 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
int lbr_format = x86_pmu.intel_cap.lbr_format;
u64 tos = intel_pmu_lbr_tos();
int i;
+ int out = 0;
for (i = 0; i < x86_pmu.lbr_nr; i++) {
unsigned long lbr_idx = (tos - i) & mask;
@@ -306,15 +307,27 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
}
from = (u64)((((s64)from) << skip) >> skip);
- cpuc->lbr_entries[i].from = from;
- cpuc->lbr_entries[i].to = to;
- cpuc->lbr_entries[i].mispred = mis;
- cpuc->lbr_entries[i].predicted = pred;
- cpuc->lbr_entries[i].in_tx = in_tx;
- cpuc->lbr_entries[i].abort = abort;
- cpuc->lbr_entries[i].reserved = 0;
+ /*
+ * Some CPUs report duplicated abort records,
+ * with the second entry not having an abort bit set.
+ * Skip them here. This loop runs backwards,
+ * so we need to undo the previous record.
+ * If the abort just happened outside the window
+ * the extra entry cannot be removed.
+ */
+ if (abort && x86_pmu.lbr_double_abort && out > 0)
+ out--;
+
+ cpuc->lbr_entries[out].from = from;
+ cpuc->lbr_entries[out].to = to;
+ cpuc->lbr_entries[out].mispred = mis;
+ cpuc->lbr_entries[out].predicted = pred;
+ cpuc->lbr_entries[out].in_tx = in_tx;
+ cpuc->lbr_entries[out].abort = abort;
+ cpuc->lbr_entries[out].reserved = 0;
+ out++;
}
- cpuc->lbr_stack.nr = i;
+ cpuc->lbr_stack.nr = out;
}
void intel_pmu_lbr_read(void)
@@ -371,6 +384,9 @@ static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
if (br_type & PERF_SAMPLE_BRANCH_NO_TX)
mask |= X86_BR_NO_TX;
+ if (br_type & PERF_SAMPLE_BRANCH_COND)
+ mask |= X86_BR_JCC;
+
/*
* stash actual user request into reg, it may
* be used by fixup code for some CPU
@@ -478,7 +494,7 @@ static int branch_type(unsigned long from, unsigned long to, int abort)
/* may fail if text not present */
bytes = copy_from_user_nmi(buf, (void __user *)from, size);
- if (bytes != size)
+ if (bytes != 0)
return X86_BR_NONE;
addr = buf;
@@ -665,6 +681,7 @@ static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = {
* NHM/WSM erratum: must include IND_JMP to capture IND_CALL
*/
[PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL | LBR_IND_JMP,
+ [PERF_SAMPLE_BRANCH_COND] = LBR_JCC,
};
static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = {
@@ -676,6 +693,7 @@ static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = {
[PERF_SAMPLE_BRANCH_ANY_CALL] = LBR_REL_CALL | LBR_IND_CALL
| LBR_FAR,
[PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL,
+ [PERF_SAMPLE_BRANCH_COND] = LBR_JCC,
};
/* core */
diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
new file mode 100644
index 00000000000..619f7699487
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
@@ -0,0 +1,714 @@
+/*
+ * perf_event_intel_rapl.c: support Intel RAPL energy consumption counters
+ * Copyright (C) 2013 Google, Inc., Stephane Eranian
+ *
+ * Intel RAPL interface is specified in the IA-32 Manual Vol3b
+ * section 14.7.1 (September 2013)
+ *
+ * RAPL provides more controls than just reporting energy consumption
+ * however here we only expose the 3 energy consumption free running
+ * counters (pp0, pkg, dram).
+ *
+ * Each of those counters increments in a power unit defined by the
+ * RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules
+ * but it can vary.
+ *
+ * Counter to rapl events mappings:
+ *
+ * pp0 counter: consumption of all physical cores (power plane 0)
+ * event: rapl_energy_cores
+ * perf code: 0x1
+ *
+ * pkg counter: consumption of the whole processor package
+ * event: rapl_energy_pkg
+ * perf code: 0x2
+ *
+ * dram counter: consumption of the dram domain (servers only)
+ * event: rapl_energy_dram
+ * perf code: 0x3
+ *
+ * dram counter: consumption of the builtin-gpu domain (client only)
+ * event: rapl_energy_gpu
+ * perf code: 0x4
+ *
+ * We manage those counters as free running (read-only). They may be
+ * use simultaneously by other tools, such as turbostat.
+ *
+ * The events only support system-wide mode counting. There is no
+ * sampling support because it does not make sense and is not
+ * supported by the RAPL hardware.
+ *
+ * Because we want to avoid floating-point operations in the kernel,
+ * the events are all reported in fixed point arithmetic (32.32).
+ * Tools must adjust the counts to convert them to Watts using
+ * the duration of the measurement. Tools may use a function such as
+ * ldexp(raw_count, -32);
+ */
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/perf_event.h>
+#include <asm/cpu_device_id.h>
+#include "perf_event.h"
+
+/*
+ * RAPL energy status counters
+ */
+#define RAPL_IDX_PP0_NRG_STAT 0 /* all cores */
+#define INTEL_RAPL_PP0 0x1 /* pseudo-encoding */
+#define RAPL_IDX_PKG_NRG_STAT 1 /* entire package */
+#define INTEL_RAPL_PKG 0x2 /* pseudo-encoding */
+#define RAPL_IDX_RAM_NRG_STAT 2 /* DRAM */
+#define INTEL_RAPL_RAM 0x3 /* pseudo-encoding */
+#define RAPL_IDX_PP1_NRG_STAT 3 /* gpu */
+#define INTEL_RAPL_PP1 0x4 /* pseudo-encoding */
+
+/* Clients have PP0, PKG */
+#define RAPL_IDX_CLN (1<<RAPL_IDX_PP0_NRG_STAT|\
+ 1<<RAPL_IDX_PKG_NRG_STAT|\
+ 1<<RAPL_IDX_PP1_NRG_STAT)
+
+/* Servers have PP0, PKG, RAM */
+#define RAPL_IDX_SRV (1<<RAPL_IDX_PP0_NRG_STAT|\
+ 1<<RAPL_IDX_PKG_NRG_STAT|\
+ 1<<RAPL_IDX_RAM_NRG_STAT)
+
+/* Servers have PP0, PKG, RAM, PP1 */
+#define RAPL_IDX_HSW (1<<RAPL_IDX_PP0_NRG_STAT|\
+ 1<<RAPL_IDX_PKG_NRG_STAT|\
+ 1<<RAPL_IDX_RAM_NRG_STAT|\
+ 1<<RAPL_IDX_PP1_NRG_STAT)
+
+/*
+ * event code: LSB 8 bits, passed in attr->config
+ * any other bit is reserved
+ */
+#define RAPL_EVENT_MASK 0xFFULL
+
+#define DEFINE_RAPL_FORMAT_ATTR(_var, _name, _format) \
+static ssize_t __rapl_##_var##_show(struct kobject *kobj, \
+ struct kobj_attribute *attr, \
+ char *page) \
+{ \
+ BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \
+ return sprintf(page, _format "\n"); \
+} \
+static struct kobj_attribute format_attr_##_var = \
+ __ATTR(_name, 0444, __rapl_##_var##_show, NULL)
+
+#define RAPL_EVENT_DESC(_name, _config) \
+{ \
+ .attr = __ATTR(_name, 0444, rapl_event_show, NULL), \
+ .config = _config, \
+}
+
+#define RAPL_CNTR_WIDTH 32 /* 32-bit rapl counters */
+
+struct rapl_pmu {
+ spinlock_t lock;
+ int hw_unit; /* 1/2^hw_unit Joule */
+ int n_active; /* number of active events */
+ struct list_head active_list;
+ struct pmu *pmu; /* pointer to rapl_pmu_class */
+ ktime_t timer_interval; /* in ktime_t unit */
+ struct hrtimer hrtimer;
+};
+
+static struct pmu rapl_pmu_class;
+static cpumask_t rapl_cpu_mask;
+static int rapl_cntr_mask;
+
+static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu);
+static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu_to_free);
+
+static inline u64 rapl_read_counter(struct perf_event *event)
+{
+ u64 raw;
+ rdmsrl(event->hw.event_base, raw);
+ return raw;
+}
+
+static inline u64 rapl_scale(u64 v)
+{
+ /*
+ * scale delta to smallest unit (1/2^32)
+ * users must then scale back: count * 1/(1e9*2^32) to get Joules
+ * or use ldexp(count, -32).
+ * Watts = Joules/Time delta
+ */
+ return v << (32 - __get_cpu_var(rapl_pmu)->hw_unit);
+}
+
+static u64 rapl_event_update(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ u64 prev_raw_count, new_raw_count;
+ s64 delta, sdelta;
+ int shift = RAPL_CNTR_WIDTH;
+
+again:
+ prev_raw_count = local64_read(&hwc->prev_count);
+ rdmsrl(event->hw.event_base, new_raw_count);
+
+ if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
+ new_raw_count) != prev_raw_count) {
+ cpu_relax();
+ goto again;
+ }
+
+ /*
+ * Now we have the new raw value and have updated the prev
+ * timestamp already. We can now calculate the elapsed delta
+ * (event-)time and add that to the generic event.
+ *
+ * Careful, not all hw sign-extends above the physical width
+ * of the count.
+ */
+ delta = (new_raw_count << shift) - (prev_raw_count << shift);
+ delta >>= shift;
+
+ sdelta = rapl_scale(delta);
+
+ local64_add(sdelta, &event->count);
+
+ return new_raw_count;
+}
+
+static void rapl_start_hrtimer(struct rapl_pmu *pmu)
+{
+ __hrtimer_start_range_ns(&pmu->hrtimer,
+ pmu->timer_interval, 0,
+ HRTIMER_MODE_REL_PINNED, 0);
+}
+
+static void rapl_stop_hrtimer(struct rapl_pmu *pmu)
+{
+ hrtimer_cancel(&pmu->hrtimer);
+}
+
+static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
+{
+ struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu);
+ struct perf_event *event;
+ unsigned long flags;
+
+ if (!pmu->n_active)
+ return HRTIMER_NORESTART;
+
+ spin_lock_irqsave(&pmu->lock, flags);
+
+ list_for_each_entry(event, &pmu->active_list, active_entry) {
+ rapl_event_update(event);
+ }
+
+ spin_unlock_irqrestore(&pmu->lock, flags);
+
+ hrtimer_forward_now(hrtimer, pmu->timer_interval);
+
+ return HRTIMER_RESTART;
+}
+
+static void rapl_hrtimer_init(struct rapl_pmu *pmu)
+{
+ struct hrtimer *hr = &pmu->hrtimer;
+
+ hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hr->function = rapl_hrtimer_handle;
+}
+
+static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
+ struct perf_event *event)
+{
+ if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
+ return;
+
+ event->hw.state = 0;
+
+ list_add_tail(&event->active_entry, &pmu->active_list);
+
+ local64_set(&event->hw.prev_count, rapl_read_counter(event));
+
+ pmu->n_active++;
+ if (pmu->n_active == 1)
+ rapl_start_hrtimer(pmu);
+}
+
+static void rapl_pmu_event_start(struct perf_event *event, int mode)
+{
+ struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu);
+ unsigned long flags;
+
+ spin_lock_irqsave(&pmu->lock, flags);
+ __rapl_pmu_event_start(pmu, event);
+ spin_unlock_irqrestore(&pmu->lock, flags);
+}
+
+static void rapl_pmu_event_stop(struct perf_event *event, int mode)
+{
+ struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu);
+ struct hw_perf_event *hwc = &event->hw;
+ unsigned long flags;
+
+ spin_lock_irqsave(&pmu->lock, flags);
+
+ /* mark event as deactivated and stopped */
+ if (!(hwc->state & PERF_HES_STOPPED)) {
+ WARN_ON_ONCE(pmu->n_active <= 0);
+ pmu->n_active--;
+ if (pmu->n_active == 0)
+ rapl_stop_hrtimer(pmu);
+
+ list_del(&event->active_entry);
+
+ WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+ hwc->state |= PERF_HES_STOPPED;
+ }
+
+ /* check if update of sw counter is necessary */
+ if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
+ /*
+ * Drain the remaining delta count out of a event
+ * that we are disabling:
+ */
+ rapl_event_update(event);
+ hwc->state |= PERF_HES_UPTODATE;
+ }
+
+ spin_unlock_irqrestore(&pmu->lock, flags);
+}
+
+static int rapl_pmu_event_add(struct perf_event *event, int mode)
+{
+ struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu);
+ struct hw_perf_event *hwc = &event->hw;
+ unsigned long flags;
+
+ spin_lock_irqsave(&pmu->lock, flags);
+
+ hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+
+ if (mode & PERF_EF_START)
+ __rapl_pmu_event_start(pmu, event);
+
+ spin_unlock_irqrestore(&pmu->lock, flags);
+
+ return 0;
+}
+
+static void rapl_pmu_event_del(struct perf_event *event, int flags)
+{
+ rapl_pmu_event_stop(event, PERF_EF_UPDATE);
+}
+
+static int rapl_pmu_event_init(struct perf_event *event)
+{
+ u64 cfg = event->attr.config & RAPL_EVENT_MASK;
+ int bit, msr, ret = 0;
+
+ /* only look at RAPL events */
+ if (event->attr.type != rapl_pmu_class.type)
+ return -ENOENT;
+
+ /* check only supported bits are set */
+ if (event->attr.config & ~RAPL_EVENT_MASK)
+ return -EINVAL;
+
+ /*
+ * check event is known (determines counter)
+ */
+ switch (cfg) {
+ case INTEL_RAPL_PP0:
+ bit = RAPL_IDX_PP0_NRG_STAT;
+ msr = MSR_PP0_ENERGY_STATUS;
+ break;
+ case INTEL_RAPL_PKG:
+ bit = RAPL_IDX_PKG_NRG_STAT;
+ msr = MSR_PKG_ENERGY_STATUS;
+ break;
+ case INTEL_RAPL_RAM:
+ bit = RAPL_IDX_RAM_NRG_STAT;
+ msr = MSR_DRAM_ENERGY_STATUS;
+ break;
+ case INTEL_RAPL_PP1:
+ bit = RAPL_IDX_PP1_NRG_STAT;
+ msr = MSR_PP1_ENERGY_STATUS;
+ break;
+ default:
+ return -EINVAL;
+ }
+ /* check event supported */
+ if (!(rapl_cntr_mask & (1 << bit)))
+ return -EINVAL;
+
+ /* unsupported modes and filters */
+ if (event->attr.exclude_user ||
+ event->attr.exclude_kernel ||
+ event->attr.exclude_hv ||
+ event->attr.exclude_idle ||
+ event->attr.exclude_host ||
+ event->attr.exclude_guest ||
+ event->attr.sample_period) /* no sampling */
+ return -EINVAL;
+
+ /* must be done before validate_group */
+ event->hw.event_base = msr;
+ event->hw.config = cfg;
+ event->hw.idx = bit;
+
+ return ret;
+}
+
+static void rapl_pmu_event_read(struct perf_event *event)
+{
+ rapl_event_update(event);
+}
+
+static ssize_t rapl_get_attr_cpumask(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ int n = cpulist_scnprintf(buf, PAGE_SIZE - 2, &rapl_cpu_mask);
+
+ buf[n++] = '\n';
+ buf[n] = '\0';
+ return n;
+}
+
+static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL);
+
+static struct attribute *rapl_pmu_attrs[] = {
+ &dev_attr_cpumask.attr,
+ NULL,
+};
+
+static struct attribute_group rapl_pmu_attr_group = {
+ .attrs = rapl_pmu_attrs,
+};
+
+EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
+EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02");
+EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03");
+EVENT_ATTR_STR(energy-gpu , rapl_gpu, "event=0x04");
+
+EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules");
+EVENT_ATTR_STR(energy-pkg.unit , rapl_pkg_unit, "Joules");
+EVENT_ATTR_STR(energy-ram.unit , rapl_ram_unit, "Joules");
+EVENT_ATTR_STR(energy-gpu.unit , rapl_gpu_unit, "Joules");
+
+/*
+ * we compute in 0.23 nJ increments regardless of MSR
+ */
+EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10");
+EVENT_ATTR_STR(energy-pkg.scale, rapl_pkg_scale, "2.3283064365386962890625e-10");
+EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10");
+EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10");
+
+static struct attribute *rapl_events_srv_attr[] = {
+ EVENT_PTR(rapl_cores),
+ EVENT_PTR(rapl_pkg),
+ EVENT_PTR(rapl_ram),
+
+ EVENT_PTR(rapl_cores_unit),
+ EVENT_PTR(rapl_pkg_unit),
+ EVENT_PTR(rapl_ram_unit),
+
+ EVENT_PTR(rapl_cores_scale),
+ EVENT_PTR(rapl_pkg_scale),
+ EVENT_PTR(rapl_ram_scale),
+ NULL,
+};
+
+static struct attribute *rapl_events_cln_attr[] = {
+ EVENT_PTR(rapl_cores),
+ EVENT_PTR(rapl_pkg),
+ EVENT_PTR(rapl_gpu),
+
+ EVENT_PTR(rapl_cores_unit),
+ EVENT_PTR(rapl_pkg_unit),
+ EVENT_PTR(rapl_gpu_unit),
+
+ EVENT_PTR(rapl_cores_scale),
+ EVENT_PTR(rapl_pkg_scale),
+ EVENT_PTR(rapl_gpu_scale),
+ NULL,
+};
+
+static struct attribute *rapl_events_hsw_attr[] = {
+ EVENT_PTR(rapl_cores),
+ EVENT_PTR(rapl_pkg),
+ EVENT_PTR(rapl_gpu),
+ EVENT_PTR(rapl_ram),
+
+ EVENT_PTR(rapl_cores_unit),
+ EVENT_PTR(rapl_pkg_unit),
+ EVENT_PTR(rapl_gpu_unit),
+ EVENT_PTR(rapl_ram_unit),
+
+ EVENT_PTR(rapl_cores_scale),
+ EVENT_PTR(rapl_pkg_scale),
+ EVENT_PTR(rapl_gpu_scale),
+ EVENT_PTR(rapl_ram_scale),
+ NULL,
+};
+
+static struct attribute_group rapl_pmu_events_group = {
+ .name = "events",
+ .attrs = NULL, /* patched at runtime */
+};
+
+DEFINE_RAPL_FORMAT_ATTR(event, event, "config:0-7");
+static struct attribute *rapl_formats_attr[] = {
+ &format_attr_event.attr,
+ NULL,
+};
+
+static struct attribute_group rapl_pmu_format_group = {
+ .name = "format",
+ .attrs = rapl_formats_attr,
+};
+
+const struct attribute_group *rapl_attr_groups[] = {
+ &rapl_pmu_attr_group,
+ &rapl_pmu_format_group,
+ &rapl_pmu_events_group,
+ NULL,
+};
+
+static struct pmu rapl_pmu_class = {
+ .attr_groups = rapl_attr_groups,
+ .task_ctx_nr = perf_invalid_context, /* system-wide only */
+ .event_init = rapl_pmu_event_init,
+ .add = rapl_pmu_event_add, /* must have */
+ .del = rapl_pmu_event_del, /* must have */
+ .start = rapl_pmu_event_start,
+ .stop = rapl_pmu_event_stop,
+ .read = rapl_pmu_event_read,
+};
+
+static void rapl_cpu_exit(int cpu)
+{
+ struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
+ int i, phys_id = topology_physical_package_id(cpu);
+ int target = -1;
+
+ /* find a new cpu on same package */
+ for_each_online_cpu(i) {
+ if (i == cpu)
+ continue;
+ if (phys_id == topology_physical_package_id(i)) {
+ target = i;
+ break;
+ }
+ }
+ /*
+ * clear cpu from cpumask
+ * if was set in cpumask and still some cpu on package,
+ * then move to new cpu
+ */
+ if (cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask) && target >= 0)
+ cpumask_set_cpu(target, &rapl_cpu_mask);
+
+ WARN_ON(cpumask_empty(&rapl_cpu_mask));
+ /*
+ * migrate events and context to new cpu
+ */
+ if (target >= 0)
+ perf_pmu_migrate_context(pmu->pmu, cpu, target);
+
+ /* cancel overflow polling timer for CPU */
+ rapl_stop_hrtimer(pmu);
+}
+
+static void rapl_cpu_init(int cpu)
+{
+ int i, phys_id = topology_physical_package_id(cpu);
+
+ /* check if phys_is is already covered */
+ for_each_cpu(i, &rapl_cpu_mask) {
+ if (phys_id == topology_physical_package_id(i))
+ return;
+ }
+ /* was not found, so add it */
+ cpumask_set_cpu(cpu, &rapl_cpu_mask);
+}
+
+static int rapl_cpu_prepare(int cpu)
+{
+ struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
+ int phys_id = topology_physical_package_id(cpu);
+ u64 ms;
+ u64 msr_rapl_power_unit_bits;
+
+ if (pmu)
+ return 0;
+
+ if (phys_id < 0)
+ return -1;
+
+ /* protect rdmsrl() to handle virtualization */
+ if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits))
+ return -1;
+
+ pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
+ if (!pmu)
+ return -1;
+
+ spin_lock_init(&pmu->lock);
+
+ INIT_LIST_HEAD(&pmu->active_list);
+
+ /*
+ * grab power unit as: 1/2^unit Joules
+ *
+ * we cache in local PMU instance
+ */
+ pmu->hw_unit = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
+ pmu->pmu = &rapl_pmu_class;
+
+ /*
+ * use reference of 200W for scaling the timeout
+ * to avoid missing counter overflows.
+ * 200W = 200 Joules/sec
+ * divide interval by 2 to avoid lockstep (2 * 100)
+ * if hw unit is 32, then we use 2 ms 1/200/2
+ */
+ if (pmu->hw_unit < 32)
+ ms = (1000 / (2 * 100)) * (1ULL << (32 - pmu->hw_unit - 1));
+ else
+ ms = 2;
+
+ pmu->timer_interval = ms_to_ktime(ms);
+
+ rapl_hrtimer_init(pmu);
+
+ /* set RAPL pmu for this cpu for now */
+ per_cpu(rapl_pmu, cpu) = pmu;
+ per_cpu(rapl_pmu_to_free, cpu) = NULL;
+
+ return 0;
+}
+
+static void rapl_cpu_kfree(int cpu)
+{
+ struct rapl_pmu *pmu = per_cpu(rapl_pmu_to_free, cpu);
+
+ kfree(pmu);
+
+ per_cpu(rapl_pmu_to_free, cpu) = NULL;
+}
+
+static int rapl_cpu_dying(int cpu)
+{
+ struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
+
+ if (!pmu)
+ return 0;
+
+ per_cpu(rapl_pmu, cpu) = NULL;
+
+ per_cpu(rapl_pmu_to_free, cpu) = pmu;
+
+ return 0;
+}
+
+static int rapl_cpu_notifier(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ unsigned int cpu = (long)hcpu;
+
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_UP_PREPARE:
+ rapl_cpu_prepare(cpu);
+ break;
+ case CPU_STARTING:
+ rapl_cpu_init(cpu);
+ break;
+ case CPU_UP_CANCELED:
+ case CPU_DYING:
+ rapl_cpu_dying(cpu);
+ break;
+ case CPU_ONLINE:
+ case CPU_DEAD:
+ rapl_cpu_kfree(cpu);
+ break;
+ case CPU_DOWN_PREPARE:
+ rapl_cpu_exit(cpu);
+ break;
+ default:
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+static const struct x86_cpu_id rapl_cpu_match[] = {
+ [0] = { .vendor = X86_VENDOR_INTEL, .family = 6 },
+ [1] = {},
+};
+
+static int __init rapl_pmu_init(void)
+{
+ struct rapl_pmu *pmu;
+ int cpu, ret;
+
+ /*
+ * check for Intel processor family 6
+ */
+ if (!x86_match_cpu(rapl_cpu_match))
+ return 0;
+
+ /* check supported CPU */
+ switch (boot_cpu_data.x86_model) {
+ case 42: /* Sandy Bridge */
+ case 58: /* Ivy Bridge */
+ rapl_cntr_mask = RAPL_IDX_CLN;
+ rapl_pmu_events_group.attrs = rapl_events_cln_attr;
+ break;
+ case 60: /* Haswell */
+ case 69: /* Haswell-Celeron */
+ rapl_cntr_mask = RAPL_IDX_HSW;
+ rapl_pmu_events_group.attrs = rapl_events_hsw_attr;
+ break;
+ case 45: /* Sandy Bridge-EP */
+ case 62: /* IvyTown */
+ rapl_cntr_mask = RAPL_IDX_SRV;
+ rapl_pmu_events_group.attrs = rapl_events_srv_attr;
+ break;
+
+ default:
+ /* unsupported */
+ return 0;
+ }
+
+ cpu_notifier_register_begin();
+
+ for_each_online_cpu(cpu) {
+ ret = rapl_cpu_prepare(cpu);
+ if (ret)
+ goto out;
+ rapl_cpu_init(cpu);
+ }
+
+ __perf_cpu_notifier(rapl_cpu_notifier);
+
+ ret = perf_pmu_register(&rapl_pmu_class, "power", -1);
+ if (WARN_ON(ret)) {
+ pr_info("RAPL PMU detected, registration failed (%d), RAPL PMU disabled\n", ret);
+ cpu_notifier_register_done();
+ return -1;
+ }
+
+ pmu = __get_cpu_var(rapl_pmu);
+
+ pr_info("RAPL PMU detected, hw unit 2^-%d Joules,"
+ " API unit is 2^-32 Joules,"
+ " %d fixed counters"
+ " %llu ms ovfl timer\n",
+ pmu->hw_unit,
+ hweight32(rapl_cntr_mask),
+ ktime_to_ms(pmu->timer_interval));
+
+out:
+ cpu_notifier_register_done();
+
+ return 0;
+}
+device_initcall(rapl_pmu_init);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
index 4118f9f6831..ae6552a0701 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -66,6 +66,47 @@ DEFINE_UNCORE_FORMAT_ATTR(mask_vnw, mask_vnw, "config2:3-4");
DEFINE_UNCORE_FORMAT_ATTR(mask0, mask0, "config2:0-31");
DEFINE_UNCORE_FORMAT_ATTR(mask1, mask1, "config2:32-63");
+static void uncore_pmu_start_hrtimer(struct intel_uncore_box *box);
+static void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box);
+static void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event);
+static void uncore_pmu_event_read(struct perf_event *event);
+
+static struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event)
+{
+ return container_of(event->pmu, struct intel_uncore_pmu, pmu);
+}
+
+static struct intel_uncore_box *
+uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu)
+{
+ struct intel_uncore_box *box;
+
+ box = *per_cpu_ptr(pmu->box, cpu);
+ if (box)
+ return box;
+
+ raw_spin_lock(&uncore_box_lock);
+ list_for_each_entry(box, &pmu->box_list, list) {
+ if (box->phys_id == topology_physical_package_id(cpu)) {
+ atomic_inc(&box->refcnt);
+ *per_cpu_ptr(pmu->box, cpu) = box;
+ break;
+ }
+ }
+ raw_spin_unlock(&uncore_box_lock);
+
+ return *per_cpu_ptr(pmu->box, cpu);
+}
+
+static struct intel_uncore_box *uncore_event_to_box(struct perf_event *event)
+{
+ /*
+ * perf core schedules event on the basis of cpu, uncore events are
+ * collected by one of the cpus inside a physical package.
+ */
+ return uncore_pmu_to_box(uncore_event_to_pmu(event), smp_processor_id());
+}
+
static u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event)
{
u64 count;
@@ -501,21 +542,24 @@ static struct extra_reg snbep_uncore_cbox_extra_regs[] = {
SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,
SNBEP_CBO_PMON_CTL_TID_EN, 0x1),
SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4334, 0xffff, 0x6),
SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4534, 0xffff, 0x6),
SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4934, 0xffff, 0x6),
SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0x6),
SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x8),
SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x8),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0xc),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0xc),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0xa),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0xa),
SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x2),
SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x2),
SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x2),
SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x2),
SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x8),
SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x8),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0xc),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0xc),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0xa),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0xa),
SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x2),
SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x2),
SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x2),
@@ -997,6 +1041,20 @@ static int snbep_pci2phy_map_init(int devid)
}
}
+ if (!err) {
+ /*
+ * For PCI bus with no UBOX device, find the next bus
+ * that has UBOX device and use its mapping.
+ */
+ i = -1;
+ for (bus = 255; bus >= 0; bus--) {
+ if (pcibus_to_physid[bus] >= 0)
+ i = pcibus_to_physid[bus];
+ else
+ pcibus_to_physid[bus] = i;
+ }
+ }
+
if (ubox_dev)
pci_dev_put(ubox_dev);
@@ -1099,6 +1157,24 @@ static struct attribute *ivt_uncore_qpi_formats_attr[] = {
&format_attr_umask.attr,
&format_attr_edge.attr,
&format_attr_thresh8.attr,
+ &format_attr_match_rds.attr,
+ &format_attr_match_rnid30.attr,
+ &format_attr_match_rnid4.attr,
+ &format_attr_match_dnid.attr,
+ &format_attr_match_mc.attr,
+ &format_attr_match_opc.attr,
+ &format_attr_match_vnw.attr,
+ &format_attr_match0.attr,
+ &format_attr_match1.attr,
+ &format_attr_mask_rds.attr,
+ &format_attr_mask_rnid30.attr,
+ &format_attr_mask_rnid4.attr,
+ &format_attr_mask_dnid.attr,
+ &format_attr_mask_mc.attr,
+ &format_attr_mask_opc.attr,
+ &format_attr_mask_vnw.attr,
+ &format_attr_mask0.attr,
+ &format_attr_mask1.attr,
NULL,
};
@@ -1146,10 +1222,16 @@ static struct extra_reg ivt_uncore_cbox_extra_regs[] = {
SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,
SNBEP_CBO_PMON_CTL_TID_EN, 0x1),
SNBEP_CBO_EVENT_EXTRA_REG(0x1031, 0x10ff, 0x2),
+
+ SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0xc),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x5134, 0xffff, 0xc),
SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4334, 0xffff, 0xc),
SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4534, 0xffff, 0xc),
SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0xc),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4934, 0xffff, 0xc),
SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x10),
SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x10),
SNBEP_CBO_EVENT_EXTRA_REG(0x2135, 0xffff, 0x10),
@@ -1164,7 +1246,7 @@ static struct extra_reg ivt_uncore_cbox_extra_regs[] = {
SNBEP_CBO_EVENT_EXTRA_REG(0x8335, 0xffff, 0x10),
SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x10),
SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x10),
- SNBEP_CBO_EVENT_EXTRA_REG(0x2336, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x2136, 0xffff, 0x10),
SNBEP_CBO_EVENT_EXTRA_REG(0x2336, 0xffff, 0x10),
SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0x18),
SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0x18),
@@ -1312,17 +1394,83 @@ static struct intel_uncore_type ivt_uncore_imc = {
IVT_UNCORE_PCI_COMMON_INIT(),
};
+/* registers in IRP boxes are not properly aligned */
+static unsigned ivt_uncore_irp_ctls[] = {0xd8, 0xdc, 0xe0, 0xe4};
+static unsigned ivt_uncore_irp_ctrs[] = {0xa0, 0xb0, 0xb8, 0xc0};
+
+static void ivt_uncore_irp_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ struct hw_perf_event *hwc = &event->hw;
+
+ pci_write_config_dword(pdev, ivt_uncore_irp_ctls[hwc->idx],
+ hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+static void ivt_uncore_irp_disable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ struct hw_perf_event *hwc = &event->hw;
+
+ pci_write_config_dword(pdev, ivt_uncore_irp_ctls[hwc->idx], hwc->config);
+}
+
+static u64 ivt_uncore_irp_read_counter(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ struct hw_perf_event *hwc = &event->hw;
+ u64 count = 0;
+
+ pci_read_config_dword(pdev, ivt_uncore_irp_ctrs[hwc->idx], (u32 *)&count);
+ pci_read_config_dword(pdev, ivt_uncore_irp_ctrs[hwc->idx] + 4, (u32 *)&count + 1);
+
+ return count;
+}
+
+static struct intel_uncore_ops ivt_uncore_irp_ops = {
+ .init_box = ivt_uncore_pci_init_box,
+ .disable_box = snbep_uncore_pci_disable_box,
+ .enable_box = snbep_uncore_pci_enable_box,
+ .disable_event = ivt_uncore_irp_disable_event,
+ .enable_event = ivt_uncore_irp_enable_event,
+ .read_counter = ivt_uncore_irp_read_counter,
+};
+
+static struct intel_uncore_type ivt_uncore_irp = {
+ .name = "irp",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .event_mask = IVT_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_PCI_PMON_BOX_CTL,
+ .ops = &ivt_uncore_irp_ops,
+ .format_group = &ivt_uncore_format_group,
+};
+
+static struct intel_uncore_ops ivt_uncore_qpi_ops = {
+ .init_box = ivt_uncore_pci_init_box,
+ .disable_box = snbep_uncore_pci_disable_box,
+ .enable_box = snbep_uncore_pci_enable_box,
+ .disable_event = snbep_uncore_pci_disable_event,
+ .enable_event = snbep_qpi_enable_event,
+ .read_counter = snbep_uncore_pci_read_counter,
+ .hw_config = snbep_qpi_hw_config,
+ .get_constraint = uncore_get_constraint,
+ .put_constraint = uncore_put_constraint,
+};
+
static struct intel_uncore_type ivt_uncore_qpi = {
- .name = "qpi",
- .num_counters = 4,
- .num_boxes = 3,
- .perf_ctr_bits = 48,
- .perf_ctr = SNBEP_PCI_PMON_CTR0,
- .event_ctl = SNBEP_PCI_PMON_CTL0,
- .event_mask = IVT_QPI_PCI_PMON_RAW_EVENT_MASK,
- .box_ctl = SNBEP_PCI_PMON_BOX_CTL,
- .ops = &ivt_uncore_pci_ops,
- .format_group = &ivt_uncore_qpi_format_group,
+ .name = "qpi",
+ .num_counters = 4,
+ .num_boxes = 3,
+ .perf_ctr_bits = 48,
+ .perf_ctr = SNBEP_PCI_PMON_CTR0,
+ .event_ctl = SNBEP_PCI_PMON_CTL0,
+ .event_mask = IVT_QPI_PCI_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_PCI_PMON_BOX_CTL,
+ .num_shared_regs = 1,
+ .ops = &ivt_uncore_qpi_ops,
+ .format_group = &ivt_uncore_qpi_format_group,
};
static struct intel_uncore_type ivt_uncore_r2pcie = {
@@ -1346,6 +1494,7 @@ static struct intel_uncore_type ivt_uncore_r3qpi = {
enum {
IVT_PCI_UNCORE_HA,
IVT_PCI_UNCORE_IMC,
+ IVT_PCI_UNCORE_IRP,
IVT_PCI_UNCORE_QPI,
IVT_PCI_UNCORE_R2PCIE,
IVT_PCI_UNCORE_R3QPI,
@@ -1354,6 +1503,7 @@ enum {
static struct intel_uncore_type *ivt_pci_uncores[] = {
[IVT_PCI_UNCORE_HA] = &ivt_uncore_ha,
[IVT_PCI_UNCORE_IMC] = &ivt_uncore_imc,
+ [IVT_PCI_UNCORE_IRP] = &ivt_uncore_irp,
[IVT_PCI_UNCORE_QPI] = &ivt_uncore_qpi,
[IVT_PCI_UNCORE_R2PCIE] = &ivt_uncore_r2pcie,
[IVT_PCI_UNCORE_R3QPI] = &ivt_uncore_r3qpi,
@@ -1401,6 +1551,10 @@ static DEFINE_PCI_DEVICE_TABLE(ivt_uncore_pci_ids) = {
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef1),
.driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 7),
},
+ { /* IRP */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe39),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IRP, 0),
+ },
{ /* QPI0 Port 0 */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe32),
.driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_QPI, 0),
@@ -1429,6 +1583,16 @@ static DEFINE_PCI_DEVICE_TABLE(ivt_uncore_pci_ids) = {
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3e),
.driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_R3QPI, 2),
},
+ { /* QPI Port 0 filter */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe86),
+ .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+ SNBEP_PCI_QPI_PORT0_FILTER),
+ },
+ { /* QPI Port 0 filter */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe96),
+ .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+ SNBEP_PCI_QPI_PORT1_FILTER),
+ },
{ /* end: all zeroes */ }
};
@@ -1517,6 +1681,349 @@ static struct intel_uncore_type *snb_msr_uncores[] = {
&snb_uncore_cbox,
NULL,
};
+
+enum {
+ SNB_PCI_UNCORE_IMC,
+};
+
+static struct uncore_event_desc snb_uncore_imc_events[] = {
+ INTEL_UNCORE_EVENT_DESC(data_reads, "event=0x01"),
+ INTEL_UNCORE_EVENT_DESC(data_reads.scale, "6.103515625e-5"),
+ INTEL_UNCORE_EVENT_DESC(data_reads.unit, "MiB"),
+
+ INTEL_UNCORE_EVENT_DESC(data_writes, "event=0x02"),
+ INTEL_UNCORE_EVENT_DESC(data_writes.scale, "6.103515625e-5"),
+ INTEL_UNCORE_EVENT_DESC(data_writes.unit, "MiB"),
+
+ { /* end: all zeroes */ },
+};
+
+#define SNB_UNCORE_PCI_IMC_EVENT_MASK 0xff
+#define SNB_UNCORE_PCI_IMC_BAR_OFFSET 0x48
+
+/* page size multiple covering all config regs */
+#define SNB_UNCORE_PCI_IMC_MAP_SIZE 0x6000
+
+#define SNB_UNCORE_PCI_IMC_DATA_READS 0x1
+#define SNB_UNCORE_PCI_IMC_DATA_READS_BASE 0x5050
+#define SNB_UNCORE_PCI_IMC_DATA_WRITES 0x2
+#define SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE 0x5054
+#define SNB_UNCORE_PCI_IMC_CTR_BASE SNB_UNCORE_PCI_IMC_DATA_READS_BASE
+
+static struct attribute *snb_uncore_imc_formats_attr[] = {
+ &format_attr_event.attr,
+ NULL,
+};
+
+static struct attribute_group snb_uncore_imc_format_group = {
+ .name = "format",
+ .attrs = snb_uncore_imc_formats_attr,
+};
+
+static void snb_uncore_imc_init_box(struct intel_uncore_box *box)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ int where = SNB_UNCORE_PCI_IMC_BAR_OFFSET;
+ resource_size_t addr;
+ u32 pci_dword;
+
+ pci_read_config_dword(pdev, where, &pci_dword);
+ addr = pci_dword;
+
+#ifdef CONFIG_PHYS_ADDR_T_64BIT
+ pci_read_config_dword(pdev, where + 4, &pci_dword);
+ addr |= ((resource_size_t)pci_dword << 32);
+#endif
+
+ addr &= ~(PAGE_SIZE - 1);
+
+ box->io_addr = ioremap(addr, SNB_UNCORE_PCI_IMC_MAP_SIZE);
+ box->hrtimer_duration = UNCORE_SNB_IMC_HRTIMER_INTERVAL;
+}
+
+static void snb_uncore_imc_enable_box(struct intel_uncore_box *box)
+{}
+
+static void snb_uncore_imc_disable_box(struct intel_uncore_box *box)
+{}
+
+static void snb_uncore_imc_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{}
+
+static void snb_uncore_imc_disable_event(struct intel_uncore_box *box, struct perf_event *event)
+{}
+
+static u64 snb_uncore_imc_read_counter(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ return (u64)*(unsigned int *)(box->io_addr + hwc->event_base);
+}
+
+/*
+ * custom event_init() function because we define our own fixed, free
+ * running counters, so we do not want to conflict with generic uncore
+ * logic. Also simplifies processing
+ */
+static int snb_uncore_imc_event_init(struct perf_event *event)
+{
+ struct intel_uncore_pmu *pmu;
+ struct intel_uncore_box *box;
+ struct hw_perf_event *hwc = &event->hw;
+ u64 cfg = event->attr.config & SNB_UNCORE_PCI_IMC_EVENT_MASK;
+ int idx, base;
+
+ if (event->attr.type != event->pmu->type)
+ return -ENOENT;
+
+ pmu = uncore_event_to_pmu(event);
+ /* no device found for this pmu */
+ if (pmu->func_id < 0)
+ return -ENOENT;
+
+ /* Sampling not supported yet */
+ if (hwc->sample_period)
+ return -EINVAL;
+
+ /* unsupported modes and filters */
+ if (event->attr.exclude_user ||
+ event->attr.exclude_kernel ||
+ event->attr.exclude_hv ||
+ event->attr.exclude_idle ||
+ event->attr.exclude_host ||
+ event->attr.exclude_guest ||
+ event->attr.sample_period) /* no sampling */
+ return -EINVAL;
+
+ /*
+ * Place all uncore events for a particular physical package
+ * onto a single cpu
+ */
+ if (event->cpu < 0)
+ return -EINVAL;
+
+ /* check only supported bits are set */
+ if (event->attr.config & ~SNB_UNCORE_PCI_IMC_EVENT_MASK)
+ return -EINVAL;
+
+ box = uncore_pmu_to_box(pmu, event->cpu);
+ if (!box || box->cpu < 0)
+ return -EINVAL;
+
+ event->cpu = box->cpu;
+
+ event->hw.idx = -1;
+ event->hw.last_tag = ~0ULL;
+ event->hw.extra_reg.idx = EXTRA_REG_NONE;
+ event->hw.branch_reg.idx = EXTRA_REG_NONE;
+ /*
+ * check event is known (whitelist, determines counter)
+ */
+ switch (cfg) {
+ case SNB_UNCORE_PCI_IMC_DATA_READS:
+ base = SNB_UNCORE_PCI_IMC_DATA_READS_BASE;
+ idx = UNCORE_PMC_IDX_FIXED;
+ break;
+ case SNB_UNCORE_PCI_IMC_DATA_WRITES:
+ base = SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE;
+ idx = UNCORE_PMC_IDX_FIXED + 1;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ /* must be done before validate_group */
+ event->hw.event_base = base;
+ event->hw.config = cfg;
+ event->hw.idx = idx;
+
+ /* no group validation needed, we have free running counters */
+
+ return 0;
+}
+
+static int snb_uncore_imc_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+ return 0;
+}
+
+static void snb_uncore_imc_event_start(struct perf_event *event, int flags)
+{
+ struct intel_uncore_box *box = uncore_event_to_box(event);
+ u64 count;
+
+ if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
+ return;
+
+ event->hw.state = 0;
+ box->n_active++;
+
+ list_add_tail(&event->active_entry, &box->active_list);
+
+ count = snb_uncore_imc_read_counter(box, event);
+ local64_set(&event->hw.prev_count, count);
+
+ if (box->n_active == 1)
+ uncore_pmu_start_hrtimer(box);
+}
+
+static void snb_uncore_imc_event_stop(struct perf_event *event, int flags)
+{
+ struct intel_uncore_box *box = uncore_event_to_box(event);
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (!(hwc->state & PERF_HES_STOPPED)) {
+ box->n_active--;
+
+ WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+ hwc->state |= PERF_HES_STOPPED;
+
+ list_del(&event->active_entry);
+
+ if (box->n_active == 0)
+ uncore_pmu_cancel_hrtimer(box);
+ }
+
+ if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
+ /*
+ * Drain the remaining delta count out of a event
+ * that we are disabling:
+ */
+ uncore_perf_event_update(box, event);
+ hwc->state |= PERF_HES_UPTODATE;
+ }
+}
+
+static int snb_uncore_imc_event_add(struct perf_event *event, int flags)
+{
+ struct intel_uncore_box *box = uncore_event_to_box(event);
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (!box)
+ return -ENODEV;
+
+ hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+ if (!(flags & PERF_EF_START))
+ hwc->state |= PERF_HES_ARCH;
+
+ snb_uncore_imc_event_start(event, 0);
+
+ box->n_events++;
+
+ return 0;
+}
+
+static void snb_uncore_imc_event_del(struct perf_event *event, int flags)
+{
+ struct intel_uncore_box *box = uncore_event_to_box(event);
+ int i;
+
+ snb_uncore_imc_event_stop(event, PERF_EF_UPDATE);
+
+ for (i = 0; i < box->n_events; i++) {
+ if (event == box->event_list[i]) {
+ --box->n_events;
+ break;
+ }
+ }
+}
+
+static int snb_pci2phy_map_init(int devid)
+{
+ struct pci_dev *dev = NULL;
+ int bus;
+
+ dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, dev);
+ if (!dev)
+ return -ENOTTY;
+
+ bus = dev->bus->number;
+
+ pcibus_to_physid[bus] = 0;
+
+ pci_dev_put(dev);
+
+ return 0;
+}
+
+static struct pmu snb_uncore_imc_pmu = {
+ .task_ctx_nr = perf_invalid_context,
+ .event_init = snb_uncore_imc_event_init,
+ .add = snb_uncore_imc_event_add,
+ .del = snb_uncore_imc_event_del,
+ .start = snb_uncore_imc_event_start,
+ .stop = snb_uncore_imc_event_stop,
+ .read = uncore_pmu_event_read,
+};
+
+static struct intel_uncore_ops snb_uncore_imc_ops = {
+ .init_box = snb_uncore_imc_init_box,
+ .enable_box = snb_uncore_imc_enable_box,
+ .disable_box = snb_uncore_imc_disable_box,
+ .disable_event = snb_uncore_imc_disable_event,
+ .enable_event = snb_uncore_imc_enable_event,
+ .hw_config = snb_uncore_imc_hw_config,
+ .read_counter = snb_uncore_imc_read_counter,
+};
+
+static struct intel_uncore_type snb_uncore_imc = {
+ .name = "imc",
+ .num_counters = 2,
+ .num_boxes = 1,
+ .fixed_ctr_bits = 32,
+ .fixed_ctr = SNB_UNCORE_PCI_IMC_CTR_BASE,
+ .event_descs = snb_uncore_imc_events,
+ .format_group = &snb_uncore_imc_format_group,
+ .perf_ctr = SNB_UNCORE_PCI_IMC_DATA_READS_BASE,
+ .event_mask = SNB_UNCORE_PCI_IMC_EVENT_MASK,
+ .ops = &snb_uncore_imc_ops,
+ .pmu = &snb_uncore_imc_pmu,
+};
+
+static struct intel_uncore_type *snb_pci_uncores[] = {
+ [SNB_PCI_UNCORE_IMC] = &snb_uncore_imc,
+ NULL,
+};
+
+static DEFINE_PCI_DEVICE_TABLE(snb_uncore_pci_ids) = {
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SNB_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* end: all zeroes */ },
+};
+
+static DEFINE_PCI_DEVICE_TABLE(ivb_uncore_pci_ids) = {
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* end: all zeroes */ },
+};
+
+static DEFINE_PCI_DEVICE_TABLE(hsw_uncore_pci_ids) = {
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* end: all zeroes */ },
+};
+
+static struct pci_driver snb_uncore_pci_driver = {
+ .name = "snb_uncore",
+ .id_table = snb_uncore_pci_ids,
+};
+
+static struct pci_driver ivb_uncore_pci_driver = {
+ .name = "ivb_uncore",
+ .id_table = ivb_uncore_pci_ids,
+};
+
+static struct pci_driver hsw_uncore_pci_driver = {
+ .name = "hsw_uncore",
+ .id_table = hsw_uncore_pci_ids,
+};
+
/* end of Sandy Bridge uncore support */
/* Nehalem uncore support */
@@ -2667,6 +3174,7 @@ again:
static enum hrtimer_restart uncore_pmu_hrtimer(struct hrtimer *hrtimer)
{
struct intel_uncore_box *box;
+ struct perf_event *event;
unsigned long flags;
int bit;
@@ -2679,19 +3187,27 @@ static enum hrtimer_restart uncore_pmu_hrtimer(struct hrtimer *hrtimer)
*/
local_irq_save(flags);
+ /*
+ * handle boxes with an active event list as opposed to active
+ * counters
+ */
+ list_for_each_entry(event, &box->active_list, active_entry) {
+ uncore_perf_event_update(box, event);
+ }
+
for_each_set_bit(bit, box->active_mask, UNCORE_PMC_IDX_MAX)
uncore_perf_event_update(box, box->events[bit]);
local_irq_restore(flags);
- hrtimer_forward_now(hrtimer, ns_to_ktime(UNCORE_PMU_HRTIMER_INTERVAL));
+ hrtimer_forward_now(hrtimer, ns_to_ktime(box->hrtimer_duration));
return HRTIMER_RESTART;
}
static void uncore_pmu_start_hrtimer(struct intel_uncore_box *box)
{
__hrtimer_start_range_ns(&box->hrtimer,
- ns_to_ktime(UNCORE_PMU_HRTIMER_INTERVAL), 0,
+ ns_to_ktime(box->hrtimer_duration), 0,
HRTIMER_MODE_REL_PINNED, 0);
}
@@ -2725,43 +3241,12 @@ static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type,
box->cpu = -1;
box->phys_id = -1;
- return box;
-}
-
-static struct intel_uncore_box *
-uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu)
-{
- struct intel_uncore_box *box;
-
- box = *per_cpu_ptr(pmu->box, cpu);
- if (box)
- return box;
-
- raw_spin_lock(&uncore_box_lock);
- list_for_each_entry(box, &pmu->box_list, list) {
- if (box->phys_id == topology_physical_package_id(cpu)) {
- atomic_inc(&box->refcnt);
- *per_cpu_ptr(pmu->box, cpu) = box;
- break;
- }
- }
- raw_spin_unlock(&uncore_box_lock);
-
- return *per_cpu_ptr(pmu->box, cpu);
-}
+ /* set default hrtimer timeout */
+ box->hrtimer_duration = UNCORE_PMU_HRTIMER_INTERVAL;
-static struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event)
-{
- return container_of(event->pmu, struct intel_uncore_pmu, pmu);
-}
+ INIT_LIST_HEAD(&box->active_list);
-static struct intel_uncore_box *uncore_event_to_box(struct perf_event *event)
-{
- /*
- * perf core schedules event on the basis of cpu, uncore events are
- * collected by one of the cpus inside a physical package.
- */
- return uncore_pmu_to_box(uncore_event_to_pmu(event), smp_processor_id());
+ return box;
}
static int
@@ -3157,16 +3642,21 @@ static int __init uncore_pmu_register(struct intel_uncore_pmu *pmu)
{
int ret;
- pmu->pmu = (struct pmu) {
- .attr_groups = pmu->type->attr_groups,
- .task_ctx_nr = perf_invalid_context,
- .event_init = uncore_pmu_event_init,
- .add = uncore_pmu_event_add,
- .del = uncore_pmu_event_del,
- .start = uncore_pmu_event_start,
- .stop = uncore_pmu_event_stop,
- .read = uncore_pmu_event_read,
- };
+ if (!pmu->type->pmu) {
+ pmu->pmu = (struct pmu) {
+ .attr_groups = pmu->type->attr_groups,
+ .task_ctx_nr = perf_invalid_context,
+ .event_init = uncore_pmu_event_init,
+ .add = uncore_pmu_event_add,
+ .del = uncore_pmu_event_del,
+ .start = uncore_pmu_event_start,
+ .stop = uncore_pmu_event_stop,
+ .read = uncore_pmu_event_read,
+ };
+ } else {
+ pmu->pmu = *pmu->type->pmu;
+ pmu->pmu.attr_groups = pmu->type->attr_groups;
+ }
if (pmu->type->num_boxes == 1) {
if (strlen(pmu->type->name) > 0)
@@ -3212,6 +3702,8 @@ static int __init uncore_type_init(struct intel_uncore_type *type)
if (!pmus)
return -ENOMEM;
+ type->pmus = pmus;
+
type->unconstrainted = (struct event_constraint)
__EVENT_CONSTRAINT(0, (1ULL << type->num_counters) - 1,
0, type->num_counters, 0, 0);
@@ -3247,7 +3739,6 @@ static int __init uncore_type_init(struct intel_uncore_type *type)
}
type->pmu_group = &uncore_pmu_attr_group;
- type->pmus = pmus;
return 0;
fail:
uncore_type_exit(type);
@@ -3379,6 +3870,28 @@ static int __init uncore_pci_init(void)
pci_uncores = ivt_pci_uncores;
uncore_pci_driver = &ivt_uncore_pci_driver;
break;
+ case 42: /* Sandy Bridge */
+ ret = snb_pci2phy_map_init(PCI_DEVICE_ID_INTEL_SNB_IMC);
+ if (ret)
+ return ret;
+ pci_uncores = snb_pci_uncores;
+ uncore_pci_driver = &snb_uncore_pci_driver;
+ break;
+ case 58: /* Ivy Bridge */
+ ret = snb_pci2phy_map_init(PCI_DEVICE_ID_INTEL_IVB_IMC);
+ if (ret)
+ return ret;
+ pci_uncores = snb_pci_uncores;
+ uncore_pci_driver = &ivb_uncore_pci_driver;
+ break;
+ case 60: /* Haswell */
+ case 69: /* Haswell Celeron */
+ ret = snb_pci2phy_map_init(PCI_DEVICE_ID_INTEL_HSW_IMC);
+ if (ret)
+ return ret;
+ pci_uncores = snb_pci_uncores;
+ uncore_pci_driver = &hsw_uncore_pci_driver;
+ break;
default:
return 0;
}
@@ -3650,7 +4163,7 @@ static void __init uncore_cpu_setup(void *dummy)
static int __init uncore_cpu_init(void)
{
- int ret, cpu, max_cores;
+ int ret, max_cores;
max_cores = boot_cpu_data.x86_max_cores;
switch (boot_cpu_data.x86_model) {
@@ -3694,29 +4207,6 @@ static int __init uncore_cpu_init(void)
if (ret)
return ret;
- get_online_cpus();
-
- for_each_online_cpu(cpu) {
- int i, phys_id = topology_physical_package_id(cpu);
-
- for_each_cpu(i, &uncore_cpu_mask) {
- if (phys_id == topology_physical_package_id(i)) {
- phys_id = -1;
- break;
- }
- }
- if (phys_id < 0)
- continue;
-
- uncore_cpu_prepare(cpu, phys_id);
- uncore_event_init_cpu(cpu);
- }
- on_each_cpu(uncore_cpu_setup, NULL, 1);
-
- register_cpu_notifier(&uncore_cpu_nb);
-
- put_online_cpus();
-
return 0;
}
@@ -3745,6 +4235,41 @@ static int __init uncore_pmus_register(void)
return 0;
}
+static void __init uncore_cpumask_init(void)
+{
+ int cpu;
+
+ /*
+ * ony invoke once from msr or pci init code
+ */
+ if (!cpumask_empty(&uncore_cpu_mask))
+ return;
+
+ cpu_notifier_register_begin();
+
+ for_each_online_cpu(cpu) {
+ int i, phys_id = topology_physical_package_id(cpu);
+
+ for_each_cpu(i, &uncore_cpu_mask) {
+ if (phys_id == topology_physical_package_id(i)) {
+ phys_id = -1;
+ break;
+ }
+ }
+ if (phys_id < 0)
+ continue;
+
+ uncore_cpu_prepare(cpu, phys_id);
+ uncore_event_init_cpu(cpu);
+ }
+ on_each_cpu(uncore_cpu_setup, NULL, 1);
+
+ __register_cpu_notifier(&uncore_cpu_nb);
+
+ cpu_notifier_register_done();
+}
+
+
static int __init intel_uncore_init(void)
{
int ret;
@@ -3763,6 +4288,7 @@ static int __init intel_uncore_init(void)
uncore_pci_exit();
goto fail;
}
+ uncore_cpumask_init();
uncore_pmus_register();
return 0;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
index a80ab71a883..90236f0c94a 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
@@ -6,6 +6,7 @@
#define UNCORE_PMU_NAME_LEN 32
#define UNCORE_PMU_HRTIMER_INTERVAL (60LL * NSEC_PER_SEC)
+#define UNCORE_SNB_IMC_HRTIMER_INTERVAL (5ULL * NSEC_PER_SEC)
#define UNCORE_FIXED_EVENT 0xff
#define UNCORE_PMC_IDX_MAX_GENERIC 8
@@ -440,6 +441,7 @@ struct intel_uncore_type {
struct intel_uncore_ops *ops;
struct uncore_event_desc *event_descs;
const struct attribute_group *attr_groups[4];
+ struct pmu *pmu; /* for custom pmu ops */
};
#define pmu_group attr_groups[0]
@@ -488,8 +490,11 @@ struct intel_uncore_box {
u64 tags[UNCORE_PMC_IDX_MAX];
struct pci_dev *pci_dev;
struct intel_uncore_pmu *pmu;
+ u64 hrtimer_duration; /* hrtimer timeout for this box */
struct hrtimer hrtimer;
struct list_head list;
+ struct list_head active_list;
+ void *io_addr;
struct intel_uncore_extra_reg shared_regs[0];
};
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index 3486e666035..5d466b7d860 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -1257,7 +1257,24 @@ again:
pass++;
goto again;
}
-
+ /*
+ * Perf does test runs to see if a whole group can be assigned
+ * together succesfully. There can be multiple rounds of this.
+ * Unfortunately, p4_pmu_swap_config_ts touches the hwc->config
+ * bits, such that the next round of group assignments will
+ * cause the above p4_should_swap_ts to pass instead of fail.
+ * This leads to counters exclusive to thread0 being used by
+ * thread1.
+ *
+ * Solve this with a cheap hack, reset the idx back to -1 to
+ * force a new lookup (p4_next_cntr) to get the right counter
+ * for the right thread.
+ *
+ * This probably doesn't comply with the general spirit of how
+ * perf wants to work, but P4 is special. :-(
+ */
+ if (p4_should_swap_ts(hwc->config, cpu))
+ hwc->idx = -1;
p4_pmu_swap_config_ts(hwc, cpu);
if (assign)
assign[i] = cntr_idx;
@@ -1322,6 +1339,7 @@ static __initconst const struct x86_pmu p4_pmu = {
__init int p4_pmu_init(void)
{
unsigned int low, high;
+ int i, reg;
/* If we get stripped -- indexing fails */
BUILD_BUG_ON(ARCH_P4_MAX_CCCR > INTEL_PMC_MAX_GENERIC);
@@ -1340,5 +1358,19 @@ __init int p4_pmu_init(void)
x86_pmu = p4_pmu;
+ /*
+ * Even though the counters are configured to interrupt a particular
+ * logical processor when an overflow happens, testing has shown that
+ * on kdump kernels (which uses a single cpu), thread1's counter
+ * continues to run and will report an NMI on thread0. Due to the
+ * overflow bug, this leads to a stream of unknown NMIs.
+ *
+ * Solve this by zero'ing out the registers to mimic a reset.
+ */
+ for (i = 0; i < x86_pmu.num_counters; i++) {
+ reg = x86_pmu_config_addr(i);
+ wrmsrl_safe(reg, 0ULL);
+ }
+
return 0;
}
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c
index b1e2fe11532..7c1a0c07b60 100644
--- a/arch/x86/kernel/cpu/perf_event_p6.c
+++ b/arch/x86/kernel/cpu/perf_event_p6.c
@@ -231,31 +231,49 @@ static __initconst const struct x86_pmu p6_pmu = {
};
+static __init void p6_pmu_rdpmc_quirk(void)
+{
+ if (boot_cpu_data.x86_mask < 9) {
+ /*
+ * PPro erratum 26; fixed in stepping 9 and above.
+ */
+ pr_warn("Userspace RDPMC support disabled due to a CPU erratum\n");
+ x86_pmu.attr_rdpmc_broken = 1;
+ x86_pmu.attr_rdpmc = 0;
+ }
+}
+
__init int p6_pmu_init(void)
{
+ x86_pmu = p6_pmu;
+
switch (boot_cpu_data.x86_model) {
- case 1:
- case 3: /* Pentium Pro */
- case 5:
- case 6: /* Pentium II */
- case 7:
- case 8:
- case 11: /* Pentium III */
- case 9:
- case 13:
- /* Pentium M */
+ case 1: /* Pentium Pro */
+ x86_add_quirk(p6_pmu_rdpmc_quirk);
+ break;
+
+ case 3: /* Pentium II - Klamath */
+ case 5: /* Pentium II - Deschutes */
+ case 6: /* Pentium II - Mendocino */
break;
+
+ case 7: /* Pentium III - Katmai */
+ case 8: /* Pentium III - Coppermine */
+ case 10: /* Pentium III Xeon */
+ case 11: /* Pentium III - Tualatin */
+ break;
+
+ case 9: /* Pentium M - Banias */
+ case 13: /* Pentium M - Dothan */
+ break;
+
default:
- pr_cont("unsupported p6 CPU model %d ",
- boot_cpu_data.x86_model);
+ pr_cont("unsupported p6 CPU model %d ", boot_cpu_data.x86_model);
return -ENODEV;
}
- x86_pmu = p6_pmu;
-
memcpy(hw_cache_event_ids, p6_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
-
return 0;
}
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index aee6317b902..06fe3ed8b85 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -11,15 +11,12 @@ static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c,
unsigned int cpu)
{
#ifdef CONFIG_SMP
- if (c->x86_max_cores * smp_num_siblings > 1) {
- seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
- seq_printf(m, "siblings\t: %d\n",
- cpumask_weight(cpu_core_mask(cpu)));
- seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
- seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
- seq_printf(m, "apicid\t\t: %d\n", c->apicid);
- seq_printf(m, "initial apicid\t: %d\n", c->initial_apicid);
- }
+ seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
+ seq_printf(m, "siblings\t: %d\n", cpumask_weight(cpu_core_mask(cpu)));
+ seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
+ seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
+ seq_printf(m, "apicid\t\t: %d\n", c->apicid);
+ seq_printf(m, "initial apicid\t: %d\n", c->initial_apicid);
#endif
}
diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c
index 88db010845c..136ac74dee8 100644
--- a/arch/x86/kernel/cpu/rdrand.c
+++ b/arch/x86/kernel/cpu/rdrand.c
@@ -27,24 +27,11 @@
static int __init x86_rdrand_setup(char *s)
{
setup_clear_cpu_cap(X86_FEATURE_RDRAND);
+ setup_clear_cpu_cap(X86_FEATURE_RDSEED);
return 1;
}
__setup("nordrand", x86_rdrand_setup);
-/* We can't use arch_get_random_long() here since alternatives haven't run */
-static inline int rdrand_long(unsigned long *v)
-{
- int ok;
- asm volatile("1: " RDRAND_LONG "\n\t"
- "jc 2f\n\t"
- "decl %0\n\t"
- "jnz 1b\n\t"
- "2:"
- : "=r" (ok), "=a" (*v)
- : "0" (RDRAND_RETRY_LOOPS));
- return ok;
-}
-
/*
* Force a reseed cycle; we are architecturally guaranteed a reseed
* after no more than 512 128-bit chunks of random data. This also
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index f2cc63e9cf0..b6f794aa169 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -1,5 +1,5 @@
/*
- * Routines to indentify additional cpu features that are scattered in
+ * Routines to identify additional cpu features that are scattered in
* cpuid space.
*/
#include <linux/cpu.h>
diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c
index aa0430d69b9..3fa0e5ad86b 100644
--- a/arch/x86/kernel/cpu/transmeta.c
+++ b/arch/x86/kernel/cpu/transmeta.c
@@ -1,6 +1,5 @@
#include <linux/kernel.h>
#include <linux/mm.h>
-#include <linux/init.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include "cpu.h"
diff --git a/arch/x86/kernel/cpu/umc.c b/arch/x86/kernel/cpu/umc.c
index 202759a1412..ef9c2a0078b 100644
--- a/arch/x86/kernel/cpu/umc.c
+++ b/arch/x86/kernel/cpu/umc.c
@@ -1,5 +1,4 @@
#include <linux/kernel.h>
-#include <linux/init.h>
#include <asm/processor.h>
#include "cpu.h"
@@ -11,8 +10,8 @@
static const struct cpu_dev umc_cpu_dev = {
.c_vendor = "UMC",
.c_ident = { "UMC UMC UMC" },
- .c_models = {
- { .vendor = X86_VENDOR_UMC, .family = 4, .model_names =
+ .legacy_models = {
+ { .family = 4, .model_names =
{
[1] = "U5D",
[2] = "U5S",
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 7d9481c743f..3225ae6c518 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -198,14 +198,15 @@ static int __init cpuid_init(void)
goto out_chrdev;
}
cpuid_class->devnode = cpuid_devnode;
- get_online_cpus();
+
+ cpu_notifier_register_begin();
for_each_online_cpu(i) {
err = cpuid_device_create(i);
if (err != 0)
goto out_class;
}
- register_hotcpu_notifier(&cpuid_class_cpu_notifier);
- put_online_cpus();
+ __register_hotcpu_notifier(&cpuid_class_cpu_notifier);
+ cpu_notifier_register_done();
err = 0;
goto out;
@@ -215,7 +216,7 @@ out_class:
for_each_online_cpu(i) {
cpuid_device_destroy(i);
}
- put_online_cpus();
+ cpu_notifier_register_done();
class_destroy(cpuid_class);
out_chrdev:
__unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid");
@@ -227,13 +228,13 @@ static void __exit cpuid_exit(void)
{
int cpu = 0;
- get_online_cpus();
+ cpu_notifier_register_begin();
for_each_online_cpu(cpu)
cpuid_device_destroy(cpu);
class_destroy(cpuid_class);
__unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid");
- unregister_hotcpu_notifier(&cpuid_class_cpu_notifier);
- put_online_cpus();
+ __unregister_hotcpu_notifier(&cpuid_class_cpu_notifier);
+ cpu_notifier_register_done();
}
module_init(cpuid_init);
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index e0e0841eef4..507de806659 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -7,7 +7,6 @@
*
*/
-#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/smp.h>
@@ -58,9 +57,7 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
{
#ifdef CONFIG_X86_32
struct pt_regs fixed_regs;
-#endif
-#ifdef CONFIG_X86_32
if (!user_mode_vm(regs)) {
crash_fixup_ss_esp(&fixed_regs, regs);
regs = &fixed_regs;
@@ -127,12 +124,12 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
cpu_emergency_vmxoff();
cpu_emergency_svm_disable();
- lapic_shutdown();
#ifdef CONFIG_X86_IO_APIC
/* Prevent crash_kexec() from deadlocking on ioapic_lock. */
ioapic_zap_locks();
disable_IO_APIC();
#endif
+ lapic_shutdown();
#ifdef CONFIG_HPET_TIMER
hpet_disable();
#endif
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 376dc787344..7db54b5d5f8 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -20,22 +20,13 @@
#include <asm/hpet.h>
#include <asm/apic.h>
#include <asm/pci_x86.h>
+#include <asm/setup.h>
__initdata u64 initial_dtb;
char __initdata cmd_line[COMMAND_LINE_SIZE];
int __initdata of_ioapic;
-unsigned long pci_address_to_pio(phys_addr_t address)
-{
- /*
- * The ioport address can be directly used by inX / outX
- */
- BUG_ON(address >= (1 << 16));
- return (unsigned long)address;
-}
-EXPORT_SYMBOL_GPL(pci_address_to_pio);
-
void __init early_init_dt_scan_chosen_arch(unsigned long node)
{
BUG();
@@ -51,15 +42,6 @@ void * __init early_init_dt_alloc_memory_arch(u64 size, u64 align)
return __alloc_bootmem(size, align, __pa(MAX_DMA_ADDRESS));
}
-#ifdef CONFIG_BLK_DEV_INITRD
-void __init early_init_dt_setup_initrd_arch(u64 start, u64 end)
-{
- initrd_start = (unsigned long)__va(start);
- initrd_end = (unsigned long)__va(end);
- initrd_below_start_ok = 1;
-}
-#endif
-
void __init add_dtb(u64 data)
{
initial_dtb = data + offsetof(struct setup_data, data);
@@ -105,7 +87,6 @@ struct device_node *pcibios_get_phb_of_node(struct pci_bus *bus)
static int x86_of_pci_irq_enable(struct pci_dev *dev)
{
- struct of_irq oirq;
u32 virq;
int ret;
u8 pin;
@@ -116,12 +97,7 @@ static int x86_of_pci_irq_enable(struct pci_dev *dev)
if (!pin)
return 0;
- ret = of_irq_map_pci(dev, &oirq);
- if (ret)
- return ret;
-
- virq = irq_create_of_mapping(oirq.controller, oirq.specifier,
- oirq.size);
+ virq = of_irq_parse_and_map_pci(dev, 0, 0);
if (virq == 0)
return -EINVAL;
dev->irq = virq;
@@ -230,32 +206,23 @@ static void __init dtb_apic_setup(void)
static void __init x86_flattree_get_config(void)
{
u32 size, map_len;
- void *new_dtb;
+ void *dt;
if (!initial_dtb)
return;
- map_len = max(PAGE_SIZE - (initial_dtb & ~PAGE_MASK),
- (u64)sizeof(struct boot_param_header));
+ map_len = max(PAGE_SIZE - (initial_dtb & ~PAGE_MASK), (u64)128);
- initial_boot_params = early_memremap(initial_dtb, map_len);
- size = be32_to_cpu(initial_boot_params->totalsize);
+ initial_boot_params = dt = early_memremap(initial_dtb, map_len);
+ size = of_get_flat_dt_size();
if (map_len < size) {
- early_iounmap(initial_boot_params, map_len);
- initial_boot_params = early_memremap(initial_dtb, size);
+ early_iounmap(dt, map_len);
+ initial_boot_params = dt = early_memremap(initial_dtb, size);
map_len = size;
}
- new_dtb = alloc_bootmem(size);
- memcpy(new_dtb, initial_boot_params, size);
- early_iounmap(initial_boot_params, map_len);
-
- initial_boot_params = new_dtb;
-
- /* root level address cells */
- of_scan_flat_dt(early_init_dt_scan_root, NULL);
-
- unflatten_device_tree();
+ unflatten_and_copy_device_tree();
+ early_iounmap(dt, map_len);
}
#else
static inline void x86_flattree_get_config(void) { }
diff --git a/arch/x86/kernel/doublefault.c b/arch/x86/kernel/doublefault.c
index 5d3fe8d36e4..f6dfd9334b6 100644
--- a/arch/x86/kernel/doublefault.c
+++ b/arch/x86/kernel/doublefault.c
@@ -1,6 +1,5 @@
#include <linux/mm.h>
#include <linux/sched.h>
-#include <linux/init.h>
#include <linux/init_task.h>
#include <linux/fs.h>
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index deb6421c9e6..b74ebc7c440 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -25,12 +25,17 @@ unsigned int code_bytes = 64;
int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
static int die_counter;
-void printk_address(unsigned long address, int reliable)
+static void printk_stack_address(unsigned long address, int reliable)
{
pr_cont(" [<%p>] %s%pB\n",
(void *)address, reliable ? "" : "? ", (void *)address);
}
+void printk_address(unsigned long address)
+{
+ pr_cont(" [<%p>] %pS\n", (void *)address, (void *)address);
+}
+
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
static void
print_ftrace_graph_addr(unsigned long addr, void *data,
@@ -151,7 +156,7 @@ static void print_trace_address(void *data, unsigned long addr, int reliable)
{
touch_nmi_watchdog();
printk(data);
- printk_address(addr, reliable);
+ printk_stack_address(addr, reliable);
}
static const struct stacktrace_ops print_trace_ops = {
@@ -195,7 +200,7 @@ static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED;
static int die_owner = -1;
static unsigned int die_nest_count;
-unsigned __kprobes long oops_begin(void)
+unsigned long oops_begin(void)
{
int cpu;
unsigned long flags;
@@ -218,8 +223,9 @@ unsigned __kprobes long oops_begin(void)
return flags;
}
EXPORT_SYMBOL_GPL(oops_begin);
+NOKPROBE_SYMBOL(oops_begin);
-void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
+void oops_end(unsigned long flags, struct pt_regs *regs, int signr)
{
if (regs && kexec_should_crash(current))
crash_kexec(regs);
@@ -242,8 +248,9 @@ void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
panic("Fatal exception");
do_exit(signr);
}
+NOKPROBE_SYMBOL(oops_end);
-int __kprobes __die(const char *str, struct pt_regs *regs, long err)
+int __die(const char *str, struct pt_regs *regs, long err)
{
#ifdef CONFIG_X86_32
unsigned short ss;
@@ -281,11 +288,12 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err)
#else
/* Executive summary in case the oops scrolled away */
printk(KERN_ALERT "RIP ");
- printk_address(regs->ip, 1);
+ printk_address(regs->ip);
printk(" RSP <%016lx>\n", regs->sp);
#endif
return 0;
}
+NOKPROBE_SYMBOL(__die);
/*
* This is gone through when something in the kernel has done something bad
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index f2a1770ca17..5abd4cd4230 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -16,12 +16,35 @@
#include <asm/stacktrace.h>
+static void *is_irq_stack(void *p, void *irq)
+{
+ if (p < irq || p >= (irq + THREAD_SIZE))
+ return NULL;
+ return irq + THREAD_SIZE;
+}
+
+
+static void *is_hardirq_stack(unsigned long *stack, int cpu)
+{
+ void *irq = per_cpu(hardirq_stack, cpu);
+
+ return is_irq_stack(stack, irq);
+}
+
+static void *is_softirq_stack(unsigned long *stack, int cpu)
+{
+ void *irq = per_cpu(softirq_stack, cpu);
+
+ return is_irq_stack(stack, irq);
+}
void dump_trace(struct task_struct *task, struct pt_regs *regs,
unsigned long *stack, unsigned long bp,
const struct stacktrace_ops *ops, void *data)
{
+ const unsigned cpu = get_cpu();
int graph = 0;
+ u32 *prev_esp;
if (!task)
task = current;
@@ -30,7 +53,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
unsigned long dummy;
stack = &dummy;
- if (task && task != current)
+ if (task != current)
stack = (unsigned long *)task->thread.sp;
}
@@ -39,18 +62,31 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
for (;;) {
struct thread_info *context;
+ void *end_stack;
+
+ end_stack = is_hardirq_stack(stack, cpu);
+ if (!end_stack)
+ end_stack = is_softirq_stack(stack, cpu);
- context = (struct thread_info *)
- ((unsigned long)stack & (~(THREAD_SIZE - 1)));
- bp = ops->walk_stack(context, stack, bp, ops, data, NULL, &graph);
+ context = task_thread_info(task);
+ bp = ops->walk_stack(context, stack, bp, ops, data,
+ end_stack, &graph);
- stack = (unsigned long *)context->previous_esp;
+ /* Stop if not on irq stack */
+ if (!end_stack)
+ break;
+
+ /* The previous esp is saved on the bottom of the stack */
+ prev_esp = (u32 *)(end_stack - THREAD_SIZE);
+ stack = (unsigned long *)*prev_esp;
if (!stack)
break;
+
if (ops->stack(data, "IRQ") < 0)
break;
touch_nmi_watchdog();
}
+ put_cpu();
}
EXPORT_SYMBOL(dump_trace);
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index addb207dab9..1abcb50b48a 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -104,6 +104,44 @@ in_irq_stack(unsigned long *stack, unsigned long *irq_stack,
return (stack >= irq_stack && stack < irq_stack_end);
}
+static const unsigned long irq_stack_size =
+ (IRQ_STACK_SIZE - 64) / sizeof(unsigned long);
+
+enum stack_type {
+ STACK_IS_UNKNOWN,
+ STACK_IS_NORMAL,
+ STACK_IS_EXCEPTION,
+ STACK_IS_IRQ,
+};
+
+static enum stack_type
+analyze_stack(int cpu, struct task_struct *task, unsigned long *stack,
+ unsigned long **stack_end, unsigned long *irq_stack,
+ unsigned *used, char **id)
+{
+ unsigned long addr;
+
+ addr = ((unsigned long)stack & (~(THREAD_SIZE - 1)));
+ if ((unsigned long)task_stack_page(task) == addr)
+ return STACK_IS_NORMAL;
+
+ *stack_end = in_exception_stack(cpu, (unsigned long)stack,
+ used, id);
+ if (*stack_end)
+ return STACK_IS_EXCEPTION;
+
+ if (!irq_stack)
+ return STACK_IS_NORMAL;
+
+ *stack_end = irq_stack;
+ irq_stack = irq_stack - irq_stack_size;
+
+ if (in_irq_stack(stack, irq_stack, *stack_end))
+ return STACK_IS_IRQ;
+
+ return STACK_IS_UNKNOWN;
+}
+
/*
* x86-64 can have up to three kernel stacks:
* process stack
@@ -116,12 +154,12 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
const struct stacktrace_ops *ops, void *data)
{
const unsigned cpu = get_cpu();
- unsigned long *irq_stack_end =
- (unsigned long *)per_cpu(irq_stack_ptr, cpu);
- unsigned used = 0;
struct thread_info *tinfo;
- int graph = 0;
+ unsigned long *irq_stack = (unsigned long *)per_cpu(irq_stack_ptr, cpu);
unsigned long dummy;
+ unsigned used = 0;
+ int graph = 0;
+ int done = 0;
if (!task)
task = current;
@@ -143,49 +181,61 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
* exceptions
*/
tinfo = task_thread_info(task);
- for (;;) {
+ while (!done) {
+ unsigned long *stack_end;
+ enum stack_type stype;
char *id;
- unsigned long *estack_end;
- estack_end = in_exception_stack(cpu, (unsigned long)stack,
- &used, &id);
- if (estack_end) {
+ stype = analyze_stack(cpu, task, stack, &stack_end,
+ irq_stack, &used, &id);
+
+ /* Default finish unless specified to continue */
+ done = 1;
+
+ switch (stype) {
+
+ /* Break out early if we are on the thread stack */
+ case STACK_IS_NORMAL:
+ break;
+
+ case STACK_IS_EXCEPTION:
+
if (ops->stack(data, id) < 0)
break;
bp = ops->walk_stack(tinfo, stack, bp, ops,
- data, estack_end, &graph);
+ data, stack_end, &graph);
ops->stack(data, "<EOE>");
/*
* We link to the next stack via the
* second-to-last pointer (index -2 to end) in the
* exception stack:
*/
- stack = (unsigned long *) estack_end[-2];
- continue;
- }
- if (irq_stack_end) {
- unsigned long *irq_stack;
- irq_stack = irq_stack_end -
- (IRQ_STACK_SIZE - 64) / sizeof(*irq_stack);
-
- if (in_irq_stack(stack, irq_stack, irq_stack_end)) {
- if (ops->stack(data, "IRQ") < 0)
- break;
- bp = ops->walk_stack(tinfo, stack, bp,
- ops, data, irq_stack_end, &graph);
- /*
- * We link to the next stack (which would be
- * the process stack normally) the last
- * pointer (index -1 to end) in the IRQ stack:
- */
- stack = (unsigned long *) (irq_stack_end[-1]);
- irq_stack_end = NULL;
- ops->stack(data, "EOI");
- continue;
- }
+ stack = (unsigned long *) stack_end[-2];
+ done = 0;
+ break;
+
+ case STACK_IS_IRQ:
+
+ if (ops->stack(data, "IRQ") < 0)
+ break;
+ bp = ops->walk_stack(tinfo, stack, bp,
+ ops, data, stack_end, &graph);
+ /*
+ * We link to the next stack (which would be
+ * the process stack normally) the last
+ * pointer (index -1 to end) in the IRQ stack:
+ */
+ stack = (unsigned long *) (stack_end[-1]);
+ irq_stack = NULL;
+ ops->stack(data, "EOI");
+ done = 0;
+ break;
+
+ case STACK_IS_UNKNOWN:
+ ops->stack(data, "UNK");
+ break;
}
- break;
}
/*
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 174da5fc5a7..988c00a1f60 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1120,7 +1120,7 @@ void __init memblock_find_dma_reserve(void)
nr_pages += end_pfn - start_pfn;
}
- for_each_free_mem_range(u, MAX_NUMNODES, &start, &end, NULL) {
+ for_each_free_mem_range(u, NUMA_NO_NODE, &start, &end, NULL) {
start_pfn = min_t(unsigned long, PFN_UP(start), MAX_DMA_PFN);
end_pfn = min_t(unsigned long, PFN_DOWN(end), MAX_DMA_PFN);
if (start_pfn < end_pfn)
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index b3cd3ebae07..2e1a6853e00 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -17,6 +17,7 @@
#include <asm/dma.h>
#include <asm/io_apic.h>
#include <asm/apic.h>
+#include <asm/hpet.h>
#include <asm/iommu.h>
#include <asm/gart.h>
#include <asm/irq_remapping.h>
@@ -203,18 +204,15 @@ static void __init intel_remapping_check(int num, int slot, int func)
revision = read_pci_config_byte(num, slot, func, PCI_REVISION_ID);
/*
- * Revision 13 of all triggering devices id in this quirk have
- * a problem draining interrupts when irq remapping is enabled,
- * and should be flagged as broken. Additionally revisions 0x12
- * and 0x22 of device id 0x3405 has this problem.
+ * Revision <= 13 of all triggering devices id in this quirk
+ * have a problem draining interrupts when irq remapping is
+ * enabled, and should be flagged as broken. Additionally
+ * revision 0x22 of device id 0x3405 has this problem.
*/
- if (revision == 0x13)
+ if (revision <= 0x13)
set_irq_remapping_broken();
- else if ((device == 0x3405) &&
- ((revision == 0x12) ||
- (revision == 0x22)))
+ else if (device == 0x3405 && revision == 0x22)
set_irq_remapping_broken();
-
}
/*
@@ -228,7 +226,7 @@ static void __init intel_remapping_check(int num, int slot, int func)
*
* And yes, so far on current devices the base addr is always under 4G.
*/
-static u32 __init intel_stolen_base(int num, int slot, int func)
+static u32 __init intel_stolen_base(int num, int slot, int func, size_t stolen_size)
{
u32 base;
@@ -243,10 +241,118 @@ static u32 __init intel_stolen_base(int num, int slot, int func)
return base;
}
-#define KB(x) ((x) * 1024)
+#define KB(x) ((x) * 1024UL)
#define MB(x) (KB (KB (x)))
#define GB(x) (MB (KB (x)))
+static size_t __init i830_tseg_size(void)
+{
+ u8 tmp = read_pci_config_byte(0, 0, 0, I830_ESMRAMC);
+
+ if (!(tmp & TSEG_ENABLE))
+ return 0;
+
+ if (tmp & I830_TSEG_SIZE_1M)
+ return MB(1);
+ else
+ return KB(512);
+}
+
+static size_t __init i845_tseg_size(void)
+{
+ u8 tmp = read_pci_config_byte(0, 0, 0, I845_ESMRAMC);
+
+ if (!(tmp & TSEG_ENABLE))
+ return 0;
+
+ switch (tmp & I845_TSEG_SIZE_MASK) {
+ case I845_TSEG_SIZE_512K:
+ return KB(512);
+ case I845_TSEG_SIZE_1M:
+ return MB(1);
+ default:
+ WARN_ON(1);
+ return 0;
+ }
+}
+
+static size_t __init i85x_tseg_size(void)
+{
+ u8 tmp = read_pci_config_byte(0, 0, 0, I85X_ESMRAMC);
+
+ if (!(tmp & TSEG_ENABLE))
+ return 0;
+
+ return MB(1);
+}
+
+static size_t __init i830_mem_size(void)
+{
+ return read_pci_config_byte(0, 0, 0, I830_DRB3) * MB(32);
+}
+
+static size_t __init i85x_mem_size(void)
+{
+ return read_pci_config_byte(0, 0, 1, I85X_DRB3) * MB(32);
+}
+
+/*
+ * On 830/845/85x the stolen memory base isn't available in any
+ * register. We need to calculate it as TOM-TSEG_SIZE-stolen_size.
+ */
+static u32 __init i830_stolen_base(int num, int slot, int func, size_t stolen_size)
+{
+ return i830_mem_size() - i830_tseg_size() - stolen_size;
+}
+
+static u32 __init i845_stolen_base(int num, int slot, int func, size_t stolen_size)
+{
+ return i830_mem_size() - i845_tseg_size() - stolen_size;
+}
+
+static u32 __init i85x_stolen_base(int num, int slot, int func, size_t stolen_size)
+{
+ return i85x_mem_size() - i85x_tseg_size() - stolen_size;
+}
+
+static u32 __init i865_stolen_base(int num, int slot, int func, size_t stolen_size)
+{
+ /*
+ * FIXME is the graphics stolen memory region
+ * always at TOUD? Ie. is it always the last
+ * one to be allocated by the BIOS?
+ */
+ return read_pci_config_16(0, 0, 0, I865_TOUD) << 16;
+}
+
+static size_t __init i830_stolen_size(int num, int slot, int func)
+{
+ size_t stolen_size;
+ u16 gmch_ctrl;
+
+ gmch_ctrl = read_pci_config_16(0, 0, 0, I830_GMCH_CTRL);
+
+ switch (gmch_ctrl & I830_GMCH_GMS_MASK) {
+ case I830_GMCH_GMS_STOLEN_512:
+ stolen_size = KB(512);
+ break;
+ case I830_GMCH_GMS_STOLEN_1024:
+ stolen_size = MB(1);
+ break;
+ case I830_GMCH_GMS_STOLEN_8192:
+ stolen_size = MB(8);
+ break;
+ case I830_GMCH_GMS_LOCAL:
+ /* local memory isn't part of the normal address space */
+ stolen_size = 0;
+ break;
+ default:
+ return 0;
+ }
+
+ return stolen_size;
+}
+
static size_t __init gen3_stolen_size(int num, int slot, int func)
{
size_t stolen_size;
@@ -313,29 +419,110 @@ static size_t __init gen6_stolen_size(int num, int slot, int func)
return gmch_ctrl << 25; /* 32 MB units */
}
-typedef size_t (*stolen_size_fn)(int num, int slot, int func);
-
-static struct pci_device_id intel_stolen_ids[] __initdata = {
- INTEL_I915G_IDS(gen3_stolen_size),
- INTEL_I915GM_IDS(gen3_stolen_size),
- INTEL_I945G_IDS(gen3_stolen_size),
- INTEL_I945GM_IDS(gen3_stolen_size),
- INTEL_VLV_M_IDS(gen3_stolen_size),
- INTEL_VLV_D_IDS(gen3_stolen_size),
- INTEL_PINEVIEW_IDS(gen3_stolen_size),
- INTEL_I965G_IDS(gen3_stolen_size),
- INTEL_G33_IDS(gen3_stolen_size),
- INTEL_I965GM_IDS(gen3_stolen_size),
- INTEL_GM45_IDS(gen3_stolen_size),
- INTEL_G45_IDS(gen3_stolen_size),
- INTEL_IRONLAKE_D_IDS(gen3_stolen_size),
- INTEL_IRONLAKE_M_IDS(gen3_stolen_size),
- INTEL_SNB_D_IDS(gen6_stolen_size),
- INTEL_SNB_M_IDS(gen6_stolen_size),
- INTEL_IVB_M_IDS(gen6_stolen_size),
- INTEL_IVB_D_IDS(gen6_stolen_size),
- INTEL_HSW_D_IDS(gen6_stolen_size),
- INTEL_HSW_M_IDS(gen6_stolen_size),
+static size_t __init gen8_stolen_size(int num, int slot, int func)
+{
+ u16 gmch_ctrl;
+
+ gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL);
+ gmch_ctrl >>= BDW_GMCH_GMS_SHIFT;
+ gmch_ctrl &= BDW_GMCH_GMS_MASK;
+ return gmch_ctrl << 25; /* 32 MB units */
+}
+
+static size_t __init chv_stolen_size(int num, int slot, int func)
+{
+ u16 gmch_ctrl;
+
+ gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL);
+ gmch_ctrl >>= SNB_GMCH_GMS_SHIFT;
+ gmch_ctrl &= SNB_GMCH_GMS_MASK;
+
+ /*
+ * 0x0 to 0x10: 32MB increments starting at 0MB
+ * 0x11 to 0x16: 4MB increments starting at 8MB
+ * 0x17 to 0x1d: 4MB increments start at 36MB
+ */
+ if (gmch_ctrl < 0x11)
+ return gmch_ctrl << 25;
+ else if (gmch_ctrl < 0x17)
+ return (gmch_ctrl - 0x11 + 2) << 22;
+ else
+ return (gmch_ctrl - 0x17 + 9) << 22;
+}
+
+struct intel_stolen_funcs {
+ size_t (*size)(int num, int slot, int func);
+ u32 (*base)(int num, int slot, int func, size_t size);
+};
+
+static const struct intel_stolen_funcs i830_stolen_funcs __initconst = {
+ .base = i830_stolen_base,
+ .size = i830_stolen_size,
+};
+
+static const struct intel_stolen_funcs i845_stolen_funcs __initconst = {
+ .base = i845_stolen_base,
+ .size = i830_stolen_size,
+};
+
+static const struct intel_stolen_funcs i85x_stolen_funcs __initconst = {
+ .base = i85x_stolen_base,
+ .size = gen3_stolen_size,
+};
+
+static const struct intel_stolen_funcs i865_stolen_funcs __initconst = {
+ .base = i865_stolen_base,
+ .size = gen3_stolen_size,
+};
+
+static const struct intel_stolen_funcs gen3_stolen_funcs __initconst = {
+ .base = intel_stolen_base,
+ .size = gen3_stolen_size,
+};
+
+static const struct intel_stolen_funcs gen6_stolen_funcs __initconst = {
+ .base = intel_stolen_base,
+ .size = gen6_stolen_size,
+};
+
+static const struct intel_stolen_funcs gen8_stolen_funcs __initconst = {
+ .base = intel_stolen_base,
+ .size = gen8_stolen_size,
+};
+
+static const struct intel_stolen_funcs chv_stolen_funcs __initconst = {
+ .base = intel_stolen_base,
+ .size = chv_stolen_size,
+};
+
+static const struct pci_device_id intel_stolen_ids[] __initconst = {
+ INTEL_I830_IDS(&i830_stolen_funcs),
+ INTEL_I845G_IDS(&i845_stolen_funcs),
+ INTEL_I85X_IDS(&i85x_stolen_funcs),
+ INTEL_I865G_IDS(&i865_stolen_funcs),
+ INTEL_I915G_IDS(&gen3_stolen_funcs),
+ INTEL_I915GM_IDS(&gen3_stolen_funcs),
+ INTEL_I945G_IDS(&gen3_stolen_funcs),
+ INTEL_I945GM_IDS(&gen3_stolen_funcs),
+ INTEL_VLV_M_IDS(&gen6_stolen_funcs),
+ INTEL_VLV_D_IDS(&gen6_stolen_funcs),
+ INTEL_PINEVIEW_IDS(&gen3_stolen_funcs),
+ INTEL_I965G_IDS(&gen3_stolen_funcs),
+ INTEL_G33_IDS(&gen3_stolen_funcs),
+ INTEL_I965GM_IDS(&gen3_stolen_funcs),
+ INTEL_GM45_IDS(&gen3_stolen_funcs),
+ INTEL_G45_IDS(&gen3_stolen_funcs),
+ INTEL_IRONLAKE_D_IDS(&gen3_stolen_funcs),
+ INTEL_IRONLAKE_M_IDS(&gen3_stolen_funcs),
+ INTEL_SNB_D_IDS(&gen6_stolen_funcs),
+ INTEL_SNB_M_IDS(&gen6_stolen_funcs),
+ INTEL_IVB_M_IDS(&gen6_stolen_funcs),
+ INTEL_IVB_D_IDS(&gen6_stolen_funcs),
+ INTEL_HSW_D_IDS(&gen6_stolen_funcs),
+ INTEL_HSW_M_IDS(&gen6_stolen_funcs),
+ INTEL_BDW_M_IDS(&gen8_stolen_funcs),
+ INTEL_BDW_D_IDS(&gen8_stolen_funcs),
+ INTEL_CHV_IDS(&chv_stolen_funcs),
};
static void __init intel_graphics_stolen(int num, int slot, int func)
@@ -352,11 +539,13 @@ static void __init intel_graphics_stolen(int num, int slot, int func)
for (i = 0; i < ARRAY_SIZE(intel_stolen_ids); i++) {
if (intel_stolen_ids[i].device == device) {
- stolen_size_fn stolen_size =
- (stolen_size_fn)intel_stolen_ids[i].driver_data;
- size = stolen_size(num, slot, func);
- start = intel_stolen_base(num, slot, func);
+ const struct intel_stolen_funcs *stolen_funcs =
+ (const struct intel_stolen_funcs *)intel_stolen_ids[i].driver_data;
+ size = stolen_funcs->size(num, slot, func);
+ start = stolen_funcs->base(num, slot, func, size);
if (size && start) {
+ printk(KERN_INFO "Reserving Intel graphics stolen memory at 0x%x-0x%x\n",
+ start, start + (u32)size - 1);
/* Mark this space as reserved */
e820_add_region(start, size, E820_RESERVED);
sanitize_e820_map(e820.map,
@@ -368,6 +557,15 @@ static void __init intel_graphics_stolen(int num, int slot, int func)
}
}
+static void __init force_disable_hpet(int num, int slot, int func)
+{
+#ifdef CONFIG_HPET_TIMER
+ boot_hpet_disable = 1;
+ pr_info("x86/hpet: Will disable the HPET for this platform because it's not reliable\n");
+#endif
+}
+
+
#define QFLAG_APPLY_ONCE 0x1
#define QFLAG_APPLIED 0x2
#define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED)
@@ -405,6 +603,12 @@ static struct chipset early_qrk[] __initdata = {
PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check },
{ PCI_VENDOR_ID_INTEL, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA, PCI_ANY_ID,
QFLAG_APPLY_ONCE, intel_graphics_stolen },
+ /*
+ * HPET on current version of Baytrail platform has accuracy
+ * problems, disable it for now:
+ */
+ { PCI_VENDOR_ID_INTEL, 0x0f00,
+ PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet},
{}
};
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index d15f575a861..01d1c187c9f 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -14,9 +14,11 @@
#include <xen/hvc-console.h>
#include <asm/pci-direct.h>
#include <asm/fixmap.h>
-#include <asm/mrst.h>
+#include <asm/intel-mid.h>
#include <asm/pgtable.h>
#include <linux/usb/ehci_def.h>
+#include <linux/efi.h>
+#include <asm/efi.h>
/* Simple VGA output */
#define VGABASE (__ISA_IO_base + 0xb8000)
@@ -234,6 +236,11 @@ static int __init setup_early_printk(char *buf)
early_console_register(&early_hsu_console, keep);
}
#endif
+#ifdef CONFIG_EARLY_PRINTK_EFI
+ if (!strncmp(buf, "efi", 3))
+ early_console_register(&early_efi_console, keep);
+#endif
+
buf++;
}
return 0;
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index f0dcb0ceb6a..0d0c9d4ab6d 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -315,10 +315,6 @@ ENTRY(ret_from_kernel_thread)
ENDPROC(ret_from_kernel_thread)
/*
- * Interrupt exit functions should be protected against kprobes
- */
- .pushsection .kprobes.text, "ax"
-/*
* Return to user mode is not as complex as all this looks,
* but we want the default path for a system call return to
* go as quickly as possible which is why some of this is
@@ -362,12 +358,9 @@ END(ret_from_exception)
#ifdef CONFIG_PREEMPT
ENTRY(resume_kernel)
DISABLE_INTERRUPTS(CLBR_ANY)
- cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ?
- jnz restore_all
need_resched:
- movl TI_flags(%ebp), %ecx # need_resched set ?
- testb $_TIF_NEED_RESCHED, %cl
- jz restore_all
+ cmpl $0,PER_CPU_VAR(__preempt_count)
+ jnz restore_all
testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off (exception path) ?
jz restore_all
call preempt_schedule_irq
@@ -375,10 +368,6 @@ need_resched:
END(resume_kernel)
#endif
CFI_ENDPROC
-/*
- * End of kprobes section
- */
- .popsection
/* SYSENTER_RETURN points to after the "sysenter" instruction in
the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */
@@ -434,8 +423,9 @@ sysenter_past_esp:
jnz sysenter_audit
sysenter_do_call:
cmpl $(NR_syscalls), %eax
- jae syscall_badsys
+ jae sysenter_badsys
call *sys_call_table(,%eax,4)
+sysenter_after_call:
movl %eax,PT_EAX(%esp)
LOCKDEP_SYS_EXIT
DISABLE_INTERRUPTS(CLBR_ANY)
@@ -498,10 +488,6 @@ sysexit_audit:
PTGS_TO_GS_EX
ENDPROC(ia32_sysenter_target)
-/*
- * syscall stub including irq exit should be protected against kprobes
- */
- .pushsection .kprobes.text, "ax"
# system call handler stub
ENTRY(system_call)
RING0_INT_FRAME # can't unwind into user space anyway
@@ -516,6 +502,7 @@ ENTRY(system_call)
jae syscall_badsys
syscall_call:
call *sys_call_table(,%eax,4)
+syscall_after_call:
movl %eax,PT_EAX(%esp) # store the return value
syscall_exit:
LOCKDEP_SYS_EXIT
@@ -530,6 +517,7 @@ syscall_exit:
restore_all:
TRACE_IRQS_IRET
restore_all_notrace:
+#ifdef CONFIG_X86_ESPFIX32
movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
# Warning: PT_OLDSS(%esp) contains the wrong/random values if we
# are returning to the kernel.
@@ -540,6 +528,7 @@ restore_all_notrace:
cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
CFI_REMEMBER_STATE
je ldt_ss # returning to user-space with LDT SS
+#endif
restore_nocheck:
RESTORE_REGS 4 # skip orig_eax/error_code
irq_return:
@@ -552,13 +541,9 @@ ENTRY(iret_exc)
.previous
_ASM_EXTABLE(irq_return,iret_exc)
+#ifdef CONFIG_X86_ESPFIX32
CFI_RESTORE_STATE
ldt_ss:
- larl PT_OLDSS(%esp), %eax
- jnz restore_nocheck
- testl $0x00400000, %eax # returning to 32bit stack?
- jnz restore_nocheck # allright, normal return
-
#ifdef CONFIG_PARAVIRT
/*
* The kernel can't run on a non-flat stack if paravirt mode
@@ -600,6 +585,7 @@ ldt_ss:
lss (%esp), %esp /* switch to espfix segment */
CFI_ADJUST_CFA_OFFSET -8
jmp restore_nocheck
+#endif
CFI_ENDPROC
ENDPROC(system_call)
@@ -690,14 +676,15 @@ syscall_fault:
END(syscall_fault)
syscall_badsys:
- movl $-ENOSYS,PT_EAX(%esp)
- jmp resume_userspace
+ movl $-ENOSYS,%eax
+ jmp syscall_after_call
+END(syscall_badsys)
+
+sysenter_badsys:
+ movl $-ENOSYS,%eax
+ jmp sysenter_after_call
END(syscall_badsys)
CFI_ENDPROC
-/*
- * End of kprobes section
- */
- .popsection
.macro FIXUP_ESPFIX_STACK
/*
@@ -707,6 +694,7 @@ END(syscall_badsys)
* the high word of the segment base from the GDT and swiches to the
* normal stack and adjusts ESP with the matching offset.
*/
+#ifdef CONFIG_X86_ESPFIX32
/* fixup the stack */
mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */
mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */
@@ -716,8 +704,10 @@ END(syscall_badsys)
pushl_cfi %eax
lss (%esp), %esp /* switch to the normal stack segment */
CFI_ADJUST_CFA_OFFSET -8
+#endif
.endm
.macro UNWIND_ESPFIX_STACK
+#ifdef CONFIG_X86_ESPFIX32
movl %ss, %eax
/* see if on espfix stack */
cmpw $__ESPFIX_SS, %ax
@@ -728,6 +718,7 @@ END(syscall_badsys)
/* switch to normal stack */
FIXUP_ESPFIX_STACK
27:
+#endif
.endm
/*
@@ -784,10 +775,6 @@ common_interrupt:
ENDPROC(common_interrupt)
CFI_ENDPROC
-/*
- * Irq entries should be protected against kprobes
- */
- .pushsection .kprobes.text, "ax"
#define BUILD_INTERRUPT3(name, nr, fn) \
ENTRY(name) \
RING0_INT_FRAME; \
@@ -964,10 +951,6 @@ ENTRY(spurious_interrupt_bug)
jmp error_code
CFI_ENDPROC
END(spurious_interrupt_bug)
-/*
- * End of kprobes section
- */
- .popsection
#ifdef CONFIG_XEN
/* Xen doesn't set %esp to be precisely what the normal sysenter
@@ -1085,7 +1068,7 @@ ENTRY(ftrace_caller)
pushl $0 /* Pass NULL as regs pointer */
movl 4*4(%esp), %eax
movl 0x4(%ebp), %edx
- leal function_trace_op, %ecx
+ movl function_trace_op, %ecx
subl $MCOUNT_INSN_SIZE, %eax
.globl ftrace_call
@@ -1143,7 +1126,7 @@ ENTRY(ftrace_regs_caller)
movl 12*4(%esp), %eax /* Load ip (1st parameter) */
subl $MCOUNT_INSN_SIZE, %eax /* Adjust ip */
movl 0x4(%ebp), %edx /* Load parent ip (2nd parameter) */
- leal function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */
+ movl function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */
pushl %esp /* Save pt_regs as 4th parameter */
GLOBAL(ftrace_regs_call)
@@ -1242,10 +1225,15 @@ return_to_handler:
jmp *%ecx
#endif
-/*
- * Some functions should be protected against kprobes
- */
- .pushsection .kprobes.text, "ax"
+#ifdef CONFIG_TRACING
+ENTRY(trace_page_fault)
+ RING0_EC_FRAME
+ ASM_CLAC
+ pushl_cfi $trace_do_page_fault
+ jmp error_code
+ CFI_ENDPROC
+END(trace_page_fault)
+#endif
ENTRY(page_fault)
RING0_EC_FRAME
@@ -1348,11 +1336,13 @@ END(debug)
ENTRY(nmi)
RING0_INT_FRAME
ASM_CLAC
+#ifdef CONFIG_X86_ESPFIX32
pushl_cfi %eax
movl %ss, %eax
cmpw $__ESPFIX_SS, %ax
popl_cfi %eax
je nmi_espfix_stack
+#endif
cmpl $ia32_sysenter_target,(%esp)
je nmi_stack_fixup
pushl_cfi %eax
@@ -1392,6 +1382,7 @@ nmi_debug_stack_check:
FIX_STACK 24, nmi_stack_correct, 1
jmp nmi_stack_correct
+#ifdef CONFIG_X86_ESPFIX32
nmi_espfix_stack:
/* We have a RING0_INT_FRAME here.
*
@@ -1413,6 +1404,7 @@ nmi_espfix_stack:
lss 12+4(%esp), %esp # back to espfix stack
CFI_ADJUST_CFA_OFFSET -24
jmp irq_return
+#endif
CFI_ENDPROC
END(nmi)
@@ -1446,7 +1438,3 @@ ENTRY(async_page_fault)
END(async_page_fault)
#endif
-/*
- * End of kprobes section
- */
- .popsection
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index b077f4cc225..c844f0816ab 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -36,7 +36,7 @@
* - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack
* frame that is otherwise undefined after a SYSCALL
* - TRACE_IRQ_* - Trace hard interrupt state for lock debugging.
- * - errorentry/paranoidentry/zeroentry - Define exception entry points.
+ * - idtentry - Define exception entry points.
*/
#include <linux/linkage.h>
@@ -53,11 +53,11 @@
#include <asm/page_types.h>
#include <asm/irqflags.h>
#include <asm/paravirt.h>
-#include <asm/ftrace.h>
#include <asm/percpu.h>
#include <asm/asm.h>
#include <asm/context_tracking.h>
#include <asm/smap.h>
+#include <asm/pgtable_types.h>
#include <linux/err.h>
/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
@@ -69,209 +69,6 @@
.code64
.section .entry.text, "ax"
-#ifdef CONFIG_FUNCTION_TRACER
-
-#ifdef CC_USING_FENTRY
-# define function_hook __fentry__
-#else
-# define function_hook mcount
-#endif
-
-#ifdef CONFIG_DYNAMIC_FTRACE
-
-ENTRY(function_hook)
- retq
-END(function_hook)
-
-/* skip is set if stack has been adjusted */
-.macro ftrace_caller_setup skip=0
- MCOUNT_SAVE_FRAME \skip
-
- /* Load the ftrace_ops into the 3rd parameter */
- leaq function_trace_op, %rdx
-
- /* Load ip into the first parameter */
- movq RIP(%rsp), %rdi
- subq $MCOUNT_INSN_SIZE, %rdi
- /* Load the parent_ip into the second parameter */
-#ifdef CC_USING_FENTRY
- movq SS+16(%rsp), %rsi
-#else
- movq 8(%rbp), %rsi
-#endif
-.endm
-
-ENTRY(ftrace_caller)
- /* Check if tracing was disabled (quick check) */
- cmpl $0, function_trace_stop
- jne ftrace_stub
-
- ftrace_caller_setup
- /* regs go into 4th parameter (but make it NULL) */
- movq $0, %rcx
-
-GLOBAL(ftrace_call)
- call ftrace_stub
-
- MCOUNT_RESTORE_FRAME
-ftrace_return:
-
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-GLOBAL(ftrace_graph_call)
- jmp ftrace_stub
-#endif
-
-GLOBAL(ftrace_stub)
- retq
-END(ftrace_caller)
-
-ENTRY(ftrace_regs_caller)
- /* Save the current flags before compare (in SS location)*/
- pushfq
-
- /* Check if tracing was disabled (quick check) */
- cmpl $0, function_trace_stop
- jne ftrace_restore_flags
-
- /* skip=8 to skip flags saved in SS */
- ftrace_caller_setup 8
-
- /* Save the rest of pt_regs */
- movq %r15, R15(%rsp)
- movq %r14, R14(%rsp)
- movq %r13, R13(%rsp)
- movq %r12, R12(%rsp)
- movq %r11, R11(%rsp)
- movq %r10, R10(%rsp)
- movq %rbp, RBP(%rsp)
- movq %rbx, RBX(%rsp)
- /* Copy saved flags */
- movq SS(%rsp), %rcx
- movq %rcx, EFLAGS(%rsp)
- /* Kernel segments */
- movq $__KERNEL_DS, %rcx
- movq %rcx, SS(%rsp)
- movq $__KERNEL_CS, %rcx
- movq %rcx, CS(%rsp)
- /* Stack - skipping return address */
- leaq SS+16(%rsp), %rcx
- movq %rcx, RSP(%rsp)
-
- /* regs go into 4th parameter */
- leaq (%rsp), %rcx
-
-GLOBAL(ftrace_regs_call)
- call ftrace_stub
-
- /* Copy flags back to SS, to restore them */
- movq EFLAGS(%rsp), %rax
- movq %rax, SS(%rsp)
-
- /* Handlers can change the RIP */
- movq RIP(%rsp), %rax
- movq %rax, SS+8(%rsp)
-
- /* restore the rest of pt_regs */
- movq R15(%rsp), %r15
- movq R14(%rsp), %r14
- movq R13(%rsp), %r13
- movq R12(%rsp), %r12
- movq R10(%rsp), %r10
- movq RBP(%rsp), %rbp
- movq RBX(%rsp), %rbx
-
- /* skip=8 to skip flags saved in SS */
- MCOUNT_RESTORE_FRAME 8
-
- /* Restore flags */
- popfq
-
- jmp ftrace_return
-ftrace_restore_flags:
- popfq
- jmp ftrace_stub
-
-END(ftrace_regs_caller)
-
-
-#else /* ! CONFIG_DYNAMIC_FTRACE */
-
-ENTRY(function_hook)
- cmpl $0, function_trace_stop
- jne ftrace_stub
-
- cmpq $ftrace_stub, ftrace_trace_function
- jnz trace
-
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
- cmpq $ftrace_stub, ftrace_graph_return
- jnz ftrace_graph_caller
-
- cmpq $ftrace_graph_entry_stub, ftrace_graph_entry
- jnz ftrace_graph_caller
-#endif
-
-GLOBAL(ftrace_stub)
- retq
-
-trace:
- MCOUNT_SAVE_FRAME
-
- movq RIP(%rsp), %rdi
-#ifdef CC_USING_FENTRY
- movq SS+16(%rsp), %rsi
-#else
- movq 8(%rbp), %rsi
-#endif
- subq $MCOUNT_INSN_SIZE, %rdi
-
- call *ftrace_trace_function
-
- MCOUNT_RESTORE_FRAME
-
- jmp ftrace_stub
-END(function_hook)
-#endif /* CONFIG_DYNAMIC_FTRACE */
-#endif /* CONFIG_FUNCTION_TRACER */
-
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-ENTRY(ftrace_graph_caller)
- MCOUNT_SAVE_FRAME
-
-#ifdef CC_USING_FENTRY
- leaq SS+16(%rsp), %rdi
- movq $0, %rdx /* No framepointers needed */
-#else
- leaq 8(%rbp), %rdi
- movq (%rbp), %rdx
-#endif
- movq RIP(%rsp), %rsi
- subq $MCOUNT_INSN_SIZE, %rsi
-
- call prepare_ftrace_return
-
- MCOUNT_RESTORE_FRAME
-
- retq
-END(ftrace_graph_caller)
-
-GLOBAL(return_to_handler)
- subq $24, %rsp
-
- /* Save the return values */
- movq %rax, (%rsp)
- movq %rdx, 8(%rsp)
- movq %rbp, %rdi
-
- call ftrace_return_to_handler
-
- movq %rax, %rdi
- movq 8(%rsp), %rdx
- movq (%rsp), %rax
- addq $24, %rsp
- jmp *%rdi
-#endif
-
#ifndef CONFIG_PREEMPT
#define retint_kernel retint_restore_args
@@ -487,8 +284,6 @@ ENDPROC(native_usergs_sysret64)
TRACE_IRQS_OFF
.endm
-/* save complete stack frame */
- .pushsection .kprobes.text, "ax"
ENTRY(save_paranoid)
XCPT_FRAME 1 RDI+8
cld
@@ -517,7 +312,6 @@ ENTRY(save_paranoid)
1: ret
CFI_ENDPROC
END(save_paranoid)
- .popsection
/*
* A newly forked process directly context switches into this address.
@@ -975,10 +769,6 @@ END(interrupt)
call \func
.endm
-/*
- * Interrupt entry/exit should be protected against kprobes
- */
- .pushsection .kprobes.text, "ax"
/*
* The interrupt stubs push (~vector+0x80) onto the stack and
* then jump to common_interrupt.
@@ -1041,12 +831,45 @@ restore_args:
irq_return:
INTERRUPT_RETURN
- _ASM_EXTABLE(irq_return, bad_iret)
-#ifdef CONFIG_PARAVIRT
ENTRY(native_iret)
+ /*
+ * Are we returning to a stack segment from the LDT? Note: in
+ * 64-bit mode SS:RSP on the exception stack is always valid.
+ */
+#ifdef CONFIG_X86_ESPFIX64
+ testb $4,(SS-RIP)(%rsp)
+ jnz native_irq_return_ldt
+#endif
+
+native_irq_return_iret:
iretq
- _ASM_EXTABLE(native_iret, bad_iret)
+ _ASM_EXTABLE(native_irq_return_iret, bad_iret)
+
+#ifdef CONFIG_X86_ESPFIX64
+native_irq_return_ldt:
+ pushq_cfi %rax
+ pushq_cfi %rdi
+ SWAPGS
+ movq PER_CPU_VAR(espfix_waddr),%rdi
+ movq %rax,(0*8)(%rdi) /* RAX */
+ movq (2*8)(%rsp),%rax /* RIP */
+ movq %rax,(1*8)(%rdi)
+ movq (3*8)(%rsp),%rax /* CS */
+ movq %rax,(2*8)(%rdi)
+ movq (4*8)(%rsp),%rax /* RFLAGS */
+ movq %rax,(3*8)(%rdi)
+ movq (6*8)(%rsp),%rax /* SS */
+ movq %rax,(5*8)(%rdi)
+ movq (5*8)(%rsp),%rax /* RSP */
+ movq %rax,(4*8)(%rdi)
+ andl $0xffff0000,%eax
+ popq_cfi %rdi
+ orq PER_CPU_VAR(espfix_stack),%rax
+ SWAPGS
+ movq %rax,%rsp
+ popq_cfi %rax
+ jmp native_irq_return_iret
#endif
.section .fixup,"ax"
@@ -1103,22 +926,46 @@ retint_signal:
/* Returning to kernel space. Check if we need preemption */
/* rcx: threadinfo. interrupts off. */
ENTRY(retint_kernel)
- cmpl $0,TI_preempt_count(%rcx)
+ cmpl $0,PER_CPU_VAR(__preempt_count)
jnz retint_restore_args
- bt $TIF_NEED_RESCHED,TI_flags(%rcx)
- jnc retint_restore_args
bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */
jnc retint_restore_args
call preempt_schedule_irq
jmp exit_intr
#endif
-
CFI_ENDPROC
END(common_interrupt)
-/*
- * End of kprobes section
- */
- .popsection
+
+ /*
+ * If IRET takes a fault on the espfix stack, then we
+ * end up promoting it to a doublefault. In that case,
+ * modify the stack to make it look like we just entered
+ * the #GP handler from user space, similar to bad_iret.
+ */
+#ifdef CONFIG_X86_ESPFIX64
+ ALIGN
+__do_double_fault:
+ XCPT_FRAME 1 RDI+8
+ movq RSP(%rdi),%rax /* Trap on the espfix stack? */
+ sarq $PGDIR_SHIFT,%rax
+ cmpl $ESPFIX_PGD_ENTRY,%eax
+ jne do_double_fault /* No, just deliver the fault */
+ cmpl $__KERNEL_CS,CS(%rdi)
+ jne do_double_fault
+ movq RIP(%rdi),%rax
+ cmpq $native_irq_return_iret,%rax
+ jne do_double_fault /* This shouldn't happen... */
+ movq PER_CPU_VAR(kernel_stack),%rax
+ subq $(6*8-KERNEL_STACK_OFFSET),%rax /* Reset to original stack */
+ movq %rax,RSP(%rdi)
+ movq $0,(%rax) /* Missing (lost) #GP error code */
+ movq $general_protection,RIP(%rdi)
+ retq
+ CFI_ENDPROC
+END(__do_double_fault)
+#else
+# define __do_double_fault do_double_fault
+#endif
/*
* APIC interrupts.
@@ -1205,114 +1052,100 @@ apicinterrupt IRQ_WORK_VECTOR \
/*
* Exception entry points.
*/
-.macro zeroentry sym do_sym
-ENTRY(\sym)
- INTR_FRAME
- ASM_CLAC
- PARAVIRT_ADJUST_EXCEPTION_FRAME
- pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
- subq $ORIG_RAX-R15, %rsp
- CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
- call error_entry
- DEFAULT_FRAME 0
- movq %rsp,%rdi /* pt_regs pointer */
- xorl %esi,%esi /* no error code */
- call \do_sym
- jmp error_exit /* %ebx: no swapgs flag */
- CFI_ENDPROC
-END(\sym)
-.endm
+#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8)
-.macro paranoidzeroentry sym do_sym
+.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
ENTRY(\sym)
- INTR_FRAME
- ASM_CLAC
- PARAVIRT_ADJUST_EXCEPTION_FRAME
- pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
- subq $ORIG_RAX-R15, %rsp
- CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
- call save_paranoid
- TRACE_IRQS_OFF
- movq %rsp,%rdi /* pt_regs pointer */
- xorl %esi,%esi /* no error code */
- call \do_sym
- jmp paranoid_exit /* %ebx: no swapgs flag */
- CFI_ENDPROC
-END(\sym)
-.endm
+ /* Sanity check */
+ .if \shift_ist != -1 && \paranoid == 0
+ .error "using shift_ist requires paranoid=1"
+ .endif
-#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8)
-.macro paranoidzeroentry_ist sym do_sym ist
-ENTRY(\sym)
+ .if \has_error_code
+ XCPT_FRAME
+ .else
INTR_FRAME
- ASM_CLAC
- PARAVIRT_ADJUST_EXCEPTION_FRAME
- pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
- subq $ORIG_RAX-R15, %rsp
- CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
- call save_paranoid
- TRACE_IRQS_OFF_DEBUG
- movq %rsp,%rdi /* pt_regs pointer */
- xorl %esi,%esi /* no error code */
- subq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist)
- call \do_sym
- addq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist)
- jmp paranoid_exit /* %ebx: no swapgs flag */
- CFI_ENDPROC
-END(\sym)
-.endm
+ .endif
-.macro errorentry sym do_sym
-ENTRY(\sym)
- XCPT_FRAME
ASM_CLAC
PARAVIRT_ADJUST_EXCEPTION_FRAME
- subq $ORIG_RAX-R15, %rsp
- CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
- call error_entry
- DEFAULT_FRAME 0
- movq %rsp,%rdi /* pt_regs pointer */
- movq ORIG_RAX(%rsp),%rsi /* get error code */
- movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */
- call \do_sym
- jmp error_exit /* %ebx: no swapgs flag */
- CFI_ENDPROC
-END(\sym)
-.endm
- /* error code is on the stack already */
-.macro paranoiderrorentry sym do_sym
-ENTRY(\sym)
- XCPT_FRAME
- ASM_CLAC
- PARAVIRT_ADJUST_EXCEPTION_FRAME
+ .ifeq \has_error_code
+ pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
+ .endif
+
subq $ORIG_RAX-R15, %rsp
CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
+
+ .if \paranoid
call save_paranoid
+ .else
+ call error_entry
+ .endif
+
DEFAULT_FRAME 0
+
+ .if \paranoid
+ .if \shift_ist != -1
+ TRACE_IRQS_OFF_DEBUG /* reload IDT in case of recursion */
+ .else
TRACE_IRQS_OFF
+ .endif
+ .endif
+
movq %rsp,%rdi /* pt_regs pointer */
+
+ .if \has_error_code
movq ORIG_RAX(%rsp),%rsi /* get error code */
movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */
+ .else
+ xorl %esi,%esi /* no error code */
+ .endif
+
+ .if \shift_ist != -1
+ subq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist)
+ .endif
+
call \do_sym
+
+ .if \shift_ist != -1
+ addq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist)
+ .endif
+
+ .if \paranoid
jmp paranoid_exit /* %ebx: no swapgs flag */
+ .else
+ jmp error_exit /* %ebx: no swapgs flag */
+ .endif
+
CFI_ENDPROC
END(\sym)
.endm
-zeroentry divide_error do_divide_error
-zeroentry overflow do_overflow
-zeroentry bounds do_bounds
-zeroentry invalid_op do_invalid_op
-zeroentry device_not_available do_device_not_available
-paranoiderrorentry double_fault do_double_fault
-zeroentry coprocessor_segment_overrun do_coprocessor_segment_overrun
-errorentry invalid_TSS do_invalid_TSS
-errorentry segment_not_present do_segment_not_present
-zeroentry spurious_interrupt_bug do_spurious_interrupt_bug
-zeroentry coprocessor_error do_coprocessor_error
-errorentry alignment_check do_alignment_check
-zeroentry simd_coprocessor_error do_simd_coprocessor_error
+#ifdef CONFIG_TRACING
+.macro trace_idtentry sym do_sym has_error_code:req
+idtentry trace(\sym) trace(\do_sym) has_error_code=\has_error_code
+idtentry \sym \do_sym has_error_code=\has_error_code
+.endm
+#else
+.macro trace_idtentry sym do_sym has_error_code:req
+idtentry \sym \do_sym has_error_code=\has_error_code
+.endm
+#endif
+
+idtentry divide_error do_divide_error has_error_code=0
+idtentry overflow do_overflow has_error_code=0
+idtentry bounds do_bounds has_error_code=0
+idtentry invalid_op do_invalid_op has_error_code=0
+idtentry device_not_available do_device_not_available has_error_code=0
+idtentry double_fault __do_double_fault has_error_code=1 paranoid=1
+idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0
+idtentry invalid_TSS do_invalid_TSS has_error_code=1
+idtentry segment_not_present do_segment_not_present has_error_code=1
+idtentry spurious_interrupt_bug do_spurious_interrupt_bug has_error_code=0
+idtentry coprocessor_error do_coprocessor_error has_error_code=0
+idtentry alignment_check do_alignment_check has_error_code=1
+idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0
/* Reload gs selector with exception handling */
@@ -1342,7 +1175,7 @@ bad_gs:
.previous
/* Call softirq on interrupt stack. Interrupts are off. */
-ENTRY(call_softirq)
+ENTRY(do_softirq_own_stack)
CFI_STARTPROC
pushq_cfi %rbp
CFI_REL_OFFSET rbp,0
@@ -1359,10 +1192,10 @@ ENTRY(call_softirq)
decl PER_CPU_VAR(irq_count)
ret
CFI_ENDPROC
-END(call_softirq)
+END(do_softirq_own_stack)
#ifdef CONFIG_XEN
-zeroentry xen_hypervisor_callback xen_do_hypervisor_callback
+idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0
/*
* A note on the "critical region" in our callback handler.
@@ -1468,26 +1301,21 @@ apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
hyperv_callback_vector hyperv_vector_handler
#endif /* CONFIG_HYPERV */
-/*
- * Some functions should be protected against kprobes
- */
- .pushsection .kprobes.text, "ax"
-
-paranoidzeroentry_ist debug do_debug DEBUG_STACK
-paranoidzeroentry_ist int3 do_int3 DEBUG_STACK
-paranoiderrorentry stack_segment do_stack_segment
+idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK
+idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK
+idtentry stack_segment do_stack_segment has_error_code=1 paranoid=1
#ifdef CONFIG_XEN
-zeroentry xen_debug do_debug
-zeroentry xen_int3 do_int3
-errorentry xen_stack_segment do_stack_segment
+idtentry xen_debug do_debug has_error_code=0
+idtentry xen_int3 do_int3 has_error_code=0
+idtentry xen_stack_segment do_stack_segment has_error_code=1
#endif
-errorentry general_protection do_general_protection
-errorentry page_fault do_page_fault
+idtentry general_protection do_general_protection has_error_code=1
+trace_idtentry page_fault do_page_fault has_error_code=1
#ifdef CONFIG_KVM_GUEST
-errorentry async_page_fault do_async_page_fault
+idtentry async_page_fault do_async_page_fault has_error_code=1
#endif
#ifdef CONFIG_X86_MCE
-paranoidzeroentry machine_check *machine_check_vector(%rip)
+idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip)
#endif
/*
@@ -1592,7 +1420,7 @@ error_sti:
*/
error_kernelspace:
incl %ebx
- leaq irq_return(%rip),%rcx
+ leaq native_irq_return_iret(%rip),%rcx
cmpq %rcx,RIP+8(%rsp)
je error_swapgs
movl %ecx,%eax /* zero extend */
@@ -1889,7 +1717,3 @@ ENTRY(ignore_sysret)
CFI_ENDPROC
END(ignore_sysret)
-/*
- * End of kprobes section
- */
- .popsection
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
new file mode 100644
index 00000000000..94d857fb103
--- /dev/null
+++ b/arch/x86/kernel/espfix_64.c
@@ -0,0 +1,208 @@
+/* ----------------------------------------------------------------------- *
+ *
+ * Copyright 2014 Intel Corporation; author: H. Peter Anvin
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * The IRET instruction, when returning to a 16-bit segment, only
+ * restores the bottom 16 bits of the user space stack pointer. This
+ * causes some 16-bit software to break, but it also leaks kernel state
+ * to user space.
+ *
+ * This works around this by creating percpu "ministacks", each of which
+ * is mapped 2^16 times 64K apart. When we detect that the return SS is
+ * on the LDT, we copy the IRET frame to the ministack and use the
+ * relevant alias to return to userspace. The ministacks are mapped
+ * readonly, so if the IRET fault we promote #GP to #DF which is an IST
+ * vector and thus has its own stack; we then do the fixup in the #DF
+ * handler.
+ *
+ * This file sets up the ministacks and the related page tables. The
+ * actual ministack invocation is in entry_64.S.
+ */
+
+#include <linux/init.h>
+#include <linux/init_task.h>
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <linux/gfp.h>
+#include <linux/random.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/setup.h>
+#include <asm/espfix.h>
+
+/*
+ * Note: we only need 6*8 = 48 bytes for the espfix stack, but round
+ * it up to a cache line to avoid unnecessary sharing.
+ */
+#define ESPFIX_STACK_SIZE (8*8UL)
+#define ESPFIX_STACKS_PER_PAGE (PAGE_SIZE/ESPFIX_STACK_SIZE)
+
+/* There is address space for how many espfix pages? */
+#define ESPFIX_PAGE_SPACE (1UL << (PGDIR_SHIFT-PAGE_SHIFT-16))
+
+#define ESPFIX_MAX_CPUS (ESPFIX_STACKS_PER_PAGE * ESPFIX_PAGE_SPACE)
+#if CONFIG_NR_CPUS > ESPFIX_MAX_CPUS
+# error "Need more than one PGD for the ESPFIX hack"
+#endif
+
+#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO)
+
+/* This contains the *bottom* address of the espfix stack */
+DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack);
+DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr);
+
+/* Initialization mutex - should this be a spinlock? */
+static DEFINE_MUTEX(espfix_init_mutex);
+
+/* Page allocation bitmap - each page serves ESPFIX_STACKS_PER_PAGE CPUs */
+#define ESPFIX_MAX_PAGES DIV_ROUND_UP(CONFIG_NR_CPUS, ESPFIX_STACKS_PER_PAGE)
+static void *espfix_pages[ESPFIX_MAX_PAGES];
+
+static __page_aligned_bss pud_t espfix_pud_page[PTRS_PER_PUD]
+ __aligned(PAGE_SIZE);
+
+static unsigned int page_random, slot_random;
+
+/*
+ * This returns the bottom address of the espfix stack for a specific CPU.
+ * The math allows for a non-power-of-two ESPFIX_STACK_SIZE, in which case
+ * we have to account for some amount of padding at the end of each page.
+ */
+static inline unsigned long espfix_base_addr(unsigned int cpu)
+{
+ unsigned long page, slot;
+ unsigned long addr;
+
+ page = (cpu / ESPFIX_STACKS_PER_PAGE) ^ page_random;
+ slot = (cpu + slot_random) % ESPFIX_STACKS_PER_PAGE;
+ addr = (page << PAGE_SHIFT) + (slot * ESPFIX_STACK_SIZE);
+ addr = (addr & 0xffffUL) | ((addr & ~0xffffUL) << 16);
+ addr += ESPFIX_BASE_ADDR;
+ return addr;
+}
+
+#define PTE_STRIDE (65536/PAGE_SIZE)
+#define ESPFIX_PTE_CLONES (PTRS_PER_PTE/PTE_STRIDE)
+#define ESPFIX_PMD_CLONES PTRS_PER_PMD
+#define ESPFIX_PUD_CLONES (65536/(ESPFIX_PTE_CLONES*ESPFIX_PMD_CLONES))
+
+#define PGTABLE_PROT ((_KERNPG_TABLE & ~_PAGE_RW) | _PAGE_NX)
+
+static void init_espfix_random(void)
+{
+ unsigned long rand;
+
+ /*
+ * This is run before the entropy pools are initialized,
+ * but this is hopefully better than nothing.
+ */
+ if (!arch_get_random_long(&rand)) {
+ /* The constant is an arbitrary large prime */
+ rdtscll(rand);
+ rand *= 0xc345c6b72fd16123UL;
+ }
+
+ slot_random = rand % ESPFIX_STACKS_PER_PAGE;
+ page_random = (rand / ESPFIX_STACKS_PER_PAGE)
+ & (ESPFIX_PAGE_SPACE - 1);
+}
+
+void __init init_espfix_bsp(void)
+{
+ pgd_t *pgd_p;
+ pteval_t ptemask;
+
+ ptemask = __supported_pte_mask;
+
+ /* Install the espfix pud into the kernel page directory */
+ pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
+ pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page);
+
+ /* Randomize the locations */
+ init_espfix_random();
+
+ /* The rest is the same as for any other processor */
+ init_espfix_ap();
+}
+
+void init_espfix_ap(void)
+{
+ unsigned int cpu, page;
+ unsigned long addr;
+ pud_t pud, *pud_p;
+ pmd_t pmd, *pmd_p;
+ pte_t pte, *pte_p;
+ int n;
+ void *stack_page;
+ pteval_t ptemask;
+
+ /* We only have to do this once... */
+ if (likely(this_cpu_read(espfix_stack)))
+ return; /* Already initialized */
+
+ cpu = smp_processor_id();
+ addr = espfix_base_addr(cpu);
+ page = cpu/ESPFIX_STACKS_PER_PAGE;
+
+ /* Did another CPU already set this up? */
+ stack_page = ACCESS_ONCE(espfix_pages[page]);
+ if (likely(stack_page))
+ goto done;
+
+ mutex_lock(&espfix_init_mutex);
+
+ /* Did we race on the lock? */
+ stack_page = ACCESS_ONCE(espfix_pages[page]);
+ if (stack_page)
+ goto unlock_done;
+
+ ptemask = __supported_pte_mask;
+
+ pud_p = &espfix_pud_page[pud_index(addr)];
+ pud = *pud_p;
+ if (!pud_present(pud)) {
+ pmd_p = (pmd_t *)__get_free_page(PGALLOC_GFP);
+ pud = __pud(__pa(pmd_p) | (PGTABLE_PROT & ptemask));
+ paravirt_alloc_pmd(&init_mm, __pa(pmd_p) >> PAGE_SHIFT);
+ for (n = 0; n < ESPFIX_PUD_CLONES; n++)
+ set_pud(&pud_p[n], pud);
+ }
+
+ pmd_p = pmd_offset(&pud, addr);
+ pmd = *pmd_p;
+ if (!pmd_present(pmd)) {
+ pte_p = (pte_t *)__get_free_page(PGALLOC_GFP);
+ pmd = __pmd(__pa(pte_p) | (PGTABLE_PROT & ptemask));
+ paravirt_alloc_pte(&init_mm, __pa(pte_p) >> PAGE_SHIFT);
+ for (n = 0; n < ESPFIX_PMD_CLONES; n++)
+ set_pmd(&pmd_p[n], pmd);
+ }
+
+ pte_p = pte_offset_kernel(&pmd, addr);
+ stack_page = (void *)__get_free_page(GFP_KERNEL);
+ pte = __pte(__pa(stack_page) | (__PAGE_KERNEL_RO & ptemask));
+ for (n = 0; n < ESPFIX_PTE_CLONES; n++)
+ set_pte(&pte_p[n*PTE_STRIDE], pte);
+
+ /* Job is done for this CPU and any CPU which shares this page */
+ ACCESS_ONCE(espfix_pages[page]) = stack_page;
+
+unlock_done:
+ mutex_unlock(&espfix_init_mutex);
+done:
+ this_cpu_write(espfix_stack, addr);
+ this_cpu_write(espfix_waddr, (unsigned long)stack_page
+ + (addr & ~PAGE_MASK));
+}
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 42a392a9fd0..cbc4a91b131 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -77,8 +77,7 @@ within(unsigned long addr, unsigned long start, unsigned long end)
return addr >= start && addr < end;
}
-static int
-do_ftrace_mod_code(unsigned long ip, const void *new_code)
+static unsigned long text_ip_addr(unsigned long ip)
{
/*
* On x86_64, kernel text mappings are mapped read-only with
@@ -91,7 +90,7 @@ do_ftrace_mod_code(unsigned long ip, const void *new_code)
if (within(ip, (unsigned long)_text, (unsigned long)_etext))
ip = (unsigned long)__va(__pa_symbol(ip));
- return probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE);
+ return ip;
}
static const unsigned char *ftrace_nop_replace(void)
@@ -123,8 +122,10 @@ ftrace_modify_code_direct(unsigned long ip, unsigned const char *old_code,
if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0)
return -EINVAL;
+ ip = text_ip_addr(ip);
+
/* replace the text with the new text */
- if (do_ftrace_mod_code(ip, new_code))
+ if (probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE))
return -EPERM;
sync_core();
@@ -221,33 +222,56 @@ int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr,
return -EINVAL;
}
-int ftrace_update_ftrace_func(ftrace_func_t func)
+static unsigned long ftrace_update_func;
+
+static int update_ftrace_func(unsigned long ip, void *new)
{
- unsigned long ip = (unsigned long)(&ftrace_call);
- unsigned char old[MCOUNT_INSN_SIZE], *new;
+ unsigned char old[MCOUNT_INSN_SIZE];
int ret;
- memcpy(old, &ftrace_call, MCOUNT_INSN_SIZE);
- new = ftrace_call_replace(ip, (unsigned long)func);
+ memcpy(old, (void *)ip, MCOUNT_INSN_SIZE);
+
+ ftrace_update_func = ip;
+ /* Make sure the breakpoints see the ftrace_update_func update */
+ smp_wmb();
/* See comment above by declaration of modifying_ftrace_code */
atomic_inc(&modifying_ftrace_code);
ret = ftrace_modify_code(ip, old, new);
+ atomic_dec(&modifying_ftrace_code);
+
+ return ret;
+}
+
+int ftrace_update_ftrace_func(ftrace_func_t func)
+{
+ unsigned long ip = (unsigned long)(&ftrace_call);
+ unsigned char *new;
+ int ret;
+
+ new = ftrace_call_replace(ip, (unsigned long)func);
+ ret = update_ftrace_func(ip, new);
+
/* Also update the regs callback function */
if (!ret) {
ip = (unsigned long)(&ftrace_regs_call);
- memcpy(old, &ftrace_regs_call, MCOUNT_INSN_SIZE);
new = ftrace_call_replace(ip, (unsigned long)func);
- ret = ftrace_modify_code(ip, old, new);
+ ret = update_ftrace_func(ip, new);
}
- atomic_dec(&modifying_ftrace_code);
-
return ret;
}
+static int is_ftrace_caller(unsigned long ip)
+{
+ if (ip == ftrace_update_func)
+ return 1;
+
+ return 0;
+}
+
/*
* A breakpoint was added to the code address we are about to
* modify, and this is the handle that will just skip over it.
@@ -257,10 +281,13 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
*/
int ftrace_int3_handler(struct pt_regs *regs)
{
+ unsigned long ip;
+
if (WARN_ON_ONCE(!regs))
return 0;
- if (!ftrace_location(regs->ip - 1))
+ ip = regs->ip - 1;
+ if (!ftrace_location(ip) && !is_ftrace_caller(ip))
return 0;
regs->ip += MCOUNT_INSN_SIZE - 1;
@@ -270,18 +297,12 @@ int ftrace_int3_handler(struct pt_regs *regs)
static int ftrace_write(unsigned long ip, const char *val, int size)
{
- /*
- * On x86_64, kernel text mappings are mapped read-only with
- * CONFIG_DEBUG_RODATA. So we use the kernel identity mapping instead
- * of the kernel text mapping to modify the kernel text.
- *
- * For 32bit kernels, these mappings are same and we can use
- * kernel identity mapping to modify code.
- */
- if (within(ip, (unsigned long)_text, (unsigned long)_etext))
- ip = (unsigned long)__va(__pa_symbol(ip));
+ ip = text_ip_addr(ip);
+
+ if (probe_kernel_write((void *)ip, val, size))
+ return -EPERM;
- return probe_kernel_write((void *)ip, val, size);
+ return 0;
}
static int add_break(unsigned long ip, const char *old)
@@ -296,10 +317,7 @@ static int add_break(unsigned long ip, const char *old)
if (memcmp(replaced, old, MCOUNT_INSN_SIZE) != 0)
return -EINVAL;
- if (ftrace_write(ip, &brk, 1))
- return -EPERM;
-
- return 0;
+ return ftrace_write(ip, &brk, 1);
}
static int add_brk_on_call(struct dyn_ftrace *rec, unsigned long addr)
@@ -322,40 +340,14 @@ static int add_brk_on_nop(struct dyn_ftrace *rec)
return add_break(rec->ip, old);
}
-/*
- * If the record has the FTRACE_FL_REGS set, that means that it
- * wants to convert to a callback that saves all regs. If FTRACE_FL_REGS
- * is not not set, then it wants to convert to the normal callback.
- */
-static unsigned long get_ftrace_addr(struct dyn_ftrace *rec)
-{
- if (rec->flags & FTRACE_FL_REGS)
- return (unsigned long)FTRACE_REGS_ADDR;
- else
- return (unsigned long)FTRACE_ADDR;
-}
-
-/*
- * The FTRACE_FL_REGS_EN is set when the record already points to
- * a function that saves all the regs. Basically the '_EN' version
- * represents the current state of the function.
- */
-static unsigned long get_ftrace_old_addr(struct dyn_ftrace *rec)
-{
- if (rec->flags & FTRACE_FL_REGS_EN)
- return (unsigned long)FTRACE_REGS_ADDR;
- else
- return (unsigned long)FTRACE_ADDR;
-}
-
static int add_breakpoints(struct dyn_ftrace *rec, int enable)
{
unsigned long ftrace_addr;
int ret;
- ret = ftrace_test_record(rec, enable);
+ ftrace_addr = ftrace_get_addr_curr(rec);
- ftrace_addr = get_ftrace_addr(rec);
+ ret = ftrace_test_record(rec, enable);
switch (ret) {
case FTRACE_UPDATE_IGNORE:
@@ -365,10 +357,7 @@ static int add_breakpoints(struct dyn_ftrace *rec, int enable)
/* converting nop to call */
return add_brk_on_nop(rec);
- case FTRACE_UPDATE_MODIFY_CALL_REGS:
case FTRACE_UPDATE_MODIFY_CALL:
- ftrace_addr = get_ftrace_old_addr(rec);
- /* fall through */
case FTRACE_UPDATE_MAKE_NOP:
/* converting a call to a nop */
return add_brk_on_call(rec, ftrace_addr);
@@ -398,7 +387,7 @@ static int remove_breakpoint(struct dyn_ftrace *rec)
/* If this does not have a breakpoint, we are done */
if (ins[0] != brk)
- return -1;
+ return 0;
nop = ftrace_nop_replace();
@@ -413,14 +402,14 @@ static int remove_breakpoint(struct dyn_ftrace *rec)
* If not, don't touch the breakpoint, we make just create
* a disaster.
*/
- ftrace_addr = get_ftrace_addr(rec);
+ ftrace_addr = ftrace_get_addr_new(rec);
nop = ftrace_call_replace(ip, ftrace_addr);
if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) == 0)
goto update;
/* Check both ftrace_addr and ftrace_old_addr */
- ftrace_addr = get_ftrace_old_addr(rec);
+ ftrace_addr = ftrace_get_addr_curr(rec);
nop = ftrace_call_replace(ip, ftrace_addr);
if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0)
@@ -428,7 +417,7 @@ static int remove_breakpoint(struct dyn_ftrace *rec)
}
update:
- return probe_kernel_write((void *)ip, &nop[0], 1);
+ return ftrace_write(ip, nop, 1);
}
static int add_update_code(unsigned long ip, unsigned const char *new)
@@ -436,9 +425,7 @@ static int add_update_code(unsigned long ip, unsigned const char *new)
/* skip breakpoint */
ip++;
new++;
- if (ftrace_write(ip, new, MCOUNT_INSN_SIZE - 1))
- return -EPERM;
- return 0;
+ return ftrace_write(ip, new, MCOUNT_INSN_SIZE - 1);
}
static int add_update_call(struct dyn_ftrace *rec, unsigned long addr)
@@ -466,13 +453,12 @@ static int add_update(struct dyn_ftrace *rec, int enable)
ret = ftrace_test_record(rec, enable);
- ftrace_addr = get_ftrace_addr(rec);
+ ftrace_addr = ftrace_get_addr_new(rec);
switch (ret) {
case FTRACE_UPDATE_IGNORE:
return 0;
- case FTRACE_UPDATE_MODIFY_CALL_REGS:
case FTRACE_UPDATE_MODIFY_CALL:
case FTRACE_UPDATE_MAKE_CALL:
/* converting nop to call */
@@ -493,10 +479,7 @@ static int finish_update_call(struct dyn_ftrace *rec, unsigned long addr)
new = ftrace_call_replace(ip, addr);
- if (ftrace_write(ip, new, 1))
- return -EPERM;
-
- return 0;
+ return ftrace_write(ip, new, 1);
}
static int finish_update_nop(struct dyn_ftrace *rec)
@@ -506,9 +489,7 @@ static int finish_update_nop(struct dyn_ftrace *rec)
new = ftrace_nop_replace();
- if (ftrace_write(ip, new, 1))
- return -EPERM;
- return 0;
+ return ftrace_write(ip, new, 1);
}
static int finish_update(struct dyn_ftrace *rec, int enable)
@@ -518,13 +499,12 @@ static int finish_update(struct dyn_ftrace *rec, int enable)
ret = ftrace_update_record(rec, enable);
- ftrace_addr = get_ftrace_addr(rec);
+ ftrace_addr = ftrace_get_addr_new(rec);
switch (ret) {
case FTRACE_UPDATE_IGNORE:
return 0;
- case FTRACE_UPDATE_MODIFY_CALL_REGS:
case FTRACE_UPDATE_MODIFY_CALL:
case FTRACE_UPDATE_MAKE_CALL:
/* converting nop to call */
@@ -601,12 +581,18 @@ void ftrace_replace_code(int enable)
return;
remove_breakpoints:
+ pr_warn("Failed on %s (%d):\n", report, count);
ftrace_bug(ret, rec ? rec->ip : 0);
- printk(KERN_WARNING "Failed on %s (%d):\n", report, count);
for_ftrace_rec_iter(iter) {
rec = ftrace_rec_iter_record(iter);
- remove_breakpoint(rec);
+ /*
+ * Breakpoints are handled only when this function is in
+ * progress. The system could not work with them.
+ */
+ if (remove_breakpoint(rec))
+ BUG();
}
+ run_sync();
}
static int
@@ -628,16 +614,19 @@ ftrace_modify_code(unsigned long ip, unsigned const char *old_code,
run_sync();
ret = ftrace_write(ip, new_code, 1);
- if (ret) {
- ret = -EPERM;
- goto out;
- }
- run_sync();
+ /*
+ * The breakpoint is handled only when this function is in progress.
+ * The system could not work if we could not remove it.
+ */
+ BUG_ON(ret);
out:
+ run_sync();
return ret;
fail_update:
- probe_kernel_write((void *)ip, &old_code[0], 1);
+ /* Also here the system could not work with the breakpoint */
+ if (ftrace_write(ip, old_code, 1))
+ BUG();
goto out;
}
@@ -651,11 +640,8 @@ void arch_ftrace_update_code(int command)
atomic_dec(&modifying_ftrace_code);
}
-int __init ftrace_dyn_arch_init(void *data)
+int __init ftrace_dyn_arch_init(void)
{
- /* The return code is retured via data */
- *(unsigned long *)data = 0;
-
return 0;
}
#endif
@@ -665,45 +651,41 @@ int __init ftrace_dyn_arch_init(void *data)
#ifdef CONFIG_DYNAMIC_FTRACE
extern void ftrace_graph_call(void);
-static int ftrace_mod_jmp(unsigned long ip,
- int old_offset, int new_offset)
+static unsigned char *ftrace_jmp_replace(unsigned long ip, unsigned long addr)
{
- unsigned char code[MCOUNT_INSN_SIZE];
+ static union ftrace_code_union calc;
- if (probe_kernel_read(code, (void *)ip, MCOUNT_INSN_SIZE))
- return -EFAULT;
+ /* Jmp not a call (ignore the .e8) */
+ calc.e8 = 0xe9;
+ calc.offset = ftrace_calc_offset(ip + MCOUNT_INSN_SIZE, addr);
- if (code[0] != 0xe9 || old_offset != *(int *)(&code[1]))
- return -EINVAL;
+ /*
+ * ftrace external locks synchronize the access to the static variable.
+ */
+ return calc.code;
+}
- *(int *)(&code[1]) = new_offset;
+static int ftrace_mod_jmp(unsigned long ip, void *func)
+{
+ unsigned char *new;
- if (do_ftrace_mod_code(ip, &code))
- return -EPERM;
+ new = ftrace_jmp_replace(ip, (unsigned long)func);
- return 0;
+ return update_ftrace_func(ip, new);
}
int ftrace_enable_ftrace_graph_caller(void)
{
unsigned long ip = (unsigned long)(&ftrace_graph_call);
- int old_offset, new_offset;
- old_offset = (unsigned long)(&ftrace_stub) - (ip + MCOUNT_INSN_SIZE);
- new_offset = (unsigned long)(&ftrace_graph_caller) - (ip + MCOUNT_INSN_SIZE);
-
- return ftrace_mod_jmp(ip, old_offset, new_offset);
+ return ftrace_mod_jmp(ip, &ftrace_graph_caller);
}
int ftrace_disable_ftrace_graph_caller(void)
{
unsigned long ip = (unsigned long)(&ftrace_graph_call);
- int old_offset, new_offset;
-
- old_offset = (unsigned long)(&ftrace_graph_caller) - (ip + MCOUNT_INSN_SIZE);
- new_offset = (unsigned long)(&ftrace_stub) - (ip + MCOUNT_INSN_SIZE);
- return ftrace_mod_jmp(ip, old_offset, new_offset);
+ return ftrace_mod_jmp(ip, &ftrace_stub);
}
#endif /* !CONFIG_DYNAMIC_FTRACE */
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 06f87bece92..d6c1b983699 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -29,14 +29,14 @@ static void __init i386_default_early_setup(void)
reserve_ebda_region();
}
-asmlinkage void __init i386_start_kernel(void)
+asmlinkage __visible void __init i386_start_kernel(void)
{
sanitize_boot_params(&boot_params);
/* Call the subarch specific early setup function */
switch (boot_params.hdr.hardware_subarch) {
- case X86_SUBARCH_MRST:
- x86_mrst_early_setup();
+ case X86_SUBARCH_INTEL_MID:
+ x86_intel_mid_early_setup();
break;
case X86_SUBARCH_CE4100:
x86_ce4100_early_setup();
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 1be8e43b669..eda1a865641 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -137,7 +137,7 @@ static void __init copy_bootdata(char *real_mode_data)
}
}
-asmlinkage void __init x86_64_start_kernel(char * real_mode_data)
+asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
{
int i;
@@ -162,7 +162,7 @@ asmlinkage void __init x86_64_start_kernel(char * real_mode_data)
clear_bss();
for (i = 0; i < NUM_EXCEPTION_VECTORS; i++)
- set_intr_gate(i, &early_idt_handlers[i]);
+ set_intr_gate(i, early_idt_handlers[i]);
load_idt((const struct desc_ptr *)&idt_descr);
copy_bootdata(__va(real_mode_data));
@@ -172,7 +172,7 @@ asmlinkage void __init x86_64_start_kernel(char * real_mode_data)
*/
load_ucode_bsp();
- if (console_loglevel == 10)
+ if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG)
early_printk("Kernel alive\n");
clear_page(init_level4_pgt);
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 81ba27679f1..f36bd42d6f0 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -544,6 +544,10 @@ ENDPROC(early_idt_handlers)
/* This is global to keep gas from relaxing the jumps */
ENTRY(early_idt_handler)
cld
+
+ cmpl $2,(%esp) # X86_TRAP_NMI
+ je is_nmi # Ignore NMI
+
cmpl $2,%ss:early_recursion_flag
je hlt_loop
incl %ss:early_recursion_flag
@@ -594,8 +598,9 @@ ex_entry:
pop %edx
pop %ecx
pop %eax
- addl $8,%esp /* drop vector number and error code */
decl %ss:early_recursion_flag
+is_nmi:
+ addl $8,%esp /* drop vector number and error code */
iret
ENDPROC(early_idt_handler)
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index e1aabdb314c..a468c0a65c4 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -343,6 +343,9 @@ early_idt_handlers:
ENTRY(early_idt_handler)
cld
+ cmpl $2,(%rsp) # X86_TRAP_NMI
+ je is_nmi # Ignore NMI
+
cmpl $2,early_recursion_flag(%rip)
jz 1f
incl early_recursion_flag(%rip)
@@ -405,8 +408,9 @@ ENTRY(early_idt_handler)
popq %rdx
popq %rcx
popq %rax
- addq $16,%rsp # drop vector number and error code
decl early_recursion_flag(%rip)
+is_nmi:
+ addq $16,%rsp # drop vector number and error code
INTERRUPT_RETURN
ENDPROC(early_idt_handler)
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index da85a8e830a..319bcb9372f 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -74,9 +74,6 @@ static inline void hpet_writel(unsigned int d, unsigned int a)
static inline void hpet_set_mapping(void)
{
hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE);
-#ifdef CONFIG_X86_64
- __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VVAR_NOCACHE);
-#endif
}
static inline void hpet_clear_mapping(void)
@@ -88,7 +85,7 @@ static inline void hpet_clear_mapping(void)
/*
* HPET command line enable / disable
*/
-static int boot_hpet_disable;
+int boot_hpet_disable;
int hpet_force_user;
static int hpet_verbose;
@@ -479,7 +476,7 @@ static int hpet_msi_next_event(unsigned long delta,
static int hpet_setup_msi_irq(unsigned int irq)
{
if (x86_msi.setup_hpet_msi(irq, hpet_blockid)) {
- destroy_irq(irq);
+ irq_free_hwirq(irq);
return -EINVAL;
}
return 0;
@@ -487,9 +484,8 @@ static int hpet_setup_msi_irq(unsigned int irq)
static int hpet_assign_irq(struct hpet_dev *dev)
{
- unsigned int irq;
+ unsigned int irq = irq_alloc_hwirq(-1);
- irq = create_irq_nr(0, -1);
if (!irq)
return -EINVAL;
@@ -521,7 +517,7 @@ static int hpet_setup_irq(struct hpet_dev *dev)
{
if (request_irq(dev->irq, hpet_interrupt_handler,
- IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING,
+ IRQF_TIMER | IRQF_NOBALANCING,
dev->name, dev))
return -1;
@@ -699,7 +695,7 @@ static int hpet_cpuhp_notify(struct notifier_block *n,
/* FIXME: add schedule_work_on() */
schedule_delayed_work_on(cpu, &work.work, 0);
wait_for_completion(&work.complete);
- destroy_timer_on_stack(&work.work.timer);
+ destroy_delayed_work_on_stack(&work.work);
break;
case CPU_DEAD:
if (hdev) {
@@ -752,9 +748,7 @@ static struct clocksource clocksource_hpet = {
.mask = HPET_MASK,
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
.resume = hpet_resume_counter,
-#ifdef CONFIG_X86_64
.archdata = { .vclock_mode = VCLOCK_HPET },
-#endif
};
static int hpet_clocksource_register(void)
@@ -943,12 +937,14 @@ static __init int hpet_late_init(void)
if (boot_cpu_has(X86_FEATURE_ARAT))
return 0;
+ cpu_notifier_register_begin();
for_each_online_cpu(cpu) {
hpet_cpuhp_notify(NULL, CPU_ONLINE, (void *)(long)cpu);
}
/* This notifier should be called after workqueue is ready */
- hotcpu_notifier(hpet_cpuhp_notify, -20);
+ __hotcpu_notifier(hpet_cpuhp_notify, -20);
+ cpu_notifier_register_done();
return 0;
}
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index f66ff162dce..5f9cf20cdb6 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -32,13 +32,11 @@
#include <linux/irqflags.h>
#include <linux/notifier.h>
#include <linux/kallsyms.h>
-#include <linux/kprobes.h>
#include <linux/percpu.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/sched.h>
-#include <linux/init.h>
#include <linux/smp.h>
#include <asm/hw_breakpoint.h>
@@ -425,7 +423,7 @@ EXPORT_SYMBOL_GPL(hw_breakpoint_restore);
* NOTIFY_STOP returned for all other cases
*
*/
-static int __kprobes hw_breakpoint_handler(struct die_args *args)
+static int hw_breakpoint_handler(struct die_args *args)
{
int i, cpu, rc = NOTIFY_STOP;
struct perf_event *bp;
@@ -512,7 +510,7 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args)
/*
* Handle debug exception notifications.
*/
-int __kprobes hw_breakpoint_exceptions_notify(
+int hw_breakpoint_exceptions_notify(
struct notifier_block *unused, unsigned long val, void *data)
{
if (val != DIE_DEBUG)
diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c
index 0fa69127209..05fd74f537d 100644
--- a/arch/x86/kernel/i386_ksyms_32.c
+++ b/arch/x86/kernel/i386_ksyms_32.c
@@ -37,3 +37,10 @@ EXPORT_SYMBOL(strstr);
EXPORT_SYMBOL(csum_partial);
EXPORT_SYMBOL(empty_zero_page);
+
+#ifdef CONFIG_PREEMPT
+EXPORT_SYMBOL(___preempt_schedule);
+#ifdef CONFIG_CONTEXT_TRACKING
+EXPORT_SYMBOL(___preempt_schedule_context);
+#endif
+#endif
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 5d576ab3440..d5dd8081441 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -86,10 +86,19 @@ EXPORT_SYMBOL(__kernel_fpu_begin);
void __kernel_fpu_end(void)
{
- if (use_eager_fpu())
- math_state_restore();
- else
+ if (use_eager_fpu()) {
+ /*
+ * For eager fpu, most the time, tsk_used_math() is true.
+ * Restore the user math as we are done with the kernel usage.
+ * At few instances during thread exit, signal handling etc,
+ * tsk_used_math() is false. Those few places will take proper
+ * actions, so we don't need to restore the math here.
+ */
+ if (likely(tsk_used_math(current)))
+ math_state_restore();
+ } else {
stts();
+ }
}
EXPORT_SYMBOL(__kernel_fpu_end);
@@ -100,7 +109,7 @@ void unlazy_fpu(struct task_struct *tsk)
__save_init_fpu(tsk);
__thread_fpu_end(tsk);
} else
- tsk->fpu_counter = 0;
+ tsk->thread.fpu_counter = 0;
preempt_enable();
}
EXPORT_SYMBOL(unlazy_fpu);
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index 9a5c460404d..8af817105e2 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -299,21 +299,38 @@ static void unmask_8259A(void)
static void init_8259A(int auto_eoi)
{
unsigned long flags;
+ unsigned char probe_val = ~(1 << PIC_CASCADE_IR);
+ unsigned char new_val;
i8259A_auto_eoi = auto_eoi;
raw_spin_lock_irqsave(&i8259A_lock, flags);
- outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
+ /*
+ * Check to see if we have a PIC.
+ * Mask all except the cascade and read
+ * back the value we just wrote. If we don't
+ * have a PIC, we will read 0xff as opposed to the
+ * value we wrote.
+ */
outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
+ outb(probe_val, PIC_MASTER_IMR);
+ new_val = inb(PIC_MASTER_IMR);
+ if (new_val != probe_val) {
+ printk(KERN_INFO "Using NULL legacy PIC\n");
+ legacy_pic = &null_legacy_pic;
+ raw_spin_unlock_irqrestore(&i8259A_lock, flags);
+ return;
+ }
+
+ outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
/*
* outb_pic - this has to work on a wide range of PC hardware.
*/
outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */
- /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 on x86-64,
- to 0x20-0x27 on i386 */
+ /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */
outb_pic(IRQ0_VECTOR, PIC_MASTER_IMR);
/* 8259A-1 (the master) has a slave on IR2 */
diff --git a/arch/x86/kernel/iosf_mbi.c b/arch/x86/kernel/iosf_mbi.c
new file mode 100644
index 00000000000..d30acdc1229
--- /dev/null
+++ b/arch/x86/kernel/iosf_mbi.c
@@ -0,0 +1,237 @@
+/*
+ * IOSF-SB MailBox Interface Driver
+ * Copyright (c) 2013, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ *
+ * The IOSF-SB is a fabric bus available on Atom based SOC's that uses a
+ * mailbox interface (MBI) to communicate with mutiple devices. This
+ * driver implements access to this interface for those platforms that can
+ * enumerate the device using PCI.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/pci.h>
+
+#include <asm/iosf_mbi.h>
+
+#define PCI_DEVICE_ID_BAYTRAIL 0x0F00
+#define PCI_DEVICE_ID_QUARK_X1000 0x0958
+
+static DEFINE_SPINLOCK(iosf_mbi_lock);
+
+static inline u32 iosf_mbi_form_mcr(u8 op, u8 port, u8 offset)
+{
+ return (op << 24) | (port << 16) | (offset << 8) | MBI_ENABLE;
+}
+
+static struct pci_dev *mbi_pdev; /* one mbi device */
+
+static int iosf_mbi_pci_read_mdr(u32 mcrx, u32 mcr, u32 *mdr)
+{
+ int result;
+
+ if (!mbi_pdev)
+ return -ENODEV;
+
+ if (mcrx) {
+ result = pci_write_config_dword(mbi_pdev, MBI_MCRX_OFFSET,
+ mcrx);
+ if (result < 0)
+ goto fail_read;
+ }
+
+ result = pci_write_config_dword(mbi_pdev, MBI_MCR_OFFSET, mcr);
+ if (result < 0)
+ goto fail_read;
+
+ result = pci_read_config_dword(mbi_pdev, MBI_MDR_OFFSET, mdr);
+ if (result < 0)
+ goto fail_read;
+
+ return 0;
+
+fail_read:
+ dev_err(&mbi_pdev->dev, "PCI config access failed with %d\n", result);
+ return result;
+}
+
+static int iosf_mbi_pci_write_mdr(u32 mcrx, u32 mcr, u32 mdr)
+{
+ int result;
+
+ if (!mbi_pdev)
+ return -ENODEV;
+
+ result = pci_write_config_dword(mbi_pdev, MBI_MDR_OFFSET, mdr);
+ if (result < 0)
+ goto fail_write;
+
+ if (mcrx) {
+ result = pci_write_config_dword(mbi_pdev, MBI_MCRX_OFFSET,
+ mcrx);
+ if (result < 0)
+ goto fail_write;
+ }
+
+ result = pci_write_config_dword(mbi_pdev, MBI_MCR_OFFSET, mcr);
+ if (result < 0)
+ goto fail_write;
+
+ return 0;
+
+fail_write:
+ dev_err(&mbi_pdev->dev, "PCI config access failed with %d\n", result);
+ return result;
+}
+
+int iosf_mbi_read(u8 port, u8 opcode, u32 offset, u32 *mdr)
+{
+ u32 mcr, mcrx;
+ unsigned long flags;
+ int ret;
+
+ /*Access to the GFX unit is handled by GPU code */
+ if (port == BT_MBI_UNIT_GFX) {
+ WARN_ON(1);
+ return -EPERM;
+ }
+
+ mcr = iosf_mbi_form_mcr(opcode, port, offset & MBI_MASK_LO);
+ mcrx = offset & MBI_MASK_HI;
+
+ spin_lock_irqsave(&iosf_mbi_lock, flags);
+ ret = iosf_mbi_pci_read_mdr(mcrx, mcr, mdr);
+ spin_unlock_irqrestore(&iosf_mbi_lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(iosf_mbi_read);
+
+int iosf_mbi_write(u8 port, u8 opcode, u32 offset, u32 mdr)
+{
+ u32 mcr, mcrx;
+ unsigned long flags;
+ int ret;
+
+ /*Access to the GFX unit is handled by GPU code */
+ if (port == BT_MBI_UNIT_GFX) {
+ WARN_ON(1);
+ return -EPERM;
+ }
+
+ mcr = iosf_mbi_form_mcr(opcode, port, offset & MBI_MASK_LO);
+ mcrx = offset & MBI_MASK_HI;
+
+ spin_lock_irqsave(&iosf_mbi_lock, flags);
+ ret = iosf_mbi_pci_write_mdr(mcrx, mcr, mdr);
+ spin_unlock_irqrestore(&iosf_mbi_lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(iosf_mbi_write);
+
+int iosf_mbi_modify(u8 port, u8 opcode, u32 offset, u32 mdr, u32 mask)
+{
+ u32 mcr, mcrx;
+ u32 value;
+ unsigned long flags;
+ int ret;
+
+ /*Access to the GFX unit is handled by GPU code */
+ if (port == BT_MBI_UNIT_GFX) {
+ WARN_ON(1);
+ return -EPERM;
+ }
+
+ mcr = iosf_mbi_form_mcr(opcode, port, offset & MBI_MASK_LO);
+ mcrx = offset & MBI_MASK_HI;
+
+ spin_lock_irqsave(&iosf_mbi_lock, flags);
+
+ /* Read current mdr value */
+ ret = iosf_mbi_pci_read_mdr(mcrx, mcr & MBI_RD_MASK, &value);
+ if (ret < 0) {
+ spin_unlock_irqrestore(&iosf_mbi_lock, flags);
+ return ret;
+ }
+
+ /* Apply mask */
+ value &= ~mask;
+ mdr &= mask;
+ value |= mdr;
+
+ /* Write back */
+ ret = iosf_mbi_pci_write_mdr(mcrx, mcr | MBI_WR_MASK, value);
+
+ spin_unlock_irqrestore(&iosf_mbi_lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(iosf_mbi_modify);
+
+bool iosf_mbi_available(void)
+{
+ /* Mbi isn't hot-pluggable. No remove routine is provided */
+ return mbi_pdev;
+}
+EXPORT_SYMBOL(iosf_mbi_available);
+
+static int iosf_mbi_probe(struct pci_dev *pdev,
+ const struct pci_device_id *unused)
+{
+ int ret;
+
+ ret = pci_enable_device(pdev);
+ if (ret < 0) {
+ dev_err(&pdev->dev, "error: could not enable device\n");
+ return ret;
+ }
+
+ mbi_pdev = pci_dev_get(pdev);
+ return 0;
+}
+
+static DEFINE_PCI_DEVICE_TABLE(iosf_mbi_pci_ids) = {
+ { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_BAYTRAIL) },
+ { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_QUARK_X1000) },
+ { 0, },
+};
+MODULE_DEVICE_TABLE(pci, iosf_mbi_pci_ids);
+
+static struct pci_driver iosf_mbi_pci_driver = {
+ .name = "iosf_mbi_pci",
+ .probe = iosf_mbi_probe,
+ .id_table = iosf_mbi_pci_ids,
+};
+
+static int __init iosf_mbi_init(void)
+{
+ return pci_register_driver(&iosf_mbi_pci_driver);
+}
+
+static void __exit iosf_mbi_exit(void)
+{
+ pci_unregister_driver(&iosf_mbi_pci_driver);
+ if (mbi_pdev) {
+ pci_dev_put(mbi_pdev);
+ mbi_pdev = NULL;
+ }
+}
+
+module_init(iosf_mbi_init);
+module_exit(iosf_mbi_exit);
+
+MODULE_AUTHOR("David E. Box <david.e.box@linux.intel.com>");
+MODULE_DESCRIPTION("IOSF Mailbox Interface accessor");
+MODULE_LICENSE("GPL v2");
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 22d0687e7fd..922d2858102 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -17,6 +17,7 @@
#include <asm/idle.h>
#include <asm/mce.h>
#include <asm/hw_irq.h>
+#include <asm/desc.h>
#define CREATE_TRACE_POINTS
#include <asm/trace/irq_vectors.h>
@@ -125,6 +126,12 @@ int arch_show_interrupts(struct seq_file *p, int prec)
seq_printf(p, "%10u ", per_cpu(mce_poll_count, j));
seq_printf(p, " Machine check polls\n");
#endif
+#if IS_ENABLED(CONFIG_HYPERV) || defined(CONFIG_XEN)
+ seq_printf(p, "%*s: ", prec, "THR");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->irq_hv_callback_count);
+ seq_printf(p, " Hypervisor callback interrupts\n");
+#endif
seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count));
#if defined(CONFIG_X86_IO_APIC)
seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count));
@@ -193,9 +200,13 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
if (!handle_irq(irq, regs)) {
ack_APIC_irq();
- if (printk_ratelimit())
- pr_emerg("%s: %d.%d No irq handler for vector (irq %d)\n",
- __func__, smp_processor_id(), vector, irq);
+ if (irq != VECTOR_RETRIGGERED) {
+ pr_emerg_ratelimited("%s: %d.%d No irq handler for vector (irq %d)\n",
+ __func__, smp_processor_id(),
+ vector, irq);
+ } else {
+ __this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED);
+ }
}
irq_exit();
@@ -262,6 +273,90 @@ __visible void smp_trace_x86_platform_ipi(struct pt_regs *regs)
EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
#ifdef CONFIG_HOTPLUG_CPU
+
+/* These two declarations are only used in check_irq_vectors_for_cpu_disable()
+ * below, which is protected by stop_machine(). Putting them on the stack
+ * results in a stack frame overflow. Dynamically allocating could result in a
+ * failure so declare these two cpumasks as global.
+ */
+static struct cpumask affinity_new, online_new;
+
+/*
+ * This cpu is going to be removed and its vectors migrated to the remaining
+ * online cpus. Check to see if there are enough vectors in the remaining cpus.
+ * This function is protected by stop_machine().
+ */
+int check_irq_vectors_for_cpu_disable(void)
+{
+ int irq, cpu;
+ unsigned int this_cpu, vector, this_count, count;
+ struct irq_desc *desc;
+ struct irq_data *data;
+
+ this_cpu = smp_processor_id();
+ cpumask_copy(&online_new, cpu_online_mask);
+ cpu_clear(this_cpu, online_new);
+
+ this_count = 0;
+ for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
+ irq = __this_cpu_read(vector_irq[vector]);
+ if (irq >= 0) {
+ desc = irq_to_desc(irq);
+ data = irq_desc_get_irq_data(desc);
+ cpumask_copy(&affinity_new, data->affinity);
+ cpu_clear(this_cpu, affinity_new);
+
+ /* Do not count inactive or per-cpu irqs. */
+ if (!irq_has_action(irq) || irqd_is_per_cpu(data))
+ continue;
+
+ /*
+ * A single irq may be mapped to multiple
+ * cpu's vector_irq[] (for example IOAPIC cluster
+ * mode). In this case we have two
+ * possibilities:
+ *
+ * 1) the resulting affinity mask is empty; that is
+ * this the down'd cpu is the last cpu in the irq's
+ * affinity mask, or
+ *
+ * 2) the resulting affinity mask is no longer
+ * a subset of the online cpus but the affinity
+ * mask is not zero; that is the down'd cpu is the
+ * last online cpu in a user set affinity mask.
+ */
+ if (cpumask_empty(&affinity_new) ||
+ !cpumask_subset(&affinity_new, &online_new))
+ this_count++;
+ }
+ }
+
+ count = 0;
+ for_each_online_cpu(cpu) {
+ if (cpu == this_cpu)
+ continue;
+ /*
+ * We scan from FIRST_EXTERNAL_VECTOR to first system
+ * vector. If the vector is marked in the used vectors
+ * bitmap or an irq is assigned to it, we don't count
+ * it as available.
+ */
+ for (vector = FIRST_EXTERNAL_VECTOR;
+ vector < first_system_vector; vector++) {
+ if (!test_bit(vector, used_vectors) &&
+ per_cpu(vector_irq, cpu)[vector] < 0)
+ count++;
+ }
+ }
+
+ if (count < this_count) {
+ pr_warn("CPU %d disable failed: CPU has %u vectors assigned and there are only %u available.\n",
+ this_cpu, this_count, count);
+ return -ERANGE;
+ }
+ return 0;
+}
+
/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
void fixup_irqs(void)
{
@@ -270,6 +365,7 @@ void fixup_irqs(void)
struct irq_desc *desc;
struct irq_data *data;
struct irq_chip *chip;
+ int ret;
for_each_irq_desc(irq, desc) {
int break_affinity = 0;
@@ -308,10 +404,14 @@ void fixup_irqs(void)
if (!irqd_can_move_in_process_context(data) && chip->irq_mask)
chip->irq_mask(data);
- if (chip->irq_set_affinity)
- chip->irq_set_affinity(data, affinity, true);
- else if (!(warned++))
- set_affinity = 0;
+ if (chip->irq_set_affinity) {
+ ret = chip->irq_set_affinity(data, affinity, true);
+ if (ret == -ENOSPC)
+ pr_crit("IRQ %d set affinity failed because there are no available vectors. The device assigned to this IRQ is unstable.\n", irq);
+ } else {
+ if (!(warned++))
+ set_affinity = 0;
+ }
/*
* We unmask if the irq was not marked masked by the
@@ -344,7 +444,7 @@ void fixup_irqs(void)
for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
unsigned int irr;
- if (__this_cpu_read(vector_irq[vector]) < 0)
+ if (__this_cpu_read(vector_irq[vector]) <= VECTOR_UNDEFINED)
continue;
irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
@@ -355,11 +455,14 @@ void fixup_irqs(void)
data = irq_desc_get_irq_data(desc);
chip = irq_data_get_irq_chip(data);
raw_spin_lock(&desc->lock);
- if (chip->irq_retrigger)
+ if (chip->irq_retrigger) {
chip->irq_retrigger(data);
+ __this_cpu_write(vector_irq[vector], VECTOR_RETRIGGERED);
+ }
raw_spin_unlock(&desc->lock);
}
- __this_cpu_write(vector_irq[vector], -1);
+ if (__this_cpu_read(vector_irq[vector]) != VECTOR_RETRIGGERED)
+ __this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED);
}
}
#endif
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 4186755f1d7..63ce838e5a5 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -55,16 +55,8 @@ static inline int check_stack_overflow(void) { return 0; }
static inline void print_stack_overflow(void) { }
#endif
-/*
- * per-CPU IRQ handling contexts (thread information and stack)
- */
-union irq_ctx {
- struct thread_info tinfo;
- u32 stack[THREAD_SIZE/sizeof(u32)];
-} __attribute__((aligned(THREAD_SIZE)));
-
-static DEFINE_PER_CPU(union irq_ctx *, hardirq_ctx);
-static DEFINE_PER_CPU(union irq_ctx *, softirq_ctx);
+DEFINE_PER_CPU(struct irq_stack *, hardirq_stack);
+DEFINE_PER_CPU(struct irq_stack *, softirq_stack);
static void call_on_stack(void *func, void *stack)
{
@@ -77,14 +69,26 @@ static void call_on_stack(void *func, void *stack)
: "memory", "cc", "edx", "ecx", "eax");
}
+/* how to get the current stack pointer from C */
+#define current_stack_pointer ({ \
+ unsigned long sp; \
+ asm("mov %%esp,%0" : "=g" (sp)); \
+ sp; \
+})
+
+static inline void *current_stack(void)
+{
+ return (void *)(current_stack_pointer & ~(THREAD_SIZE - 1));
+}
+
static inline int
execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
{
- union irq_ctx *curctx, *irqctx;
- u32 *isp, arg1, arg2;
+ struct irq_stack *curstk, *irqstk;
+ u32 *isp, *prev_esp, arg1, arg2;
- curctx = (union irq_ctx *) current_thread_info();
- irqctx = __this_cpu_read(hardirq_ctx);
+ curstk = (struct irq_stack *) current_stack();
+ irqstk = __this_cpu_read(hardirq_stack);
/*
* this is where we switch to the IRQ stack. However, if we are
@@ -92,16 +96,14 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
* handler) we can't do that and just have to keep using the
* current stack (which is the irq stack already after all)
*/
- if (unlikely(curctx == irqctx))
+ if (unlikely(curstk == irqstk))
return 0;
- /* build the stack frame on the IRQ stack */
- isp = (u32 *) ((char *)irqctx + sizeof(*irqctx));
- irqctx->tinfo.task = curctx->tinfo.task;
- irqctx->tinfo.previous_esp = current_stack_pointer;
+ isp = (u32 *) ((char *)irqstk + sizeof(*irqstk));
- /* Copy the preempt_count so that the [soft]irq checks work. */
- irqctx->tinfo.preempt_count = curctx->tinfo.preempt_count;
+ /* Save the next esp at the bottom of the stack */
+ prev_esp = (u32 *)irqstk;
+ *prev_esp = current_stack_pointer;
if (unlikely(overflow))
call_on_stack(print_stack_overflow, isp);
@@ -121,63 +123,42 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
*/
void irq_ctx_init(int cpu)
{
- union irq_ctx *irqctx;
+ struct irq_stack *irqstk;
- if (per_cpu(hardirq_ctx, cpu))
+ if (per_cpu(hardirq_stack, cpu))
return;
- irqctx = page_address(alloc_pages_node(cpu_to_node(cpu),
+ irqstk = page_address(alloc_pages_node(cpu_to_node(cpu),
THREADINFO_GFP,
THREAD_SIZE_ORDER));
- memset(&irqctx->tinfo, 0, sizeof(struct thread_info));
- irqctx->tinfo.cpu = cpu;
- irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
- irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
+ per_cpu(hardirq_stack, cpu) = irqstk;
- per_cpu(hardirq_ctx, cpu) = irqctx;
-
- irqctx = page_address(alloc_pages_node(cpu_to_node(cpu),
+ irqstk = page_address(alloc_pages_node(cpu_to_node(cpu),
THREADINFO_GFP,
THREAD_SIZE_ORDER));
- memset(&irqctx->tinfo, 0, sizeof(struct thread_info));
- irqctx->tinfo.cpu = cpu;
- irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
-
- per_cpu(softirq_ctx, cpu) = irqctx;
+ per_cpu(softirq_stack, cpu) = irqstk;
printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n",
- cpu, per_cpu(hardirq_ctx, cpu), per_cpu(softirq_ctx, cpu));
+ cpu, per_cpu(hardirq_stack, cpu), per_cpu(softirq_stack, cpu));
}
-asmlinkage void do_softirq(void)
+void do_softirq_own_stack(void)
{
- unsigned long flags;
- struct thread_info *curctx;
- union irq_ctx *irqctx;
- u32 *isp;
-
- if (in_interrupt())
- return;
-
- local_irq_save(flags);
+ struct thread_info *curstk;
+ struct irq_stack *irqstk;
+ u32 *isp, *prev_esp;
- if (local_softirq_pending()) {
- curctx = current_thread_info();
- irqctx = __this_cpu_read(softirq_ctx);
- irqctx->tinfo.task = curctx->task;
- irqctx->tinfo.previous_esp = current_stack_pointer;
+ curstk = current_stack();
+ irqstk = __this_cpu_read(softirq_stack);
- /* build the stack frame on the softirq stack */
- isp = (u32 *) ((char *)irqctx + sizeof(*irqctx));
+ /* build the stack frame on the softirq stack */
+ isp = (u32 *) ((char *)irqstk + sizeof(*irqstk));
- call_on_stack(__do_softirq, isp);
- /*
- * Shouldn't happen, we returned above if in_interrupt():
- */
- WARN_ON_ONCE(softirq_count());
- }
+ /* Push the previous esp onto the stack */
+ prev_esp = (u32 *)irqstk;
+ *prev_esp = current_stack_pointer;
- local_irq_restore(flags);
+ call_on_stack(__do_softirq, isp);
}
bool handle_irq(unsigned irq, struct pt_regs *regs)
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index d04d3ecded6..4d1c746892e 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -87,24 +87,3 @@ bool handle_irq(unsigned irq, struct pt_regs *regs)
generic_handle_irq_desc(irq, desc);
return true;
}
-
-
-extern void call_softirq(void);
-
-asmlinkage void do_softirq(void)
-{
- __u32 pending;
- unsigned long flags;
-
- if (in_interrupt())
- return;
-
- local_irq_save(flags);
- pending = local_softirq_pending();
- /* Switch to interrupt stack */
- if (pending) {
- call_softirq();
- WARN_ON_ONCE(softirq_count());
- }
- local_irq_restore(flags);
-}
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index a2a1fbc594f..7f50156542f 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -52,7 +52,7 @@ static struct irqaction irq2 = {
};
DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
- [0 ... NR_VECTORS - 1] = -1,
+ [0 ... NR_VECTORS - 1] = VECTOR_UNDEFINED,
};
int vector_used_by_percpu_irq(unsigned int vector)
@@ -60,7 +60,7 @@ int vector_used_by_percpu_irq(unsigned int vector)
int cpu;
for_each_online_cpu(cpu) {
- if (per_cpu(vector_irq, cpu)[vector] != -1)
+ if (per_cpu(vector_irq, cpu)[vector] > VECTOR_UNDEFINED)
return 1;
}
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
index ee11b7dfbfb..26d5a55a273 100644
--- a/arch/x86/kernel/jump_label.c
+++ b/arch/x86/kernel/jump_label.c
@@ -42,15 +42,27 @@ static void __jump_label_transform(struct jump_entry *entry,
int init)
{
union jump_code_union code;
+ const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP };
const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5];
if (type == JUMP_LABEL_ENABLE) {
- /*
- * We are enabling this jump label. If it is not a nop
- * then something must have gone wrong.
- */
- if (unlikely(memcmp((void *)entry->code, ideal_nop, 5) != 0))
- bug_at((void *)entry->code, __LINE__);
+ if (init) {
+ /*
+ * Jump label is enabled for the first time.
+ * So we expect a default_nop...
+ */
+ if (unlikely(memcmp((void *)entry->code, default_nop, 5)
+ != 0))
+ bug_at((void *)entry->code, __LINE__);
+ } else {
+ /*
+ * ...otherwise expect an ideal_nop. Otherwise
+ * something went horribly wrong.
+ */
+ if (unlikely(memcmp((void *)entry->code, ideal_nop, 5)
+ != 0))
+ bug_at((void *)entry->code, __LINE__);
+ }
code.jump = 0xe9;
code.offset = entry->target -
@@ -63,7 +75,6 @@ static void __jump_label_transform(struct jump_entry *entry,
* are converting the default nop to the ideal nop.
*/
if (init) {
- const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP };
if (unlikely(memcmp((void *)entry->code, default_nop, 5) != 0))
bug_at((void *)entry->code, __LINE__);
} else {
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 836f8322960..7ec1d5f8d28 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -39,7 +39,6 @@
#include <linux/sched.h>
#include <linux/delay.h>
#include <linux/kgdb.h>
-#include <linux/init.h>
#include <linux/smp.h>
#include <linux/nmi.h>
#include <linux/hw_breakpoint.h>
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 79a3f968287..67e6d19ef1b 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -112,7 +112,8 @@ struct kretprobe_blackpoint kretprobe_blacklist[] = {
const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist);
-static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op)
+static nokprobe_inline void
+__synthesize_relative_insn(void *from, void *to, u8 op)
{
struct __arch_relative_insn {
u8 op;
@@ -125,21 +126,23 @@ static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op)
}
/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/
-void __kprobes synthesize_reljump(void *from, void *to)
+void synthesize_reljump(void *from, void *to)
{
__synthesize_relative_insn(from, to, RELATIVEJUMP_OPCODE);
}
+NOKPROBE_SYMBOL(synthesize_reljump);
/* Insert a call instruction at address 'from', which calls address 'to'.*/
-void __kprobes synthesize_relcall(void *from, void *to)
+void synthesize_relcall(void *from, void *to)
{
__synthesize_relative_insn(from, to, RELATIVECALL_OPCODE);
}
+NOKPROBE_SYMBOL(synthesize_relcall);
/*
* Skip the prefixes of the instruction.
*/
-static kprobe_opcode_t *__kprobes skip_prefixes(kprobe_opcode_t *insn)
+static kprobe_opcode_t *skip_prefixes(kprobe_opcode_t *insn)
{
insn_attr_t attr;
@@ -154,12 +157,13 @@ static kprobe_opcode_t *__kprobes skip_prefixes(kprobe_opcode_t *insn)
#endif
return insn;
}
+NOKPROBE_SYMBOL(skip_prefixes);
/*
* Returns non-zero if opcode is boostable.
* RIP relative instructions are adjusted at copying time in 64 bits mode
*/
-int __kprobes can_boost(kprobe_opcode_t *opcodes)
+int can_boost(kprobe_opcode_t *opcodes)
{
kprobe_opcode_t opcode;
kprobe_opcode_t *orig_opcodes = opcodes;
@@ -260,7 +264,7 @@ unsigned long recover_probed_instruction(kprobe_opcode_t *buf, unsigned long add
}
/* Check if paddr is at an instruction boundary */
-static int __kprobes can_probe(unsigned long paddr)
+static int can_probe(unsigned long paddr)
{
unsigned long addr, __addr, offset = 0;
struct insn insn;
@@ -299,7 +303,7 @@ static int __kprobes can_probe(unsigned long paddr)
/*
* Returns non-zero if opcode modifies the interrupt flag.
*/
-static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
+static int is_IF_modifier(kprobe_opcode_t *insn)
{
/* Skip prefixes */
insn = skip_prefixes(insn);
@@ -322,7 +326,7 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
* If not, return null.
* Only applicable to 64-bit x86.
*/
-int __kprobes __copy_instruction(u8 *dest, u8 *src)
+int __copy_instruction(u8 *dest, u8 *src)
{
struct insn insn;
kprobe_opcode_t buf[MAX_INSN_SIZE];
@@ -365,7 +369,7 @@ int __kprobes __copy_instruction(u8 *dest, u8 *src)
return insn.length;
}
-static int __kprobes arch_copy_kprobe(struct kprobe *p)
+static int arch_copy_kprobe(struct kprobe *p)
{
int ret;
@@ -392,7 +396,7 @@ static int __kprobes arch_copy_kprobe(struct kprobe *p)
return 0;
}
-int __kprobes arch_prepare_kprobe(struct kprobe *p)
+int arch_prepare_kprobe(struct kprobe *p)
{
if (alternatives_text_reserved(p->addr, p->addr))
return -EINVAL;
@@ -407,17 +411,17 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p)
return arch_copy_kprobe(p);
}
-void __kprobes arch_arm_kprobe(struct kprobe *p)
+void arch_arm_kprobe(struct kprobe *p)
{
text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1);
}
-void __kprobes arch_disarm_kprobe(struct kprobe *p)
+void arch_disarm_kprobe(struct kprobe *p)
{
text_poke(p->addr, &p->opcode, 1);
}
-void __kprobes arch_remove_kprobe(struct kprobe *p)
+void arch_remove_kprobe(struct kprobe *p)
{
if (p->ainsn.insn) {
free_insn_slot(p->ainsn.insn, (p->ainsn.boostable == 1));
@@ -425,7 +429,8 @@ void __kprobes arch_remove_kprobe(struct kprobe *p)
}
}
-static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
+static nokprobe_inline void
+save_previous_kprobe(struct kprobe_ctlblk *kcb)
{
kcb->prev_kprobe.kp = kprobe_running();
kcb->prev_kprobe.status = kcb->kprobe_status;
@@ -433,7 +438,8 @@ static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
kcb->prev_kprobe.saved_flags = kcb->kprobe_saved_flags;
}
-static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
+static nokprobe_inline void
+restore_previous_kprobe(struct kprobe_ctlblk *kcb)
{
__this_cpu_write(current_kprobe, kcb->prev_kprobe.kp);
kcb->kprobe_status = kcb->prev_kprobe.status;
@@ -441,8 +447,9 @@ static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
kcb->kprobe_saved_flags = kcb->prev_kprobe.saved_flags;
}
-static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
- struct kprobe_ctlblk *kcb)
+static nokprobe_inline void
+set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb)
{
__this_cpu_write(current_kprobe, p);
kcb->kprobe_saved_flags = kcb->kprobe_old_flags
@@ -451,7 +458,7 @@ static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
kcb->kprobe_saved_flags &= ~X86_EFLAGS_IF;
}
-static void __kprobes clear_btf(void)
+static nokprobe_inline void clear_btf(void)
{
if (test_thread_flag(TIF_BLOCKSTEP)) {
unsigned long debugctl = get_debugctlmsr();
@@ -461,7 +468,7 @@ static void __kprobes clear_btf(void)
}
}
-static void __kprobes restore_btf(void)
+static nokprobe_inline void restore_btf(void)
{
if (test_thread_flag(TIF_BLOCKSTEP)) {
unsigned long debugctl = get_debugctlmsr();
@@ -471,8 +478,7 @@ static void __kprobes restore_btf(void)
}
}
-void __kprobes
-arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs)
+void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs)
{
unsigned long *sara = stack_addr(regs);
@@ -481,9 +487,10 @@ arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs)
/* Replace the return addr with trampoline addr */
*sara = (unsigned long) &kretprobe_trampoline;
}
+NOKPROBE_SYMBOL(arch_prepare_kretprobe);
-static void __kprobes
-setup_singlestep(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb, int reenter)
+static void setup_singlestep(struct kprobe *p, struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb, int reenter)
{
if (setup_detour_execution(p, regs, reenter))
return;
@@ -519,22 +526,24 @@ setup_singlestep(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *k
else
regs->ip = (unsigned long)p->ainsn.insn;
}
+NOKPROBE_SYMBOL(setup_singlestep);
/*
* We have reentered the kprobe_handler(), since another probe was hit while
* within the handler. We save the original kprobes variables and just single
* step on the instruction of the new probe without calling any user handlers.
*/
-static int __kprobes
-reenter_kprobe(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb)
+static int reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb)
{
switch (kcb->kprobe_status) {
case KPROBE_HIT_SSDONE:
case KPROBE_HIT_ACTIVE:
+ case KPROBE_HIT_SS:
kprobes_inc_nmissed_count(p);
setup_singlestep(p, regs, kcb, 1);
break;
- case KPROBE_HIT_SS:
+ case KPROBE_REENTER:
/* A probe has been hit in the codepath leading up to, or just
* after, single-stepping of a probed instruction. This entire
* codepath should strictly reside in .kprobes.text section.
@@ -553,17 +562,21 @@ reenter_kprobe(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb
return 1;
}
+NOKPROBE_SYMBOL(reenter_kprobe);
/*
* Interrupts are disabled on entry as trap3 is an interrupt gate and they
* remain disabled throughout this function.
*/
-static int __kprobes kprobe_handler(struct pt_regs *regs)
+int kprobe_int3_handler(struct pt_regs *regs)
{
kprobe_opcode_t *addr;
struct kprobe *p;
struct kprobe_ctlblk *kcb;
+ if (user_mode_vm(regs))
+ return 0;
+
addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t));
/*
* We don't want to be preempted for the entire
@@ -621,12 +634,13 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
preempt_enable_no_resched();
return 0;
}
+NOKPROBE_SYMBOL(kprobe_int3_handler);
/*
* When a retprobed function returns, this code saves registers and
* calls trampoline_handler() runs, which calls the kretprobe's handler.
*/
-static void __used __kprobes kretprobe_trampoline_holder(void)
+static void __used kretprobe_trampoline_holder(void)
{
asm volatile (
".global kretprobe_trampoline\n"
@@ -657,11 +671,13 @@ static void __used __kprobes kretprobe_trampoline_holder(void)
#endif
" ret\n");
}
+NOKPROBE_SYMBOL(kretprobe_trampoline_holder);
+NOKPROBE_SYMBOL(kretprobe_trampoline);
/*
* Called from kretprobe_trampoline
*/
-__visible __used __kprobes void *trampoline_handler(struct pt_regs *regs)
+__visible __used void *trampoline_handler(struct pt_regs *regs)
{
struct kretprobe_instance *ri = NULL;
struct hlist_head *head, empty_rp;
@@ -747,6 +763,7 @@ __visible __used __kprobes void *trampoline_handler(struct pt_regs *regs)
}
return (void *)orig_ret_address;
}
+NOKPROBE_SYMBOL(trampoline_handler);
/*
* Called after single-stepping. p->addr is the address of the
@@ -775,8 +792,8 @@ __visible __used __kprobes void *trampoline_handler(struct pt_regs *regs)
* jump instruction after the copied instruction, that jumps to the next
* instruction after the probepoint.
*/
-static void __kprobes
-resume_execution(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb)
+static void resume_execution(struct kprobe *p, struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb)
{
unsigned long *tos = stack_addr(regs);
unsigned long copy_ip = (unsigned long)p->ainsn.insn;
@@ -851,12 +868,13 @@ resume_execution(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *k
no_change:
restore_btf();
}
+NOKPROBE_SYMBOL(resume_execution);
/*
* Interrupts are disabled on entry as trap1 is an interrupt gate and they
* remain disabled throughout this function.
*/
-static int __kprobes post_kprobe_handler(struct pt_regs *regs)
+int kprobe_debug_handler(struct pt_regs *regs)
{
struct kprobe *cur = kprobe_running();
struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
@@ -891,15 +909,17 @@ out:
return 1;
}
+NOKPROBE_SYMBOL(kprobe_debug_handler);
-int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
+int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
{
struct kprobe *cur = kprobe_running();
struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
- switch (kcb->kprobe_status) {
- case KPROBE_HIT_SS:
- case KPROBE_REENTER:
+ if (unlikely(regs->ip == (unsigned long)cur->ainsn.insn)) {
+ /* This must happen on single-stepping */
+ WARN_ON(kcb->kprobe_status != KPROBE_HIT_SS &&
+ kcb->kprobe_status != KPROBE_REENTER);
/*
* We are here because the instruction being single
* stepped caused a page fault. We reset the current
@@ -914,9 +934,8 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
else
reset_current_kprobe();
preempt_enable_no_resched();
- break;
- case KPROBE_HIT_ACTIVE:
- case KPROBE_HIT_SSDONE:
+ } else if (kcb->kprobe_status == KPROBE_HIT_ACTIVE ||
+ kcb->kprobe_status == KPROBE_HIT_SSDONE) {
/*
* We increment the nmissed count for accounting,
* we can also use npre/npostfault count for accounting
@@ -945,18 +964,17 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
* fixup routine could not handle it,
* Let do_page_fault() fix it.
*/
- break;
- default:
- break;
}
+
return 0;
}
+NOKPROBE_SYMBOL(kprobe_fault_handler);
/*
* Wrapper routine for handling exceptions.
*/
-int __kprobes
-kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, void *data)
+int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val,
+ void *data)
{
struct die_args *args = data;
int ret = NOTIFY_DONE;
@@ -964,22 +982,7 @@ kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, void *d
if (args->regs && user_mode_vm(args->regs))
return ret;
- switch (val) {
- case DIE_INT3:
- if (kprobe_handler(args->regs))
- ret = NOTIFY_STOP;
- break;
- case DIE_DEBUG:
- if (post_kprobe_handler(args->regs)) {
- /*
- * Reset the BS bit in dr6 (pointed by args->err) to
- * denote completion of processing
- */
- (*(unsigned long *)ERR_PTR(args->err)) &= ~DR_STEP;
- ret = NOTIFY_STOP;
- }
- break;
- case DIE_GPF:
+ if (val == DIE_GPF) {
/*
* To be potentially processing a kprobe fault and to
* trust the result from kprobe_running(), we have
@@ -988,14 +991,12 @@ kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, void *d
if (!preemptible() && kprobe_running() &&
kprobe_fault_handler(args->regs, args->trapnr))
ret = NOTIFY_STOP;
- break;
- default:
- break;
}
return ret;
}
+NOKPROBE_SYMBOL(kprobe_exceptions_notify);
-int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
+int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
struct jprobe *jp = container_of(p, struct jprobe, kp);
unsigned long addr;
@@ -1019,8 +1020,9 @@ int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
regs->ip = (unsigned long)(jp->entry);
return 1;
}
+NOKPROBE_SYMBOL(setjmp_pre_handler);
-void __kprobes jprobe_return(void)
+void jprobe_return(void)
{
struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
@@ -1036,8 +1038,10 @@ void __kprobes jprobe_return(void)
" nop \n"::"b"
(kcb->jprobe_saved_sp):"memory");
}
+NOKPROBE_SYMBOL(jprobe_return);
+NOKPROBE_SYMBOL(jprobe_return_end);
-int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
+int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
{
struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
u8 *addr = (u8 *) (regs->ip - 1);
@@ -1065,13 +1069,22 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
}
return 0;
}
+NOKPROBE_SYMBOL(longjmp_break_handler);
+
+bool arch_within_kprobe_blacklist(unsigned long addr)
+{
+ return (addr >= (unsigned long)__kprobes_text_start &&
+ addr < (unsigned long)__kprobes_text_end) ||
+ (addr >= (unsigned long)__entry_text_start &&
+ addr < (unsigned long)__entry_text_end);
+}
int __init arch_init_kprobes(void)
{
return 0;
}
-int __kprobes arch_trampoline_kprobe(struct kprobe *p)
+int arch_trampoline_kprobe(struct kprobe *p)
{
return 0;
}
diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c
index 23ef5c556f0..717b02a22e6 100644
--- a/arch/x86/kernel/kprobes/ftrace.c
+++ b/arch/x86/kernel/kprobes/ftrace.c
@@ -25,8 +25,9 @@
#include "common.h"
-static int __skip_singlestep(struct kprobe *p, struct pt_regs *regs,
- struct kprobe_ctlblk *kcb)
+static nokprobe_inline
+int __skip_singlestep(struct kprobe *p, struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb)
{
/*
* Emulate singlestep (and also recover regs->ip)
@@ -41,18 +42,19 @@ static int __skip_singlestep(struct kprobe *p, struct pt_regs *regs,
return 1;
}
-int __kprobes skip_singlestep(struct kprobe *p, struct pt_regs *regs,
- struct kprobe_ctlblk *kcb)
+int skip_singlestep(struct kprobe *p, struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb)
{
if (kprobe_ftrace(p))
return __skip_singlestep(p, regs, kcb);
else
return 0;
}
+NOKPROBE_SYMBOL(skip_singlestep);
/* Ftrace callback handler for kprobes */
-void __kprobes kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *ops, struct pt_regs *regs)
+void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *ops, struct pt_regs *regs)
{
struct kprobe *p;
struct kprobe_ctlblk *kcb;
@@ -84,8 +86,9 @@ void __kprobes kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
end:
local_irq_restore(flags);
}
+NOKPROBE_SYMBOL(kprobe_ftrace_handler);
-int __kprobes arch_prepare_kprobe_ftrace(struct kprobe *p)
+int arch_prepare_kprobe_ftrace(struct kprobe *p)
{
p->ainsn.insn = NULL;
p->ainsn.boostable = -1;
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index 898160b42e4..f304773285a 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -77,7 +77,7 @@ found:
}
/* Insert a move instruction which sets a pointer to eax/rdi (1st arg). */
-static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr, unsigned long val)
+static void synthesize_set_arg1(kprobe_opcode_t *addr, unsigned long val)
{
#ifdef CONFIG_X86_64
*addr++ = 0x48;
@@ -138,7 +138,8 @@ asm (
#define INT3_SIZE sizeof(kprobe_opcode_t)
/* Optimized kprobe call back function: called from optinsn */
-static void __kprobes optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs)
+static void
+optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs)
{
struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
unsigned long flags;
@@ -168,8 +169,9 @@ static void __kprobes optimized_callback(struct optimized_kprobe *op, struct pt_
}
local_irq_restore(flags);
}
+NOKPROBE_SYMBOL(optimized_callback);
-static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src)
+static int copy_optimized_instructions(u8 *dest, u8 *src)
{
int len = 0, ret;
@@ -189,7 +191,7 @@ static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src)
}
/* Check whether insn is indirect jump */
-static int __kprobes insn_is_indirect_jump(struct insn *insn)
+static int insn_is_indirect_jump(struct insn *insn)
{
return ((insn->opcode.bytes[0] == 0xff &&
(X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */
@@ -224,7 +226,7 @@ static int insn_jump_into_range(struct insn *insn, unsigned long start, int len)
}
/* Decode whole function to ensure any instructions don't jump into target */
-static int __kprobes can_optimize(unsigned long paddr)
+static int can_optimize(unsigned long paddr)
{
unsigned long addr, size = 0, offset = 0;
struct insn insn;
@@ -275,7 +277,7 @@ static int __kprobes can_optimize(unsigned long paddr)
}
/* Check optimized_kprobe can actually be optimized. */
-int __kprobes arch_check_optimized_kprobe(struct optimized_kprobe *op)
+int arch_check_optimized_kprobe(struct optimized_kprobe *op)
{
int i;
struct kprobe *p;
@@ -290,15 +292,15 @@ int __kprobes arch_check_optimized_kprobe(struct optimized_kprobe *op)
}
/* Check the addr is within the optimized instructions. */
-int __kprobes
-arch_within_optimized_kprobe(struct optimized_kprobe *op, unsigned long addr)
+int arch_within_optimized_kprobe(struct optimized_kprobe *op,
+ unsigned long addr)
{
return ((unsigned long)op->kp.addr <= addr &&
(unsigned long)op->kp.addr + op->optinsn.size > addr);
}
/* Free optimized instruction slot */
-static __kprobes
+static
void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty)
{
if (op->optinsn.insn) {
@@ -308,7 +310,7 @@ void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty)
}
}
-void __kprobes arch_remove_optimized_kprobe(struct optimized_kprobe *op)
+void arch_remove_optimized_kprobe(struct optimized_kprobe *op)
{
__arch_remove_optimized_kprobe(op, 1);
}
@@ -318,7 +320,7 @@ void __kprobes arch_remove_optimized_kprobe(struct optimized_kprobe *op)
* Target instructions MUST be relocatable (checked inside)
* This is called when new aggr(opt)probe is allocated or reused.
*/
-int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op)
+int arch_prepare_optimized_kprobe(struct optimized_kprobe *op)
{
u8 *buf;
int ret;
@@ -372,7 +374,7 @@ int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op)
* Replace breakpoints (int3) with relative jumps.
* Caller must call with locking kprobe_mutex and text_mutex.
*/
-void __kprobes arch_optimize_kprobes(struct list_head *oplist)
+void arch_optimize_kprobes(struct list_head *oplist)
{
struct optimized_kprobe *op, *tmp;
u8 insn_buf[RELATIVEJUMP_SIZE];
@@ -398,7 +400,7 @@ void __kprobes arch_optimize_kprobes(struct list_head *oplist)
}
/* Replace a relative jump with a breakpoint (int3). */
-void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op)
+void arch_unoptimize_kprobe(struct optimized_kprobe *op)
{
u8 insn_buf[RELATIVEJUMP_SIZE];
@@ -424,8 +426,7 @@ extern void arch_unoptimize_kprobes(struct list_head *oplist,
}
}
-int __kprobes
-setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter)
+int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter)
{
struct optimized_kprobe *op;
@@ -441,3 +442,4 @@ setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter)
}
return 0;
}
+NOKPROBE_SYMBOL(setup_detour_execution);
diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c
new file mode 100644
index 00000000000..c2bedaea11f
--- /dev/null
+++ b/arch/x86/kernel/ksysfs.c
@@ -0,0 +1,340 @@
+/*
+ * Architecture specific sysfs attributes in /sys/kernel
+ *
+ * Copyright (C) 2007, Intel Corp.
+ * Huang Ying <ying.huang@intel.com>
+ * Copyright (C) 2013, 2013 Red Hat, Inc.
+ * Dave Young <dyoung@redhat.com>
+ *
+ * This file is released under the GPLv2
+ */
+
+#include <linux/kobject.h>
+#include <linux/string.h>
+#include <linux/sysfs.h>
+#include <linux/init.h>
+#include <linux/stat.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+
+#include <asm/io.h>
+#include <asm/setup.h>
+
+static ssize_t version_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "0x%04x\n", boot_params.hdr.version);
+}
+
+static struct kobj_attribute boot_params_version_attr = __ATTR_RO(version);
+
+static ssize_t boot_params_data_read(struct file *fp, struct kobject *kobj,
+ struct bin_attribute *bin_attr,
+ char *buf, loff_t off, size_t count)
+{
+ memcpy(buf, (void *)&boot_params + off, count);
+ return count;
+}
+
+static struct bin_attribute boot_params_data_attr = {
+ .attr = {
+ .name = "data",
+ .mode = S_IRUGO,
+ },
+ .read = boot_params_data_read,
+ .size = sizeof(boot_params),
+};
+
+static struct attribute *boot_params_version_attrs[] = {
+ &boot_params_version_attr.attr,
+ NULL,
+};
+
+static struct bin_attribute *boot_params_data_attrs[] = {
+ &boot_params_data_attr,
+ NULL,
+};
+
+static struct attribute_group boot_params_attr_group = {
+ .attrs = boot_params_version_attrs,
+ .bin_attrs = boot_params_data_attrs,
+};
+
+static int kobj_to_setup_data_nr(struct kobject *kobj, int *nr)
+{
+ const char *name;
+
+ name = kobject_name(kobj);
+ return kstrtoint(name, 10, nr);
+}
+
+static int get_setup_data_paddr(int nr, u64 *paddr)
+{
+ int i = 0;
+ struct setup_data *data;
+ u64 pa_data = boot_params.hdr.setup_data;
+
+ while (pa_data) {
+ if (nr == i) {
+ *paddr = pa_data;
+ return 0;
+ }
+ data = ioremap_cache(pa_data, sizeof(*data));
+ if (!data)
+ return -ENOMEM;
+
+ pa_data = data->next;
+ iounmap(data);
+ i++;
+ }
+ return -EINVAL;
+}
+
+static int __init get_setup_data_size(int nr, size_t *size)
+{
+ int i = 0;
+ struct setup_data *data;
+ u64 pa_data = boot_params.hdr.setup_data;
+
+ while (pa_data) {
+ data = ioremap_cache(pa_data, sizeof(*data));
+ if (!data)
+ return -ENOMEM;
+ if (nr == i) {
+ *size = data->len;
+ iounmap(data);
+ return 0;
+ }
+
+ pa_data = data->next;
+ iounmap(data);
+ i++;
+ }
+ return -EINVAL;
+}
+
+static ssize_t type_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ int nr, ret;
+ u64 paddr;
+ struct setup_data *data;
+
+ ret = kobj_to_setup_data_nr(kobj, &nr);
+ if (ret)
+ return ret;
+
+ ret = get_setup_data_paddr(nr, &paddr);
+ if (ret)
+ return ret;
+ data = ioremap_cache(paddr, sizeof(*data));
+ if (!data)
+ return -ENOMEM;
+
+ ret = sprintf(buf, "0x%x\n", data->type);
+ iounmap(data);
+ return ret;
+}
+
+static ssize_t setup_data_data_read(struct file *fp,
+ struct kobject *kobj,
+ struct bin_attribute *bin_attr,
+ char *buf,
+ loff_t off, size_t count)
+{
+ int nr, ret = 0;
+ u64 paddr;
+ struct setup_data *data;
+ void *p;
+
+ ret = kobj_to_setup_data_nr(kobj, &nr);
+ if (ret)
+ return ret;
+
+ ret = get_setup_data_paddr(nr, &paddr);
+ if (ret)
+ return ret;
+ data = ioremap_cache(paddr, sizeof(*data));
+ if (!data)
+ return -ENOMEM;
+
+ if (off > data->len) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (count > data->len - off)
+ count = data->len - off;
+
+ if (!count)
+ goto out;
+
+ ret = count;
+ p = ioremap_cache(paddr + sizeof(*data), data->len);
+ if (!p) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ memcpy(buf, p + off, count);
+ iounmap(p);
+out:
+ iounmap(data);
+ return ret;
+}
+
+static struct kobj_attribute type_attr = __ATTR_RO(type);
+
+static struct bin_attribute data_attr = {
+ .attr = {
+ .name = "data",
+ .mode = S_IRUGO,
+ },
+ .read = setup_data_data_read,
+};
+
+static struct attribute *setup_data_type_attrs[] = {
+ &type_attr.attr,
+ NULL,
+};
+
+static struct bin_attribute *setup_data_data_attrs[] = {
+ &data_attr,
+ NULL,
+};
+
+static struct attribute_group setup_data_attr_group = {
+ .attrs = setup_data_type_attrs,
+ .bin_attrs = setup_data_data_attrs,
+};
+
+static int __init create_setup_data_node(struct kobject *parent,
+ struct kobject **kobjp, int nr)
+{
+ int ret = 0;
+ size_t size;
+ struct kobject *kobj;
+ char name[16]; /* should be enough for setup_data nodes numbers */
+ snprintf(name, 16, "%d", nr);
+
+ kobj = kobject_create_and_add(name, parent);
+ if (!kobj)
+ return -ENOMEM;
+
+ ret = get_setup_data_size(nr, &size);
+ if (ret)
+ goto out_kobj;
+
+ data_attr.size = size;
+ ret = sysfs_create_group(kobj, &setup_data_attr_group);
+ if (ret)
+ goto out_kobj;
+ *kobjp = kobj;
+
+ return 0;
+out_kobj:
+ kobject_put(kobj);
+ return ret;
+}
+
+static void __init cleanup_setup_data_node(struct kobject *kobj)
+{
+ sysfs_remove_group(kobj, &setup_data_attr_group);
+ kobject_put(kobj);
+}
+
+static int __init get_setup_data_total_num(u64 pa_data, int *nr)
+{
+ int ret = 0;
+ struct setup_data *data;
+
+ *nr = 0;
+ while (pa_data) {
+ *nr += 1;
+ data = ioremap_cache(pa_data, sizeof(*data));
+ if (!data) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ pa_data = data->next;
+ iounmap(data);
+ }
+
+out:
+ return ret;
+}
+
+static int __init create_setup_data_nodes(struct kobject *parent)
+{
+ struct kobject *setup_data_kobj, **kobjp;
+ u64 pa_data;
+ int i, j, nr, ret = 0;
+
+ pa_data = boot_params.hdr.setup_data;
+ if (!pa_data)
+ return 0;
+
+ setup_data_kobj = kobject_create_and_add("setup_data", parent);
+ if (!setup_data_kobj) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = get_setup_data_total_num(pa_data, &nr);
+ if (ret)
+ goto out_setup_data_kobj;
+
+ kobjp = kmalloc(sizeof(*kobjp) * nr, GFP_KERNEL);
+ if (!kobjp) {
+ ret = -ENOMEM;
+ goto out_setup_data_kobj;
+ }
+
+ for (i = 0; i < nr; i++) {
+ ret = create_setup_data_node(setup_data_kobj, kobjp + i, i);
+ if (ret)
+ goto out_clean_nodes;
+ }
+
+ kfree(kobjp);
+ return 0;
+
+out_clean_nodes:
+ for (j = i - 1; j > 0; j--)
+ cleanup_setup_data_node(*(kobjp + j));
+ kfree(kobjp);
+out_setup_data_kobj:
+ kobject_put(setup_data_kobj);
+out:
+ return ret;
+}
+
+static int __init boot_params_ksysfs_init(void)
+{
+ int ret;
+ struct kobject *boot_params_kobj;
+
+ boot_params_kobj = kobject_create_and_add("boot_params",
+ kernel_kobj);
+ if (!boot_params_kobj) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = sysfs_create_group(boot_params_kobj, &boot_params_attr_group);
+ if (ret)
+ goto out_boot_params_kobj;
+
+ ret = create_setup_data_nodes(boot_params_kobj);
+ if (ret)
+ goto out_create_group;
+
+ return 0;
+out_create_group:
+ sysfs_remove_group(boot_params_kobj, &boot_params_attr_group);
+out_boot_params_kobj:
+ kobject_put(boot_params_kobj);
+out:
+ return ret;
+}
+
+arch_initcall(boot_params_ksysfs_init);
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 697b93af02d..3dd8e2c4d74 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -251,15 +251,16 @@ u32 kvm_read_and_reset_pf_reason(void)
return reason;
}
EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason);
+NOKPROBE_SYMBOL(kvm_read_and_reset_pf_reason);
-dotraplinkage void __kprobes
+dotraplinkage void
do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
{
enum ctx_state prev_state;
switch (kvm_read_and_reset_pf_reason()) {
default:
- do_page_fault(regs, error_code);
+ trace_do_page_fault(regs, error_code);
break;
case KVM_PV_REASON_PAGE_NOT_PRESENT:
/* page is swapped out by the host. */
@@ -276,6 +277,7 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
break;
}
}
+NOKPROBE_SYMBOL(do_async_page_fault);
static void __init paravirt_ops_setup(void)
{
@@ -417,7 +419,6 @@ void kvm_disable_steal_time(void)
#ifdef CONFIG_SMP
static void __init kvm_smp_prepare_boot_cpu(void)
{
- WARN_ON(kvm_register_clock("primary cpu clock"));
kvm_guest_cpu_init();
native_smp_prepare_boot_cpu();
kvm_spinlock_init();
@@ -464,7 +465,7 @@ static struct notifier_block kvm_cpu_notifier = {
static void __init kvm_apf_trap_init(void)
{
- set_intr_gate(14, &async_page_fault);
+ set_intr_gate(14, async_page_fault);
}
void __init kvm_guest_init(void)
@@ -500,6 +501,38 @@ void __init kvm_guest_init(void)
#endif
}
+static noinline uint32_t __kvm_cpuid_base(void)
+{
+ if (boot_cpu_data.cpuid_level < 0)
+ return 0; /* So we don't blow up on old processors */
+
+ if (cpu_has_hypervisor)
+ return hypervisor_cpuid_base("KVMKVMKVM\0\0\0", 0);
+
+ return 0;
+}
+
+static inline uint32_t kvm_cpuid_base(void)
+{
+ static int kvm_cpuid_base = -1;
+
+ if (kvm_cpuid_base == -1)
+ kvm_cpuid_base = __kvm_cpuid_base();
+
+ return kvm_cpuid_base;
+}
+
+bool kvm_para_available(void)
+{
+ return kvm_cpuid_base() != 0;
+}
+EXPORT_SYMBOL_GPL(kvm_para_available);
+
+unsigned int kvm_arch_para_features(void)
+{
+ return cpuid_eax(kvm_cpuid_base() | KVM_CPUID_FEATURES);
+}
+
static uint32_t __init kvm_detect(void)
{
return kvm_cpuid_base();
@@ -609,7 +642,7 @@ static struct dentry *d_kvm_debug;
struct dentry *kvm_init_debugfs(void)
{
- d_kvm_debug = debugfs_create_dir("kvm", NULL);
+ d_kvm_debug = debugfs_create_dir("kvm-guest", NULL);
if (!d_kvm_debug)
printk(KERN_WARNING "Could not create 'kvm' debugfs directory\n");
@@ -673,7 +706,7 @@ static cpumask_t waiting_cpus;
/* Track spinlock on which a cpu is waiting */
static DEFINE_PER_CPU(struct kvm_lock_waiting, klock_waiting);
-static void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
+__visible void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
{
struct kvm_lock_waiting *w;
int cpu;
@@ -775,11 +808,22 @@ void __init kvm_spinlock_init(void)
if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT))
return;
- printk(KERN_INFO "KVM setup paravirtual spinlock\n");
+ pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(kvm_lock_spinning);
+ pv_lock_ops.unlock_kick = kvm_unlock_kick;
+}
+
+static __init int kvm_spinlock_init_jump(void)
+{
+ if (!kvm_para_available())
+ return 0;
+ if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT))
+ return 0;
static_key_slow_inc(&paravirt_ticketlocks_enabled);
+ printk(KERN_INFO "KVM setup paravirtual spinlock\n");
- pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(kvm_lock_spinning);
- pv_lock_ops.unlock_kick = kvm_unlock_kick;
+ return 0;
}
+early_initcall(kvm_spinlock_init_jump);
+
#endif /* CONFIG_PARAVIRT_SPINLOCKS */
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 1570e074134..d9156ceecdf 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -139,6 +139,7 @@ bool kvm_check_and_clear_guest_paused(void)
src = &hv_clock[cpu].pvti;
if ((src->flags & PVCLOCK_GUEST_STOPPED) != 0) {
src->flags &= ~PVCLOCK_GUEST_STOPPED;
+ pvclock_touch_watchdogs();
ret = true;
}
@@ -241,7 +242,7 @@ void __init kvmclock_init(void)
hv_clock = __va(mem);
memset(hv_clock, 0, size);
- if (kvm_register_clock("boot clock")) {
+ if (kvm_register_clock("primary cpu clock")) {
hv_clock = NULL;
memblock_free(mem, size);
return;
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index ebc98739892..c37886d759c 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -229,6 +229,11 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
}
}
+ if (!IS_ENABLED(CONFIG_X86_16BIT) && !ldt_info.seg_32bit) {
+ error = -EINVAL;
+ goto out_unlock;
+ }
+
fill_ldt(&ldt, &ldt_info);
if (oldmode)
ldt.avl = 0;
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index 5b19e4d78b0..1667b1de8d5 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -9,7 +9,6 @@
#include <linux/mm.h>
#include <linux/kexec.h>
#include <linux/delay.h>
-#include <linux/init.h>
#include <linux/numa.h>
#include <linux/ftrace.h>
#include <linux/suspend.h>
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 4eabc160696..679cef0791c 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -279,5 +279,7 @@ void arch_crash_save_vmcoreinfo(void)
VMCOREINFO_SYMBOL(node_data);
VMCOREINFO_LENGTH(node_data, MAX_NUMNODES);
#endif
+ vmcoreinfo_append_str("KERNELOFFSET=%lx\n",
+ (unsigned long)&_text - __START_KERNEL);
}
diff --git a/arch/x86/kernel/mcount_64.S b/arch/x86/kernel/mcount_64.S
new file mode 100644
index 00000000000..c050a015316
--- /dev/null
+++ b/arch/x86/kernel/mcount_64.S
@@ -0,0 +1,217 @@
+/*
+ * linux/arch/x86_64/mcount_64.S
+ *
+ * Copyright (C) 2014 Steven Rostedt, Red Hat Inc
+ */
+
+#include <linux/linkage.h>
+#include <asm/ptrace.h>
+#include <asm/ftrace.h>
+
+
+ .code64
+ .section .entry.text, "ax"
+
+
+#ifdef CONFIG_FUNCTION_TRACER
+
+#ifdef CC_USING_FENTRY
+# define function_hook __fentry__
+#else
+# define function_hook mcount
+#endif
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+ENTRY(function_hook)
+ retq
+END(function_hook)
+
+/* skip is set if stack has been adjusted */
+.macro ftrace_caller_setup skip=0
+ MCOUNT_SAVE_FRAME \skip
+
+ /* Load the ftrace_ops into the 3rd parameter */
+ movq function_trace_op(%rip), %rdx
+
+ /* Load ip into the first parameter */
+ movq RIP(%rsp), %rdi
+ subq $MCOUNT_INSN_SIZE, %rdi
+ /* Load the parent_ip into the second parameter */
+#ifdef CC_USING_FENTRY
+ movq SS+16(%rsp), %rsi
+#else
+ movq 8(%rbp), %rsi
+#endif
+.endm
+
+ENTRY(ftrace_caller)
+ /* Check if tracing was disabled (quick check) */
+ cmpl $0, function_trace_stop
+ jne ftrace_stub
+
+ ftrace_caller_setup
+ /* regs go into 4th parameter (but make it NULL) */
+ movq $0, %rcx
+
+GLOBAL(ftrace_call)
+ call ftrace_stub
+
+ MCOUNT_RESTORE_FRAME
+ftrace_return:
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+GLOBAL(ftrace_graph_call)
+ jmp ftrace_stub
+#endif
+
+GLOBAL(ftrace_stub)
+ retq
+END(ftrace_caller)
+
+ENTRY(ftrace_regs_caller)
+ /* Save the current flags before compare (in SS location)*/
+ pushfq
+
+ /* Check if tracing was disabled (quick check) */
+ cmpl $0, function_trace_stop
+ jne ftrace_restore_flags
+
+ /* skip=8 to skip flags saved in SS */
+ ftrace_caller_setup 8
+
+ /* Save the rest of pt_regs */
+ movq %r15, R15(%rsp)
+ movq %r14, R14(%rsp)
+ movq %r13, R13(%rsp)
+ movq %r12, R12(%rsp)
+ movq %r11, R11(%rsp)
+ movq %r10, R10(%rsp)
+ movq %rbp, RBP(%rsp)
+ movq %rbx, RBX(%rsp)
+ /* Copy saved flags */
+ movq SS(%rsp), %rcx
+ movq %rcx, EFLAGS(%rsp)
+ /* Kernel segments */
+ movq $__KERNEL_DS, %rcx
+ movq %rcx, SS(%rsp)
+ movq $__KERNEL_CS, %rcx
+ movq %rcx, CS(%rsp)
+ /* Stack - skipping return address */
+ leaq SS+16(%rsp), %rcx
+ movq %rcx, RSP(%rsp)
+
+ /* regs go into 4th parameter */
+ leaq (%rsp), %rcx
+
+GLOBAL(ftrace_regs_call)
+ call ftrace_stub
+
+ /* Copy flags back to SS, to restore them */
+ movq EFLAGS(%rsp), %rax
+ movq %rax, SS(%rsp)
+
+ /* Handlers can change the RIP */
+ movq RIP(%rsp), %rax
+ movq %rax, SS+8(%rsp)
+
+ /* restore the rest of pt_regs */
+ movq R15(%rsp), %r15
+ movq R14(%rsp), %r14
+ movq R13(%rsp), %r13
+ movq R12(%rsp), %r12
+ movq R10(%rsp), %r10
+ movq RBP(%rsp), %rbp
+ movq RBX(%rsp), %rbx
+
+ /* skip=8 to skip flags saved in SS */
+ MCOUNT_RESTORE_FRAME 8
+
+ /* Restore flags */
+ popfq
+
+ jmp ftrace_return
+ftrace_restore_flags:
+ popfq
+ jmp ftrace_stub
+
+END(ftrace_regs_caller)
+
+
+#else /* ! CONFIG_DYNAMIC_FTRACE */
+
+ENTRY(function_hook)
+ cmpl $0, function_trace_stop
+ jne ftrace_stub
+
+ cmpq $ftrace_stub, ftrace_trace_function
+ jnz trace
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ cmpq $ftrace_stub, ftrace_graph_return
+ jnz ftrace_graph_caller
+
+ cmpq $ftrace_graph_entry_stub, ftrace_graph_entry
+ jnz ftrace_graph_caller
+#endif
+
+GLOBAL(ftrace_stub)
+ retq
+
+trace:
+ MCOUNT_SAVE_FRAME
+
+ movq RIP(%rsp), %rdi
+#ifdef CC_USING_FENTRY
+ movq SS+16(%rsp), %rsi
+#else
+ movq 8(%rbp), %rsi
+#endif
+ subq $MCOUNT_INSN_SIZE, %rdi
+
+ call *ftrace_trace_function
+
+ MCOUNT_RESTORE_FRAME
+
+ jmp ftrace_stub
+END(function_hook)
+#endif /* CONFIG_DYNAMIC_FTRACE */
+#endif /* CONFIG_FUNCTION_TRACER */
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ENTRY(ftrace_graph_caller)
+ MCOUNT_SAVE_FRAME
+
+#ifdef CC_USING_FENTRY
+ leaq SS+16(%rsp), %rdi
+ movq $0, %rdx /* No framepointers needed */
+#else
+ leaq 8(%rbp), %rdi
+ movq (%rbp), %rdx
+#endif
+ movq RIP(%rsp), %rsi
+ subq $MCOUNT_INSN_SIZE, %rsi
+
+ call prepare_ftrace_return
+
+ MCOUNT_RESTORE_FRAME
+
+ retq
+END(ftrace_graph_caller)
+
+GLOBAL(return_to_handler)
+ subq $24, %rsp
+
+ /* Save the return values */
+ movq %rax, (%rsp)
+ movq %rdx, 8(%rsp)
+ movq %rbp, %rdi
+
+ call ftrace_return_to_handler
+
+ movq %rax, %rdi
+ movq 8(%rsp), %rdx
+ movq (%rsp), %rax
+ addq $24, %rsp
+ jmp *%rdi
+#endif
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 216a4d754b0..e69f9882bf9 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -28,6 +28,7 @@
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/jump_label.h>
+#include <linux/random.h>
#include <asm/page.h>
#include <asm/pgtable.h>
@@ -43,13 +44,52 @@ do { \
} while (0)
#endif
+#ifdef CONFIG_RANDOMIZE_BASE
+static unsigned long module_load_offset;
+static int randomize_modules = 1;
+
+/* Mutex protects the module_load_offset. */
+static DEFINE_MUTEX(module_kaslr_mutex);
+
+static int __init parse_nokaslr(char *p)
+{
+ randomize_modules = 0;
+ return 0;
+}
+early_param("nokaslr", parse_nokaslr);
+
+static unsigned long int get_module_load_offset(void)
+{
+ if (randomize_modules) {
+ mutex_lock(&module_kaslr_mutex);
+ /*
+ * Calculate the module_load_offset the first time this
+ * code is called. Once calculated it stays the same until
+ * reboot.
+ */
+ if (module_load_offset == 0)
+ module_load_offset =
+ (get_random_int() % 1024 + 1) * PAGE_SIZE;
+ mutex_unlock(&module_kaslr_mutex);
+ }
+ return module_load_offset;
+}
+#else
+static unsigned long int get_module_load_offset(void)
+{
+ return 0;
+}
+#endif
+
void *module_alloc(unsigned long size)
{
if (PAGE_ALIGN(size) > MODULES_LEN)
return NULL;
- return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
- GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC,
- -1, __builtin_return_address(0));
+ return __vmalloc_node_range(size, 1,
+ MODULES_VADDR + get_module_load_offset(),
+ MODULES_END, GFP_KERNEL | __GFP_HIGHMEM,
+ PAGE_KERNEL_EXEC, NUMA_NO_NODE,
+ __builtin_return_address(0));
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 88458faea2f..c9603ac80de 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -46,7 +46,7 @@ static struct class *msr_class;
static loff_t msr_seek(struct file *file, loff_t offset, int orig)
{
loff_t ret;
- struct inode *inode = file->f_mapping->host;
+ struct inode *inode = file_inode(file);
mutex_lock(&inode->i_mutex);
switch (orig) {
@@ -259,14 +259,15 @@ static int __init msr_init(void)
goto out_chrdev;
}
msr_class->devnode = msr_devnode;
- get_online_cpus();
+
+ cpu_notifier_register_begin();
for_each_online_cpu(i) {
err = msr_device_create(i);
if (err != 0)
goto out_class;
}
- register_hotcpu_notifier(&msr_class_cpu_notifier);
- put_online_cpus();
+ __register_hotcpu_notifier(&msr_class_cpu_notifier);
+ cpu_notifier_register_done();
err = 0;
goto out;
@@ -275,7 +276,7 @@ out_class:
i = 0;
for_each_online_cpu(i)
msr_device_destroy(i);
- put_online_cpus();
+ cpu_notifier_register_done();
class_destroy(msr_class);
out_chrdev:
__unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr");
@@ -286,13 +287,14 @@ out:
static void __exit msr_exit(void)
{
int cpu = 0;
- get_online_cpus();
+
+ cpu_notifier_register_begin();
for_each_online_cpu(cpu)
msr_device_destroy(cpu);
class_destroy(msr_class);
__unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr");
- unregister_hotcpu_notifier(&msr_class_cpu_notifier);
- put_online_cpus();
+ __unregister_hotcpu_notifier(&msr_class_cpu_notifier);
+ cpu_notifier_register_done();
}
module_init(msr_init);
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index ba77ebc2c35..c3e985d1751 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -87,6 +87,7 @@ __setup("unknown_nmi_panic", setup_unknown_nmi_panic);
#define nmi_to_desc(type) (&nmi_desc[type])
static u64 nmi_longest_ns = 1 * NSEC_PER_MSEC;
+
static int __init nmi_warning_debugfs(void)
{
debugfs_create_u64("nmi_longest_ns", 0644,
@@ -95,7 +96,21 @@ static int __init nmi_warning_debugfs(void)
}
fs_initcall(nmi_warning_debugfs);
-static int __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b)
+static void nmi_max_handler(struct irq_work *w)
+{
+ struct nmiaction *a = container_of(w, struct nmiaction, irq_work);
+ int remainder_ns, decimal_msecs;
+ u64 whole_msecs = ACCESS_ONCE(a->max_duration);
+
+ remainder_ns = do_div(whole_msecs, (1000 * 1000));
+ decimal_msecs = remainder_ns / 1000;
+
+ printk_ratelimited(KERN_INFO
+ "INFO: NMI handler (%ps) took too long to run: %lld.%03d msecs\n",
+ a->handler, whole_msecs, decimal_msecs);
+}
+
+static int nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b)
{
struct nmi_desc *desc = nmi_to_desc(type);
struct nmiaction *a;
@@ -110,26 +125,20 @@ static int __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2
* to handle those situations.
*/
list_for_each_entry_rcu(a, &desc->head, list) {
- u64 before, delta, whole_msecs;
- int remainder_ns, decimal_msecs, thishandled;
+ int thishandled;
+ u64 delta;
- before = local_clock();
+ delta = sched_clock();
thishandled = a->handler(type, regs);
handled += thishandled;
- delta = local_clock() - before;
+ delta = sched_clock() - delta;
trace_nmi_handler(a->handler, (int)delta, thishandled);
- if (delta < nmi_longest_ns)
+ if (delta < nmi_longest_ns || delta < a->max_duration)
continue;
- nmi_longest_ns = delta;
- whole_msecs = delta;
- remainder_ns = do_div(whole_msecs, (1000 * 1000));
- decimal_msecs = remainder_ns / 1000;
- printk_ratelimited(KERN_INFO
- "INFO: NMI handler (%ps) took too long to run: "
- "%lld.%03d msecs\n", a->handler, whole_msecs,
- decimal_msecs);
+ a->max_duration = delta;
+ irq_work_queue(&a->irq_work);
}
rcu_read_unlock();
@@ -137,6 +146,7 @@ static int __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2
/* return total number of NMI events handled */
return handled;
}
+NOKPROBE_SYMBOL(nmi_handle);
int __register_nmi_handler(unsigned int type, struct nmiaction *action)
{
@@ -146,6 +156,8 @@ int __register_nmi_handler(unsigned int type, struct nmiaction *action)
if (!action->handler)
return -EINVAL;
+ init_irq_work(&action->irq_work, nmi_max_handler);
+
spin_lock_irqsave(&desc->lock, flags);
/*
@@ -197,7 +209,7 @@ void unregister_nmi_handler(unsigned int type, const char *name)
}
EXPORT_SYMBOL_GPL(unregister_nmi_handler);
-static __kprobes void
+static void
pci_serr_error(unsigned char reason, struct pt_regs *regs)
{
/* check to see if anyone registered against these types of errors */
@@ -227,8 +239,9 @@ pci_serr_error(unsigned char reason, struct pt_regs *regs)
reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_SERR;
outb(reason, NMI_REASON_PORT);
}
+NOKPROBE_SYMBOL(pci_serr_error);
-static __kprobes void
+static void
io_check_error(unsigned char reason, struct pt_regs *regs)
{
unsigned long i;
@@ -258,8 +271,9 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
reason &= ~NMI_REASON_CLEAR_IOCHK;
outb(reason, NMI_REASON_PORT);
}
+NOKPROBE_SYMBOL(io_check_error);
-static __kprobes void
+static void
unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
{
int handled;
@@ -287,11 +301,12 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
pr_emerg("Dazed and confused, but trying to continue\n");
}
+NOKPROBE_SYMBOL(unknown_nmi_error);
static DEFINE_PER_CPU(bool, swallow_nmi);
static DEFINE_PER_CPU(unsigned long, last_nmi_rip);
-static __kprobes void default_do_nmi(struct pt_regs *regs)
+static void default_do_nmi(struct pt_regs *regs)
{
unsigned char reason = 0;
int handled;
@@ -390,6 +405,7 @@ static __kprobes void default_do_nmi(struct pt_regs *regs)
else
unknown_nmi_error(reason, regs);
}
+NOKPROBE_SYMBOL(default_do_nmi);
/*
* NMIs can hit breakpoints which will cause it to lose its
@@ -509,7 +525,7 @@ static inline void nmi_nesting_postprocess(void)
}
#endif
-dotraplinkage notrace __kprobes void
+dotraplinkage notrace void
do_nmi(struct pt_regs *regs, long error_code)
{
nmi_nesting_preprocess(regs);
@@ -526,6 +542,7 @@ do_nmi(struct pt_regs *regs, long error_code)
/* On i386, may loop back to preprocess */
nmi_nesting_postprocess();
}
+NOKPROBE_SYMBOL(do_nmi);
void stop_nmi(void)
{
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 1b10af835c3..548d25f00c9 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -23,6 +23,7 @@
#include <linux/efi.h>
#include <linux/bcd.h>
#include <linux/highmem.h>
+#include <linux/kprobes.h>
#include <asm/bug.h>
#include <asm/paravirt.h>
@@ -389,6 +390,11 @@ __visible struct pv_cpu_ops pv_cpu_ops = {
.end_context_switch = paravirt_nop,
};
+/* At this point, native_get/set_debugreg has real function entries */
+NOKPROBE_SYMBOL(native_get_debugreg);
+NOKPROBE_SYMBOL(native_set_debugreg);
+NOKPROBE_SYMBOL(native_load_idt);
+
struct pv_apic_ops pv_apic_ops = {
#ifdef CONFIG_X86_LOCAL_APIC
.startup_ipi_hook = paravirt_nop,
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
index 3f08f34f93e..a1da6737ba5 100644
--- a/arch/x86/kernel/paravirt_patch_64.c
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -6,7 +6,6 @@ DEF_NATIVE(pv_irq_ops, irq_disable, "cli");
DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
DEF_NATIVE(pv_irq_ops, restore_fl, "pushq %rdi; popfq");
DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax");
-DEF_NATIVE(pv_cpu_ops, iret, "iretq");
DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax");
DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax");
DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3");
@@ -50,7 +49,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
PATCH_SITE(pv_irq_ops, save_fl);
PATCH_SITE(pv_irq_ops, irq_enable);
PATCH_SITE(pv_irq_ops, irq_disable);
- PATCH_SITE(pv_cpu_ops, iret);
PATCH_SITE(pv_cpu_ops, irq_enable_sysexit);
PATCH_SITE(pv_cpu_ops, usergs_sysret32);
PATCH_SITE(pv_cpu_ops, usergs_sysret64);
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 299d49302e7..0497f719977 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -1207,23 +1207,31 @@ error:
return ret;
}
-static inline int __init determine_tce_table_size(u64 ram)
+static inline int __init determine_tce_table_size(void)
{
int ret;
if (specified_table_size != TCE_TABLE_SIZE_UNSPECIFIED)
return specified_table_size;
- /*
- * Table sizes are from 0 to 7 (TCE_TABLE_SIZE_64K to
- * TCE_TABLE_SIZE_8M). Table size 0 has 8K entries and each
- * larger table size has twice as many entries, so shift the
- * max ram address by 13 to divide by 8K and then look at the
- * order of the result to choose between 0-7.
- */
- ret = get_order(ram >> 13);
- if (ret > TCE_TABLE_SIZE_8M)
+ if (is_kdump_kernel() && saved_max_pfn) {
+ /*
+ * Table sizes are from 0 to 7 (TCE_TABLE_SIZE_64K to
+ * TCE_TABLE_SIZE_8M). Table size 0 has 8K entries and each
+ * larger table size has twice as many entries, so shift the
+ * max ram address by 13 to divide by 8K and then look at the
+ * order of the result to choose between 0-7.
+ */
+ ret = get_order((saved_max_pfn * PAGE_SIZE) >> 13);
+ if (ret > TCE_TABLE_SIZE_8M)
+ ret = TCE_TABLE_SIZE_8M;
+ } else {
+ /*
+ * Use 8M by default (suggested by Muli) if it's not
+ * kdump kernel and saved_max_pfn isn't set.
+ */
ret = TCE_TABLE_SIZE_8M;
+ }
return ret;
}
@@ -1418,8 +1426,7 @@ int __init detect_calgary(void)
return -ENOMEM;
}
- specified_table_size = determine_tce_table_size((is_kdump_kernel() ?
- saved_max_pfn : max_pfn) * PAGE_SIZE);
+ specified_table_size = determine_tce_table_size();
for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) {
struct calgary_bus_info *info = &bus_info[bus];
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 872079a67e4..a25e202bb31 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -97,11 +97,18 @@ void *dma_generic_alloc_coherent(struct device *dev, size_t size,
dma_mask = dma_alloc_coherent_mask(dev, flag);
- flag |= __GFP_ZERO;
+ flag &= ~__GFP_ZERO;
again:
page = NULL;
- if (!(flag & GFP_ATOMIC))
+ /* CMA can be used only in the context which permits sleeping */
+ if (flag & __GFP_WAIT) {
page = dma_alloc_from_contiguous(dev, count, get_order(size));
+ if (page && page_to_phys(page) + size > dma_mask) {
+ dma_release_from_contiguous(dev, page, count);
+ page = NULL;
+ }
+ }
+ /* fallback */
if (!page)
page = alloc_pages_node(dev_to_node(dev), flag, get_order(size));
if (!page)
@@ -118,7 +125,7 @@ again:
return NULL;
}
-
+ memset(page_address(page), 0, size);
*dma_addr = addr;
return page_address(page);
}
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index 871be4a84c7..da15918d1c8 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -3,7 +3,6 @@
#include <linux/dma-mapping.h>
#include <linux/scatterlist.h>
#include <linux/string.h>
-#include <linux/init.h>
#include <linux/gfp.h>
#include <linux/pci.h>
#include <linux/mm.h>
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index 6c483ba98b9..77dd0ad58be 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -14,7 +14,7 @@
#include <asm/iommu_table.h>
int swiotlb __read_mostly;
-static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
+void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
dma_addr_t *dma_handle, gfp_t flags,
struct dma_attrs *attrs)
{
@@ -28,11 +28,14 @@ static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
return swiotlb_alloc_coherent(hwdev, size, dma_handle, flags);
}
-static void x86_swiotlb_free_coherent(struct device *dev, size_t size,
+void x86_swiotlb_free_coherent(struct device *dev, size_t size,
void *vaddr, dma_addr_t dma_addr,
struct dma_attrs *attrs)
{
- swiotlb_free_coherent(dev, size, vaddr, dma_addr);
+ if (is_swiotlb_buffer(dma_to_phys(dev, dma_addr)))
+ swiotlb_free_coherent(dev, size, vaddr, dma_addr);
+ else
+ dma_generic_free_coherent(dev, size, vaddr, dma_addr, attrs);
}
static struct dma_map_ops swiotlb_dma_ops = {
diff --git a/arch/x86/kernel/preempt.S b/arch/x86/kernel/preempt.S
new file mode 100644
index 00000000000..ca7f0d58a87
--- /dev/null
+++ b/arch/x86/kernel/preempt.S
@@ -0,0 +1,25 @@
+
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+#include <asm/asm.h>
+#include <asm/calling.h>
+
+ENTRY(___preempt_schedule)
+ CFI_STARTPROC
+ SAVE_ALL
+ call preempt_schedule
+ RESTORE_ALL
+ ret
+ CFI_ENDPROC
+
+#ifdef CONFIG_CONTEXT_TRACKING
+
+ENTRY(___preempt_schedule_context)
+ CFI_STARTPROC
+ SAVE_ALL
+ call preempt_schedule_context
+ RESTORE_ALL
+ ret
+ CFI_ENDPROC
+
+#endif
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index c83516be105..4505e2a950d 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -298,10 +298,7 @@ void arch_cpu_idle_dead(void)
*/
void arch_cpu_idle(void)
{
- if (cpuidle_idle_call())
- x86_idle();
- else
- local_irq_enable();
+ x86_idle();
}
/*
@@ -391,9 +388,9 @@ static void amd_e400_idle(void)
* The switch back from broadcast mode needs to be
* called with interrupts disabled.
*/
- local_irq_disable();
- clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
- local_irq_enable();
+ local_irq_disable();
+ clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
+ local_irq_enable();
} else
default_idle();
}
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 884f98f6935..7bc86bbe748 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -24,7 +24,6 @@
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/reboot.h>
-#include <linux/init.h>
#include <linux/mc146818rtc.h>
#include <linux/module.h>
#include <linux/kallsyms.h>
@@ -153,7 +152,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
childregs->orig_ax = -1;
childregs->cs = __KERNEL_CS | get_kernel_rpl();
childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED;
- p->fpu_counter = 0;
+ p->thread.fpu_counter = 0;
p->thread.io_bitmap_ptr = NULL;
memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
return 0;
@@ -166,7 +165,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
p->thread.ip = (unsigned long) ret_from_fork;
task_user_gs(p) = get_user_gs(current_pt_regs());
- p->fpu_counter = 0;
+ p->thread.fpu_counter = 0;
p->thread.io_bitmap_ptr = NULL;
tsk = current;
err = -ENOMEM;
@@ -292,6 +291,14 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
set_iopl_mask(next->iopl);
/*
+ * If it were not for PREEMPT_ACTIVE we could guarantee that the
+ * preempt_count of all tasks was equal here and this would not be
+ * needed.
+ */
+ task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count);
+ this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count);
+
+ /*
* Now maybe handle debug registers and/or IO bitmaps
*/
if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV ||
@@ -307,6 +314,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
*/
arch_end_context_switch(next_p);
+ this_cpu_write(kernel_stack,
+ (unsigned long)task_stack_page(next_p) +
+ THREAD_SIZE - KERNEL_STACK_OFFSET);
+
/*
* Restore %gs if needed (which is common)
*/
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index bb1dc51bab0..ca5b02d405c 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -52,7 +52,7 @@
asmlinkage extern void ret_from_fork(void);
-asmlinkage DEFINE_PER_CPU(unsigned long, old_rsp);
+__visible DEFINE_PER_CPU(unsigned long, old_rsp);
/* Prints also some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs, int all)
@@ -63,7 +63,7 @@ void __show_regs(struct pt_regs *regs, int all)
unsigned int ds, cs, es;
printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
- printk_address(regs->ip, 1);
+ printk_address(regs->ip);
printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss,
regs->sp, regs->flags);
printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
@@ -163,7 +163,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
p->thread.sp = (unsigned long) childregs;
p->thread.usersp = me->thread.usersp;
set_tsk_thread_flag(p, TIF_FORK);
- p->fpu_counter = 0;
+ p->thread.fpu_counter = 0;
p->thread.io_bitmap_ptr = NULL;
savesegment(gs, p->thread.gsindex);
@@ -363,6 +363,14 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
this_cpu_write(old_rsp, next->usersp);
this_cpu_write(current_task, next_p);
+ /*
+ * If it were not for PREEMPT_ACTIVE we could guarantee that the
+ * preempt_count of all tasks was equal here and this would not be
+ * needed.
+ */
+ task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count);
+ this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count);
+
this_cpu_write(kernel_stack,
(unsigned long)task_stack_page(next_p) +
THREAD_SIZE - KERNEL_STACK_OFFSET);
@@ -405,12 +413,11 @@ void set_personality_ia32(bool x32)
set_thread_flag(TIF_ADDR32);
/* Mark the associated mm as containing 32-bit tasks. */
- if (current->mm)
- current->mm->context.ia32_compat = 1;
-
if (x32) {
clear_thread_flag(TIF_IA32);
set_thread_flag(TIF_X32);
+ if (current->mm)
+ current->mm->context.ia32_compat = TIF_X32;
current->personality &= ~READ_IMPLIES_EXEC;
/* is_compat_task() uses the presence of the x32
syscall bit flag to determine compat status */
@@ -418,6 +425,8 @@ void set_personality_ia32(bool x32)
} else {
set_thread_flag(TIF_IA32);
clear_thread_flag(TIF_X32);
+ if (current->mm)
+ current->mm->context.ia32_compat = TIF_IA32;
current->personality |= force_personality32;
/* Prepare the first "return" to user space */
current_thread_info()->status |= TS_COMPAT;
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 7461f50d5bb..678c0ada3b3 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -184,14 +184,14 @@ unsigned long kernel_stack_pointer(struct pt_regs *regs)
{
unsigned long context = (unsigned long)regs & ~(THREAD_SIZE - 1);
unsigned long sp = (unsigned long)&regs->sp;
- struct thread_info *tinfo;
+ u32 *prev_esp;
if (context == (sp & ~(THREAD_SIZE - 1)))
return sp;
- tinfo = (struct thread_info *)context;
- if (tinfo->previous_esp)
- return tinfo->previous_esp;
+ prev_esp = (u32 *)(context);
+ if (prev_esp)
+ return (unsigned long)prev_esp;
return (unsigned long)regs;
}
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index a16bae3f83b..2f355d229a5 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -43,6 +43,14 @@ unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)
return pv_tsc_khz;
}
+void pvclock_touch_watchdogs(void)
+{
+ touch_softlockup_watchdog_sync();
+ clocksource_touch_watchdog();
+ rcu_cpu_stall_reset();
+ reset_hung_task_detector();
+}
+
static atomic64_t last_value = ATOMIC64_INIT(0);
void pvclock_resume(void)
@@ -74,6 +82,11 @@ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
version = __pvclock_read_cycles(src, &ret, &flags);
} while ((src->version & 1) || version != src->version);
+ if (unlikely((flags & PVCLOCK_GUEST_STOPPED) != 0)) {
+ src->flags &= ~PVCLOCK_GUEST_STOPPED;
+ pvclock_touch_watchdogs();
+ }
+
if ((valid_flags & PVCLOCK_TSC_STABLE_BIT) &&
(flags & PVCLOCK_TSC_STABLE_BIT))
return ret;
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 04ee1e2e4c0..ff898bbf579 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -529,7 +529,7 @@ static void quirk_amd_nb_node(struct pci_dev *dev)
return;
pci_read_config_dword(nb_ht, 0x60, &val);
- node = val & 7;
+ node = pcibus_to_node(dev->bus) | (val & 7);
/*
* Some hardware may return an invalid node ID,
* so check it first:
@@ -571,3 +571,40 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F5,
quirk_amd_nb_node);
#endif
+
+#ifdef CONFIG_PCI
+/*
+ * Processor does not ensure DRAM scrub read/write sequence
+ * is atomic wrt accesses to CC6 save state area. Therefore
+ * if a concurrent scrub read/write access is to same address
+ * the entry may appear as if it is not written. This quirk
+ * applies to Fam16h models 00h-0Fh
+ *
+ * See "Revision Guide" for AMD F16h models 00h-0fh,
+ * document 51810 rev. 3.04, Nov 2013
+ */
+static void amd_disable_seq_and_redirect_scrub(struct pci_dev *dev)
+{
+ u32 val;
+
+ /*
+ * Suggested workaround:
+ * set D18F3x58[4:0] = 00h and set D18F3x5C[0] = 0b
+ */
+ pci_read_config_dword(dev, 0x58, &val);
+ if (val & 0x1F) {
+ val &= ~(0x1F);
+ pci_write_config_dword(dev, 0x58, val);
+ }
+
+ pci_read_config_dword(dev, 0x5C, &val);
+ if (val & BIT(0)) {
+ val &= ~BIT(0);
+ pci_write_config_dword(dev, 0x5c, val);
+ }
+}
+
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3,
+ amd_disable_seq_and_redirect_scrub);
+
+#endif
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index e643e744e4d..52b1157c53e 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -61,7 +61,7 @@ static int __init set_bios_reboot(const struct dmi_system_id *d)
if (reboot_type != BOOT_BIOS) {
reboot_type = BOOT_BIOS;
pr_info("%s series board detected. Selecting %s-method for reboots.\n",
- "BIOS", d->ident);
+ d->ident, "BIOS");
}
return 0;
}
@@ -114,10 +114,10 @@ EXPORT_SYMBOL(machine_real_restart);
*/
static int __init set_pci_reboot(const struct dmi_system_id *d)
{
- if (reboot_type != BOOT_CF9) {
- reboot_type = BOOT_CF9;
+ if (reboot_type != BOOT_CF9_FORCE) {
+ reboot_type = BOOT_CF9_FORCE;
pr_info("%s series board detected. Selecting %s-method for reboots.\n",
- "PCI", d->ident);
+ d->ident, "PCI");
}
return 0;
}
@@ -127,7 +127,7 @@ static int __init set_kbd_reboot(const struct dmi_system_id *d)
if (reboot_type != BOOT_KBD) {
reboot_type = BOOT_KBD;
pr_info("%s series board detected. Selecting %s-method for reboot.\n",
- "KBD", d->ident);
+ d->ident, "KBD");
}
return 0;
}
@@ -136,244 +136,266 @@ static int __init set_kbd_reboot(const struct dmi_system_id *d)
* This is a single dmi_table handling all reboot quirks.
*/
static struct dmi_system_id __initdata reboot_dmi_table[] = {
- { /* Handle problems with rebooting on Dell E520's */
- .callback = set_bios_reboot,
- .ident = "Dell E520",
+
+ /* Acer */
+ { /* Handle reboot issue on Acer Aspire one */
+ .callback = set_kbd_reboot,
+ .ident = "Acer Aspire One A110",
.matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
- DMI_MATCH(DMI_PRODUCT_NAME, "Dell DM061"),
+ DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "AOA110"),
},
},
- { /* Handle problems with rebooting on Dell 1300's */
- .callback = set_bios_reboot,
- .ident = "Dell PowerEdge 1300",
+
+ /* Apple */
+ { /* Handle problems with rebooting on Apple MacBook5 */
+ .callback = set_pci_reboot,
+ .ident = "Apple MacBook5",
.matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
- DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 1300/"),
+ DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "MacBook5"),
},
},
- { /* Handle problems with rebooting on Dell 300's */
- .callback = set_bios_reboot,
- .ident = "Dell PowerEdge 300",
+ { /* Handle problems with rebooting on Apple MacBookPro5 */
+ .callback = set_pci_reboot,
+ .ident = "Apple MacBookPro5",
.matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
- DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 300/"),
+ DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5"),
},
},
- { /* Handle problems with rebooting on Dell Optiplex 745's SFF */
- .callback = set_bios_reboot,
- .ident = "Dell OptiPlex 745",
+ { /* Handle problems with rebooting on Apple Macmini3,1 */
+ .callback = set_pci_reboot,
+ .ident = "Apple Macmini3,1",
.matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
- DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"),
+ DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Macmini3,1"),
},
},
- { /* Handle problems with rebooting on Dell Optiplex 745's DFF */
- .callback = set_bios_reboot,
- .ident = "Dell OptiPlex 745",
+ { /* Handle problems with rebooting on the iMac9,1. */
+ .callback = set_pci_reboot,
+ .ident = "Apple iMac9,1",
.matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
- DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"),
- DMI_MATCH(DMI_BOARD_NAME, "0MM599"),
+ DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"),
},
},
- { /* Handle problems with rebooting on Dell Optiplex 745 with 0KW626 */
+
+ /* ASUS */
+ { /* Handle problems with rebooting on ASUS P4S800 */
.callback = set_bios_reboot,
- .ident = "Dell OptiPlex 745",
+ .ident = "ASUS P4S800",
.matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
- DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"),
- DMI_MATCH(DMI_BOARD_NAME, "0KW626"),
+ DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
+ DMI_MATCH(DMI_BOARD_NAME, "P4S800"),
},
},
- { /* Handle problems with rebooting on Dell Optiplex 330 with 0KP561 */
+
+ /* Certec */
+ { /* Handle problems with rebooting on Certec BPC600 */
+ .callback = set_pci_reboot,
+ .ident = "Certec BPC600",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Certec"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "BPC600"),
+ },
+ },
+
+ /* Dell */
+ { /* Handle problems with rebooting on Dell DXP061 */
.callback = set_bios_reboot,
- .ident = "Dell OptiPlex 330",
+ .ident = "Dell DXP061",
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
- DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 330"),
- DMI_MATCH(DMI_BOARD_NAME, "0KP561"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Dell DXP061"),
},
},
- { /* Handle problems with rebooting on Dell Optiplex 360 with 0T656F */
+ { /* Handle problems with rebooting on Dell E520's */
.callback = set_bios_reboot,
- .ident = "Dell OptiPlex 360",
+ .ident = "Dell E520",
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
- DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 360"),
- DMI_MATCH(DMI_BOARD_NAME, "0T656F"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Dell DM061"),
},
},
- { /* Handle problems with rebooting on Dell OptiPlex 760 with 0G919G */
- .callback = set_bios_reboot,
- .ident = "Dell OptiPlex 760",
+ { /* Handle problems with rebooting on the Latitude E5410. */
+ .callback = set_pci_reboot,
+ .ident = "Dell Latitude E5410",
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
- DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 760"),
- DMI_MATCH(DMI_BOARD_NAME, "0G919G"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E5410"),
},
},
- { /* Handle problems with rebooting on Dell 2400's */
- .callback = set_bios_reboot,
- .ident = "Dell PowerEdge 2400",
+ { /* Handle problems with rebooting on the Latitude E5420. */
+ .callback = set_pci_reboot,
+ .ident = "Dell Latitude E5420",
.matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
- DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2400"),
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E5420"),
},
},
- { /* Handle problems with rebooting on Dell T5400's */
- .callback = set_bios_reboot,
- .ident = "Dell Precision T5400",
+ { /* Handle problems with rebooting on the Latitude E6320. */
+ .callback = set_pci_reboot,
+ .ident = "Dell Latitude E6320",
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
- DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation T5400"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6320"),
},
},
- { /* Handle problems with rebooting on Dell T7400's */
- .callback = set_bios_reboot,
- .ident = "Dell Precision T7400",
+ { /* Handle problems with rebooting on the Latitude E6420. */
+ .callback = set_pci_reboot,
+ .ident = "Dell Latitude E6420",
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
- DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation T7400"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6420"),
},
},
- { /* Handle problems with rebooting on HP laptops */
+ { /* Handle problems with rebooting on Dell Optiplex 330 with 0KP561 */
.callback = set_bios_reboot,
- .ident = "HP Compaq Laptop",
+ .ident = "Dell OptiPlex 330",
.matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
- DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq"),
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 330"),
+ DMI_MATCH(DMI_BOARD_NAME, "0KP561"),
},
},
- { /* Handle problems with rebooting on Dell XPS710 */
+ { /* Handle problems with rebooting on Dell Optiplex 360 with 0T656F */
.callback = set_bios_reboot,
- .ident = "Dell XPS710",
+ .ident = "Dell OptiPlex 360",
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
- DMI_MATCH(DMI_PRODUCT_NAME, "Dell XPS710"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 360"),
+ DMI_MATCH(DMI_BOARD_NAME, "0T656F"),
},
},
- { /* Handle problems with rebooting on Dell DXP061 */
+ { /* Handle problems with rebooting on Dell Optiplex 745's SFF */
.callback = set_bios_reboot,
- .ident = "Dell DXP061",
+ .ident = "Dell OptiPlex 745",
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
- DMI_MATCH(DMI_PRODUCT_NAME, "Dell DXP061"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"),
},
},
- { /* Handle problems with rebooting on Sony VGN-Z540N */
+ { /* Handle problems with rebooting on Dell Optiplex 745's DFF */
.callback = set_bios_reboot,
- .ident = "Sony VGN-Z540N",
+ .ident = "Dell OptiPlex 745",
.matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Sony Corporation"),
- DMI_MATCH(DMI_PRODUCT_NAME, "VGN-Z540N"),
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"),
+ DMI_MATCH(DMI_BOARD_NAME, "0MM599"),
},
},
- { /* Handle problems with rebooting on ASUS P4S800 */
+ { /* Handle problems with rebooting on Dell Optiplex 745 with 0KW626 */
.callback = set_bios_reboot,
- .ident = "ASUS P4S800",
+ .ident = "Dell OptiPlex 745",
.matches = {
- DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
- DMI_MATCH(DMI_BOARD_NAME, "P4S800"),
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"),
+ DMI_MATCH(DMI_BOARD_NAME, "0KW626"),
},
},
-
- { /* Handle reboot issue on Acer Aspire one */
- .callback = set_kbd_reboot,
- .ident = "Acer Aspire One A110",
+ { /* Handle problems with rebooting on Dell OptiPlex 760 with 0G919G */
+ .callback = set_bios_reboot,
+ .ident = "Dell OptiPlex 760",
.matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
- DMI_MATCH(DMI_PRODUCT_NAME, "AOA110"),
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 760"),
+ DMI_MATCH(DMI_BOARD_NAME, "0G919G"),
},
},
- { /* Handle problems with rebooting on Apple MacBook5 */
+ { /* Handle problems with rebooting on the OptiPlex 990. */
.callback = set_pci_reboot,
- .ident = "Apple MacBook5",
+ .ident = "Dell OptiPlex 990",
.matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
- DMI_MATCH(DMI_PRODUCT_NAME, "MacBook5"),
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 990"),
},
},
- { /* Handle problems with rebooting on Apple MacBookPro5 */
- .callback = set_pci_reboot,
- .ident = "Apple MacBookPro5",
+ { /* Handle problems with rebooting on Dell 300's */
+ .callback = set_bios_reboot,
+ .ident = "Dell PowerEdge 300",
.matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
- DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5"),
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 300/"),
},
},
- { /* Handle problems with rebooting on Apple Macmini3,1 */
- .callback = set_pci_reboot,
- .ident = "Apple Macmini3,1",
+ { /* Handle problems with rebooting on Dell 1300's */
+ .callback = set_bios_reboot,
+ .ident = "Dell PowerEdge 1300",
.matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
- DMI_MATCH(DMI_PRODUCT_NAME, "Macmini3,1"),
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 1300/"),
},
},
- { /* Handle problems with rebooting on the iMac9,1. */
- .callback = set_pci_reboot,
- .ident = "Apple iMac9,1",
+ { /* Handle problems with rebooting on Dell 2400's */
+ .callback = set_bios_reboot,
+ .ident = "Dell PowerEdge 2400",
.matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
- DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"),
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2400"),
},
},
- { /* Handle problems with rebooting on the Latitude E6320. */
+ { /* Handle problems with rebooting on the Dell PowerEdge C6100. */
.callback = set_pci_reboot,
- .ident = "Dell Latitude E6320",
+ .ident = "Dell PowerEdge C6100",
.matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
- DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6320"),
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "C6100"),
},
},
- { /* Handle problems with rebooting on the Latitude E5420. */
+ { /* Handle problems with rebooting on the Precision M6600. */
.callback = set_pci_reboot,
- .ident = "Dell Latitude E5420",
+ .ident = "Dell Precision M6600",
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
- DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E5420"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Precision M6600"),
},
},
- { /* Handle problems with rebooting on the Latitude E6420. */
- .callback = set_pci_reboot,
- .ident = "Dell Latitude E6420",
+ { /* Handle problems with rebooting on Dell T5400's */
+ .callback = set_bios_reboot,
+ .ident = "Dell Precision T5400",
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
- DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6420"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation T5400"),
},
},
- { /* Handle problems with rebooting on the OptiPlex 990. */
- .callback = set_pci_reboot,
- .ident = "Dell OptiPlex 990",
+ { /* Handle problems with rebooting on Dell T7400's */
+ .callback = set_bios_reboot,
+ .ident = "Dell Precision T7400",
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
- DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 990"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation T7400"),
},
},
- { /* Handle problems with rebooting on the Precision M6600. */
- .callback = set_pci_reboot,
- .ident = "Dell Precision M6600",
+ { /* Handle problems with rebooting on Dell XPS710 */
+ .callback = set_bios_reboot,
+ .ident = "Dell XPS710",
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
- DMI_MATCH(DMI_PRODUCT_NAME, "Precision M6600"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Dell XPS710"),
},
},
- { /* Handle problems with rebooting on the Dell PowerEdge C6100. */
- .callback = set_pci_reboot,
- .ident = "Dell PowerEdge C6100",
+
+ /* Hewlett-Packard */
+ { /* Handle problems with rebooting on HP laptops */
+ .callback = set_bios_reboot,
+ .ident = "HP Compaq Laptop",
.matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
- DMI_MATCH(DMI_PRODUCT_NAME, "C6100"),
+ DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq"),
},
},
- { /* Some C6100 machines were shipped with vendor being 'Dell'. */
- .callback = set_pci_reboot,
- .ident = "Dell PowerEdge C6100",
+
+ /* Sony */
+ { /* Handle problems with rebooting on Sony VGN-Z540N */
+ .callback = set_bios_reboot,
+ .ident = "Sony VGN-Z540N",
.matches = {
- DMI_MATCH(DMI_SYS_VENDOR, "Dell"),
- DMI_MATCH(DMI_PRODUCT_NAME, "C6100"),
+ DMI_MATCH(DMI_SYS_VENDOR, "Sony Corporation"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "VGN-Z540N"),
},
},
+
{ }
};
@@ -446,17 +468,23 @@ void __attribute__((weak)) mach_reboot_fixups(void)
}
/*
- * Windows compatible x86 hardware expects the following on reboot:
+ * To the best of our knowledge Windows compatible x86 hardware expects
+ * the following on reboot:
*
* 1) If the FADT has the ACPI reboot register flag set, try it
* 2) If still alive, write to the keyboard controller
* 3) If still alive, write to the ACPI reboot register again
* 4) If still alive, write to the keyboard controller again
+ * 5) If still alive, call the EFI runtime service to reboot
+ * 6) If no EFI runtime service, call the BIOS to do a reboot
+ *
+ * We default to following the same pattern. We also have
+ * two other reboot methods: 'triple fault' and 'PCI', which
+ * can be triggered via the reboot= kernel boot option or
+ * via quirks.
*
- * If the machine is still alive at this stage, it gives up. We default to
- * following the same pattern, except that if we're still alive after (4) we'll
- * try to force a triple fault and then cycle between hitting the keyboard
- * controller and doing that
+ * This means that this function can never return, it can misbehave
+ * by not rebooting properly and hanging.
*/
static void native_machine_emergency_restart(void)
{
@@ -477,6 +505,11 @@ static void native_machine_emergency_restart(void)
for (;;) {
/* Could also try the reset bit in the Hammer NB */
switch (reboot_type) {
+ case BOOT_ACPI:
+ acpi_reboot();
+ reboot_type = BOOT_KBD;
+ break;
+
case BOOT_KBD:
mach_reboot_fixups(); /* For board specific fixups */
@@ -490,49 +523,48 @@ static void native_machine_emergency_restart(void)
attempt = 1;
reboot_type = BOOT_ACPI;
} else {
- reboot_type = BOOT_TRIPLE;
+ reboot_type = BOOT_EFI;
}
break;
- case BOOT_TRIPLE:
- load_idt(&no_idt);
- __asm__ __volatile__("int3");
-
- reboot_type = BOOT_KBD;
- break;
-
- case BOOT_BIOS:
- machine_real_restart(MRR_BIOS);
-
- reboot_type = BOOT_KBD;
- break;
-
- case BOOT_ACPI:
- acpi_reboot();
- reboot_type = BOOT_KBD;
- break;
-
case BOOT_EFI:
if (efi_enabled(EFI_RUNTIME_SERVICES))
efi.reset_system(reboot_mode == REBOOT_WARM ?
EFI_RESET_WARM :
EFI_RESET_COLD,
EFI_SUCCESS, 0, NULL);
- reboot_type = BOOT_KBD;
+ reboot_type = BOOT_BIOS;
break;
- case BOOT_CF9:
+ case BOOT_BIOS:
+ machine_real_restart(MRR_BIOS);
+
+ /* We're probably dead after this, but... */
+ reboot_type = BOOT_CF9_SAFE;
+ break;
+
+ case BOOT_CF9_FORCE:
port_cf9_safe = true;
/* Fall through */
- case BOOT_CF9_COND:
+ case BOOT_CF9_SAFE:
if (port_cf9_safe) {
- u8 cf9 = inb(0xcf9) & ~6;
+ u8 reboot_code = reboot_mode == REBOOT_WARM ? 0x06 : 0x0E;
+ u8 cf9 = inb(0xcf9) & ~reboot_code;
outb(cf9|2, 0xcf9); /* Request hard reset */
udelay(50);
- outb(cf9|6, 0xcf9); /* Actually do the reset */
+ /* Actually do the reset */
+ outb(cf9|reboot_code, 0xcf9);
udelay(50);
}
+ reboot_type = BOOT_TRIPLE;
+ break;
+
+ case BOOT_TRIPLE:
+ load_idt(&no_idt);
+ __asm__ __volatile__("int3");
+
+ /* We're probably dead after this, but... */
reboot_type = BOOT_KBD;
break;
}
@@ -542,6 +574,21 @@ static void native_machine_emergency_restart(void)
void native_machine_shutdown(void)
{
/* Stop the cpus and apics */
+#ifdef CONFIG_X86_IO_APIC
+ /*
+ * Disabling IO APIC before local APIC is a workaround for
+ * erratum AVR31 in "Intel Atom Processor C2000 Product Family
+ * Specification Update". In this situation, interrupts that target
+ * a Logical Processor whose Local APIC is either in the process of
+ * being hardware disabled or software disabled are neither delivered
+ * nor discarded. When this erratum occurs, the processor may hang.
+ *
+ * Even without the erratum, it still makes sense to quiet IO APIC
+ * before disabling Local APIC.
+ */
+ disable_IO_APIC();
+#endif
+
#ifdef CONFIG_SMP
/*
* Stop all of the others. Also disable the local irq to
@@ -554,10 +601,6 @@ void native_machine_shutdown(void)
lapic_shutdown();
-#ifdef CONFIG_X86_IO_APIC
- disable_IO_APIC();
-#endif
-
#ifdef CONFIG_HPET_TIMER
hpet_disable();
#endif
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index 0aa29394ed6..ca9622a25e9 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -12,7 +12,7 @@
#include <asm/vsyscall.h>
#include <asm/x86_init.h>
#include <asm/time.h>
-#include <asm/mrst.h>
+#include <asm/intel-mid.h>
#include <asm/rtc.h>
#ifdef CONFIG_X86_32
@@ -189,9 +189,17 @@ static __init int add_rtc_cmos(void)
return 0;
/* Intel MID platforms don't have ioport rtc */
- if (mrst_identify_cpu())
+ if (intel_mid_identify_cpu())
return -ENODEV;
+#ifdef CONFIG_ACPI
+ if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_CMOS_RTC) {
+ /* This warning can likely go away again in a year or two. */
+ pr_info("ACPI: not registering RTC platform device\n");
+ return -ENODEV;
+ }
+#endif
+
platform_device_register(&rtc_device);
dev_info(&rtc_device.dev,
"registered platform RTC device (no PNP device found)\n");
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index f0de6294b95..78a0e629892 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -295,6 +295,8 @@ static void __init reserve_brk(void)
_brk_start = 0;
}
+u64 relocated_ramdisk;
+
#ifdef CONFIG_BLK_DEV_INITRD
static u64 __init get_ramdisk_image(void)
@@ -321,25 +323,24 @@ static void __init relocate_initrd(void)
u64 ramdisk_image = get_ramdisk_image();
u64 ramdisk_size = get_ramdisk_size();
u64 area_size = PAGE_ALIGN(ramdisk_size);
- u64 ramdisk_here;
unsigned long slop, clen, mapaddr;
char *p, *q;
/* We need to move the initrd down into directly mapped mem */
- ramdisk_here = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
- area_size, PAGE_SIZE);
+ relocated_ramdisk = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
+ area_size, PAGE_SIZE);
- if (!ramdisk_here)
+ if (!relocated_ramdisk)
panic("Cannot find place for new RAMDISK of size %lld\n",
- ramdisk_size);
+ ramdisk_size);
/* Note: this includes all the mem currently occupied by
the initrd, we rely on that fact to keep the data intact. */
- memblock_reserve(ramdisk_here, area_size);
- initrd_start = ramdisk_here + PAGE_OFFSET;
+ memblock_reserve(relocated_ramdisk, area_size);
+ initrd_start = relocated_ramdisk + PAGE_OFFSET;
initrd_end = initrd_start + ramdisk_size;
printk(KERN_INFO "Allocated new RAMDISK: [mem %#010llx-%#010llx]\n",
- ramdisk_here, ramdisk_here + ramdisk_size - 1);
+ relocated_ramdisk, relocated_ramdisk + ramdisk_size - 1);
q = (char *)initrd_start;
@@ -363,7 +364,7 @@ static void __init relocate_initrd(void)
printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to"
" [mem %#010llx-%#010llx]\n",
ramdisk_image, ramdisk_image + ramdisk_size - 1,
- ramdisk_here, ramdisk_here + ramdisk_size - 1);
+ relocated_ramdisk, relocated_ramdisk + ramdisk_size - 1);
}
static void __init early_reserve_initrd(void)
@@ -447,6 +448,9 @@ static void __init parse_setup_data(void)
case SETUP_DTB:
add_dtb(pa_data);
break;
+ case SETUP_EFI:
+ parse_efi_setup(pa_data, data_len);
+ break;
default:
break;
}
@@ -824,6 +828,20 @@ static void __init trim_low_memory_range(void)
}
/*
+ * Dump out kernel offset information on panic.
+ */
+static int
+dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p)
+{
+ pr_emerg("Kernel Offset: 0x%lx from 0x%lx "
+ "(relocation range: 0x%lx-0x%lx)\n",
+ (unsigned long)&_text - __START_KERNEL, __START_KERNEL,
+ __START_KERNEL_map, MODULES_VADDR-1);
+
+ return 0;
+}
+
+/*
* Determine if we were loaded by an EFI loader. If so, then we have also been
* passed the efi memmap, systab, etc., so we should use these data structures
* for initialization. Note, the efi init code path is determined by the
@@ -851,7 +869,6 @@ void __init setup_arch(char **cmdline_p)
#ifdef CONFIG_X86_32
memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
- visws_early_detect();
/*
* copy kernel address range established so far and switch
@@ -908,11 +925,11 @@ void __init setup_arch(char **cmdline_p)
#ifdef CONFIG_EFI
if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
"EL32", 4)) {
- set_bit(EFI_BOOT, &x86_efi_facility);
+ set_bit(EFI_BOOT, &efi.flags);
} else if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
"EL64", 4)) {
- set_bit(EFI_BOOT, &x86_efi_facility);
- set_bit(EFI_64BIT, &x86_efi_facility);
+ set_bit(EFI_BOOT, &efi.flags);
+ set_bit(EFI_64BIT, &efi.flags);
}
if (efi_enabled(EFI_BOOT))
@@ -924,8 +941,6 @@ void __init setup_arch(char **cmdline_p)
iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1;
setup_memory_map();
parse_setup_data();
- /* update the e820_saved too */
- e820_reserve_setup_data();
copy_edd();
@@ -987,12 +1002,15 @@ void __init setup_arch(char **cmdline_p)
early_dump_pci_devices();
#endif
+ /* update the e820_saved too */
+ e820_reserve_setup_data();
finish_e820_parsing();
if (efi_enabled(EFI_BOOT))
efi_init();
dmi_scan_machine();
+ dmi_memdev_walk();
dmi_set_dump_stack_arch_desc();
/*
@@ -1101,7 +1119,7 @@ void __init setup_arch(char **cmdline_p)
setup_real_mode();
memblock_set_current_limit(get_max_mapped());
- dma_contiguous_reserve(0);
+ dma_contiguous_reserve(max_pfn_mapped << PAGE_SHIFT);
/*
* NOTE: On x86-32, only from this point on, fixmaps are ready for use.
@@ -1120,8 +1138,6 @@ void __init setup_arch(char **cmdline_p)
acpi_initrd_override((void *)initrd_start, initrd_end - initrd_start);
#endif
- reserve_crashkernel();
-
vsmp_init();
io_delay_init();
@@ -1134,6 +1150,13 @@ void __init setup_arch(char **cmdline_p)
early_acpi_boot_init();
initmem_init();
+
+ /*
+ * Reserve memory for crash kernel after SRAT is parsed so that it
+ * won't consume hotpluggable memory.
+ */
+ reserve_crashkernel();
+
memblock_find_dma_reserve();
#ifdef CONFIG_KVM_GUEST
@@ -1215,14 +1238,8 @@ void __init setup_arch(char **cmdline_p)
register_refined_jiffies(CLOCK_TICK_RATE);
#ifdef CONFIG_EFI
- /* Once setup is done above, unmap the EFI memory map on
- * mismatched firmware/kernel archtectures since there is no
- * support for runtime services.
- */
- if (efi_enabled(EFI_BOOT) && !efi_is_native()) {
- pr_info("efi: Setup done, disabling due to 32/64-bit mismatch\n");
- efi_unmap_memmap();
- }
+ if (efi_enabled(EFI_BOOT))
+ efi_apply_memmap_quirks();
#endif
}
@@ -1242,3 +1259,15 @@ void __init i386_reserve_resources(void)
}
#endif /* CONFIG_X86_32 */
+
+static struct notifier_block kernel_offset_notifier = {
+ .notifier_call = dump_kernel_offset
+};
+
+static int __init register_kernel_offset_dumper(void)
+{
+ atomic_notifier_chain_register(&panic_notifier_list,
+ &kernel_offset_notifier);
+ return 0;
+}
+__initcall(register_kernel_offset_dumper);
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 9e5de6813e1..2851d63c120 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -298,7 +298,8 @@ __setup_frame(int sig, struct ksignal *ksig, sigset_t *set,
}
if (current->mm->context.vdso)
- restorer = VDSO32_SYMBOL(current->mm->context.vdso, sigreturn);
+ restorer = current->mm->context.vdso +
+ selected_vdso32->sym___kernel_sigreturn;
else
restorer = &frame->retcode;
if (ksig->ka.sa.sa_flags & SA_RESTORER)
@@ -361,7 +362,8 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig,
save_altstack_ex(&frame->uc.uc_stack, regs->sp);
/* Set up to return from userspace. */
- restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn);
+ restorer = current->mm->context.vdso +
+ selected_vdso32->sym___kernel_rt_sigreturn;
if (ksig->ka.sa.sa_flags & SA_RESTORER)
restorer = ksig->ka.sa.sa_restorer;
put_user_ex(restorer, &frame->pretcode);
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 7c3a5a61f2e..be8e1bde07a 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -168,7 +168,7 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
* this function calls the 'stop' function on all other CPUs in the system.
*/
-asmlinkage void smp_reboot_interrupt(void)
+asmlinkage __visible void smp_reboot_interrupt(void)
{
ack_APIC_irq();
irq_enter();
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 6cacab671f9..5492798930e 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -73,36 +73,14 @@
#include <asm/setup.h>
#include <asm/uv/uv.h>
#include <linux/mc146818rtc.h>
-
#include <asm/smpboot_hooks.h>
#include <asm/i8259.h>
-
#include <asm/realmode.h>
+#include <asm/misc.h>
/* State of each CPU */
DEFINE_PER_CPU(int, cpu_state) = { 0 };
-#ifdef CONFIG_HOTPLUG_CPU
-/*
- * We need this for trampoline_base protection from concurrent accesses when
- * off- and onlining cores wildly.
- */
-static DEFINE_MUTEX(x86_cpu_hotplug_driver_mutex);
-
-void cpu_hotplug_driver_lock(void)
-{
- mutex_lock(&x86_cpu_hotplug_driver_mutex);
-}
-
-void cpu_hotplug_driver_unlock(void)
-{
- mutex_unlock(&x86_cpu_hotplug_driver_mutex);
-}
-
-ssize_t arch_cpu_probe(const char *buf, size_t count) { return -1; }
-ssize_t arch_cpu_release(const char *buf, size_t count) { return -1; }
-#endif
-
/* Number of siblings per CPU package */
int smp_num_siblings = 1;
EXPORT_SYMBOL(smp_num_siblings);
@@ -144,8 +122,9 @@ static void smp_callin(void)
* Since CPU0 is not wakened up by INIT, it doesn't wait for the IPI.
*/
cpuid = smp_processor_id();
- if (apic->wait_for_init_deassert && cpuid != 0)
- apic->wait_for_init_deassert(&init_deasserted);
+ if (apic->wait_for_init_deassert && cpuid)
+ while (!atomic_read(&init_deasserted))
+ cpu_relax();
/*
* (This works even if the APIC is not enabled.)
@@ -265,6 +244,13 @@ static void notrace start_secondary(void *unused)
check_tsc_sync_target();
/*
+ * Enable the espfix hack for this CPU
+ */
+#ifdef CONFIG_X86_ESPFIX64
+ init_espfix_ap();
+#endif
+
+ /*
* We need to hold vector_lock so there the set of online cpus
* does not change while we are assigning vectors to cpus. Holding
* this lock ensures we don't half assign or remove an irq from a cpu.
@@ -648,22 +634,46 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
return (send_status | accept_status);
}
+void smp_announce(void)
+{
+ int num_nodes = num_online_nodes();
+
+ printk(KERN_INFO "x86: Booted up %d node%s, %d CPUs\n",
+ num_nodes, (num_nodes > 1 ? "s" : ""), num_online_cpus());
+}
+
/* reduce the number of lines printed when booting a large cpu count system */
static void announce_cpu(int cpu, int apicid)
{
static int current_node = -1;
int node = early_cpu_to_node(cpu);
- int max_cpu_present = find_last_bit(cpumask_bits(cpu_present_mask), NR_CPUS);
+ static int width, node_width;
+
+ if (!width)
+ width = num_digits(num_possible_cpus()) + 1; /* + '#' sign */
+
+ if (!node_width)
+ node_width = num_digits(num_possible_nodes()) + 1; /* + '#' */
+
+ if (cpu == 1)
+ printk(KERN_INFO "x86: Booting SMP configuration:\n");
if (system_state == SYSTEM_BOOTING) {
if (node != current_node) {
if (current_node > (-1))
- pr_cont(" OK\n");
+ pr_cont("\n");
current_node = node;
- pr_info("Booting Node %3d, Processors ", node);
+
+ printk(KERN_INFO ".... node %*s#%d, CPUs: ",
+ node_width - num_digits(node), " ", node);
}
- pr_cont(" #%4d%s", cpu, cpu == max_cpu_present ? " OK\n" : "");
- return;
+
+ /* Add padding for the BSP */
+ if (cpu == 1)
+ pr_cont("%*s", width + 1, " ");
+
+ pr_cont("%*s#%d", width - num_digits(cpu), " ", cpu);
+
} else
pr_info("Booting Node %d Processor %d APIC 0x%x\n",
node, cpu, apicid);
@@ -699,11 +709,15 @@ wakeup_cpu_via_init_nmi(int cpu, unsigned long start_ip, int apicid,
int id;
int boot_error;
+ preempt_disable();
+
/*
* Wake up AP by INIT, INIT, STARTUP sequence.
*/
- if (cpu)
- return wakeup_secondary_cpu_via_init(apicid, start_ip);
+ if (cpu) {
+ boot_error = wakeup_secondary_cpu_via_init(apicid, start_ip);
+ goto out;
+ }
/*
* Wake up BSP by nmi.
@@ -723,6 +737,9 @@ wakeup_cpu_via_init_nmi(int cpu, unsigned long start_ip, int apicid,
boot_error = wakeup_secondary_cpu_via_nmi(id, start_ip);
}
+out:
+ preempt_enable();
+
return boot_error;
}
@@ -756,10 +773,10 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
#else
clear_tsk_thread_flag(idle, TIF_FORK);
initial_gs = per_cpu_offset(cpu);
+#endif
per_cpu(kernel_stack, cpu) =
(unsigned long)task_stack_page(idle) -
KERNEL_STACK_OFFSET + THREAD_SIZE;
-#endif
early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
initial_code = (unsigned long)start_secondary;
stack_start = idle->thread.sp;
@@ -849,9 +866,6 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
/* was set by cpu_init() */
cpumask_clear_cpu(cpu, cpu_initialized_mask);
-
- set_cpu_present(cpu, false);
- per_cpu(x86_cpu_to_apicid, cpu) = BAD_APICID;
}
/* mark "stuck" area as not stuck */
@@ -911,7 +925,7 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
err = do_boot_cpu(apicid, cpu, tidle);
if (err) {
- pr_debug("do_boot_cpu failed %d\n", err);
+ pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu);
return -EIO;
}
@@ -1310,6 +1324,12 @@ void cpu_disable_common(void)
int native_cpu_disable(void)
{
+ int ret;
+
+ ret = check_irq_vectors_for_cpu_disable();
+ if (ret)
+ return ret;
+
clear_local_APIC();
cpu_disable_common();
@@ -1371,7 +1391,7 @@ static inline void mwait_play_dead(void)
if (!this_cpu_has(X86_FEATURE_MWAIT))
return;
- if (!this_cpu_has(X86_FEATURE_CLFLSH))
+ if (!this_cpu_has(X86_FEATURE_CLFLUSH))
return;
if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF)
return;
@@ -1415,7 +1435,9 @@ static inline void mwait_play_dead(void)
* The WBINVD is insufficient due to the spurious-wakeup
* case where we return around the loop.
*/
+ mb();
clflush(mwait_ptr);
+ mb();
__monitor(mwait_ptr, 0, 0);
mb();
__mwait(eax, 0);
diff --git a/arch/x86/kernel/sysfb_simplefb.c b/arch/x86/kernel/sysfb_simplefb.c
index 22513e96b01..86179d40989 100644
--- a/arch/x86/kernel/sysfb_simplefb.c
+++ b/arch/x86/kernel/sysfb_simplefb.c
@@ -72,14 +72,14 @@ __init int create_simplefb(const struct screen_info *si,
* the part that is occupied by the framebuffer */
len = mode->height * mode->stride;
len = PAGE_ALIGN(len);
- if (len > si->lfb_size << 16) {
+ if (len > (u64)si->lfb_size << 16) {
printk(KERN_WARNING "sysfb: VRAM smaller than advertised\n");
return -EINVAL;
}
/* setup IORESOURCE_MEM as framebuffer memory */
memset(&res, 0, sizeof(res));
- res.flags = IORESOURCE_MEM;
+ res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
res.name = simplefb_resname;
res.start = si->lfb_base;
res.end = si->lfb_base + len - 1;
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index 24d3c91e981..bf7ef5ce29d 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -23,7 +23,7 @@
#include <asm/time.h>
#ifdef CONFIG_X86_64
-DEFINE_VVAR(volatile unsigned long, jiffies) = INITIAL_JIFFIES;
+__visible DEFINE_VVAR(volatile unsigned long, jiffies) = INITIAL_JIFFIES;
#endif
unsigned long profile_pc(struct pt_regs *regs)
@@ -62,7 +62,7 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id)
static struct irqaction irq0 = {
.handler = timer_interrupt,
- .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER,
+ .flags = IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER,
.name = "timer"
};
diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c
index 6e60b5fe224..649b010da00 100644
--- a/arch/x86/kernel/topology.c
+++ b/arch/x86/kernel/topology.c
@@ -65,29 +65,32 @@ int __ref _debug_hotplug_cpu(int cpu, int action)
if (!cpu_is_hotpluggable(cpu))
return -EINVAL;
- cpu_hotplug_driver_lock();
+ lock_device_hotplug();
switch (action) {
case 0:
ret = cpu_down(cpu);
if (!ret) {
pr_info("CPU %u is now offline\n", cpu);
+ dev->offline = true;
kobject_uevent(&dev->kobj, KOBJ_OFFLINE);
} else
pr_debug("Can't offline CPU%d.\n", cpu);
break;
case 1:
ret = cpu_up(cpu);
- if (!ret)
+ if (!ret) {
+ dev->offline = false;
kobject_uevent(&dev->kobj, KOBJ_ONLINE);
- else
+ } else {
pr_debug("Can't online CPU%d.\n", cpu);
+ }
break;
default:
ret = -EINVAL;
}
- cpu_hotplug_driver_unlock();
+ unlock_device_hotplug();
return ret;
}
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 8c8093b146c..0d0e922fafc 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -23,6 +23,7 @@
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/ptrace.h>
+#include <linux/uprobes.h>
#include <linux/string.h>
#include <linux/delay.h>
#include <linux/errno.h>
@@ -88,7 +89,7 @@ static inline void conditional_sti(struct pt_regs *regs)
static inline void preempt_conditional_sti(struct pt_regs *regs)
{
- inc_preempt_count();
+ preempt_count_inc();
if (regs->flags & X86_EFLAGS_IF)
local_irq_enable();
}
@@ -103,10 +104,10 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
{
if (regs->flags & X86_EFLAGS_IF)
local_irq_disable();
- dec_preempt_count();
+ preempt_count_dec();
}
-static int __kprobes
+static nokprobe_inline int
do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
struct pt_regs *regs, long error_code)
{
@@ -136,7 +137,38 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
return -1;
}
-static void __kprobes
+static siginfo_t *fill_trap_info(struct pt_regs *regs, int signr, int trapnr,
+ siginfo_t *info)
+{
+ unsigned long siaddr;
+ int sicode;
+
+ switch (trapnr) {
+ default:
+ return SEND_SIG_PRIV;
+
+ case X86_TRAP_DE:
+ sicode = FPE_INTDIV;
+ siaddr = uprobe_get_trap_addr(regs);
+ break;
+ case X86_TRAP_UD:
+ sicode = ILL_ILLOPN;
+ siaddr = uprobe_get_trap_addr(regs);
+ break;
+ case X86_TRAP_AC:
+ sicode = BUS_ADRALN;
+ siaddr = 0;
+ break;
+ }
+
+ info->si_signo = signr;
+ info->si_errno = 0;
+ info->si_code = sicode;
+ info->si_addr = (void __user *)siaddr;
+ return info;
+}
+
+static void
do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
long error_code, siginfo_t *info)
{
@@ -168,64 +200,43 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
}
#endif
- if (info)
- force_sig_info(signr, info, tsk);
- else
- force_sig(signr, tsk);
+ force_sig_info(signr, info ?: SEND_SIG_PRIV, tsk);
}
+NOKPROBE_SYMBOL(do_trap);
-#define DO_ERROR(trapnr, signr, str, name) \
-dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \
-{ \
- enum ctx_state prev_state; \
- \
- prev_state = exception_enter(); \
- if (notify_die(DIE_TRAP, str, regs, error_code, \
- trapnr, signr) == NOTIFY_STOP) { \
- exception_exit(prev_state); \
- return; \
- } \
- conditional_sti(regs); \
- do_trap(trapnr, signr, str, regs, error_code, NULL); \
- exception_exit(prev_state); \
+static void do_error_trap(struct pt_regs *regs, long error_code, char *str,
+ unsigned long trapnr, int signr)
+{
+ enum ctx_state prev_state = exception_enter();
+ siginfo_t info;
+
+ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) !=
+ NOTIFY_STOP) {
+ conditional_sti(regs);
+ do_trap(trapnr, signr, str, regs, error_code,
+ fill_trap_info(regs, signr, trapnr, &info));
+ }
+
+ exception_exit(prev_state);
}
-#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
+#define DO_ERROR(trapnr, signr, str, name) \
dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \
{ \
- siginfo_t info; \
- enum ctx_state prev_state; \
- \
- info.si_signo = signr; \
- info.si_errno = 0; \
- info.si_code = sicode; \
- info.si_addr = (void __user *)siaddr; \
- prev_state = exception_enter(); \
- if (notify_die(DIE_TRAP, str, regs, error_code, \
- trapnr, signr) == NOTIFY_STOP) { \
- exception_exit(prev_state); \
- return; \
- } \
- conditional_sti(regs); \
- do_trap(trapnr, signr, str, regs, error_code, &info); \
- exception_exit(prev_state); \
+ do_error_trap(regs, error_code, str, trapnr, signr); \
}
-DO_ERROR_INFO(X86_TRAP_DE, SIGFPE, "divide error", divide_error, FPE_INTDIV,
- regs->ip)
-DO_ERROR(X86_TRAP_OF, SIGSEGV, "overflow", overflow)
-DO_ERROR(X86_TRAP_BR, SIGSEGV, "bounds", bounds)
-DO_ERROR_INFO(X86_TRAP_UD, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN,
- regs->ip)
-DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun",
- coprocessor_segment_overrun)
-DO_ERROR(X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS)
-DO_ERROR(X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present)
+DO_ERROR(X86_TRAP_DE, SIGFPE, "divide error", divide_error)
+DO_ERROR(X86_TRAP_OF, SIGSEGV, "overflow", overflow)
+DO_ERROR(X86_TRAP_BR, SIGSEGV, "bounds", bounds)
+DO_ERROR(X86_TRAP_UD, SIGILL, "invalid opcode", invalid_op)
+DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun",coprocessor_segment_overrun)
+DO_ERROR(X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS)
+DO_ERROR(X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present)
#ifdef CONFIG_X86_32
-DO_ERROR(X86_TRAP_SS, SIGBUS, "stack segment", stack_segment)
+DO_ERROR(X86_TRAP_SS, SIGBUS, "stack segment", stack_segment)
#endif
-DO_ERROR_INFO(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check,
- BUS_ADRALN, 0)
+DO_ERROR(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check)
#ifdef CONFIG_X86_64
/* Runs on IST stack */
@@ -267,7 +278,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
}
#endif
-dotraplinkage void __kprobes
+dotraplinkage void
do_general_protection(struct pt_regs *regs, long error_code)
{
struct task_struct *tsk;
@@ -309,13 +320,14 @@ do_general_protection(struct pt_regs *regs, long error_code)
pr_cont("\n");
}
- force_sig(SIGSEGV, tsk);
+ force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk);
exit:
exception_exit(prev_state);
}
+NOKPROBE_SYMBOL(do_general_protection);
/* May run on IST stack. */
-dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_code)
+dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
{
enum ctx_state prev_state;
@@ -338,6 +350,11 @@ dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_co
goto exit;
#endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */
+#ifdef CONFIG_KPROBES
+ if (kprobe_int3_handler(regs))
+ goto exit;
+#endif
+
if (notify_die(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
SIGTRAP) == NOTIFY_STOP)
goto exit;
@@ -354,6 +371,7 @@ dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_co
exit:
exception_exit(prev_state);
}
+NOKPROBE_SYMBOL(do_int3);
#ifdef CONFIG_X86_64
/*
@@ -361,7 +379,7 @@ exit:
* for scheduling or signal handling. The actual stack switch is done in
* entry.S
*/
-asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
+asmlinkage __visible struct pt_regs *sync_regs(struct pt_regs *eregs)
{
struct pt_regs *regs = eregs;
/* Did already sync */
@@ -380,6 +398,7 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
*regs = *eregs;
return regs;
}
+NOKPROBE_SYMBOL(sync_regs);
#endif
/*
@@ -406,7 +425,7 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
*
* May run on IST stack.
*/
-dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
+dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
{
struct task_struct *tsk = current;
enum ctx_state prev_state;
@@ -444,6 +463,11 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
/* Store the virtualized DR6 value */
tsk->thread.debugreg6 = dr6;
+#ifdef CONFIG_KPROBES
+ if (kprobe_debug_handler(regs))
+ goto exit;
+#endif
+
if (notify_die(DIE_DEBUG, "debug", regs, (long)&dr6, error_code,
SIGTRAP) == NOTIFY_STOP)
goto exit;
@@ -486,13 +510,14 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
exit:
exception_exit(prev_state);
}
+NOKPROBE_SYMBOL(do_debug);
/*
* Note that we play around with the 'TS' bit in an attempt to get
* the correct behaviour even in the presence of the asynchronous
* IRQ13 behaviour
*/
-void math_error(struct pt_regs *regs, int error_code, int trapnr)
+static void math_error(struct pt_regs *regs, int error_code, int trapnr)
{
struct task_struct *task = current;
siginfo_t info;
@@ -522,7 +547,7 @@ void math_error(struct pt_regs *regs, int error_code, int trapnr)
task->thread.error_code = error_code;
info.si_signo = SIGFPE;
info.si_errno = 0;
- info.si_addr = (void __user *)regs->ip;
+ info.si_addr = (void __user *)uprobe_get_trap_addr(regs);
if (trapnr == X86_TRAP_MF) {
unsigned short cwd, swd;
/*
@@ -605,11 +630,11 @@ do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
#endif
}
-asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
+asmlinkage __visible void __attribute__((weak)) smp_thermal_interrupt(void)
{
}
-asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void)
+asmlinkage __visible void __attribute__((weak)) smp_threshold_interrupt(void)
{
}
@@ -649,15 +674,15 @@ void math_state_restore(void)
*/
if (unlikely(restore_fpu_checking(tsk))) {
drop_init_fpu(tsk);
- force_sig(SIGSEGV, tsk);
+ force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk);
return;
}
- tsk->fpu_counter++;
+ tsk->thread.fpu_counter++;
}
EXPORT_SYMBOL_GPL(math_state_restore);
-dotraplinkage void __kprobes
+dotraplinkage void
do_device_not_available(struct pt_regs *regs, long error_code)
{
enum ctx_state prev_state;
@@ -683,6 +708,7 @@ do_device_not_available(struct pt_regs *regs, long error_code)
#endif
exception_exit(prev_state);
}
+NOKPROBE_SYMBOL(do_device_not_available);
#ifdef CONFIG_X86_32
dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
@@ -713,7 +739,7 @@ void __init early_trap_init(void)
/* int3 can be called from all */
set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK);
#ifdef CONFIG_X86_32
- set_intr_gate(X86_TRAP_PF, &page_fault);
+ set_intr_gate(X86_TRAP_PF, page_fault);
#endif
load_idt(&idt_descr);
}
@@ -721,7 +747,7 @@ void __init early_trap_init(void)
void __init early_trap_pf_init(void)
{
#ifdef CONFIG_X86_64
- set_intr_gate(X86_TRAP_PF, &page_fault);
+ set_intr_gate(X86_TRAP_PF, page_fault);
#endif
}
@@ -737,30 +763,30 @@ void __init trap_init(void)
early_iounmap(p, 4);
#endif
- set_intr_gate(X86_TRAP_DE, &divide_error);
+ set_intr_gate(X86_TRAP_DE, divide_error);
set_intr_gate_ist(X86_TRAP_NMI, &nmi, NMI_STACK);
/* int4 can be called from all */
set_system_intr_gate(X86_TRAP_OF, &overflow);
- set_intr_gate(X86_TRAP_BR, &bounds);
- set_intr_gate(X86_TRAP_UD, &invalid_op);
- set_intr_gate(X86_TRAP_NM, &device_not_available);
+ set_intr_gate(X86_TRAP_BR, bounds);
+ set_intr_gate(X86_TRAP_UD, invalid_op);
+ set_intr_gate(X86_TRAP_NM, device_not_available);
#ifdef CONFIG_X86_32
set_task_gate(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS);
#else
set_intr_gate_ist(X86_TRAP_DF, &double_fault, DOUBLEFAULT_STACK);
#endif
- set_intr_gate(X86_TRAP_OLD_MF, &coprocessor_segment_overrun);
- set_intr_gate(X86_TRAP_TS, &invalid_TSS);
- set_intr_gate(X86_TRAP_NP, &segment_not_present);
+ set_intr_gate(X86_TRAP_OLD_MF, coprocessor_segment_overrun);
+ set_intr_gate(X86_TRAP_TS, invalid_TSS);
+ set_intr_gate(X86_TRAP_NP, segment_not_present);
set_intr_gate_ist(X86_TRAP_SS, &stack_segment, STACKFAULT_STACK);
- set_intr_gate(X86_TRAP_GP, &general_protection);
- set_intr_gate(X86_TRAP_SPURIOUS, &spurious_interrupt_bug);
- set_intr_gate(X86_TRAP_MF, &coprocessor_error);
- set_intr_gate(X86_TRAP_AC, &alignment_check);
+ set_intr_gate(X86_TRAP_GP, general_protection);
+ set_intr_gate(X86_TRAP_SPURIOUS, spurious_interrupt_bug);
+ set_intr_gate(X86_TRAP_MF, coprocessor_error);
+ set_intr_gate(X86_TRAP_AC, alignment_check);
#ifdef CONFIG_X86_MCE
set_intr_gate_ist(X86_TRAP_MC, &machine_check, MCE_STACK);
#endif
- set_intr_gate(X86_TRAP_XF, &simd_coprocessor_error);
+ set_intr_gate(X86_TRAP_XF, simd_coprocessor_error);
/* Reserve all the builtin and the syscall vector: */
for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 930e5d48f56..ea030319b32 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -11,6 +11,7 @@
#include <linux/clocksource.h>
#include <linux/percpu.h>
#include <linux/timex.h>
+#include <linux/static_key.h>
#include <asm/hpet.h>
#include <asm/timer.h>
@@ -37,13 +38,244 @@ static int __read_mostly tsc_unstable;
erroneous rdtsc usage on !cpu_has_tsc processors */
static int __read_mostly tsc_disabled = -1;
+static struct static_key __use_tsc = STATIC_KEY_INIT;
+
int tsc_clocksource_reliable;
+
+/*
+ * Use a ring-buffer like data structure, where a writer advances the head by
+ * writing a new data entry and a reader advances the tail when it observes a
+ * new entry.
+ *
+ * Writers are made to wait on readers until there's space to write a new
+ * entry.
+ *
+ * This means that we can always use an {offset, mul} pair to compute a ns
+ * value that is 'roughly' in the right direction, even if we're writing a new
+ * {offset, mul} pair during the clock read.
+ *
+ * The down-side is that we can no longer guarantee strict monotonicity anymore
+ * (assuming the TSC was that to begin with), because while we compute the
+ * intersection point of the two clock slopes and make sure the time is
+ * continuous at the point of switching; we can no longer guarantee a reader is
+ * strictly before or after the switch point.
+ *
+ * It does mean a reader no longer needs to disable IRQs in order to avoid
+ * CPU-Freq updates messing with his times, and similarly an NMI reader will
+ * no longer run the risk of hitting half-written state.
+ */
+
+struct cyc2ns {
+ struct cyc2ns_data data[2]; /* 0 + 2*24 = 48 */
+ struct cyc2ns_data *head; /* 48 + 8 = 56 */
+ struct cyc2ns_data *tail; /* 56 + 8 = 64 */
+}; /* exactly fits one cacheline */
+
+static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns);
+
+struct cyc2ns_data *cyc2ns_read_begin(void)
+{
+ struct cyc2ns_data *head;
+
+ preempt_disable();
+
+ head = this_cpu_read(cyc2ns.head);
+ /*
+ * Ensure we observe the entry when we observe the pointer to it.
+ * matches the wmb from cyc2ns_write_end().
+ */
+ smp_read_barrier_depends();
+ head->__count++;
+ barrier();
+
+ return head;
+}
+
+void cyc2ns_read_end(struct cyc2ns_data *head)
+{
+ barrier();
+ /*
+ * If we're the outer most nested read; update the tail pointer
+ * when we're done. This notifies possible pending writers
+ * that we've observed the head pointer and that the other
+ * entry is now free.
+ */
+ if (!--head->__count) {
+ /*
+ * x86-TSO does not reorder writes with older reads;
+ * therefore once this write becomes visible to another
+ * cpu, we must be finished reading the cyc2ns_data.
+ *
+ * matches with cyc2ns_write_begin().
+ */
+ this_cpu_write(cyc2ns.tail, head);
+ }
+ preempt_enable();
+}
+
+/*
+ * Begin writing a new @data entry for @cpu.
+ *
+ * Assumes some sort of write side lock; currently 'provided' by the assumption
+ * that cpufreq will call its notifiers sequentially.
+ */
+static struct cyc2ns_data *cyc2ns_write_begin(int cpu)
+{
+ struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);
+ struct cyc2ns_data *data = c2n->data;
+
+ if (data == c2n->head)
+ data++;
+
+ /* XXX send an IPI to @cpu in order to guarantee a read? */
+
+ /*
+ * When we observe the tail write from cyc2ns_read_end(),
+ * the cpu must be done with that entry and its safe
+ * to start writing to it.
+ */
+ while (c2n->tail == data)
+ cpu_relax();
+
+ return data;
+}
+
+static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data)
+{
+ struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);
+
+ /*
+ * Ensure the @data writes are visible before we publish the
+ * entry. Matches the data-depencency in cyc2ns_read_begin().
+ */
+ smp_wmb();
+
+ ACCESS_ONCE(c2n->head) = data;
+}
+
+/*
+ * Accelerators for sched_clock()
+ * convert from cycles(64bits) => nanoseconds (64bits)
+ * basic equation:
+ * ns = cycles / (freq / ns_per_sec)
+ * ns = cycles * (ns_per_sec / freq)
+ * ns = cycles * (10^9 / (cpu_khz * 10^3))
+ * ns = cycles * (10^6 / cpu_khz)
+ *
+ * Then we use scaling math (suggested by george@mvista.com) to get:
+ * ns = cycles * (10^6 * SC / cpu_khz) / SC
+ * ns = cycles * cyc2ns_scale / SC
+ *
+ * And since SC is a constant power of two, we can convert the div
+ * into a shift.
+ *
+ * We can use khz divisor instead of mhz to keep a better precision, since
+ * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
+ * (mathieu.desnoyers@polymtl.ca)
+ *
+ * -johnstul@us.ibm.com "math is hard, lets go shopping!"
+ */
+
+#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
+
+static void cyc2ns_data_init(struct cyc2ns_data *data)
+{
+ data->cyc2ns_mul = 0;
+ data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
+ data->cyc2ns_offset = 0;
+ data->__count = 0;
+}
+
+static void cyc2ns_init(int cpu)
+{
+ struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);
+
+ cyc2ns_data_init(&c2n->data[0]);
+ cyc2ns_data_init(&c2n->data[1]);
+
+ c2n->head = c2n->data;
+ c2n->tail = c2n->data;
+}
+
+static inline unsigned long long cycles_2_ns(unsigned long long cyc)
+{
+ struct cyc2ns_data *data, *tail;
+ unsigned long long ns;
+
+ /*
+ * See cyc2ns_read_*() for details; replicated in order to avoid
+ * an extra few instructions that came with the abstraction.
+ * Notable, it allows us to only do the __count and tail update
+ * dance when its actually needed.
+ */
+
+ preempt_disable_notrace();
+ data = this_cpu_read(cyc2ns.head);
+ tail = this_cpu_read(cyc2ns.tail);
+
+ if (likely(data == tail)) {
+ ns = data->cyc2ns_offset;
+ ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
+ } else {
+ data->__count++;
+
+ barrier();
+
+ ns = data->cyc2ns_offset;
+ ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
+
+ barrier();
+
+ if (!--data->__count)
+ this_cpu_write(cyc2ns.tail, data);
+ }
+ preempt_enable_notrace();
+
+ return ns;
+}
+
+/* XXX surely we already have this someplace in the kernel?! */
+#define DIV_ROUND(n, d) (((n) + ((d) / 2)) / (d))
+
+static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
+{
+ unsigned long long tsc_now, ns_now;
+ struct cyc2ns_data *data;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ sched_clock_idle_sleep_event();
+
+ if (!cpu_khz)
+ goto done;
+
+ data = cyc2ns_write_begin(cpu);
+
+ rdtscll(tsc_now);
+ ns_now = cycles_2_ns(tsc_now);
+
+ /*
+ * Compute a new multiplier as per the above comment and ensure our
+ * time function is continuous; see the comment near struct
+ * cyc2ns_data.
+ */
+ data->cyc2ns_mul = DIV_ROUND(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR, cpu_khz);
+ data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
+ data->cyc2ns_offset = ns_now -
+ mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
+
+ cyc2ns_write_end(cpu, data);
+
+done:
+ sched_clock_idle_wakeup_event(0);
+ local_irq_restore(flags);
+}
/*
* Scheduler clock - returns current time in nanosec units.
*/
u64 native_sched_clock(void)
{
- u64 this_offset;
+ u64 tsc_now;
/*
* Fall back to jiffies if there's no TSC available:
@@ -53,16 +285,16 @@ u64 native_sched_clock(void)
* very important for it to be as fast as the platform
* can achieve it. )
*/
- if (unlikely(tsc_disabled)) {
+ if (!static_key_false(&__use_tsc)) {
/* No locking but a rare wrong value is not a big deal: */
return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
}
/* read the Time Stamp Counter: */
- rdtscll(this_offset);
+ rdtscll(tsc_now);
/* return the value in ns */
- return __cycles_2_ns(this_offset);
+ return cycles_2_ns(tsc_now);
}
/* We need to define a real function for sched_clock, to override the
@@ -419,6 +651,13 @@ unsigned long native_calibrate_tsc(void)
unsigned long flags, latch, ms, fast_calibrate;
int hpet = is_hpet_enabled(), i, loopmin;
+ /* Calibrate TSC using MSR for Intel Atom SoCs */
+ local_irq_save(flags);
+ fast_calibrate = try_msr_calibrate_tsc();
+ local_irq_restore(flags);
+ if (fast_calibrate)
+ return fast_calibrate;
+
local_irq_save(flags);
fast_calibrate = quick_pit_calibrate();
local_irq_restore(flags);
@@ -589,61 +828,11 @@ int recalibrate_cpu_khz(void)
EXPORT_SYMBOL(recalibrate_cpu_khz);
-/* Accelerators for sched_clock()
- * convert from cycles(64bits) => nanoseconds (64bits)
- * basic equation:
- * ns = cycles / (freq / ns_per_sec)
- * ns = cycles * (ns_per_sec / freq)
- * ns = cycles * (10^9 / (cpu_khz * 10^3))
- * ns = cycles * (10^6 / cpu_khz)
- *
- * Then we use scaling math (suggested by george@mvista.com) to get:
- * ns = cycles * (10^6 * SC / cpu_khz) / SC
- * ns = cycles * cyc2ns_scale / SC
- *
- * And since SC is a constant power of two, we can convert the div
- * into a shift.
- *
- * We can use khz divisor instead of mhz to keep a better precision, since
- * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
- * (mathieu.desnoyers@polymtl.ca)
- *
- * -johnstul@us.ibm.com "math is hard, lets go shopping!"
- */
-
-DEFINE_PER_CPU(unsigned long, cyc2ns);
-DEFINE_PER_CPU(unsigned long long, cyc2ns_offset);
-
-static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
-{
- unsigned long long tsc_now, ns_now, *offset;
- unsigned long flags, *scale;
-
- local_irq_save(flags);
- sched_clock_idle_sleep_event();
-
- scale = &per_cpu(cyc2ns, cpu);
- offset = &per_cpu(cyc2ns_offset, cpu);
-
- rdtscll(tsc_now);
- ns_now = __cycles_2_ns(tsc_now);
-
- if (cpu_khz) {
- *scale = ((NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR) +
- cpu_khz / 2) / cpu_khz;
- *offset = ns_now - mult_frac(tsc_now, *scale,
- (1UL << CYC2NS_SCALE_FACTOR));
- }
-
- sched_clock_idle_wakeup_event(0);
- local_irq_restore(flags);
-}
-
static unsigned long long cyc2ns_suspend;
void tsc_save_sched_clock_state(void)
{
- if (!sched_clock_stable)
+ if (!sched_clock_stable())
return;
cyc2ns_suspend = sched_clock();
@@ -663,16 +852,26 @@ void tsc_restore_sched_clock_state(void)
unsigned long flags;
int cpu;
- if (!sched_clock_stable)
+ if (!sched_clock_stable())
return;
local_irq_save(flags);
- __this_cpu_write(cyc2ns_offset, 0);
+ /*
+ * We're comming out of suspend, there's no concurrency yet; don't
+ * bother being nice about the RCU stuff, just write to both
+ * data fields.
+ */
+
+ this_cpu_write(cyc2ns.data[0].cyc2ns_offset, 0);
+ this_cpu_write(cyc2ns.data[1].cyc2ns_offset, 0);
+
offset = cyc2ns_suspend - sched_clock();
- for_each_possible_cpu(cpu)
- per_cpu(cyc2ns_offset, cpu) = offset;
+ for_each_possible_cpu(cpu) {
+ per_cpu(cyc2ns.data[0].cyc2ns_offset, cpu) = offset;
+ per_cpu(cyc2ns.data[1].cyc2ns_offset, cpu) = offset;
+ }
local_irq_restore(flags);
}
@@ -715,16 +914,15 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
tsc_khz_ref = tsc_khz;
}
if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) ||
- (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
- (val == CPUFREQ_RESUMECHANGE)) {
+ (val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) {
*lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
if (!(freq->flags & CPUFREQ_CONST_LOOPS))
mark_tsc_unstable("cpufreq changes");
- }
- set_cyc2ns_scale(tsc_khz, freq->cpu);
+ set_cyc2ns_scale(tsc_khz, freq->cpu);
+ }
return 0;
}
@@ -786,16 +984,14 @@ static struct clocksource clocksource_tsc = {
.mask = CLOCKSOURCE_MASK(64),
.flags = CLOCK_SOURCE_IS_CONTINUOUS |
CLOCK_SOURCE_MUST_VERIFY,
-#ifdef CONFIG_X86_64
.archdata = { .vclock_mode = VCLOCK_TSC },
-#endif
};
void mark_tsc_unstable(char *reason)
{
if (!tsc_unstable) {
tsc_unstable = 1;
- sched_clock_stable = 0;
+ clear_sched_clock_stable();
disable_sched_clock_irqtime();
pr_info("Marking TSC unstable due to %s\n", reason);
/* Change only the rating, when not registered */
@@ -995,14 +1191,18 @@ void __init tsc_init(void)
* speed as the bootup CPU. (cpufreq notifiers will fix this
* up if their speed diverges)
*/
- for_each_possible_cpu(cpu)
+ for_each_possible_cpu(cpu) {
+ cyc2ns_init(cpu);
set_cyc2ns_scale(cpu_khz, cpu);
+ }
if (tsc_disabled > 0)
return;
/* now allow native_sched_clock() to use rdtsc */
+
tsc_disabled = 0;
+ static_key_slow_inc(&__use_tsc);
if (!no_sched_irq_time)
enable_sched_clock_irqtime();
diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c
new file mode 100644
index 00000000000..92ae6acac8a
--- /dev/null
+++ b/arch/x86/kernel/tsc_msr.c
@@ -0,0 +1,127 @@
+/*
+ * tsc_msr.c - MSR based TSC calibration on Intel Atom SoC platforms.
+ *
+ * TSC in Intel Atom SoC runs at a constant rate which can be figured
+ * by this formula:
+ * <maximum core-clock to bus-clock ratio> * <maximum resolved frequency>
+ * See Intel 64 and IA-32 System Programming Guid section 16.12 and 30.11.5
+ * for details.
+ * Especially some Intel Atom SoCs don't have PIT(i8254) or HPET, so MSR
+ * based calibration is the only option.
+ *
+ *
+ * Copyright (C) 2013 Intel Corporation
+ * Author: Bin Gao <bin.gao@intel.com>
+ *
+ * This file is released under the GPLv2.
+ */
+
+#include <linux/kernel.h>
+#include <asm/processor.h>
+#include <asm/setup.h>
+#include <asm/apic.h>
+#include <asm/param.h>
+
+/* CPU reference clock frequency: in KHz */
+#define FREQ_83 83200
+#define FREQ_100 99840
+#define FREQ_133 133200
+#define FREQ_166 166400
+
+#define MAX_NUM_FREQS 8
+
+/*
+ * According to Intel 64 and IA-32 System Programming Guide,
+ * if MSR_PERF_STAT[31] is set, the maximum resolved bus ratio can be
+ * read in MSR_PLATFORM_ID[12:8], otherwise in MSR_PERF_STAT[44:40].
+ * Unfortunately some Intel Atom SoCs aren't quite compliant to this,
+ * so we need manually differentiate SoC families. This is what the
+ * field msr_plat does.
+ */
+struct freq_desc {
+ u8 x86_family; /* CPU family */
+ u8 x86_model; /* model */
+ u8 msr_plat; /* 1: use MSR_PLATFORM_INFO, 0: MSR_IA32_PERF_STATUS */
+ u32 freqs[MAX_NUM_FREQS];
+};
+
+static struct freq_desc freq_desc_tables[] = {
+ /* PNW */
+ { 6, 0x27, 0, { 0, 0, 0, 0, 0, FREQ_100, 0, FREQ_83 } },
+ /* CLV+ */
+ { 6, 0x35, 0, { 0, FREQ_133, 0, 0, 0, FREQ_100, 0, FREQ_83 } },
+ /* TNG */
+ { 6, 0x4a, 1, { 0, FREQ_100, FREQ_133, 0, 0, 0, 0, 0 } },
+ /* VLV2 */
+ { 6, 0x37, 1, { FREQ_83, FREQ_100, FREQ_133, FREQ_166, 0, 0, 0, 0 } },
+ /* ANN */
+ { 6, 0x5a, 1, { FREQ_83, FREQ_100, FREQ_133, FREQ_100, 0, 0, 0, 0 } },
+};
+
+static int match_cpu(u8 family, u8 model)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(freq_desc_tables); i++) {
+ if ((family == freq_desc_tables[i].x86_family) &&
+ (model == freq_desc_tables[i].x86_model))
+ return i;
+ }
+
+ return -1;
+}
+
+/* Map CPU reference clock freq ID(0-7) to CPU reference clock freq(KHz) */
+#define id_to_freq(cpu_index, freq_id) \
+ (freq_desc_tables[cpu_index].freqs[freq_id])
+
+/*
+ * Do MSR calibration only for known/supported CPUs.
+ *
+ * Returns the calibration value or 0 if MSR calibration failed.
+ */
+unsigned long try_msr_calibrate_tsc(void)
+{
+ u32 lo, hi, ratio, freq_id, freq;
+ unsigned long res;
+ int cpu_index;
+
+ cpu_index = match_cpu(boot_cpu_data.x86, boot_cpu_data.x86_model);
+ if (cpu_index < 0)
+ return 0;
+
+ if (freq_desc_tables[cpu_index].msr_plat) {
+ rdmsr(MSR_PLATFORM_INFO, lo, hi);
+ ratio = (lo >> 8) & 0x1f;
+ } else {
+ rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
+ ratio = (hi >> 8) & 0x1f;
+ }
+ pr_info("Maximum core-clock to bus-clock ratio: 0x%x\n", ratio);
+
+ if (!ratio)
+ goto fail;
+
+ /* Get FSB FREQ ID */
+ rdmsr(MSR_FSB_FREQ, lo, hi);
+ freq_id = lo & 0x7;
+ freq = id_to_freq(cpu_index, freq_id);
+ pr_info("Resolved frequency ID: %u, frequency: %u KHz\n",
+ freq_id, freq);
+ if (!freq)
+ goto fail;
+
+ /* TSC frequency = maximum resolved freq * maximum resolved bus ratio */
+ res = freq * ratio;
+ pr_info("TSC runs at %lu KHz\n", res);
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ lapic_timer_frequency = (freq * 1000) / HZ;
+ pr_info("lapic_timer_frequency = %d\n", lapic_timer_frequency);
+#endif
+ return res;
+
+fail:
+ pr_warn("Fast TSC calibration using MSR failed\n");
+ return 0;
+}
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index adfdf56a371..26488487bc6 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -16,7 +16,6 @@
*/
#include <linux/spinlock.h>
#include <linux/kernel.h>
-#include <linux/init.h>
#include <linux/smp.h>
#include <linux/nmi.h>
#include <asm/tsc.h>
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 2ed845928b5..5d1cbfe4ae5 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -32,20 +32,20 @@
/* Post-execution fixups. */
-/* No fixup needed */
-#define UPROBE_FIX_NONE 0x0
-
/* Adjust IP back to vicinity of actual insn */
-#define UPROBE_FIX_IP 0x1
+#define UPROBE_FIX_IP 0x01
/* Adjust the return address of a call insn */
-#define UPROBE_FIX_CALL 0x2
+#define UPROBE_FIX_CALL 0x02
/* Instruction will modify TF, don't change it */
-#define UPROBE_FIX_SETF 0x4
+#define UPROBE_FIX_SETF 0x04
-#define UPROBE_FIX_RIP_AX 0x8000
-#define UPROBE_FIX_RIP_CX 0x4000
+#define UPROBE_FIX_RIP_SI 0x08
+#define UPROBE_FIX_RIP_DI 0x10
+#define UPROBE_FIX_RIP_BX 0x20
+#define UPROBE_FIX_RIP_MASK \
+ (UPROBE_FIX_RIP_SI | UPROBE_FIX_RIP_DI | UPROBE_FIX_RIP_BX)
#define UPROBE_TRAP_NR UINT_MAX
@@ -53,7 +53,7 @@
#define OPCODE1(insn) ((insn)->opcode.bytes[0])
#define OPCODE2(insn) ((insn)->opcode.bytes[1])
#define OPCODE3(insn) ((insn)->opcode.bytes[2])
-#define MODRM_REG(insn) X86_MODRM_REG(insn->modrm.value)
+#define MODRM_REG(insn) X86_MODRM_REG((insn)->modrm.value)
#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\
(((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \
@@ -67,6 +67,7 @@
* to keep gcc from statically optimizing it out, as variable_test_bit makes
* some versions of gcc to think only *(unsigned long*) is used.
*/
+#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
static volatile u32 good_insns_32[256 / 32] = {
/* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
/* ---------------------------------------------- */
@@ -89,33 +90,12 @@ static volatile u32 good_insns_32[256 / 32] = {
/* ---------------------------------------------- */
/* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
};
+#else
+#define good_insns_32 NULL
+#endif
-/* Using this for both 64-bit and 32-bit apps */
-static volatile u32 good_2byte_insns[256 / 32] = {
- /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
- /* ---------------------------------------------- */
- W(0x00, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1) | /* 00 */
- W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* 10 */
- W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */
- W(0x30, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */
- W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
- W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
- W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 60 */
- W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) , /* 70 */
- W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
- W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
- W(0xa0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1) | /* a0 */
- W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
- W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */
- W(0xd0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
- W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* e0 */
- W(0xf0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0) /* f0 */
- /* ---------------------------------------------- */
- /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
-};
-
-#ifdef CONFIG_X86_64
/* Good-instruction tables for 64-bit apps */
+#if defined(CONFIG_X86_64)
static volatile u32 good_insns_64[256 / 32] = {
/* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
/* ---------------------------------------------- */
@@ -138,7 +118,33 @@ static volatile u32 good_insns_64[256 / 32] = {
/* ---------------------------------------------- */
/* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
};
+#else
+#define good_insns_64 NULL
#endif
+
+/* Using this for both 64-bit and 32-bit apps */
+static volatile u32 good_2byte_insns[256 / 32] = {
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+ /* ---------------------------------------------- */
+ W(0x00, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1) | /* 00 */
+ W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* 10 */
+ W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */
+ W(0x30, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */
+ W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
+ W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
+ W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 60 */
+ W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) , /* 70 */
+ W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
+ W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
+ W(0xa0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1) | /* a0 */
+ W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
+ W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */
+ W(0xd0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
+ W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* e0 */
+ W(0xf0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0) /* f0 */
+ /* ---------------------------------------------- */
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+};
#undef W
/*
@@ -209,16 +215,25 @@ static bool is_prefix_bad(struct insn *insn)
return false;
}
-static int validate_insn_32bits(struct arch_uprobe *auprobe, struct insn *insn)
+static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool x86_64)
{
- insn_init(insn, auprobe->insn, false);
+ u32 volatile *good_insns;
+
+ insn_init(insn, auprobe->insn, x86_64);
+ /* has the side-effect of processing the entire instruction */
+ insn_get_length(insn);
+ if (WARN_ON_ONCE(!insn_complete(insn)))
+ return -ENOEXEC;
- /* Skip good instruction prefixes; reject "bad" ones. */
- insn_get_opcode(insn);
if (is_prefix_bad(insn))
return -ENOTSUPP;
- if (test_bit(OPCODE1(insn), (unsigned long *)good_insns_32))
+ if (x86_64)
+ good_insns = good_insns_64;
+ else
+ good_insns = good_insns_32;
+
+ if (test_bit(OPCODE1(insn), (unsigned long *)good_insns))
return 0;
if (insn->opcode.nbytes == 2) {
@@ -229,72 +244,19 @@ static int validate_insn_32bits(struct arch_uprobe *auprobe, struct insn *insn)
return -ENOTSUPP;
}
-/*
- * Figure out which fixups arch_uprobe_post_xol() will need to perform, and
- * annotate arch_uprobe->fixups accordingly. To start with,
- * arch_uprobe->fixups is either zero or it reflects rip-related fixups.
- */
-static void prepare_fixups(struct arch_uprobe *auprobe, struct insn *insn)
+#ifdef CONFIG_X86_64
+static inline bool is_64bit_mm(struct mm_struct *mm)
{
- bool fix_ip = true, fix_call = false; /* defaults */
- int reg;
-
- insn_get_opcode(insn); /* should be a nop */
-
- switch (OPCODE1(insn)) {
- case 0x9d:
- /* popf */
- auprobe->fixups |= UPROBE_FIX_SETF;
- break;
- case 0xc3: /* ret/lret */
- case 0xcb:
- case 0xc2:
- case 0xca:
- /* ip is correct */
- fix_ip = false;
- break;
- case 0xe8: /* call relative - Fix return addr */
- fix_call = true;
- break;
- case 0x9a: /* call absolute - Fix return addr, not ip */
- fix_call = true;
- fix_ip = false;
- break;
- case 0xff:
- insn_get_modrm(insn);
- reg = MODRM_REG(insn);
- if (reg == 2 || reg == 3) {
- /* call or lcall, indirect */
- /* Fix return addr; ip is correct. */
- fix_call = true;
- fix_ip = false;
- } else if (reg == 4 || reg == 5) {
- /* jmp or ljmp, indirect */
- /* ip is correct. */
- fix_ip = false;
- }
- break;
- case 0xea: /* jmp absolute -- ip is correct */
- fix_ip = false;
- break;
- default:
- break;
- }
- if (fix_ip)
- auprobe->fixups |= UPROBE_FIX_IP;
- if (fix_call)
- auprobe->fixups |= UPROBE_FIX_CALL;
+ return !config_enabled(CONFIG_IA32_EMULATION) ||
+ !(mm->context.ia32_compat == TIF_IA32);
}
-
-#ifdef CONFIG_X86_64
/*
* If arch_uprobe->insn doesn't use rip-relative addressing, return
* immediately. Otherwise, rewrite the instruction so that it accesses
* its memory operand indirectly through a scratch register. Set
- * arch_uprobe->fixups and arch_uprobe->rip_rela_target_address
- * accordingly. (The contents of the scratch register will be saved
- * before we single-step the modified instruction, and restored
- * afterward.)
+ * defparam->fixups accordingly. (The contents of the scratch register
+ * will be saved before we single-step the modified instruction,
+ * and restored afterward).
*
* We do this because a rip-relative instruction can access only a
* relatively small area (+/- 2 GB from the instruction), and the XOL
@@ -305,248 +267,513 @@ static void prepare_fixups(struct arch_uprobe *auprobe, struct insn *insn)
*
* Some useful facts about rip-relative instructions:
*
- * - There's always a modrm byte.
+ * - There's always a modrm byte with bit layout "00 reg 101".
* - There's never a SIB byte.
* - The displacement is always 4 bytes.
+ * - REX.B=1 bit in REX prefix, which normally extends r/m field,
+ * has no effect on rip-relative mode. It doesn't make modrm byte
+ * with r/m=101 refer to register 1101 = R13.
*/
-static void
-handle_riprel_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn)
+static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn)
{
u8 *cursor;
u8 reg;
+ u8 reg2;
- if (mm->context.ia32_compat)
- return;
-
- auprobe->rip_rela_target_address = 0x0;
if (!insn_rip_relative(insn))
return;
/*
- * insn_rip_relative() would have decoded rex_prefix, modrm.
+ * insn_rip_relative() would have decoded rex_prefix, vex_prefix, modrm.
* Clear REX.b bit (extension of MODRM.rm field):
- * we want to encode rax/rcx, not r8/r9.
+ * we want to encode low numbered reg, not r8+.
*/
if (insn->rex_prefix.nbytes) {
cursor = auprobe->insn + insn_offset_rex_prefix(insn);
- *cursor &= 0xfe; /* Clearing REX.B bit */
+ /* REX byte has 0100wrxb layout, clearing REX.b bit */
+ *cursor &= 0xfe;
+ }
+ /*
+ * Similar treatment for VEX3 prefix.
+ * TODO: add XOP/EVEX treatment when insn decoder supports them
+ */
+ if (insn->vex_prefix.nbytes == 3) {
+ /*
+ * vex2: c5 rvvvvLpp (has no b bit)
+ * vex3/xop: c4/8f rxbmmmmm wvvvvLpp
+ * evex: 62 rxbR00mm wvvvv1pp zllBVaaa
+ * (evex will need setting of both b and x since
+ * in non-sib encoding evex.x is 4th bit of MODRM.rm)
+ * Setting VEX3.b (setting because it has inverted meaning):
+ */
+ cursor = auprobe->insn + insn_offset_vex_prefix(insn) + 1;
+ *cursor |= 0x20;
}
/*
+ * Convert from rip-relative addressing to register-relative addressing
+ * via a scratch register.
+ *
+ * This is tricky since there are insns with modrm byte
+ * which also use registers not encoded in modrm byte:
+ * [i]div/[i]mul: implicitly use dx:ax
+ * shift ops: implicitly use cx
+ * cmpxchg: implicitly uses ax
+ * cmpxchg8/16b: implicitly uses dx:ax and bx:cx
+ * Encoding: 0f c7/1 modrm
+ * The code below thinks that reg=1 (cx), chooses si as scratch.
+ * mulx: implicitly uses dx: mulx r/m,r1,r2 does r1:r2 = dx * r/m.
+ * First appeared in Haswell (BMI2 insn). It is vex-encoded.
+ * Example where none of bx,cx,dx can be used as scratch reg:
+ * c4 e2 63 f6 0d disp32 mulx disp32(%rip),%ebx,%ecx
+ * [v]pcmpistri: implicitly uses cx, xmm0
+ * [v]pcmpistrm: implicitly uses xmm0
+ * [v]pcmpestri: implicitly uses ax, dx, cx, xmm0
+ * [v]pcmpestrm: implicitly uses ax, dx, xmm0
+ * Evil SSE4.2 string comparison ops from hell.
+ * maskmovq/[v]maskmovdqu: implicitly uses (ds:rdi) as destination.
+ * Encoding: 0f f7 modrm, 66 0f f7 modrm, vex-encoded: c5 f9 f7 modrm.
+ * Store op1, byte-masked by op2 msb's in each byte, to (ds:rdi).
+ * AMD says it has no 3-operand form (vex.vvvv must be 1111)
+ * and that it can have only register operands, not mem
+ * (its modrm byte must have mode=11).
+ * If these restrictions will ever be lifted,
+ * we'll need code to prevent selection of di as scratch reg!
+ *
+ * Summary: I don't know any insns with modrm byte which
+ * use SI register implicitly. DI register is used only
+ * by one insn (maskmovq) and BX register is used
+ * only by one too (cmpxchg8b).
+ * BP is stack-segment based (may be a problem?).
+ * AX, DX, CX are off-limits (many implicit users).
+ * SP is unusable (it's stack pointer - think about "pop mem";
+ * also, rsp+disp32 needs sib encoding -> insn length change).
+ */
+
+ reg = MODRM_REG(insn); /* Fetch modrm.reg */
+ reg2 = 0xff; /* Fetch vex.vvvv */
+ if (insn->vex_prefix.nbytes == 2)
+ reg2 = insn->vex_prefix.bytes[1];
+ else if (insn->vex_prefix.nbytes == 3)
+ reg2 = insn->vex_prefix.bytes[2];
+ /*
+ * TODO: add XOP, EXEV vvvv reading.
+ *
+ * vex.vvvv field is in bits 6-3, bits are inverted.
+ * But in 32-bit mode, high-order bit may be ignored.
+ * Therefore, let's consider only 3 low-order bits.
+ */
+ reg2 = ((reg2 >> 3) & 0x7) ^ 0x7;
+ /*
+ * Register numbering is ax,cx,dx,bx, sp,bp,si,di, r8..r15.
+ *
+ * Choose scratch reg. Order is important: must not select bx
+ * if we can use si (cmpxchg8b case!)
+ */
+ if (reg != 6 && reg2 != 6) {
+ reg2 = 6;
+ auprobe->defparam.fixups |= UPROBE_FIX_RIP_SI;
+ } else if (reg != 7 && reg2 != 7) {
+ reg2 = 7;
+ auprobe->defparam.fixups |= UPROBE_FIX_RIP_DI;
+ /* TODO (paranoia): force maskmovq to not use di */
+ } else {
+ reg2 = 3;
+ auprobe->defparam.fixups |= UPROBE_FIX_RIP_BX;
+ }
+ /*
* Point cursor at the modrm byte. The next 4 bytes are the
* displacement. Beyond the displacement, for some instructions,
* is the immediate operand.
*/
cursor = auprobe->insn + insn_offset_modrm(insn);
- insn_get_length(insn);
-
/*
- * Convert from rip-relative addressing to indirect addressing
- * via a scratch register. Change the r/m field from 0x5 (%rip)
- * to 0x0 (%rax) or 0x1 (%rcx), and squeeze out the offset field.
+ * Change modrm from "00 reg 101" to "10 reg reg2". Example:
+ * 89 05 disp32 mov %eax,disp32(%rip) becomes
+ * 89 86 disp32 mov %eax,disp32(%rsi)
*/
- reg = MODRM_REG(insn);
- if (reg == 0) {
- /*
- * The register operand (if any) is either the A register
- * (%rax, %eax, etc.) or (if the 0x4 bit is set in the
- * REX prefix) %r8. In any case, we know the C register
- * is NOT the register operand, so we use %rcx (register
- * #1) for the scratch register.
- */
- auprobe->fixups = UPROBE_FIX_RIP_CX;
- /* Change modrm from 00 000 101 to 00 000 001. */
- *cursor = 0x1;
- } else {
- /* Use %rax (register #0) for the scratch register. */
- auprobe->fixups = UPROBE_FIX_RIP_AX;
- /* Change modrm from 00 xxx 101 to 00 xxx 000 */
- *cursor = (reg << 3);
- }
-
- /* Target address = address of next instruction + (signed) offset */
- auprobe->rip_rela_target_address = (long)insn->length + insn->displacement.value;
-
- /* Displacement field is gone; slide immediate field (if any) over. */
- if (insn->immediate.nbytes) {
- cursor++;
- memmove(cursor, cursor + insn->displacement.nbytes, insn->immediate.nbytes);
- }
- return;
+ *cursor = 0x80 | (reg << 3) | reg2;
}
-static int validate_insn_64bits(struct arch_uprobe *auprobe, struct insn *insn)
+static inline unsigned long *
+scratch_reg(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
- insn_init(insn, auprobe->insn, true);
-
- /* Skip good instruction prefixes; reject "bad" ones. */
- insn_get_opcode(insn);
- if (is_prefix_bad(insn))
- return -ENOTSUPP;
+ if (auprobe->defparam.fixups & UPROBE_FIX_RIP_SI)
+ return &regs->si;
+ if (auprobe->defparam.fixups & UPROBE_FIX_RIP_DI)
+ return &regs->di;
+ return &regs->bx;
+}
- if (test_bit(OPCODE1(insn), (unsigned long *)good_insns_64))
- return 0;
+/*
+ * If we're emulating a rip-relative instruction, save the contents
+ * of the scratch register and store the target address in that register.
+ */
+static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ if (auprobe->defparam.fixups & UPROBE_FIX_RIP_MASK) {
+ struct uprobe_task *utask = current->utask;
+ unsigned long *sr = scratch_reg(auprobe, regs);
- if (insn->opcode.nbytes == 2) {
- if (test_bit(OPCODE2(insn), (unsigned long *)good_2byte_insns))
- return 0;
+ utask->autask.saved_scratch_register = *sr;
+ *sr = utask->vaddr + auprobe->defparam.ilen;
}
- return -ENOTSUPP;
}
-static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn)
+static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
- if (mm->context.ia32_compat)
- return validate_insn_32bits(auprobe, insn);
- return validate_insn_64bits(auprobe, insn);
+ if (auprobe->defparam.fixups & UPROBE_FIX_RIP_MASK) {
+ struct uprobe_task *utask = current->utask;
+ unsigned long *sr = scratch_reg(auprobe, regs);
+
+ *sr = utask->autask.saved_scratch_register;
+ }
}
#else /* 32-bit: */
-static void handle_riprel_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn)
+static inline bool is_64bit_mm(struct mm_struct *mm)
{
- /* No RIP-relative addressing on 32-bit */
+ return false;
}
-
-static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn)
+/*
+ * No RIP-relative addressing on 32-bit
+ */
+static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn)
+{
+}
+static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+}
+static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
- return validate_insn_32bits(auprobe, insn);
}
#endif /* CONFIG_X86_64 */
-/**
- * arch_uprobe_analyze_insn - instruction analysis including validity and fixups.
- * @mm: the probed address space.
- * @arch_uprobe: the probepoint information.
- * @addr: virtual address at which to install the probepoint
- * Return 0 on success or a -ve number on error.
- */
-int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr)
+struct uprobe_xol_ops {
+ bool (*emulate)(struct arch_uprobe *, struct pt_regs *);
+ int (*pre_xol)(struct arch_uprobe *, struct pt_regs *);
+ int (*post_xol)(struct arch_uprobe *, struct pt_regs *);
+ void (*abort)(struct arch_uprobe *, struct pt_regs *);
+};
+
+static inline int sizeof_long(void)
{
- int ret;
- struct insn insn;
+ return is_ia32_task() ? 4 : 8;
+}
- auprobe->fixups = 0;
- ret = validate_insn_bits(auprobe, mm, &insn);
- if (ret != 0)
- return ret;
+static int default_pre_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ riprel_pre_xol(auprobe, regs);
+ return 0;
+}
- handle_riprel_insn(auprobe, mm, &insn);
- prepare_fixups(auprobe, &insn);
+static int push_ret_address(struct pt_regs *regs, unsigned long ip)
+{
+ unsigned long new_sp = regs->sp - sizeof_long();
+ if (copy_to_user((void __user *)new_sp, &ip, sizeof_long()))
+ return -EFAULT;
+
+ regs->sp = new_sp;
return 0;
}
-#ifdef CONFIG_X86_64
/*
- * If we're emulating a rip-relative instruction, save the contents
- * of the scratch register and store the target address in that register.
+ * We have to fix things up as follows:
+ *
+ * Typically, the new ip is relative to the copied instruction. We need
+ * to make it relative to the original instruction (FIX_IP). Exceptions
+ * are return instructions and absolute or indirect jump or call instructions.
+ *
+ * If the single-stepped instruction was a call, the return address that
+ * is atop the stack is the address following the copied instruction. We
+ * need to make it the address following the original instruction (FIX_CALL).
+ *
+ * If the original instruction was a rip-relative instruction such as
+ * "movl %edx,0xnnnn(%rip)", we have instead executed an equivalent
+ * instruction using a scratch register -- e.g., "movl %edx,0xnnnn(%rsi)".
+ * We need to restore the contents of the scratch register
+ * (FIX_RIP_reg).
*/
-static void
-pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs,
- struct arch_uprobe_task *autask)
-{
- if (auprobe->fixups & UPROBE_FIX_RIP_AX) {
- autask->saved_scratch_register = regs->ax;
- regs->ax = current->utask->vaddr;
- regs->ax += auprobe->rip_rela_target_address;
- } else if (auprobe->fixups & UPROBE_FIX_RIP_CX) {
- autask->saved_scratch_register = regs->cx;
- regs->cx = current->utask->vaddr;
- regs->cx += auprobe->rip_rela_target_address;
+static int default_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ struct uprobe_task *utask = current->utask;
+
+ riprel_post_xol(auprobe, regs);
+ if (auprobe->defparam.fixups & UPROBE_FIX_IP) {
+ long correction = utask->vaddr - utask->xol_vaddr;
+ regs->ip += correction;
+ } else if (auprobe->defparam.fixups & UPROBE_FIX_CALL) {
+ regs->sp += sizeof_long(); /* Pop incorrect return address */
+ if (push_ret_address(regs, utask->vaddr + auprobe->defparam.ilen))
+ return -ERESTART;
}
+ /* popf; tell the caller to not touch TF */
+ if (auprobe->defparam.fixups & UPROBE_FIX_SETF)
+ utask->autask.saved_tf = true;
+
+ return 0;
}
-#else
-static void
-pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs,
- struct arch_uprobe_task *autask)
+
+static void default_abort_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
- /* No RIP-relative addressing on 32-bit */
+ riprel_post_xol(auprobe, regs);
}
-#endif
-/*
- * arch_uprobe_pre_xol - prepare to execute out of line.
- * @auprobe: the probepoint information.
- * @regs: reflects the saved user state of current task.
- */
-int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
+static struct uprobe_xol_ops default_xol_ops = {
+ .pre_xol = default_pre_xol_op,
+ .post_xol = default_post_xol_op,
+ .abort = default_abort_op,
+};
+
+static bool branch_is_call(struct arch_uprobe *auprobe)
{
- struct arch_uprobe_task *autask;
+ return auprobe->branch.opc1 == 0xe8;
+}
- autask = &current->utask->autask;
- autask->saved_trap_nr = current->thread.trap_nr;
- current->thread.trap_nr = UPROBE_TRAP_NR;
- regs->ip = current->utask->xol_vaddr;
- pre_xol_rip_insn(auprobe, regs, autask);
+#define CASE_COND \
+ COND(70, 71, XF(OF)) \
+ COND(72, 73, XF(CF)) \
+ COND(74, 75, XF(ZF)) \
+ COND(78, 79, XF(SF)) \
+ COND(7a, 7b, XF(PF)) \
+ COND(76, 77, XF(CF) || XF(ZF)) \
+ COND(7c, 7d, XF(SF) != XF(OF)) \
+ COND(7e, 7f, XF(ZF) || XF(SF) != XF(OF))
- autask->saved_tf = !!(regs->flags & X86_EFLAGS_TF);
- regs->flags |= X86_EFLAGS_TF;
- if (test_tsk_thread_flag(current, TIF_BLOCKSTEP))
- set_task_blockstep(current, false);
+#define COND(op_y, op_n, expr) \
+ case 0x ## op_y: DO((expr) != 0) \
+ case 0x ## op_n: DO((expr) == 0)
- return 0;
+#define XF(xf) (!!(flags & X86_EFLAGS_ ## xf))
+
+static bool is_cond_jmp_opcode(u8 opcode)
+{
+ switch (opcode) {
+ #define DO(expr) \
+ return true;
+ CASE_COND
+ #undef DO
+
+ default:
+ return false;
+ }
}
-/*
- * This function is called by arch_uprobe_post_xol() to adjust the return
- * address pushed by a call instruction executed out of line.
- */
-static int adjust_ret_addr(unsigned long sp, long correction)
+static bool check_jmp_cond(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
- int rasize, ncopied;
- long ra = 0;
+ unsigned long flags = regs->flags;
- if (is_ia32_task())
- rasize = 4;
- else
- rasize = 8;
+ switch (auprobe->branch.opc1) {
+ #define DO(expr) \
+ return expr;
+ CASE_COND
+ #undef DO
- ncopied = copy_from_user(&ra, (void __user *)sp, rasize);
- if (unlikely(ncopied))
- return -EFAULT;
+ default: /* not a conditional jmp */
+ return true;
+ }
+}
- ra += correction;
- ncopied = copy_to_user((void __user *)sp, &ra, rasize);
- if (unlikely(ncopied))
- return -EFAULT;
+#undef XF
+#undef COND
+#undef CASE_COND
- return 0;
+static bool branch_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ unsigned long new_ip = regs->ip += auprobe->branch.ilen;
+ unsigned long offs = (long)auprobe->branch.offs;
+
+ if (branch_is_call(auprobe)) {
+ /*
+ * If it fails we execute this (mangled, see the comment in
+ * branch_clear_offset) insn out-of-line. In the likely case
+ * this should trigger the trap, and the probed application
+ * should die or restart the same insn after it handles the
+ * signal, arch_uprobe_post_xol() won't be even called.
+ *
+ * But there is corner case, see the comment in ->post_xol().
+ */
+ if (push_ret_address(regs, new_ip))
+ return false;
+ } else if (!check_jmp_cond(auprobe, regs)) {
+ offs = 0;
+ }
+
+ regs->ip = new_ip + offs;
+ return true;
}
-#ifdef CONFIG_X86_64
-static bool is_riprel_insn(struct arch_uprobe *auprobe)
+static int branch_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
- return ((auprobe->fixups & (UPROBE_FIX_RIP_AX | UPROBE_FIX_RIP_CX)) != 0);
+ BUG_ON(!branch_is_call(auprobe));
+ /*
+ * We can only get here if branch_emulate_op() failed to push the ret
+ * address _and_ another thread expanded our stack before the (mangled)
+ * "call" insn was executed out-of-line. Just restore ->sp and restart.
+ * We could also restore ->ip and try to call branch_emulate_op() again.
+ */
+ regs->sp += sizeof_long();
+ return -ERESTART;
+}
+
+static void branch_clear_offset(struct arch_uprobe *auprobe, struct insn *insn)
+{
+ /*
+ * Turn this insn into "call 1f; 1:", this is what we will execute
+ * out-of-line if ->emulate() fails. We only need this to generate
+ * a trap, so that the probed task receives the correct signal with
+ * the properly filled siginfo.
+ *
+ * But see the comment in ->post_xol(), in the unlikely case it can
+ * succeed. So we need to ensure that the new ->ip can not fall into
+ * the non-canonical area and trigger #GP.
+ *
+ * We could turn it into (say) "pushf", but then we would need to
+ * divorce ->insn[] and ->ixol[]. We need to preserve the 1st byte
+ * of ->insn[] for set_orig_insn().
+ */
+ memset(auprobe->insn + insn_offset_immediate(insn),
+ 0, insn->immediate.nbytes);
}
-static void
-handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, long *correction)
+static struct uprobe_xol_ops branch_xol_ops = {
+ .emulate = branch_emulate_op,
+ .post_xol = branch_post_xol_op,
+};
+
+/* Returns -ENOSYS if branch_xol_ops doesn't handle this insn */
+static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
{
- if (is_riprel_insn(auprobe)) {
- struct arch_uprobe_task *autask;
+ u8 opc1 = OPCODE1(insn);
+ int i;
- autask = &current->utask->autask;
- if (auprobe->fixups & UPROBE_FIX_RIP_AX)
- regs->ax = autask->saved_scratch_register;
- else
- regs->cx = autask->saved_scratch_register;
+ switch (opc1) {
+ case 0xeb: /* jmp 8 */
+ case 0xe9: /* jmp 32 */
+ case 0x90: /* prefix* + nop; same as jmp with .offs = 0 */
+ break;
+
+ case 0xe8: /* call relative */
+ branch_clear_offset(auprobe, insn);
+ break;
+ case 0x0f:
+ if (insn->opcode.nbytes != 2)
+ return -ENOSYS;
/*
- * The original instruction includes a displacement, and so
- * is 4 bytes longer than what we've just single-stepped.
- * Fall through to handle stuff like "jmpq *...(%rip)" and
- * "callq *...(%rip)".
+ * If it is a "near" conditional jmp, OPCODE2() - 0x10 matches
+ * OPCODE1() of the "short" jmp which checks the same condition.
*/
- if (correction)
- *correction += 4;
+ opc1 = OPCODE2(insn) - 0x10;
+ default:
+ if (!is_cond_jmp_opcode(opc1))
+ return -ENOSYS;
+ }
+
+ /*
+ * 16-bit overrides such as CALLW (66 e8 nn nn) are not supported.
+ * Intel and AMD behavior differ in 64-bit mode: Intel ignores 66 prefix.
+ * No one uses these insns, reject any branch insns with such prefix.
+ */
+ for (i = 0; i < insn->prefixes.nbytes; i++) {
+ if (insn->prefixes.bytes[i] == 0x66)
+ return -ENOTSUPP;
}
+
+ auprobe->branch.opc1 = opc1;
+ auprobe->branch.ilen = insn->length;
+ auprobe->branch.offs = insn->immediate.value;
+
+ auprobe->ops = &branch_xol_ops;
+ return 0;
}
-#else
-static void
-handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, long *correction)
+
+/**
+ * arch_uprobe_analyze_insn - instruction analysis including validity and fixups.
+ * @mm: the probed address space.
+ * @arch_uprobe: the probepoint information.
+ * @addr: virtual address at which to install the probepoint
+ * Return 0 on success or a -ve number on error.
+ */
+int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr)
{
- /* No RIP-relative addressing on 32-bit */
+ struct insn insn;
+ u8 fix_ip_or_call = UPROBE_FIX_IP;
+ int ret;
+
+ ret = uprobe_init_insn(auprobe, &insn, is_64bit_mm(mm));
+ if (ret)
+ return ret;
+
+ ret = branch_setup_xol_ops(auprobe, &insn);
+ if (ret != -ENOSYS)
+ return ret;
+
+ /*
+ * Figure out which fixups default_post_xol_op() will need to perform,
+ * and annotate defparam->fixups accordingly.
+ */
+ switch (OPCODE1(&insn)) {
+ case 0x9d: /* popf */
+ auprobe->defparam.fixups |= UPROBE_FIX_SETF;
+ break;
+ case 0xc3: /* ret or lret -- ip is correct */
+ case 0xcb:
+ case 0xc2:
+ case 0xca:
+ case 0xea: /* jmp absolute -- ip is correct */
+ fix_ip_or_call = 0;
+ break;
+ case 0x9a: /* call absolute - Fix return addr, not ip */
+ fix_ip_or_call = UPROBE_FIX_CALL;
+ break;
+ case 0xff:
+ switch (MODRM_REG(&insn)) {
+ case 2: case 3: /* call or lcall, indirect */
+ fix_ip_or_call = UPROBE_FIX_CALL;
+ break;
+ case 4: case 5: /* jmp or ljmp, indirect */
+ fix_ip_or_call = 0;
+ break;
+ }
+ /* fall through */
+ default:
+ riprel_analyze(auprobe, &insn);
+ }
+
+ auprobe->defparam.ilen = insn.length;
+ auprobe->defparam.fixups |= fix_ip_or_call;
+
+ auprobe->ops = &default_xol_ops;
+ return 0;
+}
+
+/*
+ * arch_uprobe_pre_xol - prepare to execute out of line.
+ * @auprobe: the probepoint information.
+ * @regs: reflects the saved user state of current task.
+ */
+int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ struct uprobe_task *utask = current->utask;
+
+ if (auprobe->ops->pre_xol) {
+ int err = auprobe->ops->pre_xol(auprobe, regs);
+ if (err)
+ return err;
+ }
+
+ regs->ip = utask->xol_vaddr;
+ utask->autask.saved_trap_nr = current->thread.trap_nr;
+ current->thread.trap_nr = UPROBE_TRAP_NR;
+
+ utask->autask.saved_tf = !!(regs->flags & X86_EFLAGS_TF);
+ regs->flags |= X86_EFLAGS_TF;
+ if (test_tsk_thread_flag(current, TIF_BLOCKSTEP))
+ set_task_blockstep(current, false);
+
+ return 0;
}
-#endif
/*
* If xol insn itself traps and generates a signal(Say,
@@ -572,53 +799,42 @@ bool arch_uprobe_xol_was_trapped(struct task_struct *t)
* single-step, we single-stepped a copy of the instruction.
*
* This function prepares to resume execution after the single-step.
- * We have to fix things up as follows:
- *
- * Typically, the new ip is relative to the copied instruction. We need
- * to make it relative to the original instruction (FIX_IP). Exceptions
- * are return instructions and absolute or indirect jump or call instructions.
- *
- * If the single-stepped instruction was a call, the return address that
- * is atop the stack is the address following the copied instruction. We
- * need to make it the address following the original instruction (FIX_CALL).
- *
- * If the original instruction was a rip-relative instruction such as
- * "movl %edx,0xnnnn(%rip)", we have instead executed an equivalent
- * instruction using a scratch register -- e.g., "movl %edx,(%rax)".
- * We need to restore the contents of the scratch register and adjust
- * the ip, keeping in mind that the instruction we executed is 4 bytes
- * shorter than the original instruction (since we squeezed out the offset
- * field). (FIX_RIP_AX or FIX_RIP_CX)
*/
int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
- struct uprobe_task *utask;
- long correction;
- int result = 0;
+ struct uprobe_task *utask = current->utask;
+ bool send_sigtrap = utask->autask.saved_tf;
+ int err = 0;
WARN_ON_ONCE(current->thread.trap_nr != UPROBE_TRAP_NR);
-
- utask = current->utask;
current->thread.trap_nr = utask->autask.saved_trap_nr;
- correction = (long)(utask->vaddr - utask->xol_vaddr);
- handle_riprel_post_xol(auprobe, regs, &correction);
- if (auprobe->fixups & UPROBE_FIX_IP)
- regs->ip += correction;
-
- if (auprobe->fixups & UPROBE_FIX_CALL)
- result = adjust_ret_addr(regs->sp, correction);
+ if (auprobe->ops->post_xol) {
+ err = auprobe->ops->post_xol(auprobe, regs);
+ if (err) {
+ /*
+ * Restore ->ip for restart or post mortem analysis.
+ * ->post_xol() must not return -ERESTART unless this
+ * is really possible.
+ */
+ regs->ip = utask->vaddr;
+ if (err == -ERESTART)
+ err = 0;
+ send_sigtrap = false;
+ }
+ }
/*
* arch_uprobe_pre_xol() doesn't save the state of TIF_BLOCKSTEP
* so we can get an extra SIGTRAP if we do not clear TF. We need
* to examine the opcode to make it right.
*/
- if (utask->autask.saved_tf)
+ if (send_sigtrap)
send_sig(SIGTRAP, current, 0);
- else if (!(auprobe->fixups & UPROBE_FIX_SETF))
+
+ if (!utask->autask.saved_tf)
regs->flags &= ~X86_EFLAGS_TF;
- return result;
+ return err;
}
/* callback routine for handling exceptions. */
@@ -652,41 +868,27 @@ int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val,
/*
* This function gets called when XOL instruction either gets trapped or
- * the thread has a fatal signal, so reset the instruction pointer to its
- * probed address.
+ * the thread has a fatal signal. Reset the instruction pointer to its
+ * probed address for the potential restart or for post mortem analysis.
*/
void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
struct uprobe_task *utask = current->utask;
- current->thread.trap_nr = utask->autask.saved_trap_nr;
- handle_riprel_post_xol(auprobe, regs, NULL);
- instruction_pointer_set(regs, utask->vaddr);
+ if (auprobe->ops->abort)
+ auprobe->ops->abort(auprobe, regs);
+ current->thread.trap_nr = utask->autask.saved_trap_nr;
+ regs->ip = utask->vaddr;
/* clear TF if it was set by us in arch_uprobe_pre_xol() */
if (!utask->autask.saved_tf)
regs->flags &= ~X86_EFLAGS_TF;
}
-/*
- * Skip these instructions as per the currently known x86 ISA.
- * rep=0x66*; nop=0x90
- */
static bool __skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
- int i;
-
- for (i = 0; i < MAX_UINSN_BYTES; i++) {
- if (auprobe->insn[i] == 0x66)
- continue;
-
- if (auprobe->insn[i] == 0x90) {
- regs->ip += i + 1;
- return true;
- }
-
- break;
- }
+ if (auprobe->ops->emulate)
+ return auprobe->ops->emulate(auprobe, regs);
return false;
}
@@ -701,23 +903,21 @@ bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
unsigned long
arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs)
{
- int rasize, ncopied;
+ int rasize = sizeof_long(), nleft;
unsigned long orig_ret_vaddr = 0; /* clear high bits for 32-bit apps */
- rasize = is_ia32_task() ? 4 : 8;
- ncopied = copy_from_user(&orig_ret_vaddr, (void __user *)regs->sp, rasize);
- if (unlikely(ncopied))
+ if (copy_from_user(&orig_ret_vaddr, (void __user *)regs->sp, rasize))
return -1;
/* check whether address has been already hijacked */
if (orig_ret_vaddr == trampoline_vaddr)
return orig_ret_vaddr;
- ncopied = copy_to_user((void __user *)regs->sp, &trampoline_vaddr, rasize);
- if (likely(!ncopied))
+ nleft = copy_to_user((void __user *)regs->sp, &trampoline_vaddr, rasize);
+ if (likely(!nleft))
return orig_ret_vaddr;
- if (ncopied != rasize) {
+ if (nleft != rasize) {
pr_err("uprobe: return address clobbered: pid=%d, %%sp=%#lx, "
"%%ip=%#lx\n", current->pid, regs->sp, regs->ip);
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 10c4f3006af..49edf2dd361 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -147,7 +147,6 @@ SECTIONS
_edata = .;
} :data
-#ifdef CONFIG_X86_64
. = ALIGN(PAGE_SIZE);
__vvar_page = .;
@@ -165,12 +164,15 @@ SECTIONS
#undef __VVAR_KERNEL_LDS
#undef EMIT_VVAR
+ /*
+ * Pad the rest of the page with zeros. Otherwise the loader
+ * can leave garbage here.
+ */
+ . = __vvar_beginning_hack + PAGE_SIZE;
} :data
. = ALIGN(__vvar_page + PAGE_SIZE, PAGE_SIZE);
-#endif /* CONFIG_X86_64 */
-
/* Init code and data - will be freed after init */
. = ALIGN(PAGE_SIZE);
.init.begin : AT(ADDR(.init.begin) - LOAD_OFFSET) {
@@ -199,6 +201,15 @@ SECTIONS
__x86_cpu_dev_end = .;
}
+#ifdef CONFIG_X86_INTEL_MID
+ .x86_intel_mid_dev.init : AT(ADDR(.x86_intel_mid_dev.init) - \
+ LOAD_OFFSET) {
+ __x86_intel_mid_dev_start = .;
+ *(.x86_intel_mid_dev.init)
+ __x86_intel_mid_dev_end = .;
+ }
+#endif
+
/*
* start address and size of operations which during runtime
* can be patched with virtualization friendly instructions or
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index 992f890283e..b99b9ad8540 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -26,6 +26,9 @@
#define TOPOLOGY_REGISTER_OFFSET 0x10
+/* Flag below is initialized once during vSMP PCI initialization. */
+static int irq_routing_comply = 1;
+
#if defined CONFIG_PCI && defined CONFIG_PARAVIRT
/*
* Interrupt control on vSMPowered systems:
@@ -33,7 +36,7 @@
* and vice versa.
*/
-static unsigned long vsmp_save_fl(void)
+asmlinkage __visible unsigned long vsmp_save_fl(void)
{
unsigned long flags = native_save_fl();
@@ -43,7 +46,7 @@ static unsigned long vsmp_save_fl(void)
}
PV_CALLEE_SAVE_REGS_THUNK(vsmp_save_fl);
-static void vsmp_restore_fl(unsigned long flags)
+__visible void vsmp_restore_fl(unsigned long flags)
{
if (flags & X86_EFLAGS_IF)
flags &= ~X86_EFLAGS_AC;
@@ -53,7 +56,7 @@ static void vsmp_restore_fl(unsigned long flags)
}
PV_CALLEE_SAVE_REGS_THUNK(vsmp_restore_fl);
-static void vsmp_irq_disable(void)
+asmlinkage __visible void vsmp_irq_disable(void)
{
unsigned long flags = native_save_fl();
@@ -61,7 +64,7 @@ static void vsmp_irq_disable(void)
}
PV_CALLEE_SAVE_REGS_THUNK(vsmp_irq_disable);
-static void vsmp_irq_enable(void)
+asmlinkage __visible void vsmp_irq_enable(void)
{
unsigned long flags = native_save_fl();
@@ -101,6 +104,10 @@ static void __init set_vsmp_pv_ops(void)
#ifdef CONFIG_SMP
if (cap & ctl & BIT(8)) {
ctl &= ~BIT(8);
+
+ /* Interrupt routing set to ignore */
+ irq_routing_comply = 0;
+
#ifdef CONFIG_PROC_FS
/* Don't let users change irq affinity via procfs */
no_irq_affinity = 1;
@@ -218,7 +225,9 @@ static void vsmp_apic_post_init(void)
{
/* need to update phys_pkg_id */
apic->phys_pkg_id = apicid_phys_pkg_id;
- apic->vector_allocation_domain = fill_vector_allocation_domain;
+
+ if (!irq_routing_comply)
+ apic->vector_allocation_domain = fill_vector_allocation_domain;
}
void __init vsmp_init(void)
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 1f96f9347ed..ea5b5709aa7 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -47,14 +47,12 @@
#include <asm/segment.h>
#include <asm/desc.h>
#include <asm/topology.h>
-#include <asm/vgtod.h>
#include <asm/traps.h>
#define CREATE_TRACE_POINTS
#include "vsyscall_trace.h"
DEFINE_VVAR(int, vgetcpu_mode);
-DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data);
static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE;
@@ -77,48 +75,6 @@ static int __init vsyscall_setup(char *str)
}
early_param("vsyscall", vsyscall_setup);
-void update_vsyscall_tz(void)
-{
- vsyscall_gtod_data.sys_tz = sys_tz;
-}
-
-void update_vsyscall(struct timekeeper *tk)
-{
- struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data;
-
- write_seqcount_begin(&vdata->seq);
-
- /* copy vsyscall data */
- vdata->clock.vclock_mode = tk->clock->archdata.vclock_mode;
- vdata->clock.cycle_last = tk->clock->cycle_last;
- vdata->clock.mask = tk->clock->mask;
- vdata->clock.mult = tk->mult;
- vdata->clock.shift = tk->shift;
-
- vdata->wall_time_sec = tk->xtime_sec;
- vdata->wall_time_snsec = tk->xtime_nsec;
-
- vdata->monotonic_time_sec = tk->xtime_sec
- + tk->wall_to_monotonic.tv_sec;
- vdata->monotonic_time_snsec = tk->xtime_nsec
- + (tk->wall_to_monotonic.tv_nsec
- << tk->shift);
- while (vdata->monotonic_time_snsec >=
- (((u64)NSEC_PER_SEC) << tk->shift)) {
- vdata->monotonic_time_snsec -=
- ((u64)NSEC_PER_SEC) << tk->shift;
- vdata->monotonic_time_sec++;
- }
-
- vdata->wall_time_coarse.tv_sec = tk->xtime_sec;
- vdata->wall_time_coarse.tv_nsec = (long)(tk->xtime_nsec >> tk->shift);
-
- vdata->monotonic_time_coarse = timespec_add(vdata->wall_time_coarse,
- tk->wall_to_monotonic);
-
- write_seqcount_end(&vdata->seq);
-}
-
static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
const char *message)
{
@@ -135,7 +91,7 @@ static int addr_to_vsyscall_nr(unsigned long addr)
{
int nr;
- if ((addr & ~0xC00UL) != VSYSCALL_START)
+ if ((addr & ~0xC00UL) != VSYSCALL_ADDR)
return -EINVAL;
nr = (addr & 0xC00UL) >> 10;
@@ -374,28 +330,24 @@ void __init map_vsyscall(void)
{
extern char __vsyscall_page;
unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page);
- extern char __vvar_page;
- unsigned long physaddr_vvar_page = __pa_symbol(&__vvar_page);
- __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_vsyscall,
+ __set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall,
vsyscall_mode == NATIVE
? PAGE_KERNEL_VSYSCALL
: PAGE_KERNEL_VVAR);
- BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_FIRST_PAGE) !=
- (unsigned long)VSYSCALL_START);
-
- __set_fixmap(VVAR_PAGE, physaddr_vvar_page, PAGE_KERNEL_VVAR);
- BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) !=
- (unsigned long)VVAR_ADDRESS);
+ BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=
+ (unsigned long)VSYSCALL_ADDR);
}
static int __init vsyscall_init(void)
{
- BUG_ON(VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE));
+ cpu_notifier_register_begin();
on_each_cpu(cpu_vsyscall_init, NULL, 1);
/* notifier priority > KVM */
- hotcpu_notifier(cpu_vsyscall_notifier, 30);
+ __hotcpu_notifier(cpu_vsyscall_notifier, 30);
+
+ cpu_notifier_register_done();
return 0;
}
diff --git a/arch/x86/kernel/vsyscall_gtod.c b/arch/x86/kernel/vsyscall_gtod.c
new file mode 100644
index 00000000000..9531fbb123b
--- /dev/null
+++ b/arch/x86/kernel/vsyscall_gtod.c
@@ -0,0 +1,69 @@
+/*
+ * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
+ * Copyright 2003 Andi Kleen, SuSE Labs.
+ *
+ * Modified for x86 32 bit architecture by
+ * Stefani Seibold <stefani@seibold.net>
+ * sponsored by Rohde & Schwarz GmbH & Co. KG Munich/Germany
+ *
+ * Thanks to hpa@transmeta.com for some useful hint.
+ * Special thanks to Ingo Molnar for his early experience with
+ * a different vsyscall implementation for Linux/IA32 and for the name.
+ *
+ */
+
+#include <linux/timekeeper_internal.h>
+#include <asm/vgtod.h>
+#include <asm/vvar.h>
+
+DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data);
+
+void update_vsyscall_tz(void)
+{
+ vsyscall_gtod_data.tz_minuteswest = sys_tz.tz_minuteswest;
+ vsyscall_gtod_data.tz_dsttime = sys_tz.tz_dsttime;
+}
+
+void update_vsyscall(struct timekeeper *tk)
+{
+ struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data;
+
+ gtod_write_begin(vdata);
+
+ /* copy vsyscall data */
+ vdata->vclock_mode = tk->clock->archdata.vclock_mode;
+ vdata->cycle_last = tk->clock->cycle_last;
+ vdata->mask = tk->clock->mask;
+ vdata->mult = tk->mult;
+ vdata->shift = tk->shift;
+
+ vdata->wall_time_sec = tk->xtime_sec;
+ vdata->wall_time_snsec = tk->xtime_nsec;
+
+ vdata->monotonic_time_sec = tk->xtime_sec
+ + tk->wall_to_monotonic.tv_sec;
+ vdata->monotonic_time_snsec = tk->xtime_nsec
+ + ((u64)tk->wall_to_monotonic.tv_nsec
+ << tk->shift);
+ while (vdata->monotonic_time_snsec >=
+ (((u64)NSEC_PER_SEC) << tk->shift)) {
+ vdata->monotonic_time_snsec -=
+ ((u64)NSEC_PER_SEC) << tk->shift;
+ vdata->monotonic_time_sec++;
+ }
+
+ vdata->wall_time_coarse_sec = tk->xtime_sec;
+ vdata->wall_time_coarse_nsec = (long)(tk->xtime_nsec >> tk->shift);
+
+ vdata->monotonic_time_coarse_sec =
+ vdata->wall_time_coarse_sec + tk->wall_to_monotonic.tv_sec;
+ vdata->monotonic_time_coarse_nsec =
+ vdata->wall_time_coarse_nsec + tk->wall_to_monotonic.tv_nsec;
+
+ while (vdata->monotonic_time_coarse_nsec >= NSEC_PER_SEC) {
+ vdata->monotonic_time_coarse_nsec -= NSEC_PER_SEC;
+ vdata->monotonic_time_coarse_sec++;
+ }
+
+ gtod_write_end(vdata);
+}
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index b014d9414d0..040681928e9 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -66,3 +66,10 @@ EXPORT_SYMBOL(empty_zero_page);
#ifndef CONFIG_PARAVIRT
EXPORT_SYMBOL(native_load_gs_index);
#endif
+
+#ifdef CONFIG_PREEMPT
+EXPORT_SYMBOL(___preempt_schedule);
+#ifdef CONFIG_CONTEXT_TRACKING
+EXPORT_SYMBOL(___preempt_schedule_context);
+#endif
+#endif
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 8ce0072cd70..e48b674639c 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -116,6 +116,8 @@ struct x86_msi_ops x86_msi = {
.teardown_msi_irqs = default_teardown_msi_irqs,
.restore_msi_irqs = default_restore_msi_irqs,
.setup_hpet_msi = default_setup_hpet_msi,
+ .msi_mask_irq = default_msi_mask_irq,
+ .msix_mask_irq = default_msix_mask_irq,
};
/* MSI arch specific hooks */
@@ -134,9 +136,17 @@ void arch_teardown_msi_irq(unsigned int irq)
x86_msi.teardown_msi_irq(irq);
}
-void arch_restore_msi_irqs(struct pci_dev *dev, int irq)
+void arch_restore_msi_irqs(struct pci_dev *dev)
{
- x86_msi.restore_msi_irqs(dev, irq);
+ x86_msi.restore_msi_irqs(dev);
+}
+u32 arch_msi_mask_irq(struct msi_desc *desc, u32 mask, u32 flag)
+{
+ return x86_msi.msi_mask_irq(desc, mask, flag);
+}
+u32 arch_msix_mask_irq(struct msi_desc *desc, u32 flag)
+{
+ return x86_msi.msix_mask_irq(desc, flag);
}
#endif
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 422fd822347..a4b451c6add 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -562,6 +562,16 @@ static void __init xstate_enable_boot_cpu(void)
if (cpu_has_xsaveopt && eagerfpu != DISABLE)
eagerfpu = ENABLE;
+ if (pcntxt_mask & XSTATE_EAGER) {
+ if (eagerfpu == DISABLE) {
+ pr_err("eagerfpu not present, disabling some xstate features: 0x%llx\n",
+ pcntxt_mask & XSTATE_EAGER);
+ pcntxt_mask &= ~XSTATE_EAGER;
+ } else {
+ eagerfpu = ENABLE;
+ }
+ }
+
pr_info("enabled xstate_bv 0x%llx, cntxt size 0x%x\n",
pcntxt_mask, xstate_size);
}