diff options
Diffstat (limited to 'arch/x86')
93 files changed, 1652 insertions, 1362 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cea0cd9a316..b8676498d8d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -25,6 +25,7 @@ config X86 select HAVE_IDE select HAVE_OPROFILE select HAVE_PERF_EVENTS if (!M386 && !M486) + select HAVE_IRQ_WORK select HAVE_IOREMAP_PROT select HAVE_KPROBES select ARCH_WANT_OPTIONAL_GPIOLIB @@ -33,6 +34,7 @@ config X86 select HAVE_KRETPROBES select HAVE_OPTPROBES select HAVE_FTRACE_MCOUNT_RECORD + select HAVE_C_RECORDMCOUNT select HAVE_DYNAMIC_FTRACE select HAVE_FUNCTION_TRACER select HAVE_FUNCTION_GRAPH_TRACER @@ -59,6 +61,8 @@ config X86 select ANON_INODES select HAVE_ARCH_KMEMCHECK select HAVE_USER_RETURN_NOTIFIER + select HAVE_ARCH_JUMP_LABEL + select HAVE_TEXT_POKE_SMP config INSTRUCTION_DECODER def_bool (KPROBES || PERF_EVENTS) @@ -670,7 +674,7 @@ config GART_IOMMU bool "GART IOMMU support" if EMBEDDED default y select SWIOTLB - depends on X86_64 && PCI && K8_NB + depends on X86_64 && PCI && AMD_NB ---help--- Support for full DMA access of devices with 32bit memory access only on systems with more than 3GB. This is usually needed for USB, @@ -795,6 +799,17 @@ config SCHED_MC making when dealing with multi-core CPU chips at a cost of slightly increased overhead in some places. If unsure say N here. +config IRQ_TIME_ACCOUNTING + bool "Fine granularity task level IRQ time accounting" + default n + ---help--- + Select this option to enable fine granularity task irq time + accounting. This is done by reading a timestamp on each + transitions between softirq and hardirq state, so there can be a + small performance impact. + + If in doubt, say N here. + source "kernel/Kconfig.preempt" config X86_UP_APIC @@ -1326,25 +1341,34 @@ config X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK Set whether the default state of memory_corruption_check is on or off. -config X86_RESERVE_LOW_64K - bool "Reserve low 64K of RAM on AMI/Phoenix BIOSen" - default y +config X86_RESERVE_LOW + int "Amount of low memory, in kilobytes, to reserve for the BIOS" + default 64 + range 4 640 ---help--- - Reserve the first 64K of physical RAM on BIOSes that are known - to potentially corrupt that memory range. A numbers of BIOSes are - known to utilize this area during suspend/resume, so it must not - be used by the kernel. + Specify the amount of low memory to reserve for the BIOS. - Set this to N if you are absolutely sure that you trust the BIOS - to get all its memory reservations and usages right. + The first page contains BIOS data structures that the kernel + must not use, so that page must always be reserved. - If you have doubts about the BIOS (e.g. suspend/resume does not - work or there's kernel crashes after certain hardware hotplug - events) and it's not AMI or Phoenix, then you might want to enable - X86_CHECK_BIOS_CORRUPTION=y to allow the kernel to check typical - corruption patterns. + By default we reserve the first 64K of physical RAM, as a + number of BIOSes are known to corrupt that memory range + during events such as suspend/resume or monitor cable + insertion, so it must not be used by the kernel. - Say Y if unsure. + You can set this to 4 if you are absolutely sure that you + trust the BIOS to get all its memory reservations and usages + right. If you know your BIOS have problems beyond the + default 64K area, you can set this to 640 to avoid using the + entire low memory range. + + If you have doubts about the BIOS (e.g. suspend/resume does + not work or there's kernel crashes after certain hardware + hotplug events) then you might want to enable + X86_CHECK_BIOS_CORRUPTION=y to allow the kernel to check + typical corruption patterns. + + Leave this to the default value of 64 if you are unsure. config MATH_EMULATION bool @@ -2076,7 +2100,7 @@ config OLPC_OPENFIRMWARE endif # X86_32 -config K8_NB +config AMD_NB def_bool y depends on CPU_SUP_AMD && PCI @@ -2125,6 +2149,10 @@ config HAVE_ATOMIC_IOMAP def_bool y depends on X86_32 +config HAVE_TEXT_POKE_SMP + bool + select STOP_MACHINE if SMP + source "net/Kconfig" source "drivers/Kconfig" diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 8aa1b59b907..b02e509072a 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -74,7 +74,7 @@ endif ifdef CONFIG_CC_STACKPROTECTOR cc_has_sp := $(srctree)/scripts/gcc-x86_$(BITS)-has-stack-protector.sh - ifeq ($(shell $(CONFIG_SHELL) $(cc_has_sp) $(CC) $(biarch)),y) + ifeq ($(shell $(CONFIG_SHELL) $(cc_has_sp) $(CC) $(KBUILD_CPPFLAGS) $(biarch)),y) stackp-y := -fstack-protector KBUILD_CFLAGS += $(stackp-y) else @@ -96,8 +96,12 @@ cfi := $(call as-instr,.cfi_startproc\n.cfi_rel_offset $(sp-y)$(comma)0\n.cfi_en # is .cfi_signal_frame supported too? cfi-sigframe := $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1) cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTIONS=1) -KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) -KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) + +# does binutils support specific instructions? +asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1) + +KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) +KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) LDFLAGS := -m elf_$(UTS_MACHINE) diff --git a/arch/x86/boot/early_serial_console.c b/arch/x86/boot/early_serial_console.c index 030f4b93e25..5df2869c874 100644 --- a/arch/x86/boot/early_serial_console.c +++ b/arch/x86/boot/early_serial_console.c @@ -58,7 +58,19 @@ static void parse_earlyprintk(void) if (arg[pos] == ',') pos++; - if (!strncmp(arg, "ttyS", 4)) { + /* + * make sure we have + * "serial,0x3f8,115200" + * "serial,ttyS0,115200" + * "ttyS0,115200" + */ + if (pos == 7 && !strncmp(arg + pos, "0x", 2)) { + port = simple_strtoull(arg + pos, &e, 16); + if (port == 0 || arg + pos == e) + port = DEFAULT_SERIAL_PORT; + else + pos = e - arg; + } else if (!strncmp(arg + pos, "ttyS", 4)) { static const int bases[] = { 0x3f8, 0x2f8 }; int idx = 0; diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index 0350311906a..2d93bdbc9ac 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -34,7 +34,7 @@ #include <asm/ia32.h> #undef WARN_OLD -#undef CORE_DUMP /* probably broken */ +#undef CORE_DUMP /* definitely broken */ static int load_aout_binary(struct linux_binprm *, struct pt_regs *regs); static int load_aout_library(struct file *); @@ -131,21 +131,15 @@ static void set_brk(unsigned long start, unsigned long end) * macros to write out all the necessary info. */ -static int dump_write(struct file *file, const void *addr, int nr) -{ - return file->f_op->write(file, addr, nr, &file->f_pos) == nr; -} +#include <linux/coredump.h> #define DUMP_WRITE(addr, nr) \ if (!dump_write(file, (void *)(addr), (nr))) \ goto end_coredump; -#define DUMP_SEEK(offset) \ - if (file->f_op->llseek) { \ - if (file->f_op->llseek(file, (offset), 0) != (offset)) \ - goto end_coredump; \ - } else \ - file->f_pos = (offset) +#define DUMP_SEEK(offset) \ + if (!dump_seek(file, offset)) \ + goto end_coredump; #define START_DATA() (u.u_tsize << PAGE_SHIFT) #define START_STACK(u) (u.start_stack) @@ -217,12 +211,6 @@ static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file, dump_size = dump.u_ssize << PAGE_SHIFT; DUMP_WRITE(dump_start, dump_size); } - /* - * Finally dump the task struct. Not be used by gdb, but - * could be useful - */ - set_fs(KERNEL_DS); - DUMP_WRITE(current, sizeof(*current)); end_coredump: set_fs(fs); return has_dumped; diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index b86feabed69..518bb99c339 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -50,7 +50,12 @@ /* * Reload arg registers from stack in case ptrace changed them. * We don't reload %eax because syscall_trace_enter() returned - * the value it wants us to use in the table lookup. + * the %rax value we should see. Instead, we just truncate that + * value to 32 bits again as we did on entry from user mode. + * If it's a new value set by user_regset during entry tracing, + * this matches the normal truncation of the user-mode value. + * If it's -1 to make us punt the syscall, then (u32)-1 is still + * an appropriately invalid value. */ .macro LOAD_ARGS32 offset, _r9=0 .if \_r9 @@ -60,6 +65,7 @@ movl \offset+48(%rsp),%edx movl \offset+56(%rsp),%esi movl \offset+64(%rsp),%edi + movl %eax,%eax /* zero extension */ .endm .macro CFI_STARTPROC32 simple @@ -153,7 +159,7 @@ ENTRY(ia32_sysenter_target) testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10) CFI_REMEMBER_STATE jnz sysenter_tracesys - cmpl $(IA32_NR_syscalls-1),%eax + cmpq $(IA32_NR_syscalls-1),%rax ja ia32_badsys sysenter_do_call: IA32_ARG_FIXUP @@ -195,7 +201,7 @@ sysexit_from_sys_call: movl $AUDIT_ARCH_I386,%edi /* 1st arg: audit arch */ call audit_syscall_entry movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall number */ - cmpl $(IA32_NR_syscalls-1),%eax + cmpq $(IA32_NR_syscalls-1),%rax ja ia32_badsys movl %ebx,%edi /* reload 1st syscall arg */ movl RCX-ARGOFFSET(%rsp),%esi /* reload 2nd syscall arg */ @@ -248,7 +254,7 @@ sysenter_tracesys: call syscall_trace_enter LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ RESTORE_REST - cmpl $(IA32_NR_syscalls-1),%eax + cmpq $(IA32_NR_syscalls-1),%rax ja int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */ jmp sysenter_do_call CFI_ENDPROC @@ -314,7 +320,7 @@ ENTRY(ia32_cstar_target) testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10) CFI_REMEMBER_STATE jnz cstar_tracesys - cmpl $IA32_NR_syscalls-1,%eax + cmpq $IA32_NR_syscalls-1,%rax ja ia32_badsys cstar_do_call: IA32_ARG_FIXUP 1 @@ -367,7 +373,7 @@ cstar_tracesys: LOAD_ARGS32 ARGOFFSET, 1 /* reload args from stack in case ptrace changed it */ RESTORE_REST xchgl %ebp,%r9d - cmpl $(IA32_NR_syscalls-1),%eax + cmpq $(IA32_NR_syscalls-1),%rax ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */ jmp cstar_do_call END(ia32_cstar_target) @@ -425,7 +431,7 @@ ENTRY(ia32_syscall) orl $TS_COMPAT,TI_status(%r10) testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10) jnz ia32_tracesys - cmpl $(IA32_NR_syscalls-1),%eax + cmpq $(IA32_NR_syscalls-1),%rax ja ia32_badsys ia32_do_call: IA32_ARG_FIXUP @@ -444,7 +450,7 @@ ia32_tracesys: call syscall_trace_enter LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ RESTORE_REST - cmpl $(IA32_NR_syscalls-1),%eax + cmpq $(IA32_NR_syscalls-1),%rax ja int_ret_from_sys_call /* ia32_tracesys has set RAX(%rsp) */ jmp ia32_do_call END(ia32_syscall) diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index bc6abb7bc7e..76561d20ea2 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -4,6 +4,7 @@ #include <linux/types.h> #include <linux/stddef.h> #include <linux/stringify.h> +#include <linux/jump_label.h> #include <asm/asm.h> /* @@ -160,6 +161,8 @@ static inline void apply_paravirt(struct paravirt_patch_site *start, #define __parainstructions_end NULL #endif +extern void *text_poke_early(void *addr, const void *opcode, size_t len); + /* * Clear and restore the kernel write-protection flag on the local CPU. * Allows the kernel to edit read-only pages. @@ -180,4 +183,12 @@ static inline void apply_paravirt(struct paravirt_patch_site *start, extern void *text_poke(void *addr, const void *opcode, size_t len); extern void *text_poke_smp(void *addr, const void *opcode, size_t len); +#if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL) +#define IDEAL_NOP_SIZE_5 5 +extern unsigned char ideal_nop5[IDEAL_NOP_SIZE_5]; +extern void arch_init_ideal_nop5(void); +#else +static inline void arch_init_ideal_nop5(void) {} +#endif + #endif /* _ASM_X86_ALTERNATIVE_H */ diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h index 5af2982133b..f16a2caca1e 100644 --- a/arch/x86/include/asm/amd_iommu.h +++ b/arch/x86/include/asm/amd_iommu.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2007-2009 Advanced Micro Devices, Inc. + * Copyright (C) 2007-2010 Advanced Micro Devices, Inc. * Author: Joerg Roedel <joerg.roedel@amd.com> * Leo Duran <leo.duran@amd.com> * diff --git a/arch/x86/include/asm/amd_iommu_proto.h b/arch/x86/include/asm/amd_iommu_proto.h index d2544f1d705..916bc8111a0 100644 --- a/arch/x86/include/asm/amd_iommu_proto.h +++ b/arch/x86/include/asm/amd_iommu_proto.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2009 Advanced Micro Devices, Inc. + * Copyright (C) 2009-2010 Advanced Micro Devices, Inc. * Author: Joerg Roedel <joerg.roedel@amd.com> * * This program is free software; you can redistribute it and/or modify it @@ -38,4 +38,10 @@ static inline void amd_iommu_stats_init(void) { } #endif /* !CONFIG_AMD_IOMMU_STATS */ +static inline bool is_rd890_iommu(struct pci_dev *pdev) +{ + return (pdev->vendor == PCI_VENDOR_ID_ATI) && + (pdev->device == PCI_DEVICE_ID_RD890_IOMMU); +} + #endif /* _ASM_X86_AMD_IOMMU_PROTO_H */ diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h index 7014e88bc77..e3509fc303b 100644 --- a/arch/x86/include/asm/amd_iommu_types.h +++ b/arch/x86/include/asm/amd_iommu_types.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2007-2009 Advanced Micro Devices, Inc. + * Copyright (C) 2007-2010 Advanced Micro Devices, Inc. * Author: Joerg Roedel <joerg.roedel@amd.com> * Leo Duran <leo.duran@amd.com> * @@ -368,6 +368,9 @@ struct amd_iommu { /* capabilities of that IOMMU read from ACPI */ u32 cap; + /* flags read from acpi table */ + u8 acpi_flags; + /* * Capability pointer. There could be more than one IOMMU per PCI * device function if there are more than one AMD IOMMU capability @@ -411,6 +414,24 @@ struct amd_iommu { /* default dma_ops domain for that IOMMU */ struct dma_ops_domain *default_dom; + + /* + * We can't rely on the BIOS to restore all values on reinit, so we + * need to stash them + */ + + /* The iommu BAR */ + u32 stored_addr_lo; + u32 stored_addr_hi; + + /* + * Each iommu has 6 l1s, each of which is documented as having 0x12 + * registers + */ + u32 stored_l1[6][0x12]; + + /* The l2 indirect registers */ + u32 stored_l2[0x83]; }; /* diff --git a/arch/x86/include/asm/k8.h b/arch/x86/include/asm/amd_nb.h index af00bd1d208..c8517f81b21 100644 --- a/arch/x86/include/asm/k8.h +++ b/arch/x86/include/asm/amd_nb.h @@ -1,5 +1,5 @@ -#ifndef _ASM_X86_K8_H -#define _ASM_X86_K8_H +#ifndef _ASM_X86_AMD_NB_H +#define _ASM_X86_AMD_NB_H #include <linux/pci.h> @@ -7,24 +7,27 @@ extern struct pci_device_id k8_nb_ids[]; struct bootnode; extern int early_is_k8_nb(u32 value); -extern struct pci_dev **k8_northbridges; -extern int num_k8_northbridges; extern int cache_k8_northbridges(void); extern void k8_flush_garts(void); extern int k8_get_nodes(struct bootnode *nodes); extern int k8_numa_init(unsigned long start_pfn, unsigned long end_pfn); extern int k8_scan_nodes(void); -#ifdef CONFIG_K8_NB -extern int num_k8_northbridges; +struct k8_northbridge_info { + u16 num; + u8 gart_supported; + struct pci_dev **nb_misc; +}; +extern struct k8_northbridge_info k8_northbridges; + +#ifdef CONFIG_AMD_NB static inline struct pci_dev *node_to_k8_nb_misc(int node) { - return (node < num_k8_northbridges) ? k8_northbridges[node] : NULL; + return (node < k8_northbridges.num) ? k8_northbridges.nb_misc[node] : NULL; } #else -#define num_k8_northbridges 0 static inline struct pci_dev *node_to_k8_nb_misc(int node) { @@ -33,4 +36,4 @@ static inline struct pci_dev *node_to_k8_nb_misc(int node) #endif -#endif /* _ASM_X86_K8_H */ +#endif /* _ASM_X86_AMD_NB_H */ diff --git a/arch/x86/include/asm/apb_timer.h b/arch/x86/include/asm/apb_timer.h index a69b1ac9eaf..2fefa501d3b 100644 --- a/arch/x86/include/asm/apb_timer.h +++ b/arch/x86/include/asm/apb_timer.h @@ -54,7 +54,6 @@ extern struct clock_event_device *global_clock_event; extern unsigned long apbt_quick_calibrate(void); extern int arch_setup_apbt_irqs(int irq, int trigger, int mask, int cpu); extern void apbt_setup_secondary_clock(void); -extern unsigned int boot_cpu_id; extern struct sfi_timer_table_entry *sfi_get_mtmr(int hint); extern void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr); diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 545776efeb1..bafd80defa4 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -309,7 +309,7 @@ static inline int test_and_change_bit(int nr, volatile unsigned long *addr) static __always_inline int constant_test_bit(unsigned int nr, const volatile unsigned long *addr) { return ((1UL << (nr % BITS_PER_LONG)) & - (((unsigned long *)addr)[nr / BITS_PER_LONG])) != 0; + (addr[nr / BITS_PER_LONG])) != 0; } static inline int variable_test_bit(int nr, volatile const unsigned long *addr) diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index 306160e58b4..1d9cd27c292 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -205,7 +205,7 @@ static inline compat_uptr_t ptr_to_compat(void __user *uptr) return (u32)(unsigned long)uptr; } -static inline void __user *compat_alloc_user_space(long len) +static inline void __user *arch_compat_alloc_user_space(long len) { struct pt_regs *regs = task_pt_regs(current); return (void __user *)regs->sp - len; diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h index b185091bf19..4fab24de26b 100644 --- a/arch/x86/include/asm/cpu.h +++ b/arch/x86/include/asm/cpu.h @@ -32,6 +32,5 @@ extern void arch_unregister_cpu(int); DECLARE_PER_CPU(int, cpu_state); -extern unsigned int boot_cpu_id; #endif /* _ASM_X86_CPU_H */ diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 781a50b29a4..220e2ea08e8 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -152,10 +152,14 @@ #define X86_FEATURE_3DNOWPREFETCH (6*32+ 8) /* 3DNow prefetch instructions */ #define X86_FEATURE_OSVW (6*32+ 9) /* OS Visible Workaround */ #define X86_FEATURE_IBS (6*32+10) /* Instruction Based Sampling */ -#define X86_FEATURE_SSE5 (6*32+11) /* SSE-5 */ +#define X86_FEATURE_XOP (6*32+11) /* extended AVX instructions */ #define X86_FEATURE_SKINIT (6*32+12) /* SKINIT/STGI instructions */ #define X86_FEATURE_WDT (6*32+13) /* Watchdog timer */ +#define X86_FEATURE_LWP (6*32+15) /* Light Weight Profiling */ +#define X86_FEATURE_FMA4 (6*32+16) /* 4 operands MAC instructions */ #define X86_FEATURE_NODEID_MSR (6*32+19) /* NodeId MSR */ +#define X86_FEATURE_TBM (6*32+21) /* trailing bit manipulations */ +#define X86_FEATURE_TOPOEXT (6*32+22) /* topology extensions CPUID leafs */ /* * Auxiliary flags: Linux defined - For features scattered in various @@ -168,6 +172,7 @@ #define X86_FEATURE_XSAVEOPT (7*32+ 4) /* Optimized Xsave */ #define X86_FEATURE_PLN (7*32+ 5) /* Intel Power Limit Notification */ #define X86_FEATURE_PTS (7*32+ 6) /* Intel Package Thermal Status */ +#define X86_FEATURE_DTS (7*32+ 7) /* Digital Thermal Sensor */ /* Virtualization flags: Linux defined, word 8 */ #define X86_FEATURE_TPR_SHADOW (8*32+ 0) /* Intel TPR Shadow */ @@ -179,6 +184,13 @@ #define X86_FEATURE_LBRV (8*32+ 6) /* AMD LBR Virtualization support */ #define X86_FEATURE_SVML (8*32+ 7) /* "svm_lock" AMD SVM locking MSR */ #define X86_FEATURE_NRIPS (8*32+ 8) /* "nrip_save" AMD SVM next_rip save */ +#define X86_FEATURE_TSCRATEMSR (8*32+ 9) /* "tsc_scale" AMD TSC scaling support */ +#define X86_FEATURE_VMCBCLEAN (8*32+10) /* "vmcb_clean" AMD VMCB clean bits support */ +#define X86_FEATURE_FLUSHBYASID (8*32+11) /* AMD flush-by-ASID support */ +#define X86_FEATURE_DECODEASSISTS (8*32+12) /* AMD Decode Assists support */ +#define X86_FEATURE_PAUSEFILTER (8*32+13) /* AMD filtered pause intercept */ +#define X86_FEATURE_PFTHRESHOLD (8*32+14) /* AMD pause filter threshold */ + /* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ #define X86_FEATURE_FSGSBASE (9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/ @@ -296,6 +308,7 @@ extern const char * const x86_power_flags[32]; #endif /* CONFIG_X86_64 */ +#if __GNUC__ >= 4 /* * Static testing of CPU features. Used the same as boot_cpu_has(). * These are only valid after alternatives have run, but will statically @@ -304,7 +317,7 @@ extern const char * const x86_power_flags[32]; */ static __always_inline __pure bool __static_cpu_has(u16 bit) { -#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5) +#if __GNUC__ > 4 || __GNUC_MINOR__ >= 5 asm goto("1: jmp %l[t_no]\n" "2:\n" ".section .altinstructions,\"a\"\n" @@ -345,7 +358,6 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) #endif } -#if __GNUC__ >= 4 #define static_cpu_has(bit) \ ( \ __builtin_constant_p(boot_cpu_has(bit)) ? \ diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h index 733f7e91e7a..32609919931 100644 --- a/arch/x86/include/asm/dwarf2.h +++ b/arch/x86/include/asm/dwarf2.h @@ -89,6 +89,16 @@ CFI_ADJUST_CFA_OFFSET -8 .endm + .macro pushfq_cfi + pushfq + CFI_ADJUST_CFA_OFFSET 8 + .endm + + .macro popfq_cfi + popfq + CFI_ADJUST_CFA_OFFSET -8 + .endm + .macro movq_cfi reg offset=0 movq %\reg, \offset(%rsp) CFI_REL_OFFSET \reg, \offset @@ -109,6 +119,16 @@ CFI_ADJUST_CFA_OFFSET -4 .endm + .macro pushfl_cfi + pushfl + CFI_ADJUST_CFA_OFFSET 4 + .endm + + .macro popfl_cfi + popfl + CFI_ADJUST_CFA_OFFSET -4 + .endm + .macro movl_cfi reg offset=0 movl %\reg, \offset(%esp) CFI_REL_OFFSET \reg, \offset diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index 8e8ec663a98..b8e96a18676 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -49,8 +49,8 @@ BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR) BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR) BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR) -#ifdef CONFIG_PERF_EVENTS -BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR) +#ifdef CONFIG_IRQ_WORK +BUILD_INTERRUPT(irq_work_interrupt, IRQ_WORK_VECTOR) #endif #ifdef CONFIG_X86_THERMAL_VECTOR diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h index 4ac5b0f33fc..bf357f9b25f 100644 --- a/arch/x86/include/asm/gart.h +++ b/arch/x86/include/asm/gart.h @@ -17,6 +17,7 @@ extern int fix_aperture; #define GARTEN (1<<0) #define DISGARTCPU (1<<4) #define DISGARTIO (1<<5) +#define DISTLBWALKPRB (1<<6) /* GART cache control register bits. */ #define INVGART (1<<0) @@ -27,7 +28,6 @@ extern int fix_aperture; #define AMD64_GARTAPERTUREBASE 0x94 #define AMD64_GARTTABLEBASE 0x98 #define AMD64_GARTCACHECTL 0x9c -#define AMD64_GARTEN (1<<0) #ifdef CONFIG_GART_IOMMU extern int gart_iommu_aperture; @@ -57,6 +57,19 @@ static inline void gart_iommu_hole_init(void) extern int agp_amd64_init(void); +static inline void gart_set_size_and_enable(struct pci_dev *dev, u32 order) +{ + u32 ctl; + + /* + * Don't enable translation but enable GART IO and CPU accesses. + * Also, set DISTLBWALKPRB since GART tables memory is UC. + */ + ctl = DISTLBWALKPRB | order << 1; + + pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl); +} + static inline void enable_gart_translation(struct pci_dev *dev, u64 addr) { u32 tmp, ctl; diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index aeab29aee61..55e4de613f0 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -14,7 +14,7 @@ typedef struct { #endif unsigned int x86_platform_ipis; /* arch dependent */ unsigned int apic_perf_irqs; - unsigned int apic_pending_irqs; + unsigned int apic_irq_work_irqs; #ifdef CONFIG_SMP unsigned int irq_resched_count; unsigned int irq_call_count; diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h index 004e6e25e91..1d5c08a1bdf 100644 --- a/arch/x86/include/asm/hpet.h +++ b/arch/x86/include/asm/hpet.h @@ -68,7 +68,6 @@ extern unsigned long force_hpet_address; extern u8 hpet_blockid; extern int hpet_force_user; extern u8 hpet_msi_disable; -extern u8 hpet_readback_cmp; extern int is_hpet_enabled(void); extern int hpet_enable(void); extern void hpet_disable(void); diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h index 528a11e8d3e..824ca07860d 100644 --- a/arch/x86/include/asm/hw_breakpoint.h +++ b/arch/x86/include/asm/hw_breakpoint.h @@ -20,7 +20,7 @@ struct arch_hw_breakpoint { #include <linux/list.h> /* Available HW breakpoint length encodings */ -#define X86_BREAKPOINT_LEN_X 0x00 +#define X86_BREAKPOINT_LEN_X 0x40 #define X86_BREAKPOINT_LEN_1 0x40 #define X86_BREAKPOINT_LEN_2 0x44 #define X86_BREAKPOINT_LEN_4 0x4c diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 46c0fe05f23..3a54a1ca1a0 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -29,7 +29,7 @@ extern void apic_timer_interrupt(void); extern void x86_platform_ipi(void); extern void error_interrupt(void); -extern void perf_pending_interrupt(void); +extern void irq_work_interrupt(void); extern void spurious_interrupt(void); extern void thermal_interrupt(void); diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index a73a8d5a5e6..4aa2bb3b242 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -55,6 +55,12 @@ extern int save_i387_xstate_ia32(void __user *buf); extern int restore_i387_xstate_ia32(void __user *buf); #endif +#ifdef CONFIG_MATH_EMULATION +extern void finit_soft_fpu(struct i387_soft_struct *soft); +#else +static inline void finit_soft_fpu(struct i387_soft_struct *soft) {} +#endif + #define X87_FSW_ES (1 << 7) /* Exception Summary */ static __always_inline __pure bool use_xsaveopt(void) @@ -67,6 +73,11 @@ static __always_inline __pure bool use_xsave(void) return static_cpu_has(X86_FEATURE_XSAVE); } +static __always_inline __pure bool use_fxsr(void) +{ + return static_cpu_has(X86_FEATURE_FXSR); +} + extern void __sanitize_i387_state(struct task_struct *); static inline void sanitize_i387_state(struct task_struct *tsk) @@ -77,19 +88,11 @@ static inline void sanitize_i387_state(struct task_struct *tsk) } #ifdef CONFIG_X86_64 - -/* Ignore delayed exceptions from user space */ -static inline void tolerant_fwait(void) -{ - asm volatile("1: fwait\n" - "2:\n" - _ASM_EXTABLE(1b, 2b)); -} - static inline int fxrstor_checking(struct i387_fxsave_struct *fx) { int err; + /* See comment in fxsave() below. */ asm volatile("1: rex64/fxrstor (%[fx])\n\t" "2:\n" ".section .fixup,\"ax\"\n" @@ -98,44 +101,10 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx) ".previous\n" _ASM_EXTABLE(1b, 3b) : [err] "=r" (err) -#if 0 /* See comment in fxsave() below. */ - : [fx] "r" (fx), "m" (*fx), "0" (0)); -#else - : [fx] "cdaSDb" (fx), "m" (*fx), "0" (0)); -#endif + : [fx] "R" (fx), "m" (*fx), "0" (0)); return err; } -/* AMD CPUs don't save/restore FDP/FIP/FOP unless an exception - is pending. Clear the x87 state here by setting it to fixed - values. The kernel data segment can be sometimes 0 and sometimes - new user value. Both should be ok. - Use the PDA as safe address because it should be already in L1. */ -static inline void fpu_clear(struct fpu *fpu) -{ - struct xsave_struct *xstate = &fpu->state->xsave; - struct i387_fxsave_struct *fx = &fpu->state->fxsave; - - /* - * xsave header may indicate the init state of the FP. - */ - if (use_xsave() && - !(xstate->xsave_hdr.xstate_bv & XSTATE_FP)) - return; - - if (unlikely(fx->swd & X87_FSW_ES)) - asm volatile("fnclex"); - alternative_input(ASM_NOP8 ASM_NOP2, - " emms\n" /* clear stack tags */ - " fildl %%gs:0", /* load to clear state */ - X86_FEATURE_FXSAVE_LEAK); -} - -static inline void clear_fpu_state(struct task_struct *tsk) -{ - fpu_clear(&tsk->thread.fpu); -} - static inline int fxsave_user(struct i387_fxsave_struct __user *fx) { int err; @@ -149,6 +118,7 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx) if (unlikely(err)) return -EFAULT; + /* See comment in fxsave() below. */ asm volatile("1: rex64/fxsave (%[fx])\n\t" "2:\n" ".section .fixup,\"ax\"\n" @@ -157,11 +127,7 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx) ".previous\n" _ASM_EXTABLE(1b, 3b) : [err] "=r" (err), "=m" (*fx) -#if 0 /* See comment in fxsave() below. */ - : [fx] "r" (fx), "0" (0)); -#else - : [fx] "cdaSDb" (fx), "0" (0)); -#endif + : [fx] "R" (fx), "0" (0)); if (unlikely(err) && __clear_user(fx, sizeof(struct i387_fxsave_struct))) err = -EFAULT; @@ -175,56 +141,29 @@ static inline void fpu_fxsave(struct fpu *fpu) uses any extended registers for addressing, a second REX prefix will be generated (to the assembler, rex64 followed by semicolon is a separate instruction), and hence the 64-bitness is lost. */ -#if 0 + +#ifdef CONFIG_AS_FXSAVEQ /* Using "fxsaveq %0" would be the ideal choice, but is only supported starting with gas 2.16. */ __asm__ __volatile__("fxsaveq %0" : "=m" (fpu->state->fxsave)); -#elif 0 +#else /* Using, as a workaround, the properly prefixed form below isn't accepted by any binutils version so far released, complaining that the same type of prefix is used twice if an extended register is - needed for addressing (fix submitted to mainline 2005-11-21). */ - __asm__ __volatile__("rex64/fxsave %0" - : "=m" (fpu->state->fxsave)); -#else - /* This, however, we can work around by forcing the compiler to select + needed for addressing (fix submitted to mainline 2005-11-21). + asm volatile("rex64/fxsave %0" + : "=m" (fpu->state->fxsave)); + This, however, we can work around by forcing the compiler to select an addressing mode that doesn't require extended registers. */ - __asm__ __volatile__("rex64/fxsave (%1)" - : "=m" (fpu->state->fxsave) - : "cdaSDb" (&fpu->state->fxsave)); + asm volatile("rex64/fxsave (%[fx])" + : "=m" (fpu->state->fxsave) + : [fx] "R" (&fpu->state->fxsave)); #endif } -static inline void fpu_save_init(struct fpu *fpu) -{ - if (use_xsave()) - fpu_xsave(fpu); - else - fpu_fxsave(fpu); - - fpu_clear(fpu); -} - -static inline void __save_init_fpu(struct task_struct *tsk) -{ - fpu_save_init(&tsk->thread.fpu); - task_thread_info(tsk)->status &= ~TS_USEDFPU; -} - #else /* CONFIG_X86_32 */ -#ifdef CONFIG_MATH_EMULATION -extern void finit_soft_fpu(struct i387_soft_struct *soft); -#else -static inline void finit_soft_fpu(struct i387_soft_struct *soft) {} -#endif - -static inline void tolerant_fwait(void) -{ - asm volatile("fnclex ; fwait"); -} - /* perform fxrstor iff the processor has extended states, otherwise frstor */ static inline int fxrstor_checking(struct i387_fxsave_struct *fx) { @@ -241,6 +180,14 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx) return 0; } +static inline void fpu_fxsave(struct fpu *fpu) +{ + asm volatile("fxsave %[fx]" + : [fx] "=m" (fpu->state->fxsave)); +} + +#endif /* CONFIG_X86_64 */ + /* We need a safe address that is cheap to find and that is already in L1 during context switch. The best choices are unfortunately different for UP and SMP */ @@ -256,47 +203,33 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx) static inline void fpu_save_init(struct fpu *fpu) { if (use_xsave()) { - struct xsave_struct *xstate = &fpu->state->xsave; - struct i387_fxsave_struct *fx = &fpu->state->fxsave; - fpu_xsave(fpu); /* * xsave header may indicate the init state of the FP. */ - if (!(xstate->xsave_hdr.xstate_bv & XSTATE_FP)) - goto end; - - if (unlikely(fx->swd & X87_FSW_ES)) - asm volatile("fnclex"); - - /* - * we can do a simple return here or be paranoid :) - */ - goto clear_state; + if (!(fpu->state->xsave.xsave_hdr.xstate_bv & XSTATE_FP)) + return; + } else if (use_fxsr()) { + fpu_fxsave(fpu); + } else { + asm volatile("fsave %[fx]; fwait" + : [fx] "=m" (fpu->state->fsave)); + return; } - /* Use more nops than strictly needed in case the compiler - varies code */ - alternative_input( - "fnsave %[fx] ;fwait;" GENERIC_NOP8 GENERIC_NOP4, - "fxsave %[fx]\n" - "bt $7,%[fsw] ; jnc 1f ; fnclex\n1:", - X86_FEATURE_FXSR, - [fx] "m" (fpu->state->fxsave), - [fsw] "m" (fpu->state->fxsave.swd) : "memory"); -clear_state: + if (unlikely(fpu->state->fxsave.swd & X87_FSW_ES)) + asm volatile("fnclex"); + /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception is pending. Clear the x87 state here by setting it to fixed values. safe_address is a random variable that should be in L1 */ alternative_input( - GENERIC_NOP8 GENERIC_NOP2, + ASM_NOP8 ASM_NOP2, "emms\n\t" /* clear stack tags */ - "fildl %[addr]", /* set F?P to defined value */ + "fildl %P[addr]", /* set F?P to defined value */ X86_FEATURE_FXSAVE_LEAK, [addr] "m" (safe_address)); -end: - ; } static inline void __save_init_fpu(struct task_struct *tsk) @@ -305,9 +238,6 @@ static inline void __save_init_fpu(struct task_struct *tsk) task_thread_info(tsk)->status &= ~TS_USEDFPU; } - -#endif /* CONFIG_X86_64 */ - static inline int fpu_fxrstor_checking(struct fpu *fpu) { return fxrstor_checking(&fpu->state->fxsave); @@ -344,7 +274,10 @@ static inline void __unlazy_fpu(struct task_struct *tsk) static inline void __clear_fpu(struct task_struct *tsk) { if (task_thread_info(tsk)->status & TS_USEDFPU) { - tolerant_fwait(); + /* Ignore delayed exceptions from user space */ + asm volatile("1: fwait\n" + "2:\n" + _ASM_EXTABLE(1b, 2b)); task_thread_info(tsk)->status &= ~TS_USEDFPU; stts(); } @@ -405,19 +338,6 @@ static inline void irq_ts_restore(int TS_state) stts(); } -#ifdef CONFIG_X86_64 - -static inline void save_init_fpu(struct task_struct *tsk) -{ - __save_init_fpu(tsk); - stts(); -} - -#define unlazy_fpu __unlazy_fpu -#define clear_fpu __clear_fpu - -#else /* CONFIG_X86_32 */ - /* * These disable preemption on their own and are safe */ @@ -443,8 +363,6 @@ static inline void clear_fpu(struct task_struct *tsk) preempt_enable(); } -#endif /* CONFIG_X86_64 */ - /* * i387 state interaction */ @@ -508,7 +426,4 @@ extern void fpu_finit(struct fpu *fpu); #endif /* __ASSEMBLY__ */ -#define PSHUFB_XMM5_XMM0 .byte 0x66, 0x0f, 0x38, 0x00, 0xc5 -#define PSHUFB_XMM5_XMM6 .byte 0x66, 0x0f, 0x38, 0x00, 0xf5 - #endif /* _ASM_X86_I387_H */ diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index e2ca3009255..6af0894dafb 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -114,9 +114,9 @@ #define X86_PLATFORM_IPI_VECTOR 0xed /* - * Performance monitoring pending work vector: + * IRQ work vector: */ -#define LOCAL_PENDING_VECTOR 0xec +#define IRQ_WORK_VECTOR 0xec #define UV_BAU_MESSAGE 0xea diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h new file mode 100644 index 00000000000..f52d42e8058 --- /dev/null +++ b/arch/x86/include/asm/jump_label.h @@ -0,0 +1,37 @@ +#ifndef _ASM_X86_JUMP_LABEL_H +#define _ASM_X86_JUMP_LABEL_H + +#ifdef __KERNEL__ + +#include <linux/types.h> +#include <asm/nops.h> + +#define JUMP_LABEL_NOP_SIZE 5 + +# define JUMP_LABEL_INITIAL_NOP ".byte 0xe9 \n\t .long 0\n\t" + +# define JUMP_LABEL(key, label) \ + do { \ + asm goto("1:" \ + JUMP_LABEL_INITIAL_NOP \ + ".pushsection __jump_table, \"a\" \n\t"\ + _ASM_PTR "1b, %l[" #label "], %c0 \n\t" \ + ".popsection \n\t" \ + : : "i" (key) : : label); \ + } while (0) + +#endif /* __KERNEL__ */ + +#ifdef CONFIG_X86_64 +typedef u64 jump_label_t; +#else +typedef u32 jump_label_t; +#endif + +struct jump_entry { + jump_label_t code; + jump_label_t target; + jump_label_t key; +}; + +#endif diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 502e53f999c..c52e2eb40a1 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -652,20 +652,6 @@ static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) return (struct kvm_mmu_page *)page_private(page); } -static inline u16 kvm_read_fs(void) -{ - u16 seg; - asm("mov %%fs, %0" : "=g"(seg)); - return seg; -} - -static inline u16 kvm_read_gs(void) -{ - u16 seg; - asm("mov %%gs, %0" : "=g"(seg)); - return seg; -} - static inline u16 kvm_read_ldt(void) { u16 ldt; @@ -673,16 +659,6 @@ static inline u16 kvm_read_ldt(void) return ldt; } -static inline void kvm_load_fs(u16 sel) -{ - asm("mov %0, %%fs" : : "rm"(sel)); -} - -static inline void kvm_load_gs(u16 sel) -{ - asm("mov %0, %%gs" : : "rm"(sel)); -} - static inline void kvm_load_ldt(u16 sel) { asm("lldt %0" : : "rm"(sel)); diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h index def500776b1..a70cd216be5 100644 --- a/arch/x86/include/asm/perf_event_p4.h +++ b/arch/x86/include/asm/perf_event_p4.h @@ -36,19 +36,6 @@ #define P4_ESCR_EMASK(v) ((v) << P4_ESCR_EVENTMASK_SHIFT) #define P4_ESCR_TAG(v) ((v) << P4_ESCR_TAG_SHIFT) -/* Non HT mask */ -#define P4_ESCR_MASK \ - (P4_ESCR_EVENT_MASK | \ - P4_ESCR_EVENTMASK_MASK | \ - P4_ESCR_TAG_MASK | \ - P4_ESCR_TAG_ENABLE | \ - P4_ESCR_T0_OS | \ - P4_ESCR_T0_USR) - -/* HT mask */ -#define P4_ESCR_MASK_HT \ - (P4_ESCR_MASK | P4_ESCR_T1_OS | P4_ESCR_T1_USR) - #define P4_CCCR_OVF 0x80000000U #define P4_CCCR_CASCADE 0x40000000U #define P4_CCCR_OVF_PMI_T0 0x04000000U @@ -70,23 +57,6 @@ #define P4_CCCR_THRESHOLD(v) ((v) << P4_CCCR_THRESHOLD_SHIFT) #define P4_CCCR_ESEL(v) ((v) << P4_CCCR_ESCR_SELECT_SHIFT) -/* Non HT mask */ -#define P4_CCCR_MASK \ - (P4_CCCR_OVF | \ - P4_CCCR_CASCADE | \ - P4_CCCR_OVF_PMI_T0 | \ - P4_CCCR_FORCE_OVF | \ - P4_CCCR_EDGE | \ - P4_CCCR_THRESHOLD_MASK | \ - P4_CCCR_COMPLEMENT | \ - P4_CCCR_COMPARE | \ - P4_CCCR_ESCR_SELECT_MASK | \ - P4_CCCR_ENABLE) - -/* HT mask */ -#define P4_CCCR_MASK_HT \ - (P4_CCCR_MASK | P4_CCCR_OVF_PMI_T1 | P4_CCCR_THREAD_ANY) - #define P4_GEN_ESCR_EMASK(class, name, bit) \ class##__##name = ((1 << bit) << P4_ESCR_EVENTMASK_SHIFT) #define P4_ESCR_EMASK_BIT(class, name) class##__##name @@ -127,6 +97,28 @@ #define P4_CONFIG_HT_SHIFT 63 #define P4_CONFIG_HT (1ULL << P4_CONFIG_HT_SHIFT) +/* + * The bits we allow to pass for RAW events + */ +#define P4_CONFIG_MASK_ESCR \ + P4_ESCR_EVENT_MASK | \ + P4_ESCR_EVENTMASK_MASK | \ + P4_ESCR_TAG_MASK | \ + P4_ESCR_TAG_ENABLE + +#define P4_CONFIG_MASK_CCCR \ + P4_CCCR_EDGE | \ + P4_CCCR_THRESHOLD_MASK | \ + P4_CCCR_COMPLEMENT | \ + P4_CCCR_COMPARE | \ + P4_CCCR_THREAD_ANY | \ + P4_CCCR_RESERVED + +/* some dangerous bits are reserved for kernel internals */ +#define P4_CONFIG_MASK \ + (p4_config_pack_escr(P4_CONFIG_MASK_ESCR)) | \ + (p4_config_pack_cccr(P4_CONFIG_MASK_CCCR)) + static inline bool p4_is_event_cascaded(u64 config) { u32 cccr = p4_config_unpack_cccr(config); diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index f358241e412..cae9c3cb95c 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -110,6 +110,8 @@ struct cpuinfo_x86 { u16 phys_proc_id; /* Core id: */ u16 cpu_core_id; + /* Compute unit id */ + u8 compute_unit_id; /* Index into per_cpu list: */ u16 cpu_index; #endif @@ -602,7 +604,7 @@ extern unsigned long mmu_cr4_features; static inline void set_in_cr4(unsigned long mask) { - unsigned cr4; + unsigned long cr4; mmu_cr4_features |= mask; cr4 = read_cr4(); @@ -612,7 +614,7 @@ static inline void set_in_cr4(unsigned long mask) static inline void clear_in_cr4(unsigned long mask) { - unsigned cr4; + unsigned long cr4; mmu_cr4_features &= ~mask; cr4 = read_cr4(); diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 0925676266b..f49257e3380 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -11,6 +11,8 @@ ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_tsc.o = -pg CFLAGS_REMOVE_rtc.o = -pg CFLAGS_REMOVE_paravirt-spinlocks.o = -pg +CFLAGS_REMOVE_pvclock.o = -pg +CFLAGS_REMOVE_kvmclock.o = -pg CFLAGS_REMOVE_ftrace.o = -pg CFLAGS_REMOVE_early_printk.o = -pg endif @@ -32,7 +34,8 @@ GCOV_PROFILE_paravirt.o := n obj-y := process_$(BITS).o signal.o entry_$(BITS).o obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time.o ioport.o ldt.o dumpstack.o -obj-y += setup.o x86_init.o i8259.o irqinit.o +obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o +obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-$(CONFIG_X86_VISWS) += visws_quirks.o obj-$(CONFIG_X86_32) += probe_roms_32.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o @@ -87,7 +90,7 @@ obj-$(CONFIG_EARLY_PRINTK) += early_printk.o obj-$(CONFIG_HPET_TIMER) += hpet.o obj-$(CONFIG_APB_TIMER) += apb_timer.o -obj-$(CONFIG_K8_NB) += k8.o +obj-$(CONFIG_AMD_NB) += amd_nb.o obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o @@ -120,7 +123,6 @@ obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o # 64 bit specific files ifeq ($(CONFIG_X86_64),y) obj-$(CONFIG_X86_UV) += tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o uv_time.o - obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o obj-$(CONFIG_AUDIT) += audit_64.o obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index bcc4adda7b9..5812404a0d4 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -62,7 +62,7 @@ struct cstate_entry { unsigned int ecx; } states[ACPI_PROCESSOR_MAX_POWER]; }; -static struct cstate_entry *cpu_cstate_entry; /* per CPU ptr */ +static struct cstate_entry __percpu *cpu_cstate_entry; /* per CPU ptr */ static short mwait_supported[ACPI_PROCESSOR_MAX_POWER]; diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index f65ab8b014c..a36bb90aef5 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -195,7 +195,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len) extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; extern s32 __smp_locks[], __smp_locks_end[]; -static void *text_poke_early(void *addr, const void *opcode, size_t len); +void *text_poke_early(void *addr, const void *opcode, size_t len); /* Replace instructions with better alternatives for this CPU type. This runs before SMP is initialized to avoid SMP problems with @@ -522,7 +522,7 @@ void __init alternative_instructions(void) * instructions. And on the local CPU you need to be protected again NMI or MCE * handlers seeing an inconsistent instruction while you patch. */ -static void *__init_or_module text_poke_early(void *addr, const void *opcode, +void *__init_or_module text_poke_early(void *addr, const void *opcode, size_t len) { unsigned long flags; @@ -637,7 +637,72 @@ void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len) tpp.len = len; atomic_set(&stop_machine_first, 1); wrote_text = 0; - stop_machine(stop_machine_text_poke, (void *)&tpp, NULL); + /* Use __stop_machine() because the caller already got online_cpus. */ + __stop_machine(stop_machine_text_poke, (void *)&tpp, NULL); return addr; } +#if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL) + +unsigned char ideal_nop5[IDEAL_NOP_SIZE_5]; + +void __init arch_init_ideal_nop5(void) +{ + extern const unsigned char ftrace_test_p6nop[]; + extern const unsigned char ftrace_test_nop5[]; + extern const unsigned char ftrace_test_jmp[]; + int faulted = 0; + + /* + * There is no good nop for all x86 archs. + * We will default to using the P6_NOP5, but first we + * will test to make sure that the nop will actually + * work on this CPU. If it faults, we will then + * go to a lesser efficient 5 byte nop. If that fails + * we then just use a jmp as our nop. This isn't the most + * efficient nop, but we can not use a multi part nop + * since we would then risk being preempted in the middle + * of that nop, and if we enabled tracing then, it might + * cause a system crash. + * + * TODO: check the cpuid to determine the best nop. + */ + asm volatile ( + "ftrace_test_jmp:" + "jmp ftrace_test_p6nop\n" + "nop\n" + "nop\n" + "nop\n" /* 2 byte jmp + 3 bytes */ + "ftrace_test_p6nop:" + P6_NOP5 + "jmp 1f\n" + "ftrace_test_nop5:" + ".byte 0x66,0x66,0x66,0x66,0x90\n" + "1:" + ".section .fixup, \"ax\"\n" + "2: movl $1, %0\n" + " jmp ftrace_test_nop5\n" + "3: movl $2, %0\n" + " jmp 1b\n" + ".previous\n" + _ASM_EXTABLE(ftrace_test_p6nop, 2b) + _ASM_EXTABLE(ftrace_test_nop5, 3b) + : "=r"(faulted) : "0" (faulted)); + + switch (faulted) { + case 0: + pr_info("converting mcount calls to 0f 1f 44 00 00\n"); + memcpy(ideal_nop5, ftrace_test_p6nop, IDEAL_NOP_SIZE_5); + break; + case 1: + pr_info("converting mcount calls to 66 66 66 66 90\n"); + memcpy(ideal_nop5, ftrace_test_nop5, IDEAL_NOP_SIZE_5); + break; + case 2: + pr_info("converting mcount calls to jmp . + 5\n"); + memcpy(ideal_nop5, ftrace_test_jmp, IDEAL_NOP_SIZE_5); + break; + } + +} +#endif diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index fa044e1e30a..d2fdb0826df 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2007-2009 Advanced Micro Devices, Inc. + * Copyright (C) 2007-2010 Advanced Micro Devices, Inc. * Author: Joerg Roedel <joerg.roedel@amd.com> * Leo Duran <leo.duran@amd.com> * @@ -1953,6 +1953,7 @@ static void __unmap_single(struct dma_ops_domain *dma_dom, size_t size, int dir) { + dma_addr_t flush_addr; dma_addr_t i, start; unsigned int pages; @@ -1960,6 +1961,7 @@ static void __unmap_single(struct dma_ops_domain *dma_dom, (dma_addr + size > dma_dom->aperture_size)) return; + flush_addr = dma_addr; pages = iommu_num_pages(dma_addr, size, PAGE_SIZE); dma_addr &= PAGE_MASK; start = dma_addr; @@ -1974,7 +1976,7 @@ static void __unmap_single(struct dma_ops_domain *dma_dom, dma_ops_free_addresses(dma_dom, dma_addr, pages); if (amd_iommu_unmap_flush || dma_dom->need_flush) { - iommu_flush_pages(&dma_dom->domain, dma_addr, size); + iommu_flush_pages(&dma_dom->domain, flush_addr, size); dma_dom->need_flush = false; } } diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 3cc63e2b8dd..3cb482e123d 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2007-2009 Advanced Micro Devices, Inc. + * Copyright (C) 2007-2010 Advanced Micro Devices, Inc. * Author: Joerg Roedel <joerg.roedel@amd.com> * Leo Duran <leo.duran@amd.com> * @@ -194,6 +194,39 @@ static inline unsigned long tbl_size(int entry_size) return 1UL << shift; } +/* Access to l1 and l2 indexed register spaces */ + +static u32 iommu_read_l1(struct amd_iommu *iommu, u16 l1, u8 address) +{ + u32 val; + + pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16)); + pci_read_config_dword(iommu->dev, 0xfc, &val); + return val; +} + +static void iommu_write_l1(struct amd_iommu *iommu, u16 l1, u8 address, u32 val) +{ + pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16 | 1 << 31)); + pci_write_config_dword(iommu->dev, 0xfc, val); + pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16)); +} + +static u32 iommu_read_l2(struct amd_iommu *iommu, u8 address) +{ + u32 val; + + pci_write_config_dword(iommu->dev, 0xf0, address); + pci_read_config_dword(iommu->dev, 0xf4, &val); + return val; +} + +static void iommu_write_l2(struct amd_iommu *iommu, u8 address, u32 val) +{ + pci_write_config_dword(iommu->dev, 0xf0, (address | 1 << 8)); + pci_write_config_dword(iommu->dev, 0xf4, val); +} + /**************************************************************************** * * AMD IOMMU MMIO register space handling functions @@ -619,6 +652,7 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu) { int cap_ptr = iommu->cap_ptr; u32 range, misc; + int i, j; pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET, &iommu->cap); @@ -632,6 +666,30 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu) iommu->last_device = calc_devid(MMIO_GET_BUS(range), MMIO_GET_LD(range)); iommu->evt_msi_num = MMIO_MSI_NUM(misc); + + if (!is_rd890_iommu(iommu->dev)) + return; + + /* + * Some rd890 systems may not be fully reconfigured by the BIOS, so + * it's necessary for us to store this information so it can be + * reprogrammed on resume + */ + + pci_read_config_dword(iommu->dev, iommu->cap_ptr + 4, + &iommu->stored_addr_lo); + pci_read_config_dword(iommu->dev, iommu->cap_ptr + 8, + &iommu->stored_addr_hi); + + /* Low bit locks writes to configuration space */ + iommu->stored_addr_lo &= ~1; + + for (i = 0; i < 6; i++) + for (j = 0; j < 0x12; j++) + iommu->stored_l1[i][j] = iommu_read_l1(iommu, i, j); + + for (i = 0; i < 0x83; i++) + iommu->stored_l2[i] = iommu_read_l2(iommu, i); } /* @@ -649,29 +707,9 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, struct ivhd_entry *e; /* - * First set the recommended feature enable bits from ACPI - * into the IOMMU control registers - */ - h->flags & IVHD_FLAG_HT_TUN_EN_MASK ? - iommu_feature_enable(iommu, CONTROL_HT_TUN_EN) : - iommu_feature_disable(iommu, CONTROL_HT_TUN_EN); - - h->flags & IVHD_FLAG_PASSPW_EN_MASK ? - iommu_feature_enable(iommu, CONTROL_PASSPW_EN) : - iommu_feature_disable(iommu, CONTROL_PASSPW_EN); - - h->flags & IVHD_FLAG_RESPASSPW_EN_MASK ? - iommu_feature_enable(iommu, CONTROL_RESPASSPW_EN) : - iommu_feature_disable(iommu, CONTROL_RESPASSPW_EN); - - h->flags & IVHD_FLAG_ISOC_EN_MASK ? - iommu_feature_enable(iommu, CONTROL_ISOC_EN) : - iommu_feature_disable(iommu, CONTROL_ISOC_EN); - - /* - * make IOMMU memory accesses cache coherent + * First save the recommended feature enable bits from ACPI */ - iommu_feature_enable(iommu, CONTROL_COHERENT_EN); + iommu->acpi_flags = h->flags; /* * Done. Now parse the device entries @@ -1116,6 +1154,79 @@ static void init_device_table(void) } } +static void iommu_init_flags(struct amd_iommu *iommu) +{ + iommu->acpi_flags & IVHD_FLAG_HT_TUN_EN_MASK ? + iommu_feature_enable(iommu, CONTROL_HT_TUN_EN) : + iommu_feature_disable(iommu, CONTROL_HT_TUN_EN); + + iommu->acpi_flags & IVHD_FLAG_PASSPW_EN_MASK ? + iommu_feature_enable(iommu, CONTROL_PASSPW_EN) : + iommu_feature_disable(iommu, CONTROL_PASSPW_EN); + + iommu->acpi_flags & IVHD_FLAG_RESPASSPW_EN_MASK ? + iommu_feature_enable(iommu, CONTROL_RESPASSPW_EN) : + iommu_feature_disable(iommu, CONTROL_RESPASSPW_EN); + + iommu->acpi_flags & IVHD_FLAG_ISOC_EN_MASK ? + iommu_feature_enable(iommu, CONTROL_ISOC_EN) : + iommu_feature_disable(iommu, CONTROL_ISOC_EN); + + /* + * make IOMMU memory accesses cache coherent + */ + iommu_feature_enable(iommu, CONTROL_COHERENT_EN); +} + +static void iommu_apply_resume_quirks(struct amd_iommu *iommu) +{ + int i, j; + u32 ioc_feature_control; + struct pci_dev *pdev = NULL; + + /* RD890 BIOSes may not have completely reconfigured the iommu */ + if (!is_rd890_iommu(iommu->dev)) + return; + + /* + * First, we need to ensure that the iommu is enabled. This is + * controlled by a register in the northbridge + */ + pdev = pci_get_bus_and_slot(iommu->dev->bus->number, PCI_DEVFN(0, 0)); + + if (!pdev) + return; + + /* Select Northbridge indirect register 0x75 and enable writing */ + pci_write_config_dword(pdev, 0x60, 0x75 | (1 << 7)); + pci_read_config_dword(pdev, 0x64, &ioc_feature_control); + + /* Enable the iommu */ + if (!(ioc_feature_control & 0x1)) + pci_write_config_dword(pdev, 0x64, ioc_feature_control | 1); + + pci_dev_put(pdev); + + /* Restore the iommu BAR */ + pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4, + iommu->stored_addr_lo); + pci_write_config_dword(iommu->dev, iommu->cap_ptr + 8, + iommu->stored_addr_hi); + + /* Restore the l1 indirect regs for each of the 6 l1s */ + for (i = 0; i < 6; i++) + for (j = 0; j < 0x12; j++) + iommu_write_l1(iommu, i, j, iommu->stored_l1[i][j]); + + /* Restore the l2 indirect regs */ + for (i = 0; i < 0x83; i++) + iommu_write_l2(iommu, i, iommu->stored_l2[i]); + + /* Lock PCI setup registers */ + pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4, + iommu->stored_addr_lo | 1); +} + /* * This function finally enables all IOMMUs found in the system after * they have been initialized @@ -1126,6 +1237,7 @@ static void enable_iommus(void) for_each_iommu(iommu) { iommu_disable(iommu); + iommu_init_flags(iommu); iommu_set_device_table(iommu); iommu_enable_command_buffer(iommu); iommu_enable_event_buffer(iommu); @@ -1150,6 +1262,11 @@ static void disable_iommus(void) static int amd_iommu_resume(struct sys_device *dev) { + struct amd_iommu *iommu; + + for_each_iommu(iommu) + iommu_apply_resume_quirks(iommu); + /* re-load the hardware */ enable_iommus(); diff --git a/arch/x86/kernel/k8.c b/arch/x86/kernel/amd_nb.c index 0f7bc20cfcd..8f6463d8ed0 100644 --- a/arch/x86/kernel/k8.c +++ b/arch/x86/kernel/amd_nb.c @@ -8,21 +8,19 @@ #include <linux/errno.h> #include <linux/module.h> #include <linux/spinlock.h> -#include <asm/k8.h> - -int num_k8_northbridges; -EXPORT_SYMBOL(num_k8_northbridges); +#include <asm/amd_nb.h> static u32 *flush_words; struct pci_device_id k8_nb_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_MISC) }, {} }; EXPORT_SYMBOL(k8_nb_ids); -struct pci_dev **k8_northbridges; +struct k8_northbridge_info k8_northbridges; EXPORT_SYMBOL(k8_northbridges); static struct pci_dev *next_k8_northbridge(struct pci_dev *dev) @@ -40,36 +38,45 @@ int cache_k8_northbridges(void) int i; struct pci_dev *dev; - if (num_k8_northbridges) + if (k8_northbridges.num) return 0; dev = NULL; while ((dev = next_k8_northbridge(dev)) != NULL) - num_k8_northbridges++; + k8_northbridges.num++; + + /* some CPU families (e.g. family 0x11) do not support GART */ + if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 || + boot_cpu_data.x86 == 0x15) + k8_northbridges.gart_supported = 1; - k8_northbridges = kmalloc((num_k8_northbridges + 1) * sizeof(void *), - GFP_KERNEL); - if (!k8_northbridges) + k8_northbridges.nb_misc = kmalloc((k8_northbridges.num + 1) * + sizeof(void *), GFP_KERNEL); + if (!k8_northbridges.nb_misc) return -ENOMEM; - if (!num_k8_northbridges) { - k8_northbridges[0] = NULL; + if (!k8_northbridges.num) { + k8_northbridges.nb_misc[0] = NULL; return 0; } - flush_words = kmalloc(num_k8_northbridges * sizeof(u32), GFP_KERNEL); - if (!flush_words) { - kfree(k8_northbridges); - return -ENOMEM; + if (k8_northbridges.gart_supported) { + flush_words = kmalloc(k8_northbridges.num * sizeof(u32), + GFP_KERNEL); + if (!flush_words) { + kfree(k8_northbridges.nb_misc); + return -ENOMEM; + } } dev = NULL; i = 0; while ((dev = next_k8_northbridge(dev)) != NULL) { - k8_northbridges[i] = dev; - pci_read_config_dword(dev, 0x9c, &flush_words[i++]); + k8_northbridges.nb_misc[i] = dev; + if (k8_northbridges.gart_supported) + pci_read_config_dword(dev, 0x9c, &flush_words[i++]); } - k8_northbridges[i] = NULL; + k8_northbridges.nb_misc[i] = NULL; return 0; } EXPORT_SYMBOL_GPL(cache_k8_northbridges); @@ -93,22 +100,25 @@ void k8_flush_garts(void) unsigned long flags; static DEFINE_SPINLOCK(gart_lock); + if (!k8_northbridges.gart_supported) + return; + /* Avoid races between AGP and IOMMU. In theory it's not needed but I'm not sure if the hardware won't lose flush requests when another is pending. This whole thing is so expensive anyways that it doesn't matter to serialize more. -AK */ spin_lock_irqsave(&gart_lock, flags); flushed = 0; - for (i = 0; i < num_k8_northbridges; i++) { - pci_write_config_dword(k8_northbridges[i], 0x9c, + for (i = 0; i < k8_northbridges.num; i++) { + pci_write_config_dword(k8_northbridges.nb_misc[i], 0x9c, flush_words[i]|1); flushed++; } - for (i = 0; i < num_k8_northbridges; i++) { + for (i = 0; i < k8_northbridges.num; i++) { u32 w; /* Make sure the hardware actually executed the flush*/ for (;;) { - pci_read_config_dword(k8_northbridges[i], + pci_read_config_dword(k8_northbridges.nb_misc[i], 0x9c, &w); if (!(w & 1)) break; diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index 8dd77800ff5..6fe2b5cb4f3 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -343,7 +343,7 @@ void apbt_setup_secondary_clock(void) /* Don't register boot CPU clockevent */ cpu = smp_processor_id(); - if (cpu == boot_cpu_id) + if (!cpu) return; /* * We need to calculate the scaled math multiplication factor for @@ -398,7 +398,7 @@ static int apbt_cpuhp_notify(struct notifier_block *n, } break; default: - pr_debug(KERN_INFO "APBT notified %lu, no action\n", action); + pr_debug("APBT notified %lu, no action\n", action); } return NOTIFY_OK; } @@ -552,7 +552,7 @@ bad_count: pr_debug("APB CS going back %lx:%lx:%lx ", t2, last_read, t2 - last_read); bad_count_x3: - pr_debug(KERN_INFO "tripple check enforced\n"); + pr_debug("triple check enforced\n"); t0 = apbt_readl(phy_cs_timer_id, APBTMR_N_CURRENT_VALUE); udelay(1); diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index a2e0caf26e1..377f5db3b8b 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -27,7 +27,7 @@ #include <asm/gart.h> #include <asm/pci-direct.h> #include <asm/dma.h> -#include <asm/k8.h> +#include <asm/amd_nb.h> #include <asm/x86_init.h> int gart_iommu_aperture; @@ -307,7 +307,7 @@ void __init early_gart_iommu_check(void) continue; ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL); - aper_enabled = ctl & AMD64_GARTEN; + aper_enabled = ctl & GARTEN; aper_order = (ctl >> 1) & 7; aper_size = (32 * 1024 * 1024) << aper_order; aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff; @@ -362,7 +362,7 @@ void __init early_gart_iommu_check(void) continue; ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL); - ctl &= ~AMD64_GARTEN; + ctl &= ~GARTEN; write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl); } } @@ -505,8 +505,13 @@ out: /* Fix up the north bridges */ for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { - int bus; - int dev_base, dev_limit; + int bus, dev_base, dev_limit; + + /* + * Don't enable translation yet but enable GART IO and CPU + * accesses and set DISTLBWALKPRB since GART table memory is UC. + */ + u32 ctl = DISTLBWALKPRB | aper_order << 1; bus = bus_dev_ranges[i].bus; dev_base = bus_dev_ranges[i].dev_base; @@ -515,10 +520,7 @@ out: if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) continue; - /* Don't enable translation yet. That is done later. - Assume this BIOS didn't initialise the GART so - just overwrite all previous bits */ - write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, aper_order << 1); + write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl); write_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE, aper_alloc >> 25); } } diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index f1efebaf551..f1e78940d90 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -162,7 +162,7 @@ int __init arch_early_irq_init(void) cfg = irq_cfgx; count = ARRAY_SIZE(irq_cfgx); - node= cpu_to_node(boot_cpu_id); + node = cpu_to_node(0); for (i = 0; i < count; i++) { desc = irq_to_desc(i); @@ -306,14 +306,19 @@ void arch_init_copy_chip_data(struct irq_desc *old_desc, old_cfg = old_desc->chip_data; - memcpy(cfg, old_cfg, sizeof(struct irq_cfg)); + cfg->vector = old_cfg->vector; + cfg->move_in_progress = old_cfg->move_in_progress; + cpumask_copy(cfg->domain, old_cfg->domain); + cpumask_copy(cfg->old_domain, old_cfg->old_domain); init_copy_irq_2_pin(old_cfg, cfg, node); } -static void free_irq_cfg(struct irq_cfg *old_cfg) +static void free_irq_cfg(struct irq_cfg *cfg) { - kfree(old_cfg); + free_cpumask_var(cfg->domain); + free_cpumask_var(cfg->old_domain); + kfree(cfg); } void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc) @@ -1483,7 +1488,7 @@ static void __init setup_IO_APIC_irqs(void) int notcon = 0; struct irq_desc *desc; struct irq_cfg *cfg; - int node = cpu_to_node(boot_cpu_id); + int node = cpu_to_node(0); apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); @@ -1548,7 +1553,7 @@ static void __init setup_IO_APIC_irqs(void) void setup_IO_APIC_irq_extra(u32 gsi) { int apic_id = 0, pin, idx, irq; - int node = cpu_to_node(boot_cpu_id); + int node = cpu_to_node(0); struct irq_desc *desc; struct irq_cfg *cfg; @@ -2927,7 +2932,7 @@ static inline void __init check_timer(void) { struct irq_desc *desc = irq_to_desc(0); struct irq_cfg *cfg = desc->chip_data; - int node = cpu_to_node(boot_cpu_id); + int node = cpu_to_node(0); int apic1, pin1, apic2, pin2; unsigned long flags; int no_pin1 = 0; @@ -3281,7 +3286,7 @@ unsigned int create_irq_nr(unsigned int irq_want, int node) int create_irq(void) { - int node = cpu_to_node(boot_cpu_id); + int node = cpu_to_node(0); unsigned int irq_want; int irq; @@ -3903,7 +3908,7 @@ static int __io_apic_set_pci_routing(struct device *dev, int irq, if (dev) node = dev_to_node(dev); else - node = cpu_to_node(boot_cpu_id); + node = cpu_to_node(0); desc = irq_to_desc_alloc_node(irq, node); if (!desc) { diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 7b598b84c90..f744f54cb24 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -698,9 +698,11 @@ void __init uv_system_init(void) for (j = 0; j < 64; j++) { if (!test_bit(j, &present)) continue; - uv_blade_info[blade].pnode = (i * 64 + j); + pnode = (i * 64 + j); + uv_blade_info[blade].pnode = pnode; uv_blade_info[blade].nr_possible_cpus = 0; uv_blade_info[blade].nr_online_cpus = 0; + max_pnode = max(pnode, max_pnode); blade++; } } @@ -738,7 +740,6 @@ void __init uv_system_init(void) uv_cpu_hub_info(cpu)->scir.offset = uv_scir_offset(apicid); uv_node_to_blade[nid] = blade; uv_cpu_to_blade[cpu] = blade; - max_pnode = max(pnode, max_pnode); } /* Add blade/pnode info for nodes without cpus */ @@ -750,7 +751,6 @@ void __init uv_system_init(void) pnode = (paddr >> m_val) & pnode_mask; blade = boot_pnode_to_blade(pnode); uv_node_to_blade[nid] = blade; - max_pnode = max(pnode, max_pnode); } map_gru_high(max_pnode); diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index ba5f62f45f0..9e093f8fe78 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -148,7 +148,7 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP /* calling is from identify_secondary_cpu() ? */ - if (c->cpu_index == boot_cpu_id) + if (!c->cpu_index) return; /* @@ -253,37 +253,51 @@ static int __cpuinit nearby_node(int apicid) #endif /* - * Fixup core topology information for AMD multi-node processors. - * Assumption: Number of cores in each internal node is the same. + * Fixup core topology information for + * (1) AMD multi-node processors + * Assumption: Number of cores in each internal node is the same. + * (2) AMD processors supporting compute units */ #ifdef CONFIG_X86_HT -static void __cpuinit amd_fixup_dcm(struct cpuinfo_x86 *c) +static void __cpuinit amd_get_topology(struct cpuinfo_x86 *c) { - unsigned long long value; - u32 nodes, cores_per_node; + u32 nodes; + u8 node_id; int cpu = smp_processor_id(); - if (!cpu_has(c, X86_FEATURE_NODEID_MSR)) - return; + /* get information required for multi-node processors */ + if (cpu_has(c, X86_FEATURE_TOPOEXT)) { + u32 eax, ebx, ecx, edx; - /* fixup topology information only once for a core */ - if (cpu_has(c, X86_FEATURE_AMD_DCM)) - return; + cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); + nodes = ((ecx >> 8) & 7) + 1; + node_id = ecx & 7; - rdmsrl(MSR_FAM10H_NODE_ID, value); + /* get compute unit information */ + smp_num_siblings = ((ebx >> 8) & 3) + 1; + c->compute_unit_id = ebx & 0xff; + } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) { + u64 value; - nodes = ((value >> 3) & 7) + 1; - if (nodes == 1) + rdmsrl(MSR_FAM10H_NODE_ID, value); + nodes = ((value >> 3) & 7) + 1; + node_id = value & 7; + } else return; - set_cpu_cap(c, X86_FEATURE_AMD_DCM); - cores_per_node = c->x86_max_cores / nodes; + /* fixup multi-node processor information */ + if (nodes > 1) { + u32 cores_per_node; + + set_cpu_cap(c, X86_FEATURE_AMD_DCM); + cores_per_node = c->x86_max_cores / nodes; - /* store NodeID, use llc_shared_map to store sibling info */ - per_cpu(cpu_llc_id, cpu) = value & 7; + /* store NodeID, use llc_shared_map to store sibling info */ + per_cpu(cpu_llc_id, cpu) = node_id; - /* fixup core id to be in range from 0 to (cores_per_node - 1) */ - c->cpu_core_id = c->cpu_core_id % cores_per_node; + /* core id to be in range from 0 to (cores_per_node - 1) */ + c->cpu_core_id = c->cpu_core_id % cores_per_node; + } } #endif @@ -304,9 +318,7 @@ static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c) c->phys_proc_id = c->initial_apicid >> bits; /* use socket ID also for last level cache */ per_cpu(cpu_llc_id, cpu) = c->phys_proc_id; - /* fixup topology information on multi-node processors */ - if ((c->x86 == 0x10) && (c->x86_model == 9)) - amd_fixup_dcm(c); + amd_get_topology(c); #endif } @@ -412,6 +424,23 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_EXTD_APICID); } #endif + + /* We need to do the following only once */ + if (c != &boot_cpu_data) + return; + + if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) { + + if (c->x86 > 0x10 || + (c->x86 == 0x10 && c->x86_model >= 0x2)) { + u64 val; + + rdmsrl(MSR_K7_HWCR, val); + if (!(val & BIT(24))) + printk(KERN_WARNING FW_BUG "TSC doesn't count " + "with P0 frequency!\n"); + } + } } static void __cpuinit init_amd(struct cpuinfo_x86 *c) @@ -523,7 +552,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) #endif if (c->extended_cpuid_level >= 0x80000006) { - if ((c->x86 >= 0x0f) && (cpuid_edx(0x80000006) & 0xf000)) + if (cpuid_edx(0x80000006) & 0xf000) num_cache_leaves = 4; else num_cache_leaves = 3; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 490dac63c2d..4b68bda3093 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -545,7 +545,7 @@ void __cpuinit cpu_detect(struct cpuinfo_x86 *c) } } -static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) +void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) { u32 tfms, xlvl; u32 ebx; @@ -665,7 +665,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) this_cpu->c_early_init(c); #ifdef CONFIG_SMP - c->cpu_index = boot_cpu_id; + c->cpu_index = 0; #endif filter_cpuid_features(c, false); } @@ -704,16 +704,21 @@ void __init early_cpu_init(void) } /* - * The NOPL instruction is supposed to exist on all CPUs with - * family >= 6; unfortunately, that's not true in practice because - * of early VIA chips and (more importantly) broken virtualizers that - * are not easy to detect. In the latter case it doesn't even *fail* - * reliably, so probing for it doesn't even work. Disable it completely + * The NOPL instruction is supposed to exist on all CPUs of family >= 6; + * unfortunately, that's not true in practice because of early VIA + * chips and (more importantly) broken virtualizers that are not easy + * to detect. In the latter case it doesn't even *fail* reliably, so + * probing for it doesn't even work. Disable it completely on 32-bit * unless we can find a reliable way to detect all the broken cases. + * Enable it explicitly on 64-bit for non-constant inputs of cpu_has(). */ static void __cpuinit detect_nopl(struct cpuinfo_x86 *c) { +#ifdef CONFIG_X86_32 clear_cpu_cap(c, X86_FEATURE_NOPL); +#else + set_cpu_cap(c, X86_FEATURE_NOPL); +#endif } static void __cpuinit generic_identify(struct cpuinfo_x86 *c) @@ -1264,13 +1269,6 @@ void __cpuinit cpu_init(void) clear_all_debug_regs(); dbg_restore_debug_regs(); - /* - * Force FPU initialization: - */ - current_thread_info()->status = 0; - clear_used_math(); - mxcsr_feature_mask_init(); - fpu_init(); xsave_init(); } diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 3624e8a0f71..e765633f210 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -32,6 +32,8 @@ struct cpu_dev { extern const struct cpu_dev *const __x86_cpu_dev_start[], *const __x86_cpu_dev_end[]; +extern void get_cpu_cap(struct cpuinfo_x86 *c); extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c); +extern void get_cpu_cap(struct cpuinfo_x86 *c); #endif diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c index 994230d4dc4..4f6f679f279 100644 --- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c @@ -368,16 +368,22 @@ static int __init pcc_cpufreq_do_osc(acpi_handle *handle) return -ENODEV; out_obj = output.pointer; - if (out_obj->type != ACPI_TYPE_BUFFER) - return -ENODEV; + if (out_obj->type != ACPI_TYPE_BUFFER) { + ret = -ENODEV; + goto out_free; + } errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0); - if (errors) - return -ENODEV; + if (errors) { + ret = -ENODEV; + goto out_free; + } supported = *((u32 *)(out_obj->buffer.pointer + 4)); - if (!(supported & 0x1)) - return -ENODEV; + if (!(supported & 0x1)) { + ret = -ENODEV; + goto out_free; + } out_free: kfree(output.pointer); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 85f69cdeae1..695f17731e2 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -39,6 +39,7 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) misc_enable &= ~MSR_IA32_MISC_ENABLE_LIMIT_CPUID; wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable); c->cpuid_level = cpuid_eax(0); + get_cpu_cap(c); } } @@ -169,7 +170,7 @@ static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP /* calling is from identify_secondary_cpu() ? */ - if (c->cpu_index == boot_cpu_id) + if (!c->cpu_index) return; /* diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 898c2f4eab8..12cd823c8d0 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -17,7 +17,7 @@ #include <asm/processor.h> #include <linux/smp.h> -#include <asm/k8.h> +#include <asm/amd_nb.h> #include <asm/smp.h> #define LVL_1_INST 1 @@ -306,7 +306,7 @@ struct _cache_attr { ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count); }; -#ifdef CONFIG_CPU_SUP_AMD +#ifdef CONFIG_AMD_NB /* * L3 cache descriptors @@ -369,7 +369,7 @@ static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, return; /* not in virtualized environments */ - if (num_k8_northbridges == 0) + if (k8_northbridges.num == 0) return; /* @@ -377,7 +377,7 @@ static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, * never freed but this is done only on shutdown so it doesn't matter. */ if (!l3_caches) { - int size = num_k8_northbridges * sizeof(struct amd_l3_cache *); + int size = k8_northbridges.num * sizeof(struct amd_l3_cache *); l3_caches = kzalloc(size, GFP_ATOMIC); if (!l3_caches) @@ -556,12 +556,12 @@ static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644, static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, show_cache_disable_1, store_cache_disable_1); -#else /* CONFIG_CPU_SUP_AMD */ +#else /* CONFIG_AMD_NB */ static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, int index) { }; -#endif /* CONFIG_CPU_SUP_AMD */ +#endif /* CONFIG_AMD_NB */ static int __cpuinit cpuid4_cache_lookup_regs(int index, @@ -1000,7 +1000,7 @@ static struct attribute *default_attrs[] = { static struct attribute *default_l3_attrs[] = { DEFAULT_SYSFS_CACHE_ATTRS, -#ifdef CONFIG_CPU_SUP_AMD +#ifdef CONFIG_AMD_NB &cache_disable_0.attr, &cache_disable_1.attr, #endif diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 5e975298fa8..39aaee5c1ab 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -141,6 +141,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) address = (low & MASK_BLKPTR_LO) >> 21; if (!address) break; + address += MCG_XBLK_ADDR; } else ++address; @@ -148,12 +149,8 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) if (rdmsr_safe(address, &low, &high)) break; - if (!(high & MASK_VALID_HI)) { - if (block) - continue; - else - break; - } + if (!(high & MASK_VALID_HI)) + continue; if (!(high & MASK_CNTP_HI) || (high & MASK_LOCKED_HI)) diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index d9368eeda30..4b683267eca 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -216,7 +216,7 @@ static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev, err = sysfs_add_file_to_group(&sys_dev->kobj, &attr_core_power_limit_count.attr, thermal_attr_group.name); - if (cpu_has(c, X86_FEATURE_PTS)) + if (cpu_has(c, X86_FEATURE_PTS)) { err = sysfs_add_file_to_group(&sys_dev->kobj, &attr_package_throttle_count.attr, thermal_attr_group.name); @@ -224,6 +224,7 @@ static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev, err = sysfs_add_file_to_group(&sys_dev->kobj, &attr_package_power_limit_count.attr, thermal_attr_group.name); + } return err; } @@ -349,7 +350,7 @@ static void intel_thermal_interrupt(void) static void unexpected_thermal_interrupt(void) { - printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n", + printk(KERN_ERR "CPU%d: Unexpected LVT thermal interrupt!\n", smp_processor_id()); add_taint(TAINT_MACHINE_CHECK); } diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index c5f59d07142..ac140c7be39 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -827,7 +827,7 @@ int __init amd_special_default_mtrr(void) if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) return 0; - if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11) + if (boot_cpu_data.x86 < 0xf) return 0; /* In case some hypervisor doesn't pass SYSCFG through: */ if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 3efdf2870a3..fe73c1844a9 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -102,6 +102,7 @@ struct cpu_hw_events { */ struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */ unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + unsigned long running[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; int enabled; int n_events; @@ -530,7 +531,7 @@ static int x86_pmu_hw_config(struct perf_event *event) /* * Setup the hardware configuration for a given attr_type */ -static int __hw_perf_event_init(struct perf_event *event) +static int __x86_pmu_event_init(struct perf_event *event) { int err; @@ -583,7 +584,7 @@ static void x86_pmu_disable_all(void) } } -void hw_perf_disable(void) +static void x86_pmu_disable(struct pmu *pmu) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); @@ -618,7 +619,7 @@ static void x86_pmu_enable_all(int added) } } -static const struct pmu pmu; +static struct pmu pmu; static inline int is_x86_event(struct perf_event *event) { @@ -800,10 +801,10 @@ static inline int match_prev_assignment(struct hw_perf_event *hwc, hwc->last_tag == cpuc->tags[i]; } -static int x86_pmu_start(struct perf_event *event); -static void x86_pmu_stop(struct perf_event *event); +static void x86_pmu_start(struct perf_event *event, int flags); +static void x86_pmu_stop(struct perf_event *event, int flags); -void hw_perf_enable(void) +static void x86_pmu_enable(struct pmu *pmu) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct perf_event *event; @@ -839,7 +840,14 @@ void hw_perf_enable(void) match_prev_assignment(hwc, cpuc, i)) continue; - x86_pmu_stop(event); + /* + * Ensure we don't accidentally enable a stopped + * counter simply because we rescheduled. + */ + if (hwc->state & PERF_HES_STOPPED) + hwc->state |= PERF_HES_ARCH; + + x86_pmu_stop(event, PERF_EF_UPDATE); } for (i = 0; i < cpuc->n_events; i++) { @@ -851,7 +859,10 @@ void hw_perf_enable(void) else if (i < n_running) continue; - x86_pmu_start(event); + if (hwc->state & PERF_HES_ARCH) + continue; + + x86_pmu_start(event, PERF_EF_RELOAD); } cpuc->n_added = 0; perf_events_lapic_init(); @@ -952,15 +963,12 @@ static void x86_pmu_enable_event(struct perf_event *event) } /* - * activate a single event + * Add a single event to the PMU. * * The event is added to the group of enabled events * but only if it can be scehduled with existing events. - * - * Called with PMU disabled. If successful and return value 1, - * then guaranteed to call perf_enable() and hw_perf_enable() */ -static int x86_pmu_enable(struct perf_event *event) +static int x86_pmu_add(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct hw_perf_event *hwc; @@ -969,57 +977,67 @@ static int x86_pmu_enable(struct perf_event *event) hwc = &event->hw; + perf_pmu_disable(event->pmu); n0 = cpuc->n_events; - n = collect_events(cpuc, event, false); - if (n < 0) - return n; + ret = n = collect_events(cpuc, event, false); + if (ret < 0) + goto out; + + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + if (!(flags & PERF_EF_START)) + hwc->state |= PERF_HES_ARCH; /* * If group events scheduling transaction was started, * skip the schedulability test here, it will be peformed - * at commit time(->commit_txn) as a whole + * at commit time (->commit_txn) as a whole */ if (cpuc->group_flag & PERF_EVENT_TXN) - goto out; + goto done_collect; ret = x86_pmu.schedule_events(cpuc, n, assign); if (ret) - return ret; + goto out; /* * copy new assignment, now we know it is possible * will be used by hw_perf_enable() */ memcpy(cpuc->assign, assign, n*sizeof(int)); -out: +done_collect: cpuc->n_events = n; cpuc->n_added += n - n0; cpuc->n_txn += n - n0; - return 0; + ret = 0; +out: + perf_pmu_enable(event->pmu); + return ret; } -static int x86_pmu_start(struct perf_event *event) +static void x86_pmu_start(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); int idx = event->hw.idx; - if (idx == -1) - return -EAGAIN; + if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) + return; + + if (WARN_ON_ONCE(idx == -1)) + return; + + if (flags & PERF_EF_RELOAD) { + WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE)); + x86_perf_event_set_period(event); + } + + event->hw.state = 0; - x86_perf_event_set_period(event); cpuc->events[idx] = event; __set_bit(idx, cpuc->active_mask); + __set_bit(idx, cpuc->running); x86_pmu.enable(event); perf_event_update_userpage(event); - - return 0; -} - -static void x86_pmu_unthrottle(struct perf_event *event) -{ - int ret = x86_pmu_start(event); - WARN_ON_ONCE(ret); } void perf_event_print_debug(void) @@ -1076,27 +1094,29 @@ void perf_event_print_debug(void) local_irq_restore(flags); } -static void x86_pmu_stop(struct perf_event *event) +static void x86_pmu_stop(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct hw_perf_event *hwc = &event->hw; - int idx = hwc->idx; - if (!__test_and_clear_bit(idx, cpuc->active_mask)) - return; - - x86_pmu.disable(event); - - /* - * Drain the remaining delta count out of a event - * that we are disabling: - */ - x86_perf_event_update(event); + if (__test_and_clear_bit(hwc->idx, cpuc->active_mask)) { + x86_pmu.disable(event); + cpuc->events[hwc->idx] = NULL; + WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); + hwc->state |= PERF_HES_STOPPED; + } - cpuc->events[idx] = NULL; + if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { + /* + * Drain the remaining delta count out of a event + * that we are disabling: + */ + x86_perf_event_update(event); + hwc->state |= PERF_HES_UPTODATE; + } } -static void x86_pmu_disable(struct perf_event *event) +static void x86_pmu_del(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); int i; @@ -1109,7 +1129,7 @@ static void x86_pmu_disable(struct perf_event *event) if (cpuc->group_flag & PERF_EVENT_TXN) return; - x86_pmu_stop(event); + x86_pmu_stop(event, PERF_EF_UPDATE); for (i = 0; i < cpuc->n_events; i++) { if (event == cpuc->event_list[i]) { @@ -1132,7 +1152,6 @@ static int x86_pmu_handle_irq(struct pt_regs *regs) struct perf_sample_data data; struct cpu_hw_events *cpuc; struct perf_event *event; - struct hw_perf_event *hwc; int idx, handled = 0; u64 val; @@ -1141,11 +1160,18 @@ static int x86_pmu_handle_irq(struct pt_regs *regs) cpuc = &__get_cpu_var(cpu_hw_events); for (idx = 0; idx < x86_pmu.num_counters; idx++) { - if (!test_bit(idx, cpuc->active_mask)) + if (!test_bit(idx, cpuc->active_mask)) { + /* + * Though we deactivated the counter some cpus + * might still deliver spurious interrupts still + * in flight. Catch them: + */ + if (__test_and_clear_bit(idx, cpuc->running)) + handled++; continue; + } event = cpuc->events[idx]; - hwc = &event->hw; val = x86_perf_event_update(event); if (val & (1ULL << (x86_pmu.cntval_bits - 1))) @@ -1161,7 +1187,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs) continue; if (perf_event_overflow(event, 1, &data, regs)) - x86_pmu_stop(event); + x86_pmu_stop(event, 0); } if (handled) @@ -1170,25 +1196,6 @@ static int x86_pmu_handle_irq(struct pt_regs *regs) return handled; } -void smp_perf_pending_interrupt(struct pt_regs *regs) -{ - irq_enter(); - ack_APIC_irq(); - inc_irq_stat(apic_pending_irqs); - perf_event_do_pending(); - irq_exit(); -} - -void set_perf_event_pending(void) -{ -#ifdef CONFIG_X86_LOCAL_APIC - if (!x86_pmu.apic || !x86_pmu_initialized()) - return; - - apic->send_IPI_self(LOCAL_PENDING_VECTOR); -#endif -} - void perf_events_lapic_init(void) { if (!x86_pmu.apic || !x86_pmu_initialized()) @@ -1378,7 +1385,6 @@ void __init init_hw_perf_events(void) x86_pmu.num_counters = X86_PMC_MAX_GENERIC; } x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1; - perf_max_events = x86_pmu.num_counters; if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) { WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", @@ -1414,6 +1420,7 @@ void __init init_hw_perf_events(void) pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed); pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl); + perf_pmu_register(&pmu); perf_cpu_notifier(x86_pmu_notifier); } @@ -1427,10 +1434,11 @@ static inline void x86_pmu_read(struct perf_event *event) * Set the flag to make pmu::enable() not perform the * schedulability test, it will be performed at commit time */ -static void x86_pmu_start_txn(const struct pmu *pmu) +static void x86_pmu_start_txn(struct pmu *pmu) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + perf_pmu_disable(pmu); cpuc->group_flag |= PERF_EVENT_TXN; cpuc->n_txn = 0; } @@ -1440,7 +1448,7 @@ static void x86_pmu_start_txn(const struct pmu *pmu) * Clear the flag and pmu::enable() will perform the * schedulability test. */ -static void x86_pmu_cancel_txn(const struct pmu *pmu) +static void x86_pmu_cancel_txn(struct pmu *pmu) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); @@ -1450,6 +1458,7 @@ static void x86_pmu_cancel_txn(const struct pmu *pmu) */ cpuc->n_added -= cpuc->n_txn; cpuc->n_events -= cpuc->n_txn; + perf_pmu_enable(pmu); } /* @@ -1457,7 +1466,7 @@ static void x86_pmu_cancel_txn(const struct pmu *pmu) * Perform the group schedulability test as a whole * Return 0 if success */ -static int x86_pmu_commit_txn(const struct pmu *pmu) +static int x86_pmu_commit_txn(struct pmu *pmu) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); int assign[X86_PMC_IDX_MAX]; @@ -1479,22 +1488,10 @@ static int x86_pmu_commit_txn(const struct pmu *pmu) memcpy(cpuc->assign, assign, n*sizeof(int)); cpuc->group_flag &= ~PERF_EVENT_TXN; - + perf_pmu_enable(pmu); return 0; } -static const struct pmu pmu = { - .enable = x86_pmu_enable, - .disable = x86_pmu_disable, - .start = x86_pmu_start, - .stop = x86_pmu_stop, - .read = x86_pmu_read, - .unthrottle = x86_pmu_unthrottle, - .start_txn = x86_pmu_start_txn, - .cancel_txn = x86_pmu_cancel_txn, - .commit_txn = x86_pmu_commit_txn, -}; - /* * validate that we can schedule this event */ @@ -1569,12 +1566,22 @@ out: return ret; } -const struct pmu *hw_perf_event_init(struct perf_event *event) +int x86_pmu_event_init(struct perf_event *event) { - const struct pmu *tmp; + struct pmu *tmp; int err; - err = __hw_perf_event_init(event); + switch (event->attr.type) { + case PERF_TYPE_RAW: + case PERF_TYPE_HARDWARE: + case PERF_TYPE_HW_CACHE: + break; + + default: + return -ENOENT; + } + + err = __x86_pmu_event_init(event); if (!err) { /* * we temporarily connect event to its pmu @@ -1594,26 +1601,31 @@ const struct pmu *hw_perf_event_init(struct perf_event *event) if (err) { if (event->destroy) event->destroy(event); - return ERR_PTR(err); } - return &pmu; + return err; } -/* - * callchain support - */ +static struct pmu pmu = { + .pmu_enable = x86_pmu_enable, + .pmu_disable = x86_pmu_disable, -static inline -void callchain_store(struct perf_callchain_entry *entry, u64 ip) -{ - if (entry->nr < PERF_MAX_STACK_DEPTH) - entry->ip[entry->nr++] = ip; -} + .event_init = x86_pmu_event_init, -static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry); -static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry); + .add = x86_pmu_add, + .del = x86_pmu_del, + .start = x86_pmu_start, + .stop = x86_pmu_stop, + .read = x86_pmu_read, + .start_txn = x86_pmu_start_txn, + .cancel_txn = x86_pmu_cancel_txn, + .commit_txn = x86_pmu_commit_txn, +}; + +/* + * callchain support + */ static void backtrace_warning_symbol(void *data, char *msg, unsigned long symbol) @@ -1635,7 +1647,7 @@ static void backtrace_address(void *data, unsigned long addr, int reliable) { struct perf_callchain_entry *entry = data; - callchain_store(entry, addr); + perf_callchain_store(entry, addr); } static const struct stacktrace_ops backtrace_ops = { @@ -1646,11 +1658,15 @@ static const struct stacktrace_ops backtrace_ops = { .walk_stack = print_context_stack_bp, }; -static void -perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) +void +perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) { - callchain_store(entry, PERF_CONTEXT_KERNEL); - callchain_store(entry, regs->ip); + if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { + /* TODO: We don't support guest os callchain now */ + return; + } + + perf_callchain_store(entry, regs->ip); dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry); } @@ -1679,7 +1695,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) if (fp < compat_ptr(regs->sp)) break; - callchain_store(entry, frame.return_address); + perf_callchain_store(entry, frame.return_address); fp = compat_ptr(frame.next_frame); } return 1; @@ -1692,19 +1708,20 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) } #endif -static void -perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) +void +perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) { struct stack_frame frame; const void __user *fp; - if (!user_mode(regs)) - regs = task_pt_regs(current); + if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { + /* TODO: We don't support guest os callchain now */ + return; + } fp = (void __user *)regs->bp; - callchain_store(entry, PERF_CONTEXT_USER); - callchain_store(entry, regs->ip); + perf_callchain_store(entry, regs->ip); if (perf_callchain_user32(regs, entry)) return; @@ -1721,52 +1738,11 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) if ((unsigned long)fp < regs->sp) break; - callchain_store(entry, frame.return_address); + perf_callchain_store(entry, frame.return_address); fp = frame.next_frame; } } -static void -perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry) -{ - int is_user; - - if (!regs) - return; - - is_user = user_mode(regs); - - if (is_user && current->state != TASK_RUNNING) - return; - - if (!is_user) - perf_callchain_kernel(regs, entry); - - if (current->mm) - perf_callchain_user(regs, entry); -} - -struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) -{ - struct perf_callchain_entry *entry; - - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { - /* TODO: We don't support guest os callchain now */ - return NULL; - } - - if (in_nmi()) - entry = &__get_cpu_var(pmc_nmi_entry); - else - entry = &__get_cpu_var(pmc_irq_entry); - - entry->nr = 0; - - perf_do_callchain(regs, entry); - - return entry; -} - unsigned long perf_instruction_pointer(struct pt_regs *regs) { unsigned long ip; diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index c2897b7b4a3..46d58448c3a 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -52,7 +52,7 @@ static __initconst const u64 amd_hw_cache_event_ids [ C(DTLB) ] = { [ C(OP_READ) ] = { [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ - [ C(RESULT_MISS) ] = 0x0046, /* L1 DTLB and L2 DLTB Miss */ + [ C(RESULT_MISS) ] = 0x0746, /* L1_DTLB_AND_L2_DLTB_MISS.ALL */ }, [ C(OP_WRITE) ] = { [ C(RESULT_ACCESS) ] = 0, @@ -66,7 +66,7 @@ static __initconst const u64 amd_hw_cache_event_ids [ C(ITLB) ] = { [ C(OP_READ) ] = { [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes */ - [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */ + [ C(RESULT_MISS) ] = 0x0385, /* L1_ITLB_AND_L2_ITLB_MISS.ALL */ }, [ C(OP_WRITE) ] = { [ C(RESULT_ACCESS) ] = -1, diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index ee05c90012d..c8f5c088cad 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -713,18 +713,18 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) struct cpu_hw_events *cpuc; int bit, loops; u64 status; - int handled = 0; + int handled; perf_sample_data_init(&data, 0); cpuc = &__get_cpu_var(cpu_hw_events); intel_pmu_disable_all(); - intel_pmu_drain_bts_buffer(); + handled = intel_pmu_drain_bts_buffer(); status = intel_pmu_get_status(); if (!status) { intel_pmu_enable_all(0); - return 0; + return handled; } loops = 0; @@ -763,7 +763,7 @@ again: data.period = event->hw.last_period; if (perf_event_overflow(event, 1, &data, regs)) - x86_pmu_stop(event); + x86_pmu_stop(event, 0); } /* diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 18018d1311c..4977f9c400e 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -214,7 +214,7 @@ static void intel_pmu_disable_bts(void) update_debugctlmsr(debugctlmsr); } -static void intel_pmu_drain_bts_buffer(void) +static int intel_pmu_drain_bts_buffer(void) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct debug_store *ds = cpuc->ds; @@ -231,16 +231,16 @@ static void intel_pmu_drain_bts_buffer(void) struct pt_regs regs; if (!event) - return; + return 0; if (!ds) - return; + return 0; at = (struct bts_record *)(unsigned long)ds->bts_buffer_base; top = (struct bts_record *)(unsigned long)ds->bts_index; if (top <= at) - return; + return 0; ds->bts_index = ds->bts_buffer_base; @@ -256,7 +256,7 @@ static void intel_pmu_drain_bts_buffer(void) perf_prepare_sample(&header, &data, event, ®s); if (perf_output_begin(&handle, event, header.size * (top - at), 1, 1)) - return; + return 1; for (; at < top; at++) { data.ip = at->from; @@ -270,6 +270,7 @@ static void intel_pmu_drain_bts_buffer(void) /* There's new data available. */ event->hw.interrupts++; event->pending_kill = POLL_IN; + return 1; } /* @@ -491,7 +492,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event, regs.flags &= ~PERF_EFLAGS_EXACT; if (perf_event_overflow(event, 1, &data, ®s)) - x86_pmu_stop(event); + x86_pmu_stop(event, 0); } static void intel_pmu_drain_pebs_core(struct pt_regs *iregs) diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index b560db3305b..81400b93e69 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -18,6 +18,8 @@ struct p4_event_bind { unsigned int opcode; /* Event code and ESCR selector */ unsigned int escr_msr[2]; /* ESCR MSR for this event */ + unsigned int escr_emask; /* valid ESCR EventMask bits */ + unsigned int shared; /* event is shared across threads */ char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on abscence */ }; @@ -66,231 +68,435 @@ static struct p4_event_bind p4_event_bind_map[] = { [P4_EVENT_TC_DELIVER_MODE] = { .opcode = P4_OPCODE(P4_EVENT_TC_DELIVER_MODE), .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DD) | + P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DB) | + P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DI) | + P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BD) | + P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BB) | + P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BI) | + P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, ID), + .shared = 1, .cntr = { {4, 5, -1}, {6, 7, -1} }, }, [P4_EVENT_BPU_FETCH_REQUEST] = { .opcode = P4_OPCODE(P4_EVENT_BPU_FETCH_REQUEST), .escr_msr = { MSR_P4_BPU_ESCR0, MSR_P4_BPU_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_BPU_FETCH_REQUEST, TCMISS), .cntr = { {0, -1, -1}, {2, -1, -1} }, }, [P4_EVENT_ITLB_REFERENCE] = { .opcode = P4_OPCODE(P4_EVENT_ITLB_REFERENCE), .escr_msr = { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, HIT) | + P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, MISS) | + P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, HIT_UK), .cntr = { {0, -1, -1}, {2, -1, -1} }, }, [P4_EVENT_MEMORY_CANCEL] = { .opcode = P4_OPCODE(P4_EVENT_MEMORY_CANCEL), .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_CANCEL, ST_RB_FULL) | + P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_CANCEL, 64K_CONF), .cntr = { {8, 9, -1}, {10, 11, -1} }, }, [P4_EVENT_MEMORY_COMPLETE] = { .opcode = P4_OPCODE(P4_EVENT_MEMORY_COMPLETE), .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_COMPLETE, LSC) | + P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_COMPLETE, SSC), .cntr = { {8, 9, -1}, {10, 11, -1} }, }, [P4_EVENT_LOAD_PORT_REPLAY] = { .opcode = P4_OPCODE(P4_EVENT_LOAD_PORT_REPLAY), .escr_msr = { MSR_P4_SAAT_ESCR0, MSR_P4_SAAT_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_LOAD_PORT_REPLAY, SPLIT_LD), .cntr = { {8, 9, -1}, {10, 11, -1} }, }, [P4_EVENT_STORE_PORT_REPLAY] = { .opcode = P4_OPCODE(P4_EVENT_STORE_PORT_REPLAY), .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_STORE_PORT_REPLAY, SPLIT_ST), .cntr = { {8, 9, -1}, {10, 11, -1} }, }, [P4_EVENT_MOB_LOAD_REPLAY] = { .opcode = P4_OPCODE(P4_EVENT_MOB_LOAD_REPLAY), .escr_msr = { MSR_P4_MOB_ESCR0, MSR_P4_MOB_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, NO_STA) | + P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, NO_STD) | + P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, PARTIAL_DATA) | + P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, UNALGN_ADDR), .cntr = { {0, -1, -1}, {2, -1, -1} }, }, [P4_EVENT_PAGE_WALK_TYPE] = { .opcode = P4_OPCODE(P4_EVENT_PAGE_WALK_TYPE), .escr_msr = { MSR_P4_PMH_ESCR0, MSR_P4_PMH_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_PAGE_WALK_TYPE, DTMISS) | + P4_ESCR_EMASK_BIT(P4_EVENT_PAGE_WALK_TYPE, ITMISS), + .shared = 1, .cntr = { {0, -1, -1}, {2, -1, -1} }, }, [P4_EVENT_BSQ_CACHE_REFERENCE] = { .opcode = P4_OPCODE(P4_EVENT_BSQ_CACHE_REFERENCE), .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS), .cntr = { {0, -1, -1}, {2, -1, -1} }, }, [P4_EVENT_IOQ_ALLOCATION] = { .opcode = P4_OPCODE(P4_EVENT_IOQ_ALLOCATION), .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, DEFAULT) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, ALL_READ) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, ALL_WRITE) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_UC) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WC) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WT) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WP) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WB) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, OWN) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, OTHER) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, PREFETCH), .cntr = { {0, -1, -1}, {2, -1, -1} }, }, [P4_EVENT_IOQ_ACTIVE_ENTRIES] = { /* shared ESCR */ .opcode = P4_OPCODE(P4_EVENT_IOQ_ACTIVE_ENTRIES), .escr_msr = { MSR_P4_FSB_ESCR1, MSR_P4_FSB_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, DEFAULT) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_READ) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_WRITE) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_UC) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WC) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WT) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WP) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WB) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, OWN) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, OTHER) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, PREFETCH), .cntr = { {2, -1, -1}, {3, -1, -1} }, }, [P4_EVENT_FSB_DATA_ACTIVITY] = { .opcode = P4_OPCODE(P4_EVENT_FSB_DATA_ACTIVITY), .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV) | + P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN) | + P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OTHER) | + P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_DRV) | + P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OWN) | + P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OTHER), + .shared = 1, .cntr = { {0, -1, -1}, {2, -1, -1} }, }, [P4_EVENT_BSQ_ALLOCATION] = { /* shared ESCR, broken CCCR1 */ .opcode = P4_OPCODE(P4_EVENT_BSQ_ALLOCATION), .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR0 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE0) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE1) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LEN0) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LEN1) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_IO_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LOCK_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_CACHE_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_SPLIT_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_DEM_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_ORD_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE0) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE1) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE2), .cntr = { {0, -1, -1}, {1, -1, -1} }, }, [P4_EVENT_BSQ_ACTIVE_ENTRIES] = { /* shared ESCR */ .opcode = P4_OPCODE(P4_EVENT_BSQ_ACTIVE_ENTRIES), .escr_msr = { MSR_P4_BSU_ESCR1 , MSR_P4_BSU_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE0) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE1) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN0) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN1) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_IO_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LOCK_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_CACHE_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_SPLIT_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_DEM_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_ORD_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE0) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE1) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE2), .cntr = { {2, -1, -1}, {3, -1, -1} }, }, [P4_EVENT_SSE_INPUT_ASSIST] = { .opcode = P4_OPCODE(P4_EVENT_SSE_INPUT_ASSIST), .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_SSE_INPUT_ASSIST, ALL), + .shared = 1, .cntr = { {8, 9, -1}, {10, 11, -1} }, }, [P4_EVENT_PACKED_SP_UOP] = { .opcode = P4_OPCODE(P4_EVENT_PACKED_SP_UOP), .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_PACKED_SP_UOP, ALL), + .shared = 1, .cntr = { {8, 9, -1}, {10, 11, -1} }, }, [P4_EVENT_PACKED_DP_UOP] = { .opcode = P4_OPCODE(P4_EVENT_PACKED_DP_UOP), .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_PACKED_DP_UOP, ALL), + .shared = 1, .cntr = { {8, 9, -1}, {10, 11, -1} }, }, [P4_EVENT_SCALAR_SP_UOP] = { .opcode = P4_OPCODE(P4_EVENT_SCALAR_SP_UOP), .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_SCALAR_SP_UOP, ALL), + .shared = 1, .cntr = { {8, 9, -1}, {10, 11, -1} }, }, [P4_EVENT_SCALAR_DP_UOP] = { .opcode = P4_OPCODE(P4_EVENT_SCALAR_DP_UOP), .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_SCALAR_DP_UOP, ALL), + .shared = 1, .cntr = { {8, 9, -1}, {10, 11, -1} }, }, [P4_EVENT_64BIT_MMX_UOP] = { .opcode = P4_OPCODE(P4_EVENT_64BIT_MMX_UOP), .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_64BIT_MMX_UOP, ALL), + .shared = 1, .cntr = { {8, 9, -1}, {10, 11, -1} }, }, [P4_EVENT_128BIT_MMX_UOP] = { .opcode = P4_OPCODE(P4_EVENT_128BIT_MMX_UOP), .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_128BIT_MMX_UOP, ALL), + .shared = 1, .cntr = { {8, 9, -1}, {10, 11, -1} }, }, [P4_EVENT_X87_FP_UOP] = { .opcode = P4_OPCODE(P4_EVENT_X87_FP_UOP), .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_X87_FP_UOP, ALL), + .shared = 1, .cntr = { {8, 9, -1}, {10, 11, -1} }, }, [P4_EVENT_TC_MISC] = { .opcode = P4_OPCODE(P4_EVENT_TC_MISC), .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_TC_MISC, FLUSH), .cntr = { {4, 5, -1}, {6, 7, -1} }, }, [P4_EVENT_GLOBAL_POWER_EVENTS] = { .opcode = P4_OPCODE(P4_EVENT_GLOBAL_POWER_EVENTS), .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING), .cntr = { {0, -1, -1}, {2, -1, -1} }, }, [P4_EVENT_TC_MS_XFER] = { .opcode = P4_OPCODE(P4_EVENT_TC_MS_XFER), .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_TC_MS_XFER, CISC), .cntr = { {4, 5, -1}, {6, 7, -1} }, }, [P4_EVENT_UOP_QUEUE_WRITES] = { .opcode = P4_OPCODE(P4_EVENT_UOP_QUEUE_WRITES), .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_BUILD) | + P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_DELIVER) | + P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_ROM), .cntr = { {4, 5, -1}, {6, 7, -1} }, }, [P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE] = { .opcode = P4_OPCODE(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE), .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR0 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CONDITIONAL) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CALL) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, RETURN) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, INDIRECT), .cntr = { {4, 5, -1}, {6, 7, -1} }, }, [P4_EVENT_RETIRED_BRANCH_TYPE] = { .opcode = P4_OPCODE(P4_EVENT_RETIRED_BRANCH_TYPE), .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CALL) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT), .cntr = { {4, 5, -1}, {6, 7, -1} }, }, [P4_EVENT_RESOURCE_STALL] = { .opcode = P4_OPCODE(P4_EVENT_RESOURCE_STALL), .escr_msr = { MSR_P4_ALF_ESCR0, MSR_P4_ALF_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_RESOURCE_STALL, SBFULL), .cntr = { {12, 13, 16}, {14, 15, 17} }, }, [P4_EVENT_WC_BUFFER] = { .opcode = P4_OPCODE(P4_EVENT_WC_BUFFER), .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_WC_BUFFER, WCB_EVICTS) | + P4_ESCR_EMASK_BIT(P4_EVENT_WC_BUFFER, WCB_FULL_EVICTS), + .shared = 1, .cntr = { {8, 9, -1}, {10, 11, -1} }, }, [P4_EVENT_B2B_CYCLES] = { .opcode = P4_OPCODE(P4_EVENT_B2B_CYCLES), .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, + .escr_emask = 0, .cntr = { {0, -1, -1}, {2, -1, -1} }, }, [P4_EVENT_BNR] = { .opcode = P4_OPCODE(P4_EVENT_BNR), .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, + .escr_emask = 0, .cntr = { {0, -1, -1}, {2, -1, -1} }, }, [P4_EVENT_SNOOP] = { .opcode = P4_OPCODE(P4_EVENT_SNOOP), .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, + .escr_emask = 0, .cntr = { {0, -1, -1}, {2, -1, -1} }, }, [P4_EVENT_RESPONSE] = { .opcode = P4_OPCODE(P4_EVENT_RESPONSE), .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, + .escr_emask = 0, .cntr = { {0, -1, -1}, {2, -1, -1} }, }, [P4_EVENT_FRONT_END_EVENT] = { .opcode = P4_OPCODE(P4_EVENT_FRONT_END_EVENT), .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_FRONT_END_EVENT, NBOGUS) | + P4_ESCR_EMASK_BIT(P4_EVENT_FRONT_END_EVENT, BOGUS), .cntr = { {12, 13, 16}, {14, 15, 17} }, }, [P4_EVENT_EXECUTION_EVENT] = { .opcode = P4_OPCODE(P4_EVENT_EXECUTION_EVENT), .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3), .cntr = { {12, 13, 16}, {14, 15, 17} }, }, [P4_EVENT_REPLAY_EVENT] = { .opcode = P4_OPCODE(P4_EVENT_REPLAY_EVENT), .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, NBOGUS) | + P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, BOGUS), .cntr = { {12, 13, 16}, {14, 15, 17} }, }, [P4_EVENT_INSTR_RETIRED] = { .opcode = P4_OPCODE(P4_EVENT_INSTR_RETIRED), .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG) | + P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSTAG) | + P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSNTAG) | + P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSTAG), .cntr = { {12, 13, 16}, {14, 15, 17} }, }, [P4_EVENT_UOPS_RETIRED] = { .opcode = P4_OPCODE(P4_EVENT_UOPS_RETIRED), .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_UOPS_RETIRED, NBOGUS) | + P4_ESCR_EMASK_BIT(P4_EVENT_UOPS_RETIRED, BOGUS), .cntr = { {12, 13, 16}, {14, 15, 17} }, }, [P4_EVENT_UOP_TYPE] = { .opcode = P4_OPCODE(P4_EVENT_UOP_TYPE), .escr_msr = { MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_UOP_TYPE, TAGLOADS) | + P4_ESCR_EMASK_BIT(P4_EVENT_UOP_TYPE, TAGSTORES), .cntr = { {12, 13, 16}, {14, 15, 17} }, }, [P4_EVENT_BRANCH_RETIRED] = { .opcode = P4_OPCODE(P4_EVENT_BRANCH_RETIRED), .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMNP) | + P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMNM) | + P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMTP) | + P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMTM), .cntr = { {12, 13, 16}, {14, 15, 17} }, }, [P4_EVENT_MISPRED_BRANCH_RETIRED] = { .opcode = P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED), .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS), .cntr = { {12, 13, 16}, {14, 15, 17} }, }, [P4_EVENT_X87_ASSIST] = { .opcode = P4_OPCODE(P4_EVENT_X87_ASSIST), .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, FPSU) | + P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, FPSO) | + P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, POAO) | + P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, POAU) | + P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, PREA), .cntr = { {12, 13, 16}, {14, 15, 17} }, }, [P4_EVENT_MACHINE_CLEAR] = { .opcode = P4_OPCODE(P4_EVENT_MACHINE_CLEAR), .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, CLEAR) | + P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, MOCLEAR) | + P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, SMCLEAR), .cntr = { {12, 13, 16}, {14, 15, 17} }, }, [P4_EVENT_INSTR_COMPLETED] = { .opcode = P4_OPCODE(P4_EVENT_INSTR_COMPLETED), .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_COMPLETED, NBOGUS) | + P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_COMPLETED, BOGUS), .cntr = { {12, 13, 16}, {14, 15, 17} }, }, }; @@ -428,29 +634,73 @@ static u64 p4_pmu_event_map(int hw_event) return config; } +/* check cpu model specifics */ +static bool p4_event_match_cpu_model(unsigned int event_idx) +{ + /* INSTR_COMPLETED event only exist for model 3, 4, 6 (Prescott) */ + if (event_idx == P4_EVENT_INSTR_COMPLETED) { + if (boot_cpu_data.x86_model != 3 && + boot_cpu_data.x86_model != 4 && + boot_cpu_data.x86_model != 6) + return false; + } + + /* + * For info + * - IQ_ESCR0, IQ_ESCR1 only for models 1 and 2 + */ + + return true; +} + static int p4_validate_raw_event(struct perf_event *event) { - unsigned int v; + unsigned int v, emask; - /* user data may have out-of-bound event index */ + /* User data may have out-of-bound event index */ v = p4_config_unpack_event(event->attr.config); - if (v >= ARRAY_SIZE(p4_event_bind_map)) { - pr_warning("P4 PMU: Unknown event code: %d\n", v); + if (v >= ARRAY_SIZE(p4_event_bind_map)) + return -EINVAL; + + /* It may be unsupported: */ + if (!p4_event_match_cpu_model(v)) return -EINVAL; + + /* + * NOTE: P4_CCCR_THREAD_ANY has not the same meaning as + * in Architectural Performance Monitoring, it means not + * on _which_ logical cpu to count but rather _when_, ie it + * depends on logical cpu state -- count event if one cpu active, + * none, both or any, so we just allow user to pass any value + * desired. + * + * In turn we always set Tx_OS/Tx_USR bits bound to logical + * cpu without their propagation to another cpu + */ + + /* + * if an event is shared accross the logical threads + * the user needs special permissions to be able to use it + */ + if (p4_event_bind_map[v].shared) { + if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) + return -EACCES; } + /* ESCR EventMask bits may be invalid */ + emask = p4_config_unpack_escr(event->attr.config) & P4_ESCR_EVENTMASK_MASK; + if (emask & ~p4_event_bind_map[v].escr_emask) + return -EINVAL; + /* - * it may have some screwed PEBS bits + * it may have some invalid PEBS bits */ - if (p4_config_pebs_has(event->attr.config, P4_PEBS_CONFIG_ENABLE)) { - pr_warning("P4 PMU: PEBS are not supported yet\n"); + if (p4_config_pebs_has(event->attr.config, P4_PEBS_CONFIG_ENABLE)) return -EINVAL; - } + v = p4_config_unpack_metric(event->attr.config); - if (v >= ARRAY_SIZE(p4_pebs_bind_map)) { - pr_warning("P4 PMU: Unknown metric code: %d\n", v); + if (v >= ARRAY_SIZE(p4_pebs_bind_map)) return -EINVAL; - } return 0; } @@ -478,27 +728,21 @@ static int p4_hw_config(struct perf_event *event) if (event->attr.type == PERF_TYPE_RAW) { + /* + * Clear bits we reserve to be managed by kernel itself + * and never allowed from a user space + */ + event->attr.config &= P4_CONFIG_MASK; + rc = p4_validate_raw_event(event); if (rc) goto out; /* - * We don't control raw events so it's up to the caller - * to pass sane values (and we don't count the thread number - * on HT machine but allow HT-compatible specifics to be - * passed on) - * * Note that for RAW events we allow user to use P4_CCCR_RESERVED * bits since we keep additional info here (for cache events and etc) - * - * XXX: HT wide things should check perf_paranoid_cpu() && - * CAP_SYS_ADMIN */ - event->hw.config |= event->attr.config & - (p4_config_pack_escr(P4_ESCR_MASK_HT) | - p4_config_pack_cccr(P4_CCCR_MASK_HT | P4_CCCR_RESERVED)); - - event->hw.config &= ~P4_CCCR_FORCE_OVF; + event->hw.config |= event->attr.config; } rc = x86_setup_perfctr(event); @@ -660,8 +904,12 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) for (idx = 0; idx < x86_pmu.num_counters; idx++) { int overflow; - if (!test_bit(idx, cpuc->active_mask)) + if (!test_bit(idx, cpuc->active_mask)) { + /* catch in-flight IRQs */ + if (__test_and_clear_bit(idx, cpuc->running)) + handled++; continue; + } event = cpuc->events[idx]; hwc = &event->hw; diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index fb329e9f849..d9f4ff8fcd6 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -700,11 +700,10 @@ static void probe_nmi_watchdog(void) { switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: - if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 && - boot_cpu_data.x86 != 16 && boot_cpu_data.x86 != 17) - return; - wd_ops = &k7_wd_ops; - break; + if (boot_cpu_data.x86 == 6 || + (boot_cpu_data.x86 >= 0xf && boot_cpu_data.x86 <= 0x15)) + wd_ops = &k7_wd_ops; + return; case X86_VENDOR_INTEL: /* Work around where perfctr1 doesn't have a working enable * bit as described in the following errata: diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 34b4dad6f0b..c7f64e6f537 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -31,6 +31,7 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) const struct cpuid_bit *cb; static const struct cpuid_bit __cpuinitconst cpuid_bits[] = { + { X86_FEATURE_DTS, CR_EAX, 0, 0x00000006, 0 }, { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006, 0 }, { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 }, { X86_FEATURE_PLN, CR_EAX, 4, 0x00000006, 0 }, @@ -43,6 +44,12 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 }, { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 }, { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a, 0 }, + { X86_FEATURE_TSCRATEMSR, CR_EDX, 4, 0x8000000a, 0 }, + { X86_FEATURE_VMCBCLEAN, CR_EDX, 5, 0x8000000a, 0 }, + { X86_FEATURE_FLUSHBYASID, CR_EDX, 6, 0x8000000a, 0 }, + { X86_FEATURE_DECODEASSISTS, CR_EDX, 7, 0x8000000a, 0 }, + { X86_FEATURE_PAUSEFILTER, CR_EDX,10, 0x8000000a, 0 }, + { X86_FEATURE_PFTHRESHOLD, CR_EDX,12, 0x8000000a, 0 }, { 0, 0, 0, 0, 0 } }; diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index e5cc7e82e60..76b8cd953de 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -18,7 +18,6 @@ #include <asm/apic.h> #include <asm/iommu.h> #include <asm/gart.h> -#include <asm/hpet.h> static void __init fix_hypertransport_config(int num, int slot, int func) { @@ -98,7 +97,6 @@ static void __init nvidia_bugs(int num, int slot, int func) } #if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC) -#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC) static u32 __init ati_ixp4x0_rev(int num, int slot, int func) { u32 d; @@ -116,7 +114,6 @@ static u32 __init ati_ixp4x0_rev(int num, int slot, int func) d &= 0xff; return d; } -#endif static void __init ati_bugs(int num, int slot, int func) { @@ -192,21 +189,6 @@ static void __init ati_bugs_contd(int num, int slot, int func) } #endif -/* - * Force the read back of the CMP register in hpet_next_event() - * to work around the problem that the CMP register write seems to be - * delayed. See hpet_next_event() for details. - * - * We do this on all SMBUS incarnations for now until we have more - * information about the affected chipsets. - */ -static void __init ati_hpet_bugs(int num, int slot, int func) -{ -#ifdef CONFIG_HPET_TIMER - hpet_readback_cmp = 1; -#endif -} - #define QFLAG_APPLY_ONCE 0x1 #define QFLAG_APPLIED 0x2 #define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED) @@ -236,8 +218,6 @@ static struct chipset early_qrk[] __initdata = { PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs }, { PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS, PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs_contd }, - { PCI_VENDOR_ID_ATI, PCI_ANY_ID, - PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_hpet_bugs }, {} }; diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 227d00920d2..9fb188d7bc7 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -115,8 +115,7 @@ /* unfortunately push/pop can't be no-op */ .macro PUSH_GS - pushl $0 - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $0 .endm .macro POP_GS pop=0 addl $(4 + \pop), %esp @@ -140,14 +139,12 @@ #else /* CONFIG_X86_32_LAZY_GS */ .macro PUSH_GS - pushl %gs - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %gs /*CFI_REL_OFFSET gs, 0*/ .endm .macro POP_GS pop=0 -98: popl %gs - CFI_ADJUST_CFA_OFFSET -4 +98: popl_cfi %gs /*CFI_RESTORE gs*/ .if \pop <> 0 add $\pop, %esp @@ -195,35 +192,25 @@ .macro SAVE_ALL cld PUSH_GS - pushl %fs - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %fs /*CFI_REL_OFFSET fs, 0;*/ - pushl %es - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %es /*CFI_REL_OFFSET es, 0;*/ - pushl %ds - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ds /*CFI_REL_OFFSET ds, 0;*/ - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %eax CFI_REL_OFFSET eax, 0 - pushl %ebp - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ebp CFI_REL_OFFSET ebp, 0 - pushl %edi - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %edi CFI_REL_OFFSET edi, 0 - pushl %esi - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %esi CFI_REL_OFFSET esi, 0 - pushl %edx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %edx CFI_REL_OFFSET edx, 0 - pushl %ecx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ecx CFI_REL_OFFSET ecx, 0 - pushl %ebx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ebx CFI_REL_OFFSET ebx, 0 movl $(__USER_DS), %edx movl %edx, %ds @@ -234,39 +221,29 @@ .endm .macro RESTORE_INT_REGS - popl %ebx - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %ebx CFI_RESTORE ebx - popl %ecx - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %ecx CFI_RESTORE ecx - popl %edx - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %edx CFI_RESTORE edx - popl %esi - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %esi CFI_RESTORE esi - popl %edi - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %edi CFI_RESTORE edi - popl %ebp - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %ebp CFI_RESTORE ebp - popl %eax - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %eax CFI_RESTORE eax .endm .macro RESTORE_REGS pop=0 RESTORE_INT_REGS -1: popl %ds - CFI_ADJUST_CFA_OFFSET -4 +1: popl_cfi %ds /*CFI_RESTORE ds;*/ -2: popl %es - CFI_ADJUST_CFA_OFFSET -4 +2: popl_cfi %es /*CFI_RESTORE es;*/ -3: popl %fs - CFI_ADJUST_CFA_OFFSET -4 +3: popl_cfi %fs /*CFI_RESTORE fs;*/ POP_GS \pop .pushsection .fixup, "ax" @@ -320,16 +297,12 @@ ENTRY(ret_from_fork) CFI_STARTPROC - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %eax call schedule_tail GET_THREAD_INFO(%ebp) - popl %eax - CFI_ADJUST_CFA_OFFSET -4 - pushl $0x0202 # Reset kernel eflags - CFI_ADJUST_CFA_OFFSET 4 - popfl - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %eax + pushl_cfi $0x0202 # Reset kernel eflags + popfl_cfi jmp syscall_exit CFI_ENDPROC END(ret_from_fork) @@ -409,29 +382,23 @@ sysenter_past_esp: * enough kernel state to call TRACE_IRQS_OFF can be called - but * we immediately enable interrupts at that point anyway. */ - pushl $(__USER_DS) - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $(__USER_DS) /*CFI_REL_OFFSET ss, 0*/ - pushl %ebp - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ebp CFI_REL_OFFSET esp, 0 - pushfl + pushfl_cfi orl $X86_EFLAGS_IF, (%esp) - CFI_ADJUST_CFA_OFFSET 4 - pushl $(__USER_CS) - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $(__USER_CS) /*CFI_REL_OFFSET cs, 0*/ /* * Push current_thread_info()->sysenter_return to the stack. * A tiny bit of offset fixup is necessary - 4*4 means the 4 words * pushed above; +8 corresponds to copy_thread's esp0 setting. */ - pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp) - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp) CFI_REL_OFFSET eip, 0 - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %eax SAVE_ALL ENABLE_INTERRUPTS(CLBR_NONE) @@ -486,8 +453,7 @@ sysenter_audit: movl %eax,%edx /* 2nd arg: syscall number */ movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */ call audit_syscall_entry - pushl %ebx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ebx movl PT_EAX(%esp),%eax /* reload syscall number */ jmp sysenter_do_call @@ -529,8 +495,7 @@ ENDPROC(ia32_sysenter_target) # system call handler stub ENTRY(system_call) RING0_INT_FRAME # can't unwind into user space anyway - pushl %eax # save orig_eax - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %eax # save orig_eax SAVE_ALL GET_THREAD_INFO(%ebp) # system call tracing in operation / emulation @@ -566,7 +531,6 @@ restore_all_notrace: je ldt_ss # returning to user-space with LDT SS restore_nocheck: RESTORE_REGS 4 # skip orig_eax/error_code - CFI_ADJUST_CFA_OFFSET -4 irq_return: INTERRUPT_RETURN .section .fixup,"ax" @@ -619,10 +583,8 @@ ldt_ss: shr $16, %edx mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */ mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */ - pushl $__ESPFIX_SS - CFI_ADJUST_CFA_OFFSET 4 - push %eax /* new kernel esp */ - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $__ESPFIX_SS + pushl_cfi %eax /* new kernel esp */ /* Disable interrupts, but do not irqtrace this section: we * will soon execute iret and the tracer was already set to * the irqstate after the iret */ @@ -666,11 +628,9 @@ work_notifysig: # deal with pending signals and ALIGN work_notifysig_v86: - pushl %ecx # save ti_flags for do_notify_resume - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ecx # save ti_flags for do_notify_resume call save_v86_state # %eax contains pt_regs pointer - popl %ecx - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %ecx movl %eax, %esp #else movl %esp, %eax @@ -750,14 +710,18 @@ ptregs_##name: \ #define PTREGSCALL3(name) \ ALIGN; \ ptregs_##name: \ + CFI_STARTPROC; \ leal 4(%esp),%eax; \ - pushl %eax; \ + pushl_cfi %eax; \ movl PT_EDX(%eax),%ecx; \ movl PT_ECX(%eax),%edx; \ movl PT_EBX(%eax),%eax; \ call sys_##name; \ addl $4,%esp; \ - ret + CFI_ADJUST_CFA_OFFSET -4; \ + ret; \ + CFI_ENDPROC; \ +ENDPROC(ptregs_##name) PTREGSCALL1(iopl) PTREGSCALL0(fork) @@ -772,15 +736,19 @@ PTREGSCALL1(vm86old) /* Clone is an oddball. The 4th arg is in %edi */ ALIGN; ptregs_clone: + CFI_STARTPROC leal 4(%esp),%eax - pushl %eax - pushl PT_EDI(%eax) + pushl_cfi %eax + pushl_cfi PT_EDI(%eax) movl PT_EDX(%eax),%ecx movl PT_ECX(%eax),%edx movl PT_EBX(%eax),%eax call sys_clone addl $8,%esp + CFI_ADJUST_CFA_OFFSET -8 ret + CFI_ENDPROC +ENDPROC(ptregs_clone) .macro FIXUP_ESPFIX_STACK /* @@ -795,10 +763,8 @@ ptregs_clone: mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */ shl $16, %eax addl %esp, %eax /* the adjusted stack pointer */ - pushl $__KERNEL_DS - CFI_ADJUST_CFA_OFFSET 4 - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $__KERNEL_DS + pushl_cfi %eax lss (%esp), %esp /* switch to the normal stack segment */ CFI_ADJUST_CFA_OFFSET -8 .endm @@ -835,8 +801,7 @@ vector=FIRST_EXTERNAL_VECTOR .if vector <> FIRST_EXTERNAL_VECTOR CFI_ADJUST_CFA_OFFSET -4 .endif -1: pushl $(~vector+0x80) /* Note: always in signed byte range */ - CFI_ADJUST_CFA_OFFSET 4 +1: pushl_cfi $(~vector+0x80) /* Note: always in signed byte range */ .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6 jmp 2f .endif @@ -876,8 +841,7 @@ ENDPROC(common_interrupt) #define BUILD_INTERRUPT3(name, nr, fn) \ ENTRY(name) \ RING0_INT_FRAME; \ - pushl $~(nr); \ - CFI_ADJUST_CFA_OFFSET 4; \ + pushl_cfi $~(nr); \ SAVE_ALL; \ TRACE_IRQS_OFF \ movl %esp,%eax; \ @@ -893,21 +857,18 @@ ENDPROC(name) ENTRY(coprocessor_error) RING0_INT_FRAME - pushl $0 - CFI_ADJUST_CFA_OFFSET 4 - pushl $do_coprocessor_error - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $0 + pushl_cfi $do_coprocessor_error jmp error_code CFI_ENDPROC END(coprocessor_error) ENTRY(simd_coprocessor_error) RING0_INT_FRAME - pushl $0 - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $0 #ifdef CONFIG_X86_INVD_BUG /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */ -661: pushl $do_general_protection +661: pushl_cfi $do_general_protection 662: .section .altinstructions,"a" .balign 4 @@ -922,19 +883,16 @@ ENTRY(simd_coprocessor_error) 664: .previous #else - pushl $do_simd_coprocessor_error + pushl_cfi $do_simd_coprocessor_error #endif - CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC END(simd_coprocessor_error) ENTRY(device_not_available) RING0_INT_FRAME - pushl $-1 # mark this as an int - CFI_ADJUST_CFA_OFFSET 4 - pushl $do_device_not_available - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $-1 # mark this as an int + pushl_cfi $do_device_not_available jmp error_code CFI_ENDPROC END(device_not_available) @@ -956,82 +914,68 @@ END(native_irq_enable_sysexit) ENTRY(overflow) RING0_INT_FRAME - pushl $0 - CFI_ADJUST_CFA_OFFSET 4 - pushl $do_overflow - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $0 + pushl_cfi $do_overflow jmp error_code CFI_ENDPROC END(overflow) ENTRY(bounds) RING0_INT_FRAME - pushl $0 - CFI_ADJUST_CFA_OFFSET 4 - pushl $do_bounds - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $0 + pushl_cfi $do_bounds jmp error_code CFI_ENDPROC END(bounds) ENTRY(invalid_op) RING0_INT_FRAME - pushl $0 - CFI_ADJUST_CFA_OFFSET 4 - pushl $do_invalid_op - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $0 + pushl_cfi $do_invalid_op jmp error_code CFI_ENDPROC END(invalid_op) ENTRY(coprocessor_segment_overrun) RING0_INT_FRAME - pushl $0 - CFI_ADJUST_CFA_OFFSET 4 - pushl $do_coprocessor_segment_overrun - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $0 + pushl_cfi $do_coprocessor_segment_overrun jmp error_code CFI_ENDPROC END(coprocessor_segment_overrun) ENTRY(invalid_TSS) RING0_EC_FRAME - pushl $do_invalid_TSS - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $do_invalid_TSS jmp error_code CFI_ENDPROC END(invalid_TSS) ENTRY(segment_not_present) RING0_EC_FRAME - pushl $do_segment_not_present - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $do_segment_not_present jmp error_code CFI_ENDPROC END(segment_not_present) ENTRY(stack_segment) RING0_EC_FRAME - pushl $do_stack_segment - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $do_stack_segment jmp error_code CFI_ENDPROC END(stack_segment) ENTRY(alignment_check) RING0_EC_FRAME - pushl $do_alignment_check - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $do_alignment_check jmp error_code CFI_ENDPROC END(alignment_check) ENTRY(divide_error) RING0_INT_FRAME - pushl $0 # no error code - CFI_ADJUST_CFA_OFFSET 4 - pushl $do_divide_error - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $0 # no error code + pushl_cfi $do_divide_error jmp error_code CFI_ENDPROC END(divide_error) @@ -1039,10 +983,8 @@ END(divide_error) #ifdef CONFIG_X86_MCE ENTRY(machine_check) RING0_INT_FRAME - pushl $0 - CFI_ADJUST_CFA_OFFSET 4 - pushl machine_check_vector - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $0 + pushl_cfi machine_check_vector jmp error_code CFI_ENDPROC END(machine_check) @@ -1050,10 +992,8 @@ END(machine_check) ENTRY(spurious_interrupt_bug) RING0_INT_FRAME - pushl $0 - CFI_ADJUST_CFA_OFFSET 4 - pushl $do_spurious_interrupt_bug - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $0 + pushl_cfi $do_spurious_interrupt_bug jmp error_code CFI_ENDPROC END(spurious_interrupt_bug) @@ -1084,8 +1024,7 @@ ENTRY(xen_sysenter_target) ENTRY(xen_hypervisor_callback) CFI_STARTPROC - pushl $0 - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $0 SAVE_ALL TRACE_IRQS_OFF @@ -1121,23 +1060,20 @@ ENDPROC(xen_hypervisor_callback) # We distinguish between categories by maintaining a status value in EAX. ENTRY(xen_failsafe_callback) CFI_STARTPROC - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %eax movl $1,%eax 1: mov 4(%esp),%ds 2: mov 8(%esp),%es 3: mov 12(%esp),%fs 4: mov 16(%esp),%gs testl %eax,%eax - popl %eax - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %eax lea 16(%esp),%esp CFI_ADJUST_CFA_OFFSET -16 jz 5f addl $16,%esp jmp iret_exc # EAX != 0 => Category 2 (Bad IRET) -5: pushl $0 # EAX == 0 => Category 1 (Bad segment) - CFI_ADJUST_CFA_OFFSET 4 +5: pushl_cfi $0 # EAX == 0 => Category 1 (Bad segment) SAVE_ALL jmp ret_from_exception CFI_ENDPROC @@ -1287,40 +1223,29 @@ syscall_table_size=(.-sys_call_table) ENTRY(page_fault) RING0_EC_FRAME - pushl $do_page_fault - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $do_page_fault ALIGN error_code: /* the function address is in %gs's slot on the stack */ - pushl %fs - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %fs /*CFI_REL_OFFSET fs, 0*/ - pushl %es - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %es /*CFI_REL_OFFSET es, 0*/ - pushl %ds - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ds /*CFI_REL_OFFSET ds, 0*/ - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %eax CFI_REL_OFFSET eax, 0 - pushl %ebp - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ebp CFI_REL_OFFSET ebp, 0 - pushl %edi - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %edi CFI_REL_OFFSET edi, 0 - pushl %esi - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %esi CFI_REL_OFFSET esi, 0 - pushl %edx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %edx CFI_REL_OFFSET edx, 0 - pushl %ecx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ecx CFI_REL_OFFSET ecx, 0 - pushl %ebx - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ebx CFI_REL_OFFSET ebx, 0 cld movl $(__KERNEL_PERCPU), %ecx @@ -1362,12 +1287,9 @@ END(page_fault) movl TSS_sysenter_sp0 + \offset(%esp), %esp CFI_DEF_CFA esp, 0 CFI_UNDEFINED eip - pushfl - CFI_ADJUST_CFA_OFFSET 4 - pushl $__KERNEL_CS - CFI_ADJUST_CFA_OFFSET 4 - pushl $sysenter_past_esp - CFI_ADJUST_CFA_OFFSET 4 + pushfl_cfi + pushl_cfi $__KERNEL_CS + pushl_cfi $sysenter_past_esp CFI_REL_OFFSET eip, 0 .endm @@ -1377,8 +1299,7 @@ ENTRY(debug) jne debug_stack_correct FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn debug_stack_correct: - pushl $-1 # mark this as an int - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $-1 # mark this as an int SAVE_ALL TRACE_IRQS_OFF xorl %edx,%edx # error code 0 @@ -1398,32 +1319,27 @@ END(debug) */ ENTRY(nmi) RING0_INT_FRAME - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %eax movl %ss, %eax cmpw $__ESPFIX_SS, %ax - popl %eax - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %eax je nmi_espfix_stack cmpl $ia32_sysenter_target,(%esp) je nmi_stack_fixup - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %eax movl %esp,%eax /* Do not access memory above the end of our stack page, * it might not exist. */ andl $(THREAD_SIZE-1),%eax cmpl $(THREAD_SIZE-20),%eax - popl %eax - CFI_ADJUST_CFA_OFFSET -4 + popl_cfi %eax jae nmi_stack_correct cmpl $ia32_sysenter_target,12(%esp) je nmi_debug_stack_check nmi_stack_correct: /* We have a RING0_INT_FRAME here */ - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %eax SAVE_ALL xorl %edx,%edx # zero error code movl %esp,%eax # pt_regs pointer @@ -1452,18 +1368,14 @@ nmi_espfix_stack: * * create the pointer to lss back */ - pushl %ss - CFI_ADJUST_CFA_OFFSET 4 - pushl %esp - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %ss + pushl_cfi %esp addl $4, (%esp) /* copy the iret frame of 12 bytes */ .rept 3 - pushl 16(%esp) - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi 16(%esp) .endr - pushl %eax - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi %eax SAVE_ALL FIXUP_ESPFIX_STACK # %eax == %esp xorl %edx,%edx # zero error code @@ -1477,8 +1389,7 @@ END(nmi) ENTRY(int3) RING0_INT_FRAME - pushl $-1 # mark this as an int - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $-1 # mark this as an int SAVE_ALL TRACE_IRQS_OFF xorl %edx,%edx # zero error code @@ -1490,8 +1401,7 @@ END(int3) ENTRY(general_protection) RING0_EC_FRAME - pushl $do_general_protection - CFI_ADJUST_CFA_OFFSET 4 + pushl_cfi $do_general_protection jmp error_code CFI_ENDPROC END(general_protection) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 17be5ec7cbb..a7ae7fd1010 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -213,23 +213,17 @@ ENDPROC(native_usergs_sysret64) .macro FAKE_STACK_FRAME child_rip /* push in order ss, rsp, eflags, cs, rip */ xorl %eax, %eax - pushq $__KERNEL_DS /* ss */ - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi $__KERNEL_DS /* ss */ /*CFI_REL_OFFSET ss,0*/ - pushq %rax /* rsp */ - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi %rax /* rsp */ CFI_REL_OFFSET rsp,0 - pushq $X86_EFLAGS_IF /* eflags - interrupts on */ - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi $X86_EFLAGS_IF /* eflags - interrupts on */ /*CFI_REL_OFFSET rflags,0*/ - pushq $__KERNEL_CS /* cs */ - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi $__KERNEL_CS /* cs */ /*CFI_REL_OFFSET cs,0*/ - pushq \child_rip /* rip */ - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi \child_rip /* rip */ CFI_REL_OFFSET rip,0 - pushq %rax /* orig rax */ - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi %rax /* orig rax */ .endm .macro UNFAKE_STACK_FRAME @@ -398,10 +392,8 @@ ENTRY(ret_from_fork) LOCK ; btr $TIF_FORK,TI_flags(%r8) - push kernel_eflags(%rip) - CFI_ADJUST_CFA_OFFSET 8 - popf # reset kernel eflags - CFI_ADJUST_CFA_OFFSET -8 + pushq_cfi kernel_eflags(%rip) + popfq_cfi # reset kernel eflags call schedule_tail # rdi: 'prev' task parameter @@ -521,11 +513,9 @@ sysret_careful: jnc sysret_signal TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) - pushq %rdi - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi %rdi call schedule - popq %rdi - CFI_ADJUST_CFA_OFFSET -8 + popq_cfi %rdi jmp sysret_check /* Handle a signal */ @@ -634,11 +624,9 @@ int_careful: jnc int_very_careful TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) - pushq %rdi - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi %rdi call schedule - popq %rdi - CFI_ADJUST_CFA_OFFSET -8 + popq_cfi %rdi DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF jmp int_with_check @@ -652,12 +640,10 @@ int_check_syscall_exit_work: /* Check for syscall exit trace */ testl $_TIF_WORK_SYSCALL_EXIT,%edx jz int_signal - pushq %rdi - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi %rdi leaq 8(%rsp),%rdi # &ptregs -> arg1 call syscall_trace_leave - popq %rdi - CFI_ADJUST_CFA_OFFSET -8 + popq_cfi %rdi andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi jmp int_restore_rest @@ -714,9 +700,8 @@ END(ptregscall_common) ENTRY(stub_execve) CFI_STARTPROC - popq %r11 - CFI_ADJUST_CFA_OFFSET -8 - CFI_REGISTER rip, r11 + addq $8, %rsp + PARTIAL_FRAME 0 SAVE_REST FIXUP_TOP_OF_STACK %r11 movq %rsp, %rcx @@ -735,7 +720,7 @@ END(stub_execve) ENTRY(stub_rt_sigreturn) CFI_STARTPROC addq $8, %rsp - CFI_ADJUST_CFA_OFFSET -8 + PARTIAL_FRAME 0 SAVE_REST movq %rsp,%rdi FIXUP_TOP_OF_STACK %r11 @@ -766,8 +751,7 @@ vector=FIRST_EXTERNAL_VECTOR .if vector <> FIRST_EXTERNAL_VECTOR CFI_ADJUST_CFA_OFFSET -8 .endif -1: pushq $(~vector+0x80) /* Note: always in signed byte range */ - CFI_ADJUST_CFA_OFFSET 8 +1: pushq_cfi $(~vector+0x80) /* Note: always in signed byte range */ .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6 jmp 2f .endif @@ -796,8 +780,8 @@ END(interrupt) /* 0(%rsp): ~(interrupt number) */ .macro interrupt func - subq $10*8, %rsp - CFI_ADJUST_CFA_OFFSET 10*8 + subq $ORIG_RAX-ARGOFFSET+8, %rsp + CFI_ADJUST_CFA_OFFSET ORIG_RAX-ARGOFFSET+8 call save_args PARTIAL_FRAME 0 call \func @@ -822,6 +806,7 @@ ret_from_intr: TRACE_IRQS_OFF decl PER_CPU_VAR(irq_count) leaveq + CFI_RESTORE rbp CFI_DEF_CFA_REGISTER rsp CFI_ADJUST_CFA_OFFSET -8 exit_intr: @@ -903,11 +888,9 @@ retint_careful: jnc retint_signal TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) - pushq %rdi - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi %rdi call schedule - popq %rdi - CFI_ADJUST_CFA_OFFSET -8 + popq_cfi %rdi GET_THREAD_INFO(%rcx) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF @@ -956,8 +939,7 @@ END(common_interrupt) .macro apicinterrupt num sym do_sym ENTRY(\sym) INTR_FRAME - pushq $~(\num) - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi $~(\num) interrupt \do_sym jmp ret_from_intr CFI_ENDPROC @@ -1023,9 +1005,9 @@ apicinterrupt ERROR_APIC_VECTOR \ apicinterrupt SPURIOUS_APIC_VECTOR \ spurious_interrupt smp_spurious_interrupt -#ifdef CONFIG_PERF_EVENTS -apicinterrupt LOCAL_PENDING_VECTOR \ - perf_pending_interrupt smp_perf_pending_interrupt +#ifdef CONFIG_IRQ_WORK +apicinterrupt IRQ_WORK_VECTOR \ + irq_work_interrupt smp_irq_work_interrupt #endif /* @@ -1036,8 +1018,8 @@ ENTRY(\sym) INTR_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ - subq $15*8,%rsp - CFI_ADJUST_CFA_OFFSET 15*8 + subq $ORIG_RAX-R15, %rsp + CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 call error_entry DEFAULT_FRAME 0 movq %rsp,%rdi /* pt_regs pointer */ @@ -1052,9 +1034,9 @@ END(\sym) ENTRY(\sym) INTR_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME - pushq $-1 /* ORIG_RAX: no syscall to restart */ - CFI_ADJUST_CFA_OFFSET 8 - subq $15*8, %rsp + pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ + subq $ORIG_RAX-R15, %rsp + CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 call save_paranoid TRACE_IRQS_OFF movq %rsp,%rdi /* pt_regs pointer */ @@ -1070,9 +1052,9 @@ END(\sym) ENTRY(\sym) INTR_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME - pushq $-1 /* ORIG_RAX: no syscall to restart */ - CFI_ADJUST_CFA_OFFSET 8 - subq $15*8, %rsp + pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ + subq $ORIG_RAX-R15, %rsp + CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 call save_paranoid TRACE_IRQS_OFF movq %rsp,%rdi /* pt_regs pointer */ @@ -1089,8 +1071,8 @@ END(\sym) ENTRY(\sym) XCPT_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME - subq $15*8,%rsp - CFI_ADJUST_CFA_OFFSET 15*8 + subq $ORIG_RAX-R15, %rsp + CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 call error_entry DEFAULT_FRAME 0 movq %rsp,%rdi /* pt_regs pointer */ @@ -1107,8 +1089,8 @@ END(\sym) ENTRY(\sym) XCPT_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME - subq $15*8,%rsp - CFI_ADJUST_CFA_OFFSET 15*8 + subq $ORIG_RAX-R15, %rsp + CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 call save_paranoid DEFAULT_FRAME 0 TRACE_IRQS_OFF @@ -1139,16 +1121,14 @@ zeroentry simd_coprocessor_error do_simd_coprocessor_error /* edi: new selector */ ENTRY(native_load_gs_index) CFI_STARTPROC - pushf - CFI_ADJUST_CFA_OFFSET 8 + pushfq_cfi DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI) SWAPGS gs_change: movl %edi,%gs 2: mfence /* workaround */ SWAPGS - popf - CFI_ADJUST_CFA_OFFSET -8 + popfq_cfi ret CFI_ENDPROC END(native_load_gs_index) @@ -1215,8 +1195,7 @@ END(kernel_execve) /* Call softirq on interrupt stack. Interrupts are off. */ ENTRY(call_softirq) CFI_STARTPROC - push %rbp - CFI_ADJUST_CFA_OFFSET 8 + pushq_cfi %rbp CFI_REL_OFFSET rbp,0 mov %rsp,%rbp CFI_DEF_CFA_REGISTER rbp @@ -1225,6 +1204,7 @@ ENTRY(call_softirq) push %rbp # backlink for old unwinder call __do_softirq leaveq + CFI_RESTORE rbp CFI_DEF_CFA_REGISTER rsp CFI_ADJUST_CFA_OFFSET -8 decl PER_CPU_VAR(irq_count) @@ -1368,7 +1348,7 @@ paranoidzeroentry machine_check *machine_check_vector(%rip) /* ebx: no swapgs flag */ ENTRY(paranoid_exit) - INTR_FRAME + DEFAULT_FRAME DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF testl %ebx,%ebx /* swapgs needed? */ @@ -1445,7 +1425,6 @@ error_swapgs: error_sti: TRACE_IRQS_OFF ret - CFI_ENDPROC /* * There are two places in the kernel that can potentially fault with @@ -1470,6 +1449,7 @@ bstep_iret: /* Fix truncated RIP */ movq %rcx,RIP+8(%rsp) jmp error_swapgs + CFI_ENDPROC END(error_entry) @@ -1498,8 +1478,8 @@ ENTRY(nmi) INTR_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME pushq_cfi $-1 - subq $15*8, %rsp - CFI_ADJUST_CFA_OFFSET 15*8 + subq $ORIG_RAX-R15, %rsp + CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 call save_paranoid DEFAULT_FRAME 0 /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index cd37469b54e..3afb33f14d2 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -257,14 +257,9 @@ do_ftrace_mod_code(unsigned long ip, void *new_code) return mod_code_status; } - - - -static unsigned char ftrace_nop[MCOUNT_INSN_SIZE]; - static unsigned char *ftrace_nop_replace(void) { - return ftrace_nop; + return ideal_nop5; } static int @@ -338,62 +333,6 @@ int ftrace_update_ftrace_func(ftrace_func_t func) int __init ftrace_dyn_arch_init(void *data) { - extern const unsigned char ftrace_test_p6nop[]; - extern const unsigned char ftrace_test_nop5[]; - extern const unsigned char ftrace_test_jmp[]; - int faulted = 0; - - /* - * There is no good nop for all x86 archs. - * We will default to using the P6_NOP5, but first we - * will test to make sure that the nop will actually - * work on this CPU. If it faults, we will then - * go to a lesser efficient 5 byte nop. If that fails - * we then just use a jmp as our nop. This isn't the most - * efficient nop, but we can not use a multi part nop - * since we would then risk being preempted in the middle - * of that nop, and if we enabled tracing then, it might - * cause a system crash. - * - * TODO: check the cpuid to determine the best nop. - */ - asm volatile ( - "ftrace_test_jmp:" - "jmp ftrace_test_p6nop\n" - "nop\n" - "nop\n" - "nop\n" /* 2 byte jmp + 3 bytes */ - "ftrace_test_p6nop:" - P6_NOP5 - "jmp 1f\n" - "ftrace_test_nop5:" - ".byte 0x66,0x66,0x66,0x66,0x90\n" - "1:" - ".section .fixup, \"ax\"\n" - "2: movl $1, %0\n" - " jmp ftrace_test_nop5\n" - "3: movl $2, %0\n" - " jmp 1b\n" - ".previous\n" - _ASM_EXTABLE(ftrace_test_p6nop, 2b) - _ASM_EXTABLE(ftrace_test_nop5, 3b) - : "=r"(faulted) : "0" (faulted)); - - switch (faulted) { - case 0: - pr_info("converting mcount calls to 0f 1f 44 00 00\n"); - memcpy(ftrace_nop, ftrace_test_p6nop, MCOUNT_INSN_SIZE); - break; - case 1: - pr_info("converting mcount calls to 66 66 66 66 90\n"); - memcpy(ftrace_nop, ftrace_test_nop5, MCOUNT_INSN_SIZE); - break; - case 2: - pr_info("converting mcount calls to jmp . + 5\n"); - memcpy(ftrace_nop, ftrace_test_jmp, MCOUNT_INSN_SIZE); - break; - } - /* The return code is retured via data */ *(unsigned long *)data = 0; diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 351f9c0fea1..7494999141b 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -35,7 +35,6 @@ unsigned long hpet_address; u8 hpet_blockid; /* OS timer block num */ u8 hpet_msi_disable; -u8 hpet_readback_cmp; #ifdef CONFIG_PCI_MSI static unsigned long hpet_num_timers; @@ -395,23 +394,27 @@ static int hpet_next_event(unsigned long delta, * at that point and we would wait for the next hpet interrupt * forever. We found out that reading the CMP register back * forces the transfer so we can rely on the comparison with - * the counter register below. + * the counter register below. If the read back from the + * compare register does not match the value we programmed + * then we might have a real hardware problem. We can not do + * much about it here, but at least alert the user/admin with + * a prominent warning. * - * That works fine on those ATI chipsets, but on newer Intel - * chipsets (ICH9...) this triggers due to an erratum: Reading - * the comparator immediately following a write is returning - * the old value. + * An erratum on some chipsets (ICH9,..), results in + * comparator read immediately following a write returning old + * value. Workaround for this is to read this value second + * time, when first read returns old value. * - * We restrict the read back to the affected ATI chipsets (set - * by quirks) and also run it with hpet=verbose for debugging - * purposes. + * In fact the write to the comparator register is delayed up + * to two HPET cycles so the workaround we tried to restrict + * the readback to those known to be borked ATI chipsets + * failed miserably. So we give up on optimizations forever + * and penalize all HPET incarnations unconditionally. */ - if (hpet_readback_cmp || hpet_verbose) { - u32 cmp = hpet_readl(HPET_Tn_CMP(timer)); - - if (cmp != cnt) + if (unlikely((u32)hpet_readl(HPET_Tn_CMP(timer)) != cnt)) { + if (hpet_readl(HPET_Tn_CMP(timer)) != cnt) printk_once(KERN_WARNING - "hpet: compare register read back failed.\n"); + "hpet: compare register read back failed.\n"); } return (s32)(hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0; @@ -503,7 +506,7 @@ static int hpet_assign_irq(struct hpet_dev *dev) { unsigned int irq; - irq = create_irq(); + irq = create_irq_nr(0, -1); if (!irq) return -EINVAL; diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index a474ec37c32..ff15c9dcc25 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -206,11 +206,27 @@ int arch_check_bp_in_kernelspace(struct perf_event *bp) int arch_bp_generic_fields(int x86_len, int x86_type, int *gen_len, int *gen_type) { - /* Len */ - switch (x86_len) { - case X86_BREAKPOINT_LEN_X: + /* Type */ + switch (x86_type) { + case X86_BREAKPOINT_EXECUTE: + if (x86_len != X86_BREAKPOINT_LEN_X) + return -EINVAL; + + *gen_type = HW_BREAKPOINT_X; *gen_len = sizeof(long); + return 0; + case X86_BREAKPOINT_WRITE: + *gen_type = HW_BREAKPOINT_W; break; + case X86_BREAKPOINT_RW: + *gen_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R; + break; + default: + return -EINVAL; + } + + /* Len */ + switch (x86_len) { case X86_BREAKPOINT_LEN_1: *gen_len = HW_BREAKPOINT_LEN_1; break; @@ -229,21 +245,6 @@ int arch_bp_generic_fields(int x86_len, int x86_type, return -EINVAL; } - /* Type */ - switch (x86_type) { - case X86_BREAKPOINT_EXECUTE: - *gen_type = HW_BREAKPOINT_X; - break; - case X86_BREAKPOINT_WRITE: - *gen_type = HW_BREAKPOINT_W; - break; - case X86_BREAKPOINT_RW: - *gen_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R; - break; - default: - return -EINVAL; - } - return 0; } @@ -316,9 +317,6 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp) ret = -EINVAL; switch (info->len) { - case X86_BREAKPOINT_LEN_X: - align = sizeof(long) -1; - break; case X86_BREAKPOINT_LEN_1: align = 0; break; diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index a46cb3522c0..58bb239a2fd 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -68,19 +68,22 @@ static void __cpuinit init_thread_xstate(void) */ if (!HAVE_HWFP) { + /* + * Disable xsave as we do not support it if i387 + * emulation is enabled. + */ + setup_clear_cpu_cap(X86_FEATURE_XSAVE); + setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); xstate_size = sizeof(struct i387_soft_struct); return; } if (cpu_has_fxsr) xstate_size = sizeof(struct i387_fxsave_struct); -#ifdef CONFIG_X86_32 else xstate_size = sizeof(struct i387_fsave_struct); -#endif } -#ifdef CONFIG_X86_64 /* * Called at bootup to set up the initial FPU state that is later cloned * into all processes. @@ -88,12 +91,21 @@ static void __cpuinit init_thread_xstate(void) void __cpuinit fpu_init(void) { - unsigned long oldcr0 = read_cr0(); - - set_in_cr4(X86_CR4_OSFXSR); - set_in_cr4(X86_CR4_OSXMMEXCPT); + unsigned long cr0; + unsigned long cr4_mask = 0; - write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */ + if (cpu_has_fxsr) + cr4_mask |= X86_CR4_OSFXSR; + if (cpu_has_xmm) + cr4_mask |= X86_CR4_OSXMMEXCPT; + if (cr4_mask) + set_in_cr4(cr4_mask); + + cr0 = read_cr0(); + cr0 &= ~(X86_CR0_TS|X86_CR0_EM); /* clear TS and EM */ + if (!HAVE_HWFP) + cr0 |= X86_CR0_EM; + write_cr0(cr0); if (!smp_processor_id()) init_thread_xstate(); @@ -104,24 +116,12 @@ void __cpuinit fpu_init(void) clear_used_math(); } -#else /* CONFIG_X86_64 */ - -void __cpuinit fpu_init(void) -{ - if (!smp_processor_id()) - init_thread_xstate(); -} - -#endif /* CONFIG_X86_32 */ - void fpu_finit(struct fpu *fpu) { -#ifdef CONFIG_X86_32 if (!HAVE_HWFP) { finit_soft_fpu(&fpu->state->soft); return; } -#endif if (cpu_has_fxsr) { struct i387_fxsave_struct *fx = &fpu->state->fxsave; @@ -386,19 +386,17 @@ convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk) #ifdef CONFIG_X86_64 env->fip = fxsave->rip; env->foo = fxsave->rdp; + /* + * should be actually ds/cs at fpu exception time, but + * that information is not available in 64bit mode. + */ + env->fcs = task_pt_regs(tsk)->cs; if (tsk == current) { - /* - * should be actually ds/cs at fpu exception time, but - * that information is not available in 64bit mode. - */ - asm("mov %%ds, %[fos]" : [fos] "=r" (env->fos)); - asm("mov %%cs, %[fcs]" : [fcs] "=r" (env->fcs)); + savesegment(ds, env->fos); } else { - struct pt_regs *regs = task_pt_regs(tsk); - - env->fos = 0xffff0000 | tsk->thread.ds; - env->fcs = regs->cs; + env->fos = tsk->thread.ds; } + env->fos |= 0xffff0000; #else env->fip = fxsave->fip; env->fcs = (u16) fxsave->fcs | ((u32) fxsave->fop << 16); diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 91fd0c70a18..44edb03fc9e 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -67,10 +67,10 @@ static int show_other_interrupts(struct seq_file *p, int prec) for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); seq_printf(p, " Performance monitoring interrupts\n"); - seq_printf(p, "%*s: ", prec, "PND"); + seq_printf(p, "%*s: ", prec, "IWI"); for_each_online_cpu(j) - seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs); - seq_printf(p, " Performance pending work\n"); + seq_printf(p, "%10u ", irq_stats(j)->apic_irq_work_irqs); + seq_printf(p, " IRQ work interrupts\n"); #endif if (x86_platform_ipi_callback) { seq_printf(p, "%*s: ", prec, "PLT"); @@ -185,7 +185,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu) sum += irq_stats(cpu)->apic_timer_irqs; sum += irq_stats(cpu)->irq_spurious_count; sum += irq_stats(cpu)->apic_perf_irqs; - sum += irq_stats(cpu)->apic_pending_irqs; + sum += irq_stats(cpu)->apic_irq_work_irqs; #endif if (x86_platform_ipi_callback) sum += irq_stats(cpu)->x86_platform_ipis; diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c new file mode 100644 index 00000000000..ca8f703a1e7 --- /dev/null +++ b/arch/x86/kernel/irq_work.c @@ -0,0 +1,30 @@ +/* + * x86 specific code for irq_work + * + * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + */ + +#include <linux/kernel.h> +#include <linux/irq_work.h> +#include <linux/hardirq.h> +#include <asm/apic.h> + +void smp_irq_work_interrupt(struct pt_regs *regs) +{ + irq_enter(); + ack_APIC_irq(); + inc_irq_stat(apic_irq_work_irqs); + irq_work_run(); + irq_exit(); +} + +void arch_irq_work_raise(void) +{ +#ifdef CONFIG_X86_LOCAL_APIC + if (!cpu_has_apic) + return; + + apic->send_IPI_self(IRQ_WORK_VECTOR); + apic_wait_icr_idle(); +#endif +} diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 990ae7cfc57..713969b9266 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -224,9 +224,9 @@ static void __init apic_intr_init(void) alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); - /* Performance monitoring interrupts: */ -# ifdef CONFIG_PERF_EVENTS - alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt); + /* IRQ work interrupts: */ +# ifdef CONFIG_IRQ_WORK + alloc_intr_gate(IRQ_WORK_VECTOR, irq_work_interrupt); # endif #endif diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c new file mode 100644 index 00000000000..961b6b30ba9 --- /dev/null +++ b/arch/x86/kernel/jump_label.c @@ -0,0 +1,50 @@ +/* + * jump label x86 support + * + * Copyright (C) 2009 Jason Baron <jbaron@redhat.com> + * + */ +#include <linux/jump_label.h> +#include <linux/memory.h> +#include <linux/uaccess.h> +#include <linux/module.h> +#include <linux/list.h> +#include <linux/jhash.h> +#include <linux/cpu.h> +#include <asm/kprobes.h> +#include <asm/alternative.h> + +#ifdef HAVE_JUMP_LABEL + +union jump_code_union { + char code[JUMP_LABEL_NOP_SIZE]; + struct { + char jump; + int offset; + } __attribute__((packed)); +}; + +void arch_jump_label_transform(struct jump_entry *entry, + enum jump_label_type type) +{ + union jump_code_union code; + + if (type == JUMP_LABEL_ENABLE) { + code.jump = 0xe9; + code.offset = entry->target - + (entry->code + JUMP_LABEL_NOP_SIZE); + } else + memcpy(&code, ideal_nop5, JUMP_LABEL_NOP_SIZE); + get_online_cpus(); + mutex_lock(&text_mutex); + text_poke_smp((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE); + mutex_unlock(&text_mutex); + put_online_cpus(); +} + +void arch_jump_label_text_poke_early(jump_label_t addr) +{ + text_poke_early((void *)addr, ideal_nop5, JUMP_LABEL_NOP_SIZE); +} + +#endif diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 770ebfb349e..1cbd54c0df9 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -230,9 +230,6 @@ static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr) return 0; } -/* Dummy buffers for kallsyms_lookup */ -static char __dummy_buf[KSYM_NAME_LEN]; - /* Check if paddr is at an instruction boundary */ static int __kprobes can_probe(unsigned long paddr) { @@ -241,7 +238,7 @@ static int __kprobes can_probe(unsigned long paddr) struct insn insn; kprobe_opcode_t buf[MAX_INSN_SIZE]; - if (!kallsyms_lookup(paddr, NULL, &offset, NULL, __dummy_buf)) + if (!kallsyms_lookup_size_offset(paddr, NULL, &offset)) return 0; /* Decode instructions */ @@ -1129,7 +1126,7 @@ static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr, *(unsigned long *)addr = val; } -void __kprobes kprobes_optinsn_template_holder(void) +static void __used __kprobes kprobes_optinsn_template_holder(void) { asm volatile ( ".global optprobe_template_entry\n" @@ -1221,7 +1218,8 @@ static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src) } /* Check whether the address range is reserved */ if (ftrace_text_reserved(src, src + len - 1) || - alternatives_text_reserved(src, src + len - 1)) + alternatives_text_reserved(src, src + len - 1) || + jump_label_text_reserved(src, src + len - 1)) return -EBUSY; return len; @@ -1269,11 +1267,9 @@ static int __kprobes can_optimize(unsigned long paddr) unsigned long addr, size = 0, offset = 0; struct insn insn; kprobe_opcode_t buf[MAX_INSN_SIZE]; - /* Dummy buffers for lookup_symbol_attrs */ - static char __dummy_buf[KSYM_NAME_LEN]; /* Lookup symbol including addr */ - if (!kallsyms_lookup(paddr, &size, &offset, NULL, __dummy_buf)) + if (!kallsyms_lookup_size_offset(paddr, &size, &offset)) return 0; /* Check there is enough space for a relative jump. */ diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 035c8c52918..b3ea9db39db 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -36,7 +36,7 @@ static int init_one_level2_page(struct kimage *image, pgd_t *pgd, if (!page) goto out; pud = (pud_t *)page_address(page); - memset(pud, 0, PAGE_SIZE); + clear_page(pud); set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); } pud = pud_offset(pgd, addr); @@ -45,7 +45,7 @@ static int init_one_level2_page(struct kimage *image, pgd_t *pgd, if (!page) goto out; pmd = (pmd_t *)page_address(page); - memset(pmd, 0, PAGE_SIZE); + clear_page(pmd); set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); } pmd = pmd_offset(pud, addr); diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index e0bc186d750..8f295609173 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -239,11 +239,13 @@ int module_finalize(const Elf_Ehdr *hdr, apply_paravirt(pseg, pseg + para->sh_size); } - return module_bug_finalize(hdr, sechdrs, me); + /* make jump label nops */ + jump_label_apply_nops(me); + + return 0; } void module_arch_cleanup(struct module *mod) { alternatives_smp_module_del(mod); - module_bug_cleanup(mod); } diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 0f7f130caa6..c562207b1b3 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -39,7 +39,7 @@ #include <asm/cacheflush.h> #include <asm/swiotlb.h> #include <asm/dma.h> -#include <asm/k8.h> +#include <asm/amd_nb.h> #include <asm/x86_init.h> static unsigned long iommu_bus_base; /* GART remapping area (physical) */ @@ -560,8 +560,11 @@ static void enable_gart_translations(void) { int i; - for (i = 0; i < num_k8_northbridges; i++) { - struct pci_dev *dev = k8_northbridges[i]; + if (!k8_northbridges.gart_supported) + return; + + for (i = 0; i < k8_northbridges.num; i++) { + struct pci_dev *dev = k8_northbridges.nb_misc[i]; enable_gart_translation(dev, __pa(agp_gatt_table)); } @@ -592,16 +595,19 @@ static void gart_fixup_northbridges(struct sys_device *dev) if (!fix_up_north_bridges) return; + if (!k8_northbridges.gart_supported) + return; + pr_info("PCI-DMA: Restoring GART aperture settings\n"); - for (i = 0; i < num_k8_northbridges; i++) { - struct pci_dev *dev = k8_northbridges[i]; + for (i = 0; i < k8_northbridges.num; i++) { + struct pci_dev *dev = k8_northbridges.nb_misc[i]; /* * Don't enable translations just yet. That is the next * step. Restore the pre-suspend aperture settings. */ - pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, aperture_order << 1); + gart_set_size_and_enable(dev, aperture_order); pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE, aperture_alloc >> 25); } } @@ -649,8 +655,8 @@ static __init int init_k8_gatt(struct agp_kern_info *info) aper_size = aper_base = info->aper_size = 0; dev = NULL; - for (i = 0; i < num_k8_northbridges; i++) { - dev = k8_northbridges[i]; + for (i = 0; i < k8_northbridges.num; i++) { + dev = k8_northbridges.nb_misc[i]; new_aper_base = read_aperture(dev, &new_aper_size); if (!new_aper_base) goto nommu; @@ -718,10 +724,13 @@ static void gart_iommu_shutdown(void) if (!no_agp) return; - for (i = 0; i < num_k8_northbridges; i++) { + if (!k8_northbridges.gart_supported) + return; + + for (i = 0; i < k8_northbridges.num; i++) { u32 ctl; - dev = k8_northbridges[i]; + dev = k8_northbridges.nb_misc[i]; pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl); ctl &= ~GARTEN; @@ -739,7 +748,7 @@ int __init gart_iommu_init(void) unsigned long scratch; long i; - if (num_k8_northbridges == 0) + if (!k8_northbridges.gart_supported) return 0; #ifndef CONFIG_AGP_AMD64 diff --git a/arch/x86/kernel/pmtimer_64.c b/arch/x86/kernel/pmtimer_64.c deleted file mode 100644 index b112406f199..00000000000 --- a/arch/x86/kernel/pmtimer_64.c +++ /dev/null @@ -1,69 +0,0 @@ -/* Ported over from i386 by AK, original copyright was: - * - * (C) Dominik Brodowski <linux@brodo.de> 2003 - * - * Driver to use the Power Management Timer (PMTMR) available in some - * southbridges as primary timing source for the Linux kernel. - * - * Based on parts of linux/drivers/acpi/hardware/hwtimer.c, timer_pit.c, - * timer_hpet.c, and on Arjan van de Ven's implementation for 2.4. - * - * This file is licensed under the GPL v2. - * - * Dropped all the hardware bug workarounds for now. Hopefully they - * are not needed on 64bit chipsets. - */ - -#include <linux/jiffies.h> -#include <linux/kernel.h> -#include <linux/time.h> -#include <linux/init.h> -#include <linux/cpumask.h> -#include <linux/acpi_pmtmr.h> - -#include <asm/io.h> -#include <asm/proto.h> -#include <asm/msr.h> -#include <asm/vsyscall.h> - -static inline u32 cyc2us(u32 cycles) -{ - /* The Power Management Timer ticks at 3.579545 ticks per microsecond. - * 1 / PM_TIMER_FREQUENCY == 0.27936511 =~ 286/1024 [error: 0.024%] - * - * Even with HZ = 100, delta is at maximum 35796 ticks, so it can - * easily be multiplied with 286 (=0x11E) without having to fear - * u32 overflows. - */ - cycles *= 286; - return (cycles >> 10); -} - -static unsigned pmtimer_wait_tick(void) -{ - u32 a, b; - for (a = b = inl(pmtmr_ioport) & ACPI_PM_MASK; - a == b; - b = inl(pmtmr_ioport) & ACPI_PM_MASK) - cpu_relax(); - return b; -} - -/* note: wait time is rounded up to one tick */ -void pmtimer_wait(unsigned us) -{ - u32 a, b; - a = pmtimer_wait_tick(); - do { - b = inl(pmtmr_ioport); - cpu_relax(); - } while (cyc2us(b - a) < us); -} - -static int __init nopmtimer_setup(char *s) -{ - pmtmr_ioport = 0; - return 1; -} - -__setup("nopmtimer", nopmtimer_setup); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 3d9ea531ddd..b3d7a3a04f3 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -424,7 +424,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) load_TLS(next, cpu); /* Must be after DS reload */ - unlazy_fpu(prev_p); + __unlazy_fpu(prev_p); /* Make sure cpu is ready for new context */ if (preload_fpu) diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index e3af342fe83..7a4cf14223b 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -84,7 +84,7 @@ static int __init reboot_setup(char *str) } /* we will leave sorting out the final value when we are ready to reboot, since we might not - have set up boot_cpu_id or smp_num_cpu */ + have detected BSP APIC ID or smp_num_cpu */ break; #endif /* CONFIG_SMP */ diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index c3a4fbb2b99..9e5cdaf73b4 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -107,11 +107,12 @@ #include <asm/percpu.h> #include <asm/topology.h> #include <asm/apicdef.h> -#include <asm/k8.h> +#include <asm/amd_nb.h> #ifdef CONFIG_X86_64 #include <asm/numa_64.h> #endif #include <asm/mce.h> +#include <asm/alternative.h> /* * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. @@ -125,7 +126,6 @@ unsigned long max_pfn_mapped; RESERVE_BRK(dmi_alloc, 65536); #endif -unsigned int boot_cpu_id __read_mostly; static __initdata unsigned long _brk_start = (unsigned long)__brk_base; unsigned long _brk_end = (unsigned long)__brk_base; @@ -618,79 +618,7 @@ static __init void reserve_ibft_region(void) reserve_early_overlap_ok(addr, addr + size, "ibft"); } -#ifdef CONFIG_X86_RESERVE_LOW_64K -static int __init dmi_low_memory_corruption(const struct dmi_system_id *d) -{ - printk(KERN_NOTICE - "%s detected: BIOS may corrupt low RAM, working around it.\n", - d->ident); - - e820_update_range(0, 0x10000, E820_RAM, E820_RESERVED); - sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); - - return 0; -} -#endif - -/* List of systems that have known low memory corruption BIOS problems */ -static struct dmi_system_id __initdata bad_bios_dmi_table[] = { -#ifdef CONFIG_X86_RESERVE_LOW_64K - { - .callback = dmi_low_memory_corruption, - .ident = "AMI BIOS", - .matches = { - DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."), - }, - }, - { - .callback = dmi_low_memory_corruption, - .ident = "Phoenix BIOS", - .matches = { - DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies"), - }, - }, - { - .callback = dmi_low_memory_corruption, - .ident = "Phoenix/MSC BIOS", - .matches = { - DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix/MSC"), - }, - }, - /* - * AMI BIOS with low memory corruption was found on Intel DG45ID and - * DG45FC boards. - * It has a different DMI_BIOS_VENDOR = "Intel Corp.", for now we will - * match only DMI_BOARD_NAME and see if there is more bad products - * with this vendor. - */ - { - .callback = dmi_low_memory_corruption, - .ident = "AMI BIOS", - .matches = { - DMI_MATCH(DMI_BOARD_NAME, "DG45ID"), - }, - }, - { - .callback = dmi_low_memory_corruption, - .ident = "AMI BIOS", - .matches = { - DMI_MATCH(DMI_BOARD_NAME, "DG45FC"), - }, - }, - /* - * The Dell Inspiron Mini 1012 has DMI_BIOS_VENDOR = "Dell Inc.", so - * match on the product name. - */ - { - .callback = dmi_low_memory_corruption, - .ident = "Phoenix BIOS", - .matches = { - DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 1012"), - }, - }, -#endif - {} -}; +static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10; static void __init trim_bios_range(void) { @@ -698,8 +626,14 @@ static void __init trim_bios_range(void) * A special case is the first 4Kb of memory; * This is a BIOS owned area, not kernel ram, but generally * not listed as such in the E820 table. + * + * This typically reserves additional memory (64KiB by default) + * since some BIOSes are known to corrupt low memory. See the + * Kconfig help text for X86_RESERVE_LOW. */ - e820_update_range(0, PAGE_SIZE, E820_RAM, E820_RESERVED); + e820_update_range(0, ALIGN(reserve_low, PAGE_SIZE), + E820_RAM, E820_RESERVED); + /* * special case: Some BIOSen report the PC BIOS * area (640->1Mb) as ram even though it is not. @@ -709,6 +643,28 @@ static void __init trim_bios_range(void) sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); } +static int __init parse_reservelow(char *p) +{ + unsigned long long size; + + if (!p) + return -EINVAL; + + size = memparse(p, &p); + + if (size < 4096) + size = 4096; + + if (size > 640*1024) + size = 640*1024; + + reserve_low = size; + + return 0; +} + +early_param("reservelow", parse_reservelow); + /* * Determine if we were loaded by an EFI loader. If so, then we have also been * passed the efi memmap, systab, etc., so we should use these data structures @@ -726,6 +682,7 @@ void __init setup_arch(char **cmdline_p) { int acpi = 0; int k8 = 0; + unsigned long flags; #ifdef CONFIG_X86_32 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); @@ -863,8 +820,6 @@ void __init setup_arch(char **cmdline_p) dmi_scan_machine(); - dmi_check_system(bad_bios_dmi_table); - /* * VMware detection requires dmi to be available, so this * needs to be done after dmi_scan_machine, for the BP. @@ -1071,6 +1026,10 @@ void __init setup_arch(char **cmdline_p) x86_init.oem.banner(); mcheck_init(); + + local_irq_save(flags); + arch_init_ideal_nop5(); + local_irq_restore(flags); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index a60df9ae645..2335c15c93a 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -253,7 +253,7 @@ void __init setup_per_cpu_areas(void) * Up to this point, the boot CPU has been using .init.data * area. Reload any changed state for the boot CPU. */ - if (cpu == boot_cpu_id) + if (!cpu) switch_to_new_gdt(cpu); } diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index fdccfe9dc63..0116552c950 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -398,6 +398,19 @@ void __cpuinit smp_store_cpu_info(int id) identify_secondary_cpu(c); } +static void __cpuinit link_thread_siblings(int cpu1, int cpu2) +{ + struct cpuinfo_x86 *c1 = &cpu_data(cpu1); + struct cpuinfo_x86 *c2 = &cpu_data(cpu2); + + cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2)); + cpumask_set_cpu(cpu2, cpu_sibling_mask(cpu1)); + cpumask_set_cpu(cpu1, cpu_core_mask(cpu2)); + cpumask_set_cpu(cpu2, cpu_core_mask(cpu1)); + cpumask_set_cpu(cpu1, c2->llc_shared_map); + cpumask_set_cpu(cpu2, c1->llc_shared_map); +} + void __cpuinit set_cpu_sibling_map(int cpu) { @@ -410,14 +423,13 @@ void __cpuinit set_cpu_sibling_map(int cpu) for_each_cpu(i, cpu_sibling_setup_mask) { struct cpuinfo_x86 *o = &cpu_data(i); - if (c->phys_proc_id == o->phys_proc_id && - c->cpu_core_id == o->cpu_core_id) { - cpumask_set_cpu(i, cpu_sibling_mask(cpu)); - cpumask_set_cpu(cpu, cpu_sibling_mask(i)); - cpumask_set_cpu(i, cpu_core_mask(cpu)); - cpumask_set_cpu(cpu, cpu_core_mask(i)); - cpumask_set_cpu(i, c->llc_shared_map); - cpumask_set_cpu(cpu, o->llc_shared_map); + if (cpu_has(c, X86_FEATURE_TOPOEXT)) { + if (c->phys_proc_id == o->phys_proc_id && + c->compute_unit_id == o->compute_unit_id) + link_thread_siblings(cpu, i); + } else if (c->phys_proc_id == o->phys_proc_id && + c->cpu_core_id == o->cpu_core_id) { + link_thread_siblings(cpu, i); } } } else { diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c index d5e06624e34..0b0cb5fede1 100644 --- a/arch/x86/kernel/sys_i386_32.c +++ b/arch/x86/kernel/sys_i386_32.c @@ -33,8 +33,8 @@ int kernel_execve(const char *filename, const char *const envp[]) { long __res; - asm volatile ("push %%ebx ; movl %2,%%ebx ; int $0x80 ; pop %%ebx" + asm volatile ("int $0x80" : "=a" (__res) - : "0" (__NR_execve), "ri" (filename), "c" (argv), "d" (envp) : "memory"); + : "0" (__NR_execve), "b" (filename), "c" (argv), "d" (envp) : "memory"); return __res; } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 60788dee0f8..d43968503dd 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -776,21 +776,10 @@ asmlinkage void math_state_restore(void) } EXPORT_SYMBOL_GPL(math_state_restore); -#ifndef CONFIG_MATH_EMULATION -void math_emulate(struct math_emu_info *info) -{ - printk(KERN_EMERG - "math-emulation not enabled and no coprocessor found.\n"); - printk(KERN_EMERG "killing %s.\n", current->comm); - force_sig(SIGFPE, current); - schedule(); -} -#endif /* CONFIG_MATH_EMULATION */ - dotraplinkage void __kprobes do_device_not_available(struct pt_regs *regs, long error_code) { -#ifdef CONFIG_X86_32 +#ifdef CONFIG_MATH_EMULATION if (read_cr0() & X86_CR0_EM) { struct math_emu_info info = { }; @@ -798,12 +787,12 @@ do_device_not_available(struct pt_regs *regs, long error_code) info.regs = regs; math_emulate(&info); - } else { - math_state_restore(); /* interrupts still off */ - conditional_sti(regs); + return; } -#else - math_state_restore(); +#endif + math_state_restore(); /* interrupts still off */ +#ifdef CONFIG_X86_32 + conditional_sti(regs); #endif } @@ -881,18 +870,6 @@ void __init trap_init(void) #endif #ifdef CONFIG_X86_32 - if (cpu_has_fxsr) { - printk(KERN_INFO "Enabling fast FPU save and restore... "); - set_in_cr4(X86_CR4_OSFXSR); - printk("done.\n"); - } - if (cpu_has_xmm) { - printk(KERN_INFO - "Enabling unmasked SIMD FPU exception support... "); - set_in_cr4(X86_CR4_OSXMMEXCPT); - printk("done.\n"); - } - set_system_trap_gate(SYSCALL_VECTOR, &system_call); set_bit(SYSCALL_VECTOR, used_vectors); #endif diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 26a863a9c2a..0c40d8b7241 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -104,10 +104,14 @@ int __init notsc_setup(char *str) __setup("notsc", notsc_setup); +static int no_sched_irq_time; + static int __init tsc_setup(char *str) { if (!strcmp(str, "reliable")) tsc_clocksource_reliable = 1; + if (!strncmp(str, "noirqtime", 9)) + no_sched_irq_time = 1; return 1; } @@ -801,6 +805,7 @@ void mark_tsc_unstable(char *reason) if (!tsc_unstable) { tsc_unstable = 1; sched_clock_stable = 0; + disable_sched_clock_irqtime(); printk(KERN_INFO "Marking TSC unstable due to %s\n", reason); /* Change only the rating, when not registered */ if (clocksource_tsc.mult) @@ -892,60 +897,6 @@ static void __init init_tsc_clocksource(void) clocksource_register_khz(&clocksource_tsc, tsc_khz); } -#ifdef CONFIG_X86_64 -/* - * calibrate_cpu is used on systems with fixed rate TSCs to determine - * processor frequency - */ -#define TICK_COUNT 100000000 -static unsigned long __init calibrate_cpu(void) -{ - int tsc_start, tsc_now; - int i, no_ctr_free; - unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0; - unsigned long flags; - - for (i = 0; i < 4; i++) - if (avail_to_resrv_perfctr_nmi_bit(i)) - break; - no_ctr_free = (i == 4); - if (no_ctr_free) { - WARN(1, KERN_WARNING "Warning: AMD perfctrs busy ... " - "cpu_khz value may be incorrect.\n"); - i = 3; - rdmsrl(MSR_K7_EVNTSEL3, evntsel3); - wrmsrl(MSR_K7_EVNTSEL3, 0); - rdmsrl(MSR_K7_PERFCTR3, pmc3); - } else { - reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i); - reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i); - } - local_irq_save(flags); - /* start measuring cycles, incrementing from 0 */ - wrmsrl(MSR_K7_PERFCTR0 + i, 0); - wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76); - rdtscl(tsc_start); - do { - rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now); - tsc_now = get_cycles(); - } while ((tsc_now - tsc_start) < TICK_COUNT); - - local_irq_restore(flags); - if (no_ctr_free) { - wrmsrl(MSR_K7_EVNTSEL3, 0); - wrmsrl(MSR_K7_PERFCTR3, pmc3); - wrmsrl(MSR_K7_EVNTSEL3, evntsel3); - } else { - release_perfctr_nmi(MSR_K7_PERFCTR0 + i); - release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); - } - - return pmc_now * tsc_khz / (tsc_now - tsc_start); -} -#else -static inline unsigned long calibrate_cpu(void) { return cpu_khz; } -#endif - void __init tsc_init(void) { u64 lpj; @@ -964,10 +915,6 @@ void __init tsc_init(void) return; } - if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) && - (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)) - cpu_khz = calibrate_cpu(); - printk("Detected %lu.%03lu MHz processor.\n", (unsigned long)cpu_khz / 1000, (unsigned long)cpu_khz % 1000); @@ -987,6 +934,9 @@ void __init tsc_init(void) /* now allow native_sched_clock() to use rdtsc */ tsc_disabled = 0; + if (!no_sched_irq_time) + enable_sched_clock_irqtime(); + lpj = ((u64)tsc_khz * 1000); do_div(lpj, HZ); lpj_fine = lpj; diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 77d8c0f4817..22b06f7660f 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1056,14 +1056,13 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) vcpu->arch.apic = apic; - apic->regs_page = alloc_page(GFP_KERNEL); + apic->regs_page = alloc_page(GFP_KERNEL|__GFP_ZERO); if (apic->regs_page == NULL) { printk(KERN_ERR "malloc apic regs error for vcpu %x\n", vcpu->vcpu_id); goto nomem_free_apic; } apic->regs = page_address(apic->regs_page); - memset(apic->regs, 0, PAGE_SIZE); apic->vcpu = vcpu; hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index bc5b9b8d4a3..8a3f9f64f86 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -766,7 +766,6 @@ static void init_vmcb(struct vcpu_svm *svm) control->iopm_base_pa = iopm_base; control->msrpm_base_pa = __pa(svm->msrpm); - control->tsc_offset = 0; control->int_ctl = V_INTR_MASKING_MASK; init_seg(&save->es); @@ -902,6 +901,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; svm->asid_generation = 0; init_vmcb(svm); + svm->vmcb->control.tsc_offset = 0-native_read_tsc(); err = fx_init(&svm->vcpu); if (err) @@ -3163,8 +3163,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) sync_lapic_to_cr8(vcpu); save_host_msrs(vcpu); - fs_selector = kvm_read_fs(); - gs_selector = kvm_read_gs(); + savesegment(fs, fs_selector); + savesegment(gs, gs_selector); ldt_selector = kvm_read_ldt(); svm->vmcb->save.cr2 = vcpu->arch.cr2; /* required for live migration with NPT */ @@ -3251,10 +3251,15 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; - kvm_load_fs(fs_selector); - kvm_load_gs(gs_selector); - kvm_load_ldt(ldt_selector); load_host_msrs(vcpu); + loadsegment(fs, fs_selector); +#ifdef CONFIG_X86_64 + load_gs_index(gs_selector); + wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs); +#else + loadsegment(gs, gs_selector); +#endif + kvm_load_ldt(ldt_selector); reload_tss(vcpu); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 49b25eee25a..7bddfab1201 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -803,7 +803,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) */ vmx->host_state.ldt_sel = kvm_read_ldt(); vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel; - vmx->host_state.fs_sel = kvm_read_fs(); + savesegment(fs, vmx->host_state.fs_sel); if (!(vmx->host_state.fs_sel & 7)) { vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel); vmx->host_state.fs_reload_needed = 0; @@ -811,7 +811,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) vmcs_write16(HOST_FS_SELECTOR, 0); vmx->host_state.fs_reload_needed = 1; } - vmx->host_state.gs_sel = kvm_read_gs(); + savesegment(gs, vmx->host_state.gs_sel); if (!(vmx->host_state.gs_sel & 7)) vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel); else { @@ -841,27 +841,21 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) static void __vmx_load_host_state(struct vcpu_vmx *vmx) { - unsigned long flags; - if (!vmx->host_state.loaded) return; ++vmx->vcpu.stat.host_state_reload; vmx->host_state.loaded = 0; if (vmx->host_state.fs_reload_needed) - kvm_load_fs(vmx->host_state.fs_sel); + loadsegment(fs, vmx->host_state.fs_sel); if (vmx->host_state.gs_ldt_reload_needed) { kvm_load_ldt(vmx->host_state.ldt_sel); - /* - * If we have to reload gs, we must take care to - * preserve our gs base. - */ - local_irq_save(flags); - kvm_load_gs(vmx->host_state.gs_sel); #ifdef CONFIG_X86_64 - wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE)); + load_gs_index(vmx->host_state.gs_sel); + wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs); +#else + loadsegment(gs, vmx->host_state.gs_sel); #endif - local_irq_restore(flags); } reload_tss(); #ifdef CONFIG_X86_64 @@ -2589,8 +2583,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ - vmcs_write16(HOST_FS_SELECTOR, kvm_read_fs()); /* 22.2.4 */ - vmcs_write16(HOST_GS_SELECTOR, kvm_read_gs()); /* 22.2.4 */ + vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */ + vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */ vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ #ifdef CONFIG_X86_64 rdmsrl(MSR_FS_BASE, a); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3a09c625d52..6c2ecf0a806 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1991,13 +1991,14 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ | 0 /* Reserved, DCA */ | F(XMM4_1) | F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | - 0 /* Reserved, AES */ | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX); + 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) | + F(F16C); /* cpuid 0x80000001.ecx */ const u32 kvm_supported_word6_x86_features = F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ | F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | - F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) | - 0 /* SKINIT */ | 0 /* WDT */; + F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) | + 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM); /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 9257510b483..9d5f5584845 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -324,9 +324,8 @@ static void lguest_load_gdt(const struct desc_ptr *desc) } /* - * For a single GDT entry which changes, we do the lazy thing: alter our GDT, - * then tell the Host to reload the entire thing. This operation is so rare - * that this naive implementation is reasonable. + * For a single GDT entry which changes, we simply change our copy and + * then tell the host about it. */ static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum, const void *desc, int type) @@ -338,9 +337,13 @@ static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum, } /* - * OK, I lied. There are three "thread local storage" GDT entries which change + * There are three "thread local storage" GDT entries which change * on every context switch (these three entries are how glibc implements - * __thread variables). So we have a hypercall specifically for this case. + * __thread variables). As an optimization, we have a hypercall + * specifically for this case. + * + * Wouldn't it be nicer to have a general LOAD_GDT_ENTRIES hypercall + * which took a range of entries? */ static void lguest_load_tls(struct thread_struct *t, unsigned int cpu) { diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 4c4508e8a20..a24c6cfdccc 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -251,6 +251,8 @@ static noinline __kprobes int vmalloc_fault(unsigned long address) if (!(address >= VMALLOC_START && address < VMALLOC_END)) return -1; + WARN_ON_ONCE(in_nmi()); + /* * Synchronize this task's top level page-table * with the 'reference' page table. @@ -369,6 +371,8 @@ static noinline __kprobes int vmalloc_fault(unsigned long address) if (!(address >= VMALLOC_START && address < VMALLOC_END)) return -1; + WARN_ON_ONCE(in_nmi()); + /* * Copy kernel mappings over when needed. This can also * happen within a race in page table update. In the later diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index bca79091b9d..558f2d33207 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -67,7 +67,7 @@ static __init void *alloc_low_page(void) panic("alloc_low_page: ran out of memory"); adr = __va(pfn * PAGE_SIZE); - memset(adr, 0, PAGE_SIZE); + clear_page(adr); return adr; } @@ -558,7 +558,7 @@ char swsusp_pg_dir[PAGE_SIZE] static inline void save_pg_dir(void) { - memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE); + copy_page(swsusp_pg_dir, swapper_pg_dir); } #else /* !CONFIG_ACPI_SLEEP */ static inline void save_pg_dir(void) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 9a6674689a2..7c48ad4faca 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -293,7 +293,7 @@ static __ref void *alloc_low_page(unsigned long *phys) panic("alloc_low_page: ran out of memory"); adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE); - memset(adr, 0, PAGE_SIZE); + clear_page(adr); *phys = pfn * PAGE_SIZE; return adr; } diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c index 970ed579d4e..52d54bfc1eb 100644 --- a/arch/x86/mm/k8topology_64.c +++ b/arch/x86/mm/k8topology_64.c @@ -22,7 +22,7 @@ #include <asm/numa.h> #include <asm/mpspec.h> #include <asm/apic.h> -#include <asm/k8.h> +#include <asm/amd_nb.h> static struct bootnode __initdata nodes[8]; static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE; @@ -54,8 +54,8 @@ static __init int find_northbridge(void) static __init void early_get_boot_cpu_id(void) { /* - * need to get boot_cpu_id so can use that to create apicid_to_node - * in k8_scan_nodes() + * need to get the APIC ID of the BSP so can use that to + * create apicid_to_node in k8_scan_nodes() */ #ifdef CONFIG_X86_MPPARSE /* @@ -212,7 +212,7 @@ int __init k8_scan_nodes(void) bits = boot_cpu_data.x86_coreid_bits; cores = (1<<bits); apicid_base = 0; - /* need to get boot_cpu_id early for system with apicid lifting */ + /* get the APIC ID of the BSP early for systems with apicid lifting */ early_get_boot_cpu_id(); if (boot_cpu_physical_apicid > 0) { pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid); diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c index b3b531a4f8e..d87dd6d042d 100644 --- a/arch/x86/mm/kmemcheck/kmemcheck.c +++ b/arch/x86/mm/kmemcheck/kmemcheck.c @@ -631,6 +631,8 @@ bool kmemcheck_fault(struct pt_regs *regs, unsigned long address, if (!pte) return false; + WARN_ON_ONCE(in_nmi()); + if (error_code & 2) kmemcheck_access(regs, address, KMEMCHECK_WRITE); else diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index a7bcc23ef96..4962f1aeda6 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -18,7 +18,7 @@ #include <asm/dma.h> #include <asm/numa.h> #include <asm/acpi.h> -#include <asm/k8.h> +#include <asm/amd_nb.h> struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; EXPORT_SYMBOL(node_data); diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index f9897f7a9ef..9c0d0d399c3 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -420,9 +420,11 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) return -1; } - for_each_node_mask(i, nodes_parsed) - e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, - nodes[i].end >> PAGE_SHIFT); + for (i = 0; i < num_node_memblks; i++) + e820_register_active_regions(memblk_nodeid[i], + node_memblk_range[i].start >> PAGE_SHIFT, + node_memblk_range[i].end >> PAGE_SHIFT); + /* for out of order entries in SRAT */ sort_node_map(); if (!nodes_cover_memory(nodes)) { diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c index 3855096c59b..2d49d4e19a3 100644 --- a/arch/x86/oprofile/backtrace.c +++ b/arch/x86/oprofile/backtrace.c @@ -14,6 +14,7 @@ #include <asm/ptrace.h> #include <asm/uaccess.h> #include <asm/stacktrace.h> +#include <linux/compat.h> static void backtrace_warning_symbol(void *data, char *msg, unsigned long symbol) @@ -48,14 +49,12 @@ static struct stacktrace_ops backtrace_ops = { .walk_stack = print_context_stack, }; -struct frame_head { - struct frame_head *bp; - unsigned long ret; -} __attribute__((packed)); - -static struct frame_head *dump_user_backtrace(struct frame_head *head) +#ifdef CONFIG_COMPAT +static struct stack_frame_ia32 * +dump_user_backtrace_32(struct stack_frame_ia32 *head) { - struct frame_head bufhead[2]; + struct stack_frame_ia32 bufhead[2]; + struct stack_frame_ia32 *fp; /* Also check accessibility of one struct frame_head beyond */ if (!access_ok(VERIFY_READ, head, sizeof(bufhead))) @@ -63,20 +62,66 @@ static struct frame_head *dump_user_backtrace(struct frame_head *head) if (__copy_from_user_inatomic(bufhead, head, sizeof(bufhead))) return NULL; - oprofile_add_trace(bufhead[0].ret); + fp = (struct stack_frame_ia32 *) compat_ptr(bufhead[0].next_frame); + + oprofile_add_trace(bufhead[0].return_address); + + /* frame pointers should strictly progress back up the stack + * (towards higher addresses) */ + if (head >= fp) + return NULL; + + return fp; +} + +static inline int +x86_backtrace_32(struct pt_regs * const regs, unsigned int depth) +{ + struct stack_frame_ia32 *head; + + /* User process is 32-bit */ + if (!current || !test_thread_flag(TIF_IA32)) + return 0; + + head = (struct stack_frame_ia32 *) regs->bp; + while (depth-- && head) + head = dump_user_backtrace_32(head); + + return 1; +} + +#else +static inline int +x86_backtrace_32(struct pt_regs * const regs, unsigned int depth) +{ + return 0; +} +#endif /* CONFIG_COMPAT */ + +static struct stack_frame *dump_user_backtrace(struct stack_frame *head) +{ + struct stack_frame bufhead[2]; + + /* Also check accessibility of one struct stack_frame beyond */ + if (!access_ok(VERIFY_READ, head, sizeof(bufhead))) + return NULL; + if (__copy_from_user_inatomic(bufhead, head, sizeof(bufhead))) + return NULL; + + oprofile_add_trace(bufhead[0].return_address); /* frame pointers should strictly progress back up the stack * (towards higher addresses) */ - if (head >= bufhead[0].bp) + if (head >= bufhead[0].next_frame) return NULL; - return bufhead[0].bp; + return bufhead[0].next_frame; } void x86_backtrace(struct pt_regs * const regs, unsigned int depth) { - struct frame_head *head = (struct frame_head *)frame_pointer(regs); + struct stack_frame *head = (struct stack_frame *)frame_pointer(regs); if (!user_mode_vm(regs)) { unsigned long stack = kernel_stack_pointer(regs); @@ -86,6 +131,9 @@ x86_backtrace(struct pt_regs * const regs, unsigned int depth) return; } + if (x86_backtrace_32(regs, depth)) + return; + while (depth-- && head) head = dump_user_backtrace(head); } diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index cfe4faabb0f..bd1489c3ce0 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -671,7 +671,10 @@ static int __init ppro_init(char **cpu_type) case 14: *cpu_type = "i386/core"; break; - case 15: case 23: + case 0x0f: + case 0x16: + case 0x17: + case 0x1d: *cpu_type = "i386/core_2"; break; case 0x1a: @@ -692,9 +695,6 @@ static int __init ppro_init(char **cpu_type) return 1; } -/* in order to get sysfs right */ -static int using_nmi; - int __init op_nmi_init(struct oprofile_operations *ops) { __u8 vendor = boot_cpu_data.x86_vendor; @@ -702,8 +702,6 @@ int __init op_nmi_init(struct oprofile_operations *ops) char *cpu_type = NULL; int ret = 0; - using_nmi = 0; - if (!cpu_has_apic) return -ENODEV; @@ -787,13 +785,11 @@ int __init op_nmi_init(struct oprofile_operations *ops) if (ret) return ret; - using_nmi = 1; printk(KERN_INFO "oprofile: using NMI interrupt.\n"); return 0; } void op_nmi_exit(void) { - if (using_nmi) - exit_sysfs(); + exit_sysfs(); } diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 1a5353a753f..b2bb5aa3b05 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -489,8 +489,9 @@ static void xen_hvm_setup_cpu_clockevents(void) __init void xen_hvm_init_time_ops(void) { /* vector callback is needed otherwise we cannot receive interrupts - * on cpu > 0 */ - if (!xen_have_vector_callback && num_present_cpus() > 1) + * on cpu > 0 and at this point we don't know how many cpus are + * available */ + if (!xen_have_vector_callback) return; if (!xen_feature(XENFEAT_hvm_safe_pvclock)) { printk(KERN_INFO "Xen doesn't support pvclock on HVM," |