diff options
Diffstat (limited to 'arch/x86/kernel/cpu')
89 files changed, 19978 insertions, 14564 deletions
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 3f0ebe429a0..7fd54f09b01 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -14,7 +14,8 @@ CFLAGS_common.o := $(nostackp) obj-y := intel_cacheinfo.o scattered.o topology.o obj-y += proc.o capflags.o powerflags.o common.o -obj-y += vmware.o hypervisor.o sched.o mshyperv.o +obj-y += rdrand.o +obj-y += match.o obj-$(CONFIG_X86_32) += bugs.o obj-$(CONFIG_X86_64) += bugs_64.o @@ -28,17 +29,30 @@ obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o obj-$(CONFIG_PERF_EVENTS) += perf_event.o +ifdef CONFIG_PERF_EVENTS +obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd.o perf_event_amd_uncore.o +ifdef CONFIG_AMD_IOMMU +obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd_iommu.o +endif +obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o +obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o +obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_uncore.o perf_event_intel_rapl.o +endif + + obj-$(CONFIG_X86_MCE) += mcheck/ obj-$(CONFIG_MTRR) += mtrr/ -obj-$(CONFIG_CPU_FREQ) += cpufreq/ +obj-$(CONFIG_MICROCODE) += microcode/ + +obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o perf_event_amd_ibs.o -obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o +obj-$(CONFIG_HYPERVISOR_GUEST) += vmware.o hypervisor.o mshyperv.o quiet_cmd_mkcapflags = MKCAP $@ - cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@ + cmd_mkcapflags = $(CONFIG_SHELL) $(srctree)/$(src)/mkcapflags.sh $< $@ cpufeature = $(src)/../../include/asm/cpufeature.h targets += capflags.c -$(obj)/capflags.c: $(cpufeature) $(src)/mkcapflags.pl FORCE +$(obj)/capflags.c: $(cpufeature) $(src)/mkcapflags.sh FORCE $(call if_changed,mkcapflags) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 9e093f8fe78..ce8b8ff0e0e 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -1,21 +1,55 @@ -#include <linux/init.h> +#include <linux/export.h> #include <linux/bitops.h> +#include <linux/elf.h> #include <linux/mm.h> #include <linux/io.h> +#include <linux/sched.h> #include <asm/processor.h> #include <asm/apic.h> #include <asm/cpu.h> #include <asm/pci-direct.h> #ifdef CONFIG_X86_64 -# include <asm/numa_64.h> # include <asm/mmconfig.h> # include <asm/cacheflush.h> #endif #include "cpu.h" +static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) +{ + u32 gprs[8] = { 0 }; + int err; + + WARN_ONCE((boot_cpu_data.x86 != 0xf), + "%s should only be used on K8!\n", __func__); + + gprs[1] = msr; + gprs[7] = 0x9c5a203a; + + err = rdmsr_safe_regs(gprs); + + *p = gprs[0] | ((u64)gprs[2] << 32); + + return err; +} + +static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val) +{ + u32 gprs[8] = { 0 }; + + WARN_ONCE((boot_cpu_data.x86 != 0xf), + "%s should only be used on K8!\n", __func__); + + gprs[0] = (u32)val; + gprs[1] = msr; + gprs[2] = val >> 32; + gprs[7] = 0x9c5a203a; + + return wrmsr_safe_regs(gprs); +} + #ifdef CONFIG_X86_32 /* * B step AMD K6 before B 9730xxxx have hardware bugs that can cause @@ -23,17 +57,18 @@ * contact AMD for precise details and a CPU swap. * * See http://www.multimania.com/poulot/k6bug.html - * http://www.amd.com/K6/k6docs/revgd.html + * and section 2.6.2 of "AMD-K6 Processor Revision Guide - Model 6" + * (Publication # 21266 Issue Date: August 1998) * * The following test is erm.. interesting. AMD neglected to up * the chip setting when fixing the bug but they also tweaked some * performance at the same time.. */ -extern void vide(void); -__asm__(".align 4\nvide: ret"); +extern __visible void vide(void); +__asm__(".globl vide\n\t.align 4\nvide: ret"); -static void __cpuinit init_amd_k5(struct cpuinfo_x86 *c) +static void init_amd_k5(struct cpuinfo_x86 *c) { /* * General Systems BIOSen alias the cpu frequency registers @@ -51,10 +86,10 @@ static void __cpuinit init_amd_k5(struct cpuinfo_x86 *c) } -static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c) +static void init_amd_k6(struct cpuinfo_x86 *c) { u32 l, h; - int mbytes = num_physpages >> (20-PAGE_SHIFT); + int mbytes = get_num_physpages() >> (20-PAGE_SHIFT); if (c->x86_model < 6) { /* Based on AMD doc 20734R - June 2000 */ @@ -91,7 +126,6 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c) "system stability may be impaired when more than 32 MB are used.\n"); else printk(KERN_CONT "probably OK (after B9730xxxx).\n"); - printk(KERN_INFO "Please see http://membres.lycos.fr/poulot/k6bug.html\n"); } /* K6 with old style WHCR */ @@ -144,9 +178,8 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c) } } -static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c) +static void amd_k7_smp_check(struct cpuinfo_x86 *c) { -#ifdef CONFIG_SMP /* calling is from identify_secondary_cpu() ? */ if (!c->cpu_index) return; @@ -158,11 +191,11 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c) /* Athlon 660/661 is valid. */ if ((c->x86_model == 6) && ((c->x86_mask == 0) || (c->x86_mask == 1))) - goto valid_k7; + return; /* Duron 670 is valid */ if ((c->x86_model == 7) && (c->x86_mask == 0)) - goto valid_k7; + return; /* * Athlon 662, Duron 671, and Athlon >model 7 have capability @@ -175,7 +208,7 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c) ((c->x86_model == 7) && (c->x86_mask >= 1)) || (c->x86_model > 7)) if (cpu_has_mp) - goto valid_k7; + return; /* If we get here, not a certified SMP capable AMD system. */ @@ -185,15 +218,10 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c) */ WARN_ONCE(1, "WARNING: This combination of AMD" " processors is not suitable for SMP.\n"); - if (!test_taint(TAINT_UNSAFE_SMP)) - add_taint(TAINT_UNSAFE_SMP); - -valid_k7: - ; -#endif + add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_NOW_UNRELIABLE); } -static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c) +static void init_amd_k7(struct cpuinfo_x86 *c) { u32 l, h; @@ -205,9 +233,7 @@ static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c) if (c->x86_model >= 6 && c->x86_model <= 10) { if (!cpu_has(c, X86_FEATURE_XMM)) { printk(KERN_INFO "Enabling disabled K7/SSE Support.\n"); - rdmsr(MSR_K7_HWCR, l, h); - l &= ~0x00008000; - wrmsr(MSR_K7_HWCR, l, h); + msr_clear_bit(MSR_K7_HWCR, 15); set_cpu_cap(c, X86_FEATURE_XMM); } } @@ -233,18 +259,22 @@ static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c) } #endif -#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) -static int __cpuinit nearby_node(int apicid) +#ifdef CONFIG_NUMA +/* + * To workaround broken NUMA config. Read the comment in + * srat_detect_node(). + */ +static int nearby_node(int apicid) { int i, node; for (i = apicid - 1; i >= 0; i--) { - node = apicid_to_node[i]; + node = __apicid_to_node[i]; if (node != NUMA_NO_NODE && node_online(node)) return node; } for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) { - node = apicid_to_node[i]; + node = __apicid_to_node[i]; if (node != NUMA_NO_NODE && node_online(node)) return node; } @@ -259,14 +289,14 @@ static int __cpuinit nearby_node(int apicid) * (2) AMD processors supporting compute units */ #ifdef CONFIG_X86_HT -static void __cpuinit amd_get_topology(struct cpuinfo_x86 *c) +static void amd_get_topology(struct cpuinfo_x86 *c) { - u32 nodes; + u32 nodes, cores_per_cu = 1; u8 node_id; int cpu = smp_processor_id(); /* get information required for multi-node processors */ - if (cpu_has(c, X86_FEATURE_TOPOEXT)) { + if (cpu_has_topoext) { u32 eax, ebx, ecx, edx; cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); @@ -276,6 +306,7 @@ static void __cpuinit amd_get_topology(struct cpuinfo_x86 *c) /* get compute unit information */ smp_num_siblings = ((ebx >> 8) & 3) + 1; c->compute_unit_id = ebx & 0xff; + cores_per_cu += ((ebx >> 8) & 3); } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) { u64 value; @@ -288,24 +319,27 @@ static void __cpuinit amd_get_topology(struct cpuinfo_x86 *c) /* fixup multi-node processor information */ if (nodes > 1) { u32 cores_per_node; + u32 cus_per_node; set_cpu_cap(c, X86_FEATURE_AMD_DCM); cores_per_node = c->x86_max_cores / nodes; + cus_per_node = cores_per_node / cores_per_cu; /* store NodeID, use llc_shared_map to store sibling info */ per_cpu(cpu_llc_id, cpu) = node_id; - /* core id to be in range from 0 to (cores_per_node - 1) */ - c->cpu_core_id = c->cpu_core_id % cores_per_node; + /* core id has to be in the [0 .. cores_per_node - 1] range */ + c->cpu_core_id %= cores_per_node; + c->compute_unit_id %= cus_per_node; } } #endif /* - * On a AMD dual core setup the lower bits of the APIC id distingush the cores. + * On a AMD dual core setup the lower bits of the APIC id distinguish the cores. * Assumes number of cores is a power of two. */ -static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c) +static void amd_detect_cmp(struct cpuinfo_x86 *c) { #ifdef CONFIG_X86_HT unsigned bits; @@ -322,9 +356,9 @@ static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c) #endif } -int amd_get_nb_id(int cpu) +u16 amd_get_nb_id(int cpu) { - int id = 0; + u16 id = 0; #ifdef CONFIG_SMP id = per_cpu(cpu_llc_id, cpu); #endif @@ -332,33 +366,50 @@ int amd_get_nb_id(int cpu) } EXPORT_SYMBOL_GPL(amd_get_nb_id); -static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) +static void srat_detect_node(struct cpuinfo_x86 *c) { -#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) +#ifdef CONFIG_NUMA int cpu = smp_processor_id(); int node; unsigned apicid = c->apicid; - node = per_cpu(cpu_llc_id, cpu); + node = numa_cpu_node(cpu); + if (node == NUMA_NO_NODE) + node = per_cpu(cpu_llc_id, cpu); - if (apicid_to_node[apicid] != NUMA_NO_NODE) - node = apicid_to_node[apicid]; - if (!node_online(node)) { - /* Two possibilities here: - - The CPU is missing memory and no node was created. - In that case try picking one from a nearby CPU - - The APIC IDs differ from the HyperTransport node IDs - which the K8 northbridge parsing fills in. - Assume they are all increased by a constant offset, - but in the same order as the HT nodeids. - If that doesn't result in a usable node fall back to the - path for the previous case. */ + /* + * On multi-fabric platform (e.g. Numascale NumaChip) a + * platform-specific handler needs to be called to fixup some + * IDs of the CPU. + */ + if (x86_cpuinit.fixup_cpu_id) + x86_cpuinit.fixup_cpu_id(c, node); + if (!node_online(node)) { + /* + * Two possibilities here: + * + * - The CPU is missing memory and no node was created. In + * that case try picking one from a nearby CPU. + * + * - The APIC IDs differ from the HyperTransport node IDs + * which the K8 northbridge parsing fills in. Assume + * they are all increased by a constant offset, but in + * the same order as the HT nodeids. If that doesn't + * result in a usable node fall back to the path for the + * previous case. + * + * This workaround operates directly on the mapping between + * APIC ID and NUMA node, assuming certain relationship + * between APIC ID, HT node ID and NUMA topology. As going + * through CPU mapping may alter the outcome, directly + * access __apicid_to_node[]. + */ int ht_nodeid = c->initial_apicid; if (ht_nodeid >= 0 && - apicid_to_node[ht_nodeid] != NUMA_NO_NODE) - node = apicid_to_node[ht_nodeid]; + __apicid_to_node[ht_nodeid] != NUMA_NO_NODE) + node = __apicid_to_node[ht_nodeid]; /* Pick a nearby node */ if (!node_online(node)) node = nearby_node(apicid); @@ -367,7 +418,7 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) #endif } -static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c) +static void early_init_amd_mc(struct cpuinfo_x86 *c) { #ifdef CONFIG_X86_HT unsigned bits, ecx; @@ -393,7 +444,35 @@ static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c) #endif } -static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) +static void bsp_init_amd(struct cpuinfo_x86 *c) +{ + if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) { + + if (c->x86 > 0x10 || + (c->x86 == 0x10 && c->x86_model >= 0x2)) { + u64 val; + + rdmsrl(MSR_K7_HWCR, val); + if (!(val & BIT(24))) + printk(KERN_WARNING FW_BUG "TSC doesn't count " + "with P0 frequency!\n"); + } + } + + if (c->x86 == 0x15) { + unsigned long upperbit; + u32 cpuid, assoc; + + cpuid = cpuid_edx(0x80000005); + assoc = cpuid >> 16 & 0xff; + upperbit = ((cpuid >> 24) << 10) / assoc; + + va_align.mask = (upperbit - 1) & PAGE_MASK; + va_align.flags = ALIGN_VA_32 | ALIGN_VA_64; + } +} + +static void early_init_amd(struct cpuinfo_x86 *c) { early_init_amd_mc(c); @@ -404,6 +483,8 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) if (c->x86_power & (1 << 8)) { set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); + if (!check_tsc_unstable()) + set_sched_clock_stable(); } #ifdef CONFIG_X86_64 @@ -425,29 +506,21 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) } #endif - /* We need to do the following only once */ - if (c != &boot_cpu_data) - return; - - if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) { - - if (c->x86 > 0x10 || - (c->x86 == 0x10 && c->x86_model >= 0x2)) { - u64 val; - - rdmsrl(MSR_K7_HWCR, val); - if (!(val & BIT(24))) - printk(KERN_WARNING FW_BUG "TSC doesn't count " - "with P0 frequency!\n"); - } - } + /* F16h erratum 793, CVE-2013-6885 */ + if (c->x86 == 0x16 && c->x86_model <= 0xf) + msr_set_bit(MSR_AMD64_LS_CFG, 15); } -static void __cpuinit init_amd(struct cpuinfo_x86 *c) +static const int amd_erratum_383[]; +static const int amd_erratum_400[]; +static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum); + +static void init_amd(struct cpuinfo_x86 *c) { -#ifdef CONFIG_SMP + u32 dummy; unsigned long long value; +#ifdef CONFIG_SMP /* * Disable TLB flush filter by setting HWCR.FFDIS on K8 * bit 6 of msr C001_0015 @@ -455,11 +528,8 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) * Errata 63 for SH-B3 steppings * Errata 122 for all steppings (F+ have it disabled by default) */ - if (c->x86 == 0xf) { - rdmsrl(MSR_K7_HWCR, value); - value |= 1 << 6; - wrmsrl(MSR_K7_HWCR, value); - } + if (c->x86 == 0xf) + msr_set_bit(MSR_K7_HWCR, 6); #endif early_init_amd(c); @@ -485,12 +555,10 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) * (AMD Erratum #110, docId: 25759). */ if (c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM)) { - u64 val; - clear_cpu_cap(c, X86_FEATURE_LAHF_LM); - if (!rdmsrl_amd_safe(0xc001100d, &val)) { - val &= ~(1ULL << 32); - wrmsrl_amd_safe(0xc001100d, val); + if (!rdmsrl_amd_safe(0xc001100d, &value)) { + value &= ~(1ULL << 32); + wrmsrl_amd_safe(0xc001100d, value); } } @@ -539,6 +607,33 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) } } + /* re-enable TopologyExtensions if switched off by BIOS */ + if ((c->x86 == 0x15) && + (c->x86_model >= 0x10) && (c->x86_model <= 0x1f) && + !cpu_has(c, X86_FEATURE_TOPOEXT)) { + + if (msr_set_bit(0xc0011005, 54) > 0) { + rdmsrl(0xc0011005, value); + if (value & BIT_64(54)) { + set_cpu_cap(c, X86_FEATURE_TOPOEXT); + pr_info(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n"); + } + } + } + + /* + * The way access filter has a performance penalty on some workloads. + * Disable it on the affected CPUs. + */ + if ((c->x86 == 0x15) && + (c->x86_model >= 0x02) && (c->x86_model < 0x20)) { + + if (!rdmsrl_safe(0xc0011021, &value) && !(value & 0x1E)) { + value |= 0x1E; + wrmsrl_safe(0xc0011021, value); + } + } + cpu_detect_cache_sizes(c); /* Multi core CPU? */ @@ -551,12 +646,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) detect_ht(c); #endif - if (c->extended_cpuid_level >= 0x80000006) { - if (cpuid_edx(0x80000006) & 0xf000) - num_cache_leaves = 4; - else - num_cache_leaves = 3; - } + init_amd_cacheinfo(c); if (c->x86 >= 0xf) set_cpu_cap(c, X86_FEATURE_K8); @@ -584,21 +674,58 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) * benefit in doing so. */ if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) { + unsigned long pfn = tseg >> PAGE_SHIFT; + printk(KERN_DEBUG "tseg: %010llx\n", tseg); - if ((tseg>>PMD_SHIFT) < - (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) || - ((tseg>>PMD_SHIFT) < - (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) && - (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT)))) + if (pfn_range_is_mapped(pfn, pfn + 1)) set_memory_4k((unsigned long)__va(tseg), 1); } } #endif + + /* + * Family 0x12 and above processors have APIC timer + * running in deep C states. + */ + if (c->x86 > 0x11) + set_cpu_cap(c, X86_FEATURE_ARAT); + + if (c->x86 == 0x10) { + /* + * Disable GART TLB Walk Errors on Fam10h. We do this here + * because this is always needed when GART is enabled, even in a + * kernel which has no MCE support built in. + * BIOS should disable GartTlbWlk Errors already. If + * it doesn't, do it here as suggested by the BKDG. + * + * Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=33012 + */ + msr_set_bit(MSR_AMD64_MCx_MASK(4), 10); + + /* + * On family 10h BIOS may not have properly enabled WC+ support, + * causing it to be converted to CD memtype. This may result in + * performance degradation for certain nested-paging guests. + * Prevent this conversion by clearing bit 24 in + * MSR_AMD64_BU_CFG2. + * + * NOTE: we want to use the _safe accessors so as not to #GP kvm + * guests on older kvm hosts. + */ + msr_clear_bit(MSR_AMD64_BU_CFG2, 24); + + if (cpu_has_amd_erratum(c, amd_erratum_383)) + set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH); + } + + if (cpu_has_amd_erratum(c, amd_erratum_400)) + set_cpu_bug(c, X86_BUG_AMD_APIC_C1E); + + rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); } #ifdef CONFIG_X86_32 -static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, - unsigned int size) +static unsigned int amd_size_cache(struct cpuinfo_x86 *c, unsigned int size) { /* AMD errata T13 (order #21922) */ if ((c->x86 == 6)) { @@ -614,12 +741,68 @@ static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, } #endif -static const struct cpu_dev __cpuinitconst amd_cpu_dev = { +static void cpu_set_tlb_flushall_shift(struct cpuinfo_x86 *c) +{ + tlb_flushall_shift = 6; +} + +static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c) +{ + u32 ebx, eax, ecx, edx; + u16 mask = 0xfff; + + if (c->x86 < 0xf) + return; + + if (c->extended_cpuid_level < 0x80000006) + return; + + cpuid(0x80000006, &eax, &ebx, &ecx, &edx); + + tlb_lld_4k[ENTRIES] = (ebx >> 16) & mask; + tlb_lli_4k[ENTRIES] = ebx & mask; + + /* + * K8 doesn't have 2M/4M entries in the L2 TLB so read out the L1 TLB + * characteristics from the CPUID function 0x80000005 instead. + */ + if (c->x86 == 0xf) { + cpuid(0x80000005, &eax, &ebx, &ecx, &edx); + mask = 0xff; + } + + /* Handle DTLB 2M and 4M sizes, fall back to L1 if L2 is disabled */ + if (!((eax >> 16) & mask)) + tlb_lld_2m[ENTRIES] = (cpuid_eax(0x80000005) >> 16) & 0xff; + else + tlb_lld_2m[ENTRIES] = (eax >> 16) & mask; + + /* a 4M entry uses two 2M entries */ + tlb_lld_4m[ENTRIES] = tlb_lld_2m[ENTRIES] >> 1; + + /* Handle ITLB 2M and 4M sizes, fall back to L1 if L2 is disabled */ + if (!(eax & mask)) { + /* Erratum 658 */ + if (c->x86 == 0x15 && c->x86_model <= 0x1f) { + tlb_lli_2m[ENTRIES] = 1024; + } else { + cpuid(0x80000005, &eax, &ebx, &ecx, &edx); + tlb_lli_2m[ENTRIES] = eax & 0xff; + } + } else + tlb_lli_2m[ENTRIES] = eax & mask; + + tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1; + + cpu_set_tlb_flushall_shift(c); +} + +static const struct cpu_dev amd_cpu_dev = { .c_vendor = "AMD", .c_ident = { "AuthenticAMD" }, #ifdef CONFIG_X86_32 - .c_models = { - { .vendor = X86_VENDOR_AMD, .family = 4, .model_names = + .legacy_models = { + { .family = 4, .model_names = { [3] = "486 DX/2", [7] = "486 DX/2-WB", @@ -630,9 +813,11 @@ static const struct cpu_dev __cpuinitconst amd_cpu_dev = { } }, }, - .c_size_cache = amd_size_cache, + .legacy_cache_size = amd_size_cache, #endif .c_early_init = early_init_amd, + .c_detect_tlb = cpu_detect_tlb_amd, + .c_bsp_init = bsp_init_amd, .c_init = init_amd, .c_x86_vendor = X86_VENDOR_AMD, }; @@ -646,8 +831,7 @@ cpu_dev_register(amd_cpu_dev); * AMD_OSVW_ERRATUM() macros. The latter is intended for newer errata that * have an OSVW id assigned, which it takes as first argument. Both take a * variable number of family-specific model-stepping ranges created by - * AMD_MODEL_RANGE(). Each erratum also has to be declared as extern const - * int[] in arch/x86/include/asm/processor.h. + * AMD_MODEL_RANGE(). * * Example: * @@ -657,32 +841,28 @@ cpu_dev_register(amd_cpu_dev); * AMD_MODEL_RANGE(0x10, 0x9, 0x0, 0x9, 0x0)); */ -const int amd_erratum_400[] = +#define AMD_LEGACY_ERRATUM(...) { -1, __VA_ARGS__, 0 } +#define AMD_OSVW_ERRATUM(osvw_id, ...) { osvw_id, __VA_ARGS__, 0 } +#define AMD_MODEL_RANGE(f, m_start, s_start, m_end, s_end) \ + ((f << 24) | (m_start << 16) | (s_start << 12) | (m_end << 4) | (s_end)) +#define AMD_MODEL_RANGE_FAMILY(range) (((range) >> 24) & 0xff) +#define AMD_MODEL_RANGE_START(range) (((range) >> 12) & 0xfff) +#define AMD_MODEL_RANGE_END(range) ((range) & 0xfff) + +static const int amd_erratum_400[] = AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf), AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf)); -EXPORT_SYMBOL_GPL(amd_erratum_400); -const int amd_erratum_383[] = +static const int amd_erratum_383[] = AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf)); -EXPORT_SYMBOL_GPL(amd_erratum_383); -bool cpu_has_amd_erratum(const int *erratum) + +static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum) { - struct cpuinfo_x86 *cpu = ¤t_cpu_data; int osvw_id = *erratum++; u32 range; u32 ms; - /* - * If called early enough that current_cpu_data hasn't been initialized - * yet, fall back to boot_cpu_data. - */ - if (cpu->x86 == 0) - cpu = &boot_cpu_data; - - if (cpu->x86_vendor != X86_VENDOR_AMD) - return false; - if (osvw_id >= 0 && osvw_id < 65536 && cpu_has(cpu, X86_FEATURE_OSVW)) { u64 osvw_len; @@ -707,5 +887,3 @@ bool cpu_has_amd_erratum(const int *erratum) return false; } - -EXPORT_SYMBOL_GPL(cpu_has_amd_erratum); diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index c39576cb301..03445346ee0 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -17,23 +17,6 @@ #include <asm/paravirt.h> #include <asm/alternative.h> -static int __init no_halt(char *s) -{ - boot_cpu_data.hlt_works_ok = 0; - return 1; -} - -__setup("no-hlt", no_halt); - -static int __init no_387(char *s) -{ - boot_cpu_data.hard_math = 0; - write_cr0(X86_CR0_TS | X86_CR0_EM | X86_CR0_MP | read_cr0()); - return 1; -} - -__setup("no387", no_387); - static double __initdata x = 4195835.0; static double __initdata y = 3145727.0; @@ -52,20 +35,13 @@ static void __init check_fpu(void) { s32 fdiv_bug; - if (!boot_cpu_data.hard_math) { -#ifndef CONFIG_MATH_EMULATION - printk(KERN_EMERG "No coprocessor found and no math emulation present.\n"); - printk(KERN_EMERG "Giving up.\n"); - for (;;) ; -#endif - return; - } + kernel_fpu_begin(); /* * trap_init() enabled FXSR and company _before_ testing for FP * problems here. * - * Test for the divl bug.. + * Test for the divl bug: http://en.wikipedia.org/wiki/Fdiv_bug */ __asm__("fninit\n\t" "fldl %1\n\t" @@ -79,91 +55,40 @@ static void __init check_fpu(void) : "=m" (*&fdiv_bug) : "m" (*&x), "m" (*&y)); - boot_cpu_data.fdiv_bug = fdiv_bug; - if (boot_cpu_data.fdiv_bug) - printk(KERN_WARNING "Hmm, FPU with FDIV bug.\n"); -} - -static void __init check_hlt(void) -{ - if (boot_cpu_data.x86 >= 5 || paravirt_enabled()) - return; + kernel_fpu_end(); - printk(KERN_INFO "Checking 'hlt' instruction... "); - if (!boot_cpu_data.hlt_works_ok) { - printk("disabled\n"); - return; + if (fdiv_bug) { + set_cpu_bug(&boot_cpu_data, X86_BUG_FDIV); + pr_warn("Hmm, FPU with FDIV bug\n"); } - halt(); - halt(); - halt(); - halt(); - printk(KERN_CONT "OK.\n"); -} - -/* - * Most 386 processors have a bug where a POPAD can lock the - * machine even from user space. - */ - -static void __init check_popad(void) -{ -#ifndef CONFIG_X86_POPAD_OK - int res, inp = (int) &res; - - printk(KERN_INFO "Checking for popad bug... "); - __asm__ __volatile__( - "movl $12345678,%%eax; movl $0,%%edi; pusha; popa; movl (%%edx,%%edi),%%ecx " - : "=&a" (res) - : "d" (inp) - : "ecx", "edi"); - /* - * If this fails, it means that any user program may lock the - * CPU hard. Too bad. - */ - if (res != 12345678) - printk(KERN_CONT "Buggy.\n"); - else - printk(KERN_CONT "OK.\n"); -#endif -} - -/* - * Check whether we are able to run this kernel safely on SMP. - * - * - In order to run on a i386, we need to be compiled for i386 - * (for due to lack of "invlpg" and working WP on a i386) - * - In order to run on anything without a TSC, we need to be - * compiled for a i486. - */ - -static void __init check_config(void) -{ -/* - * We'd better not be a i386 if we're configured to use some - * i486+ only features! (WP works in supervisor mode and the - * new "invlpg" and "bswap" instructions) - */ -#if defined(CONFIG_X86_WP_WORKS_OK) || defined(CONFIG_X86_INVLPG) || \ - defined(CONFIG_X86_BSWAP) - if (boot_cpu_data.x86 == 3) - panic("Kernel requires i486+ for 'invlpg' and other features"); -#endif } - void __init check_bugs(void) { identify_boot_cpu(); #ifndef CONFIG_SMP - printk(KERN_INFO "CPU: "); + pr_info("CPU: "); print_cpu_info(&boot_cpu_data); #endif - check_config(); - check_fpu(); - check_hlt(); - check_popad(); + + /* + * Check whether we are able to run this kernel safely on SMP. + * + * - i386 is no longer supported. + * - In order to run on anything without a TSC, we need to be + * compiled for a i486. + */ + if (boot_cpu_data.x86 < 4) + panic("Kernel requires i486+ for 'invlpg' and other features"); + init_utsname()->machine[1] = '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86); alternative_instructions(); + + /* + * kernel_fpu_begin/end() in check_fpu() relies on the patched + * alternative instructions. + */ + if (cpu_has_fpu) + check_fpu(); } diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index e58d978e075..d8fba5c15fb 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -1,6 +1,5 @@ #include <linux/bitops.h> #include <linux/kernel.h> -#include <linux/init.h> #include <asm/processor.h> #include <asm/e820.h> @@ -9,236 +8,6 @@ #include "cpu.h" -#ifdef CONFIG_X86_OOSTORE - -static u32 __cpuinit power2(u32 x) -{ - u32 s = 1; - - while (s <= x) - s <<= 1; - - return s >>= 1; -} - - -/* - * Set up an actual MCR - */ -static void __cpuinit centaur_mcr_insert(int reg, u32 base, u32 size, int key) -{ - u32 lo, hi; - - hi = base & ~0xFFF; - lo = ~(size-1); /* Size is a power of 2 so this makes a mask */ - lo &= ~0xFFF; /* Remove the ctrl value bits */ - lo |= key; /* Attribute we wish to set */ - wrmsr(reg+MSR_IDT_MCR0, lo, hi); - mtrr_centaur_report_mcr(reg, lo, hi); /* Tell the mtrr driver */ -} - -/* - * Figure what we can cover with MCR's - * - * Shortcut: We know you can't put 4Gig of RAM on a winchip - */ -static u32 __cpuinit ramtop(void) -{ - u32 clip = 0xFFFFFFFFUL; - u32 top = 0; - int i; - - for (i = 0; i < e820.nr_map; i++) { - unsigned long start, end; - - if (e820.map[i].addr > 0xFFFFFFFFUL) - continue; - /* - * Don't MCR over reserved space. Ignore the ISA hole - * we frob around that catastrophe already - */ - if (e820.map[i].type == E820_RESERVED) { - if (e820.map[i].addr >= 0x100000UL && - e820.map[i].addr < clip) - clip = e820.map[i].addr; - continue; - } - start = e820.map[i].addr; - end = e820.map[i].addr + e820.map[i].size; - if (start >= end) - continue; - if (end > top) - top = end; - } - /* - * Everything below 'top' should be RAM except for the ISA hole. - * Because of the limited MCR's we want to map NV/ACPI into our - * MCR range for gunk in RAM - * - * Clip might cause us to MCR insufficient RAM but that is an - * acceptable failure mode and should only bite obscure boxes with - * a VESA hole at 15Mb - * - * The second case Clip sometimes kicks in is when the EBDA is marked - * as reserved. Again we fail safe with reasonable results - */ - if (top > clip) - top = clip; - - return top; -} - -/* - * Compute a set of MCR's to give maximum coverage - */ -static int __cpuinit centaur_mcr_compute(int nr, int key) -{ - u32 mem = ramtop(); - u32 root = power2(mem); - u32 base = root; - u32 top = root; - u32 floor = 0; - int ct = 0; - - while (ct < nr) { - u32 fspace = 0; - u32 high; - u32 low; - - /* - * Find the largest block we will fill going upwards - */ - high = power2(mem-top); - - /* - * Find the largest block we will fill going downwards - */ - low = base/2; - - /* - * Don't fill below 1Mb going downwards as there - * is an ISA hole in the way. - */ - if (base <= 1024*1024) - low = 0; - - /* - * See how much space we could cover by filling below - * the ISA hole - */ - - if (floor == 0) - fspace = 512*1024; - else if (floor == 512*1024) - fspace = 128*1024; - - /* And forget ROM space */ - - /* - * Now install the largest coverage we get - */ - if (fspace > high && fspace > low) { - centaur_mcr_insert(ct, floor, fspace, key); - floor += fspace; - } else if (high > low) { - centaur_mcr_insert(ct, top, high, key); - top += high; - } else if (low > 0) { - base -= low; - centaur_mcr_insert(ct, base, low, key); - } else - break; - ct++; - } - /* - * We loaded ct values. We now need to set the mask. The caller - * must do this bit. - */ - return ct; -} - -static void __cpuinit centaur_create_optimal_mcr(void) -{ - int used; - int i; - - /* - * Allocate up to 6 mcrs to mark as much of ram as possible - * as write combining and weak write ordered. - * - * To experiment with: Linux never uses stack operations for - * mmio spaces so we could globally enable stack operation wc - * - * Load the registers with type 31 - full write combining, all - * writes weakly ordered. - */ - used = centaur_mcr_compute(6, 31); - - /* - * Wipe unused MCRs - */ - for (i = used; i < 8; i++) - wrmsr(MSR_IDT_MCR0+i, 0, 0); -} - -static void __cpuinit winchip2_create_optimal_mcr(void) -{ - u32 lo, hi; - int used; - int i; - - /* - * Allocate up to 6 mcrs to mark as much of ram as possible - * as write combining, weak store ordered. - * - * Load the registers with type 25 - * 8 - weak write ordering - * 16 - weak read ordering - * 1 - write combining - */ - used = centaur_mcr_compute(6, 25); - - /* - * Mark the registers we are using. - */ - rdmsr(MSR_IDT_MCR_CTRL, lo, hi); - for (i = 0; i < used; i++) - lo |= 1<<(9+i); - wrmsr(MSR_IDT_MCR_CTRL, lo, hi); - - /* - * Wipe unused MCRs - */ - - for (i = used; i < 8; i++) - wrmsr(MSR_IDT_MCR0+i, 0, 0); -} - -/* - * Handle the MCR key on the Winchip 2. - */ -static void __cpuinit winchip2_unprotect_mcr(void) -{ - u32 lo, hi; - u32 key; - - rdmsr(MSR_IDT_MCR_CTRL, lo, hi); - lo &= ~0x1C0; /* blank bits 8-6 */ - key = (lo>>17) & 7; - lo |= key<<6; /* replace with unlock key */ - wrmsr(MSR_IDT_MCR_CTRL, lo, hi); -} - -static void __cpuinit winchip2_protect_mcr(void) -{ - u32 lo, hi; - - rdmsr(MSR_IDT_MCR_CTRL, lo, hi); - lo &= ~0x1C0; /* blank bits 8-6 */ - wrmsr(MSR_IDT_MCR_CTRL, lo, hi); -} -#endif /* CONFIG_X86_OOSTORE */ - #define ACE_PRESENT (1 << 6) #define ACE_ENABLED (1 << 7) #define ACE_FCR (1 << 28) /* MSR_VIA_FCR */ @@ -247,7 +16,7 @@ static void __cpuinit winchip2_protect_mcr(void) #define RNG_ENABLED (1 << 3) #define RNG_ENABLE (1 << 6) /* MSR_VIA_RNG */ -static void __cpuinit init_c3(struct cpuinfo_x86 *c) +static void init_c3(struct cpuinfo_x86 *c) { u32 lo, hi; @@ -278,7 +47,7 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c) } #ifdef CONFIG_X86_32 /* Cyrix III family needs CX8 & PGE explicitly enabled. */ - if (c->x86_model >= 6 && c->x86_model <= 9) { + if (c->x86_model >= 6 && c->x86_model <= 13) { rdmsr(MSR_VIA_FCR, lo, hi); lo |= (1<<1 | 1<<7); wrmsr(MSR_VIA_FCR, lo, hi); @@ -318,7 +87,7 @@ enum { EAMD3D = 1<<20, }; -static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c) +static void early_init_centaur(struct cpuinfo_x86 *c) { switch (c->x86) { #ifdef CONFIG_X86_32 @@ -337,7 +106,7 @@ static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c) #endif } -static void __cpuinit init_centaur(struct cpuinfo_x86 *c) +static void init_centaur(struct cpuinfo_x86 *c) { #ifdef CONFIG_X86_32 char *name; @@ -363,20 +132,6 @@ static void __cpuinit init_centaur(struct cpuinfo_x86 *c) fcr_clr = DPDC; printk(KERN_NOTICE "Disabling bugged TSC.\n"); clear_cpu_cap(c, X86_FEATURE_TSC); -#ifdef CONFIG_X86_OOSTORE - centaur_create_optimal_mcr(); - /* - * Enable: - * write combining on non-stack, non-string - * write combining on string, all types - * weak write ordering - * - * The C6 original lacks weak read order - * - * Note 0x120 is write only on Winchip 1 - */ - wrmsr(MSR_IDT_MCR_CTRL, 0x01F0001F, 0); -#endif break; case 8: switch (c->x86_mask) { @@ -393,40 +148,12 @@ static void __cpuinit init_centaur(struct cpuinfo_x86 *c) fcr_set = ECX8|DSMC|DTLOCK|EMMX|EBRPRED|ERETSTK| E2MMX|EAMD3D; fcr_clr = DPDC; -#ifdef CONFIG_X86_OOSTORE - winchip2_unprotect_mcr(); - winchip2_create_optimal_mcr(); - rdmsr(MSR_IDT_MCR_CTRL, lo, hi); - /* - * Enable: - * write combining on non-stack, non-string - * write combining on string, all types - * weak write ordering - */ - lo |= 31; - wrmsr(MSR_IDT_MCR_CTRL, lo, hi); - winchip2_protect_mcr(); -#endif break; case 9: name = "3"; fcr_set = ECX8|DSMC|DTLOCK|EMMX|EBRPRED|ERETSTK| E2MMX|EAMD3D; fcr_clr = DPDC; -#ifdef CONFIG_X86_OOSTORE - winchip2_unprotect_mcr(); - winchip2_create_optimal_mcr(); - rdmsr(MSR_IDT_MCR_CTRL, lo, hi); - /* - * Enable: - * write combining on non-stack, non-string - * write combining on string, all types - * weak write ordering - */ - lo |= 31; - wrmsr(MSR_IDT_MCR_CTRL, lo, hi); - winchip2_protect_mcr(); -#endif break; default: name = "??"; @@ -468,10 +195,10 @@ static void __cpuinit init_centaur(struct cpuinfo_x86 *c) #endif } -static unsigned int __cpuinit +#ifdef CONFIG_X86_32 +static unsigned int centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size) { -#ifdef CONFIG_X86_32 /* VIA C3 CPUs (670-68F) need further shifting. */ if ((c->x86 == 6) && ((c->x86_model == 7) || (c->x86_model == 8))) size >>= 8; @@ -484,16 +211,18 @@ centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size) if ((c->x86 == 6) && (c->x86_model == 9) && (c->x86_mask == 1) && (size == 65)) size -= 1; -#endif return size; } +#endif -static const struct cpu_dev __cpuinitconst centaur_cpu_dev = { +static const struct cpu_dev centaur_cpu_dev = { .c_vendor = "Centaur", .c_ident = { "CentaurHauls" }, .c_early_init = early_init_centaur, .c_init = init_centaur, - .c_size_cache = centaur_size_cache, +#ifdef CONFIG_X86_32 + .legacy_cache_size = centaur_size_cache, +#endif .c_x86_vendor = X86_VENDOR_CENTAUR, }; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 4b68bda3093..ef1b93f18ed 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -8,6 +8,7 @@ #include <linux/delay.h> #include <linux/sched.h> #include <linux/init.h> +#include <linux/kprobes.h> #include <linux/kgdb.h> #include <linux/smp.h> #include <linux/io.h> @@ -15,18 +16,22 @@ #include <asm/stackprotector.h> #include <asm/perf_event.h> #include <asm/mmu_context.h> +#include <asm/archrandom.h> #include <asm/hypervisor.h> #include <asm/processor.h> +#include <asm/debugreg.h> #include <asm/sections.h> +#include <asm/vsyscall.h> #include <linux/topology.h> #include <linux/cpumask.h> #include <asm/pgtable.h> -#include <asm/atomic.h> +#include <linux/atomic.h> #include <asm/proto.h> #include <asm/setup.h> #include <asm/apic.h> #include <asm/desc.h> #include <asm/i387.h> +#include <asm/fpu-internal.h> #include <asm/mtrr.h> #include <linux/numa.h> #include <asm/asm.h> @@ -34,6 +39,8 @@ #include <asm/mce.h> #include <asm/msr.h> #include <asm/pat.h> +#include <asm/microcode.h> +#include <asm/microcode_intel.h> #ifdef CONFIG_X86_LOCAL_APIC #include <asm/uv/uv.h> @@ -58,7 +65,7 @@ void __init setup_cpu_local_masks(void) alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask); } -static void __cpuinit default_init(struct cpuinfo_x86 *c) +static void default_init(struct cpuinfo_x86 *c) { #ifdef CONFIG_X86_64 cpu_detect_cache_sizes(c); @@ -75,13 +82,13 @@ static void __cpuinit default_init(struct cpuinfo_x86 *c) #endif } -static const struct cpu_dev __cpuinitconst default_cpu = { +static const struct cpu_dev default_cpu = { .c_init = default_init, .c_vendor = "Unknown", .c_x86_vendor = X86_VENDOR_UNKNOWN, }; -static const struct cpu_dev *this_cpu __cpuinitdata = &default_cpu; +static const struct cpu_dev *this_cpu = &default_cpu; DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { #ifdef CONFIG_X86_64 @@ -141,6 +148,8 @@ static int __init x86_xsave_setup(char *s) { setup_clear_cpu_cap(X86_FEATURE_XSAVE); setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); + setup_clear_cpu_cap(X86_FEATURE_AVX); + setup_clear_cpu_cap(X86_FEATURE_AVX2); return 1; } __setup("noxsave", x86_xsave_setup); @@ -153,8 +162,8 @@ static int __init x86_xsaveopt_setup(char *s) __setup("noxsaveopt", x86_xsaveopt_setup); #ifdef CONFIG_X86_32 -static int cachesize_override __cpuinitdata = -1; -static int disable_x86_serial_nr __cpuinitdata = 1; +static int cachesize_override = -1; +static int disable_x86_serial_nr = 1; static int __init cachesize_setup(char *str) { @@ -208,12 +217,12 @@ static inline int flag_is_changeable_p(u32 flag) } /* Probe for the CPUID instruction */ -static int __cpuinit have_cpuid_p(void) +int have_cpuid_p(void) { return flag_is_changeable_p(X86_EFLAGS_ID); } -static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c) +static void squash_the_stupid_serial_number(struct cpuinfo_x86 *c) { unsigned long lo, hi; @@ -244,15 +253,47 @@ static inline int flag_is_changeable_p(u32 flag) { return 1; } -/* Probe for the CPUID instruction */ -static inline int have_cpuid_p(void) +static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c) { +} +#endif + +static __init int setup_disable_smep(char *arg) +{ + setup_clear_cpu_cap(X86_FEATURE_SMEP); return 1; } -static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c) +__setup("nosmep", setup_disable_smep); + +static __always_inline void setup_smep(struct cpuinfo_x86 *c) +{ + if (cpu_has(c, X86_FEATURE_SMEP)) + set_in_cr4(X86_CR4_SMEP); +} + +static __init int setup_disable_smap(char *arg) { + setup_clear_cpu_cap(X86_FEATURE_SMAP); + return 1; } +__setup("nosmap", setup_disable_smap); + +static __always_inline void setup_smap(struct cpuinfo_x86 *c) +{ + unsigned long eflags; + + /* This should have been cleared long ago */ + raw_local_save_flags(eflags); + BUG_ON(eflags & X86_EFLAGS_AC); + + if (cpu_has(c, X86_FEATURE_SMAP)) { +#ifdef CONFIG_X86_SMAP + set_in_cr4(X86_CR4_SMAP); +#else + clear_in_cr4(X86_CR4_SMAP); #endif + } +} /* * Some CPU features depend on higher CPUID levels, which may not always @@ -264,7 +305,7 @@ struct cpuid_dependent_feature { u32 level; }; -static const struct cpuid_dependent_feature __cpuinitconst +static const struct cpuid_dependent_feature cpuid_dependent_features[] = { { X86_FEATURE_MWAIT, 0x00000005 }, { X86_FEATURE_DCA, 0x00000009 }, @@ -272,7 +313,7 @@ cpuid_dependent_features[] = { { 0, 0 } }; -static void __cpuinit filter_cpuid_features(struct cpuinfo_x86 *c, bool warn) +static void filter_cpuid_features(struct cpuinfo_x86 *c, bool warn) { const struct cpuid_dependent_feature *df; @@ -310,9 +351,10 @@ static void __cpuinit filter_cpuid_features(struct cpuinfo_x86 *c, bool warn) */ /* Look up CPU names by table lookup. */ -static const char *__cpuinit table_lookup_model(struct cpuinfo_x86 *c) +static const char *table_lookup_model(struct cpuinfo_x86 *c) { - const struct cpu_model_info *info; +#ifdef CONFIG_X86_32 + const struct legacy_cpu_model_info *info; if (c->x86_model >= 16) return NULL; /* Range check */ @@ -320,18 +362,19 @@ static const char *__cpuinit table_lookup_model(struct cpuinfo_x86 *c) if (!this_cpu) return NULL; - info = this_cpu->c_models; + info = this_cpu->legacy_models; - while (info && info->family) { + while (info->family) { if (info->family == c->x86) return info->model_names[c->x86_model]; info++; } +#endif return NULL; /* Not found */ } -__u32 cpu_caps_cleared[NCAPINTS] __cpuinitdata; -__u32 cpu_caps_set[NCAPINTS] __cpuinitdata; +__u32 cpu_caps_cleared[NCAPINTS]; +__u32 cpu_caps_set[NCAPINTS]; void load_percpu_segment(int cpu) { @@ -360,9 +403,9 @@ void switch_to_new_gdt(int cpu) load_percpu_segment(cpu); } -static const struct cpu_dev *__cpuinitdata cpu_devs[X86_VENDOR_NUM] = {}; +static const struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; -static void __cpuinit get_model_name(struct cpuinfo_x86 *c) +static void get_model_name(struct cpuinfo_x86 *c) { unsigned int *v; char *p, *q; @@ -391,7 +434,7 @@ static void __cpuinit get_model_name(struct cpuinfo_x86 *c) } } -void __cpuinit cpu_detect_cache_sizes(struct cpuinfo_x86 *c) +void cpu_detect_cache_sizes(struct cpuinfo_x86 *c) { unsigned int n, dummy, ebx, ecx, edx, l2size; @@ -416,8 +459,8 @@ void __cpuinit cpu_detect_cache_sizes(struct cpuinfo_x86 *c) c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff); #else /* do processor-specific cache resizing */ - if (this_cpu->c_size_cache) - l2size = this_cpu->c_size_cache(c, l2size); + if (this_cpu->legacy_cache_size) + l2size = this_cpu->legacy_cache_size(c, l2size); /* Allow user to override all this if necessary. */ if (cachesize_override != -1) @@ -430,7 +473,37 @@ void __cpuinit cpu_detect_cache_sizes(struct cpuinfo_x86 *c) c->x86_cache_size = l2size; } -void __cpuinit detect_ht(struct cpuinfo_x86 *c) +u16 __read_mostly tlb_lli_4k[NR_INFO]; +u16 __read_mostly tlb_lli_2m[NR_INFO]; +u16 __read_mostly tlb_lli_4m[NR_INFO]; +u16 __read_mostly tlb_lld_4k[NR_INFO]; +u16 __read_mostly tlb_lld_2m[NR_INFO]; +u16 __read_mostly tlb_lld_4m[NR_INFO]; +u16 __read_mostly tlb_lld_1g[NR_INFO]; + +/* + * tlb_flushall_shift shows the balance point in replacing cr3 write + * with multiple 'invlpg'. It will do this replacement when + * flush_tlb_lines <= active_lines/2^tlb_flushall_shift. + * If tlb_flushall_shift is -1, means the replacement will be disabled. + */ +s8 __read_mostly tlb_flushall_shift = -1; + +void cpu_detect_tlb(struct cpuinfo_x86 *c) +{ + if (this_cpu->c_detect_tlb) + this_cpu->c_detect_tlb(c); + + printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n" + "Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n" + "tlb_flushall_shift: %d\n", + tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES], + tlb_lli_4m[ENTRIES], tlb_lld_4k[ENTRIES], + tlb_lld_2m[ENTRIES], tlb_lld_4m[ENTRIES], + tlb_lld_1g[ENTRIES], tlb_flushall_shift); +} + +void detect_ht(struct cpuinfo_x86 *c) { #ifdef CONFIG_X86_HT u32 eax, ebx, ecx, edx; @@ -458,13 +531,6 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c) if (smp_num_siblings <= 1) goto out; - if (smp_num_siblings > nr_cpu_ids) { - pr_warning("CPU: Unsupported number of siblings %d", - smp_num_siblings); - smp_num_siblings = 1; - return; - } - index_msb = get_count_order(smp_num_siblings); c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb); @@ -488,7 +554,7 @@ out: #endif } -static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) +static void get_cpu_vendor(struct cpuinfo_x86 *c) { char *v = c->x86_vendor_id; int i; @@ -515,7 +581,7 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) this_cpu = &default_cpu; } -void __cpuinit cpu_detect(struct cpuinfo_x86 *c) +void cpu_detect(struct cpuinfo_x86 *c) { /* Get vendor name */ cpuid(0x00000000, (unsigned int *)&c->cpuid_level, @@ -545,7 +611,7 @@ void __cpuinit cpu_detect(struct cpuinfo_x86 *c) } } -void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) +void get_cpu_cap(struct cpuinfo_x86 *c) { u32 tfms, xlvl; u32 ebx; @@ -565,8 +631,7 @@ void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx); - if (eax > 0) - c->x86_capability[9] = ebx; + c->x86_capability[9] = ebx; } /* AMD-defined flags: level 0x80000001 */ @@ -597,7 +662,7 @@ void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) init_scattered_cpuid_features(c); } -static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c) +static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) { #ifdef CONFIG_X86_32 int i; @@ -656,18 +721,20 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) return; cpu_detect(c); - get_cpu_vendor(c); - get_cpu_cap(c); + fpu_detect(c); if (this_cpu->c_early_init) this_cpu->c_early_init(c); -#ifdef CONFIG_SMP c->cpu_index = 0; -#endif filter_cpuid_features(c, false); + + if (this_cpu->c_bsp_init) + this_cpu->c_bsp_init(c); + + setup_force_cpu_cap(X86_FEATURE_ALWAYS); } void __init early_cpu_init(void) @@ -675,7 +742,7 @@ void __init early_cpu_init(void) const struct cpu_dev *const *cdev; int count = 0; -#ifdef PROCESSOR_SELECT +#ifdef CONFIG_PROCESSOR_SELECT printk(KERN_INFO "KERNEL supported cpus:\n"); #endif @@ -687,7 +754,7 @@ void __init early_cpu_init(void) cpu_devs[count] = cpudev; count++; -#ifdef PROCESSOR_SELECT +#ifdef CONFIG_PROCESSOR_SELECT { unsigned int j; @@ -712,7 +779,7 @@ void __init early_cpu_init(void) * unless we can find a reliable way to detect all the broken cases. * Enable it explicitly on 64-bit for non-constant inputs of cpu_has(). */ -static void __cpuinit detect_nopl(struct cpuinfo_x86 *c) +static void detect_nopl(struct cpuinfo_x86 *c) { #ifdef CONFIG_X86_32 clear_cpu_cap(c, X86_FEATURE_NOPL); @@ -721,7 +788,7 @@ static void __cpuinit detect_nopl(struct cpuinfo_x86 *c) #endif } -static void __cpuinit generic_identify(struct cpuinfo_x86 *c) +static void generic_identify(struct cpuinfo_x86 *c) { c->extended_cpuid_level = 0; @@ -747,10 +814,7 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c) c->apicid = c->initial_apicid; # endif #endif - -#ifdef CONFIG_X86_HT c->phys_proc_id = c->initial_apicid; -#endif } get_model_name(c); /* Default name */ @@ -761,7 +825,7 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c) /* * This does the hard work of actually picking apart the CPU stuff... */ -static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) +static void identify_cpu(struct cpuinfo_x86 *c) { int i; @@ -817,6 +881,10 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) /* Disable the PN if appropriate */ squash_the_stupid_serial_number(c); + /* Set up SMEP/SMAP */ + setup_smep(c); + setup_smap(c); + /* * The vendor-specific functions might have changed features. * Now we do "generic changes." @@ -842,6 +910,7 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) #endif init_hypervisor(c); + x86_init_rdrand(c); /* * Clear/Set all flags overriden by options, need do it @@ -862,6 +931,10 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) /* AND the already accumulated flags with these */ for (i = 0; i < NCAPINTS; i++) boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; + + /* OR, i.e. replicate the bug flags */ + for (i = NCAPINTS; i < NCAPINTS + NBUGINTS; i++) + c->x86_capability[i] |= boot_cpu_data.x86_capability[i]; } /* Init Machine Check Exception if available. */ @@ -869,7 +942,7 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) select_idle_routine(c); -#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) +#ifdef CONFIG_NUMA numa_add_cpu(smp_processor_id()); #endif } @@ -882,22 +955,54 @@ static void vgetcpu_set_mode(void) else vgetcpu_mode = VGETCPU_LSL; } + +/* May not be __init: called during resume */ +static void syscall32_cpu_init(void) +{ + /* Load these always in case some future AMD CPU supports + SYSENTER from compat mode too. */ + wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); + wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); + wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target); + + wrmsrl(MSR_CSTAR, ia32_cstar_target); +} +#endif + +#ifdef CONFIG_X86_32 +void enable_sep_cpu(void) +{ + int cpu = get_cpu(); + struct tss_struct *tss = &per_cpu(init_tss, cpu); + + if (!boot_cpu_has(X86_FEATURE_SEP)) { + put_cpu(); + return; + } + + tss->x86_tss.ss1 = __KERNEL_CS; + tss->x86_tss.sp1 = sizeof(struct tss_struct) + (unsigned long) tss; + wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); + wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.sp1, 0); + wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) ia32_sysenter_target, 0); + put_cpu(); +} #endif void __init identify_boot_cpu(void) { identify_cpu(&boot_cpu_data); - init_c1e_mask(); + init_amd_e400_c1e_mask(); #ifdef CONFIG_X86_32 sysenter_setup(); enable_sep_cpu(); #else vgetcpu_set_mode(); #endif - init_hw_perf_events(); + cpu_detect_tlb(&boot_cpu_data); } -void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) +void identify_secondary_cpu(struct cpuinfo_x86 *c) { BUG_ON(c == &boot_cpu_data); identify_cpu(c); @@ -912,14 +1017,14 @@ struct msr_range { unsigned max; }; -static const struct msr_range msr_range_array[] __cpuinitconst = { +static const struct msr_range msr_range_array[] = { { 0x00000000, 0x00000418}, { 0xc0000000, 0xc000040b}, { 0xc0010000, 0xc0010142}, { 0xc0011000, 0xc001103b}, }; -static void __cpuinit print_cpu_msr(void) +static void __print_cpu_msr(void) { unsigned index_min, index_max; unsigned index; @@ -931,14 +1036,14 @@ static void __cpuinit print_cpu_msr(void) index_max = msr_range_array[i].max; for (index = index_min; index < index_max; index++) { - if (rdmsrl_amd_safe(index, &val)) + if (rdmsrl_safe(index, &val)) continue; printk(KERN_INFO " MSR%08x: %016llx\n", index, val); } } } -static int show_msr __cpuinitdata; +static int show_msr; static __init int setup_show_msr(char *arg) { @@ -954,12 +1059,13 @@ __setup("show_msr=", setup_show_msr); static __init int setup_noclflush(char *arg) { - setup_clear_cpu_cap(X86_FEATURE_CLFLSH); + setup_clear_cpu_cap(X86_FEATURE_CLFLUSH); + setup_clear_cpu_cap(X86_FEATURE_CLFLUSHOPT); return 1; } __setup("noclflush", setup_noclflush); -void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) +void print_cpu_info(struct cpuinfo_x86 *c) { const char *vendor = NULL; @@ -974,22 +1080,24 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) printk(KERN_CONT "%s ", vendor); if (c->x86_model_id[0]) - printk(KERN_CONT "%s", c->x86_model_id); + printk(KERN_CONT "%s", strim(c->x86_model_id)); else printk(KERN_CONT "%d86", c->x86); + printk(KERN_CONT " (fam: %02x, model: %02x", c->x86, c->x86_model); + if (c->x86_mask || c->cpuid_level >= 0) - printk(KERN_CONT " stepping %02x\n", c->x86_mask); + printk(KERN_CONT ", stepping: %02x)\n", c->x86_mask); else - printk(KERN_CONT "\n"); + printk(KERN_CONT ")\n"); -#ifdef CONFIG_SMP + print_cpu_msr(c); +} + +void print_cpu_msr(struct cpuinfo_x86 *c) +{ if (c->cpu_index < show_msr) - print_cpu_msr(); -#else - if (show_msr) - print_cpu_msr(); -#endif + __print_cpu_msr(); } static __init int setup_disablecpuid(char *arg) @@ -1005,11 +1113,17 @@ static __init int setup_disablecpuid(char *arg) } __setup("clearcpuid=", setup_disablecpuid); +DEFINE_PER_CPU(unsigned long, kernel_stack) = + (unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE; +EXPORT_PER_CPU_SYMBOL(kernel_stack); + #ifdef CONFIG_X86_64 struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table }; +struct desc_ptr debug_idt_descr = { NR_VECTORS * 16 - 1, + (unsigned long) debug_idt_table }; DEFINE_PER_CPU_FIRST(union irq_stack_union, - irq_stack_union) __aligned(PAGE_SIZE); + irq_stack_union) __aligned(PAGE_SIZE) __visible; /* * The following four percpu variables are hot. Align current_task to @@ -1019,14 +1133,15 @@ DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned = &init_task; EXPORT_PER_CPU_SYMBOL(current_task); -DEFINE_PER_CPU(unsigned long, kernel_stack) = - (unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE; -EXPORT_PER_CPU_SYMBOL(kernel_stack); - DEFINE_PER_CPU(char *, irq_stack_ptr) = init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64; -DEFINE_PER_CPU(unsigned int, irq_count) = -1; +DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1; + +DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; +EXPORT_PER_CPU_SYMBOL(__preempt_count); + +DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); /* * Special IST stacks which the CPU switches to when it calls @@ -1060,35 +1175,57 @@ void syscall_init(void) /* Flags to clear on syscall */ wrmsrl(MSR_SYSCALL_MASK, - X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL); + X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF| + X86_EFLAGS_IOPL|X86_EFLAGS_AC); } -unsigned long kernel_eflags; - /* * Copies of the original ist values from the tss are only accessed during * debugging, no special alignment required. */ DEFINE_PER_CPU(struct orig_ist, orig_ist); +static DEFINE_PER_CPU(unsigned long, debug_stack_addr); +DEFINE_PER_CPU(int, debug_stack_usage); + +int is_debug_stack(unsigned long addr) +{ + return __get_cpu_var(debug_stack_usage) || + (addr <= __get_cpu_var(debug_stack_addr) && + addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ)); +} +NOKPROBE_SYMBOL(is_debug_stack); + +DEFINE_PER_CPU(u32, debug_idt_ctr); + +void debug_stack_set_zero(void) +{ + this_cpu_inc(debug_idt_ctr); + load_current_idt(); +} +NOKPROBE_SYMBOL(debug_stack_set_zero); + +void debug_stack_reset(void) +{ + if (WARN_ON(!this_cpu_read(debug_idt_ctr))) + return; + if (this_cpu_dec_return(debug_idt_ctr) == 0) + load_current_idt(); +} +NOKPROBE_SYMBOL(debug_stack_reset); + #else /* CONFIG_X86_64 */ DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; EXPORT_PER_CPU_SYMBOL(current_task); +DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; +EXPORT_PER_CPU_SYMBOL(__preempt_count); +DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); #ifdef CONFIG_CC_STACKPROTECTOR DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); #endif -/* Make sure %fs and %gs are initialized properly in idle threads */ -struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs) -{ - memset(regs, 0, sizeof(struct pt_regs)); - regs->fs = __KERNEL_PERCPU; - regs->gs = __KERNEL_STACK_CANARY; - - return regs; -} #endif /* CONFIG_X86_64 */ /* @@ -1130,7 +1267,7 @@ static void dbg_restore_debug_regs(void) */ #ifdef CONFIG_X86_64 -void __cpuinit cpu_init(void) +void cpu_init(void) { struct orig_ist *oist; struct task_struct *me; @@ -1139,12 +1276,18 @@ void __cpuinit cpu_init(void) int cpu; int i; + /* + * Load microcode on this cpu if a valid microcode is available. + * This is early microcode loading procedure. + */ + load_ucode_ap(); + cpu = stack_smp_processor_id(); t = &per_cpu(init_tss, cpu); oist = &per_cpu(orig_ist, cpu); #ifdef CONFIG_NUMA - if (cpu != 0 && percpu_read(numa_node) == 0 && + if (this_cpu_read(numa_node) == 0 && early_cpu_to_node(cpu) != NUMA_NO_NODE) set_numa_node(early_cpu_to_node(cpu)); #endif @@ -1166,7 +1309,7 @@ void __cpuinit cpu_init(void) switch_to_new_gdt(cpu); loadsegment(fs, 0); - load_idt((const struct desc_ptr *)&idt_descr); + load_current_idt(); memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8); syscall_init(); @@ -1176,8 +1319,7 @@ void __cpuinit cpu_init(void) barrier(); x86_configure_nx(); - if (cpu != 0) - enable_x2apic(); + enable_x2apic(); /* * set up and load the per-CPU TSS @@ -1189,6 +1331,8 @@ void __cpuinit cpu_init(void) estacks += exception_stack_sizes[v]; oist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks; + if (v == DEBUG_STACK-1) + per_cpu(debug_stack_addr, cpu) = (unsigned long)estacks; } } @@ -1215,9 +1359,6 @@ void __cpuinit cpu_init(void) dbg_restore_debug_regs(); fpu_init(); - xsave_init(); - - raw_local_save_flags(kernel_eflags); if (is_uv_system()) uv_cpu_init(); @@ -1225,13 +1366,15 @@ void __cpuinit cpu_init(void) #else -void __cpuinit cpu_init(void) +void cpu_init(void) { int cpu = smp_processor_id(); struct task_struct *curr = current; struct tss_struct *t = &per_cpu(init_tss, cpu); struct thread_struct *thread = &curr->thread; + show_ucode_info_early(); + if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) { printk(KERN_WARNING "CPU#%d already initialized!\n", cpu); for (;;) @@ -1243,7 +1386,7 @@ void __cpuinit cpu_init(void) if (cpu_has_vme || cpu_has_tsc || cpu_has_de) clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); - load_idt(&idt_descr); + load_current_idt(); switch_to_new_gdt(cpu); /* @@ -1270,6 +1413,19 @@ void __cpuinit cpu_init(void) dbg_restore_debug_regs(); fpu_init(); - xsave_init(); } #endif + +#ifdef CONFIG_X86_DEBUG_STATIC_CPU_HAS +void warn_pre_alternatives(void) +{ + WARN(1, "You're using static_cpu_has before alternatives have run!\n"); +} +EXPORT_SYMBOL_GPL(warn_pre_alternatives); +#endif + +inline bool __static_cpu_has_safe(u16 bit) +{ + return boot_cpu_has(bit); +} +EXPORT_SYMBOL_GPL(__static_cpu_has_safe); diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index e765633f210..c37dc37e831 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -1,13 +1,6 @@ #ifndef ARCH_X86_CPU_H - #define ARCH_X86_CPU_H -struct cpu_model_info { - int vendor; - int family; - const char *model_names[16]; -}; - /* attempt to consolidate cpu attributes */ struct cpu_dev { const char *c_vendor; @@ -15,13 +8,31 @@ struct cpu_dev { /* some have two possibilities for cpuid string */ const char *c_ident[2]; - struct cpu_model_info c_models[4]; - void (*c_early_init)(struct cpuinfo_x86 *); + void (*c_bsp_init)(struct cpuinfo_x86 *); void (*c_init)(struct cpuinfo_x86 *); void (*c_identify)(struct cpuinfo_x86 *); - unsigned int (*c_size_cache)(struct cpuinfo_x86 *, unsigned int); + void (*c_detect_tlb)(struct cpuinfo_x86 *); int c_x86_vendor; +#ifdef CONFIG_X86_32 + /* Optional vendor specific routine to obtain the cache size. */ + unsigned int (*legacy_cache_size)(struct cpuinfo_x86 *, + unsigned int); + + /* Family/stepping-based lookup table for model names. */ + struct legacy_cpu_model_info { + int family; + const char *model_names[16]; + } legacy_models[5]; +#endif +}; + +struct _tlb_table { + unsigned char descriptor; + char tlb_type; + unsigned int entries; + /* unsigned int ways; */ + char info[128]; }; #define cpu_dev_register(cpu_devX) \ @@ -34,6 +45,4 @@ extern const struct cpu_dev *const __x86_cpu_dev_start[], extern void get_cpu_cap(struct cpuinfo_x86 *c); extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c); -extern void get_cpu_cap(struct cpuinfo_x86 *c); - -#endif +#endif /* ARCH_X86_CPU_H */ diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig deleted file mode 100644 index 870e6cc6ad2..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/Kconfig +++ /dev/null @@ -1,266 +0,0 @@ -# -# CPU Frequency scaling -# - -menu "CPU Frequency scaling" - -source "drivers/cpufreq/Kconfig" - -if CPU_FREQ - -comment "CPUFreq processor drivers" - -config X86_PCC_CPUFREQ - tristate "Processor Clocking Control interface driver" - depends on ACPI && ACPI_PROCESSOR - help - This driver adds support for the PCC interface. - - For details, take a look at: - <file:Documentation/cpu-freq/pcc-cpufreq.txt>. - - To compile this driver as a module, choose M here: the - module will be called pcc-cpufreq. - - If in doubt, say N. - -config X86_ACPI_CPUFREQ - tristate "ACPI Processor P-States driver" - select CPU_FREQ_TABLE - depends on ACPI_PROCESSOR - help - This driver adds a CPUFreq driver which utilizes the ACPI - Processor Performance States. - This driver also supports Intel Enhanced Speedstep. - - To compile this driver as a module, choose M here: the - module will be called acpi-cpufreq. - - For details, take a look at <file:Documentation/cpu-freq/>. - - If in doubt, say N. - -config ELAN_CPUFREQ - tristate "AMD Elan SC400 and SC410" - select CPU_FREQ_TABLE - depends on X86_ELAN - ---help--- - This adds the CPUFreq driver for AMD Elan SC400 and SC410 - processors. - - You need to specify the processor maximum speed as boot - parameter: elanfreq=maxspeed (in kHz) or as module - parameter "max_freq". - - For details, take a look at <file:Documentation/cpu-freq/>. - - If in doubt, say N. - -config SC520_CPUFREQ - tristate "AMD Elan SC520" - select CPU_FREQ_TABLE - depends on X86_ELAN - ---help--- - This adds the CPUFreq driver for AMD Elan SC520 processor. - - For details, take a look at <file:Documentation/cpu-freq/>. - - If in doubt, say N. - - -config X86_POWERNOW_K6 - tristate "AMD Mobile K6-2/K6-3 PowerNow!" - select CPU_FREQ_TABLE - depends on X86_32 - help - This adds the CPUFreq driver for mobile AMD K6-2+ and mobile - AMD K6-3+ processors. - - For details, take a look at <file:Documentation/cpu-freq/>. - - If in doubt, say N. - -config X86_POWERNOW_K7 - tristate "AMD Mobile Athlon/Duron PowerNow!" - select CPU_FREQ_TABLE - depends on X86_32 - help - This adds the CPUFreq driver for mobile AMD K7 mobile processors. - - For details, take a look at <file:Documentation/cpu-freq/>. - - If in doubt, say N. - -config X86_POWERNOW_K7_ACPI - bool - depends on X86_POWERNOW_K7 && ACPI_PROCESSOR - depends on !(X86_POWERNOW_K7 = y && ACPI_PROCESSOR = m) - depends on X86_32 - default y - -config X86_POWERNOW_K8 - tristate "AMD Opteron/Athlon64 PowerNow!" - select CPU_FREQ_TABLE - depends on ACPI && ACPI_PROCESSOR - help - This adds the CPUFreq driver for K8/K10 Opteron/Athlon64 processors. - - To compile this driver as a module, choose M here: the - module will be called powernow-k8. - - For details, take a look at <file:Documentation/cpu-freq/>. - -config X86_GX_SUSPMOD - tristate "Cyrix MediaGX/NatSemi Geode Suspend Modulation" - depends on X86_32 && PCI - help - This add the CPUFreq driver for NatSemi Geode processors which - support suspend modulation. - - For details, take a look at <file:Documentation/cpu-freq/>. - - If in doubt, say N. - -config X86_SPEEDSTEP_CENTRINO - tristate "Intel Enhanced SpeedStep (deprecated)" - select CPU_FREQ_TABLE - select X86_SPEEDSTEP_CENTRINO_TABLE if X86_32 - depends on X86_32 || (X86_64 && ACPI_PROCESSOR) - help - This is deprecated and this functionality is now merged into - acpi_cpufreq (X86_ACPI_CPUFREQ). Use that driver instead of - speedstep_centrino. - This adds the CPUFreq driver for Enhanced SpeedStep enabled - mobile CPUs. This means Intel Pentium M (Centrino) CPUs - or 64bit enabled Intel Xeons. - - To compile this driver as a module, choose M here: the - module will be called speedstep-centrino. - - For details, take a look at <file:Documentation/cpu-freq/>. - - If in doubt, say N. - -config X86_SPEEDSTEP_CENTRINO_TABLE - bool "Built-in tables for Banias CPUs" - depends on X86_32 && X86_SPEEDSTEP_CENTRINO - default y - help - Use built-in tables for Banias CPUs if ACPI encoding - is not available. - - If in doubt, say N. - -config X86_SPEEDSTEP_ICH - tristate "Intel Speedstep on ICH-M chipsets (ioport interface)" - select CPU_FREQ_TABLE - depends on X86_32 - help - This adds the CPUFreq driver for certain mobile Intel Pentium III - (Coppermine), all mobile Intel Pentium III-M (Tualatin) and all - mobile Intel Pentium 4 P4-M on systems which have an Intel ICH2, - ICH3 or ICH4 southbridge. - - For details, take a look at <file:Documentation/cpu-freq/>. - - If in doubt, say N. - -config X86_SPEEDSTEP_SMI - tristate "Intel SpeedStep on 440BX/ZX/MX chipsets (SMI interface)" - select CPU_FREQ_TABLE - depends on X86_32 && EXPERIMENTAL - help - This adds the CPUFreq driver for certain mobile Intel Pentium III - (Coppermine), all mobile Intel Pentium III-M (Tualatin) - on systems which have an Intel 440BX/ZX/MX southbridge. - - For details, take a look at <file:Documentation/cpu-freq/>. - - If in doubt, say N. - -config X86_P4_CLOCKMOD - tristate "Intel Pentium 4 clock modulation" - select CPU_FREQ_TABLE - help - This adds the CPUFreq driver for Intel Pentium 4 / XEON - processors. When enabled it will lower CPU temperature by skipping - clocks. - - This driver should be only used in exceptional - circumstances when very low power is needed because it causes severe - slowdowns and noticeable latencies. Normally Speedstep should be used - instead. - - To compile this driver as a module, choose M here: the - module will be called p4-clockmod. - - For details, take a look at <file:Documentation/cpu-freq/>. - - Unless you are absolutely sure say N. - -config X86_CPUFREQ_NFORCE2 - tristate "nVidia nForce2 FSB changing" - depends on X86_32 && EXPERIMENTAL - help - This adds the CPUFreq driver for FSB changing on nVidia nForce2 - platforms. - - For details, take a look at <file:Documentation/cpu-freq/>. - - If in doubt, say N. - -config X86_LONGRUN - tristate "Transmeta LongRun" - depends on X86_32 - help - This adds the CPUFreq driver for Transmeta Crusoe and Efficeon processors - which support LongRun. - - For details, take a look at <file:Documentation/cpu-freq/>. - - If in doubt, say N. - -config X86_LONGHAUL - tristate "VIA Cyrix III Longhaul" - select CPU_FREQ_TABLE - depends on X86_32 && ACPI_PROCESSOR - help - This adds the CPUFreq driver for VIA Samuel/CyrixIII, - VIA Cyrix Samuel/C3, VIA Cyrix Ezra and VIA Cyrix Ezra-T - processors. - - For details, take a look at <file:Documentation/cpu-freq/>. - - If in doubt, say N. - -config X86_E_POWERSAVER - tristate "VIA C7 Enhanced PowerSaver (DANGEROUS)" - select CPU_FREQ_TABLE - depends on X86_32 && EXPERIMENTAL - help - This adds the CPUFreq driver for VIA C7 processors. However, this driver - does not have any safeguards to prevent operating the CPU out of spec - and is thus considered dangerous. Please use the regular ACPI cpufreq - driver, enabled by CONFIG_X86_ACPI_CPUFREQ. - - If in doubt, say N. - -comment "shared options" - -config X86_SPEEDSTEP_LIB - tristate - default (X86_SPEEDSTEP_ICH || X86_SPEEDSTEP_SMI || X86_P4_CLOCKMOD) - -config X86_SPEEDSTEP_RELAXED_CAP_CHECK - bool "Relaxed speedstep capability checks" - depends on X86_32 && (X86_SPEEDSTEP_SMI || X86_SPEEDSTEP_ICH) - help - Don't perform all checks for a speedstep capable system which would - normally be done. Some ancient or strange systems, though speedstep - capable, don't always indicate that they are speedstep capable. This - option lets the probing code bypass some of those checks if the - parameter "relaxed_check=1" is passed to the module. - -endif # CPU_FREQ - -endmenu diff --git a/arch/x86/kernel/cpu/cpufreq/Makefile b/arch/x86/kernel/cpu/cpufreq/Makefile deleted file mode 100644 index bd54bf67e6f..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/Makefile +++ /dev/null @@ -1,21 +0,0 @@ -# Link order matters. K8 is preferred to ACPI because of firmware bugs in early -# K8 systems. ACPI is preferred to all other hardware-specific drivers. -# speedstep-* is preferred over p4-clockmod. - -obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o mperf.o -obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o mperf.o -obj-$(CONFIG_X86_PCC_CPUFREQ) += pcc-cpufreq.o -obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o -obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o -obj-$(CONFIG_X86_LONGHAUL) += longhaul.o -obj-$(CONFIG_X86_E_POWERSAVER) += e_powersaver.o -obj-$(CONFIG_ELAN_CPUFREQ) += elanfreq.o -obj-$(CONFIG_SC520_CPUFREQ) += sc520_freq.o -obj-$(CONFIG_X86_LONGRUN) += longrun.o -obj-$(CONFIG_X86_GX_SUSPMOD) += gx-suspmod.o -obj-$(CONFIG_X86_SPEEDSTEP_ICH) += speedstep-ich.o -obj-$(CONFIG_X86_SPEEDSTEP_LIB) += speedstep-lib.o -obj-$(CONFIG_X86_SPEEDSTEP_SMI) += speedstep-smi.o -obj-$(CONFIG_X86_SPEEDSTEP_CENTRINO) += speedstep-centrino.o -obj-$(CONFIG_X86_P4_CLOCKMOD) += p4-clockmod.o -obj-$(CONFIG_X86_CPUFREQ_NFORCE2) += cpufreq-nforce2.o diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c deleted file mode 100644 index a2baafb2fe6..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ /dev/null @@ -1,776 +0,0 @@ -/* - * acpi-cpufreq.c - ACPI Processor P-States Driver - * - * Copyright (C) 2001, 2002 Andy Grover <andrew.grover@intel.com> - * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com> - * Copyright (C) 2002 - 2004 Dominik Brodowski <linux@brodo.de> - * Copyright (C) 2006 Denis Sadykov <denis.m.sadykov@intel.com> - * - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or (at - * your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write to the Free Software Foundation, Inc., - * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. - * - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/smp.h> -#include <linux/sched.h> -#include <linux/cpufreq.h> -#include <linux/compiler.h> -#include <linux/dmi.h> -#include <linux/slab.h> - -#include <linux/acpi.h> -#include <linux/io.h> -#include <linux/delay.h> -#include <linux/uaccess.h> - -#include <acpi/processor.h> - -#include <asm/msr.h> -#include <asm/processor.h> -#include <asm/cpufeature.h> -#include "mperf.h" - -#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ - "acpi-cpufreq", msg) - -MODULE_AUTHOR("Paul Diefenbaugh, Dominik Brodowski"); -MODULE_DESCRIPTION("ACPI Processor P-States Driver"); -MODULE_LICENSE("GPL"); - -enum { - UNDEFINED_CAPABLE = 0, - SYSTEM_INTEL_MSR_CAPABLE, - SYSTEM_IO_CAPABLE, -}; - -#define INTEL_MSR_RANGE (0xffff) - -struct acpi_cpufreq_data { - struct acpi_processor_performance *acpi_data; - struct cpufreq_frequency_table *freq_table; - unsigned int resume; - unsigned int cpu_feature; -}; - -static DEFINE_PER_CPU(struct acpi_cpufreq_data *, acfreq_data); - -/* acpi_perf_data is a pointer to percpu data. */ -static struct acpi_processor_performance __percpu *acpi_perf_data; - -static struct cpufreq_driver acpi_cpufreq_driver; - -static unsigned int acpi_pstate_strict; - -static int check_est_cpu(unsigned int cpuid) -{ - struct cpuinfo_x86 *cpu = &cpu_data(cpuid); - - return cpu_has(cpu, X86_FEATURE_EST); -} - -static unsigned extract_io(u32 value, struct acpi_cpufreq_data *data) -{ - struct acpi_processor_performance *perf; - int i; - - perf = data->acpi_data; - - for (i = 0; i < perf->state_count; i++) { - if (value == perf->states[i].status) - return data->freq_table[i].frequency; - } - return 0; -} - -static unsigned extract_msr(u32 msr, struct acpi_cpufreq_data *data) -{ - int i; - struct acpi_processor_performance *perf; - - msr &= INTEL_MSR_RANGE; - perf = data->acpi_data; - - for (i = 0; data->freq_table[i].frequency != CPUFREQ_TABLE_END; i++) { - if (msr == perf->states[data->freq_table[i].index].status) - return data->freq_table[i].frequency; - } - return data->freq_table[0].frequency; -} - -static unsigned extract_freq(u32 val, struct acpi_cpufreq_data *data) -{ - switch (data->cpu_feature) { - case SYSTEM_INTEL_MSR_CAPABLE: - return extract_msr(val, data); - case SYSTEM_IO_CAPABLE: - return extract_io(val, data); - default: - return 0; - } -} - -struct msr_addr { - u32 reg; -}; - -struct io_addr { - u16 port; - u8 bit_width; -}; - -struct drv_cmd { - unsigned int type; - const struct cpumask *mask; - union { - struct msr_addr msr; - struct io_addr io; - } addr; - u32 val; -}; - -/* Called via smp_call_function_single(), on the target CPU */ -static void do_drv_read(void *_cmd) -{ - struct drv_cmd *cmd = _cmd; - u32 h; - - switch (cmd->type) { - case SYSTEM_INTEL_MSR_CAPABLE: - rdmsr(cmd->addr.msr.reg, cmd->val, h); - break; - case SYSTEM_IO_CAPABLE: - acpi_os_read_port((acpi_io_address)cmd->addr.io.port, - &cmd->val, - (u32)cmd->addr.io.bit_width); - break; - default: - break; - } -} - -/* Called via smp_call_function_many(), on the target CPUs */ -static void do_drv_write(void *_cmd) -{ - struct drv_cmd *cmd = _cmd; - u32 lo, hi; - - switch (cmd->type) { - case SYSTEM_INTEL_MSR_CAPABLE: - rdmsr(cmd->addr.msr.reg, lo, hi); - lo = (lo & ~INTEL_MSR_RANGE) | (cmd->val & INTEL_MSR_RANGE); - wrmsr(cmd->addr.msr.reg, lo, hi); - break; - case SYSTEM_IO_CAPABLE: - acpi_os_write_port((acpi_io_address)cmd->addr.io.port, - cmd->val, - (u32)cmd->addr.io.bit_width); - break; - default: - break; - } -} - -static void drv_read(struct drv_cmd *cmd) -{ - int err; - cmd->val = 0; - - err = smp_call_function_any(cmd->mask, do_drv_read, cmd, 1); - WARN_ON_ONCE(err); /* smp_call_function_any() was buggy? */ -} - -static void drv_write(struct drv_cmd *cmd) -{ - int this_cpu; - - this_cpu = get_cpu(); - if (cpumask_test_cpu(this_cpu, cmd->mask)) - do_drv_write(cmd); - smp_call_function_many(cmd->mask, do_drv_write, cmd, 1); - put_cpu(); -} - -static u32 get_cur_val(const struct cpumask *mask) -{ - struct acpi_processor_performance *perf; - struct drv_cmd cmd; - - if (unlikely(cpumask_empty(mask))) - return 0; - - switch (per_cpu(acfreq_data, cpumask_first(mask))->cpu_feature) { - case SYSTEM_INTEL_MSR_CAPABLE: - cmd.type = SYSTEM_INTEL_MSR_CAPABLE; - cmd.addr.msr.reg = MSR_IA32_PERF_STATUS; - break; - case SYSTEM_IO_CAPABLE: - cmd.type = SYSTEM_IO_CAPABLE; - perf = per_cpu(acfreq_data, cpumask_first(mask))->acpi_data; - cmd.addr.io.port = perf->control_register.address; - cmd.addr.io.bit_width = perf->control_register.bit_width; - break; - default: - return 0; - } - - cmd.mask = mask; - drv_read(&cmd); - - dprintk("get_cur_val = %u\n", cmd.val); - - return cmd.val; -} - -static unsigned int get_cur_freq_on_cpu(unsigned int cpu) -{ - struct acpi_cpufreq_data *data = per_cpu(acfreq_data, cpu); - unsigned int freq; - unsigned int cached_freq; - - dprintk("get_cur_freq_on_cpu (%d)\n", cpu); - - if (unlikely(data == NULL || - data->acpi_data == NULL || data->freq_table == NULL)) { - return 0; - } - - cached_freq = data->freq_table[data->acpi_data->state].frequency; - freq = extract_freq(get_cur_val(cpumask_of(cpu)), data); - if (freq != cached_freq) { - /* - * The dreaded BIOS frequency change behind our back. - * Force set the frequency on next target call. - */ - data->resume = 1; - } - - dprintk("cur freq = %u\n", freq); - - return freq; -} - -static unsigned int check_freqs(const struct cpumask *mask, unsigned int freq, - struct acpi_cpufreq_data *data) -{ - unsigned int cur_freq; - unsigned int i; - - for (i = 0; i < 100; i++) { - cur_freq = extract_freq(get_cur_val(mask), data); - if (cur_freq == freq) - return 1; - udelay(10); - } - return 0; -} - -static int acpi_cpufreq_target(struct cpufreq_policy *policy, - unsigned int target_freq, unsigned int relation) -{ - struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu); - struct acpi_processor_performance *perf; - struct cpufreq_freqs freqs; - struct drv_cmd cmd; - unsigned int next_state = 0; /* Index into freq_table */ - unsigned int next_perf_state = 0; /* Index into perf table */ - unsigned int i; - int result = 0; - - dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu); - - if (unlikely(data == NULL || - data->acpi_data == NULL || data->freq_table == NULL)) { - return -ENODEV; - } - - perf = data->acpi_data; - result = cpufreq_frequency_table_target(policy, - data->freq_table, - target_freq, - relation, &next_state); - if (unlikely(result)) { - result = -ENODEV; - goto out; - } - - next_perf_state = data->freq_table[next_state].index; - if (perf->state == next_perf_state) { - if (unlikely(data->resume)) { - dprintk("Called after resume, resetting to P%d\n", - next_perf_state); - data->resume = 0; - } else { - dprintk("Already at target state (P%d)\n", - next_perf_state); - goto out; - } - } - - switch (data->cpu_feature) { - case SYSTEM_INTEL_MSR_CAPABLE: - cmd.type = SYSTEM_INTEL_MSR_CAPABLE; - cmd.addr.msr.reg = MSR_IA32_PERF_CTL; - cmd.val = (u32) perf->states[next_perf_state].control; - break; - case SYSTEM_IO_CAPABLE: - cmd.type = SYSTEM_IO_CAPABLE; - cmd.addr.io.port = perf->control_register.address; - cmd.addr.io.bit_width = perf->control_register.bit_width; - cmd.val = (u32) perf->states[next_perf_state].control; - break; - default: - result = -ENODEV; - goto out; - } - - /* cpufreq holds the hotplug lock, so we are safe from here on */ - if (policy->shared_type != CPUFREQ_SHARED_TYPE_ANY) - cmd.mask = policy->cpus; - else - cmd.mask = cpumask_of(policy->cpu); - - freqs.old = perf->states[perf->state].core_frequency * 1000; - freqs.new = data->freq_table[next_state].frequency; - for_each_cpu(i, policy->cpus) { - freqs.cpu = i; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - } - - drv_write(&cmd); - - if (acpi_pstate_strict) { - if (!check_freqs(cmd.mask, freqs.new, data)) { - dprintk("acpi_cpufreq_target failed (%d)\n", - policy->cpu); - result = -EAGAIN; - goto out; - } - } - - for_each_cpu(i, policy->cpus) { - freqs.cpu = i; - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - } - perf->state = next_perf_state; - -out: - return result; -} - -static int acpi_cpufreq_verify(struct cpufreq_policy *policy) -{ - struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu); - - dprintk("acpi_cpufreq_verify\n"); - - return cpufreq_frequency_table_verify(policy, data->freq_table); -} - -static unsigned long -acpi_cpufreq_guess_freq(struct acpi_cpufreq_data *data, unsigned int cpu) -{ - struct acpi_processor_performance *perf = data->acpi_data; - - if (cpu_khz) { - /* search the closest match to cpu_khz */ - unsigned int i; - unsigned long freq; - unsigned long freqn = perf->states[0].core_frequency * 1000; - - for (i = 0; i < (perf->state_count-1); i++) { - freq = freqn; - freqn = perf->states[i+1].core_frequency * 1000; - if ((2 * cpu_khz) > (freqn + freq)) { - perf->state = i; - return freq; - } - } - perf->state = perf->state_count-1; - return freqn; - } else { - /* assume CPU is at P0... */ - perf->state = 0; - return perf->states[0].core_frequency * 1000; - } -} - -static void free_acpi_perf_data(void) -{ - unsigned int i; - - /* Freeing a NULL pointer is OK, and alloc_percpu zeroes. */ - for_each_possible_cpu(i) - free_cpumask_var(per_cpu_ptr(acpi_perf_data, i) - ->shared_cpu_map); - free_percpu(acpi_perf_data); -} - -/* - * acpi_cpufreq_early_init - initialize ACPI P-States library - * - * Initialize the ACPI P-States library (drivers/acpi/processor_perflib.c) - * in order to determine correct frequency and voltage pairings. We can - * do _PDC and _PSD and find out the processor dependency for the - * actual init that will happen later... - */ -static int __init acpi_cpufreq_early_init(void) -{ - unsigned int i; - dprintk("acpi_cpufreq_early_init\n"); - - acpi_perf_data = alloc_percpu(struct acpi_processor_performance); - if (!acpi_perf_data) { - dprintk("Memory allocation error for acpi_perf_data.\n"); - return -ENOMEM; - } - for_each_possible_cpu(i) { - if (!zalloc_cpumask_var_node( - &per_cpu_ptr(acpi_perf_data, i)->shared_cpu_map, - GFP_KERNEL, cpu_to_node(i))) { - - /* Freeing a NULL pointer is OK: alloc_percpu zeroes. */ - free_acpi_perf_data(); - return -ENOMEM; - } - } - - /* Do initialization in ACPI core */ - acpi_processor_preregister_performance(acpi_perf_data); - return 0; -} - -#ifdef CONFIG_SMP -/* - * Some BIOSes do SW_ANY coordination internally, either set it up in hw - * or do it in BIOS firmware and won't inform about it to OS. If not - * detected, this has a side effect of making CPU run at a different speed - * than OS intended it to run at. Detect it and handle it cleanly. - */ -static int bios_with_sw_any_bug; - -static int sw_any_bug_found(const struct dmi_system_id *d) -{ - bios_with_sw_any_bug = 1; - return 0; -} - -static const struct dmi_system_id sw_any_bug_dmi_table[] = { - { - .callback = sw_any_bug_found, - .ident = "Supermicro Server X6DLP", - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "Supermicro"), - DMI_MATCH(DMI_BIOS_VERSION, "080010"), - DMI_MATCH(DMI_PRODUCT_NAME, "X6DLP"), - }, - }, - { } -}; - -static int acpi_cpufreq_blacklist(struct cpuinfo_x86 *c) -{ - /* Intel Xeon Processor 7100 Series Specification Update - * http://www.intel.com/Assets/PDF/specupdate/314554.pdf - * AL30: A Machine Check Exception (MCE) Occurring during an - * Enhanced Intel SpeedStep Technology Ratio Change May Cause - * Both Processor Cores to Lock Up. */ - if (c->x86_vendor == X86_VENDOR_INTEL) { - if ((c->x86 == 15) && - (c->x86_model == 6) && - (c->x86_mask == 8)) { - printk(KERN_INFO "acpi-cpufreq: Intel(R) " - "Xeon(R) 7100 Errata AL30, processors may " - "lock up on frequency changes: disabling " - "acpi-cpufreq.\n"); - return -ENODEV; - } - } - return 0; -} -#endif - -static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) -{ - unsigned int i; - unsigned int valid_states = 0; - unsigned int cpu = policy->cpu; - struct acpi_cpufreq_data *data; - unsigned int result = 0; - struct cpuinfo_x86 *c = &cpu_data(policy->cpu); - struct acpi_processor_performance *perf; -#ifdef CONFIG_SMP - static int blacklisted; -#endif - - dprintk("acpi_cpufreq_cpu_init\n"); - -#ifdef CONFIG_SMP - if (blacklisted) - return blacklisted; - blacklisted = acpi_cpufreq_blacklist(c); - if (blacklisted) - return blacklisted; -#endif - - data = kzalloc(sizeof(struct acpi_cpufreq_data), GFP_KERNEL); - if (!data) - return -ENOMEM; - - data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu); - per_cpu(acfreq_data, cpu) = data; - - if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) - acpi_cpufreq_driver.flags |= CPUFREQ_CONST_LOOPS; - - result = acpi_processor_register_performance(data->acpi_data, cpu); - if (result) - goto err_free; - - perf = data->acpi_data; - policy->shared_type = perf->shared_type; - - /* - * Will let policy->cpus know about dependency only when software - * coordination is required. - */ - if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL || - policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) { - cpumask_copy(policy->cpus, perf->shared_cpu_map); - } - cpumask_copy(policy->related_cpus, perf->shared_cpu_map); - -#ifdef CONFIG_SMP - dmi_check_system(sw_any_bug_dmi_table); - if (bios_with_sw_any_bug && cpumask_weight(policy->cpus) == 1) { - policy->shared_type = CPUFREQ_SHARED_TYPE_ALL; - cpumask_copy(policy->cpus, cpu_core_mask(cpu)); - } -#endif - - /* capability check */ - if (perf->state_count <= 1) { - dprintk("No P-States\n"); - result = -ENODEV; - goto err_unreg; - } - - if (perf->control_register.space_id != perf->status_register.space_id) { - result = -ENODEV; - goto err_unreg; - } - - switch (perf->control_register.space_id) { - case ACPI_ADR_SPACE_SYSTEM_IO: - dprintk("SYSTEM IO addr space\n"); - data->cpu_feature = SYSTEM_IO_CAPABLE; - break; - case ACPI_ADR_SPACE_FIXED_HARDWARE: - dprintk("HARDWARE addr space\n"); - if (!check_est_cpu(cpu)) { - result = -ENODEV; - goto err_unreg; - } - data->cpu_feature = SYSTEM_INTEL_MSR_CAPABLE; - break; - default: - dprintk("Unknown addr space %d\n", - (u32) (perf->control_register.space_id)); - result = -ENODEV; - goto err_unreg; - } - - data->freq_table = kmalloc(sizeof(struct cpufreq_frequency_table) * - (perf->state_count+1), GFP_KERNEL); - if (!data->freq_table) { - result = -ENOMEM; - goto err_unreg; - } - - /* detect transition latency */ - policy->cpuinfo.transition_latency = 0; - for (i = 0; i < perf->state_count; i++) { - if ((perf->states[i].transition_latency * 1000) > - policy->cpuinfo.transition_latency) - policy->cpuinfo.transition_latency = - perf->states[i].transition_latency * 1000; - } - - /* Check for high latency (>20uS) from buggy BIOSes, like on T42 */ - if (perf->control_register.space_id == ACPI_ADR_SPACE_FIXED_HARDWARE && - policy->cpuinfo.transition_latency > 20 * 1000) { - policy->cpuinfo.transition_latency = 20 * 1000; - printk_once(KERN_INFO - "P-state transition latency capped at 20 uS\n"); - } - - /* table init */ - for (i = 0; i < perf->state_count; i++) { - if (i > 0 && perf->states[i].core_frequency >= - data->freq_table[valid_states-1].frequency / 1000) - continue; - - data->freq_table[valid_states].index = i; - data->freq_table[valid_states].frequency = - perf->states[i].core_frequency * 1000; - valid_states++; - } - data->freq_table[valid_states].frequency = CPUFREQ_TABLE_END; - perf->state = 0; - - result = cpufreq_frequency_table_cpuinfo(policy, data->freq_table); - if (result) - goto err_freqfree; - - if (perf->states[0].core_frequency * 1000 != policy->cpuinfo.max_freq) - printk(KERN_WARNING FW_WARN "P-state 0 is not max freq\n"); - - switch (perf->control_register.space_id) { - case ACPI_ADR_SPACE_SYSTEM_IO: - /* Current speed is unknown and not detectable by IO port */ - policy->cur = acpi_cpufreq_guess_freq(data, policy->cpu); - break; - case ACPI_ADR_SPACE_FIXED_HARDWARE: - acpi_cpufreq_driver.get = get_cur_freq_on_cpu; - policy->cur = get_cur_freq_on_cpu(cpu); - break; - default: - break; - } - - /* notify BIOS that we exist */ - acpi_processor_notify_smm(THIS_MODULE); - - /* Check for APERF/MPERF support in hardware */ - if (cpu_has(c, X86_FEATURE_APERFMPERF)) - acpi_cpufreq_driver.getavg = cpufreq_get_measured_perf; - - dprintk("CPU%u - ACPI performance management activated.\n", cpu); - for (i = 0; i < perf->state_count; i++) - dprintk(" %cP%d: %d MHz, %d mW, %d uS\n", - (i == perf->state ? '*' : ' '), i, - (u32) perf->states[i].core_frequency, - (u32) perf->states[i].power, - (u32) perf->states[i].transition_latency); - - cpufreq_frequency_table_get_attr(data->freq_table, policy->cpu); - - /* - * the first call to ->target() should result in us actually - * writing something to the appropriate registers. - */ - data->resume = 1; - - return result; - -err_freqfree: - kfree(data->freq_table); -err_unreg: - acpi_processor_unregister_performance(perf, cpu); -err_free: - kfree(data); - per_cpu(acfreq_data, cpu) = NULL; - - return result; -} - -static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy) -{ - struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu); - - dprintk("acpi_cpufreq_cpu_exit\n"); - - if (data) { - cpufreq_frequency_table_put_attr(policy->cpu); - per_cpu(acfreq_data, policy->cpu) = NULL; - acpi_processor_unregister_performance(data->acpi_data, - policy->cpu); - kfree(data->freq_table); - kfree(data); - } - - return 0; -} - -static int acpi_cpufreq_resume(struct cpufreq_policy *policy) -{ - struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu); - - dprintk("acpi_cpufreq_resume\n"); - - data->resume = 1; - - return 0; -} - -static struct freq_attr *acpi_cpufreq_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - NULL, -}; - -static struct cpufreq_driver acpi_cpufreq_driver = { - .verify = acpi_cpufreq_verify, - .target = acpi_cpufreq_target, - .bios_limit = acpi_processor_get_bios_limit, - .init = acpi_cpufreq_cpu_init, - .exit = acpi_cpufreq_cpu_exit, - .resume = acpi_cpufreq_resume, - .name = "acpi-cpufreq", - .owner = THIS_MODULE, - .attr = acpi_cpufreq_attr, -}; - -static int __init acpi_cpufreq_init(void) -{ - int ret; - - if (acpi_disabled) - return 0; - - dprintk("acpi_cpufreq_init\n"); - - ret = acpi_cpufreq_early_init(); - if (ret) - return ret; - - ret = cpufreq_register_driver(&acpi_cpufreq_driver); - if (ret) - free_acpi_perf_data(); - - return ret; -} - -static void __exit acpi_cpufreq_exit(void) -{ - dprintk("acpi_cpufreq_exit\n"); - - cpufreq_unregister_driver(&acpi_cpufreq_driver); - - free_percpu(acpi_perf_data); -} - -module_param(acpi_pstate_strict, uint, 0644); -MODULE_PARM_DESC(acpi_pstate_strict, - "value 0 or non-zero. non-zero -> strict ACPI checks are " - "performed during frequency changes."); - -late_initcall(acpi_cpufreq_init); -module_exit(acpi_cpufreq_exit); - -MODULE_ALIAS("acpi"); diff --git a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c b/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c deleted file mode 100644 index 141abebc451..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c +++ /dev/null @@ -1,446 +0,0 @@ -/* - * (C) 2004-2006 Sebastian Witt <se.witt@gmx.net> - * - * Licensed under the terms of the GNU GPL License version 2. - * Based upon reverse engineered information - * - * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous* - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/moduleparam.h> -#include <linux/init.h> -#include <linux/cpufreq.h> -#include <linux/pci.h> -#include <linux/delay.h> - -#define NFORCE2_XTAL 25 -#define NFORCE2_BOOTFSB 0x48 -#define NFORCE2_PLLENABLE 0xa8 -#define NFORCE2_PLLREG 0xa4 -#define NFORCE2_PLLADR 0xa0 -#define NFORCE2_PLL(mul, div) (0x100000 | (mul << 8) | div) - -#define NFORCE2_MIN_FSB 50 -#define NFORCE2_SAFE_DISTANCE 50 - -/* Delay in ms between FSB changes */ -/* #define NFORCE2_DELAY 10 */ - -/* - * nforce2_chipset: - * FSB is changed using the chipset - */ -static struct pci_dev *nforce2_dev; - -/* fid: - * multiplier * 10 - */ -static int fid; - -/* min_fsb, max_fsb: - * minimum and maximum FSB (= FSB at boot time) - */ -static int min_fsb; -static int max_fsb; - -MODULE_AUTHOR("Sebastian Witt <se.witt@gmx.net>"); -MODULE_DESCRIPTION("nForce2 FSB changing cpufreq driver"); -MODULE_LICENSE("GPL"); - -module_param(fid, int, 0444); -module_param(min_fsb, int, 0444); - -MODULE_PARM_DESC(fid, "CPU multiplier to use (11.5 = 115)"); -MODULE_PARM_DESC(min_fsb, - "Minimum FSB to use, if not defined: current FSB - 50"); - -#define PFX "cpufreq-nforce2: " -#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ - "cpufreq-nforce2", msg) - -/** - * nforce2_calc_fsb - calculate FSB - * @pll: PLL value - * - * Calculates FSB from PLL value - */ -static int nforce2_calc_fsb(int pll) -{ - unsigned char mul, div; - - mul = (pll >> 8) & 0xff; - div = pll & 0xff; - - if (div > 0) - return NFORCE2_XTAL * mul / div; - - return 0; -} - -/** - * nforce2_calc_pll - calculate PLL value - * @fsb: FSB - * - * Calculate PLL value for given FSB - */ -static int nforce2_calc_pll(unsigned int fsb) -{ - unsigned char xmul, xdiv; - unsigned char mul = 0, div = 0; - int tried = 0; - - /* Try to calculate multiplier and divider up to 4 times */ - while (((mul == 0) || (div == 0)) && (tried <= 3)) { - for (xdiv = 2; xdiv <= 0x80; xdiv++) - for (xmul = 1; xmul <= 0xfe; xmul++) - if (nforce2_calc_fsb(NFORCE2_PLL(xmul, xdiv)) == - fsb + tried) { - mul = xmul; - div = xdiv; - } - tried++; - } - - if ((mul == 0) || (div == 0)) - return -1; - - return NFORCE2_PLL(mul, div); -} - -/** - * nforce2_write_pll - write PLL value to chipset - * @pll: PLL value - * - * Writes new FSB PLL value to chipset - */ -static void nforce2_write_pll(int pll) -{ - int temp; - - /* Set the pll addr. to 0x00 */ - pci_write_config_dword(nforce2_dev, NFORCE2_PLLADR, 0); - - /* Now write the value in all 64 registers */ - for (temp = 0; temp <= 0x3f; temp++) - pci_write_config_dword(nforce2_dev, NFORCE2_PLLREG, pll); - - return; -} - -/** - * nforce2_fsb_read - Read FSB - * - * Read FSB from chipset - * If bootfsb != 0, return FSB at boot-time - */ -static unsigned int nforce2_fsb_read(int bootfsb) -{ - struct pci_dev *nforce2_sub5; - u32 fsb, temp = 0; - - /* Get chipset boot FSB from subdevice 5 (FSB at boot-time) */ - nforce2_sub5 = pci_get_subsys(PCI_VENDOR_ID_NVIDIA, 0x01EF, - PCI_ANY_ID, PCI_ANY_ID, NULL); - if (!nforce2_sub5) - return 0; - - pci_read_config_dword(nforce2_sub5, NFORCE2_BOOTFSB, &fsb); - fsb /= 1000000; - - /* Check if PLL register is already set */ - pci_read_config_byte(nforce2_dev, NFORCE2_PLLENABLE, (u8 *)&temp); - - if (bootfsb || !temp) - return fsb; - - /* Use PLL register FSB value */ - pci_read_config_dword(nforce2_dev, NFORCE2_PLLREG, &temp); - fsb = nforce2_calc_fsb(temp); - - return fsb; -} - -/** - * nforce2_set_fsb - set new FSB - * @fsb: New FSB - * - * Sets new FSB - */ -static int nforce2_set_fsb(unsigned int fsb) -{ - u32 temp = 0; - unsigned int tfsb; - int diff; - int pll = 0; - - if ((fsb > max_fsb) || (fsb < NFORCE2_MIN_FSB)) { - printk(KERN_ERR PFX "FSB %d is out of range!\n", fsb); - return -EINVAL; - } - - tfsb = nforce2_fsb_read(0); - if (!tfsb) { - printk(KERN_ERR PFX "Error while reading the FSB\n"); - return -EINVAL; - } - - /* First write? Then set actual value */ - pci_read_config_byte(nforce2_dev, NFORCE2_PLLENABLE, (u8 *)&temp); - if (!temp) { - pll = nforce2_calc_pll(tfsb); - - if (pll < 0) - return -EINVAL; - - nforce2_write_pll(pll); - } - - /* Enable write access */ - temp = 0x01; - pci_write_config_byte(nforce2_dev, NFORCE2_PLLENABLE, (u8)temp); - - diff = tfsb - fsb; - - if (!diff) - return 0; - - while ((tfsb != fsb) && (tfsb <= max_fsb) && (tfsb >= min_fsb)) { - if (diff < 0) - tfsb++; - else - tfsb--; - - /* Calculate the PLL reg. value */ - pll = nforce2_calc_pll(tfsb); - if (pll == -1) - return -EINVAL; - - nforce2_write_pll(pll); -#ifdef NFORCE2_DELAY - mdelay(NFORCE2_DELAY); -#endif - } - - temp = 0x40; - pci_write_config_byte(nforce2_dev, NFORCE2_PLLADR, (u8)temp); - - return 0; -} - -/** - * nforce2_get - get the CPU frequency - * @cpu: CPU number - * - * Returns the CPU frequency - */ -static unsigned int nforce2_get(unsigned int cpu) -{ - if (cpu) - return 0; - return nforce2_fsb_read(0) * fid * 100; -} - -/** - * nforce2_target - set a new CPUFreq policy - * @policy: new policy - * @target_freq: the target frequency - * @relation: how that frequency relates to achieved frequency - * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H) - * - * Sets a new CPUFreq policy. - */ -static int nforce2_target(struct cpufreq_policy *policy, - unsigned int target_freq, unsigned int relation) -{ -/* unsigned long flags; */ - struct cpufreq_freqs freqs; - unsigned int target_fsb; - - if ((target_freq > policy->max) || (target_freq < policy->min)) - return -EINVAL; - - target_fsb = target_freq / (fid * 100); - - freqs.old = nforce2_get(policy->cpu); - freqs.new = target_fsb * fid * 100; - freqs.cpu = 0; /* Only one CPU on nForce2 platforms */ - - if (freqs.old == freqs.new) - return 0; - - dprintk("Old CPU frequency %d kHz, new %d kHz\n", - freqs.old, freqs.new); - - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - - /* Disable IRQs */ - /* local_irq_save(flags); */ - - if (nforce2_set_fsb(target_fsb) < 0) - printk(KERN_ERR PFX "Changing FSB to %d failed\n", - target_fsb); - else - dprintk("Changed FSB successfully to %d\n", - target_fsb); - - /* Enable IRQs */ - /* local_irq_restore(flags); */ - - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - - return 0; -} - -/** - * nforce2_verify - verifies a new CPUFreq policy - * @policy: new policy - */ -static int nforce2_verify(struct cpufreq_policy *policy) -{ - unsigned int fsb_pol_max; - - fsb_pol_max = policy->max / (fid * 100); - - if (policy->min < (fsb_pol_max * fid * 100)) - policy->max = (fsb_pol_max + 1) * fid * 100; - - cpufreq_verify_within_limits(policy, - policy->cpuinfo.min_freq, - policy->cpuinfo.max_freq); - return 0; -} - -static int nforce2_cpu_init(struct cpufreq_policy *policy) -{ - unsigned int fsb; - unsigned int rfid; - - /* capability check */ - if (policy->cpu != 0) - return -ENODEV; - - /* Get current FSB */ - fsb = nforce2_fsb_read(0); - - if (!fsb) - return -EIO; - - /* FIX: Get FID from CPU */ - if (!fid) { - if (!cpu_khz) { - printk(KERN_WARNING PFX - "cpu_khz not set, can't calculate multiplier!\n"); - return -ENODEV; - } - - fid = cpu_khz / (fsb * 100); - rfid = fid % 5; - - if (rfid) { - if (rfid > 2) - fid += 5 - rfid; - else - fid -= rfid; - } - } - - printk(KERN_INFO PFX "FSB currently at %i MHz, FID %d.%d\n", fsb, - fid / 10, fid % 10); - - /* Set maximum FSB to FSB at boot time */ - max_fsb = nforce2_fsb_read(1); - - if (!max_fsb) - return -EIO; - - if (!min_fsb) - min_fsb = max_fsb - NFORCE2_SAFE_DISTANCE; - - if (min_fsb < NFORCE2_MIN_FSB) - min_fsb = NFORCE2_MIN_FSB; - - /* cpuinfo and default policy values */ - policy->cpuinfo.min_freq = min_fsb * fid * 100; - policy->cpuinfo.max_freq = max_fsb * fid * 100; - policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; - policy->cur = nforce2_get(policy->cpu); - policy->min = policy->cpuinfo.min_freq; - policy->max = policy->cpuinfo.max_freq; - - return 0; -} - -static int nforce2_cpu_exit(struct cpufreq_policy *policy) -{ - return 0; -} - -static struct cpufreq_driver nforce2_driver = { - .name = "nforce2", - .verify = nforce2_verify, - .target = nforce2_target, - .get = nforce2_get, - .init = nforce2_cpu_init, - .exit = nforce2_cpu_exit, - .owner = THIS_MODULE, -}; - -/** - * nforce2_detect_chipset - detect the Southbridge which contains FSB PLL logic - * - * Detects nForce2 A2 and C1 stepping - * - */ -static int nforce2_detect_chipset(void) -{ - nforce2_dev = pci_get_subsys(PCI_VENDOR_ID_NVIDIA, - PCI_DEVICE_ID_NVIDIA_NFORCE2, - PCI_ANY_ID, PCI_ANY_ID, NULL); - - if (nforce2_dev == NULL) - return -ENODEV; - - printk(KERN_INFO PFX "Detected nForce2 chipset revision %X\n", - nforce2_dev->revision); - printk(KERN_INFO PFX - "FSB changing is maybe unstable and can lead to " - "crashes and data loss.\n"); - - return 0; -} - -/** - * nforce2_init - initializes the nForce2 CPUFreq driver - * - * Initializes the nForce2 FSB support. Returns -ENODEV on unsupported - * devices, -EINVAL on problems during initiatization, and zero on - * success. - */ -static int __init nforce2_init(void) -{ - /* TODO: do we need to detect the processor? */ - - /* detect chipset */ - if (nforce2_detect_chipset()) { - printk(KERN_INFO PFX "No nForce2 chipset.\n"); - return -ENODEV; - } - - return cpufreq_register_driver(&nforce2_driver); -} - -/** - * nforce2_exit - unregisters cpufreq module - * - * Unregisters nForce2 FSB change support. - */ -static void __exit nforce2_exit(void) -{ - cpufreq_unregister_driver(&nforce2_driver); -} - -module_init(nforce2_init); -module_exit(nforce2_exit); - diff --git a/arch/x86/kernel/cpu/cpufreq/e_powersaver.c b/arch/x86/kernel/cpu/cpufreq/e_powersaver.c deleted file mode 100644 index 35a257dd4bb..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/e_powersaver.c +++ /dev/null @@ -1,367 +0,0 @@ -/* - * Based on documentation provided by Dave Jones. Thanks! - * - * Licensed under the terms of the GNU GPL License version 2. - * - * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous* - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/cpufreq.h> -#include <linux/ioport.h> -#include <linux/slab.h> -#include <linux/timex.h> -#include <linux/io.h> -#include <linux/delay.h> - -#include <asm/msr.h> -#include <asm/tsc.h> - -#define EPS_BRAND_C7M 0 -#define EPS_BRAND_C7 1 -#define EPS_BRAND_EDEN 2 -#define EPS_BRAND_C3 3 -#define EPS_BRAND_C7D 4 - -struct eps_cpu_data { - u32 fsb; - struct cpufreq_frequency_table freq_table[]; -}; - -static struct eps_cpu_data *eps_cpu[NR_CPUS]; - - -static unsigned int eps_get(unsigned int cpu) -{ - struct eps_cpu_data *centaur; - u32 lo, hi; - - if (cpu) - return 0; - centaur = eps_cpu[cpu]; - if (centaur == NULL) - return 0; - - /* Return current frequency */ - rdmsr(MSR_IA32_PERF_STATUS, lo, hi); - return centaur->fsb * ((lo >> 8) & 0xff); -} - -static int eps_set_state(struct eps_cpu_data *centaur, - unsigned int cpu, - u32 dest_state) -{ - struct cpufreq_freqs freqs; - u32 lo, hi; - int err = 0; - int i; - - freqs.old = eps_get(cpu); - freqs.new = centaur->fsb * ((dest_state >> 8) & 0xff); - freqs.cpu = cpu; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - - /* Wait while CPU is busy */ - rdmsr(MSR_IA32_PERF_STATUS, lo, hi); - i = 0; - while (lo & ((1 << 16) | (1 << 17))) { - udelay(16); - rdmsr(MSR_IA32_PERF_STATUS, lo, hi); - i++; - if (unlikely(i > 64)) { - err = -ENODEV; - goto postchange; - } - } - /* Set new multiplier and voltage */ - wrmsr(MSR_IA32_PERF_CTL, dest_state & 0xffff, 0); - /* Wait until transition end */ - i = 0; - do { - udelay(16); - rdmsr(MSR_IA32_PERF_STATUS, lo, hi); - i++; - if (unlikely(i > 64)) { - err = -ENODEV; - goto postchange; - } - } while (lo & ((1 << 16) | (1 << 17))); - - /* Return current frequency */ -postchange: - rdmsr(MSR_IA32_PERF_STATUS, lo, hi); - freqs.new = centaur->fsb * ((lo >> 8) & 0xff); - -#ifdef DEBUG - { - u8 current_multiplier, current_voltage; - - /* Print voltage and multiplier */ - rdmsr(MSR_IA32_PERF_STATUS, lo, hi); - current_voltage = lo & 0xff; - printk(KERN_INFO "eps: Current voltage = %dmV\n", - current_voltage * 16 + 700); - current_multiplier = (lo >> 8) & 0xff; - printk(KERN_INFO "eps: Current multiplier = %d\n", - current_multiplier); - } -#endif - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - return err; -} - -static int eps_target(struct cpufreq_policy *policy, - unsigned int target_freq, - unsigned int relation) -{ - struct eps_cpu_data *centaur; - unsigned int newstate = 0; - unsigned int cpu = policy->cpu; - unsigned int dest_state; - int ret; - - if (unlikely(eps_cpu[cpu] == NULL)) - return -ENODEV; - centaur = eps_cpu[cpu]; - - if (unlikely(cpufreq_frequency_table_target(policy, - &eps_cpu[cpu]->freq_table[0], - target_freq, - relation, - &newstate))) { - return -EINVAL; - } - - /* Make frequency transition */ - dest_state = centaur->freq_table[newstate].index & 0xffff; - ret = eps_set_state(centaur, cpu, dest_state); - if (ret) - printk(KERN_ERR "eps: Timeout!\n"); - return ret; -} - -static int eps_verify(struct cpufreq_policy *policy) -{ - return cpufreq_frequency_table_verify(policy, - &eps_cpu[policy->cpu]->freq_table[0]); -} - -static int eps_cpu_init(struct cpufreq_policy *policy) -{ - unsigned int i; - u32 lo, hi; - u64 val; - u8 current_multiplier, current_voltage; - u8 max_multiplier, max_voltage; - u8 min_multiplier, min_voltage; - u8 brand = 0; - u32 fsb; - struct eps_cpu_data *centaur; - struct cpuinfo_x86 *c = &cpu_data(0); - struct cpufreq_frequency_table *f_table; - int k, step, voltage; - int ret; - int states; - - if (policy->cpu != 0) - return -ENODEV; - - /* Check brand */ - printk(KERN_INFO "eps: Detected VIA "); - - switch (c->x86_model) { - case 10: - rdmsr(0x1153, lo, hi); - brand = (((lo >> 2) ^ lo) >> 18) & 3; - printk(KERN_CONT "Model A "); - break; - case 13: - rdmsr(0x1154, lo, hi); - brand = (((lo >> 4) ^ (lo >> 2))) & 0x000000ff; - printk(KERN_CONT "Model D "); - break; - } - - switch (brand) { - case EPS_BRAND_C7M: - printk(KERN_CONT "C7-M\n"); - break; - case EPS_BRAND_C7: - printk(KERN_CONT "C7\n"); - break; - case EPS_BRAND_EDEN: - printk(KERN_CONT "Eden\n"); - break; - case EPS_BRAND_C7D: - printk(KERN_CONT "C7-D\n"); - break; - case EPS_BRAND_C3: - printk(KERN_CONT "C3\n"); - return -ENODEV; - break; - } - /* Enable Enhanced PowerSaver */ - rdmsrl(MSR_IA32_MISC_ENABLE, val); - if (!(val & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) { - val |= MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP; - wrmsrl(MSR_IA32_MISC_ENABLE, val); - /* Can be locked at 0 */ - rdmsrl(MSR_IA32_MISC_ENABLE, val); - if (!(val & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) { - printk(KERN_INFO "eps: Can't enable Enhanced PowerSaver\n"); - return -ENODEV; - } - } - - /* Print voltage and multiplier */ - rdmsr(MSR_IA32_PERF_STATUS, lo, hi); - current_voltage = lo & 0xff; - printk(KERN_INFO "eps: Current voltage = %dmV\n", - current_voltage * 16 + 700); - current_multiplier = (lo >> 8) & 0xff; - printk(KERN_INFO "eps: Current multiplier = %d\n", current_multiplier); - - /* Print limits */ - max_voltage = hi & 0xff; - printk(KERN_INFO "eps: Highest voltage = %dmV\n", - max_voltage * 16 + 700); - max_multiplier = (hi >> 8) & 0xff; - printk(KERN_INFO "eps: Highest multiplier = %d\n", max_multiplier); - min_voltage = (hi >> 16) & 0xff; - printk(KERN_INFO "eps: Lowest voltage = %dmV\n", - min_voltage * 16 + 700); - min_multiplier = (hi >> 24) & 0xff; - printk(KERN_INFO "eps: Lowest multiplier = %d\n", min_multiplier); - - /* Sanity checks */ - if (current_multiplier == 0 || max_multiplier == 0 - || min_multiplier == 0) - return -EINVAL; - if (current_multiplier > max_multiplier - || max_multiplier <= min_multiplier) - return -EINVAL; - if (current_voltage > 0x1f || max_voltage > 0x1f) - return -EINVAL; - if (max_voltage < min_voltage) - return -EINVAL; - - /* Calc FSB speed */ - fsb = cpu_khz / current_multiplier; - /* Calc number of p-states supported */ - if (brand == EPS_BRAND_C7M) - states = max_multiplier - min_multiplier + 1; - else - states = 2; - - /* Allocate private data and frequency table for current cpu */ - centaur = kzalloc(sizeof(struct eps_cpu_data) - + (states + 1) * sizeof(struct cpufreq_frequency_table), - GFP_KERNEL); - if (!centaur) - return -ENOMEM; - eps_cpu[0] = centaur; - - /* Copy basic values */ - centaur->fsb = fsb; - - /* Fill frequency and MSR value table */ - f_table = ¢aur->freq_table[0]; - if (brand != EPS_BRAND_C7M) { - f_table[0].frequency = fsb * min_multiplier; - f_table[0].index = (min_multiplier << 8) | min_voltage; - f_table[1].frequency = fsb * max_multiplier; - f_table[1].index = (max_multiplier << 8) | max_voltage; - f_table[2].frequency = CPUFREQ_TABLE_END; - } else { - k = 0; - step = ((max_voltage - min_voltage) * 256) - / (max_multiplier - min_multiplier); - for (i = min_multiplier; i <= max_multiplier; i++) { - voltage = (k * step) / 256 + min_voltage; - f_table[k].frequency = fsb * i; - f_table[k].index = (i << 8) | voltage; - k++; - } - f_table[k].frequency = CPUFREQ_TABLE_END; - } - - policy->cpuinfo.transition_latency = 140000; /* 844mV -> 700mV in ns */ - policy->cur = fsb * current_multiplier; - - ret = cpufreq_frequency_table_cpuinfo(policy, ¢aur->freq_table[0]); - if (ret) { - kfree(centaur); - return ret; - } - - cpufreq_frequency_table_get_attr(¢aur->freq_table[0], policy->cpu); - return 0; -} - -static int eps_cpu_exit(struct cpufreq_policy *policy) -{ - unsigned int cpu = policy->cpu; - struct eps_cpu_data *centaur; - u32 lo, hi; - - if (eps_cpu[cpu] == NULL) - return -ENODEV; - centaur = eps_cpu[cpu]; - - /* Get max frequency */ - rdmsr(MSR_IA32_PERF_STATUS, lo, hi); - /* Set max frequency */ - eps_set_state(centaur, cpu, hi & 0xffff); - /* Bye */ - cpufreq_frequency_table_put_attr(policy->cpu); - kfree(eps_cpu[cpu]); - eps_cpu[cpu] = NULL; - return 0; -} - -static struct freq_attr *eps_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - NULL, -}; - -static struct cpufreq_driver eps_driver = { - .verify = eps_verify, - .target = eps_target, - .init = eps_cpu_init, - .exit = eps_cpu_exit, - .get = eps_get, - .name = "e_powersaver", - .owner = THIS_MODULE, - .attr = eps_attr, -}; - -static int __init eps_init(void) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - - /* This driver will work only on Centaur C7 processors with - * Enhanced SpeedStep/PowerSaver registers */ - if (c->x86_vendor != X86_VENDOR_CENTAUR - || c->x86 != 6 || c->x86_model < 10) - return -ENODEV; - if (!cpu_has(c, X86_FEATURE_EST)) - return -ENODEV; - - if (cpufreq_register_driver(&eps_driver)) - return -EINVAL; - return 0; -} - -static void __exit eps_exit(void) -{ - cpufreq_unregister_driver(&eps_driver); -} - -MODULE_AUTHOR("Rafal Bilski <rafalbilski@interia.pl>"); -MODULE_DESCRIPTION("Enhanced PowerSaver driver for VIA C7 CPU's."); -MODULE_LICENSE("GPL"); - -module_init(eps_init); -module_exit(eps_exit); diff --git a/arch/x86/kernel/cpu/cpufreq/elanfreq.c b/arch/x86/kernel/cpu/cpufreq/elanfreq.c deleted file mode 100644 index c587db472a7..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/elanfreq.c +++ /dev/null @@ -1,309 +0,0 @@ -/* - * elanfreq: cpufreq driver for the AMD ELAN family - * - * (c) Copyright 2002 Robert Schwebel <r.schwebel@pengutronix.de> - * - * Parts of this code are (c) Sven Geggus <sven@geggus.net> - * - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * 2002-02-13: - initial revision for 2.4.18-pre9 by Robert Schwebel - * - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> - -#include <linux/delay.h> -#include <linux/cpufreq.h> - -#include <asm/msr.h> -#include <linux/timex.h> -#include <linux/io.h> - -#define REG_CSCIR 0x22 /* Chip Setup and Control Index Register */ -#define REG_CSCDR 0x23 /* Chip Setup and Control Data Register */ - -/* Module parameter */ -static int max_freq; - -struct s_elan_multiplier { - int clock; /* frequency in kHz */ - int val40h; /* PMU Force Mode register */ - int val80h; /* CPU Clock Speed Register */ -}; - -/* - * It is important that the frequencies - * are listed in ascending order here! - */ -static struct s_elan_multiplier elan_multiplier[] = { - {1000, 0x02, 0x18}, - {2000, 0x02, 0x10}, - {4000, 0x02, 0x08}, - {8000, 0x00, 0x00}, - {16000, 0x00, 0x02}, - {33000, 0x00, 0x04}, - {66000, 0x01, 0x04}, - {99000, 0x01, 0x05} -}; - -static struct cpufreq_frequency_table elanfreq_table[] = { - {0, 1000}, - {1, 2000}, - {2, 4000}, - {3, 8000}, - {4, 16000}, - {5, 33000}, - {6, 66000}, - {7, 99000}, - {0, CPUFREQ_TABLE_END}, -}; - - -/** - * elanfreq_get_cpu_frequency: determine current cpu speed - * - * Finds out at which frequency the CPU of the Elan SOC runs - * at the moment. Frequencies from 1 to 33 MHz are generated - * the normal way, 66 and 99 MHz are called "Hyperspeed Mode" - * and have the rest of the chip running with 33 MHz. - */ - -static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu) -{ - u8 clockspeed_reg; /* Clock Speed Register */ - - local_irq_disable(); - outb_p(0x80, REG_CSCIR); - clockspeed_reg = inb_p(REG_CSCDR); - local_irq_enable(); - - if ((clockspeed_reg & 0xE0) == 0xE0) - return 0; - - /* Are we in CPU clock multiplied mode (66/99 MHz)? */ - if ((clockspeed_reg & 0xE0) == 0xC0) { - if ((clockspeed_reg & 0x01) == 0) - return 66000; - else - return 99000; - } - - /* 33 MHz is not 32 MHz... */ - if ((clockspeed_reg & 0xE0) == 0xA0) - return 33000; - - return (1<<((clockspeed_reg & 0xE0) >> 5)) * 1000; -} - - -/** - * elanfreq_set_cpu_frequency: Change the CPU core frequency - * @cpu: cpu number - * @freq: frequency in kHz - * - * This function takes a frequency value and changes the CPU frequency - * according to this. Note that the frequency has to be checked by - * elanfreq_validatespeed() for correctness! - * - * There is no return value. - */ - -static void elanfreq_set_cpu_state(unsigned int state) -{ - struct cpufreq_freqs freqs; - - freqs.old = elanfreq_get_cpu_frequency(0); - freqs.new = elan_multiplier[state].clock; - freqs.cpu = 0; /* elanfreq.c is UP only driver */ - - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - - printk(KERN_INFO "elanfreq: attempting to set frequency to %i kHz\n", - elan_multiplier[state].clock); - - - /* - * Access to the Elan's internal registers is indexed via - * 0x22: Chip Setup & Control Register Index Register (CSCI) - * 0x23: Chip Setup & Control Register Data Register (CSCD) - * - */ - - /* - * 0x40 is the Power Management Unit's Force Mode Register. - * Bit 6 enables Hyperspeed Mode (66/100 MHz core frequency) - */ - - local_irq_disable(); - outb_p(0x40, REG_CSCIR); /* Disable hyperspeed mode */ - outb_p(0x00, REG_CSCDR); - local_irq_enable(); /* wait till internal pipelines and */ - udelay(1000); /* buffers have cleaned up */ - - local_irq_disable(); - - /* now, set the CPU clock speed register (0x80) */ - outb_p(0x80, REG_CSCIR); - outb_p(elan_multiplier[state].val80h, REG_CSCDR); - - /* now, the hyperspeed bit in PMU Force Mode Register (0x40) */ - outb_p(0x40, REG_CSCIR); - outb_p(elan_multiplier[state].val40h, REG_CSCDR); - udelay(10000); - local_irq_enable(); - - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); -}; - - -/** - * elanfreq_validatespeed: test if frequency range is valid - * @policy: the policy to validate - * - * This function checks if a given frequency range in kHz is valid - * for the hardware supported by the driver. - */ - -static int elanfreq_verify(struct cpufreq_policy *policy) -{ - return cpufreq_frequency_table_verify(policy, &elanfreq_table[0]); -} - -static int elanfreq_target(struct cpufreq_policy *policy, - unsigned int target_freq, - unsigned int relation) -{ - unsigned int newstate = 0; - - if (cpufreq_frequency_table_target(policy, &elanfreq_table[0], - target_freq, relation, &newstate)) - return -EINVAL; - - elanfreq_set_cpu_state(newstate); - - return 0; -} - - -/* - * Module init and exit code - */ - -static int elanfreq_cpu_init(struct cpufreq_policy *policy) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - unsigned int i; - int result; - - /* capability check */ - if ((c->x86_vendor != X86_VENDOR_AMD) || - (c->x86 != 4) || (c->x86_model != 10)) - return -ENODEV; - - /* max freq */ - if (!max_freq) - max_freq = elanfreq_get_cpu_frequency(0); - - /* table init */ - for (i = 0; (elanfreq_table[i].frequency != CPUFREQ_TABLE_END); i++) { - if (elanfreq_table[i].frequency > max_freq) - elanfreq_table[i].frequency = CPUFREQ_ENTRY_INVALID; - } - - /* cpuinfo and default policy values */ - policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; - policy->cur = elanfreq_get_cpu_frequency(0); - - result = cpufreq_frequency_table_cpuinfo(policy, elanfreq_table); - if (result) - return result; - - cpufreq_frequency_table_get_attr(elanfreq_table, policy->cpu); - return 0; -} - - -static int elanfreq_cpu_exit(struct cpufreq_policy *policy) -{ - cpufreq_frequency_table_put_attr(policy->cpu); - return 0; -} - - -#ifndef MODULE -/** - * elanfreq_setup - elanfreq command line parameter parsing - * - * elanfreq command line parameter. Use: - * elanfreq=66000 - * to set the maximum CPU frequency to 66 MHz. Note that in - * case you do not give this boot parameter, the maximum - * frequency will fall back to _current_ CPU frequency which - * might be lower. If you build this as a module, use the - * max_freq module parameter instead. - */ -static int __init elanfreq_setup(char *str) -{ - max_freq = simple_strtoul(str, &str, 0); - printk(KERN_WARNING "You're using the deprecated elanfreq command line option. Use elanfreq.max_freq instead, please!\n"); - return 1; -} -__setup("elanfreq=", elanfreq_setup); -#endif - - -static struct freq_attr *elanfreq_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - NULL, -}; - - -static struct cpufreq_driver elanfreq_driver = { - .get = elanfreq_get_cpu_frequency, - .verify = elanfreq_verify, - .target = elanfreq_target, - .init = elanfreq_cpu_init, - .exit = elanfreq_cpu_exit, - .name = "elanfreq", - .owner = THIS_MODULE, - .attr = elanfreq_attr, -}; - - -static int __init elanfreq_init(void) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - - /* Test if we have the right hardware */ - if ((c->x86_vendor != X86_VENDOR_AMD) || - (c->x86 != 4) || (c->x86_model != 10)) { - printk(KERN_INFO "elanfreq: error: no Elan processor found!\n"); - return -ENODEV; - } - return cpufreq_register_driver(&elanfreq_driver); -} - - -static void __exit elanfreq_exit(void) -{ - cpufreq_unregister_driver(&elanfreq_driver); -} - - -module_param(max_freq, int, 0444); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Robert Schwebel <r.schwebel@pengutronix.de>, " - "Sven Geggus <sven@geggus.net>"); -MODULE_DESCRIPTION("cpufreq driver for AMD's Elan CPUs"); - -module_init(elanfreq_init); -module_exit(elanfreq_exit); diff --git a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c deleted file mode 100644 index 32974cf8423..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c +++ /dev/null @@ -1,517 +0,0 @@ -/* - * Cyrix MediaGX and NatSemi Geode Suspend Modulation - * (C) 2002 Zwane Mwaikambo <zwane@commfireservices.com> - * (C) 2002 Hiroshi Miura <miura@da-cha.org> - * All Rights Reserved - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * version 2 as published by the Free Software Foundation - * - * The author(s) of this software shall not be held liable for damages - * of any nature resulting due to the use of this software. This - * software is provided AS-IS with no warranties. - * - * Theoretical note: - * - * (see Geode(tm) CS5530 manual (rev.4.1) page.56) - * - * CPU frequency control on NatSemi Geode GX1/GXLV processor and CS55x0 - * are based on Suspend Modulation. - * - * Suspend Modulation works by asserting and de-asserting the SUSP# pin - * to CPU(GX1/GXLV) for configurable durations. When asserting SUSP# - * the CPU enters an idle state. GX1 stops its core clock when SUSP# is - * asserted then power consumption is reduced. - * - * Suspend Modulation's OFF/ON duration are configurable - * with 'Suspend Modulation OFF Count Register' - * and 'Suspend Modulation ON Count Register'. - * These registers are 8bit counters that represent the number of - * 32us intervals which the SUSP# pin is asserted(ON)/de-asserted(OFF) - * to the processor. - * - * These counters define a ratio which is the effective frequency - * of operation of the system. - * - * OFF Count - * F_eff = Fgx * ---------------------- - * OFF Count + ON Count - * - * 0 <= On Count, Off Count <= 255 - * - * From these limits, we can get register values - * - * off_duration + on_duration <= MAX_DURATION - * on_duration = off_duration * (stock_freq - freq) / freq - * - * off_duration = (freq * DURATION) / stock_freq - * on_duration = DURATION - off_duration - * - * - *--------------------------------------------------------------------------- - * - * ChangeLog: - * Dec. 12, 2003 Hiroshi Miura <miura@da-cha.org> - * - fix on/off register mistake - * - fix cpu_khz calc when it stops cpu modulation. - * - * Dec. 11, 2002 Hiroshi Miura <miura@da-cha.org> - * - rewrite for Cyrix MediaGX Cx5510/5520 and - * NatSemi Geode Cs5530(A). - * - * Jul. ??, 2002 Zwane Mwaikambo <zwane@commfireservices.com> - * - cs5530_mod patch for 2.4.19-rc1. - * - *--------------------------------------------------------------------------- - * - * Todo - * Test on machines with 5510, 5530, 5530A - */ - -/************************************************************************ - * Suspend Modulation - Definitions * - ************************************************************************/ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/smp.h> -#include <linux/cpufreq.h> -#include <linux/pci.h> -#include <linux/errno.h> -#include <linux/slab.h> - -#include <asm/processor-cyrix.h> - -/* PCI config registers, all at F0 */ -#define PCI_PMER1 0x80 /* power management enable register 1 */ -#define PCI_PMER2 0x81 /* power management enable register 2 */ -#define PCI_PMER3 0x82 /* power management enable register 3 */ -#define PCI_IRQTC 0x8c /* irq speedup timer counter register:typical 2 to 4ms */ -#define PCI_VIDTC 0x8d /* video speedup timer counter register: typical 50 to 100ms */ -#define PCI_MODOFF 0x94 /* suspend modulation OFF counter register, 1 = 32us */ -#define PCI_MODON 0x95 /* suspend modulation ON counter register */ -#define PCI_SUSCFG 0x96 /* suspend configuration register */ - -/* PMER1 bits */ -#define GPM (1<<0) /* global power management */ -#define GIT (1<<1) /* globally enable PM device idle timers */ -#define GTR (1<<2) /* globally enable IO traps */ -#define IRQ_SPDUP (1<<3) /* disable clock throttle during interrupt handling */ -#define VID_SPDUP (1<<4) /* disable clock throttle during vga video handling */ - -/* SUSCFG bits */ -#define SUSMOD (1<<0) /* enable/disable suspend modulation */ -/* the below is supported only with cs5530 (after rev.1.2)/cs5530A */ -#define SMISPDUP (1<<1) /* select how SMI re-enable suspend modulation: */ - /* IRQTC timer or read SMI speedup disable reg.(F1BAR[08-09h]) */ -#define SUSCFG (1<<2) /* enable powering down a GXLV processor. "Special 3Volt Suspend" mode */ -/* the below is supported only with cs5530A */ -#define PWRSVE_ISA (1<<3) /* stop ISA clock */ -#define PWRSVE (1<<4) /* active idle */ - -struct gxfreq_params { - u8 on_duration; - u8 off_duration; - u8 pci_suscfg; - u8 pci_pmer1; - u8 pci_pmer2; - struct pci_dev *cs55x0; -}; - -static struct gxfreq_params *gx_params; -static int stock_freq; - -/* PCI bus clock - defaults to 30.000 if cpu_khz is not available */ -static int pci_busclk; -module_param(pci_busclk, int, 0444); - -/* maximum duration for which the cpu may be suspended - * (32us * MAX_DURATION). If no parameter is given, this defaults - * to 255. - * Note that this leads to a maximum of 8 ms(!) where the CPU clock - * is suspended -- processing power is just 0.39% of what it used to be, - * though. 781.25 kHz(!) for a 200 MHz processor -- wow. */ -static int max_duration = 255; -module_param(max_duration, int, 0444); - -/* For the default policy, we want at least some processing power - * - let's say 5%. (min = maxfreq / POLICY_MIN_DIV) - */ -#define POLICY_MIN_DIV 20 - - -#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ - "gx-suspmod", msg) - -/** - * we can detect a core multipiler from dir0_lsb - * from GX1 datasheet p.56, - * MULT[3:0]: - * 0000 = SYSCLK multiplied by 4 (test only) - * 0001 = SYSCLK multiplied by 10 - * 0010 = SYSCLK multiplied by 4 - * 0011 = SYSCLK multiplied by 6 - * 0100 = SYSCLK multiplied by 9 - * 0101 = SYSCLK multiplied by 5 - * 0110 = SYSCLK multiplied by 7 - * 0111 = SYSCLK multiplied by 8 - * of 33.3MHz - **/ -static int gx_freq_mult[16] = { - 4, 10, 4, 6, 9, 5, 7, 8, - 0, 0, 0, 0, 0, 0, 0, 0 -}; - - -/**************************************************************** - * Low Level chipset interface * - ****************************************************************/ -static struct pci_device_id gx_chipset_tbl[] __initdata = { - { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY), }, - { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5520), }, - { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5510), }, - { 0, }, -}; - -static void gx_write_byte(int reg, int value) -{ - pci_write_config_byte(gx_params->cs55x0, reg, value); -} - -/** - * gx_detect_chipset: - * - **/ -static __init struct pci_dev *gx_detect_chipset(void) -{ - struct pci_dev *gx_pci = NULL; - - /* check if CPU is a MediaGX or a Geode. */ - if ((boot_cpu_data.x86_vendor != X86_VENDOR_NSC) && - (boot_cpu_data.x86_vendor != X86_VENDOR_CYRIX)) { - dprintk("error: no MediaGX/Geode processor found!\n"); - return NULL; - } - - /* detect which companion chip is used */ - for_each_pci_dev(gx_pci) { - if ((pci_match_id(gx_chipset_tbl, gx_pci)) != NULL) - return gx_pci; - } - - dprintk("error: no supported chipset found!\n"); - return NULL; -} - -/** - * gx_get_cpuspeed: - * - * Finds out at which efficient frequency the Cyrix MediaGX/NatSemi - * Geode CPU runs. - */ -static unsigned int gx_get_cpuspeed(unsigned int cpu) -{ - if ((gx_params->pci_suscfg & SUSMOD) == 0) - return stock_freq; - - return (stock_freq * gx_params->off_duration) - / (gx_params->on_duration + gx_params->off_duration); -} - -/** - * gx_validate_speed: - * determine current cpu speed - * - **/ - -static unsigned int gx_validate_speed(unsigned int khz, u8 *on_duration, - u8 *off_duration) -{ - unsigned int i; - u8 tmp_on, tmp_off; - int old_tmp_freq = stock_freq; - int tmp_freq; - - *off_duration = 1; - *on_duration = 0; - - for (i = max_duration; i > 0; i--) { - tmp_off = ((khz * i) / stock_freq) & 0xff; - tmp_on = i - tmp_off; - tmp_freq = (stock_freq * tmp_off) / i; - /* if this relation is closer to khz, use this. If it's equal, - * prefer it, too - lower latency */ - if (abs(tmp_freq - khz) <= abs(old_tmp_freq - khz)) { - *on_duration = tmp_on; - *off_duration = tmp_off; - old_tmp_freq = tmp_freq; - } - } - - return old_tmp_freq; -} - - -/** - * gx_set_cpuspeed: - * set cpu speed in khz. - **/ - -static void gx_set_cpuspeed(unsigned int khz) -{ - u8 suscfg, pmer1; - unsigned int new_khz; - unsigned long flags; - struct cpufreq_freqs freqs; - - freqs.cpu = 0; - freqs.old = gx_get_cpuspeed(0); - - new_khz = gx_validate_speed(khz, &gx_params->on_duration, - &gx_params->off_duration); - - freqs.new = new_khz; - - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - local_irq_save(flags); - - - - if (new_khz != stock_freq) { - /* if new khz == 100% of CPU speed, it is special case */ - switch (gx_params->cs55x0->device) { - case PCI_DEVICE_ID_CYRIX_5530_LEGACY: - pmer1 = gx_params->pci_pmer1 | IRQ_SPDUP | VID_SPDUP; - /* FIXME: need to test other values -- Zwane,Miura */ - /* typical 2 to 4ms */ - gx_write_byte(PCI_IRQTC, 4); - /* typical 50 to 100ms */ - gx_write_byte(PCI_VIDTC, 100); - gx_write_byte(PCI_PMER1, pmer1); - - if (gx_params->cs55x0->revision < 0x10) { - /* CS5530(rev 1.2, 1.3) */ - suscfg = gx_params->pci_suscfg|SUSMOD; - } else { - /* CS5530A,B.. */ - suscfg = gx_params->pci_suscfg|SUSMOD|PWRSVE; - } - break; - case PCI_DEVICE_ID_CYRIX_5520: - case PCI_DEVICE_ID_CYRIX_5510: - suscfg = gx_params->pci_suscfg | SUSMOD; - break; - default: - local_irq_restore(flags); - dprintk("fatal: try to set unknown chipset.\n"); - return; - } - } else { - suscfg = gx_params->pci_suscfg & ~(SUSMOD); - gx_params->off_duration = 0; - gx_params->on_duration = 0; - dprintk("suspend modulation disabled: cpu runs 100%% speed.\n"); - } - - gx_write_byte(PCI_MODOFF, gx_params->off_duration); - gx_write_byte(PCI_MODON, gx_params->on_duration); - - gx_write_byte(PCI_SUSCFG, suscfg); - pci_read_config_byte(gx_params->cs55x0, PCI_SUSCFG, &suscfg); - - local_irq_restore(flags); - - gx_params->pci_suscfg = suscfg; - - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - - dprintk("suspend modulation w/ duration of ON:%d us, OFF:%d us\n", - gx_params->on_duration * 32, gx_params->off_duration * 32); - dprintk("suspend modulation w/ clock speed: %d kHz.\n", freqs.new); -} - -/**************************************************************** - * High level functions * - ****************************************************************/ - -/* - * cpufreq_gx_verify: test if frequency range is valid - * - * This function checks if a given frequency range in kHz is valid - * for the hardware supported by the driver. - */ - -static int cpufreq_gx_verify(struct cpufreq_policy *policy) -{ - unsigned int tmp_freq = 0; - u8 tmp1, tmp2; - - if (!stock_freq || !policy) - return -EINVAL; - - policy->cpu = 0; - cpufreq_verify_within_limits(policy, (stock_freq / max_duration), - stock_freq); - - /* it needs to be assured that at least one supported frequency is - * within policy->min and policy->max. If it is not, policy->max - * needs to be increased until one freuqency is supported. - * policy->min may not be decreased, though. This way we guarantee a - * specific processing capacity. - */ - tmp_freq = gx_validate_speed(policy->min, &tmp1, &tmp2); - if (tmp_freq < policy->min) - tmp_freq += stock_freq / max_duration; - policy->min = tmp_freq; - if (policy->min > policy->max) - policy->max = tmp_freq; - tmp_freq = gx_validate_speed(policy->max, &tmp1, &tmp2); - if (tmp_freq > policy->max) - tmp_freq -= stock_freq / max_duration; - policy->max = tmp_freq; - if (policy->max < policy->min) - policy->max = policy->min; - cpufreq_verify_within_limits(policy, (stock_freq / max_duration), - stock_freq); - - return 0; -} - -/* - * cpufreq_gx_target: - * - */ -static int cpufreq_gx_target(struct cpufreq_policy *policy, - unsigned int target_freq, - unsigned int relation) -{ - u8 tmp1, tmp2; - unsigned int tmp_freq; - - if (!stock_freq || !policy) - return -EINVAL; - - policy->cpu = 0; - - tmp_freq = gx_validate_speed(target_freq, &tmp1, &tmp2); - while (tmp_freq < policy->min) { - tmp_freq += stock_freq / max_duration; - tmp_freq = gx_validate_speed(tmp_freq, &tmp1, &tmp2); - } - while (tmp_freq > policy->max) { - tmp_freq -= stock_freq / max_duration; - tmp_freq = gx_validate_speed(tmp_freq, &tmp1, &tmp2); - } - - gx_set_cpuspeed(tmp_freq); - - return 0; -} - -static int cpufreq_gx_cpu_init(struct cpufreq_policy *policy) -{ - unsigned int maxfreq, curfreq; - - if (!policy || policy->cpu != 0) - return -ENODEV; - - /* determine maximum frequency */ - if (pci_busclk) - maxfreq = pci_busclk * gx_freq_mult[getCx86(CX86_DIR1) & 0x0f]; - else if (cpu_khz) - maxfreq = cpu_khz; - else - maxfreq = 30000 * gx_freq_mult[getCx86(CX86_DIR1) & 0x0f]; - - stock_freq = maxfreq; - curfreq = gx_get_cpuspeed(0); - - dprintk("cpu max frequency is %d.\n", maxfreq); - dprintk("cpu current frequency is %dkHz.\n", curfreq); - - /* setup basic struct for cpufreq API */ - policy->cpu = 0; - - if (max_duration < POLICY_MIN_DIV) - policy->min = maxfreq / max_duration; - else - policy->min = maxfreq / POLICY_MIN_DIV; - policy->max = maxfreq; - policy->cur = curfreq; - policy->cpuinfo.min_freq = maxfreq / max_duration; - policy->cpuinfo.max_freq = maxfreq; - policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; - - return 0; -} - -/* - * cpufreq_gx_init: - * MediaGX/Geode GX initialize cpufreq driver - */ -static struct cpufreq_driver gx_suspmod_driver = { - .get = gx_get_cpuspeed, - .verify = cpufreq_gx_verify, - .target = cpufreq_gx_target, - .init = cpufreq_gx_cpu_init, - .name = "gx-suspmod", - .owner = THIS_MODULE, -}; - -static int __init cpufreq_gx_init(void) -{ - int ret; - struct gxfreq_params *params; - struct pci_dev *gx_pci; - - /* Test if we have the right hardware */ - gx_pci = gx_detect_chipset(); - if (gx_pci == NULL) - return -ENODEV; - - /* check whether module parameters are sane */ - if (max_duration > 0xff) - max_duration = 0xff; - - dprintk("geode suspend modulation available.\n"); - - params = kzalloc(sizeof(struct gxfreq_params), GFP_KERNEL); - if (params == NULL) - return -ENOMEM; - - params->cs55x0 = gx_pci; - gx_params = params; - - /* keep cs55x0 configurations */ - pci_read_config_byte(params->cs55x0, PCI_SUSCFG, &(params->pci_suscfg)); - pci_read_config_byte(params->cs55x0, PCI_PMER1, &(params->pci_pmer1)); - pci_read_config_byte(params->cs55x0, PCI_PMER2, &(params->pci_pmer2)); - pci_read_config_byte(params->cs55x0, PCI_MODON, &(params->on_duration)); - pci_read_config_byte(params->cs55x0, PCI_MODOFF, - &(params->off_duration)); - - ret = cpufreq_register_driver(&gx_suspmod_driver); - if (ret) { - kfree(params); - return ret; /* register error! */ - } - - return 0; -} - -static void __exit cpufreq_gx_exit(void) -{ - cpufreq_unregister_driver(&gx_suspmod_driver); - pci_dev_put(gx_params->cs55x0); - kfree(gx_params); -} - -MODULE_AUTHOR("Hiroshi Miura <miura@da-cha.org>"); -MODULE_DESCRIPTION("Cpufreq driver for Cyrix MediaGX and NatSemi Geode"); -MODULE_LICENSE("GPL"); - -module_init(cpufreq_gx_init); -module_exit(cpufreq_gx_exit); - diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c deleted file mode 100644 index 03162dac627..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/longhaul.c +++ /dev/null @@ -1,1029 +0,0 @@ -/* - * (C) 2001-2004 Dave Jones. <davej@redhat.com> - * (C) 2002 Padraig Brady. <padraig@antefacto.com> - * - * Licensed under the terms of the GNU GPL License version 2. - * Based upon datasheets & sample CPUs kindly provided by VIA. - * - * VIA have currently 3 different versions of Longhaul. - * Version 1 (Longhaul) uses the BCR2 MSR at 0x1147. - * It is present only in Samuel 1 (C5A), Samuel 2 (C5B) stepping 0. - * Version 2 of longhaul is backward compatible with v1, but adds - * LONGHAUL MSR for purpose of both frequency and voltage scaling. - * Present in Samuel 2 (steppings 1-7 only) (C5B), and Ezra (C5C). - * Version 3 of longhaul got renamed to Powersaver and redesigned - * to use only the POWERSAVER MSR at 0x110a. - * It is present in Ezra-T (C5M), Nehemiah (C5X) and above. - * It's pretty much the same feature wise to longhaul v2, though - * there is provision for scaling FSB too, but this doesn't work - * too well in practice so we don't even try to use this. - * - * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous* - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/moduleparam.h> -#include <linux/init.h> -#include <linux/cpufreq.h> -#include <linux/pci.h> -#include <linux/slab.h> -#include <linux/string.h> -#include <linux/delay.h> -#include <linux/timex.h> -#include <linux/io.h> -#include <linux/acpi.h> - -#include <asm/msr.h> -#include <acpi/processor.h> - -#include "longhaul.h" - -#define PFX "longhaul: " - -#define TYPE_LONGHAUL_V1 1 -#define TYPE_LONGHAUL_V2 2 -#define TYPE_POWERSAVER 3 - -#define CPU_SAMUEL 1 -#define CPU_SAMUEL2 2 -#define CPU_EZRA 3 -#define CPU_EZRA_T 4 -#define CPU_NEHEMIAH 5 -#define CPU_NEHEMIAH_C 6 - -/* Flags */ -#define USE_ACPI_C3 (1 << 1) -#define USE_NORTHBRIDGE (1 << 2) - -static int cpu_model; -static unsigned int numscales = 16; -static unsigned int fsb; - -static const struct mV_pos *vrm_mV_table; -static const unsigned char *mV_vrm_table; - -static unsigned int highest_speed, lowest_speed; /* kHz */ -static unsigned int minmult, maxmult; -static int can_scale_voltage; -static struct acpi_processor *pr; -static struct acpi_processor_cx *cx; -static u32 acpi_regs_addr; -static u8 longhaul_flags; -static unsigned int longhaul_index; - -/* Module parameters */ -static int scale_voltage; -static int disable_acpi_c3; -static int revid_errata; - -#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ - "longhaul", msg) - - -/* Clock ratios multiplied by 10 */ -static int mults[32]; -static int eblcr[32]; -static int longhaul_version; -static struct cpufreq_frequency_table *longhaul_table; - -#ifdef CONFIG_CPU_FREQ_DEBUG -static char speedbuffer[8]; - -static char *print_speed(int speed) -{ - if (speed < 1000) { - snprintf(speedbuffer, sizeof(speedbuffer), "%dMHz", speed); - return speedbuffer; - } - - if (speed%1000 == 0) - snprintf(speedbuffer, sizeof(speedbuffer), - "%dGHz", speed/1000); - else - snprintf(speedbuffer, sizeof(speedbuffer), - "%d.%dGHz", speed/1000, (speed%1000)/100); - - return speedbuffer; -} -#endif - - -static unsigned int calc_speed(int mult) -{ - int khz; - khz = (mult/10)*fsb; - if (mult%10) - khz += fsb/2; - khz *= 1000; - return khz; -} - - -static int longhaul_get_cpu_mult(void) -{ - unsigned long invalue = 0, lo, hi; - - rdmsr(MSR_IA32_EBL_CR_POWERON, lo, hi); - invalue = (lo & (1<<22|1<<23|1<<24|1<<25))>>22; - if (longhaul_version == TYPE_LONGHAUL_V2 || - longhaul_version == TYPE_POWERSAVER) { - if (lo & (1<<27)) - invalue += 16; - } - return eblcr[invalue]; -} - -/* For processor with BCR2 MSR */ - -static void do_longhaul1(unsigned int mults_index) -{ - union msr_bcr2 bcr2; - - rdmsrl(MSR_VIA_BCR2, bcr2.val); - /* Enable software clock multiplier */ - bcr2.bits.ESOFTBF = 1; - bcr2.bits.CLOCKMUL = mults_index & 0xff; - - /* Sync to timer tick */ - safe_halt(); - /* Change frequency on next halt or sleep */ - wrmsrl(MSR_VIA_BCR2, bcr2.val); - /* Invoke transition */ - ACPI_FLUSH_CPU_CACHE(); - halt(); - - /* Disable software clock multiplier */ - local_irq_disable(); - rdmsrl(MSR_VIA_BCR2, bcr2.val); - bcr2.bits.ESOFTBF = 0; - wrmsrl(MSR_VIA_BCR2, bcr2.val); -} - -/* For processor with Longhaul MSR */ - -static void do_powersaver(int cx_address, unsigned int mults_index, - unsigned int dir) -{ - union msr_longhaul longhaul; - u32 t; - - rdmsrl(MSR_VIA_LONGHAUL, longhaul.val); - /* Setup new frequency */ - if (!revid_errata) - longhaul.bits.RevisionKey = longhaul.bits.RevisionID; - else - longhaul.bits.RevisionKey = 0; - longhaul.bits.SoftBusRatio = mults_index & 0xf; - longhaul.bits.SoftBusRatio4 = (mults_index & 0x10) >> 4; - /* Setup new voltage */ - if (can_scale_voltage) - longhaul.bits.SoftVID = (mults_index >> 8) & 0x1f; - /* Sync to timer tick */ - safe_halt(); - /* Raise voltage if necessary */ - if (can_scale_voltage && dir) { - longhaul.bits.EnableSoftVID = 1; - wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); - /* Change voltage */ - if (!cx_address) { - ACPI_FLUSH_CPU_CACHE(); - halt(); - } else { - ACPI_FLUSH_CPU_CACHE(); - /* Invoke C3 */ - inb(cx_address); - /* Dummy op - must do something useless after P_LVL3 - * read */ - t = inl(acpi_gbl_FADT.xpm_timer_block.address); - } - longhaul.bits.EnableSoftVID = 0; - wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); - } - - /* Change frequency on next halt or sleep */ - longhaul.bits.EnableSoftBusRatio = 1; - wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); - if (!cx_address) { - ACPI_FLUSH_CPU_CACHE(); - halt(); - } else { - ACPI_FLUSH_CPU_CACHE(); - /* Invoke C3 */ - inb(cx_address); - /* Dummy op - must do something useless after P_LVL3 read */ - t = inl(acpi_gbl_FADT.xpm_timer_block.address); - } - /* Disable bus ratio bit */ - longhaul.bits.EnableSoftBusRatio = 0; - wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); - - /* Reduce voltage if necessary */ - if (can_scale_voltage && !dir) { - longhaul.bits.EnableSoftVID = 1; - wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); - /* Change voltage */ - if (!cx_address) { - ACPI_FLUSH_CPU_CACHE(); - halt(); - } else { - ACPI_FLUSH_CPU_CACHE(); - /* Invoke C3 */ - inb(cx_address); - /* Dummy op - must do something useless after P_LVL3 - * read */ - t = inl(acpi_gbl_FADT.xpm_timer_block.address); - } - longhaul.bits.EnableSoftVID = 0; - wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); - } -} - -/** - * longhaul_set_cpu_frequency() - * @mults_index : bitpattern of the new multiplier. - * - * Sets a new clock ratio. - */ - -static void longhaul_setstate(unsigned int table_index) -{ - unsigned int mults_index; - int speed, mult; - struct cpufreq_freqs freqs; - unsigned long flags; - unsigned int pic1_mask, pic2_mask; - u16 bm_status = 0; - u32 bm_timeout = 1000; - unsigned int dir = 0; - - mults_index = longhaul_table[table_index].index; - /* Safety precautions */ - mult = mults[mults_index & 0x1f]; - if (mult == -1) - return; - speed = calc_speed(mult); - if ((speed > highest_speed) || (speed < lowest_speed)) - return; - /* Voltage transition before frequency transition? */ - if (can_scale_voltage && longhaul_index < table_index) - dir = 1; - - freqs.old = calc_speed(longhaul_get_cpu_mult()); - freqs.new = speed; - freqs.cpu = 0; /* longhaul.c is UP only driver */ - - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - - dprintk("Setting to FSB:%dMHz Mult:%d.%dx (%s)\n", - fsb, mult/10, mult%10, print_speed(speed/1000)); -retry_loop: - preempt_disable(); - local_irq_save(flags); - - pic2_mask = inb(0xA1); - pic1_mask = inb(0x21); /* works on C3. save mask. */ - outb(0xFF, 0xA1); /* Overkill */ - outb(0xFE, 0x21); /* TMR0 only */ - - /* Wait while PCI bus is busy. */ - if (acpi_regs_addr && (longhaul_flags & USE_NORTHBRIDGE - || ((pr != NULL) && pr->flags.bm_control))) { - bm_status = inw(acpi_regs_addr); - bm_status &= 1 << 4; - while (bm_status && bm_timeout) { - outw(1 << 4, acpi_regs_addr); - bm_timeout--; - bm_status = inw(acpi_regs_addr); - bm_status &= 1 << 4; - } - } - - if (longhaul_flags & USE_NORTHBRIDGE) { - /* Disable AGP and PCI arbiters */ - outb(3, 0x22); - } else if ((pr != NULL) && pr->flags.bm_control) { - /* Disable bus master arbitration */ - acpi_write_bit_register(ACPI_BITREG_ARB_DISABLE, 1); - } - switch (longhaul_version) { - - /* - * Longhaul v1. (Samuel[C5A] and Samuel2 stepping 0[C5B]) - * Software controlled multipliers only. - */ - case TYPE_LONGHAUL_V1: - do_longhaul1(mults_index); - break; - - /* - * Longhaul v2 appears in Samuel2 Steppings 1->7 [C5B] and Ezra [C5C] - * - * Longhaul v3 (aka Powersaver). (Ezra-T [C5M] & Nehemiah [C5N]) - * Nehemiah can do FSB scaling too, but this has never been proven - * to work in practice. - */ - case TYPE_LONGHAUL_V2: - case TYPE_POWERSAVER: - if (longhaul_flags & USE_ACPI_C3) { - /* Don't allow wakeup */ - acpi_write_bit_register(ACPI_BITREG_BUS_MASTER_RLD, 0); - do_powersaver(cx->address, mults_index, dir); - } else { - do_powersaver(0, mults_index, dir); - } - break; - } - - if (longhaul_flags & USE_NORTHBRIDGE) { - /* Enable arbiters */ - outb(0, 0x22); - } else if ((pr != NULL) && pr->flags.bm_control) { - /* Enable bus master arbitration */ - acpi_write_bit_register(ACPI_BITREG_ARB_DISABLE, 0); - } - outb(pic2_mask, 0xA1); /* restore mask */ - outb(pic1_mask, 0x21); - - local_irq_restore(flags); - preempt_enable(); - - freqs.new = calc_speed(longhaul_get_cpu_mult()); - /* Check if requested frequency is set. */ - if (unlikely(freqs.new != speed)) { - printk(KERN_INFO PFX "Failed to set requested frequency!\n"); - /* Revision ID = 1 but processor is expecting revision key - * equal to 0. Jumpers at the bottom of processor will change - * multiplier and FSB, but will not change bits in Longhaul - * MSR nor enable voltage scaling. */ - if (!revid_errata) { - printk(KERN_INFO PFX "Enabling \"Ignore Revision ID\" " - "option.\n"); - revid_errata = 1; - msleep(200); - goto retry_loop; - } - /* Why ACPI C3 sometimes doesn't work is a mystery for me. - * But it does happen. Processor is entering ACPI C3 state, - * but it doesn't change frequency. I tried poking various - * bits in northbridge registers, but without success. */ - if (longhaul_flags & USE_ACPI_C3) { - printk(KERN_INFO PFX "Disabling ACPI C3 support.\n"); - longhaul_flags &= ~USE_ACPI_C3; - if (revid_errata) { - printk(KERN_INFO PFX "Disabling \"Ignore " - "Revision ID\" option.\n"); - revid_errata = 0; - } - msleep(200); - goto retry_loop; - } - /* This shouldn't happen. Longhaul ver. 2 was reported not - * working on processors without voltage scaling, but with - * RevID = 1. RevID errata will make things right. Just - * to be 100% sure. */ - if (longhaul_version == TYPE_LONGHAUL_V2) { - printk(KERN_INFO PFX "Switching to Longhaul ver. 1\n"); - longhaul_version = TYPE_LONGHAUL_V1; - msleep(200); - goto retry_loop; - } - } - /* Report true CPU frequency */ - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - - if (!bm_timeout) - printk(KERN_INFO PFX "Warning: Timeout while waiting for " - "idle PCI bus.\n"); -} - -/* - * Centaur decided to make life a little more tricky. - * Only longhaul v1 is allowed to read EBLCR BSEL[0:1]. - * Samuel2 and above have to try and guess what the FSB is. - * We do this by assuming we booted at maximum multiplier, and interpolate - * between that value multiplied by possible FSBs and cpu_mhz which - * was calculated at boot time. Really ugly, but no other way to do this. - */ - -#define ROUNDING 0xf - -static int guess_fsb(int mult) -{ - int speed = cpu_khz / 1000; - int i; - int speeds[] = { 666, 1000, 1333, 2000 }; - int f_max, f_min; - - for (i = 0; i < 4; i++) { - f_max = ((speeds[i] * mult) + 50) / 100; - f_max += (ROUNDING / 2); - f_min = f_max - ROUNDING; - if ((speed <= f_max) && (speed >= f_min)) - return speeds[i] / 10; - } - return 0; -} - - -static int __cpuinit longhaul_get_ranges(void) -{ - unsigned int i, j, k = 0; - unsigned int ratio; - int mult; - - /* Get current frequency */ - mult = longhaul_get_cpu_mult(); - if (mult == -1) { - printk(KERN_INFO PFX "Invalid (reserved) multiplier!\n"); - return -EINVAL; - } - fsb = guess_fsb(mult); - if (fsb == 0) { - printk(KERN_INFO PFX "Invalid (reserved) FSB!\n"); - return -EINVAL; - } - /* Get max multiplier - as we always did. - * Longhaul MSR is usefull only when voltage scaling is enabled. - * C3 is booting at max anyway. */ - maxmult = mult; - /* Get min multiplier */ - switch (cpu_model) { - case CPU_NEHEMIAH: - minmult = 50; - break; - case CPU_NEHEMIAH_C: - minmult = 40; - break; - default: - minmult = 30; - break; - } - - dprintk("MinMult:%d.%dx MaxMult:%d.%dx\n", - minmult/10, minmult%10, maxmult/10, maxmult%10); - - highest_speed = calc_speed(maxmult); - lowest_speed = calc_speed(minmult); - dprintk("FSB:%dMHz Lowest speed: %s Highest speed:%s\n", fsb, - print_speed(lowest_speed/1000), - print_speed(highest_speed/1000)); - - if (lowest_speed == highest_speed) { - printk(KERN_INFO PFX "highestspeed == lowest, aborting.\n"); - return -EINVAL; - } - if (lowest_speed > highest_speed) { - printk(KERN_INFO PFX "nonsense! lowest (%d > %d) !\n", - lowest_speed, highest_speed); - return -EINVAL; - } - - longhaul_table = kmalloc((numscales + 1) * sizeof(*longhaul_table), - GFP_KERNEL); - if (!longhaul_table) - return -ENOMEM; - - for (j = 0; j < numscales; j++) { - ratio = mults[j]; - if (ratio == -1) - continue; - if (ratio > maxmult || ratio < minmult) - continue; - longhaul_table[k].frequency = calc_speed(ratio); - longhaul_table[k].index = j; - k++; - } - if (k <= 1) { - kfree(longhaul_table); - return -ENODEV; - } - /* Sort */ - for (j = 0; j < k - 1; j++) { - unsigned int min_f, min_i; - min_f = longhaul_table[j].frequency; - min_i = j; - for (i = j + 1; i < k; i++) { - if (longhaul_table[i].frequency < min_f) { - min_f = longhaul_table[i].frequency; - min_i = i; - } - } - if (min_i != j) { - swap(longhaul_table[j].frequency, - longhaul_table[min_i].frequency); - swap(longhaul_table[j].index, - longhaul_table[min_i].index); - } - } - - longhaul_table[k].frequency = CPUFREQ_TABLE_END; - - /* Find index we are running on */ - for (j = 0; j < k; j++) { - if (mults[longhaul_table[j].index & 0x1f] == mult) { - longhaul_index = j; - break; - } - } - return 0; -} - - -static void __cpuinit longhaul_setup_voltagescaling(void) -{ - union msr_longhaul longhaul; - struct mV_pos minvid, maxvid, vid; - unsigned int j, speed, pos, kHz_step, numvscales; - int min_vid_speed; - - rdmsrl(MSR_VIA_LONGHAUL, longhaul.val); - if (!(longhaul.bits.RevisionID & 1)) { - printk(KERN_INFO PFX "Voltage scaling not supported by CPU.\n"); - return; - } - - if (!longhaul.bits.VRMRev) { - printk(KERN_INFO PFX "VRM 8.5\n"); - vrm_mV_table = &vrm85_mV[0]; - mV_vrm_table = &mV_vrm85[0]; - } else { - printk(KERN_INFO PFX "Mobile VRM\n"); - if (cpu_model < CPU_NEHEMIAH) - return; - vrm_mV_table = &mobilevrm_mV[0]; - mV_vrm_table = &mV_mobilevrm[0]; - } - - minvid = vrm_mV_table[longhaul.bits.MinimumVID]; - maxvid = vrm_mV_table[longhaul.bits.MaximumVID]; - - if (minvid.mV == 0 || maxvid.mV == 0 || minvid.mV > maxvid.mV) { - printk(KERN_INFO PFX "Bogus values Min:%d.%03d Max:%d.%03d. " - "Voltage scaling disabled.\n", - minvid.mV/1000, minvid.mV%1000, - maxvid.mV/1000, maxvid.mV%1000); - return; - } - - if (minvid.mV == maxvid.mV) { - printk(KERN_INFO PFX "Claims to support voltage scaling but " - "min & max are both %d.%03d. " - "Voltage scaling disabled\n", - maxvid.mV/1000, maxvid.mV%1000); - return; - } - - /* How many voltage steps*/ - numvscales = maxvid.pos - minvid.pos + 1; - printk(KERN_INFO PFX - "Max VID=%d.%03d " - "Min VID=%d.%03d, " - "%d possible voltage scales\n", - maxvid.mV/1000, maxvid.mV%1000, - minvid.mV/1000, minvid.mV%1000, - numvscales); - - /* Calculate max frequency at min voltage */ - j = longhaul.bits.MinMHzBR; - if (longhaul.bits.MinMHzBR4) - j += 16; - min_vid_speed = eblcr[j]; - if (min_vid_speed == -1) - return; - switch (longhaul.bits.MinMHzFSB) { - case 0: - min_vid_speed *= 13333; - break; - case 1: - min_vid_speed *= 10000; - break; - case 3: - min_vid_speed *= 6666; - break; - default: - return; - break; - } - if (min_vid_speed >= highest_speed) - return; - /* Calculate kHz for one voltage step */ - kHz_step = (highest_speed - min_vid_speed) / numvscales; - - j = 0; - while (longhaul_table[j].frequency != CPUFREQ_TABLE_END) { - speed = longhaul_table[j].frequency; - if (speed > min_vid_speed) - pos = (speed - min_vid_speed) / kHz_step + minvid.pos; - else - pos = minvid.pos; - longhaul_table[j].index |= mV_vrm_table[pos] << 8; - vid = vrm_mV_table[mV_vrm_table[pos]]; - printk(KERN_INFO PFX "f: %d kHz, index: %d, vid: %d mV\n", - speed, j, vid.mV); - j++; - } - - can_scale_voltage = 1; - printk(KERN_INFO PFX "Voltage scaling enabled.\n"); -} - - -static int longhaul_verify(struct cpufreq_policy *policy) -{ - return cpufreq_frequency_table_verify(policy, longhaul_table); -} - - -static int longhaul_target(struct cpufreq_policy *policy, - unsigned int target_freq, unsigned int relation) -{ - unsigned int table_index = 0; - unsigned int i; - unsigned int dir = 0; - u8 vid, current_vid; - - if (cpufreq_frequency_table_target(policy, longhaul_table, target_freq, - relation, &table_index)) - return -EINVAL; - - /* Don't set same frequency again */ - if (longhaul_index == table_index) - return 0; - - if (!can_scale_voltage) - longhaul_setstate(table_index); - else { - /* On test system voltage transitions exceeding single - * step up or down were turning motherboard off. Both - * "ondemand" and "userspace" are unsafe. C7 is doing - * this in hardware, C3 is old and we need to do this - * in software. */ - i = longhaul_index; - current_vid = (longhaul_table[longhaul_index].index >> 8); - current_vid &= 0x1f; - if (table_index > longhaul_index) - dir = 1; - while (i != table_index) { - vid = (longhaul_table[i].index >> 8) & 0x1f; - if (vid != current_vid) { - longhaul_setstate(i); - current_vid = vid; - msleep(200); - } - if (dir) - i++; - else - i--; - } - longhaul_setstate(table_index); - } - longhaul_index = table_index; - return 0; -} - - -static unsigned int longhaul_get(unsigned int cpu) -{ - if (cpu) - return 0; - return calc_speed(longhaul_get_cpu_mult()); -} - -static acpi_status longhaul_walk_callback(acpi_handle obj_handle, - u32 nesting_level, - void *context, void **return_value) -{ - struct acpi_device *d; - - if (acpi_bus_get_device(obj_handle, &d)) - return 0; - - *return_value = acpi_driver_data(d); - return 1; -} - -/* VIA don't support PM2 reg, but have something similar */ -static int enable_arbiter_disable(void) -{ - struct pci_dev *dev; - int status = 1; - int reg; - u8 pci_cmd; - - /* Find PLE133 host bridge */ - reg = 0x78; - dev = pci_get_device(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8601_0, - NULL); - /* Find PM133/VT8605 host bridge */ - if (dev == NULL) - dev = pci_get_device(PCI_VENDOR_ID_VIA, - PCI_DEVICE_ID_VIA_8605_0, NULL); - /* Find CLE266 host bridge */ - if (dev == NULL) { - reg = 0x76; - dev = pci_get_device(PCI_VENDOR_ID_VIA, - PCI_DEVICE_ID_VIA_862X_0, NULL); - /* Find CN400 V-Link host bridge */ - if (dev == NULL) - dev = pci_get_device(PCI_VENDOR_ID_VIA, 0x7259, NULL); - } - if (dev != NULL) { - /* Enable access to port 0x22 */ - pci_read_config_byte(dev, reg, &pci_cmd); - if (!(pci_cmd & 1<<7)) { - pci_cmd |= 1<<7; - pci_write_config_byte(dev, reg, pci_cmd); - pci_read_config_byte(dev, reg, &pci_cmd); - if (!(pci_cmd & 1<<7)) { - printk(KERN_ERR PFX - "Can't enable access to port 0x22.\n"); - status = 0; - } - } - pci_dev_put(dev); - return status; - } - return 0; -} - -static int longhaul_setup_southbridge(void) -{ - struct pci_dev *dev; - u8 pci_cmd; - - /* Find VT8235 southbridge */ - dev = pci_get_device(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235, NULL); - if (dev == NULL) - /* Find VT8237 southbridge */ - dev = pci_get_device(PCI_VENDOR_ID_VIA, - PCI_DEVICE_ID_VIA_8237, NULL); - if (dev != NULL) { - /* Set transition time to max */ - pci_read_config_byte(dev, 0xec, &pci_cmd); - pci_cmd &= ~(1 << 2); - pci_write_config_byte(dev, 0xec, pci_cmd); - pci_read_config_byte(dev, 0xe4, &pci_cmd); - pci_cmd &= ~(1 << 7); - pci_write_config_byte(dev, 0xe4, pci_cmd); - pci_read_config_byte(dev, 0xe5, &pci_cmd); - pci_cmd |= 1 << 7; - pci_write_config_byte(dev, 0xe5, pci_cmd); - /* Get address of ACPI registers block*/ - pci_read_config_byte(dev, 0x81, &pci_cmd); - if (pci_cmd & 1 << 7) { - pci_read_config_dword(dev, 0x88, &acpi_regs_addr); - acpi_regs_addr &= 0xff00; - printk(KERN_INFO PFX "ACPI I/O at 0x%x\n", - acpi_regs_addr); - } - - pci_dev_put(dev); - return 1; - } - return 0; -} - -static int __cpuinit longhaul_cpu_init(struct cpufreq_policy *policy) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - char *cpuname = NULL; - int ret; - u32 lo, hi; - - /* Check what we have on this motherboard */ - switch (c->x86_model) { - case 6: - cpu_model = CPU_SAMUEL; - cpuname = "C3 'Samuel' [C5A]"; - longhaul_version = TYPE_LONGHAUL_V1; - memcpy(mults, samuel1_mults, sizeof(samuel1_mults)); - memcpy(eblcr, samuel1_eblcr, sizeof(samuel1_eblcr)); - break; - - case 7: - switch (c->x86_mask) { - case 0: - longhaul_version = TYPE_LONGHAUL_V1; - cpu_model = CPU_SAMUEL2; - cpuname = "C3 'Samuel 2' [C5B]"; - /* Note, this is not a typo, early Samuel2's had - * Samuel1 ratios. */ - memcpy(mults, samuel1_mults, sizeof(samuel1_mults)); - memcpy(eblcr, samuel2_eblcr, sizeof(samuel2_eblcr)); - break; - case 1 ... 15: - longhaul_version = TYPE_LONGHAUL_V2; - if (c->x86_mask < 8) { - cpu_model = CPU_SAMUEL2; - cpuname = "C3 'Samuel 2' [C5B]"; - } else { - cpu_model = CPU_EZRA; - cpuname = "C3 'Ezra' [C5C]"; - } - memcpy(mults, ezra_mults, sizeof(ezra_mults)); - memcpy(eblcr, ezra_eblcr, sizeof(ezra_eblcr)); - break; - } - break; - - case 8: - cpu_model = CPU_EZRA_T; - cpuname = "C3 'Ezra-T' [C5M]"; - longhaul_version = TYPE_POWERSAVER; - numscales = 32; - memcpy(mults, ezrat_mults, sizeof(ezrat_mults)); - memcpy(eblcr, ezrat_eblcr, sizeof(ezrat_eblcr)); - break; - - case 9: - longhaul_version = TYPE_POWERSAVER; - numscales = 32; - memcpy(mults, nehemiah_mults, sizeof(nehemiah_mults)); - memcpy(eblcr, nehemiah_eblcr, sizeof(nehemiah_eblcr)); - switch (c->x86_mask) { - case 0 ... 1: - cpu_model = CPU_NEHEMIAH; - cpuname = "C3 'Nehemiah A' [C5XLOE]"; - break; - case 2 ... 4: - cpu_model = CPU_NEHEMIAH; - cpuname = "C3 'Nehemiah B' [C5XLOH]"; - break; - case 5 ... 15: - cpu_model = CPU_NEHEMIAH_C; - cpuname = "C3 'Nehemiah C' [C5P]"; - break; - } - break; - - default: - cpuname = "Unknown"; - break; - } - /* Check Longhaul ver. 2 */ - if (longhaul_version == TYPE_LONGHAUL_V2) { - rdmsr(MSR_VIA_LONGHAUL, lo, hi); - if (lo == 0 && hi == 0) - /* Looks like MSR isn't present */ - longhaul_version = TYPE_LONGHAUL_V1; - } - - printk(KERN_INFO PFX "VIA %s CPU detected. ", cpuname); - switch (longhaul_version) { - case TYPE_LONGHAUL_V1: - case TYPE_LONGHAUL_V2: - printk(KERN_CONT "Longhaul v%d supported.\n", longhaul_version); - break; - case TYPE_POWERSAVER: - printk(KERN_CONT "Powersaver supported.\n"); - break; - }; - - /* Doesn't hurt */ - longhaul_setup_southbridge(); - - /* Find ACPI data for processor */ - acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT, - ACPI_UINT32_MAX, &longhaul_walk_callback, NULL, - NULL, (void *)&pr); - - /* Check ACPI support for C3 state */ - if (pr != NULL && longhaul_version == TYPE_POWERSAVER) { - cx = &pr->power.states[ACPI_STATE_C3]; - if (cx->address > 0 && cx->latency <= 1000) - longhaul_flags |= USE_ACPI_C3; - } - /* Disable if it isn't working */ - if (disable_acpi_c3) - longhaul_flags &= ~USE_ACPI_C3; - /* Check if northbridge is friendly */ - if (enable_arbiter_disable()) - longhaul_flags |= USE_NORTHBRIDGE; - - /* Check ACPI support for bus master arbiter disable */ - if (!(longhaul_flags & USE_ACPI_C3 - || longhaul_flags & USE_NORTHBRIDGE) - && ((pr == NULL) || !(pr->flags.bm_control))) { - printk(KERN_ERR PFX - "No ACPI support. Unsupported northbridge.\n"); - return -ENODEV; - } - - if (longhaul_flags & USE_NORTHBRIDGE) - printk(KERN_INFO PFX "Using northbridge support.\n"); - if (longhaul_flags & USE_ACPI_C3) - printk(KERN_INFO PFX "Using ACPI support.\n"); - - ret = longhaul_get_ranges(); - if (ret != 0) - return ret; - - if ((longhaul_version != TYPE_LONGHAUL_V1) && (scale_voltage != 0)) - longhaul_setup_voltagescaling(); - - policy->cpuinfo.transition_latency = 200000; /* nsec */ - policy->cur = calc_speed(longhaul_get_cpu_mult()); - - ret = cpufreq_frequency_table_cpuinfo(policy, longhaul_table); - if (ret) - return ret; - - cpufreq_frequency_table_get_attr(longhaul_table, policy->cpu); - - return 0; -} - -static int __devexit longhaul_cpu_exit(struct cpufreq_policy *policy) -{ - cpufreq_frequency_table_put_attr(policy->cpu); - return 0; -} - -static struct freq_attr *longhaul_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - NULL, -}; - -static struct cpufreq_driver longhaul_driver = { - .verify = longhaul_verify, - .target = longhaul_target, - .get = longhaul_get, - .init = longhaul_cpu_init, - .exit = __devexit_p(longhaul_cpu_exit), - .name = "longhaul", - .owner = THIS_MODULE, - .attr = longhaul_attr, -}; - - -static int __init longhaul_init(void) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - - if (c->x86_vendor != X86_VENDOR_CENTAUR || c->x86 != 6) - return -ENODEV; - -#ifdef CONFIG_SMP - if (num_online_cpus() > 1) { - printk(KERN_ERR PFX "More than 1 CPU detected, " - "longhaul disabled.\n"); - return -ENODEV; - } -#endif -#ifdef CONFIG_X86_IO_APIC - if (cpu_has_apic) { - printk(KERN_ERR PFX "APIC detected. Longhaul is currently " - "broken in this configuration.\n"); - return -ENODEV; - } -#endif - switch (c->x86_model) { - case 6 ... 9: - return cpufreq_register_driver(&longhaul_driver); - case 10: - printk(KERN_ERR PFX "Use acpi-cpufreq driver for VIA C7\n"); - default: - ; - } - - return -ENODEV; -} - - -static void __exit longhaul_exit(void) -{ - int i; - - for (i = 0; i < numscales; i++) { - if (mults[i] == maxmult) { - longhaul_setstate(i); - break; - } - } - - cpufreq_unregister_driver(&longhaul_driver); - kfree(longhaul_table); -} - -/* Even if BIOS is exporting ACPI C3 state, and it is used - * with success when CPU is idle, this state doesn't - * trigger frequency transition in some cases. */ -module_param(disable_acpi_c3, int, 0644); -MODULE_PARM_DESC(disable_acpi_c3, "Don't use ACPI C3 support"); -/* Change CPU voltage with frequency. Very usefull to save - * power, but most VIA C3 processors aren't supporting it. */ -module_param(scale_voltage, int, 0644); -MODULE_PARM_DESC(scale_voltage, "Scale voltage of processor"); -/* Force revision key to 0 for processors which doesn't - * support voltage scaling, but are introducing itself as - * such. */ -module_param(revid_errata, int, 0644); -MODULE_PARM_DESC(revid_errata, "Ignore CPU Revision ID"); - -MODULE_AUTHOR("Dave Jones <davej@redhat.com>"); -MODULE_DESCRIPTION("Longhaul driver for VIA Cyrix processors."); -MODULE_LICENSE("GPL"); - -late_initcall(longhaul_init); -module_exit(longhaul_exit); diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.h b/arch/x86/kernel/cpu/cpufreq/longhaul.h deleted file mode 100644 index cbf48fbca88..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/longhaul.h +++ /dev/null @@ -1,353 +0,0 @@ -/* - * longhaul.h - * (C) 2003 Dave Jones. - * - * Licensed under the terms of the GNU GPL License version 2. - * - * VIA-specific information - */ - -union msr_bcr2 { - struct { - unsigned Reseved:19, // 18:0 - ESOFTBF:1, // 19 - Reserved2:3, // 22:20 - CLOCKMUL:4, // 26:23 - Reserved3:5; // 31:27 - } bits; - unsigned long val; -}; - -union msr_longhaul { - struct { - unsigned RevisionID:4, // 3:0 - RevisionKey:4, // 7:4 - EnableSoftBusRatio:1, // 8 - EnableSoftVID:1, // 9 - EnableSoftBSEL:1, // 10 - Reserved:3, // 11:13 - SoftBusRatio4:1, // 14 - VRMRev:1, // 15 - SoftBusRatio:4, // 19:16 - SoftVID:5, // 24:20 - Reserved2:3, // 27:25 - SoftBSEL:2, // 29:28 - Reserved3:2, // 31:30 - MaxMHzBR:4, // 35:32 - MaximumVID:5, // 40:36 - MaxMHzFSB:2, // 42:41 - MaxMHzBR4:1, // 43 - Reserved4:4, // 47:44 - MinMHzBR:4, // 51:48 - MinimumVID:5, // 56:52 - MinMHzFSB:2, // 58:57 - MinMHzBR4:1, // 59 - Reserved5:4; // 63:60 - } bits; - unsigned long long val; -}; - -/* - * Clock ratio tables. Div/Mod by 10 to get ratio. - * The eblcr values specify the ratio read from the CPU. - * The mults values specify what to write to the CPU. - */ - -/* - * VIA C3 Samuel 1 & Samuel 2 (stepping 0) - */ -static const int __cpuinitdata samuel1_mults[16] = { - -1, /* 0000 -> RESERVED */ - 30, /* 0001 -> 3.0x */ - 40, /* 0010 -> 4.0x */ - -1, /* 0011 -> RESERVED */ - -1, /* 0100 -> RESERVED */ - 35, /* 0101 -> 3.5x */ - 45, /* 0110 -> 4.5x */ - 55, /* 0111 -> 5.5x */ - 60, /* 1000 -> 6.0x */ - 70, /* 1001 -> 7.0x */ - 80, /* 1010 -> 8.0x */ - 50, /* 1011 -> 5.0x */ - 65, /* 1100 -> 6.5x */ - 75, /* 1101 -> 7.5x */ - -1, /* 1110 -> RESERVED */ - -1, /* 1111 -> RESERVED */ -}; - -static const int __cpuinitdata samuel1_eblcr[16] = { - 50, /* 0000 -> RESERVED */ - 30, /* 0001 -> 3.0x */ - 40, /* 0010 -> 4.0x */ - -1, /* 0011 -> RESERVED */ - 55, /* 0100 -> 5.5x */ - 35, /* 0101 -> 3.5x */ - 45, /* 0110 -> 4.5x */ - -1, /* 0111 -> RESERVED */ - -1, /* 1000 -> RESERVED */ - 70, /* 1001 -> 7.0x */ - 80, /* 1010 -> 8.0x */ - 60, /* 1011 -> 6.0x */ - -1, /* 1100 -> RESERVED */ - 75, /* 1101 -> 7.5x */ - -1, /* 1110 -> RESERVED */ - 65, /* 1111 -> 6.5x */ -}; - -/* - * VIA C3 Samuel2 Stepping 1->15 - */ -static const int __cpuinitdata samuel2_eblcr[16] = { - 50, /* 0000 -> 5.0x */ - 30, /* 0001 -> 3.0x */ - 40, /* 0010 -> 4.0x */ - 100, /* 0011 -> 10.0x */ - 55, /* 0100 -> 5.5x */ - 35, /* 0101 -> 3.5x */ - 45, /* 0110 -> 4.5x */ - 110, /* 0111 -> 11.0x */ - 90, /* 1000 -> 9.0x */ - 70, /* 1001 -> 7.0x */ - 80, /* 1010 -> 8.0x */ - 60, /* 1011 -> 6.0x */ - 120, /* 1100 -> 12.0x */ - 75, /* 1101 -> 7.5x */ - 130, /* 1110 -> 13.0x */ - 65, /* 1111 -> 6.5x */ -}; - -/* - * VIA C3 Ezra - */ -static const int __cpuinitdata ezra_mults[16] = { - 100, /* 0000 -> 10.0x */ - 30, /* 0001 -> 3.0x */ - 40, /* 0010 -> 4.0x */ - 90, /* 0011 -> 9.0x */ - 95, /* 0100 -> 9.5x */ - 35, /* 0101 -> 3.5x */ - 45, /* 0110 -> 4.5x */ - 55, /* 0111 -> 5.5x */ - 60, /* 1000 -> 6.0x */ - 70, /* 1001 -> 7.0x */ - 80, /* 1010 -> 8.0x */ - 50, /* 1011 -> 5.0x */ - 65, /* 1100 -> 6.5x */ - 75, /* 1101 -> 7.5x */ - 85, /* 1110 -> 8.5x */ - 120, /* 1111 -> 12.0x */ -}; - -static const int __cpuinitdata ezra_eblcr[16] = { - 50, /* 0000 -> 5.0x */ - 30, /* 0001 -> 3.0x */ - 40, /* 0010 -> 4.0x */ - 100, /* 0011 -> 10.0x */ - 55, /* 0100 -> 5.5x */ - 35, /* 0101 -> 3.5x */ - 45, /* 0110 -> 4.5x */ - 95, /* 0111 -> 9.5x */ - 90, /* 1000 -> 9.0x */ - 70, /* 1001 -> 7.0x */ - 80, /* 1010 -> 8.0x */ - 60, /* 1011 -> 6.0x */ - 120, /* 1100 -> 12.0x */ - 75, /* 1101 -> 7.5x */ - 85, /* 1110 -> 8.5x */ - 65, /* 1111 -> 6.5x */ -}; - -/* - * VIA C3 (Ezra-T) [C5M]. - */ -static const int __cpuinitdata ezrat_mults[32] = { - 100, /* 0000 -> 10.0x */ - 30, /* 0001 -> 3.0x */ - 40, /* 0010 -> 4.0x */ - 90, /* 0011 -> 9.0x */ - 95, /* 0100 -> 9.5x */ - 35, /* 0101 -> 3.5x */ - 45, /* 0110 -> 4.5x */ - 55, /* 0111 -> 5.5x */ - 60, /* 1000 -> 6.0x */ - 70, /* 1001 -> 7.0x */ - 80, /* 1010 -> 8.0x */ - 50, /* 1011 -> 5.0x */ - 65, /* 1100 -> 6.5x */ - 75, /* 1101 -> 7.5x */ - 85, /* 1110 -> 8.5x */ - 120, /* 1111 -> 12.0x */ - - -1, /* 0000 -> RESERVED (10.0x) */ - 110, /* 0001 -> 11.0x */ - -1, /* 0010 -> 12.0x */ - -1, /* 0011 -> RESERVED (9.0x)*/ - 105, /* 0100 -> 10.5x */ - 115, /* 0101 -> 11.5x */ - 125, /* 0110 -> 12.5x */ - 135, /* 0111 -> 13.5x */ - 140, /* 1000 -> 14.0x */ - 150, /* 1001 -> 15.0x */ - 160, /* 1010 -> 16.0x */ - 130, /* 1011 -> 13.0x */ - 145, /* 1100 -> 14.5x */ - 155, /* 1101 -> 15.5x */ - -1, /* 1110 -> RESERVED (13.0x) */ - -1, /* 1111 -> RESERVED (12.0x) */ -}; - -static const int __cpuinitdata ezrat_eblcr[32] = { - 50, /* 0000 -> 5.0x */ - 30, /* 0001 -> 3.0x */ - 40, /* 0010 -> 4.0x */ - 100, /* 0011 -> 10.0x */ - 55, /* 0100 -> 5.5x */ - 35, /* 0101 -> 3.5x */ - 45, /* 0110 -> 4.5x */ - 95, /* 0111 -> 9.5x */ - 90, /* 1000 -> 9.0x */ - 70, /* 1001 -> 7.0x */ - 80, /* 1010 -> 8.0x */ - 60, /* 1011 -> 6.0x */ - 120, /* 1100 -> 12.0x */ - 75, /* 1101 -> 7.5x */ - 85, /* 1110 -> 8.5x */ - 65, /* 1111 -> 6.5x */ - - -1, /* 0000 -> RESERVED (9.0x) */ - 110, /* 0001 -> 11.0x */ - 120, /* 0010 -> 12.0x */ - -1, /* 0011 -> RESERVED (10.0x)*/ - 135, /* 0100 -> 13.5x */ - 115, /* 0101 -> 11.5x */ - 125, /* 0110 -> 12.5x */ - 105, /* 0111 -> 10.5x */ - 130, /* 1000 -> 13.0x */ - 150, /* 1001 -> 15.0x */ - 160, /* 1010 -> 16.0x */ - 140, /* 1011 -> 14.0x */ - -1, /* 1100 -> RESERVED (12.0x) */ - 155, /* 1101 -> 15.5x */ - -1, /* 1110 -> RESERVED (13.0x) */ - 145, /* 1111 -> 14.5x */ -}; - -/* - * VIA C3 Nehemiah */ - -static const int __cpuinitdata nehemiah_mults[32] = { - 100, /* 0000 -> 10.0x */ - -1, /* 0001 -> 16.0x */ - 40, /* 0010 -> 4.0x */ - 90, /* 0011 -> 9.0x */ - 95, /* 0100 -> 9.5x */ - -1, /* 0101 -> RESERVED */ - 45, /* 0110 -> 4.5x */ - 55, /* 0111 -> 5.5x */ - 60, /* 1000 -> 6.0x */ - 70, /* 1001 -> 7.0x */ - 80, /* 1010 -> 8.0x */ - 50, /* 1011 -> 5.0x */ - 65, /* 1100 -> 6.5x */ - 75, /* 1101 -> 7.5x */ - 85, /* 1110 -> 8.5x */ - 120, /* 1111 -> 12.0x */ - -1, /* 0000 -> 10.0x */ - 110, /* 0001 -> 11.0x */ - -1, /* 0010 -> 12.0x */ - -1, /* 0011 -> 9.0x */ - 105, /* 0100 -> 10.5x */ - 115, /* 0101 -> 11.5x */ - 125, /* 0110 -> 12.5x */ - 135, /* 0111 -> 13.5x */ - 140, /* 1000 -> 14.0x */ - 150, /* 1001 -> 15.0x */ - 160, /* 1010 -> 16.0x */ - 130, /* 1011 -> 13.0x */ - 145, /* 1100 -> 14.5x */ - 155, /* 1101 -> 15.5x */ - -1, /* 1110 -> RESERVED (13.0x) */ - -1, /* 1111 -> 12.0x */ -}; - -static const int __cpuinitdata nehemiah_eblcr[32] = { - 50, /* 0000 -> 5.0x */ - 160, /* 0001 -> 16.0x */ - 40, /* 0010 -> 4.0x */ - 100, /* 0011 -> 10.0x */ - 55, /* 0100 -> 5.5x */ - -1, /* 0101 -> RESERVED */ - 45, /* 0110 -> 4.5x */ - 95, /* 0111 -> 9.5x */ - 90, /* 1000 -> 9.0x */ - 70, /* 1001 -> 7.0x */ - 80, /* 1010 -> 8.0x */ - 60, /* 1011 -> 6.0x */ - 120, /* 1100 -> 12.0x */ - 75, /* 1101 -> 7.5x */ - 85, /* 1110 -> 8.5x */ - 65, /* 1111 -> 6.5x */ - 90, /* 0000 -> 9.0x */ - 110, /* 0001 -> 11.0x */ - 120, /* 0010 -> 12.0x */ - 100, /* 0011 -> 10.0x */ - 135, /* 0100 -> 13.5x */ - 115, /* 0101 -> 11.5x */ - 125, /* 0110 -> 12.5x */ - 105, /* 0111 -> 10.5x */ - 130, /* 1000 -> 13.0x */ - 150, /* 1001 -> 15.0x */ - 160, /* 1010 -> 16.0x */ - 140, /* 1011 -> 14.0x */ - 120, /* 1100 -> 12.0x */ - 155, /* 1101 -> 15.5x */ - -1, /* 1110 -> RESERVED (13.0x) */ - 145 /* 1111 -> 14.5x */ -}; - -/* - * Voltage scales. Div/Mod by 1000 to get actual voltage. - * Which scale to use depends on the VRM type in use. - */ - -struct mV_pos { - unsigned short mV; - unsigned short pos; -}; - -static const struct mV_pos __cpuinitdata vrm85_mV[32] = { - {1250, 8}, {1200, 6}, {1150, 4}, {1100, 2}, - {1050, 0}, {1800, 30}, {1750, 28}, {1700, 26}, - {1650, 24}, {1600, 22}, {1550, 20}, {1500, 18}, - {1450, 16}, {1400, 14}, {1350, 12}, {1300, 10}, - {1275, 9}, {1225, 7}, {1175, 5}, {1125, 3}, - {1075, 1}, {1825, 31}, {1775, 29}, {1725, 27}, - {1675, 25}, {1625, 23}, {1575, 21}, {1525, 19}, - {1475, 17}, {1425, 15}, {1375, 13}, {1325, 11} -}; - -static const unsigned char __cpuinitdata mV_vrm85[32] = { - 0x04, 0x14, 0x03, 0x13, 0x02, 0x12, 0x01, 0x11, - 0x00, 0x10, 0x0f, 0x1f, 0x0e, 0x1e, 0x0d, 0x1d, - 0x0c, 0x1c, 0x0b, 0x1b, 0x0a, 0x1a, 0x09, 0x19, - 0x08, 0x18, 0x07, 0x17, 0x06, 0x16, 0x05, 0x15 -}; - -static const struct mV_pos __cpuinitdata mobilevrm_mV[32] = { - {1750, 31}, {1700, 30}, {1650, 29}, {1600, 28}, - {1550, 27}, {1500, 26}, {1450, 25}, {1400, 24}, - {1350, 23}, {1300, 22}, {1250, 21}, {1200, 20}, - {1150, 19}, {1100, 18}, {1050, 17}, {1000, 16}, - {975, 15}, {950, 14}, {925, 13}, {900, 12}, - {875, 11}, {850, 10}, {825, 9}, {800, 8}, - {775, 7}, {750, 6}, {725, 5}, {700, 4}, - {675, 3}, {650, 2}, {625, 1}, {600, 0} -}; - -static const unsigned char __cpuinitdata mV_mobilevrm[32] = { - 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18, - 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, - 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, - 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00 -}; - diff --git a/arch/x86/kernel/cpu/cpufreq/longrun.c b/arch/x86/kernel/cpu/cpufreq/longrun.c deleted file mode 100644 index d9f51367666..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/longrun.c +++ /dev/null @@ -1,327 +0,0 @@ -/* - * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de> - * - * Licensed under the terms of the GNU GPL License version 2. - * - * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous* - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/cpufreq.h> -#include <linux/timex.h> - -#include <asm/msr.h> -#include <asm/processor.h> - -#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ - "longrun", msg) - -static struct cpufreq_driver longrun_driver; - -/** - * longrun_{low,high}_freq is needed for the conversion of cpufreq kHz - * values into per cent values. In TMTA microcode, the following is valid: - * performance_pctg = (current_freq - low_freq)/(high_freq - low_freq) - */ -static unsigned int longrun_low_freq, longrun_high_freq; - - -/** - * longrun_get_policy - get the current LongRun policy - * @policy: struct cpufreq_policy where current policy is written into - * - * Reads the current LongRun policy by access to MSR_TMTA_LONGRUN_FLAGS - * and MSR_TMTA_LONGRUN_CTRL - */ -static void __cpuinit longrun_get_policy(struct cpufreq_policy *policy) -{ - u32 msr_lo, msr_hi; - - rdmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi); - dprintk("longrun flags are %x - %x\n", msr_lo, msr_hi); - if (msr_lo & 0x01) - policy->policy = CPUFREQ_POLICY_PERFORMANCE; - else - policy->policy = CPUFREQ_POLICY_POWERSAVE; - - rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi); - dprintk("longrun ctrl is %x - %x\n", msr_lo, msr_hi); - msr_lo &= 0x0000007F; - msr_hi &= 0x0000007F; - - if (longrun_high_freq <= longrun_low_freq) { - /* Assume degenerate Longrun table */ - policy->min = policy->max = longrun_high_freq; - } else { - policy->min = longrun_low_freq + msr_lo * - ((longrun_high_freq - longrun_low_freq) / 100); - policy->max = longrun_low_freq + msr_hi * - ((longrun_high_freq - longrun_low_freq) / 100); - } - policy->cpu = 0; -} - - -/** - * longrun_set_policy - sets a new CPUFreq policy - * @policy: new policy - * - * Sets a new CPUFreq policy on LongRun-capable processors. This function - * has to be called with cpufreq_driver locked. - */ -static int longrun_set_policy(struct cpufreq_policy *policy) -{ - u32 msr_lo, msr_hi; - u32 pctg_lo, pctg_hi; - - if (!policy) - return -EINVAL; - - if (longrun_high_freq <= longrun_low_freq) { - /* Assume degenerate Longrun table */ - pctg_lo = pctg_hi = 100; - } else { - pctg_lo = (policy->min - longrun_low_freq) / - ((longrun_high_freq - longrun_low_freq) / 100); - pctg_hi = (policy->max - longrun_low_freq) / - ((longrun_high_freq - longrun_low_freq) / 100); - } - - if (pctg_hi > 100) - pctg_hi = 100; - if (pctg_lo > pctg_hi) - pctg_lo = pctg_hi; - - /* performance or economy mode */ - rdmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi); - msr_lo &= 0xFFFFFFFE; - switch (policy->policy) { - case CPUFREQ_POLICY_PERFORMANCE: - msr_lo |= 0x00000001; - break; - case CPUFREQ_POLICY_POWERSAVE: - break; - } - wrmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi); - - /* lower and upper boundary */ - rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi); - msr_lo &= 0xFFFFFF80; - msr_hi &= 0xFFFFFF80; - msr_lo |= pctg_lo; - msr_hi |= pctg_hi; - wrmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi); - - return 0; -} - - -/** - * longrun_verify_poliy - verifies a new CPUFreq policy - * @policy: the policy to verify - * - * Validates a new CPUFreq policy. This function has to be called with - * cpufreq_driver locked. - */ -static int longrun_verify_policy(struct cpufreq_policy *policy) -{ - if (!policy) - return -EINVAL; - - policy->cpu = 0; - cpufreq_verify_within_limits(policy, - policy->cpuinfo.min_freq, - policy->cpuinfo.max_freq); - - if ((policy->policy != CPUFREQ_POLICY_POWERSAVE) && - (policy->policy != CPUFREQ_POLICY_PERFORMANCE)) - return -EINVAL; - - return 0; -} - -static unsigned int longrun_get(unsigned int cpu) -{ - u32 eax, ebx, ecx, edx; - - if (cpu) - return 0; - - cpuid(0x80860007, &eax, &ebx, &ecx, &edx); - dprintk("cpuid eax is %u\n", eax); - - return eax * 1000; -} - -/** - * longrun_determine_freqs - determines the lowest and highest possible core frequency - * @low_freq: an int to put the lowest frequency into - * @high_freq: an int to put the highest frequency into - * - * Determines the lowest and highest possible core frequencies on this CPU. - * This is necessary to calculate the performance percentage according to - * TMTA rules: - * performance_pctg = (target_freq - low_freq)/(high_freq - low_freq) - */ -static int __cpuinit longrun_determine_freqs(unsigned int *low_freq, - unsigned int *high_freq) -{ - u32 msr_lo, msr_hi; - u32 save_lo, save_hi; - u32 eax, ebx, ecx, edx; - u32 try_hi; - struct cpuinfo_x86 *c = &cpu_data(0); - - if (!low_freq || !high_freq) - return -EINVAL; - - if (cpu_has(c, X86_FEATURE_LRTI)) { - /* if the LongRun Table Interface is present, the - * detection is a bit easier: - * For minimum frequency, read out the maximum - * level (msr_hi), write that into "currently - * selected level", and read out the frequency. - * For maximum frequency, read out level zero. - */ - /* minimum */ - rdmsr(MSR_TMTA_LRTI_READOUT, msr_lo, msr_hi); - wrmsr(MSR_TMTA_LRTI_READOUT, msr_hi, msr_hi); - rdmsr(MSR_TMTA_LRTI_VOLT_MHZ, msr_lo, msr_hi); - *low_freq = msr_lo * 1000; /* to kHz */ - - /* maximum */ - wrmsr(MSR_TMTA_LRTI_READOUT, 0, msr_hi); - rdmsr(MSR_TMTA_LRTI_VOLT_MHZ, msr_lo, msr_hi); - *high_freq = msr_lo * 1000; /* to kHz */ - - dprintk("longrun table interface told %u - %u kHz\n", - *low_freq, *high_freq); - - if (*low_freq > *high_freq) - *low_freq = *high_freq; - return 0; - } - - /* set the upper border to the value determined during TSC init */ - *high_freq = (cpu_khz / 1000); - *high_freq = *high_freq * 1000; - dprintk("high frequency is %u kHz\n", *high_freq); - - /* get current borders */ - rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi); - save_lo = msr_lo & 0x0000007F; - save_hi = msr_hi & 0x0000007F; - - /* if current perf_pctg is larger than 90%, we need to decrease the - * upper limit to make the calculation more accurate. - */ - cpuid(0x80860007, &eax, &ebx, &ecx, &edx); - /* try decreasing in 10% steps, some processors react only - * on some barrier values */ - for (try_hi = 80; try_hi > 0 && ecx > 90; try_hi -= 10) { - /* set to 0 to try_hi perf_pctg */ - msr_lo &= 0xFFFFFF80; - msr_hi &= 0xFFFFFF80; - msr_hi |= try_hi; - wrmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi); - - /* read out current core MHz and current perf_pctg */ - cpuid(0x80860007, &eax, &ebx, &ecx, &edx); - - /* restore values */ - wrmsr(MSR_TMTA_LONGRUN_CTRL, save_lo, save_hi); - } - dprintk("percentage is %u %%, freq is %u MHz\n", ecx, eax); - - /* performance_pctg = (current_freq - low_freq)/(high_freq - low_freq) - * eqals - * low_freq * (1 - perf_pctg) = (cur_freq - high_freq * perf_pctg) - * - * high_freq * perf_pctg is stored tempoarily into "ebx". - */ - ebx = (((cpu_khz / 1000) * ecx) / 100); /* to MHz */ - - if ((ecx > 95) || (ecx == 0) || (eax < ebx)) - return -EIO; - - edx = ((eax - ebx) * 100) / (100 - ecx); - *low_freq = edx * 1000; /* back to kHz */ - - dprintk("low frequency is %u kHz\n", *low_freq); - - if (*low_freq > *high_freq) - *low_freq = *high_freq; - - return 0; -} - - -static int __cpuinit longrun_cpu_init(struct cpufreq_policy *policy) -{ - int result = 0; - - /* capability check */ - if (policy->cpu != 0) - return -ENODEV; - - /* detect low and high frequency */ - result = longrun_determine_freqs(&longrun_low_freq, &longrun_high_freq); - if (result) - return result; - - /* cpuinfo and default policy values */ - policy->cpuinfo.min_freq = longrun_low_freq; - policy->cpuinfo.max_freq = longrun_high_freq; - policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; - longrun_get_policy(policy); - - return 0; -} - - -static struct cpufreq_driver longrun_driver = { - .flags = CPUFREQ_CONST_LOOPS, - .verify = longrun_verify_policy, - .setpolicy = longrun_set_policy, - .get = longrun_get, - .init = longrun_cpu_init, - .name = "longrun", - .owner = THIS_MODULE, -}; - - -/** - * longrun_init - initializes the Transmeta Crusoe LongRun CPUFreq driver - * - * Initializes the LongRun support. - */ -static int __init longrun_init(void) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - - if (c->x86_vendor != X86_VENDOR_TRANSMETA || - !cpu_has(c, X86_FEATURE_LONGRUN)) - return -ENODEV; - - return cpufreq_register_driver(&longrun_driver); -} - - -/** - * longrun_exit - unregisters LongRun support - */ -static void __exit longrun_exit(void) -{ - cpufreq_unregister_driver(&longrun_driver); -} - - -MODULE_AUTHOR("Dominik Brodowski <linux@brodo.de>"); -MODULE_DESCRIPTION("LongRun driver for Transmeta Crusoe and " - "Efficeon processors."); -MODULE_LICENSE("GPL"); - -module_init(longrun_init); -module_exit(longrun_exit); diff --git a/arch/x86/kernel/cpu/cpufreq/mperf.c b/arch/x86/kernel/cpu/cpufreq/mperf.c deleted file mode 100644 index 911e193018a..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/mperf.c +++ /dev/null @@ -1,51 +0,0 @@ -#include <linux/kernel.h> -#include <linux/smp.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/cpufreq.h> -#include <linux/slab.h> - -#include "mperf.h" - -static DEFINE_PER_CPU(struct aperfmperf, acfreq_old_perf); - -/* Called via smp_call_function_single(), on the target CPU */ -static void read_measured_perf_ctrs(void *_cur) -{ - struct aperfmperf *am = _cur; - - get_aperfmperf(am); -} - -/* - * Return the measured active (C0) frequency on this CPU since last call - * to this function. - * Input: cpu number - * Return: Average CPU frequency in terms of max frequency (zero on error) - * - * We use IA32_MPERF and IA32_APERF MSRs to get the measured performance - * over a period of time, while CPU is in C0 state. - * IA32_MPERF counts at the rate of max advertised frequency - * IA32_APERF counts at the rate of actual CPU frequency - * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and - * no meaning should be associated with absolute values of these MSRs. - */ -unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy, - unsigned int cpu) -{ - struct aperfmperf perf; - unsigned long ratio; - unsigned int retval; - - if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1)) - return 0; - - ratio = calc_aperfmperf_ratio(&per_cpu(acfreq_old_perf, cpu), &perf); - per_cpu(acfreq_old_perf, cpu) = perf; - - retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT; - - return retval; -} -EXPORT_SYMBOL_GPL(cpufreq_get_measured_perf); -MODULE_LICENSE("GPL"); diff --git a/arch/x86/kernel/cpu/cpufreq/mperf.h b/arch/x86/kernel/cpu/cpufreq/mperf.h deleted file mode 100644 index 5dbf2950dc2..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/mperf.h +++ /dev/null @@ -1,9 +0,0 @@ -/* - * (c) 2010 Advanced Micro Devices, Inc. - * Your use of this code is subject to the terms and conditions of the - * GNU general public license version 2. See "COPYING" or - * http://www.gnu.org/licenses/gpl.html - */ - -unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy, - unsigned int cpu); diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c deleted file mode 100644 index bd1cac747f6..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c +++ /dev/null @@ -1,331 +0,0 @@ -/* - * Pentium 4/Xeon CPU on demand clock modulation/speed scaling - * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de> - * (C) 2002 Zwane Mwaikambo <zwane@commfireservices.com> - * (C) 2002 Arjan van de Ven <arjanv@redhat.com> - * (C) 2002 Tora T. Engstad - * All Rights Reserved - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * The author(s) of this software shall not be held liable for damages - * of any nature resulting due to the use of this software. This - * software is provided AS-IS with no warranties. - * - * Date Errata Description - * 20020525 N44, O17 12.5% or 25% DC causes lockup - * - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/smp.h> -#include <linux/cpufreq.h> -#include <linux/cpumask.h> -#include <linux/timex.h> - -#include <asm/processor.h> -#include <asm/msr.h> -#include <asm/timer.h> - -#include "speedstep-lib.h" - -#define PFX "p4-clockmod: " -#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ - "p4-clockmod", msg) - -/* - * Duty Cycle (3bits), note DC_DISABLE is not specified in - * intel docs i just use it to mean disable - */ -enum { - DC_RESV, DC_DFLT, DC_25PT, DC_38PT, DC_50PT, - DC_64PT, DC_75PT, DC_88PT, DC_DISABLE -}; - -#define DC_ENTRIES 8 - - -static int has_N44_O17_errata[NR_CPUS]; -static unsigned int stock_freq; -static struct cpufreq_driver p4clockmod_driver; -static unsigned int cpufreq_p4_get(unsigned int cpu); - -static int cpufreq_p4_setdc(unsigned int cpu, unsigned int newstate) -{ - u32 l, h; - - if (!cpu_online(cpu) || - (newstate > DC_DISABLE) || (newstate == DC_RESV)) - return -EINVAL; - - rdmsr_on_cpu(cpu, MSR_IA32_THERM_STATUS, &l, &h); - - if (l & 0x01) - dprintk("CPU#%d currently thermal throttled\n", cpu); - - if (has_N44_O17_errata[cpu] && - (newstate == DC_25PT || newstate == DC_DFLT)) - newstate = DC_38PT; - - rdmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, &l, &h); - if (newstate == DC_DISABLE) { - dprintk("CPU#%d disabling modulation\n", cpu); - wrmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, l & ~(1<<4), h); - } else { - dprintk("CPU#%d setting duty cycle to %d%%\n", - cpu, ((125 * newstate) / 10)); - /* bits 63 - 5 : reserved - * bit 4 : enable/disable - * bits 3-1 : duty cycle - * bit 0 : reserved - */ - l = (l & ~14); - l = l | (1<<4) | ((newstate & 0x7)<<1); - wrmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, l, h); - } - - return 0; -} - - -static struct cpufreq_frequency_table p4clockmod_table[] = { - {DC_RESV, CPUFREQ_ENTRY_INVALID}, - {DC_DFLT, 0}, - {DC_25PT, 0}, - {DC_38PT, 0}, - {DC_50PT, 0}, - {DC_64PT, 0}, - {DC_75PT, 0}, - {DC_88PT, 0}, - {DC_DISABLE, 0}, - {DC_RESV, CPUFREQ_TABLE_END}, -}; - - -static int cpufreq_p4_target(struct cpufreq_policy *policy, - unsigned int target_freq, - unsigned int relation) -{ - unsigned int newstate = DC_RESV; - struct cpufreq_freqs freqs; - int i; - - if (cpufreq_frequency_table_target(policy, &p4clockmod_table[0], - target_freq, relation, &newstate)) - return -EINVAL; - - freqs.old = cpufreq_p4_get(policy->cpu); - freqs.new = stock_freq * p4clockmod_table[newstate].index / 8; - - if (freqs.new == freqs.old) - return 0; - - /* notifiers */ - for_each_cpu(i, policy->cpus) { - freqs.cpu = i; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - } - - /* run on each logical CPU, - * see section 13.15.3 of IA32 Intel Architecture Software - * Developer's Manual, Volume 3 - */ - for_each_cpu(i, policy->cpus) - cpufreq_p4_setdc(i, p4clockmod_table[newstate].index); - - /* notifiers */ - for_each_cpu(i, policy->cpus) { - freqs.cpu = i; - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - } - - return 0; -} - - -static int cpufreq_p4_verify(struct cpufreq_policy *policy) -{ - return cpufreq_frequency_table_verify(policy, &p4clockmod_table[0]); -} - - -static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c) -{ - if (c->x86 == 0x06) { - if (cpu_has(c, X86_FEATURE_EST)) - printk(KERN_WARNING PFX "Warning: EST-capable CPU " - "detected. The acpi-cpufreq module offers " - "voltage scaling in addition of frequency " - "scaling. You should use that instead of " - "p4-clockmod, if possible.\n"); - switch (c->x86_model) { - case 0x0E: /* Core */ - case 0x0F: /* Core Duo */ - case 0x16: /* Celeron Core */ - case 0x1C: /* Atom */ - p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS; - return speedstep_get_frequency(SPEEDSTEP_CPU_PCORE); - case 0x0D: /* Pentium M (Dothan) */ - p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS; - /* fall through */ - case 0x09: /* Pentium M (Banias) */ - return speedstep_get_frequency(SPEEDSTEP_CPU_PM); - } - } - - if (c->x86 != 0xF) - return 0; - - /* on P-4s, the TSC runs with constant frequency independent whether - * throttling is active or not. */ - p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS; - - if (speedstep_detect_processor() == SPEEDSTEP_CPU_P4M) { - printk(KERN_WARNING PFX "Warning: Pentium 4-M detected. " - "The speedstep-ich or acpi cpufreq modules offer " - "voltage scaling in addition of frequency scaling. " - "You should use either one instead of p4-clockmod, " - "if possible.\n"); - return speedstep_get_frequency(SPEEDSTEP_CPU_P4M); - } - - return speedstep_get_frequency(SPEEDSTEP_CPU_P4D); -} - - - -static int cpufreq_p4_cpu_init(struct cpufreq_policy *policy) -{ - struct cpuinfo_x86 *c = &cpu_data(policy->cpu); - int cpuid = 0; - unsigned int i; - -#ifdef CONFIG_SMP - cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu)); -#endif - - /* Errata workaround */ - cpuid = (c->x86 << 8) | (c->x86_model << 4) | c->x86_mask; - switch (cpuid) { - case 0x0f07: - case 0x0f0a: - case 0x0f11: - case 0x0f12: - has_N44_O17_errata[policy->cpu] = 1; - dprintk("has errata -- disabling low frequencies\n"); - } - - if (speedstep_detect_processor() == SPEEDSTEP_CPU_P4D && - c->x86_model < 2) { - /* switch to maximum frequency and measure result */ - cpufreq_p4_setdc(policy->cpu, DC_DISABLE); - recalibrate_cpu_khz(); - } - /* get max frequency */ - stock_freq = cpufreq_p4_get_frequency(c); - if (!stock_freq) - return -EINVAL; - - /* table init */ - for (i = 1; (p4clockmod_table[i].frequency != CPUFREQ_TABLE_END); i++) { - if ((i < 2) && (has_N44_O17_errata[policy->cpu])) - p4clockmod_table[i].frequency = CPUFREQ_ENTRY_INVALID; - else - p4clockmod_table[i].frequency = (stock_freq * i)/8; - } - cpufreq_frequency_table_get_attr(p4clockmod_table, policy->cpu); - - /* cpuinfo and default policy values */ - - /* the transition latency is set to be 1 higher than the maximum - * transition latency of the ondemand governor */ - policy->cpuinfo.transition_latency = 10000001; - policy->cur = stock_freq; - - return cpufreq_frequency_table_cpuinfo(policy, &p4clockmod_table[0]); -} - - -static int cpufreq_p4_cpu_exit(struct cpufreq_policy *policy) -{ - cpufreq_frequency_table_put_attr(policy->cpu); - return 0; -} - -static unsigned int cpufreq_p4_get(unsigned int cpu) -{ - u32 l, h; - - rdmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, &l, &h); - - if (l & 0x10) { - l = l >> 1; - l &= 0x7; - } else - l = DC_DISABLE; - - if (l != DC_DISABLE) - return stock_freq * l / 8; - - return stock_freq; -} - -static struct freq_attr *p4clockmod_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - NULL, -}; - -static struct cpufreq_driver p4clockmod_driver = { - .verify = cpufreq_p4_verify, - .target = cpufreq_p4_target, - .init = cpufreq_p4_cpu_init, - .exit = cpufreq_p4_cpu_exit, - .get = cpufreq_p4_get, - .name = "p4-clockmod", - .owner = THIS_MODULE, - .attr = p4clockmod_attr, -}; - - -static int __init cpufreq_p4_init(void) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - int ret; - - /* - * THERM_CONTROL is architectural for IA32 now, so - * we can rely on the capability checks - */ - if (c->x86_vendor != X86_VENDOR_INTEL) - return -ENODEV; - - if (!test_cpu_cap(c, X86_FEATURE_ACPI) || - !test_cpu_cap(c, X86_FEATURE_ACC)) - return -ENODEV; - - ret = cpufreq_register_driver(&p4clockmod_driver); - if (!ret) - printk(KERN_INFO PFX "P4/Xeon(TM) CPU On-Demand Clock " - "Modulation available\n"); - - return ret; -} - - -static void __exit cpufreq_p4_exit(void) -{ - cpufreq_unregister_driver(&p4clockmod_driver); -} - - -MODULE_AUTHOR("Zwane Mwaikambo <zwane@commfireservices.com>"); -MODULE_DESCRIPTION("cpufreq driver for Pentium(TM) 4/Xeon(TM)"); -MODULE_LICENSE("GPL"); - -late_initcall(cpufreq_p4_init); -module_exit(cpufreq_p4_exit); diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c deleted file mode 100644 index 4f6f679f279..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c +++ /dev/null @@ -1,626 +0,0 @@ -/* - * pcc-cpufreq.c - Processor Clocking Control firmware cpufreq interface - * - * Copyright (C) 2009 Red Hat, Matthew Garrett <mjg@redhat.com> - * Copyright (C) 2009 Hewlett-Packard Development Company, L.P. - * Nagananda Chumbalkar <nagananda.chumbalkar@hp.com> - * - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; version 2 of the License. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or NON - * INFRINGEMENT. See the GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write to the Free Software Foundation, Inc., - * 675 Mass Ave, Cambridge, MA 02139, USA. - * - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/smp.h> -#include <linux/sched.h> -#include <linux/cpufreq.h> -#include <linux/compiler.h> -#include <linux/slab.h> - -#include <linux/acpi.h> -#include <linux/io.h> -#include <linux/spinlock.h> -#include <linux/uaccess.h> - -#include <acpi/processor.h> - -#define PCC_VERSION "1.00.00" -#define POLL_LOOPS 300 - -#define CMD_COMPLETE 0x1 -#define CMD_GET_FREQ 0x0 -#define CMD_SET_FREQ 0x1 - -#define BUF_SZ 4 - -#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ - "pcc-cpufreq", msg) - -struct pcc_register_resource { - u8 descriptor; - u16 length; - u8 space_id; - u8 bit_width; - u8 bit_offset; - u8 access_size; - u64 address; -} __attribute__ ((packed)); - -struct pcc_memory_resource { - u8 descriptor; - u16 length; - u8 space_id; - u8 resource_usage; - u8 type_specific; - u64 granularity; - u64 minimum; - u64 maximum; - u64 translation_offset; - u64 address_length; -} __attribute__ ((packed)); - -static struct cpufreq_driver pcc_cpufreq_driver; - -struct pcc_header { - u32 signature; - u16 length; - u8 major; - u8 minor; - u32 features; - u16 command; - u16 status; - u32 latency; - u32 minimum_time; - u32 maximum_time; - u32 nominal; - u32 throttled_frequency; - u32 minimum_frequency; -}; - -static void __iomem *pcch_virt_addr; -static struct pcc_header __iomem *pcch_hdr; - -static DEFINE_SPINLOCK(pcc_lock); - -static struct acpi_generic_address doorbell; - -static u64 doorbell_preserve; -static u64 doorbell_write; - -static u8 OSC_UUID[16] = {0x63, 0x9B, 0x2C, 0x9F, 0x70, 0x91, 0x49, 0x1f, - 0xBB, 0x4F, 0xA5, 0x98, 0x2F, 0xA1, 0xB5, 0x46}; - -struct pcc_cpu { - u32 input_offset; - u32 output_offset; -}; - -static struct pcc_cpu __percpu *pcc_cpu_info; - -static int pcc_cpufreq_verify(struct cpufreq_policy *policy) -{ - cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq, - policy->cpuinfo.max_freq); - return 0; -} - -static inline void pcc_cmd(void) -{ - u64 doorbell_value; - int i; - - acpi_read(&doorbell_value, &doorbell); - acpi_write((doorbell_value & doorbell_preserve) | doorbell_write, - &doorbell); - - for (i = 0; i < POLL_LOOPS; i++) { - if (ioread16(&pcch_hdr->status) & CMD_COMPLETE) - break; - } -} - -static inline void pcc_clear_mapping(void) -{ - if (pcch_virt_addr) - iounmap(pcch_virt_addr); - pcch_virt_addr = NULL; -} - -static unsigned int pcc_get_freq(unsigned int cpu) -{ - struct pcc_cpu *pcc_cpu_data; - unsigned int curr_freq; - unsigned int freq_limit; - u16 status; - u32 input_buffer; - u32 output_buffer; - - spin_lock(&pcc_lock); - - dprintk("get: get_freq for CPU %d\n", cpu); - pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu); - - input_buffer = 0x1; - iowrite32(input_buffer, - (pcch_virt_addr + pcc_cpu_data->input_offset)); - iowrite16(CMD_GET_FREQ, &pcch_hdr->command); - - pcc_cmd(); - - output_buffer = - ioread32(pcch_virt_addr + pcc_cpu_data->output_offset); - - /* Clear the input buffer - we are done with the current command */ - memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ); - - status = ioread16(&pcch_hdr->status); - if (status != CMD_COMPLETE) { - dprintk("get: FAILED: for CPU %d, status is %d\n", - cpu, status); - goto cmd_incomplete; - } - iowrite16(0, &pcch_hdr->status); - curr_freq = (((ioread32(&pcch_hdr->nominal) * (output_buffer & 0xff)) - / 100) * 1000); - - dprintk("get: SUCCESS: (virtual) output_offset for cpu %d is " - "0x%x, contains a value of: 0x%x. Speed is: %d MHz\n", - cpu, (pcch_virt_addr + pcc_cpu_data->output_offset), - output_buffer, curr_freq); - - freq_limit = (output_buffer >> 8) & 0xff; - if (freq_limit != 0xff) { - dprintk("get: frequency for cpu %d is being temporarily" - " capped at %d\n", cpu, curr_freq); - } - - spin_unlock(&pcc_lock); - return curr_freq; - -cmd_incomplete: - iowrite16(0, &pcch_hdr->status); - spin_unlock(&pcc_lock); - return -EINVAL; -} - -static int pcc_cpufreq_target(struct cpufreq_policy *policy, - unsigned int target_freq, - unsigned int relation) -{ - struct pcc_cpu *pcc_cpu_data; - struct cpufreq_freqs freqs; - u16 status; - u32 input_buffer; - int cpu; - - spin_lock(&pcc_lock); - cpu = policy->cpu; - pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu); - - dprintk("target: CPU %d should go to target freq: %d " - "(virtual) input_offset is 0x%x\n", - cpu, target_freq, - (pcch_virt_addr + pcc_cpu_data->input_offset)); - - freqs.new = target_freq; - freqs.cpu = cpu; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - - input_buffer = 0x1 | (((target_freq * 100) - / (ioread32(&pcch_hdr->nominal) * 1000)) << 8); - iowrite32(input_buffer, - (pcch_virt_addr + pcc_cpu_data->input_offset)); - iowrite16(CMD_SET_FREQ, &pcch_hdr->command); - - pcc_cmd(); - - /* Clear the input buffer - we are done with the current command */ - memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ); - - status = ioread16(&pcch_hdr->status); - if (status != CMD_COMPLETE) { - dprintk("target: FAILED for cpu %d, with status: 0x%x\n", - cpu, status); - goto cmd_incomplete; - } - iowrite16(0, &pcch_hdr->status); - - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - dprintk("target: was SUCCESSFUL for cpu %d\n", cpu); - spin_unlock(&pcc_lock); - - return 0; - -cmd_incomplete: - iowrite16(0, &pcch_hdr->status); - spin_unlock(&pcc_lock); - return -EINVAL; -} - -static int pcc_get_offset(int cpu) -{ - acpi_status status; - struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL}; - union acpi_object *pccp, *offset; - struct pcc_cpu *pcc_cpu_data; - struct acpi_processor *pr; - int ret = 0; - - pr = per_cpu(processors, cpu); - pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu); - - status = acpi_evaluate_object(pr->handle, "PCCP", NULL, &buffer); - if (ACPI_FAILURE(status)) - return -ENODEV; - - pccp = buffer.pointer; - if (!pccp || pccp->type != ACPI_TYPE_PACKAGE) { - ret = -ENODEV; - goto out_free; - }; - - offset = &(pccp->package.elements[0]); - if (!offset || offset->type != ACPI_TYPE_INTEGER) { - ret = -ENODEV; - goto out_free; - } - - pcc_cpu_data->input_offset = offset->integer.value; - - offset = &(pccp->package.elements[1]); - if (!offset || offset->type != ACPI_TYPE_INTEGER) { - ret = -ENODEV; - goto out_free; - } - - pcc_cpu_data->output_offset = offset->integer.value; - - memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ); - memset_io((pcch_virt_addr + pcc_cpu_data->output_offset), 0, BUF_SZ); - - dprintk("pcc_get_offset: for CPU %d: pcc_cpu_data " - "input_offset: 0x%x, pcc_cpu_data output_offset: 0x%x\n", - cpu, pcc_cpu_data->input_offset, pcc_cpu_data->output_offset); -out_free: - kfree(buffer.pointer); - return ret; -} - -static int __init pcc_cpufreq_do_osc(acpi_handle *handle) -{ - acpi_status status; - struct acpi_object_list input; - struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL}; - union acpi_object in_params[4]; - union acpi_object *out_obj; - u32 capabilities[2]; - u32 errors; - u32 supported; - int ret = 0; - - input.count = 4; - input.pointer = in_params; - input.count = 4; - input.pointer = in_params; - in_params[0].type = ACPI_TYPE_BUFFER; - in_params[0].buffer.length = 16; - in_params[0].buffer.pointer = OSC_UUID; - in_params[1].type = ACPI_TYPE_INTEGER; - in_params[1].integer.value = 1; - in_params[2].type = ACPI_TYPE_INTEGER; - in_params[2].integer.value = 2; - in_params[3].type = ACPI_TYPE_BUFFER; - in_params[3].buffer.length = 8; - in_params[3].buffer.pointer = (u8 *)&capabilities; - - capabilities[0] = OSC_QUERY_ENABLE; - capabilities[1] = 0x1; - - status = acpi_evaluate_object(*handle, "_OSC", &input, &output); - if (ACPI_FAILURE(status)) - return -ENODEV; - - if (!output.length) - return -ENODEV; - - out_obj = output.pointer; - if (out_obj->type != ACPI_TYPE_BUFFER) { - ret = -ENODEV; - goto out_free; - } - - errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0); - if (errors) { - ret = -ENODEV; - goto out_free; - } - - supported = *((u32 *)(out_obj->buffer.pointer + 4)); - if (!(supported & 0x1)) { - ret = -ENODEV; - goto out_free; - } - - kfree(output.pointer); - capabilities[0] = 0x0; - capabilities[1] = 0x1; - - status = acpi_evaluate_object(*handle, "_OSC", &input, &output); - if (ACPI_FAILURE(status)) - return -ENODEV; - - if (!output.length) - return -ENODEV; - - out_obj = output.pointer; - if (out_obj->type != ACPI_TYPE_BUFFER) { - ret = -ENODEV; - goto out_free; - } - - errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0); - if (errors) { - ret = -ENODEV; - goto out_free; - } - - supported = *((u32 *)(out_obj->buffer.pointer + 4)); - if (!(supported & 0x1)) { - ret = -ENODEV; - goto out_free; - } - -out_free: - kfree(output.pointer); - return ret; -} - -static int __init pcc_cpufreq_probe(void) -{ - acpi_status status; - struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL}; - struct pcc_memory_resource *mem_resource; - struct pcc_register_resource *reg_resource; - union acpi_object *out_obj, *member; - acpi_handle handle, osc_handle, pcch_handle; - int ret = 0; - - status = acpi_get_handle(NULL, "\\_SB", &handle); - if (ACPI_FAILURE(status)) - return -ENODEV; - - status = acpi_get_handle(handle, "PCCH", &pcch_handle); - if (ACPI_FAILURE(status)) - return -ENODEV; - - status = acpi_get_handle(handle, "_OSC", &osc_handle); - if (ACPI_SUCCESS(status)) { - ret = pcc_cpufreq_do_osc(&osc_handle); - if (ret) - dprintk("probe: _OSC evaluation did not succeed\n"); - /* Firmware's use of _OSC is optional */ - ret = 0; - } - - status = acpi_evaluate_object(handle, "PCCH", NULL, &output); - if (ACPI_FAILURE(status)) - return -ENODEV; - - out_obj = output.pointer; - if (out_obj->type != ACPI_TYPE_PACKAGE) { - ret = -ENODEV; - goto out_free; - } - - member = &out_obj->package.elements[0]; - if (member->type != ACPI_TYPE_BUFFER) { - ret = -ENODEV; - goto out_free; - } - - mem_resource = (struct pcc_memory_resource *)member->buffer.pointer; - - dprintk("probe: mem_resource descriptor: 0x%x," - " length: %d, space_id: %d, resource_usage: %d," - " type_specific: %d, granularity: 0x%llx," - " minimum: 0x%llx, maximum: 0x%llx," - " translation_offset: 0x%llx, address_length: 0x%llx\n", - mem_resource->descriptor, mem_resource->length, - mem_resource->space_id, mem_resource->resource_usage, - mem_resource->type_specific, mem_resource->granularity, - mem_resource->minimum, mem_resource->maximum, - mem_resource->translation_offset, - mem_resource->address_length); - - if (mem_resource->space_id != ACPI_ADR_SPACE_SYSTEM_MEMORY) { - ret = -ENODEV; - goto out_free; - } - - pcch_virt_addr = ioremap_nocache(mem_resource->minimum, - mem_resource->address_length); - if (pcch_virt_addr == NULL) { - dprintk("probe: could not map shared mem region\n"); - goto out_free; - } - pcch_hdr = pcch_virt_addr; - - dprintk("probe: PCCH header (virtual) addr: 0x%p\n", pcch_hdr); - dprintk("probe: PCCH header is at physical address: 0x%llx," - " signature: 0x%x, length: %d bytes, major: %d, minor: %d," - " supported features: 0x%x, command field: 0x%x," - " status field: 0x%x, nominal latency: %d us\n", - mem_resource->minimum, ioread32(&pcch_hdr->signature), - ioread16(&pcch_hdr->length), ioread8(&pcch_hdr->major), - ioread8(&pcch_hdr->minor), ioread32(&pcch_hdr->features), - ioread16(&pcch_hdr->command), ioread16(&pcch_hdr->status), - ioread32(&pcch_hdr->latency)); - - dprintk("probe: min time between commands: %d us," - " max time between commands: %d us," - " nominal CPU frequency: %d MHz," - " minimum CPU frequency: %d MHz," - " minimum CPU frequency without throttling: %d MHz\n", - ioread32(&pcch_hdr->minimum_time), - ioread32(&pcch_hdr->maximum_time), - ioread32(&pcch_hdr->nominal), - ioread32(&pcch_hdr->throttled_frequency), - ioread32(&pcch_hdr->minimum_frequency)); - - member = &out_obj->package.elements[1]; - if (member->type != ACPI_TYPE_BUFFER) { - ret = -ENODEV; - goto pcch_free; - } - - reg_resource = (struct pcc_register_resource *)member->buffer.pointer; - - doorbell.space_id = reg_resource->space_id; - doorbell.bit_width = reg_resource->bit_width; - doorbell.bit_offset = reg_resource->bit_offset; - doorbell.access_width = 64; - doorbell.address = reg_resource->address; - - dprintk("probe: doorbell: space_id is %d, bit_width is %d, " - "bit_offset is %d, access_width is %d, address is 0x%llx\n", - doorbell.space_id, doorbell.bit_width, doorbell.bit_offset, - doorbell.access_width, reg_resource->address); - - member = &out_obj->package.elements[2]; - if (member->type != ACPI_TYPE_INTEGER) { - ret = -ENODEV; - goto pcch_free; - } - - doorbell_preserve = member->integer.value; - - member = &out_obj->package.elements[3]; - if (member->type != ACPI_TYPE_INTEGER) { - ret = -ENODEV; - goto pcch_free; - } - - doorbell_write = member->integer.value; - - dprintk("probe: doorbell_preserve: 0x%llx," - " doorbell_write: 0x%llx\n", - doorbell_preserve, doorbell_write); - - pcc_cpu_info = alloc_percpu(struct pcc_cpu); - if (!pcc_cpu_info) { - ret = -ENOMEM; - goto pcch_free; - } - - printk(KERN_DEBUG "pcc-cpufreq: (v%s) driver loaded with frequency" - " limits: %d MHz, %d MHz\n", PCC_VERSION, - ioread32(&pcch_hdr->minimum_frequency), - ioread32(&pcch_hdr->nominal)); - kfree(output.pointer); - return ret; -pcch_free: - pcc_clear_mapping(); -out_free: - kfree(output.pointer); - return ret; -} - -static int pcc_cpufreq_cpu_init(struct cpufreq_policy *policy) -{ - unsigned int cpu = policy->cpu; - unsigned int result = 0; - - if (!pcch_virt_addr) { - result = -1; - goto out; - } - - result = pcc_get_offset(cpu); - if (result) { - dprintk("init: PCCP evaluation failed\n"); - goto out; - } - - policy->max = policy->cpuinfo.max_freq = - ioread32(&pcch_hdr->nominal) * 1000; - policy->min = policy->cpuinfo.min_freq = - ioread32(&pcch_hdr->minimum_frequency) * 1000; - policy->cur = pcc_get_freq(cpu); - - if (!policy->cur) { - dprintk("init: Unable to get current CPU frequency\n"); - result = -EINVAL; - goto out; - } - - dprintk("init: policy->max is %d, policy->min is %d\n", - policy->max, policy->min); -out: - return result; -} - -static int pcc_cpufreq_cpu_exit(struct cpufreq_policy *policy) -{ - return 0; -} - -static struct cpufreq_driver pcc_cpufreq_driver = { - .flags = CPUFREQ_CONST_LOOPS, - .get = pcc_get_freq, - .verify = pcc_cpufreq_verify, - .target = pcc_cpufreq_target, - .init = pcc_cpufreq_cpu_init, - .exit = pcc_cpufreq_cpu_exit, - .name = "pcc-cpufreq", - .owner = THIS_MODULE, -}; - -static int __init pcc_cpufreq_init(void) -{ - int ret; - - if (acpi_disabled) - return 0; - - ret = pcc_cpufreq_probe(); - if (ret) { - dprintk("pcc_cpufreq_init: PCCH evaluation failed\n"); - return ret; - } - - ret = cpufreq_register_driver(&pcc_cpufreq_driver); - - return ret; -} - -static void __exit pcc_cpufreq_exit(void) -{ - cpufreq_unregister_driver(&pcc_cpufreq_driver); - - pcc_clear_mapping(); - - free_percpu(pcc_cpu_info); -} - -MODULE_AUTHOR("Matthew Garrett, Naga Chumbalkar"); -MODULE_VERSION(PCC_VERSION); -MODULE_DESCRIPTION("Processor Clocking Control interface driver"); -MODULE_LICENSE("GPL"); - -late_initcall(pcc_cpufreq_init); -module_exit(pcc_cpufreq_exit); diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c deleted file mode 100644 index b3379d6a5c5..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c +++ /dev/null @@ -1,261 +0,0 @@ -/* - * This file was based upon code in Powertweak Linux (http://powertweak.sf.net) - * (C) 2000-2003 Dave Jones, Arjan van de Ven, Janne Pänkälä, - * Dominik Brodowski. - * - * Licensed under the terms of the GNU GPL License version 2. - * - * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous* - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/cpufreq.h> -#include <linux/ioport.h> -#include <linux/timex.h> -#include <linux/io.h> - -#include <asm/msr.h> - -#define POWERNOW_IOPORT 0xfff0 /* it doesn't matter where, as long - as it is unused */ - -#define PFX "powernow-k6: " -static unsigned int busfreq; /* FSB, in 10 kHz */ -static unsigned int max_multiplier; - - -/* Clock ratio multiplied by 10 - see table 27 in AMD#23446 */ -static struct cpufreq_frequency_table clock_ratio[] = { - {45, /* 000 -> 4.5x */ 0}, - {50, /* 001 -> 5.0x */ 0}, - {40, /* 010 -> 4.0x */ 0}, - {55, /* 011 -> 5.5x */ 0}, - {20, /* 100 -> 2.0x */ 0}, - {30, /* 101 -> 3.0x */ 0}, - {60, /* 110 -> 6.0x */ 0}, - {35, /* 111 -> 3.5x */ 0}, - {0, CPUFREQ_TABLE_END} -}; - - -/** - * powernow_k6_get_cpu_multiplier - returns the current FSB multiplier - * - * Returns the current setting of the frequency multiplier. Core clock - * speed is frequency of the Front-Side Bus multiplied with this value. - */ -static int powernow_k6_get_cpu_multiplier(void) -{ - u64 invalue = 0; - u32 msrval; - - msrval = POWERNOW_IOPORT + 0x1; - wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */ - invalue = inl(POWERNOW_IOPORT + 0x8); - msrval = POWERNOW_IOPORT + 0x0; - wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */ - - return clock_ratio[(invalue >> 5)&7].index; -} - - -/** - * powernow_k6_set_state - set the PowerNow! multiplier - * @best_i: clock_ratio[best_i] is the target multiplier - * - * Tries to change the PowerNow! multiplier - */ -static void powernow_k6_set_state(unsigned int best_i) -{ - unsigned long outvalue = 0, invalue = 0; - unsigned long msrval; - struct cpufreq_freqs freqs; - - if (clock_ratio[best_i].index > max_multiplier) { - printk(KERN_ERR PFX "invalid target frequency\n"); - return; - } - - freqs.old = busfreq * powernow_k6_get_cpu_multiplier(); - freqs.new = busfreq * clock_ratio[best_i].index; - freqs.cpu = 0; /* powernow-k6.c is UP only driver */ - - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - - /* we now need to transform best_i to the BVC format, see AMD#23446 */ - - outvalue = (1<<12) | (1<<10) | (1<<9) | (best_i<<5); - - msrval = POWERNOW_IOPORT + 0x1; - wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */ - invalue = inl(POWERNOW_IOPORT + 0x8); - invalue = invalue & 0xf; - outvalue = outvalue | invalue; - outl(outvalue , (POWERNOW_IOPORT + 0x8)); - msrval = POWERNOW_IOPORT + 0x0; - wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */ - - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - - return; -} - - -/** - * powernow_k6_verify - verifies a new CPUfreq policy - * @policy: new policy - * - * Policy must be within lowest and highest possible CPU Frequency, - * and at least one possible state must be within min and max. - */ -static int powernow_k6_verify(struct cpufreq_policy *policy) -{ - return cpufreq_frequency_table_verify(policy, &clock_ratio[0]); -} - - -/** - * powernow_k6_setpolicy - sets a new CPUFreq policy - * @policy: new policy - * @target_freq: the target frequency - * @relation: how that frequency relates to achieved frequency - * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H) - * - * sets a new CPUFreq policy - */ -static int powernow_k6_target(struct cpufreq_policy *policy, - unsigned int target_freq, - unsigned int relation) -{ - unsigned int newstate = 0; - - if (cpufreq_frequency_table_target(policy, &clock_ratio[0], - target_freq, relation, &newstate)) - return -EINVAL; - - powernow_k6_set_state(newstate); - - return 0; -} - - -static int powernow_k6_cpu_init(struct cpufreq_policy *policy) -{ - unsigned int i, f; - int result; - - if (policy->cpu != 0) - return -ENODEV; - - /* get frequencies */ - max_multiplier = powernow_k6_get_cpu_multiplier(); - busfreq = cpu_khz / max_multiplier; - - /* table init */ - for (i = 0; (clock_ratio[i].frequency != CPUFREQ_TABLE_END); i++) { - f = clock_ratio[i].index; - if (f > max_multiplier) - clock_ratio[i].frequency = CPUFREQ_ENTRY_INVALID; - else - clock_ratio[i].frequency = busfreq * f; - } - - /* cpuinfo and default policy values */ - policy->cpuinfo.transition_latency = 200000; - policy->cur = busfreq * max_multiplier; - - result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio); - if (result) - return result; - - cpufreq_frequency_table_get_attr(clock_ratio, policy->cpu); - - return 0; -} - - -static int powernow_k6_cpu_exit(struct cpufreq_policy *policy) -{ - unsigned int i; - for (i = 0; i < 8; i++) { - if (i == max_multiplier) - powernow_k6_set_state(i); - } - cpufreq_frequency_table_put_attr(policy->cpu); - return 0; -} - -static unsigned int powernow_k6_get(unsigned int cpu) -{ - unsigned int ret; - ret = (busfreq * powernow_k6_get_cpu_multiplier()); - return ret; -} - -static struct freq_attr *powernow_k6_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - NULL, -}; - -static struct cpufreq_driver powernow_k6_driver = { - .verify = powernow_k6_verify, - .target = powernow_k6_target, - .init = powernow_k6_cpu_init, - .exit = powernow_k6_cpu_exit, - .get = powernow_k6_get, - .name = "powernow-k6", - .owner = THIS_MODULE, - .attr = powernow_k6_attr, -}; - - -/** - * powernow_k6_init - initializes the k6 PowerNow! CPUFreq driver - * - * Initializes the K6 PowerNow! support. Returns -ENODEV on unsupported - * devices, -EINVAL or -ENOMEM on problems during initiatization, and zero - * on success. - */ -static int __init powernow_k6_init(void) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - - if ((c->x86_vendor != X86_VENDOR_AMD) || (c->x86 != 5) || - ((c->x86_model != 12) && (c->x86_model != 13))) - return -ENODEV; - - if (!request_region(POWERNOW_IOPORT, 16, "PowerNow!")) { - printk(KERN_INFO PFX "PowerNow IOPORT region already used.\n"); - return -EIO; - } - - if (cpufreq_register_driver(&powernow_k6_driver)) { - release_region(POWERNOW_IOPORT, 16); - return -EINVAL; - } - - return 0; -} - - -/** - * powernow_k6_exit - unregisters AMD K6-2+/3+ PowerNow! support - * - * Unregisters AMD K6-2+ / K6-3+ PowerNow! support. - */ -static void __exit powernow_k6_exit(void) -{ - cpufreq_unregister_driver(&powernow_k6_driver); - release_region(POWERNOW_IOPORT, 16); -} - - -MODULE_AUTHOR("Arjan van de Ven, Dave Jones <davej@redhat.com>, " - "Dominik Brodowski <linux@brodo.de>"); -MODULE_DESCRIPTION("PowerNow! driver for AMD K6-2+ / K6-3+ processors."); -MODULE_LICENSE("GPL"); - -module_init(powernow_k6_init); -module_exit(powernow_k6_exit); diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c deleted file mode 100644 index 4a45fd6e41b..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c +++ /dev/null @@ -1,752 +0,0 @@ -/* - * AMD K7 Powernow driver. - * (C) 2003 Dave Jones on behalf of SuSE Labs. - * (C) 2003-2004 Dave Jones <davej@redhat.com> - * - * Licensed under the terms of the GNU GPL License version 2. - * Based upon datasheets & sample CPUs kindly provided by AMD. - * - * Errata 5: - * CPU may fail to execute a FID/VID change in presence of interrupt. - * - We cli/sti on stepping A0 CPUs around the FID/VID transition. - * Errata 15: - * CPU with half frequency multipliers may hang upon wakeup from disconnect. - * - We disable half multipliers if ACPI is used on A0 stepping CPUs. - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/moduleparam.h> -#include <linux/init.h> -#include <linux/cpufreq.h> -#include <linux/slab.h> -#include <linux/string.h> -#include <linux/dmi.h> -#include <linux/timex.h> -#include <linux/io.h> - -#include <asm/timer.h> /* Needed for recalibrate_cpu_khz() */ -#include <asm/msr.h> -#include <asm/system.h> - -#ifdef CONFIG_X86_POWERNOW_K7_ACPI -#include <linux/acpi.h> -#include <acpi/processor.h> -#endif - -#include "powernow-k7.h" - -#define PFX "powernow: " - - -struct psb_s { - u8 signature[10]; - u8 tableversion; - u8 flags; - u16 settlingtime; - u8 reserved1; - u8 numpst; -}; - -struct pst_s { - u32 cpuid; - u8 fsbspeed; - u8 maxfid; - u8 startvid; - u8 numpstates; -}; - -#ifdef CONFIG_X86_POWERNOW_K7_ACPI -union powernow_acpi_control_t { - struct { - unsigned long fid:5, - vid:5, - sgtc:20, - res1:2; - } bits; - unsigned long val; -}; -#endif - -#ifdef CONFIG_CPU_FREQ_DEBUG -/* divide by 1000 to get VCore voltage in V. */ -static const int mobile_vid_table[32] = { - 2000, 1950, 1900, 1850, 1800, 1750, 1700, 1650, - 1600, 1550, 1500, 1450, 1400, 1350, 1300, 0, - 1275, 1250, 1225, 1200, 1175, 1150, 1125, 1100, - 1075, 1050, 1025, 1000, 975, 950, 925, 0, -}; -#endif - -/* divide by 10 to get FID. */ -static const int fid_codes[32] = { - 110, 115, 120, 125, 50, 55, 60, 65, - 70, 75, 80, 85, 90, 95, 100, 105, - 30, 190, 40, 200, 130, 135, 140, 210, - 150, 225, 160, 165, 170, 180, -1, -1, -}; - -/* This parameter is used in order to force ACPI instead of legacy method for - * configuration purpose. - */ - -static int acpi_force; - -static struct cpufreq_frequency_table *powernow_table; - -static unsigned int can_scale_bus; -static unsigned int can_scale_vid; -static unsigned int minimum_speed = -1; -static unsigned int maximum_speed; -static unsigned int number_scales; -static unsigned int fsb; -static unsigned int latency; -static char have_a0; - -#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ - "powernow-k7", msg) - -static int check_fsb(unsigned int fsbspeed) -{ - int delta; - unsigned int f = fsb / 1000; - - delta = (fsbspeed > f) ? fsbspeed - f : f - fsbspeed; - return delta < 5; -} - -static int check_powernow(void) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - unsigned int maxei, eax, ebx, ecx, edx; - - if ((c->x86_vendor != X86_VENDOR_AMD) || (c->x86 != 6)) { -#ifdef MODULE - printk(KERN_INFO PFX "This module only works with " - "AMD K7 CPUs\n"); -#endif - return 0; - } - - /* Get maximum capabilities */ - maxei = cpuid_eax(0x80000000); - if (maxei < 0x80000007) { /* Any powernow info ? */ -#ifdef MODULE - printk(KERN_INFO PFX "No powernow capabilities detected\n"); -#endif - return 0; - } - - if ((c->x86_model == 6) && (c->x86_mask == 0)) { - printk(KERN_INFO PFX "K7 660[A0] core detected, " - "enabling errata workarounds\n"); - have_a0 = 1; - } - - cpuid(0x80000007, &eax, &ebx, &ecx, &edx); - - /* Check we can actually do something before we say anything.*/ - if (!(edx & (1 << 1 | 1 << 2))) - return 0; - - printk(KERN_INFO PFX "PowerNOW! Technology present. Can scale: "); - - if (edx & 1 << 1) { - printk("frequency"); - can_scale_bus = 1; - } - - if ((edx & (1 << 1 | 1 << 2)) == 0x6) - printk(" and "); - - if (edx & 1 << 2) { - printk("voltage"); - can_scale_vid = 1; - } - - printk(".\n"); - return 1; -} - -#ifdef CONFIG_X86_POWERNOW_K7_ACPI -static void invalidate_entry(unsigned int entry) -{ - powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID; -} -#endif - -static int get_ranges(unsigned char *pst) -{ - unsigned int j; - unsigned int speed; - u8 fid, vid; - - powernow_table = kzalloc((sizeof(struct cpufreq_frequency_table) * - (number_scales + 1)), GFP_KERNEL); - if (!powernow_table) - return -ENOMEM; - - for (j = 0 ; j < number_scales; j++) { - fid = *pst++; - - powernow_table[j].frequency = (fsb * fid_codes[fid]) / 10; - powernow_table[j].index = fid; /* lower 8 bits */ - - speed = powernow_table[j].frequency; - - if ((fid_codes[fid] % 10) == 5) { -#ifdef CONFIG_X86_POWERNOW_K7_ACPI - if (have_a0 == 1) - invalidate_entry(j); -#endif - } - - if (speed < minimum_speed) - minimum_speed = speed; - if (speed > maximum_speed) - maximum_speed = speed; - - vid = *pst++; - powernow_table[j].index |= (vid << 8); /* upper 8 bits */ - - dprintk(" FID: 0x%x (%d.%dx [%dMHz]) " - "VID: 0x%x (%d.%03dV)\n", fid, fid_codes[fid] / 10, - fid_codes[fid] % 10, speed/1000, vid, - mobile_vid_table[vid]/1000, - mobile_vid_table[vid]%1000); - } - powernow_table[number_scales].frequency = CPUFREQ_TABLE_END; - powernow_table[number_scales].index = 0; - - return 0; -} - - -static void change_FID(int fid) -{ - union msr_fidvidctl fidvidctl; - - rdmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val); - if (fidvidctl.bits.FID != fid) { - fidvidctl.bits.SGTC = latency; - fidvidctl.bits.FID = fid; - fidvidctl.bits.VIDC = 0; - fidvidctl.bits.FIDC = 1; - wrmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val); - } -} - - -static void change_VID(int vid) -{ - union msr_fidvidctl fidvidctl; - - rdmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val); - if (fidvidctl.bits.VID != vid) { - fidvidctl.bits.SGTC = latency; - fidvidctl.bits.VID = vid; - fidvidctl.bits.FIDC = 0; - fidvidctl.bits.VIDC = 1; - wrmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val); - } -} - - -static void change_speed(unsigned int index) -{ - u8 fid, vid; - struct cpufreq_freqs freqs; - union msr_fidvidstatus fidvidstatus; - int cfid; - - /* fid are the lower 8 bits of the index we stored into - * the cpufreq frequency table in powernow_decode_bios, - * vid are the upper 8 bits. - */ - - fid = powernow_table[index].index & 0xFF; - vid = (powernow_table[index].index & 0xFF00) >> 8; - - freqs.cpu = 0; - - rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val); - cfid = fidvidstatus.bits.CFID; - freqs.old = fsb * fid_codes[cfid] / 10; - - freqs.new = powernow_table[index].frequency; - - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - - /* Now do the magic poking into the MSRs. */ - - if (have_a0 == 1) /* A0 errata 5 */ - local_irq_disable(); - - if (freqs.old > freqs.new) { - /* Going down, so change FID first */ - change_FID(fid); - change_VID(vid); - } else { - /* Going up, so change VID first */ - change_VID(vid); - change_FID(fid); - } - - - if (have_a0 == 1) - local_irq_enable(); - - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); -} - - -#ifdef CONFIG_X86_POWERNOW_K7_ACPI - -static struct acpi_processor_performance *acpi_processor_perf; - -static int powernow_acpi_init(void) -{ - int i; - int retval = 0; - union powernow_acpi_control_t pc; - - if (acpi_processor_perf != NULL && powernow_table != NULL) { - retval = -EINVAL; - goto err0; - } - - acpi_processor_perf = kzalloc(sizeof(struct acpi_processor_performance), - GFP_KERNEL); - if (!acpi_processor_perf) { - retval = -ENOMEM; - goto err0; - } - - if (!zalloc_cpumask_var(&acpi_processor_perf->shared_cpu_map, - GFP_KERNEL)) { - retval = -ENOMEM; - goto err05; - } - - if (acpi_processor_register_performance(acpi_processor_perf, 0)) { - retval = -EIO; - goto err1; - } - - if (acpi_processor_perf->control_register.space_id != - ACPI_ADR_SPACE_FIXED_HARDWARE) { - retval = -ENODEV; - goto err2; - } - - if (acpi_processor_perf->status_register.space_id != - ACPI_ADR_SPACE_FIXED_HARDWARE) { - retval = -ENODEV; - goto err2; - } - - number_scales = acpi_processor_perf->state_count; - - if (number_scales < 2) { - retval = -ENODEV; - goto err2; - } - - powernow_table = kzalloc((sizeof(struct cpufreq_frequency_table) * - (number_scales + 1)), GFP_KERNEL); - if (!powernow_table) { - retval = -ENOMEM; - goto err2; - } - - pc.val = (unsigned long) acpi_processor_perf->states[0].control; - for (i = 0; i < number_scales; i++) { - u8 fid, vid; - struct acpi_processor_px *state = - &acpi_processor_perf->states[i]; - unsigned int speed, speed_mhz; - - pc.val = (unsigned long) state->control; - dprintk("acpi: P%d: %d MHz %d mW %d uS control %08x SGTC %d\n", - i, - (u32) state->core_frequency, - (u32) state->power, - (u32) state->transition_latency, - (u32) state->control, - pc.bits.sgtc); - - vid = pc.bits.vid; - fid = pc.bits.fid; - - powernow_table[i].frequency = fsb * fid_codes[fid] / 10; - powernow_table[i].index = fid; /* lower 8 bits */ - powernow_table[i].index |= (vid << 8); /* upper 8 bits */ - - speed = powernow_table[i].frequency; - speed_mhz = speed / 1000; - - /* processor_perflib will multiply the MHz value by 1000 to - * get a KHz value (e.g. 1266000). However, powernow-k7 works - * with true KHz values (e.g. 1266768). To ensure that all - * powernow frequencies are available, we must ensure that - * ACPI doesn't restrict them, so we round up the MHz value - * to ensure that perflib's computed KHz value is greater than - * or equal to powernow's KHz value. - */ - if (speed % 1000 > 0) - speed_mhz++; - - if ((fid_codes[fid] % 10) == 5) { - if (have_a0 == 1) - invalidate_entry(i); - } - - dprintk(" FID: 0x%x (%d.%dx [%dMHz]) " - "VID: 0x%x (%d.%03dV)\n", fid, fid_codes[fid] / 10, - fid_codes[fid] % 10, speed_mhz, vid, - mobile_vid_table[vid]/1000, - mobile_vid_table[vid]%1000); - - if (state->core_frequency != speed_mhz) { - state->core_frequency = speed_mhz; - dprintk(" Corrected ACPI frequency to %d\n", - speed_mhz); - } - - if (latency < pc.bits.sgtc) - latency = pc.bits.sgtc; - - if (speed < minimum_speed) - minimum_speed = speed; - if (speed > maximum_speed) - maximum_speed = speed; - } - - powernow_table[i].frequency = CPUFREQ_TABLE_END; - powernow_table[i].index = 0; - - /* notify BIOS that we exist */ - acpi_processor_notify_smm(THIS_MODULE); - - return 0; - -err2: - acpi_processor_unregister_performance(acpi_processor_perf, 0); -err1: - free_cpumask_var(acpi_processor_perf->shared_cpu_map); -err05: - kfree(acpi_processor_perf); -err0: - printk(KERN_WARNING PFX "ACPI perflib can not be used on " - "this platform\n"); - acpi_processor_perf = NULL; - return retval; -} -#else -static int powernow_acpi_init(void) -{ - printk(KERN_INFO PFX "no support for ACPI processor found." - " Please recompile your kernel with ACPI processor\n"); - return -EINVAL; -} -#endif - -static void print_pst_entry(struct pst_s *pst, unsigned int j) -{ - dprintk("PST:%d (@%p)\n", j, pst); - dprintk(" cpuid: 0x%x fsb: %d maxFID: 0x%x startvid: 0x%x\n", - pst->cpuid, pst->fsbspeed, pst->maxfid, pst->startvid); -} - -static int powernow_decode_bios(int maxfid, int startvid) -{ - struct psb_s *psb; - struct pst_s *pst; - unsigned int i, j; - unsigned char *p; - unsigned int etuple; - unsigned int ret; - - etuple = cpuid_eax(0x80000001); - - for (i = 0xC0000; i < 0xffff0 ; i += 16) { - - p = phys_to_virt(i); - - if (memcmp(p, "AMDK7PNOW!", 10) == 0) { - dprintk("Found PSB header at %p\n", p); - psb = (struct psb_s *) p; - dprintk("Table version: 0x%x\n", psb->tableversion); - if (psb->tableversion != 0x12) { - printk(KERN_INFO PFX "Sorry, only v1.2 tables" - " supported right now\n"); - return -ENODEV; - } - - dprintk("Flags: 0x%x\n", psb->flags); - if ((psb->flags & 1) == 0) - dprintk("Mobile voltage regulator\n"); - else - dprintk("Desktop voltage regulator\n"); - - latency = psb->settlingtime; - if (latency < 100) { - printk(KERN_INFO PFX "BIOS set settling time " - "to %d microseconds. " - "Should be at least 100. " - "Correcting.\n", latency); - latency = 100; - } - dprintk("Settling Time: %d microseconds.\n", - psb->settlingtime); - dprintk("Has %d PST tables. (Only dumping ones " - "relevant to this CPU).\n", - psb->numpst); - - p += sizeof(struct psb_s); - - pst = (struct pst_s *) p; - - for (j = 0; j < psb->numpst; j++) { - pst = (struct pst_s *) p; - number_scales = pst->numpstates; - - if ((etuple == pst->cpuid) && - check_fsb(pst->fsbspeed) && - (maxfid == pst->maxfid) && - (startvid == pst->startvid)) { - print_pst_entry(pst, j); - p = (char *)pst + sizeof(struct pst_s); - ret = get_ranges(p); - return ret; - } else { - unsigned int k; - p = (char *)pst + sizeof(struct pst_s); - for (k = 0; k < number_scales; k++) - p += 2; - } - } - printk(KERN_INFO PFX "No PST tables match this cpuid " - "(0x%x)\n", etuple); - printk(KERN_INFO PFX "This is indicative of a broken " - "BIOS.\n"); - - return -EINVAL; - } - p++; - } - - return -ENODEV; -} - - -static int powernow_target(struct cpufreq_policy *policy, - unsigned int target_freq, - unsigned int relation) -{ - unsigned int newstate; - - if (cpufreq_frequency_table_target(policy, powernow_table, target_freq, - relation, &newstate)) - return -EINVAL; - - change_speed(newstate); - - return 0; -} - - -static int powernow_verify(struct cpufreq_policy *policy) -{ - return cpufreq_frequency_table_verify(policy, powernow_table); -} - -/* - * We use the fact that the bus frequency is somehow - * a multiple of 100000/3 khz, then we compute sgtc according - * to this multiple. - * That way, we match more how AMD thinks all of that work. - * We will then get the same kind of behaviour already tested under - * the "well-known" other OS. - */ -static int __cpuinit fixup_sgtc(void) -{ - unsigned int sgtc; - unsigned int m; - - m = fsb / 3333; - if ((m % 10) >= 5) - m += 5; - - m /= 10; - - sgtc = 100 * m * latency; - sgtc = sgtc / 3; - if (sgtc > 0xfffff) { - printk(KERN_WARNING PFX "SGTC too large %d\n", sgtc); - sgtc = 0xfffff; - } - return sgtc; -} - -static unsigned int powernow_get(unsigned int cpu) -{ - union msr_fidvidstatus fidvidstatus; - unsigned int cfid; - - if (cpu) - return 0; - rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val); - cfid = fidvidstatus.bits.CFID; - - return fsb * fid_codes[cfid] / 10; -} - - -static int __cpuinit acer_cpufreq_pst(const struct dmi_system_id *d) -{ - printk(KERN_WARNING PFX - "%s laptop with broken PST tables in BIOS detected.\n", - d->ident); - printk(KERN_WARNING PFX - "You need to downgrade to 3A21 (09/09/2002), or try a newer " - "BIOS than 3A71 (01/20/2003)\n"); - printk(KERN_WARNING PFX - "cpufreq scaling has been disabled as a result of this.\n"); - return 0; -} - -/* - * Some Athlon laptops have really fucked PST tables. - * A BIOS update is all that can save them. - * Mention this, and disable cpufreq. - */ -static struct dmi_system_id __cpuinitdata powernow_dmi_table[] = { - { - .callback = acer_cpufreq_pst, - .ident = "Acer Aspire", - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "Insyde Software"), - DMI_MATCH(DMI_BIOS_VERSION, "3A71"), - }, - }, - { } -}; - -static int __cpuinit powernow_cpu_init(struct cpufreq_policy *policy) -{ - union msr_fidvidstatus fidvidstatus; - int result; - - if (policy->cpu != 0) - return -ENODEV; - - rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val); - - recalibrate_cpu_khz(); - - fsb = (10 * cpu_khz) / fid_codes[fidvidstatus.bits.CFID]; - if (!fsb) { - printk(KERN_WARNING PFX "can not determine bus frequency\n"); - return -EINVAL; - } - dprintk("FSB: %3dMHz\n", fsb/1000); - - if (dmi_check_system(powernow_dmi_table) || acpi_force) { - printk(KERN_INFO PFX "PSB/PST known to be broken. " - "Trying ACPI instead\n"); - result = powernow_acpi_init(); - } else { - result = powernow_decode_bios(fidvidstatus.bits.MFID, - fidvidstatus.bits.SVID); - if (result) { - printk(KERN_INFO PFX "Trying ACPI perflib\n"); - maximum_speed = 0; - minimum_speed = -1; - latency = 0; - result = powernow_acpi_init(); - if (result) { - printk(KERN_INFO PFX - "ACPI and legacy methods failed\n"); - } - } else { - /* SGTC use the bus clock as timer */ - latency = fixup_sgtc(); - printk(KERN_INFO PFX "SGTC: %d\n", latency); - } - } - - if (result) - return result; - - printk(KERN_INFO PFX "Minimum speed %d MHz. Maximum speed %d MHz.\n", - minimum_speed/1000, maximum_speed/1000); - - policy->cpuinfo.transition_latency = - cpufreq_scale(2000000UL, fsb, latency); - - policy->cur = powernow_get(0); - - cpufreq_frequency_table_get_attr(powernow_table, policy->cpu); - - return cpufreq_frequency_table_cpuinfo(policy, powernow_table); -} - -static int powernow_cpu_exit(struct cpufreq_policy *policy) -{ - cpufreq_frequency_table_put_attr(policy->cpu); - -#ifdef CONFIG_X86_POWERNOW_K7_ACPI - if (acpi_processor_perf) { - acpi_processor_unregister_performance(acpi_processor_perf, 0); - free_cpumask_var(acpi_processor_perf->shared_cpu_map); - kfree(acpi_processor_perf); - } -#endif - - kfree(powernow_table); - return 0; -} - -static struct freq_attr *powernow_table_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - NULL, -}; - -static struct cpufreq_driver powernow_driver = { - .verify = powernow_verify, - .target = powernow_target, - .get = powernow_get, -#ifdef CONFIG_X86_POWERNOW_K7_ACPI - .bios_limit = acpi_processor_get_bios_limit, -#endif - .init = powernow_cpu_init, - .exit = powernow_cpu_exit, - .name = "powernow-k7", - .owner = THIS_MODULE, - .attr = powernow_table_attr, -}; - -static int __init powernow_init(void) -{ - if (check_powernow() == 0) - return -ENODEV; - return cpufreq_register_driver(&powernow_driver); -} - - -static void __exit powernow_exit(void) -{ - cpufreq_unregister_driver(&powernow_driver); -} - -module_param(acpi_force, int, 0444); -MODULE_PARM_DESC(acpi_force, "Force ACPI to be used."); - -MODULE_AUTHOR("Dave Jones <davej@redhat.com>"); -MODULE_DESCRIPTION("Powernow driver for AMD K7 processors."); -MODULE_LICENSE("GPL"); - -late_initcall(powernow_init); -module_exit(powernow_exit); - diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.h b/arch/x86/kernel/cpu/cpufreq/powernow-k7.h deleted file mode 100644 index 35fb4eaf6e1..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.h +++ /dev/null @@ -1,43 +0,0 @@ -/* - * (C) 2003 Dave Jones. - * - * Licensed under the terms of the GNU GPL License version 2. - * - * AMD-specific information - * - */ - -union msr_fidvidctl { - struct { - unsigned FID:5, // 4:0 - reserved1:3, // 7:5 - VID:5, // 12:8 - reserved2:3, // 15:13 - FIDC:1, // 16 - VIDC:1, // 17 - reserved3:2, // 19:18 - FIDCHGRATIO:1, // 20 - reserved4:11, // 31-21 - SGTC:20, // 32:51 - reserved5:12; // 63:52 - } bits; - unsigned long long val; -}; - -union msr_fidvidstatus { - struct { - unsigned CFID:5, // 4:0 - reserved1:3, // 7:5 - SFID:5, // 12:8 - reserved2:3, // 15:13 - MFID:5, // 20:16 - reserved3:11, // 31:21 - CVID:5, // 36:32 - reserved4:3, // 39:37 - SVID:5, // 44:40 - reserved5:3, // 47:45 - MVID:5, // 52:48 - reserved6:11; // 63:53 - } bits; - unsigned long long val; -}; diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c deleted file mode 100644 index 491977baf6c..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ /dev/null @@ -1,1601 +0,0 @@ -/* - * (c) 2003-2010 Advanced Micro Devices, Inc. - * Your use of this code is subject to the terms and conditions of the - * GNU general public license version 2. See "COPYING" or - * http://www.gnu.org/licenses/gpl.html - * - * Support : mark.langsdorf@amd.com - * - * Based on the powernow-k7.c module written by Dave Jones. - * (C) 2003 Dave Jones on behalf of SuSE Labs - * (C) 2004 Dominik Brodowski <linux@brodo.de> - * (C) 2004 Pavel Machek <pavel@ucw.cz> - * Licensed under the terms of the GNU GPL License version 2. - * Based upon datasheets & sample CPUs kindly provided by AMD. - * - * Valuable input gratefully received from Dave Jones, Pavel Machek, - * Dominik Brodowski, Jacob Shin, and others. - * Originally developed by Paul Devriendt. - * Processor information obtained from Chapter 9 (Power and Thermal Management) - * of the "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD - * Opteron Processors" available for download from www.amd.com - * - * Tables for specific CPUs can be inferred from - * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/30430.pdf - */ - -#include <linux/kernel.h> -#include <linux/smp.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/cpufreq.h> -#include <linux/slab.h> -#include <linux/string.h> -#include <linux/cpumask.h> -#include <linux/sched.h> /* for current / set_cpus_allowed() */ -#include <linux/io.h> -#include <linux/delay.h> - -#include <asm/msr.h> - -#include <linux/acpi.h> -#include <linux/mutex.h> -#include <acpi/processor.h> - -#define PFX "powernow-k8: " -#define VERSION "version 2.20.00" -#include "powernow-k8.h" -#include "mperf.h" - -/* serialize freq changes */ -static DEFINE_MUTEX(fidvid_mutex); - -static DEFINE_PER_CPU(struct powernow_k8_data *, powernow_data); - -static int cpu_family = CPU_OPTERON; - -/* core performance boost */ -static bool cpb_capable, cpb_enabled; -static struct msr __percpu *msrs; - -static struct cpufreq_driver cpufreq_amd64_driver; - -#ifndef CONFIG_SMP -static inline const struct cpumask *cpu_core_mask(int cpu) -{ - return cpumask_of(0); -} -#endif - -/* Return a frequency in MHz, given an input fid */ -static u32 find_freq_from_fid(u32 fid) -{ - return 800 + (fid * 100); -} - -/* Return a frequency in KHz, given an input fid */ -static u32 find_khz_freq_from_fid(u32 fid) -{ - return 1000 * find_freq_from_fid(fid); -} - -static u32 find_khz_freq_from_pstate(struct cpufreq_frequency_table *data, - u32 pstate) -{ - return data[pstate].frequency; -} - -/* Return the vco fid for an input fid - * - * Each "low" fid has corresponding "high" fid, and you can get to "low" fids - * only from corresponding high fids. This returns "high" fid corresponding to - * "low" one. - */ -static u32 convert_fid_to_vco_fid(u32 fid) -{ - if (fid < HI_FID_TABLE_BOTTOM) - return 8 + (2 * fid); - else - return fid; -} - -/* - * Return 1 if the pending bit is set. Unless we just instructed the processor - * to transition to a new state, seeing this bit set is really bad news. - */ -static int pending_bit_stuck(void) -{ - u32 lo, hi; - - if (cpu_family == CPU_HW_PSTATE) - return 0; - - rdmsr(MSR_FIDVID_STATUS, lo, hi); - return lo & MSR_S_LO_CHANGE_PENDING ? 1 : 0; -} - -/* - * Update the global current fid / vid values from the status msr. - * Returns 1 on error. - */ -static int query_current_values_with_pending_wait(struct powernow_k8_data *data) -{ - u32 lo, hi; - u32 i = 0; - - if (cpu_family == CPU_HW_PSTATE) { - rdmsr(MSR_PSTATE_STATUS, lo, hi); - i = lo & HW_PSTATE_MASK; - data->currpstate = i; - - /* - * a workaround for family 11h erratum 311 might cause - * an "out-of-range Pstate if the core is in Pstate-0 - */ - if ((boot_cpu_data.x86 == 0x11) && (i >= data->numps)) - data->currpstate = HW_PSTATE_0; - - return 0; - } - do { - if (i++ > 10000) { - dprintk("detected change pending stuck\n"); - return 1; - } - rdmsr(MSR_FIDVID_STATUS, lo, hi); - } while (lo & MSR_S_LO_CHANGE_PENDING); - - data->currvid = hi & MSR_S_HI_CURRENT_VID; - data->currfid = lo & MSR_S_LO_CURRENT_FID; - - return 0; -} - -/* the isochronous relief time */ -static void count_off_irt(struct powernow_k8_data *data) -{ - udelay((1 << data->irt) * 10); - return; -} - -/* the voltage stabilization time */ -static void count_off_vst(struct powernow_k8_data *data) -{ - udelay(data->vstable * VST_UNITS_20US); - return; -} - -/* need to init the control msr to a safe value (for each cpu) */ -static void fidvid_msr_init(void) -{ - u32 lo, hi; - u8 fid, vid; - - rdmsr(MSR_FIDVID_STATUS, lo, hi); - vid = hi & MSR_S_HI_CURRENT_VID; - fid = lo & MSR_S_LO_CURRENT_FID; - lo = fid | (vid << MSR_C_LO_VID_SHIFT); - hi = MSR_C_HI_STP_GNT_BENIGN; - dprintk("cpu%d, init lo 0x%x, hi 0x%x\n", smp_processor_id(), lo, hi); - wrmsr(MSR_FIDVID_CTL, lo, hi); -} - -/* write the new fid value along with the other control fields to the msr */ -static int write_new_fid(struct powernow_k8_data *data, u32 fid) -{ - u32 lo; - u32 savevid = data->currvid; - u32 i = 0; - - if ((fid & INVALID_FID_MASK) || (data->currvid & INVALID_VID_MASK)) { - printk(KERN_ERR PFX "internal error - overflow on fid write\n"); - return 1; - } - - lo = fid; - lo |= (data->currvid << MSR_C_LO_VID_SHIFT); - lo |= MSR_C_LO_INIT_FID_VID; - - dprintk("writing fid 0x%x, lo 0x%x, hi 0x%x\n", - fid, lo, data->plllock * PLL_LOCK_CONVERSION); - - do { - wrmsr(MSR_FIDVID_CTL, lo, data->plllock * PLL_LOCK_CONVERSION); - if (i++ > 100) { - printk(KERN_ERR PFX - "Hardware error - pending bit very stuck - " - "no further pstate changes possible\n"); - return 1; - } - } while (query_current_values_with_pending_wait(data)); - - count_off_irt(data); - - if (savevid != data->currvid) { - printk(KERN_ERR PFX - "vid change on fid trans, old 0x%x, new 0x%x\n", - savevid, data->currvid); - return 1; - } - - if (fid != data->currfid) { - printk(KERN_ERR PFX - "fid trans failed, fid 0x%x, curr 0x%x\n", fid, - data->currfid); - return 1; - } - - return 0; -} - -/* Write a new vid to the hardware */ -static int write_new_vid(struct powernow_k8_data *data, u32 vid) -{ - u32 lo; - u32 savefid = data->currfid; - int i = 0; - - if ((data->currfid & INVALID_FID_MASK) || (vid & INVALID_VID_MASK)) { - printk(KERN_ERR PFX "internal error - overflow on vid write\n"); - return 1; - } - - lo = data->currfid; - lo |= (vid << MSR_C_LO_VID_SHIFT); - lo |= MSR_C_LO_INIT_FID_VID; - - dprintk("writing vid 0x%x, lo 0x%x, hi 0x%x\n", - vid, lo, STOP_GRANT_5NS); - - do { - wrmsr(MSR_FIDVID_CTL, lo, STOP_GRANT_5NS); - if (i++ > 100) { - printk(KERN_ERR PFX "internal error - pending bit " - "very stuck - no further pstate " - "changes possible\n"); - return 1; - } - } while (query_current_values_with_pending_wait(data)); - - if (savefid != data->currfid) { - printk(KERN_ERR PFX "fid changed on vid trans, old " - "0x%x new 0x%x\n", - savefid, data->currfid); - return 1; - } - - if (vid != data->currvid) { - printk(KERN_ERR PFX "vid trans failed, vid 0x%x, " - "curr 0x%x\n", - vid, data->currvid); - return 1; - } - - return 0; -} - -/* - * Reduce the vid by the max of step or reqvid. - * Decreasing vid codes represent increasing voltages: - * vid of 0 is 1.550V, vid of 0x1e is 0.800V, vid of VID_OFF is off. - */ -static int decrease_vid_code_by_step(struct powernow_k8_data *data, - u32 reqvid, u32 step) -{ - if ((data->currvid - reqvid) > step) - reqvid = data->currvid - step; - - if (write_new_vid(data, reqvid)) - return 1; - - count_off_vst(data); - - return 0; -} - -/* Change hardware pstate by single MSR write */ -static int transition_pstate(struct powernow_k8_data *data, u32 pstate) -{ - wrmsr(MSR_PSTATE_CTRL, pstate, 0); - data->currpstate = pstate; - return 0; -} - -/* Change Opteron/Athlon64 fid and vid, by the 3 phases. */ -static int transition_fid_vid(struct powernow_k8_data *data, - u32 reqfid, u32 reqvid) -{ - if (core_voltage_pre_transition(data, reqvid, reqfid)) - return 1; - - if (core_frequency_transition(data, reqfid)) - return 1; - - if (core_voltage_post_transition(data, reqvid)) - return 1; - - if (query_current_values_with_pending_wait(data)) - return 1; - - if ((reqfid != data->currfid) || (reqvid != data->currvid)) { - printk(KERN_ERR PFX "failed (cpu%d): req 0x%x 0x%x, " - "curr 0x%x 0x%x\n", - smp_processor_id(), - reqfid, reqvid, data->currfid, data->currvid); - return 1; - } - - dprintk("transitioned (cpu%d): new fid 0x%x, vid 0x%x\n", - smp_processor_id(), data->currfid, data->currvid); - - return 0; -} - -/* Phase 1 - core voltage transition ... setup voltage */ -static int core_voltage_pre_transition(struct powernow_k8_data *data, - u32 reqvid, u32 reqfid) -{ - u32 rvosteps = data->rvo; - u32 savefid = data->currfid; - u32 maxvid, lo, rvomult = 1; - - dprintk("ph1 (cpu%d): start, currfid 0x%x, currvid 0x%x, " - "reqvid 0x%x, rvo 0x%x\n", - smp_processor_id(), - data->currfid, data->currvid, reqvid, data->rvo); - - if ((savefid < LO_FID_TABLE_TOP) && (reqfid < LO_FID_TABLE_TOP)) - rvomult = 2; - rvosteps *= rvomult; - rdmsr(MSR_FIDVID_STATUS, lo, maxvid); - maxvid = 0x1f & (maxvid >> 16); - dprintk("ph1 maxvid=0x%x\n", maxvid); - if (reqvid < maxvid) /* lower numbers are higher voltages */ - reqvid = maxvid; - - while (data->currvid > reqvid) { - dprintk("ph1: curr 0x%x, req vid 0x%x\n", - data->currvid, reqvid); - if (decrease_vid_code_by_step(data, reqvid, data->vidmvs)) - return 1; - } - - while ((rvosteps > 0) && - ((rvomult * data->rvo + data->currvid) > reqvid)) { - if (data->currvid == maxvid) { - rvosteps = 0; - } else { - dprintk("ph1: changing vid for rvo, req 0x%x\n", - data->currvid - 1); - if (decrease_vid_code_by_step(data, data->currvid-1, 1)) - return 1; - rvosteps--; - } - } - - if (query_current_values_with_pending_wait(data)) - return 1; - - if (savefid != data->currfid) { - printk(KERN_ERR PFX "ph1 err, currfid changed 0x%x\n", - data->currfid); - return 1; - } - - dprintk("ph1 complete, currfid 0x%x, currvid 0x%x\n", - data->currfid, data->currvid); - - return 0; -} - -/* Phase 2 - core frequency transition */ -static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid) -{ - u32 vcoreqfid, vcocurrfid, vcofiddiff; - u32 fid_interval, savevid = data->currvid; - - if (data->currfid == reqfid) { - printk(KERN_ERR PFX "ph2 null fid transition 0x%x\n", - data->currfid); - return 0; - } - - dprintk("ph2 (cpu%d): starting, currfid 0x%x, currvid 0x%x, " - "reqfid 0x%x\n", - smp_processor_id(), - data->currfid, data->currvid, reqfid); - - vcoreqfid = convert_fid_to_vco_fid(reqfid); - vcocurrfid = convert_fid_to_vco_fid(data->currfid); - vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid - : vcoreqfid - vcocurrfid; - - if ((reqfid <= LO_FID_TABLE_TOP) && (data->currfid <= LO_FID_TABLE_TOP)) - vcofiddiff = 0; - - while (vcofiddiff > 2) { - (data->currfid & 1) ? (fid_interval = 1) : (fid_interval = 2); - - if (reqfid > data->currfid) { - if (data->currfid > LO_FID_TABLE_TOP) { - if (write_new_fid(data, - data->currfid + fid_interval)) - return 1; - } else { - if (write_new_fid - (data, - 2 + convert_fid_to_vco_fid(data->currfid))) - return 1; - } - } else { - if (write_new_fid(data, data->currfid - fid_interval)) - return 1; - } - - vcocurrfid = convert_fid_to_vco_fid(data->currfid); - vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid - : vcoreqfid - vcocurrfid; - } - - if (write_new_fid(data, reqfid)) - return 1; - - if (query_current_values_with_pending_wait(data)) - return 1; - - if (data->currfid != reqfid) { - printk(KERN_ERR PFX - "ph2: mismatch, failed fid transition, " - "curr 0x%x, req 0x%x\n", - data->currfid, reqfid); - return 1; - } - - if (savevid != data->currvid) { - printk(KERN_ERR PFX "ph2: vid changed, save 0x%x, curr 0x%x\n", - savevid, data->currvid); - return 1; - } - - dprintk("ph2 complete, currfid 0x%x, currvid 0x%x\n", - data->currfid, data->currvid); - - return 0; -} - -/* Phase 3 - core voltage transition flow ... jump to the final vid. */ -static int core_voltage_post_transition(struct powernow_k8_data *data, - u32 reqvid) -{ - u32 savefid = data->currfid; - u32 savereqvid = reqvid; - - dprintk("ph3 (cpu%d): starting, currfid 0x%x, currvid 0x%x\n", - smp_processor_id(), - data->currfid, data->currvid); - - if (reqvid != data->currvid) { - if (write_new_vid(data, reqvid)) - return 1; - - if (savefid != data->currfid) { - printk(KERN_ERR PFX - "ph3: bad fid change, save 0x%x, curr 0x%x\n", - savefid, data->currfid); - return 1; - } - - if (data->currvid != reqvid) { - printk(KERN_ERR PFX - "ph3: failed vid transition\n, " - "req 0x%x, curr 0x%x", - reqvid, data->currvid); - return 1; - } - } - - if (query_current_values_with_pending_wait(data)) - return 1; - - if (savereqvid != data->currvid) { - dprintk("ph3 failed, currvid 0x%x\n", data->currvid); - return 1; - } - - if (savefid != data->currfid) { - dprintk("ph3 failed, currfid changed 0x%x\n", - data->currfid); - return 1; - } - - dprintk("ph3 complete, currfid 0x%x, currvid 0x%x\n", - data->currfid, data->currvid); - - return 0; -} - -static void check_supported_cpu(void *_rc) -{ - u32 eax, ebx, ecx, edx; - int *rc = _rc; - - *rc = -ENODEV; - - if (current_cpu_data.x86_vendor != X86_VENDOR_AMD) - return; - - eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE); - if (((eax & CPUID_XFAM) != CPUID_XFAM_K8) && - ((eax & CPUID_XFAM) < CPUID_XFAM_10H)) - return; - - if ((eax & CPUID_XFAM) == CPUID_XFAM_K8) { - if (((eax & CPUID_USE_XFAM_XMOD) != CPUID_USE_XFAM_XMOD) || - ((eax & CPUID_XMOD) > CPUID_XMOD_REV_MASK)) { - printk(KERN_INFO PFX - "Processor cpuid %x not supported\n", eax); - return; - } - - eax = cpuid_eax(CPUID_GET_MAX_CAPABILITIES); - if (eax < CPUID_FREQ_VOLT_CAPABILITIES) { - printk(KERN_INFO PFX - "No frequency change capabilities detected\n"); - return; - } - - cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx); - if ((edx & P_STATE_TRANSITION_CAPABLE) - != P_STATE_TRANSITION_CAPABLE) { - printk(KERN_INFO PFX - "Power state transitions not supported\n"); - return; - } - } else { /* must be a HW Pstate capable processor */ - cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx); - if ((edx & USE_HW_PSTATE) == USE_HW_PSTATE) - cpu_family = CPU_HW_PSTATE; - else - return; - } - - *rc = 0; -} - -static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst, - u8 maxvid) -{ - unsigned int j; - u8 lastfid = 0xff; - - for (j = 0; j < data->numps; j++) { - if (pst[j].vid > LEAST_VID) { - printk(KERN_ERR FW_BUG PFX "vid %d invalid : 0x%x\n", - j, pst[j].vid); - return -EINVAL; - } - if (pst[j].vid < data->rvo) { - /* vid + rvo >= 0 */ - printk(KERN_ERR FW_BUG PFX "0 vid exceeded with pstate" - " %d\n", j); - return -ENODEV; - } - if (pst[j].vid < maxvid + data->rvo) { - /* vid + rvo >= maxvid */ - printk(KERN_ERR FW_BUG PFX "maxvid exceeded with pstate" - " %d\n", j); - return -ENODEV; - } - if (pst[j].fid > MAX_FID) { - printk(KERN_ERR FW_BUG PFX "maxfid exceeded with pstate" - " %d\n", j); - return -ENODEV; - } - if (j && (pst[j].fid < HI_FID_TABLE_BOTTOM)) { - /* Only first fid is allowed to be in "low" range */ - printk(KERN_ERR FW_BUG PFX "two low fids - %d : " - "0x%x\n", j, pst[j].fid); - return -EINVAL; - } - if (pst[j].fid < lastfid) - lastfid = pst[j].fid; - } - if (lastfid & 1) { - printk(KERN_ERR FW_BUG PFX "lastfid invalid\n"); - return -EINVAL; - } - if (lastfid > LO_FID_TABLE_TOP) - printk(KERN_INFO FW_BUG PFX - "first fid not from lo freq table\n"); - - return 0; -} - -static void invalidate_entry(struct cpufreq_frequency_table *powernow_table, - unsigned int entry) -{ - powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID; -} - -static void print_basics(struct powernow_k8_data *data) -{ - int j; - for (j = 0; j < data->numps; j++) { - if (data->powernow_table[j].frequency != - CPUFREQ_ENTRY_INVALID) { - if (cpu_family == CPU_HW_PSTATE) { - printk(KERN_INFO PFX - " %d : pstate %d (%d MHz)\n", j, - data->powernow_table[j].index, - data->powernow_table[j].frequency/1000); - } else { - printk(KERN_INFO PFX - " %d : fid 0x%x (%d MHz), vid 0x%x\n", - j, - data->powernow_table[j].index & 0xff, - data->powernow_table[j].frequency/1000, - data->powernow_table[j].index >> 8); - } - } - } - if (data->batps) - printk(KERN_INFO PFX "Only %d pstates on battery\n", - data->batps); -} - -static u32 freq_from_fid_did(u32 fid, u32 did) -{ - u32 mhz = 0; - - if (boot_cpu_data.x86 == 0x10) - mhz = (100 * (fid + 0x10)) >> did; - else if (boot_cpu_data.x86 == 0x11) - mhz = (100 * (fid + 8)) >> did; - else - BUG(); - - return mhz * 1000; -} - -static int fill_powernow_table(struct powernow_k8_data *data, - struct pst_s *pst, u8 maxvid) -{ - struct cpufreq_frequency_table *powernow_table; - unsigned int j; - - if (data->batps) { - /* use ACPI support to get full speed on mains power */ - printk(KERN_WARNING PFX - "Only %d pstates usable (use ACPI driver for full " - "range\n", data->batps); - data->numps = data->batps; - } - - for (j = 1; j < data->numps; j++) { - if (pst[j-1].fid >= pst[j].fid) { - printk(KERN_ERR PFX "PST out of sequence\n"); - return -EINVAL; - } - } - - if (data->numps < 2) { - printk(KERN_ERR PFX "no p states to transition\n"); - return -ENODEV; - } - - if (check_pst_table(data, pst, maxvid)) - return -EINVAL; - - powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table) - * (data->numps + 1)), GFP_KERNEL); - if (!powernow_table) { - printk(KERN_ERR PFX "powernow_table memory alloc failure\n"); - return -ENOMEM; - } - - for (j = 0; j < data->numps; j++) { - int freq; - powernow_table[j].index = pst[j].fid; /* lower 8 bits */ - powernow_table[j].index |= (pst[j].vid << 8); /* upper 8 bits */ - freq = find_khz_freq_from_fid(pst[j].fid); - powernow_table[j].frequency = freq; - } - powernow_table[data->numps].frequency = CPUFREQ_TABLE_END; - powernow_table[data->numps].index = 0; - - if (query_current_values_with_pending_wait(data)) { - kfree(powernow_table); - return -EIO; - } - - dprintk("cfid 0x%x, cvid 0x%x\n", data->currfid, data->currvid); - data->powernow_table = powernow_table; - if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu) - print_basics(data); - - for (j = 0; j < data->numps; j++) - if ((pst[j].fid == data->currfid) && - (pst[j].vid == data->currvid)) - return 0; - - dprintk("currfid/vid do not match PST, ignoring\n"); - return 0; -} - -/* Find and validate the PSB/PST table in BIOS. */ -static int find_psb_table(struct powernow_k8_data *data) -{ - struct psb_s *psb; - unsigned int i; - u32 mvs; - u8 maxvid; - u32 cpst = 0; - u32 thiscpuid; - - for (i = 0xc0000; i < 0xffff0; i += 0x10) { - /* Scan BIOS looking for the signature. */ - /* It can not be at ffff0 - it is too big. */ - - psb = phys_to_virt(i); - if (memcmp(psb, PSB_ID_STRING, PSB_ID_STRING_LEN) != 0) - continue; - - dprintk("found PSB header at 0x%p\n", psb); - - dprintk("table vers: 0x%x\n", psb->tableversion); - if (psb->tableversion != PSB_VERSION_1_4) { - printk(KERN_ERR FW_BUG PFX "PSB table is not v1.4\n"); - return -ENODEV; - } - - dprintk("flags: 0x%x\n", psb->flags1); - if (psb->flags1) { - printk(KERN_ERR FW_BUG PFX "unknown flags\n"); - return -ENODEV; - } - - data->vstable = psb->vstable; - dprintk("voltage stabilization time: %d(*20us)\n", - data->vstable); - - dprintk("flags2: 0x%x\n", psb->flags2); - data->rvo = psb->flags2 & 3; - data->irt = ((psb->flags2) >> 2) & 3; - mvs = ((psb->flags2) >> 4) & 3; - data->vidmvs = 1 << mvs; - data->batps = ((psb->flags2) >> 6) & 3; - - dprintk("ramp voltage offset: %d\n", data->rvo); - dprintk("isochronous relief time: %d\n", data->irt); - dprintk("maximum voltage step: %d - 0x%x\n", mvs, data->vidmvs); - - dprintk("numpst: 0x%x\n", psb->num_tables); - cpst = psb->num_tables; - if ((psb->cpuid == 0x00000fc0) || - (psb->cpuid == 0x00000fe0)) { - thiscpuid = cpuid_eax(CPUID_PROCESSOR_SIGNATURE); - if ((thiscpuid == 0x00000fc0) || - (thiscpuid == 0x00000fe0)) - cpst = 1; - } - if (cpst != 1) { - printk(KERN_ERR FW_BUG PFX "numpst must be 1\n"); - return -ENODEV; - } - - data->plllock = psb->plllocktime; - dprintk("plllocktime: 0x%x (units 1us)\n", psb->plllocktime); - dprintk("maxfid: 0x%x\n", psb->maxfid); - dprintk("maxvid: 0x%x\n", psb->maxvid); - maxvid = psb->maxvid; - - data->numps = psb->numps; - dprintk("numpstates: 0x%x\n", data->numps); - return fill_powernow_table(data, - (struct pst_s *)(psb+1), maxvid); - } - /* - * If you see this message, complain to BIOS manufacturer. If - * he tells you "we do not support Linux" or some similar - * nonsense, remember that Windows 2000 uses the same legacy - * mechanism that the old Linux PSB driver uses. Tell them it - * is broken with Windows 2000. - * - * The reference to the AMD documentation is chapter 9 in the - * BIOS and Kernel Developer's Guide, which is available on - * www.amd.com - */ - printk(KERN_ERR FW_BUG PFX "No PSB or ACPI _PSS objects\n"); - printk(KERN_ERR PFX "Make sure that your BIOS is up to date" - " and Cool'N'Quiet support is enabled in BIOS setup\n"); - return -ENODEV; -} - -static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, - unsigned int index) -{ - u64 control; - - if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE)) - return; - - control = data->acpi_data.states[index].control; - data->irt = (control >> IRT_SHIFT) & IRT_MASK; - data->rvo = (control >> RVO_SHIFT) & RVO_MASK; - data->exttype = (control >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK; - data->plllock = (control >> PLL_L_SHIFT) & PLL_L_MASK; - data->vidmvs = 1 << ((control >> MVS_SHIFT) & MVS_MASK); - data->vstable = (control >> VST_SHIFT) & VST_MASK; -} - -static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) -{ - struct cpufreq_frequency_table *powernow_table; - int ret_val = -ENODEV; - u64 control, status; - - if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) { - dprintk("register performance failed: bad ACPI data\n"); - return -EIO; - } - - /* verify the data contained in the ACPI structures */ - if (data->acpi_data.state_count <= 1) { - dprintk("No ACPI P-States\n"); - goto err_out; - } - - control = data->acpi_data.control_register.space_id; - status = data->acpi_data.status_register.space_id; - - if ((control != ACPI_ADR_SPACE_FIXED_HARDWARE) || - (status != ACPI_ADR_SPACE_FIXED_HARDWARE)) { - dprintk("Invalid control/status registers (%x - %x)\n", - control, status); - goto err_out; - } - - /* fill in data->powernow_table */ - powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table) - * (data->acpi_data.state_count + 1)), GFP_KERNEL); - if (!powernow_table) { - dprintk("powernow_table memory alloc failure\n"); - goto err_out; - } - - /* fill in data */ - data->numps = data->acpi_data.state_count; - powernow_k8_acpi_pst_values(data, 0); - - if (cpu_family == CPU_HW_PSTATE) - ret_val = fill_powernow_table_pstate(data, powernow_table); - else - ret_val = fill_powernow_table_fidvid(data, powernow_table); - if (ret_val) - goto err_out_mem; - - powernow_table[data->acpi_data.state_count].frequency = - CPUFREQ_TABLE_END; - powernow_table[data->acpi_data.state_count].index = 0; - data->powernow_table = powernow_table; - - if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu) - print_basics(data); - - /* notify BIOS that we exist */ - acpi_processor_notify_smm(THIS_MODULE); - - if (!zalloc_cpumask_var(&data->acpi_data.shared_cpu_map, GFP_KERNEL)) { - printk(KERN_ERR PFX - "unable to alloc powernow_k8_data cpumask\n"); - ret_val = -ENOMEM; - goto err_out_mem; - } - - return 0; - -err_out_mem: - kfree(powernow_table); - -err_out: - acpi_processor_unregister_performance(&data->acpi_data, data->cpu); - - /* data->acpi_data.state_count informs us at ->exit() - * whether ACPI was used */ - data->acpi_data.state_count = 0; - - return ret_val; -} - -static int fill_powernow_table_pstate(struct powernow_k8_data *data, - struct cpufreq_frequency_table *powernow_table) -{ - int i; - u32 hi = 0, lo = 0; - rdmsr(MSR_PSTATE_CUR_LIMIT, lo, hi); - data->max_hw_pstate = (lo & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT; - - for (i = 0; i < data->acpi_data.state_count; i++) { - u32 index; - - index = data->acpi_data.states[i].control & HW_PSTATE_MASK; - if (index > data->max_hw_pstate) { - printk(KERN_ERR PFX "invalid pstate %d - " - "bad value %d.\n", i, index); - printk(KERN_ERR PFX "Please report to BIOS " - "manufacturer\n"); - invalidate_entry(powernow_table, i); - continue; - } - rdmsr(MSR_PSTATE_DEF_BASE + index, lo, hi); - if (!(hi & HW_PSTATE_VALID_MASK)) { - dprintk("invalid pstate %d, ignoring\n", index); - invalidate_entry(powernow_table, i); - continue; - } - - powernow_table[i].index = index; - - /* Frequency may be rounded for these */ - if ((boot_cpu_data.x86 == 0x10 && boot_cpu_data.x86_model < 10) - || boot_cpu_data.x86 == 0x11) { - powernow_table[i].frequency = - freq_from_fid_did(lo & 0x3f, (lo >> 6) & 7); - } else - powernow_table[i].frequency = - data->acpi_data.states[i].core_frequency * 1000; - } - return 0; -} - -static int fill_powernow_table_fidvid(struct powernow_k8_data *data, - struct cpufreq_frequency_table *powernow_table) -{ - int i; - - for (i = 0; i < data->acpi_data.state_count; i++) { - u32 fid; - u32 vid; - u32 freq, index; - u64 status, control; - - if (data->exttype) { - status = data->acpi_data.states[i].status; - fid = status & EXT_FID_MASK; - vid = (status >> VID_SHIFT) & EXT_VID_MASK; - } else { - control = data->acpi_data.states[i].control; - fid = control & FID_MASK; - vid = (control >> VID_SHIFT) & VID_MASK; - } - - dprintk(" %d : fid 0x%x, vid 0x%x\n", i, fid, vid); - - index = fid | (vid<<8); - powernow_table[i].index = index; - - freq = find_khz_freq_from_fid(fid); - powernow_table[i].frequency = freq; - - /* verify frequency is OK */ - if ((freq > (MAX_FREQ * 1000)) || (freq < (MIN_FREQ * 1000))) { - dprintk("invalid freq %u kHz, ignoring\n", freq); - invalidate_entry(powernow_table, i); - continue; - } - - /* verify voltage is OK - - * BIOSs are using "off" to indicate invalid */ - if (vid == VID_OFF) { - dprintk("invalid vid %u, ignoring\n", vid); - invalidate_entry(powernow_table, i); - continue; - } - - if (freq != (data->acpi_data.states[i].core_frequency * 1000)) { - printk(KERN_INFO PFX "invalid freq entries " - "%u kHz vs. %u kHz\n", freq, - (unsigned int) - (data->acpi_data.states[i].core_frequency - * 1000)); - invalidate_entry(powernow_table, i); - continue; - } - } - return 0; -} - -static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data) -{ - if (data->acpi_data.state_count) - acpi_processor_unregister_performance(&data->acpi_data, - data->cpu); - free_cpumask_var(data->acpi_data.shared_cpu_map); -} - -static int get_transition_latency(struct powernow_k8_data *data) -{ - int max_latency = 0; - int i; - for (i = 0; i < data->acpi_data.state_count; i++) { - int cur_latency = data->acpi_data.states[i].transition_latency - + data->acpi_data.states[i].bus_master_latency; - if (cur_latency > max_latency) - max_latency = cur_latency; - } - if (max_latency == 0) { - /* - * Fam 11h and later may return 0 as transition latency. This - * is intended and means "very fast". While cpufreq core and - * governors currently can handle that gracefully, better set it - * to 1 to avoid problems in the future. - */ - if (boot_cpu_data.x86 < 0x11) - printk(KERN_ERR FW_WARN PFX "Invalid zero transition " - "latency\n"); - max_latency = 1; - } - /* value in usecs, needs to be in nanoseconds */ - return 1000 * max_latency; -} - -/* Take a frequency, and issue the fid/vid transition command */ -static int transition_frequency_fidvid(struct powernow_k8_data *data, - unsigned int index) -{ - u32 fid = 0; - u32 vid = 0; - int res, i; - struct cpufreq_freqs freqs; - - dprintk("cpu %d transition to index %u\n", smp_processor_id(), index); - - /* fid/vid correctness check for k8 */ - /* fid are the lower 8 bits of the index we stored into - * the cpufreq frequency table in find_psb_table, vid - * are the upper 8 bits. - */ - fid = data->powernow_table[index].index & 0xFF; - vid = (data->powernow_table[index].index & 0xFF00) >> 8; - - dprintk("table matched fid 0x%x, giving vid 0x%x\n", fid, vid); - - if (query_current_values_with_pending_wait(data)) - return 1; - - if ((data->currvid == vid) && (data->currfid == fid)) { - dprintk("target matches current values (fid 0x%x, vid 0x%x)\n", - fid, vid); - return 0; - } - - dprintk("cpu %d, changing to fid 0x%x, vid 0x%x\n", - smp_processor_id(), fid, vid); - freqs.old = find_khz_freq_from_fid(data->currfid); - freqs.new = find_khz_freq_from_fid(fid); - - for_each_cpu(i, data->available_cores) { - freqs.cpu = i; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - } - - res = transition_fid_vid(data, fid, vid); - freqs.new = find_khz_freq_from_fid(data->currfid); - - for_each_cpu(i, data->available_cores) { - freqs.cpu = i; - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - } - return res; -} - -/* Take a frequency, and issue the hardware pstate transition command */ -static int transition_frequency_pstate(struct powernow_k8_data *data, - unsigned int index) -{ - u32 pstate = 0; - int res, i; - struct cpufreq_freqs freqs; - - dprintk("cpu %d transition to index %u\n", smp_processor_id(), index); - - /* get MSR index for hardware pstate transition */ - pstate = index & HW_PSTATE_MASK; - if (pstate > data->max_hw_pstate) - return 0; - freqs.old = find_khz_freq_from_pstate(data->powernow_table, - data->currpstate); - freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); - - for_each_cpu(i, data->available_cores) { - freqs.cpu = i; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - } - - res = transition_pstate(data, pstate); - freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); - - for_each_cpu(i, data->available_cores) { - freqs.cpu = i; - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - } - return res; -} - -/* Driver entry point to switch to the target frequency */ -static int powernowk8_target(struct cpufreq_policy *pol, - unsigned targfreq, unsigned relation) -{ - cpumask_var_t oldmask; - struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu); - u32 checkfid; - u32 checkvid; - unsigned int newstate; - int ret = -EIO; - - if (!data) - return -EINVAL; - - checkfid = data->currfid; - checkvid = data->currvid; - - /* only run on specific CPU from here on. */ - /* This is poor form: use a workqueue or smp_call_function_single */ - if (!alloc_cpumask_var(&oldmask, GFP_KERNEL)) - return -ENOMEM; - - cpumask_copy(oldmask, tsk_cpus_allowed(current)); - set_cpus_allowed_ptr(current, cpumask_of(pol->cpu)); - - if (smp_processor_id() != pol->cpu) { - printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu); - goto err_out; - } - - if (pending_bit_stuck()) { - printk(KERN_ERR PFX "failing targ, change pending bit set\n"); - goto err_out; - } - - dprintk("targ: cpu %d, %d kHz, min %d, max %d, relation %d\n", - pol->cpu, targfreq, pol->min, pol->max, relation); - - if (query_current_values_with_pending_wait(data)) - goto err_out; - - if (cpu_family != CPU_HW_PSTATE) { - dprintk("targ: curr fid 0x%x, vid 0x%x\n", - data->currfid, data->currvid); - - if ((checkvid != data->currvid) || - (checkfid != data->currfid)) { - printk(KERN_INFO PFX - "error - out of sync, fix 0x%x 0x%x, " - "vid 0x%x 0x%x\n", - checkfid, data->currfid, - checkvid, data->currvid); - } - } - - if (cpufreq_frequency_table_target(pol, data->powernow_table, - targfreq, relation, &newstate)) - goto err_out; - - mutex_lock(&fidvid_mutex); - - powernow_k8_acpi_pst_values(data, newstate); - - if (cpu_family == CPU_HW_PSTATE) - ret = transition_frequency_pstate(data, newstate); - else - ret = transition_frequency_fidvid(data, newstate); - if (ret) { - printk(KERN_ERR PFX "transition frequency failed\n"); - ret = 1; - mutex_unlock(&fidvid_mutex); - goto err_out; - } - mutex_unlock(&fidvid_mutex); - - if (cpu_family == CPU_HW_PSTATE) - pol->cur = find_khz_freq_from_pstate(data->powernow_table, - newstate); - else - pol->cur = find_khz_freq_from_fid(data->currfid); - ret = 0; - -err_out: - set_cpus_allowed_ptr(current, oldmask); - free_cpumask_var(oldmask); - return ret; -} - -/* Driver entry point to verify the policy and range of frequencies */ -static int powernowk8_verify(struct cpufreq_policy *pol) -{ - struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu); - - if (!data) - return -EINVAL; - - return cpufreq_frequency_table_verify(pol, data->powernow_table); -} - -struct init_on_cpu { - struct powernow_k8_data *data; - int rc; -}; - -static void __cpuinit powernowk8_cpu_init_on_cpu(void *_init_on_cpu) -{ - struct init_on_cpu *init_on_cpu = _init_on_cpu; - - if (pending_bit_stuck()) { - printk(KERN_ERR PFX "failing init, change pending bit set\n"); - init_on_cpu->rc = -ENODEV; - return; - } - - if (query_current_values_with_pending_wait(init_on_cpu->data)) { - init_on_cpu->rc = -ENODEV; - return; - } - - if (cpu_family == CPU_OPTERON) - fidvid_msr_init(); - - init_on_cpu->rc = 0; -} - -/* per CPU init entry point to the driver */ -static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) -{ - static const char ACPI_PSS_BIOS_BUG_MSG[] = - KERN_ERR FW_BUG PFX "No compatible ACPI _PSS objects found.\n" - FW_BUG PFX "Try again with latest BIOS.\n"; - struct powernow_k8_data *data; - struct init_on_cpu init_on_cpu; - int rc; - struct cpuinfo_x86 *c = &cpu_data(pol->cpu); - - if (!cpu_online(pol->cpu)) - return -ENODEV; - - smp_call_function_single(pol->cpu, check_supported_cpu, &rc, 1); - if (rc) - return -ENODEV; - - data = kzalloc(sizeof(struct powernow_k8_data), GFP_KERNEL); - if (!data) { - printk(KERN_ERR PFX "unable to alloc powernow_k8_data"); - return -ENOMEM; - } - - data->cpu = pol->cpu; - data->currpstate = HW_PSTATE_INVALID; - - if (powernow_k8_cpu_init_acpi(data)) { - /* - * Use the PSB BIOS structure. This is only availabe on - * an UP version, and is deprecated by AMD. - */ - if (num_online_cpus() != 1) { - printk_once(ACPI_PSS_BIOS_BUG_MSG); - goto err_out; - } - if (pol->cpu != 0) { - printk(KERN_ERR FW_BUG PFX "No ACPI _PSS objects for " - "CPU other than CPU0. Complain to your BIOS " - "vendor.\n"); - goto err_out; - } - rc = find_psb_table(data); - if (rc) - goto err_out; - - /* Take a crude guess here. - * That guess was in microseconds, so multiply with 1000 */ - pol->cpuinfo.transition_latency = ( - ((data->rvo + 8) * data->vstable * VST_UNITS_20US) + - ((1 << data->irt) * 30)) * 1000; - } else /* ACPI _PSS objects available */ - pol->cpuinfo.transition_latency = get_transition_latency(data); - - /* only run on specific CPU from here on */ - init_on_cpu.data = data; - smp_call_function_single(data->cpu, powernowk8_cpu_init_on_cpu, - &init_on_cpu, 1); - rc = init_on_cpu.rc; - if (rc != 0) - goto err_out_exit_acpi; - - if (cpu_family == CPU_HW_PSTATE) - cpumask_copy(pol->cpus, cpumask_of(pol->cpu)); - else - cpumask_copy(pol->cpus, cpu_core_mask(pol->cpu)); - data->available_cores = pol->cpus; - - if (cpu_family == CPU_HW_PSTATE) - pol->cur = find_khz_freq_from_pstate(data->powernow_table, - data->currpstate); - else - pol->cur = find_khz_freq_from_fid(data->currfid); - dprintk("policy current frequency %d kHz\n", pol->cur); - - /* min/max the cpu is capable of */ - if (cpufreq_frequency_table_cpuinfo(pol, data->powernow_table)) { - printk(KERN_ERR FW_BUG PFX "invalid powernow_table\n"); - powernow_k8_cpu_exit_acpi(data); - kfree(data->powernow_table); - kfree(data); - return -EINVAL; - } - - /* Check for APERF/MPERF support in hardware */ - if (cpu_has(c, X86_FEATURE_APERFMPERF)) - cpufreq_amd64_driver.getavg = cpufreq_get_measured_perf; - - cpufreq_frequency_table_get_attr(data->powernow_table, pol->cpu); - - if (cpu_family == CPU_HW_PSTATE) - dprintk("cpu_init done, current pstate 0x%x\n", - data->currpstate); - else - dprintk("cpu_init done, current fid 0x%x, vid 0x%x\n", - data->currfid, data->currvid); - - per_cpu(powernow_data, pol->cpu) = data; - - return 0; - -err_out_exit_acpi: - powernow_k8_cpu_exit_acpi(data); - -err_out: - kfree(data); - return -ENODEV; -} - -static int __devexit powernowk8_cpu_exit(struct cpufreq_policy *pol) -{ - struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu); - - if (!data) - return -EINVAL; - - powernow_k8_cpu_exit_acpi(data); - - cpufreq_frequency_table_put_attr(pol->cpu); - - kfree(data->powernow_table); - kfree(data); - per_cpu(powernow_data, pol->cpu) = NULL; - - return 0; -} - -static void query_values_on_cpu(void *_err) -{ - int *err = _err; - struct powernow_k8_data *data = __get_cpu_var(powernow_data); - - *err = query_current_values_with_pending_wait(data); -} - -static unsigned int powernowk8_get(unsigned int cpu) -{ - struct powernow_k8_data *data = per_cpu(powernow_data, cpu); - unsigned int khz = 0; - int err; - - if (!data) - return 0; - - smp_call_function_single(cpu, query_values_on_cpu, &err, true); - if (err) - goto out; - - if (cpu_family == CPU_HW_PSTATE) - khz = find_khz_freq_from_pstate(data->powernow_table, - data->currpstate); - else - khz = find_khz_freq_from_fid(data->currfid); - - -out: - return khz; -} - -static void _cpb_toggle_msrs(bool t) -{ - int cpu; - - get_online_cpus(); - - rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs); - - for_each_cpu(cpu, cpu_online_mask) { - struct msr *reg = per_cpu_ptr(msrs, cpu); - if (t) - reg->l &= ~BIT(25); - else - reg->l |= BIT(25); - } - wrmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs); - - put_online_cpus(); -} - -/* - * Switch on/off core performance boosting. - * - * 0=disable - * 1=enable. - */ -static void cpb_toggle(bool t) -{ - if (!cpb_capable) - return; - - if (t && !cpb_enabled) { - cpb_enabled = true; - _cpb_toggle_msrs(t); - printk(KERN_INFO PFX "Core Boosting enabled.\n"); - } else if (!t && cpb_enabled) { - cpb_enabled = false; - _cpb_toggle_msrs(t); - printk(KERN_INFO PFX "Core Boosting disabled.\n"); - } -} - -static ssize_t store_cpb(struct cpufreq_policy *policy, const char *buf, - size_t count) -{ - int ret = -EINVAL; - unsigned long val = 0; - - ret = strict_strtoul(buf, 10, &val); - if (!ret && (val == 0 || val == 1) && cpb_capable) - cpb_toggle(val); - else - return -EINVAL; - - return count; -} - -static ssize_t show_cpb(struct cpufreq_policy *policy, char *buf) -{ - return sprintf(buf, "%u\n", cpb_enabled); -} - -#define define_one_rw(_name) \ -static struct freq_attr _name = \ -__ATTR(_name, 0644, show_##_name, store_##_name) - -define_one_rw(cpb); - -static struct freq_attr *powernow_k8_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - &cpb, - NULL, -}; - -static struct cpufreq_driver cpufreq_amd64_driver = { - .verify = powernowk8_verify, - .target = powernowk8_target, - .bios_limit = acpi_processor_get_bios_limit, - .init = powernowk8_cpu_init, - .exit = __devexit_p(powernowk8_cpu_exit), - .get = powernowk8_get, - .name = "powernow-k8", - .owner = THIS_MODULE, - .attr = powernow_k8_attr, -}; - -/* - * Clear the boost-disable flag on the CPU_DOWN path so that this cpu - * cannot block the remaining ones from boosting. On the CPU_UP path we - * simply keep the boost-disable flag in sync with the current global - * state. - */ -static int cpb_notify(struct notifier_block *nb, unsigned long action, - void *hcpu) -{ - unsigned cpu = (long)hcpu; - u32 lo, hi; - - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - - if (!cpb_enabled) { - rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi); - lo |= BIT(25); - wrmsr_on_cpu(cpu, MSR_K7_HWCR, lo, hi); - } - break; - - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi); - lo &= ~BIT(25); - wrmsr_on_cpu(cpu, MSR_K7_HWCR, lo, hi); - break; - - default: - break; - } - - return NOTIFY_OK; -} - -static struct notifier_block cpb_nb = { - .notifier_call = cpb_notify, -}; - -/* driver entry point for init */ -static int __cpuinit powernowk8_init(void) -{ - unsigned int i, supported_cpus = 0, cpu; - - for_each_online_cpu(i) { - int rc; - smp_call_function_single(i, check_supported_cpu, &rc, 1); - if (rc == 0) - supported_cpus++; - } - - if (supported_cpus != num_online_cpus()) - return -ENODEV; - - printk(KERN_INFO PFX "Found %d %s (%d cpu cores) (" VERSION ")\n", - num_online_nodes(), boot_cpu_data.x86_model_id, supported_cpus); - - if (boot_cpu_has(X86_FEATURE_CPB)) { - - cpb_capable = true; - - register_cpu_notifier(&cpb_nb); - - msrs = msrs_alloc(); - if (!msrs) { - printk(KERN_ERR "%s: Error allocating msrs!\n", __func__); - return -ENOMEM; - } - - rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs); - - for_each_cpu(cpu, cpu_online_mask) { - struct msr *reg = per_cpu_ptr(msrs, cpu); - cpb_enabled |= !(!!(reg->l & BIT(25))); - } - - printk(KERN_INFO PFX "Core Performance Boosting: %s.\n", - (cpb_enabled ? "on" : "off")); - } - - return cpufreq_register_driver(&cpufreq_amd64_driver); -} - -/* driver entry point for term */ -static void __exit powernowk8_exit(void) -{ - dprintk("exit\n"); - - if (boot_cpu_has(X86_FEATURE_CPB)) { - msrs_free(msrs); - msrs = NULL; - - unregister_cpu_notifier(&cpb_nb); - } - - cpufreq_unregister_driver(&cpufreq_amd64_driver); -} - -MODULE_AUTHOR("Paul Devriendt <paul.devriendt@amd.com> and " - "Mark Langsdorf <mark.langsdorf@amd.com>"); -MODULE_DESCRIPTION("AMD Athlon 64 and Opteron processor frequency driver."); -MODULE_LICENSE("GPL"); - -late_initcall(powernowk8_init); -module_exit(powernowk8_exit); diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h deleted file mode 100644 index df3529b1c02..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h +++ /dev/null @@ -1,224 +0,0 @@ -/* - * (c) 2003-2006 Advanced Micro Devices, Inc. - * Your use of this code is subject to the terms and conditions of the - * GNU general public license version 2. See "COPYING" or - * http://www.gnu.org/licenses/gpl.html - */ - -enum pstate { - HW_PSTATE_INVALID = 0xff, - HW_PSTATE_0 = 0, - HW_PSTATE_1 = 1, - HW_PSTATE_2 = 2, - HW_PSTATE_3 = 3, - HW_PSTATE_4 = 4, - HW_PSTATE_5 = 5, - HW_PSTATE_6 = 6, - HW_PSTATE_7 = 7, -}; - -struct powernow_k8_data { - unsigned int cpu; - - u32 numps; /* number of p-states */ - u32 batps; /* number of p-states supported on battery */ - u32 max_hw_pstate; /* maximum legal hardware pstate */ - - /* these values are constant when the PSB is used to determine - * vid/fid pairings, but are modified during the ->target() call - * when ACPI is used */ - u32 rvo; /* ramp voltage offset */ - u32 irt; /* isochronous relief time */ - u32 vidmvs; /* usable value calculated from mvs */ - u32 vstable; /* voltage stabilization time, units 20 us */ - u32 plllock; /* pll lock time, units 1 us */ - u32 exttype; /* extended interface = 1 */ - - /* keep track of the current fid / vid or pstate */ - u32 currvid; - u32 currfid; - enum pstate currpstate; - - /* the powernow_table includes all frequency and vid/fid pairings: - * fid are the lower 8 bits of the index, vid are the upper 8 bits. - * frequency is in kHz */ - struct cpufreq_frequency_table *powernow_table; - - /* the acpi table needs to be kept. it's only available if ACPI was - * used to determine valid frequency/vid/fid states */ - struct acpi_processor_performance acpi_data; - - /* we need to keep track of associated cores, but let cpufreq - * handle hotplug events - so just point at cpufreq pol->cpus - * structure */ - struct cpumask *available_cores; -}; - -/* processor's cpuid instruction support */ -#define CPUID_PROCESSOR_SIGNATURE 1 /* function 1 */ -#define CPUID_XFAM 0x0ff00000 /* extended family */ -#define CPUID_XFAM_K8 0 -#define CPUID_XMOD 0x000f0000 /* extended model */ -#define CPUID_XMOD_REV_MASK 0x000c0000 -#define CPUID_XFAM_10H 0x00100000 /* family 0x10 */ -#define CPUID_USE_XFAM_XMOD 0x00000f00 -#define CPUID_GET_MAX_CAPABILITIES 0x80000000 -#define CPUID_FREQ_VOLT_CAPABILITIES 0x80000007 -#define P_STATE_TRANSITION_CAPABLE 6 - -/* Model Specific Registers for p-state transitions. MSRs are 64-bit. For */ -/* writes (wrmsr - opcode 0f 30), the register number is placed in ecx, and */ -/* the value to write is placed in edx:eax. For reads (rdmsr - opcode 0f 32), */ -/* the register number is placed in ecx, and the data is returned in edx:eax. */ - -#define MSR_FIDVID_CTL 0xc0010041 -#define MSR_FIDVID_STATUS 0xc0010042 - -/* Field definitions within the FID VID Low Control MSR : */ -#define MSR_C_LO_INIT_FID_VID 0x00010000 -#define MSR_C_LO_NEW_VID 0x00003f00 -#define MSR_C_LO_NEW_FID 0x0000003f -#define MSR_C_LO_VID_SHIFT 8 - -/* Field definitions within the FID VID High Control MSR : */ -#define MSR_C_HI_STP_GNT_TO 0x000fffff - -/* Field definitions within the FID VID Low Status MSR : */ -#define MSR_S_LO_CHANGE_PENDING 0x80000000 /* cleared when completed */ -#define MSR_S_LO_MAX_RAMP_VID 0x3f000000 -#define MSR_S_LO_MAX_FID 0x003f0000 -#define MSR_S_LO_START_FID 0x00003f00 -#define MSR_S_LO_CURRENT_FID 0x0000003f - -/* Field definitions within the FID VID High Status MSR : */ -#define MSR_S_HI_MIN_WORKING_VID 0x3f000000 -#define MSR_S_HI_MAX_WORKING_VID 0x003f0000 -#define MSR_S_HI_START_VID 0x00003f00 -#define MSR_S_HI_CURRENT_VID 0x0000003f -#define MSR_C_HI_STP_GNT_BENIGN 0x00000001 - - -/* Hardware Pstate _PSS and MSR definitions */ -#define USE_HW_PSTATE 0x00000080 -#define HW_PSTATE_MASK 0x00000007 -#define HW_PSTATE_VALID_MASK 0x80000000 -#define HW_PSTATE_MAX_MASK 0x000000f0 -#define HW_PSTATE_MAX_SHIFT 4 -#define MSR_PSTATE_DEF_BASE 0xc0010064 /* base of Pstate MSRs */ -#define MSR_PSTATE_STATUS 0xc0010063 /* Pstate Status MSR */ -#define MSR_PSTATE_CTRL 0xc0010062 /* Pstate control MSR */ -#define MSR_PSTATE_CUR_LIMIT 0xc0010061 /* pstate current limit MSR */ - -/* define the two driver architectures */ -#define CPU_OPTERON 0 -#define CPU_HW_PSTATE 1 - - -/* - * There are restrictions frequencies have to follow: - * - only 1 entry in the low fid table ( <=1.4GHz ) - * - lowest entry in the high fid table must be >= 2 * the entry in the - * low fid table - * - lowest entry in the high fid table must be a <= 200MHz + 2 * the entry - * in the low fid table - * - the parts can only step at <= 200 MHz intervals, odd fid values are - * supported in revision G and later revisions. - * - lowest frequency must be >= interprocessor hypertransport link speed - * (only applies to MP systems obviously) - */ - -/* fids (frequency identifiers) are arranged in 2 tables - lo and hi */ -#define LO_FID_TABLE_TOP 7 /* fid values marking the boundary */ -#define HI_FID_TABLE_BOTTOM 8 /* between the low and high tables */ - -#define LO_VCOFREQ_TABLE_TOP 1400 /* corresponding vco frequency values */ -#define HI_VCOFREQ_TABLE_BOTTOM 1600 - -#define MIN_FREQ_RESOLUTION 200 /* fids jump by 2 matching freq jumps by 200 */ - -#define MAX_FID 0x2a /* Spec only gives FID values as far as 5 GHz */ -#define LEAST_VID 0x3e /* Lowest (numerically highest) useful vid value */ - -#define MIN_FREQ 800 /* Min and max freqs, per spec */ -#define MAX_FREQ 5000 - -#define INVALID_FID_MASK 0xffffffc0 /* not a valid fid if these bits are set */ -#define INVALID_VID_MASK 0xffffffc0 /* not a valid vid if these bits are set */ - -#define VID_OFF 0x3f - -#define STOP_GRANT_5NS 1 /* min poss memory access latency for voltage change */ - -#define PLL_LOCK_CONVERSION (1000/5) /* ms to ns, then divide by clock period */ - -#define MAXIMUM_VID_STEPS 1 /* Current cpus only allow a single step of 25mV */ -#define VST_UNITS_20US 20 /* Voltage Stabilization Time is in units of 20us */ - -/* - * Most values of interest are encoded in a single field of the _PSS - * entries: the "control" value. - */ - -#define IRT_SHIFT 30 -#define RVO_SHIFT 28 -#define EXT_TYPE_SHIFT 27 -#define PLL_L_SHIFT 20 -#define MVS_SHIFT 18 -#define VST_SHIFT 11 -#define VID_SHIFT 6 -#define IRT_MASK 3 -#define RVO_MASK 3 -#define EXT_TYPE_MASK 1 -#define PLL_L_MASK 0x7f -#define MVS_MASK 3 -#define VST_MASK 0x7f -#define VID_MASK 0x1f -#define FID_MASK 0x1f -#define EXT_VID_MASK 0x3f -#define EXT_FID_MASK 0x3f - - -/* - * Version 1.4 of the PSB table. This table is constructed by BIOS and is - * to tell the OS's power management driver which VIDs and FIDs are - * supported by this particular processor. - * If the data in the PSB / PST is wrong, then this driver will program the - * wrong values into hardware, which is very likely to lead to a crash. - */ - -#define PSB_ID_STRING "AMDK7PNOW!" -#define PSB_ID_STRING_LEN 10 - -#define PSB_VERSION_1_4 0x14 - -struct psb_s { - u8 signature[10]; - u8 tableversion; - u8 flags1; - u16 vstable; - u8 flags2; - u8 num_tables; - u32 cpuid; - u8 plllocktime; - u8 maxfid; - u8 maxvid; - u8 numps; -}; - -/* Pairs of fid/vid values are appended to the version 1.4 PSB table. */ -struct pst_s { - u8 fid; - u8 vid; -}; - -#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "powernow-k8", msg) - -static int core_voltage_pre_transition(struct powernow_k8_data *data, - u32 reqvid, u32 regfid); -static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvid); -static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid); - -static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index); - -static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table); -static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table); diff --git a/arch/x86/kernel/cpu/cpufreq/sc520_freq.c b/arch/x86/kernel/cpu/cpufreq/sc520_freq.c deleted file mode 100644 index 435a996a613..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/sc520_freq.c +++ /dev/null @@ -1,194 +0,0 @@ -/* - * sc520_freq.c: cpufreq driver for the AMD Elan sc520 - * - * Copyright (C) 2005 Sean Young <sean@mess.org> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Based on elanfreq.c - * - * 2005-03-30: - initial revision - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> - -#include <linux/delay.h> -#include <linux/cpufreq.h> -#include <linux/timex.h> -#include <linux/io.h> - -#include <asm/msr.h> - -#define MMCR_BASE 0xfffef000 /* The default base address */ -#define OFFS_CPUCTL 0x2 /* CPU Control Register */ - -static __u8 __iomem *cpuctl; - -#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ - "sc520_freq", msg) -#define PFX "sc520_freq: " - -static struct cpufreq_frequency_table sc520_freq_table[] = { - {0x01, 100000}, - {0x02, 133000}, - {0, CPUFREQ_TABLE_END}, -}; - -static unsigned int sc520_freq_get_cpu_frequency(unsigned int cpu) -{ - u8 clockspeed_reg = *cpuctl; - - switch (clockspeed_reg & 0x03) { - default: - printk(KERN_ERR PFX "error: cpuctl register has unexpected " - "value %02x\n", clockspeed_reg); - case 0x01: - return 100000; - case 0x02: - return 133000; - } -} - -static void sc520_freq_set_cpu_state(unsigned int state) -{ - - struct cpufreq_freqs freqs; - u8 clockspeed_reg; - - freqs.old = sc520_freq_get_cpu_frequency(0); - freqs.new = sc520_freq_table[state].frequency; - freqs.cpu = 0; /* AMD Elan is UP */ - - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - - dprintk("attempting to set frequency to %i kHz\n", - sc520_freq_table[state].frequency); - - local_irq_disable(); - - clockspeed_reg = *cpuctl & ~0x03; - *cpuctl = clockspeed_reg | sc520_freq_table[state].index; - - local_irq_enable(); - - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); -}; - -static int sc520_freq_verify(struct cpufreq_policy *policy) -{ - return cpufreq_frequency_table_verify(policy, &sc520_freq_table[0]); -} - -static int sc520_freq_target(struct cpufreq_policy *policy, - unsigned int target_freq, - unsigned int relation) -{ - unsigned int newstate = 0; - - if (cpufreq_frequency_table_target(policy, sc520_freq_table, - target_freq, relation, &newstate)) - return -EINVAL; - - sc520_freq_set_cpu_state(newstate); - - return 0; -} - - -/* - * Module init and exit code - */ - -static int sc520_freq_cpu_init(struct cpufreq_policy *policy) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - int result; - - /* capability check */ - if (c->x86_vendor != X86_VENDOR_AMD || - c->x86 != 4 || c->x86_model != 9) - return -ENODEV; - - /* cpuinfo and default policy values */ - policy->cpuinfo.transition_latency = 1000000; /* 1ms */ - policy->cur = sc520_freq_get_cpu_frequency(0); - - result = cpufreq_frequency_table_cpuinfo(policy, sc520_freq_table); - if (result) - return result; - - cpufreq_frequency_table_get_attr(sc520_freq_table, policy->cpu); - - return 0; -} - - -static int sc520_freq_cpu_exit(struct cpufreq_policy *policy) -{ - cpufreq_frequency_table_put_attr(policy->cpu); - return 0; -} - - -static struct freq_attr *sc520_freq_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - NULL, -}; - - -static struct cpufreq_driver sc520_freq_driver = { - .get = sc520_freq_get_cpu_frequency, - .verify = sc520_freq_verify, - .target = sc520_freq_target, - .init = sc520_freq_cpu_init, - .exit = sc520_freq_cpu_exit, - .name = "sc520_freq", - .owner = THIS_MODULE, - .attr = sc520_freq_attr, -}; - - -static int __init sc520_freq_init(void) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - int err; - - /* Test if we have the right hardware */ - if (c->x86_vendor != X86_VENDOR_AMD || - c->x86 != 4 || c->x86_model != 9) { - dprintk("no Elan SC520 processor found!\n"); - return -ENODEV; - } - cpuctl = ioremap((unsigned long)(MMCR_BASE + OFFS_CPUCTL), 1); - if (!cpuctl) { - printk(KERN_ERR "sc520_freq: error: failed to remap memory\n"); - return -ENOMEM; - } - - err = cpufreq_register_driver(&sc520_freq_driver); - if (err) - iounmap(cpuctl); - - return err; -} - - -static void __exit sc520_freq_exit(void) -{ - cpufreq_unregister_driver(&sc520_freq_driver); - iounmap(cpuctl); -} - - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Sean Young <sean@mess.org>"); -MODULE_DESCRIPTION("cpufreq driver for AMD's Elan sc520 CPU"); - -module_init(sc520_freq_init); -module_exit(sc520_freq_exit); - diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c deleted file mode 100644 index 9b1ff37de46..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c +++ /dev/null @@ -1,636 +0,0 @@ -/* - * cpufreq driver for Enhanced SpeedStep, as found in Intel's Pentium - * M (part of the Centrino chipset). - * - * Since the original Pentium M, most new Intel CPUs support Enhanced - * SpeedStep. - * - * Despite the "SpeedStep" in the name, this is almost entirely unlike - * traditional SpeedStep. - * - * Modelled on speedstep.c - * - * Copyright (C) 2003 Jeremy Fitzhardinge <jeremy@goop.org> - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/cpufreq.h> -#include <linux/sched.h> /* current */ -#include <linux/delay.h> -#include <linux/compiler.h> -#include <linux/gfp.h> - -#include <asm/msr.h> -#include <asm/processor.h> -#include <asm/cpufeature.h> - -#define PFX "speedstep-centrino: " -#define MAINTAINER "cpufreq@vger.kernel.org" - -#define dprintk(msg...) \ - cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg) - -#define INTEL_MSR_RANGE (0xffff) - -struct cpu_id -{ - __u8 x86; /* CPU family */ - __u8 x86_model; /* model */ - __u8 x86_mask; /* stepping */ -}; - -enum { - CPU_BANIAS, - CPU_DOTHAN_A1, - CPU_DOTHAN_A2, - CPU_DOTHAN_B0, - CPU_MP4HT_D0, - CPU_MP4HT_E0, -}; - -static const struct cpu_id cpu_ids[] = { - [CPU_BANIAS] = { 6, 9, 5 }, - [CPU_DOTHAN_A1] = { 6, 13, 1 }, - [CPU_DOTHAN_A2] = { 6, 13, 2 }, - [CPU_DOTHAN_B0] = { 6, 13, 6 }, - [CPU_MP4HT_D0] = {15, 3, 4 }, - [CPU_MP4HT_E0] = {15, 4, 1 }, -}; -#define N_IDS ARRAY_SIZE(cpu_ids) - -struct cpu_model -{ - const struct cpu_id *cpu_id; - const char *model_name; - unsigned max_freq; /* max clock in kHz */ - - struct cpufreq_frequency_table *op_points; /* clock/voltage pairs */ -}; -static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, - const struct cpu_id *x); - -/* Operating points for current CPU */ -static DEFINE_PER_CPU(struct cpu_model *, centrino_model); -static DEFINE_PER_CPU(const struct cpu_id *, centrino_cpu); - -static struct cpufreq_driver centrino_driver; - -#ifdef CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE - -/* Computes the correct form for IA32_PERF_CTL MSR for a particular - frequency/voltage operating point; frequency in MHz, volts in mV. - This is stored as "index" in the structure. */ -#define OP(mhz, mv) \ - { \ - .frequency = (mhz) * 1000, \ - .index = (((mhz)/100) << 8) | ((mv - 700) / 16) \ - } - -/* - * These voltage tables were derived from the Intel Pentium M - * datasheet, document 25261202.pdf, Table 5. I have verified they - * are consistent with my IBM ThinkPad X31, which has a 1.3GHz Pentium - * M. - */ - -/* Ultra Low Voltage Intel Pentium M processor 900MHz (Banias) */ -static struct cpufreq_frequency_table banias_900[] = -{ - OP(600, 844), - OP(800, 988), - OP(900, 1004), - { .frequency = CPUFREQ_TABLE_END } -}; - -/* Ultra Low Voltage Intel Pentium M processor 1000MHz (Banias) */ -static struct cpufreq_frequency_table banias_1000[] = -{ - OP(600, 844), - OP(800, 972), - OP(900, 988), - OP(1000, 1004), - { .frequency = CPUFREQ_TABLE_END } -}; - -/* Low Voltage Intel Pentium M processor 1.10GHz (Banias) */ -static struct cpufreq_frequency_table banias_1100[] = -{ - OP( 600, 956), - OP( 800, 1020), - OP( 900, 1100), - OP(1000, 1164), - OP(1100, 1180), - { .frequency = CPUFREQ_TABLE_END } -}; - - -/* Low Voltage Intel Pentium M processor 1.20GHz (Banias) */ -static struct cpufreq_frequency_table banias_1200[] = -{ - OP( 600, 956), - OP( 800, 1004), - OP( 900, 1020), - OP(1000, 1100), - OP(1100, 1164), - OP(1200, 1180), - { .frequency = CPUFREQ_TABLE_END } -}; - -/* Intel Pentium M processor 1.30GHz (Banias) */ -static struct cpufreq_frequency_table banias_1300[] = -{ - OP( 600, 956), - OP( 800, 1260), - OP(1000, 1292), - OP(1200, 1356), - OP(1300, 1388), - { .frequency = CPUFREQ_TABLE_END } -}; - -/* Intel Pentium M processor 1.40GHz (Banias) */ -static struct cpufreq_frequency_table banias_1400[] = -{ - OP( 600, 956), - OP( 800, 1180), - OP(1000, 1308), - OP(1200, 1436), - OP(1400, 1484), - { .frequency = CPUFREQ_TABLE_END } -}; - -/* Intel Pentium M processor 1.50GHz (Banias) */ -static struct cpufreq_frequency_table banias_1500[] = -{ - OP( 600, 956), - OP( 800, 1116), - OP(1000, 1228), - OP(1200, 1356), - OP(1400, 1452), - OP(1500, 1484), - { .frequency = CPUFREQ_TABLE_END } -}; - -/* Intel Pentium M processor 1.60GHz (Banias) */ -static struct cpufreq_frequency_table banias_1600[] = -{ - OP( 600, 956), - OP( 800, 1036), - OP(1000, 1164), - OP(1200, 1276), - OP(1400, 1420), - OP(1600, 1484), - { .frequency = CPUFREQ_TABLE_END } -}; - -/* Intel Pentium M processor 1.70GHz (Banias) */ -static struct cpufreq_frequency_table banias_1700[] = -{ - OP( 600, 956), - OP( 800, 1004), - OP(1000, 1116), - OP(1200, 1228), - OP(1400, 1308), - OP(1700, 1484), - { .frequency = CPUFREQ_TABLE_END } -}; -#undef OP - -#define _BANIAS(cpuid, max, name) \ -{ .cpu_id = cpuid, \ - .model_name = "Intel(R) Pentium(R) M processor " name "MHz", \ - .max_freq = (max)*1000, \ - .op_points = banias_##max, \ -} -#define BANIAS(max) _BANIAS(&cpu_ids[CPU_BANIAS], max, #max) - -/* CPU models, their operating frequency range, and freq/voltage - operating points */ -static struct cpu_model models[] = -{ - _BANIAS(&cpu_ids[CPU_BANIAS], 900, " 900"), - BANIAS(1000), - BANIAS(1100), - BANIAS(1200), - BANIAS(1300), - BANIAS(1400), - BANIAS(1500), - BANIAS(1600), - BANIAS(1700), - - /* NULL model_name is a wildcard */ - { &cpu_ids[CPU_DOTHAN_A1], NULL, 0, NULL }, - { &cpu_ids[CPU_DOTHAN_A2], NULL, 0, NULL }, - { &cpu_ids[CPU_DOTHAN_B0], NULL, 0, NULL }, - { &cpu_ids[CPU_MP4HT_D0], NULL, 0, NULL }, - { &cpu_ids[CPU_MP4HT_E0], NULL, 0, NULL }, - - { NULL, } -}; -#undef _BANIAS -#undef BANIAS - -static int centrino_cpu_init_table(struct cpufreq_policy *policy) -{ - struct cpuinfo_x86 *cpu = &cpu_data(policy->cpu); - struct cpu_model *model; - - for(model = models; model->cpu_id != NULL; model++) - if (centrino_verify_cpu_id(cpu, model->cpu_id) && - (model->model_name == NULL || - strcmp(cpu->x86_model_id, model->model_name) == 0)) - break; - - if (model->cpu_id == NULL) { - /* No match at all */ - dprintk("no support for CPU model \"%s\": " - "send /proc/cpuinfo to " MAINTAINER "\n", - cpu->x86_model_id); - return -ENOENT; - } - - if (model->op_points == NULL) { - /* Matched a non-match */ - dprintk("no table support for CPU model \"%s\"\n", - cpu->x86_model_id); - dprintk("try using the acpi-cpufreq driver\n"); - return -ENOENT; - } - - per_cpu(centrino_model, policy->cpu) = model; - - dprintk("found \"%s\": max frequency: %dkHz\n", - model->model_name, model->max_freq); - - return 0; -} - -#else -static inline int centrino_cpu_init_table(struct cpufreq_policy *policy) -{ - return -ENODEV; -} -#endif /* CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE */ - -static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, - const struct cpu_id *x) -{ - if ((c->x86 == x->x86) && - (c->x86_model == x->x86_model) && - (c->x86_mask == x->x86_mask)) - return 1; - return 0; -} - -/* To be called only after centrino_model is initialized */ -static unsigned extract_clock(unsigned msr, unsigned int cpu, int failsafe) -{ - int i; - - /* - * Extract clock in kHz from PERF_CTL value - * for centrino, as some DSDTs are buggy. - * Ideally, this can be done using the acpi_data structure. - */ - if ((per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_BANIAS]) || - (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_A1]) || - (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_B0])) { - msr = (msr >> 8) & 0xff; - return msr * 100000; - } - - if ((!per_cpu(centrino_model, cpu)) || - (!per_cpu(centrino_model, cpu)->op_points)) - return 0; - - msr &= 0xffff; - for (i = 0; - per_cpu(centrino_model, cpu)->op_points[i].frequency - != CPUFREQ_TABLE_END; - i++) { - if (msr == per_cpu(centrino_model, cpu)->op_points[i].index) - return per_cpu(centrino_model, cpu)-> - op_points[i].frequency; - } - if (failsafe) - return per_cpu(centrino_model, cpu)->op_points[i-1].frequency; - else - return 0; -} - -/* Return the current CPU frequency in kHz */ -static unsigned int get_cur_freq(unsigned int cpu) -{ - unsigned l, h; - unsigned clock_freq; - - rdmsr_on_cpu(cpu, MSR_IA32_PERF_STATUS, &l, &h); - clock_freq = extract_clock(l, cpu, 0); - - if (unlikely(clock_freq == 0)) { - /* - * On some CPUs, we can see transient MSR values (which are - * not present in _PSS), while CPU is doing some automatic - * P-state transition (like TM2). Get the last freq set - * in PERF_CTL. - */ - rdmsr_on_cpu(cpu, MSR_IA32_PERF_CTL, &l, &h); - clock_freq = extract_clock(l, cpu, 1); - } - return clock_freq; -} - - -static int centrino_cpu_init(struct cpufreq_policy *policy) -{ - struct cpuinfo_x86 *cpu = &cpu_data(policy->cpu); - unsigned freq; - unsigned l, h; - int ret; - int i; - - /* Only Intel makes Enhanced Speedstep-capable CPUs */ - if (cpu->x86_vendor != X86_VENDOR_INTEL || - !cpu_has(cpu, X86_FEATURE_EST)) - return -ENODEV; - - if (cpu_has(cpu, X86_FEATURE_CONSTANT_TSC)) - centrino_driver.flags |= CPUFREQ_CONST_LOOPS; - - if (policy->cpu != 0) - return -ENODEV; - - for (i = 0; i < N_IDS; i++) - if (centrino_verify_cpu_id(cpu, &cpu_ids[i])) - break; - - if (i != N_IDS) - per_cpu(centrino_cpu, policy->cpu) = &cpu_ids[i]; - - if (!per_cpu(centrino_cpu, policy->cpu)) { - dprintk("found unsupported CPU with " - "Enhanced SpeedStep: send /proc/cpuinfo to " - MAINTAINER "\n"); - return -ENODEV; - } - - if (centrino_cpu_init_table(policy)) { - return -ENODEV; - } - - /* Check to see if Enhanced SpeedStep is enabled, and try to - enable it if not. */ - rdmsr(MSR_IA32_MISC_ENABLE, l, h); - - if (!(l & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) { - l |= MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP; - dprintk("trying to enable Enhanced SpeedStep (%x)\n", l); - wrmsr(MSR_IA32_MISC_ENABLE, l, h); - - /* check to see if it stuck */ - rdmsr(MSR_IA32_MISC_ENABLE, l, h); - if (!(l & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) { - printk(KERN_INFO PFX - "couldn't enable Enhanced SpeedStep\n"); - return -ENODEV; - } - } - - freq = get_cur_freq(policy->cpu); - policy->cpuinfo.transition_latency = 10000; - /* 10uS transition latency */ - policy->cur = freq; - - dprintk("centrino_cpu_init: cur=%dkHz\n", policy->cur); - - ret = cpufreq_frequency_table_cpuinfo(policy, - per_cpu(centrino_model, policy->cpu)->op_points); - if (ret) - return (ret); - - cpufreq_frequency_table_get_attr( - per_cpu(centrino_model, policy->cpu)->op_points, policy->cpu); - - return 0; -} - -static int centrino_cpu_exit(struct cpufreq_policy *policy) -{ - unsigned int cpu = policy->cpu; - - if (!per_cpu(centrino_model, cpu)) - return -ENODEV; - - cpufreq_frequency_table_put_attr(cpu); - - per_cpu(centrino_model, cpu) = NULL; - - return 0; -} - -/** - * centrino_verify - verifies a new CPUFreq policy - * @policy: new policy - * - * Limit must be within this model's frequency range at least one - * border included. - */ -static int centrino_verify (struct cpufreq_policy *policy) -{ - return cpufreq_frequency_table_verify(policy, - per_cpu(centrino_model, policy->cpu)->op_points); -} - -/** - * centrino_setpolicy - set a new CPUFreq policy - * @policy: new policy - * @target_freq: the target frequency - * @relation: how that frequency relates to achieved frequency - * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H) - * - * Sets a new CPUFreq policy. - */ -static int centrino_target (struct cpufreq_policy *policy, - unsigned int target_freq, - unsigned int relation) -{ - unsigned int newstate = 0; - unsigned int msr, oldmsr = 0, h = 0, cpu = policy->cpu; - struct cpufreq_freqs freqs; - int retval = 0; - unsigned int j, k, first_cpu, tmp; - cpumask_var_t covered_cpus; - - if (unlikely(!zalloc_cpumask_var(&covered_cpus, GFP_KERNEL))) - return -ENOMEM; - - if (unlikely(per_cpu(centrino_model, cpu) == NULL)) { - retval = -ENODEV; - goto out; - } - - if (unlikely(cpufreq_frequency_table_target(policy, - per_cpu(centrino_model, cpu)->op_points, - target_freq, - relation, - &newstate))) { - retval = -EINVAL; - goto out; - } - - first_cpu = 1; - for_each_cpu(j, policy->cpus) { - int good_cpu; - - /* cpufreq holds the hotplug lock, so we are safe here */ - if (!cpu_online(j)) - continue; - - /* - * Support for SMP systems. - * Make sure we are running on CPU that wants to change freq - */ - if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) - good_cpu = cpumask_any_and(policy->cpus, - cpu_online_mask); - else - good_cpu = j; - - if (good_cpu >= nr_cpu_ids) { - dprintk("couldn't limit to CPUs in this domain\n"); - retval = -EAGAIN; - if (first_cpu) { - /* We haven't started the transition yet. */ - goto out; - } - break; - } - - msr = per_cpu(centrino_model, cpu)->op_points[newstate].index; - - if (first_cpu) { - rdmsr_on_cpu(good_cpu, MSR_IA32_PERF_CTL, &oldmsr, &h); - if (msr == (oldmsr & 0xffff)) { - dprintk("no change needed - msr was and needs " - "to be %x\n", oldmsr); - retval = 0; - goto out; - } - - freqs.old = extract_clock(oldmsr, cpu, 0); - freqs.new = extract_clock(msr, cpu, 0); - - dprintk("target=%dkHz old=%d new=%d msr=%04x\n", - target_freq, freqs.old, freqs.new, msr); - - for_each_cpu(k, policy->cpus) { - if (!cpu_online(k)) - continue; - freqs.cpu = k; - cpufreq_notify_transition(&freqs, - CPUFREQ_PRECHANGE); - } - - first_cpu = 0; - /* all but 16 LSB are reserved, treat them with care */ - oldmsr &= ~0xffff; - msr &= 0xffff; - oldmsr |= msr; - } - - wrmsr_on_cpu(good_cpu, MSR_IA32_PERF_CTL, oldmsr, h); - if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) - break; - - cpumask_set_cpu(j, covered_cpus); - } - - for_each_cpu(k, policy->cpus) { - if (!cpu_online(k)) - continue; - freqs.cpu = k; - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - } - - if (unlikely(retval)) { - /* - * We have failed halfway through the frequency change. - * We have sent callbacks to policy->cpus and - * MSRs have already been written on coverd_cpus. - * Best effort undo.. - */ - - for_each_cpu(j, covered_cpus) - wrmsr_on_cpu(j, MSR_IA32_PERF_CTL, oldmsr, h); - - tmp = freqs.new; - freqs.new = freqs.old; - freqs.old = tmp; - for_each_cpu(j, policy->cpus) { - if (!cpu_online(j)) - continue; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - } - } - retval = 0; - -out: - free_cpumask_var(covered_cpus); - return retval; -} - -static struct freq_attr* centrino_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - NULL, -}; - -static struct cpufreq_driver centrino_driver = { - .name = "centrino", /* should be speedstep-centrino, - but there's a 16 char limit */ - .init = centrino_cpu_init, - .exit = centrino_cpu_exit, - .verify = centrino_verify, - .target = centrino_target, - .get = get_cur_freq, - .attr = centrino_attr, - .owner = THIS_MODULE, -}; - - -/** - * centrino_init - initializes the Enhanced SpeedStep CPUFreq driver - * - * Initializes the Enhanced SpeedStep support. Returns -ENODEV on - * unsupported devices, -ENOENT if there's no voltage table for this - * particular CPU model, -EINVAL on problems during initiatization, - * and zero on success. - * - * This is quite picky. Not only does the CPU have to advertise the - * "est" flag in the cpuid capability flags, we look for a specific - * CPU model and stepping, and we need to have the exact model name in - * our voltage tables. That is, be paranoid about not releasing - * someone's valuable magic smoke. - */ -static int __init centrino_init(void) -{ - struct cpuinfo_x86 *cpu = &cpu_data(0); - - if (!cpu_has(cpu, X86_FEATURE_EST)) - return -ENODEV; - - return cpufreq_register_driver(¢rino_driver); -} - -static void __exit centrino_exit(void) -{ - cpufreq_unregister_driver(¢rino_driver); -} - -MODULE_AUTHOR ("Jeremy Fitzhardinge <jeremy@goop.org>"); -MODULE_DESCRIPTION ("Enhanced SpeedStep driver for Intel Pentium M processors."); -MODULE_LICENSE ("GPL"); - -late_initcall(centrino_init); -module_exit(centrino_exit); diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c deleted file mode 100644 index 561758e9518..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c +++ /dev/null @@ -1,452 +0,0 @@ -/* - * (C) 2001 Dave Jones, Arjan van de ven. - * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de> - * - * Licensed under the terms of the GNU GPL License version 2. - * Based upon reverse engineered information, and on Intel documentation - * for chipsets ICH2-M and ICH3-M. - * - * Many thanks to Ducrot Bruno for finding and fixing the last - * "missing link" for ICH2-M/ICH3-M support, and to Thomas Winkler - * for extensive testing. - * - * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous* - */ - - -/********************************************************************* - * SPEEDSTEP - DEFINITIONS * - *********************************************************************/ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/cpufreq.h> -#include <linux/pci.h> -#include <linux/sched.h> - -#include "speedstep-lib.h" - - -/* speedstep_chipset: - * It is necessary to know which chipset is used. As accesses to - * this device occur at various places in this module, we need a - * static struct pci_dev * pointing to that device. - */ -static struct pci_dev *speedstep_chipset_dev; - - -/* speedstep_processor - */ -static enum speedstep_processor speedstep_processor; - -static u32 pmbase; - -/* - * There are only two frequency states for each processor. Values - * are in kHz for the time being. - */ -static struct cpufreq_frequency_table speedstep_freqs[] = { - {SPEEDSTEP_HIGH, 0}, - {SPEEDSTEP_LOW, 0}, - {0, CPUFREQ_TABLE_END}, -}; - - -#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ - "speedstep-ich", msg) - - -/** - * speedstep_find_register - read the PMBASE address - * - * Returns: -ENODEV if no register could be found - */ -static int speedstep_find_register(void) -{ - if (!speedstep_chipset_dev) - return -ENODEV; - - /* get PMBASE */ - pci_read_config_dword(speedstep_chipset_dev, 0x40, &pmbase); - if (!(pmbase & 0x01)) { - printk(KERN_ERR "speedstep-ich: could not find speedstep register\n"); - return -ENODEV; - } - - pmbase &= 0xFFFFFFFE; - if (!pmbase) { - printk(KERN_ERR "speedstep-ich: could not find speedstep register\n"); - return -ENODEV; - } - - dprintk("pmbase is 0x%x\n", pmbase); - return 0; -} - -/** - * speedstep_set_state - set the SpeedStep state - * @state: new processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH) - * - * Tries to change the SpeedStep state. Can be called from - * smp_call_function_single. - */ -static void speedstep_set_state(unsigned int state) -{ - u8 pm2_blk; - u8 value; - unsigned long flags; - - if (state > 0x1) - return; - - /* Disable IRQs */ - local_irq_save(flags); - - /* read state */ - value = inb(pmbase + 0x50); - - dprintk("read at pmbase 0x%x + 0x50 returned 0x%x\n", pmbase, value); - - /* write new state */ - value &= 0xFE; - value |= state; - - dprintk("writing 0x%x to pmbase 0x%x + 0x50\n", value, pmbase); - - /* Disable bus master arbitration */ - pm2_blk = inb(pmbase + 0x20); - pm2_blk |= 0x01; - outb(pm2_blk, (pmbase + 0x20)); - - /* Actual transition */ - outb(value, (pmbase + 0x50)); - - /* Restore bus master arbitration */ - pm2_blk &= 0xfe; - outb(pm2_blk, (pmbase + 0x20)); - - /* check if transition was successful */ - value = inb(pmbase + 0x50); - - /* Enable IRQs */ - local_irq_restore(flags); - - dprintk("read at pmbase 0x%x + 0x50 returned 0x%x\n", pmbase, value); - - if (state == (value & 0x1)) - dprintk("change to %u MHz succeeded\n", - speedstep_get_frequency(speedstep_processor) / 1000); - else - printk(KERN_ERR "cpufreq: change failed - I/O error\n"); - - return; -} - -/* Wrapper for smp_call_function_single. */ -static void _speedstep_set_state(void *_state) -{ - speedstep_set_state(*(unsigned int *)_state); -} - -/** - * speedstep_activate - activate SpeedStep control in the chipset - * - * Tries to activate the SpeedStep status and control registers. - * Returns -EINVAL on an unsupported chipset, and zero on success. - */ -static int speedstep_activate(void) -{ - u16 value = 0; - - if (!speedstep_chipset_dev) - return -EINVAL; - - pci_read_config_word(speedstep_chipset_dev, 0x00A0, &value); - if (!(value & 0x08)) { - value |= 0x08; - dprintk("activating SpeedStep (TM) registers\n"); - pci_write_config_word(speedstep_chipset_dev, 0x00A0, value); - } - - return 0; -} - - -/** - * speedstep_detect_chipset - detect the Southbridge which contains SpeedStep logic - * - * Detects ICH2-M, ICH3-M and ICH4-M so far. The pci_dev points to - * the LPC bridge / PM module which contains all power-management - * functions. Returns the SPEEDSTEP_CHIPSET_-number for the detected - * chipset, or zero on failure. - */ -static unsigned int speedstep_detect_chipset(void) -{ - speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL, - PCI_DEVICE_ID_INTEL_82801DB_12, - PCI_ANY_ID, PCI_ANY_ID, - NULL); - if (speedstep_chipset_dev) - return 4; /* 4-M */ - - speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL, - PCI_DEVICE_ID_INTEL_82801CA_12, - PCI_ANY_ID, PCI_ANY_ID, - NULL); - if (speedstep_chipset_dev) - return 3; /* 3-M */ - - - speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL, - PCI_DEVICE_ID_INTEL_82801BA_10, - PCI_ANY_ID, PCI_ANY_ID, - NULL); - if (speedstep_chipset_dev) { - /* speedstep.c causes lockups on Dell Inspirons 8000 and - * 8100 which use a pretty old revision of the 82815 - * host brige. Abort on these systems. - */ - static struct pci_dev *hostbridge; - - hostbridge = pci_get_subsys(PCI_VENDOR_ID_INTEL, - PCI_DEVICE_ID_INTEL_82815_MC, - PCI_ANY_ID, PCI_ANY_ID, - NULL); - - if (!hostbridge) - return 2; /* 2-M */ - - if (hostbridge->revision < 5) { - dprintk("hostbridge does not support speedstep\n"); - speedstep_chipset_dev = NULL; - pci_dev_put(hostbridge); - return 0; - } - - pci_dev_put(hostbridge); - return 2; /* 2-M */ - } - - return 0; -} - -static void get_freq_data(void *_speed) -{ - unsigned int *speed = _speed; - - *speed = speedstep_get_frequency(speedstep_processor); -} - -static unsigned int speedstep_get(unsigned int cpu) -{ - unsigned int speed; - - /* You're supposed to ensure CPU is online. */ - if (smp_call_function_single(cpu, get_freq_data, &speed, 1) != 0) - BUG(); - - dprintk("detected %u kHz as current frequency\n", speed); - return speed; -} - -/** - * speedstep_target - set a new CPUFreq policy - * @policy: new policy - * @target_freq: the target frequency - * @relation: how that frequency relates to achieved frequency - * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H) - * - * Sets a new CPUFreq policy. - */ -static int speedstep_target(struct cpufreq_policy *policy, - unsigned int target_freq, - unsigned int relation) -{ - unsigned int newstate = 0, policy_cpu; - struct cpufreq_freqs freqs; - int i; - - if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0], - target_freq, relation, &newstate)) - return -EINVAL; - - policy_cpu = cpumask_any_and(policy->cpus, cpu_online_mask); - freqs.old = speedstep_get(policy_cpu); - freqs.new = speedstep_freqs[newstate].frequency; - freqs.cpu = policy->cpu; - - dprintk("transiting from %u to %u kHz\n", freqs.old, freqs.new); - - /* no transition necessary */ - if (freqs.old == freqs.new) - return 0; - - for_each_cpu(i, policy->cpus) { - freqs.cpu = i; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - } - - smp_call_function_single(policy_cpu, _speedstep_set_state, &newstate, - true); - - for_each_cpu(i, policy->cpus) { - freqs.cpu = i; - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - } - - return 0; -} - - -/** - * speedstep_verify - verifies a new CPUFreq policy - * @policy: new policy - * - * Limit must be within speedstep_low_freq and speedstep_high_freq, with - * at least one border included. - */ -static int speedstep_verify(struct cpufreq_policy *policy) -{ - return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]); -} - -struct get_freqs { - struct cpufreq_policy *policy; - int ret; -}; - -static void get_freqs_on_cpu(void *_get_freqs) -{ - struct get_freqs *get_freqs = _get_freqs; - - get_freqs->ret = - speedstep_get_freqs(speedstep_processor, - &speedstep_freqs[SPEEDSTEP_LOW].frequency, - &speedstep_freqs[SPEEDSTEP_HIGH].frequency, - &get_freqs->policy->cpuinfo.transition_latency, - &speedstep_set_state); -} - -static int speedstep_cpu_init(struct cpufreq_policy *policy) -{ - int result; - unsigned int policy_cpu, speed; - struct get_freqs gf; - - /* only run on CPU to be set, or on its sibling */ -#ifdef CONFIG_SMP - cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu)); -#endif - policy_cpu = cpumask_any_and(policy->cpus, cpu_online_mask); - - /* detect low and high frequency and transition latency */ - gf.policy = policy; - smp_call_function_single(policy_cpu, get_freqs_on_cpu, &gf, 1); - if (gf.ret) - return gf.ret; - - /* get current speed setting */ - speed = speedstep_get(policy_cpu); - if (!speed) - return -EIO; - - dprintk("currently at %s speed setting - %i MHz\n", - (speed == speedstep_freqs[SPEEDSTEP_LOW].frequency) - ? "low" : "high", - (speed / 1000)); - - /* cpuinfo and default policy values */ - policy->cur = speed; - - result = cpufreq_frequency_table_cpuinfo(policy, speedstep_freqs); - if (result) - return result; - - cpufreq_frequency_table_get_attr(speedstep_freqs, policy->cpu); - - return 0; -} - - -static int speedstep_cpu_exit(struct cpufreq_policy *policy) -{ - cpufreq_frequency_table_put_attr(policy->cpu); - return 0; -} - -static struct freq_attr *speedstep_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - NULL, -}; - - -static struct cpufreq_driver speedstep_driver = { - .name = "speedstep-ich", - .verify = speedstep_verify, - .target = speedstep_target, - .init = speedstep_cpu_init, - .exit = speedstep_cpu_exit, - .get = speedstep_get, - .owner = THIS_MODULE, - .attr = speedstep_attr, -}; - - -/** - * speedstep_init - initializes the SpeedStep CPUFreq driver - * - * Initializes the SpeedStep support. Returns -ENODEV on unsupported - * devices, -EINVAL on problems during initiatization, and zero on - * success. - */ -static int __init speedstep_init(void) -{ - /* detect processor */ - speedstep_processor = speedstep_detect_processor(); - if (!speedstep_processor) { - dprintk("Intel(R) SpeedStep(TM) capable processor " - "not found\n"); - return -ENODEV; - } - - /* detect chipset */ - if (!speedstep_detect_chipset()) { - dprintk("Intel(R) SpeedStep(TM) for this chipset not " - "(yet) available.\n"); - return -ENODEV; - } - - /* activate speedstep support */ - if (speedstep_activate()) { - pci_dev_put(speedstep_chipset_dev); - return -EINVAL; - } - - if (speedstep_find_register()) - return -ENODEV; - - return cpufreq_register_driver(&speedstep_driver); -} - - -/** - * speedstep_exit - unregisters SpeedStep support - * - * Unregisters SpeedStep support. - */ -static void __exit speedstep_exit(void) -{ - pci_dev_put(speedstep_chipset_dev); - cpufreq_unregister_driver(&speedstep_driver); -} - - -MODULE_AUTHOR("Dave Jones <davej@redhat.com>, " - "Dominik Brodowski <linux@brodo.de>"); -MODULE_DESCRIPTION("Speedstep driver for Intel mobile processors on chipsets " - "with ICH-M southbridges."); -MODULE_LICENSE("GPL"); - -module_init(speedstep_init); -module_exit(speedstep_exit); diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c deleted file mode 100644 index a94ec6be69f..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c +++ /dev/null @@ -1,481 +0,0 @@ -/* - * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de> - * - * Licensed under the terms of the GNU GPL License version 2. - * - * Library for common functions for Intel SpeedStep v.1 and v.2 support - * - * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous* - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/moduleparam.h> -#include <linux/init.h> -#include <linux/cpufreq.h> - -#include <asm/msr.h> -#include <asm/tsc.h> -#include "speedstep-lib.h" - -#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ - "speedstep-lib", msg) - -#define PFX "speedstep-lib: " - -#ifdef CONFIG_X86_SPEEDSTEP_RELAXED_CAP_CHECK -static int relaxed_check; -#else -#define relaxed_check 0 -#endif - -/********************************************************************* - * GET PROCESSOR CORE SPEED IN KHZ * - *********************************************************************/ - -static unsigned int pentium3_get_frequency(enum speedstep_processor processor) -{ - /* See table 14 of p3_ds.pdf and table 22 of 29834003.pdf */ - struct { - unsigned int ratio; /* Frequency Multiplier (x10) */ - u8 bitmap; /* power on configuration bits - [27, 25:22] (in MSR 0x2a) */ - } msr_decode_mult[] = { - { 30, 0x01 }, - { 35, 0x05 }, - { 40, 0x02 }, - { 45, 0x06 }, - { 50, 0x00 }, - { 55, 0x04 }, - { 60, 0x0b }, - { 65, 0x0f }, - { 70, 0x09 }, - { 75, 0x0d }, - { 80, 0x0a }, - { 85, 0x26 }, - { 90, 0x20 }, - { 100, 0x2b }, - { 0, 0xff } /* error or unknown value */ - }; - - /* PIII(-M) FSB settings: see table b1-b of 24547206.pdf */ - struct { - unsigned int value; /* Front Side Bus speed in MHz */ - u8 bitmap; /* power on configuration bits [18: 19] - (in MSR 0x2a) */ - } msr_decode_fsb[] = { - { 66, 0x0 }, - { 100, 0x2 }, - { 133, 0x1 }, - { 0, 0xff} - }; - - u32 msr_lo, msr_tmp; - int i = 0, j = 0; - - /* read MSR 0x2a - we only need the low 32 bits */ - rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp); - dprintk("P3 - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", msr_lo, msr_tmp); - msr_tmp = msr_lo; - - /* decode the FSB */ - msr_tmp &= 0x00c0000; - msr_tmp >>= 18; - while (msr_tmp != msr_decode_fsb[i].bitmap) { - if (msr_decode_fsb[i].bitmap == 0xff) - return 0; - i++; - } - - /* decode the multiplier */ - if (processor == SPEEDSTEP_CPU_PIII_C_EARLY) { - dprintk("workaround for early PIIIs\n"); - msr_lo &= 0x03c00000; - } else - msr_lo &= 0x0bc00000; - msr_lo >>= 22; - while (msr_lo != msr_decode_mult[j].bitmap) { - if (msr_decode_mult[j].bitmap == 0xff) - return 0; - j++; - } - - dprintk("speed is %u\n", - (msr_decode_mult[j].ratio * msr_decode_fsb[i].value * 100)); - - return msr_decode_mult[j].ratio * msr_decode_fsb[i].value * 100; -} - - -static unsigned int pentiumM_get_frequency(void) -{ - u32 msr_lo, msr_tmp; - - rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp); - dprintk("PM - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", msr_lo, msr_tmp); - - /* see table B-2 of 24547212.pdf */ - if (msr_lo & 0x00040000) { - printk(KERN_DEBUG PFX "PM - invalid FSB: 0x%x 0x%x\n", - msr_lo, msr_tmp); - return 0; - } - - msr_tmp = (msr_lo >> 22) & 0x1f; - dprintk("bits 22-26 are 0x%x, speed is %u\n", - msr_tmp, (msr_tmp * 100 * 1000)); - - return msr_tmp * 100 * 1000; -} - -static unsigned int pentium_core_get_frequency(void) -{ - u32 fsb = 0; - u32 msr_lo, msr_tmp; - int ret; - - rdmsr(MSR_FSB_FREQ, msr_lo, msr_tmp); - /* see table B-2 of 25366920.pdf */ - switch (msr_lo & 0x07) { - case 5: - fsb = 100000; - break; - case 1: - fsb = 133333; - break; - case 3: - fsb = 166667; - break; - case 2: - fsb = 200000; - break; - case 0: - fsb = 266667; - break; - case 4: - fsb = 333333; - break; - default: - printk(KERN_ERR "PCORE - MSR_FSB_FREQ undefined value"); - } - - rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp); - dprintk("PCORE - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", - msr_lo, msr_tmp); - - msr_tmp = (msr_lo >> 22) & 0x1f; - dprintk("bits 22-26 are 0x%x, speed is %u\n", - msr_tmp, (msr_tmp * fsb)); - - ret = (msr_tmp * fsb); - return ret; -} - - -static unsigned int pentium4_get_frequency(void) -{ - struct cpuinfo_x86 *c = &boot_cpu_data; - u32 msr_lo, msr_hi, mult; - unsigned int fsb = 0; - unsigned int ret; - u8 fsb_code; - - /* Pentium 4 Model 0 and 1 do not have the Core Clock Frequency - * to System Bus Frequency Ratio Field in the Processor Frequency - * Configuration Register of the MSR. Therefore the current - * frequency cannot be calculated and has to be measured. - */ - if (c->x86_model < 2) - return cpu_khz; - - rdmsr(0x2c, msr_lo, msr_hi); - - dprintk("P4 - MSR_EBC_FREQUENCY_ID: 0x%x 0x%x\n", msr_lo, msr_hi); - - /* decode the FSB: see IA-32 Intel (C) Architecture Software - * Developer's Manual, Volume 3: System Prgramming Guide, - * revision #12 in Table B-1: MSRs in the Pentium 4 and - * Intel Xeon Processors, on page B-4 and B-5. - */ - fsb_code = (msr_lo >> 16) & 0x7; - switch (fsb_code) { - case 0: - fsb = 100 * 1000; - break; - case 1: - fsb = 13333 * 10; - break; - case 2: - fsb = 200 * 1000; - break; - } - - if (!fsb) - printk(KERN_DEBUG PFX "couldn't detect FSB speed. " - "Please send an e-mail to <linux@brodo.de>\n"); - - /* Multiplier. */ - mult = msr_lo >> 24; - - dprintk("P4 - FSB %u kHz; Multiplier %u; Speed %u kHz\n", - fsb, mult, (fsb * mult)); - - ret = (fsb * mult); - return ret; -} - - -/* Warning: may get called from smp_call_function_single. */ -unsigned int speedstep_get_frequency(enum speedstep_processor processor) -{ - switch (processor) { - case SPEEDSTEP_CPU_PCORE: - return pentium_core_get_frequency(); - case SPEEDSTEP_CPU_PM: - return pentiumM_get_frequency(); - case SPEEDSTEP_CPU_P4D: - case SPEEDSTEP_CPU_P4M: - return pentium4_get_frequency(); - case SPEEDSTEP_CPU_PIII_T: - case SPEEDSTEP_CPU_PIII_C: - case SPEEDSTEP_CPU_PIII_C_EARLY: - return pentium3_get_frequency(processor); - default: - return 0; - }; - return 0; -} -EXPORT_SYMBOL_GPL(speedstep_get_frequency); - - -/********************************************************************* - * DETECT SPEEDSTEP-CAPABLE PROCESSOR * - *********************************************************************/ - -unsigned int speedstep_detect_processor(void) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - u32 ebx, msr_lo, msr_hi; - - dprintk("x86: %x, model: %x\n", c->x86, c->x86_model); - - if ((c->x86_vendor != X86_VENDOR_INTEL) || - ((c->x86 != 6) && (c->x86 != 0xF))) - return 0; - - if (c->x86 == 0xF) { - /* Intel Mobile Pentium 4-M - * or Intel Mobile Pentium 4 with 533 MHz FSB */ - if (c->x86_model != 2) - return 0; - - ebx = cpuid_ebx(0x00000001); - ebx &= 0x000000FF; - - dprintk("ebx value is %x, x86_mask is %x\n", ebx, c->x86_mask); - - switch (c->x86_mask) { - case 4: - /* - * B-stepping [M-P4-M] - * sample has ebx = 0x0f, production has 0x0e. - */ - if ((ebx == 0x0e) || (ebx == 0x0f)) - return SPEEDSTEP_CPU_P4M; - break; - case 7: - /* - * C-stepping [M-P4-M] - * needs to have ebx=0x0e, else it's a celeron: - * cf. 25130917.pdf / page 7, footnote 5 even - * though 25072120.pdf / page 7 doesn't say - * samples are only of B-stepping... - */ - if (ebx == 0x0e) - return SPEEDSTEP_CPU_P4M; - break; - case 9: - /* - * D-stepping [M-P4-M or M-P4/533] - * - * this is totally strange: CPUID 0x0F29 is - * used by M-P4-M, M-P4/533 and(!) Celeron CPUs. - * The latter need to be sorted out as they don't - * support speedstep. - * Celerons with CPUID 0x0F29 may have either - * ebx=0x8 or 0xf -- 25130917.pdf doesn't say anything - * specific. - * M-P4-Ms may have either ebx=0xe or 0xf [see above] - * M-P4/533 have either ebx=0xe or 0xf. [25317607.pdf] - * also, M-P4M HTs have ebx=0x8, too - * For now, they are distinguished by the model_id - * string - */ - if ((ebx == 0x0e) || - (strstr(c->x86_model_id, - "Mobile Intel(R) Pentium(R) 4") != NULL)) - return SPEEDSTEP_CPU_P4M; - break; - default: - break; - } - return 0; - } - - switch (c->x86_model) { - case 0x0B: /* Intel PIII [Tualatin] */ - /* cpuid_ebx(1) is 0x04 for desktop PIII, - * 0x06 for mobile PIII-M */ - ebx = cpuid_ebx(0x00000001); - dprintk("ebx is %x\n", ebx); - - ebx &= 0x000000FF; - - if (ebx != 0x06) - return 0; - - /* So far all PIII-M processors support SpeedStep. See - * Intel's 24540640.pdf of June 2003 - */ - return SPEEDSTEP_CPU_PIII_T; - - case 0x08: /* Intel PIII [Coppermine] */ - - /* all mobile PIII Coppermines have FSB 100 MHz - * ==> sort out a few desktop PIIIs. */ - rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_hi); - dprintk("Coppermine: MSR_IA32_EBL_CR_POWERON is 0x%x, 0x%x\n", - msr_lo, msr_hi); - msr_lo &= 0x00c0000; - if (msr_lo != 0x0080000) - return 0; - - /* - * If the processor is a mobile version, - * platform ID has bit 50 set - * it has SpeedStep technology if either - * bit 56 or 57 is set - */ - rdmsr(MSR_IA32_PLATFORM_ID, msr_lo, msr_hi); - dprintk("Coppermine: MSR_IA32_PLATFORM ID is 0x%x, 0x%x\n", - msr_lo, msr_hi); - if ((msr_hi & (1<<18)) && - (relaxed_check ? 1 : (msr_hi & (3<<24)))) { - if (c->x86_mask == 0x01) { - dprintk("early PIII version\n"); - return SPEEDSTEP_CPU_PIII_C_EARLY; - } else - return SPEEDSTEP_CPU_PIII_C; - } - - default: - return 0; - } -} -EXPORT_SYMBOL_GPL(speedstep_detect_processor); - - -/********************************************************************* - * DETECT SPEEDSTEP SPEEDS * - *********************************************************************/ - -unsigned int speedstep_get_freqs(enum speedstep_processor processor, - unsigned int *low_speed, - unsigned int *high_speed, - unsigned int *transition_latency, - void (*set_state) (unsigned int state)) -{ - unsigned int prev_speed; - unsigned int ret = 0; - unsigned long flags; - struct timeval tv1, tv2; - - if ((!processor) || (!low_speed) || (!high_speed) || (!set_state)) - return -EINVAL; - - dprintk("trying to determine both speeds\n"); - - /* get current speed */ - prev_speed = speedstep_get_frequency(processor); - if (!prev_speed) - return -EIO; - - dprintk("previous speed is %u\n", prev_speed); - - local_irq_save(flags); - - /* switch to low state */ - set_state(SPEEDSTEP_LOW); - *low_speed = speedstep_get_frequency(processor); - if (!*low_speed) { - ret = -EIO; - goto out; - } - - dprintk("low speed is %u\n", *low_speed); - - /* start latency measurement */ - if (transition_latency) - do_gettimeofday(&tv1); - - /* switch to high state */ - set_state(SPEEDSTEP_HIGH); - - /* end latency measurement */ - if (transition_latency) - do_gettimeofday(&tv2); - - *high_speed = speedstep_get_frequency(processor); - if (!*high_speed) { - ret = -EIO; - goto out; - } - - dprintk("high speed is %u\n", *high_speed); - - if (*low_speed == *high_speed) { - ret = -ENODEV; - goto out; - } - - /* switch to previous state, if necessary */ - if (*high_speed != prev_speed) - set_state(SPEEDSTEP_LOW); - - if (transition_latency) { - *transition_latency = (tv2.tv_sec - tv1.tv_sec) * USEC_PER_SEC + - tv2.tv_usec - tv1.tv_usec; - dprintk("transition latency is %u uSec\n", *transition_latency); - - /* convert uSec to nSec and add 20% for safety reasons */ - *transition_latency *= 1200; - - /* check if the latency measurement is too high or too low - * and set it to a safe value (500uSec) in that case - */ - if (*transition_latency > 10000000 || - *transition_latency < 50000) { - printk(KERN_WARNING PFX "frequency transition " - "measured seems out of range (%u " - "nSec), falling back to a safe one of" - "%u nSec.\n", - *transition_latency, 500000); - *transition_latency = 500000; - } - } - -out: - local_irq_restore(flags); - return ret; -} -EXPORT_SYMBOL_GPL(speedstep_get_freqs); - -#ifdef CONFIG_X86_SPEEDSTEP_RELAXED_CAP_CHECK -module_param(relaxed_check, int, 0444); -MODULE_PARM_DESC(relaxed_check, - "Don't do all checks for speedstep capability."); -#endif - -MODULE_AUTHOR("Dominik Brodowski <linux@brodo.de>"); -MODULE_DESCRIPTION("Library for Intel SpeedStep 1 or 2 cpufreq drivers."); -MODULE_LICENSE("GPL"); diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h deleted file mode 100644 index 70d9cea1219..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h +++ /dev/null @@ -1,49 +0,0 @@ -/* - * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de> - * - * Licensed under the terms of the GNU GPL License version 2. - * - * Library for common functions for Intel SpeedStep v.1 and v.2 support - * - * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous* - */ - - - -/* processors */ -enum speedstep_processor { - SPEEDSTEP_CPU_PIII_C_EARLY = 0x00000001, /* Coppermine core */ - SPEEDSTEP_CPU_PIII_C = 0x00000002, /* Coppermine core */ - SPEEDSTEP_CPU_PIII_T = 0x00000003, /* Tualatin core */ - SPEEDSTEP_CPU_P4M = 0x00000004, /* P4-M */ -/* the following processors are not speedstep-capable and are not auto-detected - * in speedstep_detect_processor(). However, their speed can be detected using - * the speedstep_get_frequency() call. */ - SPEEDSTEP_CPU_PM = 0xFFFFFF03, /* Pentium M */ - SPEEDSTEP_CPU_P4D = 0xFFFFFF04, /* desktop P4 */ - SPEEDSTEP_CPU_PCORE = 0xFFFFFF05, /* Core */ -}; - -/* speedstep states -- only two of them */ - -#define SPEEDSTEP_HIGH 0x00000000 -#define SPEEDSTEP_LOW 0x00000001 - - -/* detect a speedstep-capable processor */ -extern enum speedstep_processor speedstep_detect_processor(void); - -/* detect the current speed (in khz) of the processor */ -extern unsigned int speedstep_get_frequency(enum speedstep_processor processor); - - -/* detect the low and high speeds of the processor. The callback - * set_state"'s first argument is either SPEEDSTEP_HIGH or - * SPEEDSTEP_LOW; the second argument is zero so that no - * cpufreq_notify_transition calls are initiated. - */ -extern unsigned int speedstep_get_freqs(enum speedstep_processor processor, - unsigned int *low_speed, - unsigned int *high_speed, - unsigned int *transition_latency, - void (*set_state) (unsigned int state)); diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c deleted file mode 100644 index 8abd869baab..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c +++ /dev/null @@ -1,467 +0,0 @@ -/* - * Intel SpeedStep SMI driver. - * - * (C) 2003 Hiroshi Miura <miura@da-cha.org> - * - * Licensed under the terms of the GNU GPL License version 2. - * - */ - - -/********************************************************************* - * SPEEDSTEP - DEFINITIONS * - *********************************************************************/ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/moduleparam.h> -#include <linux/init.h> -#include <linux/cpufreq.h> -#include <linux/delay.h> -#include <linux/io.h> -#include <asm/ist.h> - -#include "speedstep-lib.h" - -/* speedstep system management interface port/command. - * - * These parameters are got from IST-SMI BIOS call. - * If user gives it, these are used. - * - */ -static int smi_port; -static int smi_cmd; -static unsigned int smi_sig; - -/* info about the processor */ -static enum speedstep_processor speedstep_processor; - -/* - * There are only two frequency states for each processor. Values - * are in kHz for the time being. - */ -static struct cpufreq_frequency_table speedstep_freqs[] = { - {SPEEDSTEP_HIGH, 0}, - {SPEEDSTEP_LOW, 0}, - {0, CPUFREQ_TABLE_END}, -}; - -#define GET_SPEEDSTEP_OWNER 0 -#define GET_SPEEDSTEP_STATE 1 -#define SET_SPEEDSTEP_STATE 2 -#define GET_SPEEDSTEP_FREQS 4 - -/* how often shall the SMI call be tried if it failed, e.g. because - * of DMA activity going on? */ -#define SMI_TRIES 5 - -#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ - "speedstep-smi", msg) - -/** - * speedstep_smi_ownership - */ -static int speedstep_smi_ownership(void) -{ - u32 command, result, magic, dummy; - u32 function = GET_SPEEDSTEP_OWNER; - unsigned char magic_data[] = "Copyright (c) 1999 Intel Corporation"; - - command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff); - magic = virt_to_phys(magic_data); - - dprintk("trying to obtain ownership with command %x at port %x\n", - command, smi_port); - - __asm__ __volatile__( - "push %%ebp\n" - "out %%al, (%%dx)\n" - "pop %%ebp\n" - : "=D" (result), - "=a" (dummy), "=b" (dummy), "=c" (dummy), "=d" (dummy), - "=S" (dummy) - : "a" (command), "b" (function), "c" (0), "d" (smi_port), - "D" (0), "S" (magic) - : "memory" - ); - - dprintk("result is %x\n", result); - - return result; -} - -/** - * speedstep_smi_get_freqs - get SpeedStep preferred & current freq. - * @low: the low frequency value is placed here - * @high: the high frequency value is placed here - * - * Only available on later SpeedStep-enabled systems, returns false results or - * even hangs [cf. bugme.osdl.org # 1422] on earlier systems. Empirical testing - * shows that the latter occurs if !(ist_info.event & 0xFFFF). - */ -static int speedstep_smi_get_freqs(unsigned int *low, unsigned int *high) -{ - u32 command, result = 0, edi, high_mhz, low_mhz, dummy; - u32 state = 0; - u32 function = GET_SPEEDSTEP_FREQS; - - if (!(ist_info.event & 0xFFFF)) { - dprintk("bug #1422 -- can't read freqs from BIOS\n"); - return -ENODEV; - } - - command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff); - - dprintk("trying to determine frequencies with command %x at port %x\n", - command, smi_port); - - __asm__ __volatile__( - "push %%ebp\n" - "out %%al, (%%dx)\n" - "pop %%ebp" - : "=a" (result), - "=b" (high_mhz), - "=c" (low_mhz), - "=d" (state), "=D" (edi), "=S" (dummy) - : "a" (command), - "b" (function), - "c" (state), - "d" (smi_port), "S" (0), "D" (0) - ); - - dprintk("result %x, low_freq %u, high_freq %u\n", - result, low_mhz, high_mhz); - - /* abort if results are obviously incorrect... */ - if ((high_mhz + low_mhz) < 600) - return -EINVAL; - - *high = high_mhz * 1000; - *low = low_mhz * 1000; - - return result; -} - -/** - * speedstep_get_state - set the SpeedStep state - * @state: processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH) - * - */ -static int speedstep_get_state(void) -{ - u32 function = GET_SPEEDSTEP_STATE; - u32 result, state, edi, command, dummy; - - command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff); - - dprintk("trying to determine current setting with command %x " - "at port %x\n", command, smi_port); - - __asm__ __volatile__( - "push %%ebp\n" - "out %%al, (%%dx)\n" - "pop %%ebp\n" - : "=a" (result), - "=b" (state), "=D" (edi), - "=c" (dummy), "=d" (dummy), "=S" (dummy) - : "a" (command), "b" (function), "c" (0), - "d" (smi_port), "S" (0), "D" (0) - ); - - dprintk("state is %x, result is %x\n", state, result); - - return state & 1; -} - - -/** - * speedstep_set_state - set the SpeedStep state - * @state: new processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH) - * - */ -static void speedstep_set_state(unsigned int state) -{ - unsigned int result = 0, command, new_state, dummy; - unsigned long flags; - unsigned int function = SET_SPEEDSTEP_STATE; - unsigned int retry = 0; - - if (state > 0x1) - return; - - /* Disable IRQs */ - local_irq_save(flags); - - command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff); - - dprintk("trying to set frequency to state %u " - "with command %x at port %x\n", - state, command, smi_port); - - do { - if (retry) { - dprintk("retry %u, previous result %u, waiting...\n", - retry, result); - mdelay(retry * 50); - } - retry++; - __asm__ __volatile__( - "push %%ebp\n" - "out %%al, (%%dx)\n" - "pop %%ebp" - : "=b" (new_state), "=D" (result), - "=c" (dummy), "=a" (dummy), - "=d" (dummy), "=S" (dummy) - : "a" (command), "b" (function), "c" (state), - "d" (smi_port), "S" (0), "D" (0) - ); - } while ((new_state != state) && (retry <= SMI_TRIES)); - - /* enable IRQs */ - local_irq_restore(flags); - - if (new_state == state) - dprintk("change to %u MHz succeeded after %u tries " - "with result %u\n", - (speedstep_freqs[new_state].frequency / 1000), - retry, result); - else - printk(KERN_ERR "cpufreq: change to state %u " - "failed with new_state %u and result %u\n", - state, new_state, result); - - return; -} - - -/** - * speedstep_target - set a new CPUFreq policy - * @policy: new policy - * @target_freq: new freq - * @relation: - * - * Sets a new CPUFreq policy/freq. - */ -static int speedstep_target(struct cpufreq_policy *policy, - unsigned int target_freq, unsigned int relation) -{ - unsigned int newstate = 0; - struct cpufreq_freqs freqs; - - if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0], - target_freq, relation, &newstate)) - return -EINVAL; - - freqs.old = speedstep_freqs[speedstep_get_state()].frequency; - freqs.new = speedstep_freqs[newstate].frequency; - freqs.cpu = 0; /* speedstep.c is UP only driver */ - - if (freqs.old == freqs.new) - return 0; - - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - speedstep_set_state(newstate); - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - - return 0; -} - - -/** - * speedstep_verify - verifies a new CPUFreq policy - * @policy: new policy - * - * Limit must be within speedstep_low_freq and speedstep_high_freq, with - * at least one border included. - */ -static int speedstep_verify(struct cpufreq_policy *policy) -{ - return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]); -} - - -static int speedstep_cpu_init(struct cpufreq_policy *policy) -{ - int result; - unsigned int speed, state; - unsigned int *low, *high; - - /* capability check */ - if (policy->cpu != 0) - return -ENODEV; - - result = speedstep_smi_ownership(); - if (result) { - dprintk("fails in aquiring ownership of a SMI interface.\n"); - return -EINVAL; - } - - /* detect low and high frequency */ - low = &speedstep_freqs[SPEEDSTEP_LOW].frequency; - high = &speedstep_freqs[SPEEDSTEP_HIGH].frequency; - - result = speedstep_smi_get_freqs(low, high); - if (result) { - /* fall back to speedstep_lib.c dection mechanism: - * try both states out */ - dprintk("could not detect low and high frequencies " - "by SMI call.\n"); - result = speedstep_get_freqs(speedstep_processor, - low, high, - NULL, - &speedstep_set_state); - - if (result) { - dprintk("could not detect two different speeds" - " -- aborting.\n"); - return result; - } else - dprintk("workaround worked.\n"); - } - - /* get current speed setting */ - state = speedstep_get_state(); - speed = speedstep_freqs[state].frequency; - - dprintk("currently at %s speed setting - %i MHz\n", - (speed == speedstep_freqs[SPEEDSTEP_LOW].frequency) - ? "low" : "high", - (speed / 1000)); - - /* cpuinfo and default policy values */ - policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; - policy->cur = speed; - - result = cpufreq_frequency_table_cpuinfo(policy, speedstep_freqs); - if (result) - return result; - - cpufreq_frequency_table_get_attr(speedstep_freqs, policy->cpu); - - return 0; -} - -static int speedstep_cpu_exit(struct cpufreq_policy *policy) -{ - cpufreq_frequency_table_put_attr(policy->cpu); - return 0; -} - -static unsigned int speedstep_get(unsigned int cpu) -{ - if (cpu) - return -ENODEV; - return speedstep_get_frequency(speedstep_processor); -} - - -static int speedstep_resume(struct cpufreq_policy *policy) -{ - int result = speedstep_smi_ownership(); - - if (result) - dprintk("fails in re-aquiring ownership of a SMI interface.\n"); - - return result; -} - -static struct freq_attr *speedstep_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - NULL, -}; - -static struct cpufreq_driver speedstep_driver = { - .name = "speedstep-smi", - .verify = speedstep_verify, - .target = speedstep_target, - .init = speedstep_cpu_init, - .exit = speedstep_cpu_exit, - .get = speedstep_get, - .resume = speedstep_resume, - .owner = THIS_MODULE, - .attr = speedstep_attr, -}; - -/** - * speedstep_init - initializes the SpeedStep CPUFreq driver - * - * Initializes the SpeedStep support. Returns -ENODEV on unsupported - * BIOS, -EINVAL on problems during initiatization, and zero on - * success. - */ -static int __init speedstep_init(void) -{ - speedstep_processor = speedstep_detect_processor(); - - switch (speedstep_processor) { - case SPEEDSTEP_CPU_PIII_T: - case SPEEDSTEP_CPU_PIII_C: - case SPEEDSTEP_CPU_PIII_C_EARLY: - break; - default: - speedstep_processor = 0; - } - - if (!speedstep_processor) { - dprintk("No supported Intel CPU detected.\n"); - return -ENODEV; - } - - dprintk("signature:0x%.8lx, command:0x%.8lx, " - "event:0x%.8lx, perf_level:0x%.8lx.\n", - ist_info.signature, ist_info.command, - ist_info.event, ist_info.perf_level); - - /* Error if no IST-SMI BIOS or no PARM - sig= 'ISGE' aka 'Intel Speedstep Gate E' */ - if ((ist_info.signature != 0x47534943) && ( - (smi_port == 0) || (smi_cmd == 0))) - return -ENODEV; - - if (smi_sig == 1) - smi_sig = 0x47534943; - else - smi_sig = ist_info.signature; - - /* setup smi_port from MODLULE_PARM or BIOS */ - if ((smi_port > 0xff) || (smi_port < 0)) - return -EINVAL; - else if (smi_port == 0) - smi_port = ist_info.command & 0xff; - - if ((smi_cmd > 0xff) || (smi_cmd < 0)) - return -EINVAL; - else if (smi_cmd == 0) - smi_cmd = (ist_info.command >> 16) & 0xff; - - return cpufreq_register_driver(&speedstep_driver); -} - - -/** - * speedstep_exit - unregisters SpeedStep support - * - * Unregisters SpeedStep support. - */ -static void __exit speedstep_exit(void) -{ - cpufreq_unregister_driver(&speedstep_driver); -} - -module_param(smi_port, int, 0444); -module_param(smi_cmd, int, 0444); -module_param(smi_sig, uint, 0444); - -MODULE_PARM_DESC(smi_port, "Override the BIOS-given IST port with this value " - "-- Intel's default setting is 0xb2"); -MODULE_PARM_DESC(smi_cmd, "Override the BIOS-given IST command with this value " - "-- Intel's default setting is 0x82"); -MODULE_PARM_DESC(smi_sig, "Set to 1 to fake the IST signature when using the " - "SMI interface."); - -MODULE_AUTHOR("Hiroshi Miura"); -MODULE_DESCRIPTION("Speedstep driver for IST applet SMI interface."); -MODULE_LICENSE("GPL"); - -module_init(speedstep_init); -module_exit(speedstep_exit); diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index 4fbd384fb64..aaf152e7963 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -1,4 +1,3 @@ -#include <linux/init.h> #include <linux/bitops.h> #include <linux/delay.h> #include <linux/pci.h> @@ -15,7 +14,7 @@ /* * Read NSC/Cyrix DEVID registers (DIR) to get more detailed info. about the CPU */ -static void __cpuinit __do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) +static void __do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) { unsigned char ccr2, ccr3; @@ -44,7 +43,7 @@ static void __cpuinit __do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) } } -static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) +static void do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) { unsigned long flags; @@ -59,25 +58,25 @@ static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) * Actually since bugs.h doesn't even reference this perhaps someone should * fix the documentation ??? */ -static unsigned char Cx86_dir0_msb __cpuinitdata = 0; +static unsigned char Cx86_dir0_msb = 0; -static const char __cpuinitconst Cx86_model[][9] = { +static const char Cx86_model[][9] = { "Cx486", "Cx486", "5x86 ", "6x86", "MediaGX ", "6x86MX ", "M II ", "Unknown" }; -static const char __cpuinitconst Cx486_name[][5] = { +static const char Cx486_name[][5] = { "SLC", "DLC", "SLC2", "DLC2", "SRx", "DRx", "SRx2", "DRx2" }; -static const char __cpuinitconst Cx486S_name[][4] = { +static const char Cx486S_name[][4] = { "S", "S2", "Se", "S2e" }; -static const char __cpuinitconst Cx486D_name[][4] = { +static const char Cx486D_name[][4] = { "DX", "DX2", "?", "?", "?", "DX4" }; -static char Cx86_cb[] __cpuinitdata = "?.5x Core/Bus Clock"; -static const char __cpuinitconst cyrix_model_mult1[] = "12??43"; -static const char __cpuinitconst cyrix_model_mult2[] = "12233445"; +static char Cx86_cb[] = "?.5x Core/Bus Clock"; +static const char cyrix_model_mult1[] = "12??43"; +static const char cyrix_model_mult2[] = "12233445"; /* * Reset the slow-loop (SLOP) bit on the 686(L) which is set by some old @@ -87,7 +86,7 @@ static const char __cpuinitconst cyrix_model_mult2[] = "12233445"; * FIXME: our newer udelay uses the tsc. We don't need to frob with SLOP */ -static void __cpuinit check_cx686_slop(struct cpuinfo_x86 *c) +static void check_cx686_slop(struct cpuinfo_x86 *c) { unsigned long flags; @@ -112,7 +111,7 @@ static void __cpuinit check_cx686_slop(struct cpuinfo_x86 *c) } -static void __cpuinit set_cx86_reorder(void) +static void set_cx86_reorder(void) { u8 ccr3; @@ -127,7 +126,7 @@ static void __cpuinit set_cx86_reorder(void) setCx86(CX86_CCR3, ccr3); } -static void __cpuinit set_cx86_memwb(void) +static void set_cx86_memwb(void) { printk(KERN_INFO "Enable Memory-Write-back mode on Cyrix/NSC processor.\n"); @@ -143,7 +142,7 @@ static void __cpuinit set_cx86_memwb(void) * Configure later MediaGX and/or Geode processor. */ -static void __cpuinit geode_configure(void) +static void geode_configure(void) { unsigned long flags; u8 ccr3; @@ -166,7 +165,7 @@ static void __cpuinit geode_configure(void) local_irq_restore(flags); } -static void __cpuinit early_init_cyrix(struct cpuinfo_x86 *c) +static void early_init_cyrix(struct cpuinfo_x86 *c) { unsigned char dir0, dir0_msn, dir1 = 0; @@ -185,7 +184,7 @@ static void __cpuinit early_init_cyrix(struct cpuinfo_x86 *c) } } -static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) +static void init_cyrix(struct cpuinfo_x86 *c) { unsigned char dir0, dir0_msn, dir0_lsn, dir1 = 0; char *buf = c->x86_model_id; @@ -249,7 +248,7 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) /* Emulate MTRRs using Cyrix's ARRs. */ set_cpu_cap(c, X86_FEATURE_CYRIX_ARR); /* 6x86's contain this bug */ - c->coma_bug = 1; + set_cpu_bug(c, X86_BUG_COMA); break; case 4: /* MediaGX/GXm or Geode GXM/GXLV/GX1 */ @@ -317,7 +316,8 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) /* Enable MMX extensions (App note 108) */ setCx86_old(CX86_CCR7, getCx86_old(CX86_CCR7)|1); } else { - c->coma_bug = 1; /* 6x86MX, it has the bug. */ + /* A 6x86MX - it has the bug. */ + set_cpu_bug(c, X86_BUG_COMA); } tmp = (!(dir0_lsn & 7) || dir0_lsn & 1) ? 2 : 0; Cx86_cb[tmp] = cyrix_model_mult2[dir0_lsn & 7]; @@ -332,7 +332,7 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) switch (dir0_lsn) { case 0xd: /* either a 486SLC or DLC w/o DEVID */ dir0_msn = 0; - p = Cx486_name[(c->hard_math) ? 1 : 0]; + p = Cx486_name[(cpu_has_fpu ? 1 : 0)]; break; case 0xe: /* a 486S A step */ @@ -355,7 +355,7 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) /* * Handle National Semiconductor branded processors */ -static void __cpuinit init_nsc(struct cpuinfo_x86 *c) +static void init_nsc(struct cpuinfo_x86 *c) { /* * There may be GX1 processors in the wild that are branded @@ -404,7 +404,7 @@ static inline int test_cyrix_52div(void) return (unsigned char) (test >> 8) == 0x02; } -static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c) +static void cyrix_identify(struct cpuinfo_x86 *c) { /* Detect Cyrix with disabled CPUID */ if (c->x86 == 4 && test_cyrix_52div()) { @@ -440,7 +440,7 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c) } } -static const struct cpu_dev __cpuinitconst cyrix_cpu_dev = { +static const struct cpu_dev cyrix_cpu_dev = { .c_vendor = "Cyrix", .c_ident = { "CyrixInstead" }, .c_early_init = early_init_cyrix, @@ -451,7 +451,7 @@ static const struct cpu_dev __cpuinitconst cyrix_cpu_dev = { cpu_dev_register(cyrix_cpu_dev); -static const struct cpu_dev __cpuinitconst nsc_cpu_dev = { +static const struct cpu_dev nsc_cpu_dev = { .c_vendor = "NSC", .c_ident = { "Geode by NSC" }, .c_init = init_nsc, diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index 8095f8611f8..36ce402a3fa 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -25,18 +25,16 @@ #include <asm/processor.h> #include <asm/hypervisor.h> -/* - * Hypervisor detect order. This is specified explicitly here because - * some hypervisors might implement compatibility modes for other - * hypervisors and therefore need to be detected in specific sequence. - */ static const __initconst struct hypervisor_x86 * const hypervisors[] = { - &x86_hyper_vmware, - &x86_hyper_ms_hyperv, #ifdef CONFIG_XEN_PVHVM &x86_hyper_xen_hvm, #endif + &x86_hyper_vmware, + &x86_hyper_ms_hyperv, +#ifdef CONFIG_KVM_GUEST + &x86_hyper_kvm, +#endif }; const struct hypervisor_x86 *x86_hyper; @@ -46,18 +44,22 @@ static inline void __init detect_hypervisor_vendor(void) { const struct hypervisor_x86 *h, * const *p; + uint32_t pri, max_pri = 0; for (p = hypervisors; p < hypervisors + ARRAY_SIZE(hypervisors); p++) { h = *p; - if (h->detect()) { + pri = h->detect(); + if (pri != 0 && pri > max_pri) { + max_pri = pri; x86_hyper = h; - printk(KERN_INFO "Hypervisor detected: %s\n", h->name); - break; } } + + if (max_pri) + printk(KERN_INFO "Hypervisor detected: %s\n", x86_hyper->name); } -void __cpuinit init_hypervisor(struct cpuinfo_x86 *c) +void init_hypervisor(struct cpuinfo_x86 *c) { if (x86_hyper && x86_hyper->set_cpu_features) x86_hyper->set_cpu_features(c); @@ -76,3 +78,10 @@ void __init init_hypervisor_platform(void) if (x86_hyper->init_platform) x86_hyper->init_platform(); } + +bool __init hypervisor_x2apic_available(void) +{ + return x86_hyper && + x86_hyper->x2apic_available && + x86_hyper->x2apic_available(); +} diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index d16c2c53d6b..f9e4fdd3b87 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -1,4 +1,3 @@ -#include <linux/init.h> #include <linux/kernel.h> #include <linux/string.h> @@ -17,7 +16,6 @@ #ifdef CONFIG_X86_64 #include <linux/topology.h> -#include <asm/numa_64.h> #endif #include "cpu.h" @@ -27,17 +25,14 @@ #include <asm/apic.h> #endif -static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) +static void early_init_intel(struct cpuinfo_x86 *c) { + u64 misc_enable; + /* Unmask CPUID levels if masked: */ if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) { - u64 misc_enable; - - rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); - - if (misc_enable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID) { - misc_enable &= ~MSR_IA32_MISC_ENABLE_LIMIT_CPUID; - wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable); + if (msr_clear_bit(MSR_IA32_MISC_ENABLE, + MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT) > 0) { c->cpuid_level = cpuid_eax(0); get_cpu_cap(c); } @@ -47,6 +42,15 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) (c->x86 == 0x6 && c->x86_model >= 0x0e)) set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64)) { + unsigned lower_word; + + wrmsr(MSR_IA32_UCODE_REV, 0, 0); + /* Required by the SDM */ + sync_core(); + rdmsr(MSR_IA32_UCODE_REV, lower_word, c->microcode); + } + /* * Atom erratum AAE44/AAF40/AAG38/AAH41: * @@ -55,17 +59,10 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) * need the microcode to have already been loaded... so if it is * not, recommend a BIOS update and disable large pages. */ - if (c->x86 == 6 && c->x86_model == 0x1c && c->x86_mask <= 2) { - u32 ucode, junk; - - wrmsr(MSR_IA32_UCODE_REV, 0, 0); - sync_core(); - rdmsr(MSR_IA32_UCODE_REV, junk, ucode); - - if (ucode < 0x20e) { - printk(KERN_WARNING "Atom PSE erratum detected, BIOS microcode update recommended\n"); - clear_cpu_cap(c, X86_FEATURE_PSE); - } + if (c->x86 == 6 && c->x86_model == 0x1c && c->x86_mask <= 2 && + c->microcode < 0x20e) { + printk(KERN_WARNING "Atom PSE erratum detected, BIOS microcode update recommended\n"); + clear_cpu_cap(c, X86_FEATURE_PSE); } #ifdef CONFIG_X86_64 @@ -92,7 +89,19 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); if (!check_tsc_unstable()) - sched_clock_stable = 1; + set_sched_clock_stable(); + } + + /* Penwell and Cloverview have the TSC which doesn't sleep on S3 */ + if (c->x86 == 6) { + switch (c->x86_model) { + case 0x27: /* Penwell */ + case 0x35: /* Cloverview */ + set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC_S3); + break; + default: + break; + } } /* @@ -117,19 +126,24 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) * Ingo Molnar reported a Pentium D (model 6) and a Xeon * (model 2) with the same problem. */ - if (c->x86 == 15) { - u64 misc_enable; + if (c->x86 == 15) + if (msr_clear_bit(MSR_IA32_MISC_ENABLE, + MSR_IA32_MISC_ENABLE_FAST_STRING_BIT) > 0) + pr_info("kmemcheck: Disabling fast string operations\n"); +#endif + /* + * If fast string is not enabled in IA32_MISC_ENABLE for any reason, + * clear the fast string and enhanced fast string CPU capabilities. + */ + if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) { rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); - - if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) { - printk(KERN_INFO "kmemcheck: Disabling fast string operations\n"); - - misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING; - wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable); + if (!(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) { + printk(KERN_INFO "Disabled fast string operations\n"); + setup_clear_cpu_cap(X86_FEATURE_REP_GOOD); + setup_clear_cpu_cap(X86_FEATURE_ERMS); } } -#endif } #ifdef CONFIG_X86_32 @@ -139,7 +153,7 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) * This is called before we do cpu ident work */ -int __cpuinit ppro_with_ram_bug(void) +int ppro_with_ram_bug(void) { /* Uses data from early_cpu_detect now */ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && @@ -152,23 +166,8 @@ int __cpuinit ppro_with_ram_bug(void) return 0; } -#ifdef CONFIG_X86_F00F_BUG -static void __cpuinit trap_init_f00f_bug(void) -{ - __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO); - - /* - * Update the IDT descriptor and reload the IDT so that - * it uses the read-only mapped virtual address. - */ - idt_descr.address = fix_to_virt(FIX_F00F_IDT); - load_idt(&idt_descr); -} -#endif - -static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c) +static void intel_smp_check(struct cpuinfo_x86 *c) { -#ifdef CONFIG_SMP /* calling is from identify_secondary_cpu() ? */ if (!c->cpu_index) return; @@ -185,27 +184,30 @@ static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c) WARN_ONCE(1, "WARNING: SMP operation may be unreliable" "with B stepping processors.\n"); } -#endif } -static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) +static int forcepae; +static int __init forcepae_setup(char *__unused) { - unsigned long lo, hi; + forcepae = 1; + return 1; +} +__setup("forcepae", forcepae_setup); +static void intel_workarounds(struct cpuinfo_x86 *c) +{ #ifdef CONFIG_X86_F00F_BUG /* * All current models of Pentium and Pentium with MMX technology CPUs * have the F0 0F bug, which lets nonprivileged users lock up the - * system. - * Note that the workaround only should be initialized once... + * system. Announce that the fault handler will be checking for it. */ - c->f00f_bug = 0; + clear_cpu_bug(c, X86_BUG_F00F); if (!paravirt_enabled() && c->x86 == 5) { static int f00f_workaround_enabled; - c->f00f_bug = 1; + set_cpu_bug(c, X86_BUG_F00F); if (!f00f_workaround_enabled) { - trap_init_f00f_bug(); printk(KERN_NOTICE "Intel Pentium with F0 0F bug - workaround enabled.\n"); f00f_workaround_enabled = 1; } @@ -220,16 +222,26 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) clear_cpu_cap(c, X86_FEATURE_SEP); /* + * PAE CPUID issue: many Pentium M report no PAE but may have a + * functionally usable PAE implementation. + * Forcefully enable PAE if kernel parameter "forcepae" is present. + */ + if (forcepae) { + printk(KERN_WARNING "PAE forced!\n"); + set_cpu_cap(c, X86_FEATURE_PAE); + add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_NOW_UNRELIABLE); + } + + /* * P4 Xeon errata 037 workaround. * Hardware prefetcher may cause stale data to be loaded into the cache. */ if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) { - rdmsr(MSR_IA32_MISC_ENABLE, lo, hi); - if ((lo & MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE) == 0) { - printk (KERN_INFO "CPU: C0 stepping P4 Xeon detected.\n"); - printk (KERN_INFO "CPU: Disabling hardware prefetching (Errata 037)\n"); - lo |= MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE; - wrmsr(MSR_IA32_MISC_ENABLE, lo, hi); + if (msr_set_bit(MSR_IA32_MISC_ENABLE, + MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE_BIT) + > 0) { + pr_info("CPU: C0 stepping P4 Xeon detected.\n"); + pr_info("CPU: Disabling hardware prefetching (Errata 037)\n"); } } @@ -262,28 +274,23 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) } #endif -#ifdef CONFIG_X86_NUMAQ - numaq_tsc_disable(); -#endif - intel_smp_check(c); } #else -static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) +static void intel_workarounds(struct cpuinfo_x86 *c) { } #endif -static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) +static void srat_detect_node(struct cpuinfo_x86 *c) { -#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) +#ifdef CONFIG_NUMA unsigned node; int cpu = smp_processor_id(); - int apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid; /* Don't do the funky fallback heuristics the AMD version employs for now. */ - node = apicid_to_node[apicid]; + node = numa_cpu_node(cpu); if (node == NUMA_NO_NODE || !node_online(node)) { /* reuse the value from init_cpu_to_node() */ node = cpu_to_node(cpu); @@ -295,7 +302,7 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) /* * find out the number of processor cores on the die */ -static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c) +static int intel_num_cpu_cores(struct cpuinfo_x86 *c) { unsigned int eax, ebx, ecx, edx; @@ -310,7 +317,7 @@ static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c) return 1; } -static void __cpuinit detect_vmx_virtcap(struct cpuinfo_x86 *c) +static void detect_vmx_virtcap(struct cpuinfo_x86 *c) { /* Intel VMX MSR indicated features */ #define X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW 0x00200000 @@ -348,7 +355,7 @@ static void __cpuinit detect_vmx_virtcap(struct cpuinfo_x86 *c) } } -static void __cpuinit init_intel(struct cpuinfo_x86 *c) +static void init_intel(struct cpuinfo_x86 *c) { unsigned int l2 = 0; @@ -363,6 +370,17 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) */ detect_extended_topology(c); + if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) { + /* + * let's use the legacy cpuid vector 0x1 and 0x4 for topology + * detection. + */ + c->x86_max_cores = intel_num_cpu_cores(c); +#ifdef CONFIG_X86_32 + detect_ht(c); +#endif + } + l2 = init_intel_cacheinfo(c); if (c->cpuid_level > 9) { unsigned eax = cpuid_eax(10); @@ -382,7 +400,8 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_PEBS); } - if (c->x86 == 6 && c->x86_model == 29 && cpu_has_clflush) + if (c->x86 == 6 && cpu_has_clflush && + (c->x86_model == 29 || c->x86_model == 46 || c->x86_model == 47)) set_cpu_cap(c, X86_FEATURE_CLFLUSH_MONITOR); #ifdef CONFIG_X86_64 @@ -401,12 +420,10 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) switch (c->x86_model) { case 5: - if (c->x86_mask == 0) { - if (l2 == 0) - p = "Celeron (Covington)"; - else if (l2 == 256) - p = "Mobile Pentium II (Dixon)"; - } + if (l2 == 0) + p = "Celeron (Covington)"; + else if (l2 == 256) + p = "Mobile Pentium II (Dixon)"; break; case 6: @@ -432,26 +449,33 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_P3); #endif - if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) { - /* - * let's use the legacy cpuid vector 0x1 and 0x4 for topology - * detection. - */ - c->x86_max_cores = intel_num_cpu_cores(c); -#ifdef CONFIG_X86_32 - detect_ht(c); -#endif - } - /* Work around errata */ srat_detect_node(c); if (cpu_has(c, X86_FEATURE_VMX)) detect_vmx_virtcap(c); + + /* + * Initialize MSR_IA32_ENERGY_PERF_BIAS if BIOS did not. + * x86_energy_perf_policy(8) is available to change it at run-time + */ + if (cpu_has(c, X86_FEATURE_EPB)) { + u64 epb; + + rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); + if ((epb & 0xF) == ENERGY_PERF_BIAS_PERFORMANCE) { + printk_once(KERN_WARNING "ENERGY_PERF_BIAS:" + " Set to 'normal', was 'performance'\n" + "ENERGY_PERF_BIAS: View and update with" + " x86_energy_perf_policy(8)\n"); + epb = (epb & ~0xF) | ENERGY_PERF_BIAS_NORMAL; + wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); + } + } } #ifdef CONFIG_X86_32 -static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned int size) +static unsigned int intel_size_cache(struct cpuinfo_x86 *c, unsigned int size) { /* * Intel PIII Tualatin. This comes in two flavours. @@ -465,12 +489,209 @@ static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned i } #endif -static const struct cpu_dev __cpuinitconst intel_cpu_dev = { +#define TLB_INST_4K 0x01 +#define TLB_INST_4M 0x02 +#define TLB_INST_2M_4M 0x03 + +#define TLB_INST_ALL 0x05 +#define TLB_INST_1G 0x06 + +#define TLB_DATA_4K 0x11 +#define TLB_DATA_4M 0x12 +#define TLB_DATA_2M_4M 0x13 +#define TLB_DATA_4K_4M 0x14 + +#define TLB_DATA_1G 0x16 + +#define TLB_DATA0_4K 0x21 +#define TLB_DATA0_4M 0x22 +#define TLB_DATA0_2M_4M 0x23 + +#define STLB_4K 0x41 +#define STLB_4K_2M 0x42 + +static const struct _tlb_table intel_tlb_table[] = { + { 0x01, TLB_INST_4K, 32, " TLB_INST 4 KByte pages, 4-way set associative" }, + { 0x02, TLB_INST_4M, 2, " TLB_INST 4 MByte pages, full associative" }, + { 0x03, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way set associative" }, + { 0x04, TLB_DATA_4M, 8, " TLB_DATA 4 MByte pages, 4-way set associative" }, + { 0x05, TLB_DATA_4M, 32, " TLB_DATA 4 MByte pages, 4-way set associative" }, + { 0x0b, TLB_INST_4M, 4, " TLB_INST 4 MByte pages, 4-way set associative" }, + { 0x4f, TLB_INST_4K, 32, " TLB_INST 4 KByte pages */" }, + { 0x50, TLB_INST_ALL, 64, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" }, + { 0x51, TLB_INST_ALL, 128, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" }, + { 0x52, TLB_INST_ALL, 256, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" }, + { 0x55, TLB_INST_2M_4M, 7, " TLB_INST 2-MByte or 4-MByte pages, fully associative" }, + { 0x56, TLB_DATA0_4M, 16, " TLB_DATA0 4 MByte pages, 4-way set associative" }, + { 0x57, TLB_DATA0_4K, 16, " TLB_DATA0 4 KByte pages, 4-way associative" }, + { 0x59, TLB_DATA0_4K, 16, " TLB_DATA0 4 KByte pages, fully associative" }, + { 0x5a, TLB_DATA0_2M_4M, 32, " TLB_DATA0 2-MByte or 4 MByte pages, 4-way set associative" }, + { 0x5b, TLB_DATA_4K_4M, 64, " TLB_DATA 4 KByte and 4 MByte pages" }, + { 0x5c, TLB_DATA_4K_4M, 128, " TLB_DATA 4 KByte and 4 MByte pages" }, + { 0x5d, TLB_DATA_4K_4M, 256, " TLB_DATA 4 KByte and 4 MByte pages" }, + { 0x61, TLB_INST_4K, 48, " TLB_INST 4 KByte pages, full associative" }, + { 0x63, TLB_DATA_1G, 4, " TLB_DATA 1 GByte pages, 4-way set associative" }, + { 0x76, TLB_INST_2M_4M, 8, " TLB_INST 2-MByte or 4-MByte pages, fully associative" }, + { 0xb0, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 4-way set associative" }, + { 0xb1, TLB_INST_2M_4M, 4, " TLB_INST 2M pages, 4-way, 8 entries or 4M pages, 4-way entries" }, + { 0xb2, TLB_INST_4K, 64, " TLB_INST 4KByte pages, 4-way set associative" }, + { 0xb3, TLB_DATA_4K, 128, " TLB_DATA 4 KByte pages, 4-way set associative" }, + { 0xb4, TLB_DATA_4K, 256, " TLB_DATA 4 KByte pages, 4-way associative" }, + { 0xb5, TLB_INST_4K, 64, " TLB_INST 4 KByte pages, 8-way set ssociative" }, + { 0xb6, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 8-way set ssociative" }, + { 0xba, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way associative" }, + { 0xc0, TLB_DATA_4K_4M, 8, " TLB_DATA 4 KByte and 4 MByte pages, 4-way associative" }, + { 0xc1, STLB_4K_2M, 1024, " STLB 4 KByte and 2 MByte pages, 8-way associative" }, + { 0xc2, TLB_DATA_2M_4M, 16, " DTLB 2 MByte/4MByte pages, 4-way associative" }, + { 0xca, STLB_4K, 512, " STLB 4 KByte pages, 4-way associative" }, + { 0x00, 0, 0 } +}; + +static void intel_tlb_lookup(const unsigned char desc) +{ + unsigned char k; + if (desc == 0) + return; + + /* look up this descriptor in the table */ + for (k = 0; intel_tlb_table[k].descriptor != desc && \ + intel_tlb_table[k].descriptor != 0; k++) + ; + + if (intel_tlb_table[k].tlb_type == 0) + return; + + switch (intel_tlb_table[k].tlb_type) { + case STLB_4K: + if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries) + tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries; + if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries) + tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries; + break; + case STLB_4K_2M: + if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries) + tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries; + if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries) + tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries; + if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries; + if (tlb_lld_2m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lld_2m[ENTRIES] = intel_tlb_table[k].entries; + if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries; + if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries; + break; + case TLB_INST_ALL: + if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries) + tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries; + if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries; + if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries; + break; + case TLB_INST_4K: + if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries) + tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries; + break; + case TLB_INST_4M: + if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries; + break; + case TLB_INST_2M_4M: + if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries; + if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries; + break; + case TLB_DATA_4K: + case TLB_DATA0_4K: + if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries) + tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries; + break; + case TLB_DATA_4M: + case TLB_DATA0_4M: + if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries; + break; + case TLB_DATA_2M_4M: + case TLB_DATA0_2M_4M: + if (tlb_lld_2m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lld_2m[ENTRIES] = intel_tlb_table[k].entries; + if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries; + break; + case TLB_DATA_4K_4M: + if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries) + tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries; + if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries) + tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries; + break; + case TLB_DATA_1G: + if (tlb_lld_1g[ENTRIES] < intel_tlb_table[k].entries) + tlb_lld_1g[ENTRIES] = intel_tlb_table[k].entries; + break; + } +} + +static void intel_tlb_flushall_shift_set(struct cpuinfo_x86 *c) +{ + switch ((c->x86 << 8) + c->x86_model) { + case 0x60f: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ + case 0x616: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ + case 0x617: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ + case 0x61d: /* six-core 45 nm xeon "Dunnington" */ + tlb_flushall_shift = -1; + break; + case 0x63a: /* Ivybridge */ + tlb_flushall_shift = 2; + break; + case 0x61a: /* 45 nm nehalem, "Bloomfield" */ + case 0x61e: /* 45 nm nehalem, "Lynnfield" */ + case 0x625: /* 32 nm nehalem, "Clarkdale" */ + case 0x62c: /* 32 nm nehalem, "Gulftown" */ + case 0x62e: /* 45 nm nehalem-ex, "Beckton" */ + case 0x62f: /* 32 nm Xeon E7 */ + case 0x62a: /* SandyBridge */ + case 0x62d: /* SandyBridge, "Romely-EP" */ + default: + tlb_flushall_shift = 6; + } +} + +static void intel_detect_tlb(struct cpuinfo_x86 *c) +{ + int i, j, n; + unsigned int regs[4]; + unsigned char *desc = (unsigned char *)regs; + + if (c->cpuid_level < 2) + return; + + /* Number of times to iterate */ + n = cpuid_eax(2) & 0xFF; + + for (i = 0 ; i < n ; i++) { + cpuid(2, ®s[0], ®s[1], ®s[2], ®s[3]); + + /* If bit 31 is set, this is an unknown format */ + for (j = 0 ; j < 3 ; j++) + if (regs[j] & (1 << 31)) + regs[j] = 0; + + /* Byte 0 is level count, not a descriptor */ + for (j = 1 ; j < 16 ; j++) + intel_tlb_lookup(desc[j]); + } + intel_tlb_flushall_shift_set(c); +} + +static const struct cpu_dev intel_cpu_dev = { .c_vendor = "Intel", .c_ident = { "GenuineIntel" }, #ifdef CONFIG_X86_32 - .c_models = { - { .vendor = X86_VENDOR_INTEL, .family = 4, .model_names = + .legacy_models = { + { .family = 4, .model_names = { [0] = "486 DX-25/33", [1] = "486 DX-50", @@ -483,7 +704,7 @@ static const struct cpu_dev __cpuinitconst intel_cpu_dev = { [9] = "486 DX/4-WB" } }, - { .vendor = X86_VENDOR_INTEL, .family = 5, .model_names = + { .family = 5, .model_names = { [0] = "Pentium 60/66 A-step", [1] = "Pentium 60/66", @@ -494,7 +715,7 @@ static const struct cpu_dev __cpuinitconst intel_cpu_dev = { [8] = "Mobile Pentium MMX" } }, - { .vendor = X86_VENDOR_INTEL, .family = 6, .model_names = + { .family = 6, .model_names = { [0] = "Pentium Pro A-step", [1] = "Pentium Pro", @@ -508,7 +729,7 @@ static const struct cpu_dev __cpuinitconst intel_cpu_dev = { [11] = "Pentium III (Tualatin)", } }, - { .vendor = X86_VENDOR_INTEL, .family = 15, .model_names = + { .family = 15, .model_names = { [0] = "Pentium 4 (Unknown)", [1] = "Pentium 4 (Willamette)", @@ -518,8 +739,9 @@ static const struct cpu_dev __cpuinitconst intel_cpu_dev = { } }, }, - .c_size_cache = intel_size_cache, + .legacy_cache_size = intel_size_cache, #endif + .c_detect_tlb = intel_detect_tlb, .c_early_init = early_init_intel, .c_init = init_intel, .c_x86_vendor = X86_VENDOR_INTEL, diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 17ad0336621..9c8f7394c61 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -1,5 +1,5 @@ /* - * Routines to indentify caches on Intel CPU. + * Routines to identify caches on Intel CPU. * * Changes: * Venkatesh Pallipadi : Adding cache identification through cpuid(4) @@ -37,7 +37,7 @@ struct _cache_table { /* All the cache descriptor types we care about (no TLB or trace cache entries) */ -static const struct _cache_table __cpuinitconst cache_table[] = +static const struct _cache_table cache_table[] = { { 0x06, LVL_1_INST, 8 }, /* 4-way set assoc, 32 byte line size */ { 0x08, LVL_1_INST, 16 }, /* 4-way set assoc, 32 byte line size */ @@ -45,6 +45,7 @@ static const struct _cache_table __cpuinitconst cache_table[] = { 0x0a, LVL_1_DATA, 8 }, /* 2 way set assoc, 32 byte line size */ { 0x0c, LVL_1_DATA, 16 }, /* 4-way set assoc, 32 byte line size */ { 0x0d, LVL_1_DATA, 16 }, /* 4-way set assoc, 64 byte line size */ + { 0x0e, LVL_1_DATA, 24 }, /* 6-way set assoc, 64 byte line size */ { 0x21, LVL_2, 256 }, /* 8-way set assoc, 64 byte line size */ { 0x22, LVL_3, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */ { 0x23, LVL_3, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */ @@ -66,6 +67,7 @@ static const struct _cache_table __cpuinitconst cache_table[] = { 0x45, LVL_2, MB(2) }, /* 4-way set assoc, 32 byte line size */ { 0x46, LVL_3, MB(4) }, /* 4-way set assoc, 64 byte line size */ { 0x47, LVL_3, MB(8) }, /* 8-way set assoc, 64 byte line size */ + { 0x48, LVL_2, MB(3) }, /* 12-way set assoc, 64 byte line size */ { 0x49, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */ { 0x4a, LVL_3, MB(6) }, /* 12-way set assoc, 64 byte line size */ { 0x4b, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */ @@ -87,6 +89,7 @@ static const struct _cache_table __cpuinitconst cache_table[] = { 0x7c, LVL_2, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */ { 0x7d, LVL_2, MB(2) }, /* 8-way set assoc, 64 byte line size */ { 0x7f, LVL_2, 512 }, /* 2-way set assoc, 64 byte line size */ + { 0x80, LVL_2, 512 }, /* 8-way set assoc, 64 byte line size */ { 0x82, LVL_2, 256 }, /* 8-way set assoc, 32 byte line size */ { 0x83, LVL_2, 512 }, /* 8-way set assoc, 32 byte line size */ { 0x84, LVL_2, MB(1) }, /* 8-way set assoc, 32 byte line size */ @@ -148,29 +151,17 @@ union _cpuid4_leaf_ecx { u32 full; }; -struct amd_l3_cache { - struct pci_dev *dev; - bool can_disable; - unsigned indices; - u8 subcaches[4]; -}; - -struct _cpuid4_info { +struct _cpuid4_info_regs { union _cpuid4_leaf_eax eax; union _cpuid4_leaf_ebx ebx; union _cpuid4_leaf_ecx ecx; unsigned long size; - struct amd_l3_cache *l3; - DECLARE_BITMAP(shared_cpu_map, NR_CPUS); + struct amd_northbridge *nb; }; -/* subset of above _cpuid4_info w/o shared_cpu_map */ -struct _cpuid4_info_regs { - union _cpuid4_leaf_eax eax; - union _cpuid4_leaf_ebx ebx; - union _cpuid4_leaf_ecx ecx; - unsigned long size; - struct amd_l3_cache *l3; +struct _cpuid4_info { + struct _cpuid4_info_regs base; + DECLARE_BITMAP(shared_cpu_map, NR_CPUS); }; unsigned short num_cache_leaves; @@ -212,7 +203,7 @@ union l3_cache { unsigned val; }; -static const unsigned short __cpuinitconst assocs[] = { +static const unsigned short assocs[] = { [1] = 1, [2] = 2, [4] = 4, @@ -226,10 +217,10 @@ static const unsigned short __cpuinitconst assocs[] = { [0xf] = 0xffff /* fully associative - no way to show this currently */ }; -static const unsigned char __cpuinitconst levels[] = { 1, 1, 2, 3 }; -static const unsigned char __cpuinitconst types[] = { 1, 2, 3, 3 }; +static const unsigned char levels[] = { 1, 1, 2, 3 }; +static const unsigned char types[] = { 1, 2, 3, 3 }; -static void __cpuinit +static void amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, union _cpuid4_leaf_ebx *ebx, union _cpuid4_leaf_ecx *ecx) @@ -266,7 +257,7 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, line_size = l2.line_size; lines_per_tag = l2.lines_per_tag; /* cpu_data has errata corrections for K7 applied */ - size_in_kb = current_cpu_data.x86_cache_size; + size_in_kb = __this_cpu_read(cpu_info.x86_cache_size); break; case 3: if (!l3.val) @@ -288,7 +279,7 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, eax->split.type = types[leaf]; eax->split.level = levels[leaf]; eax->split.num_threads_sharing = 0; - eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1; + eax->split.num_cores_on_die = __this_cpu_read(cpu_info.x86_max_cores) - 1; if (assoc == 0xffff) @@ -302,99 +293,50 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, struct _cache_attr { struct attribute attr; - ssize_t (*show)(struct _cpuid4_info *, char *); - ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count); + ssize_t (*show)(struct _cpuid4_info *, char *, unsigned int); + ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count, + unsigned int); }; -#ifdef CONFIG_AMD_NB - +#if defined(CONFIG_AMD_NB) && defined(CONFIG_SYSFS) /* * L3 cache descriptors */ -static struct amd_l3_cache **__cpuinitdata l3_caches; - -static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3) +static void amd_calc_l3_indices(struct amd_northbridge *nb) { + struct amd_l3_cache *l3 = &nb->l3_cache; unsigned int sc0, sc1, sc2, sc3; u32 val = 0; - pci_read_config_dword(l3->dev, 0x1C4, &val); + pci_read_config_dword(nb->misc, 0x1C4, &val); /* calculate subcache sizes */ l3->subcaches[0] = sc0 = !(val & BIT(0)); l3->subcaches[1] = sc1 = !(val & BIT(4)); - l3->subcaches[2] = sc2 = !(val & BIT(8)) + !(val & BIT(9)); - l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13)); - l3->indices = (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1; - l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1; -} - -static struct amd_l3_cache * __cpuinit amd_init_l3_cache(int node) -{ - struct amd_l3_cache *l3; - struct pci_dev *dev = node_to_k8_nb_misc(node); - - l3 = kzalloc(sizeof(struct amd_l3_cache), GFP_ATOMIC); - if (!l3) { - printk(KERN_WARNING "Error allocating L3 struct\n"); - return NULL; + if (boot_cpu_data.x86 == 0x15) { + l3->subcaches[0] = sc0 += !(val & BIT(1)); + l3->subcaches[1] = sc1 += !(val & BIT(5)); } - l3->dev = dev; - - amd_calc_l3_indices(l3); + l3->subcaches[2] = sc2 = !(val & BIT(8)) + !(val & BIT(9)); + l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13)); - return l3; + l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1; } -static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, - int index) +static void amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index) { int node; - if (boot_cpu_data.x86 != 0x10) - return; - + /* only for L3, and not in virtualized environments */ if (index < 3) return; - /* see errata #382 and #388 */ - if (boot_cpu_data.x86_model < 0x8) - return; - - if ((boot_cpu_data.x86_model == 0x8 || - boot_cpu_data.x86_model == 0x9) - && - boot_cpu_data.x86_mask < 0x1) - return; - - /* not in virtualized environments */ - if (k8_northbridges.num == 0) - return; - - /* - * Strictly speaking, the amount in @size below is leaked since it is - * never freed but this is done only on shutdown so it doesn't matter. - */ - if (!l3_caches) { - int size = k8_northbridges.num * sizeof(struct amd_l3_cache *); - - l3_caches = kzalloc(size, GFP_ATOMIC); - if (!l3_caches) - return; - } - node = amd_get_nb_id(smp_processor_id()); - - if (!l3_caches[node]) { - l3_caches[node] = amd_init_l3_cache(node); - l3_caches[node]->can_disable = true; - } - - WARN_ON(!l3_caches[node]); - - this_leaf->l3 = l3_caches[node]; + this_leaf->nb = node_to_amd_nb(node); + if (this_leaf->nb && !this_leaf->nb->l3_cache.indices) + amd_calc_l3_indices(this_leaf->nb); } /* @@ -404,11 +346,11 @@ static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, * * @returns: the disabled index if used or negative value if slot free. */ -int amd_get_l3_disable_slot(struct amd_l3_cache *l3, unsigned slot) +int amd_get_l3_disable_slot(struct amd_northbridge *nb, unsigned slot) { unsigned int reg = 0; - pci_read_config_dword(l3->dev, 0x1BC + slot * 4, ®); + pci_read_config_dword(nb->misc, 0x1BC + slot * 4, ®); /* check whether this slot is activated already */ if (reg & (3UL << 30)) @@ -422,10 +364,10 @@ static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, { int index; - if (!this_leaf->l3 || !this_leaf->l3->can_disable) + if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) return -EINVAL; - index = amd_get_l3_disable_slot(this_leaf->l3, slot); + index = amd_get_l3_disable_slot(this_leaf->base.nb, slot); if (index >= 0) return sprintf(buf, "%d\n", index); @@ -434,14 +376,15 @@ static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, #define SHOW_CACHE_DISABLE(slot) \ static ssize_t \ -show_cache_disable_##slot(struct _cpuid4_info *this_leaf, char *buf) \ +show_cache_disable_##slot(struct _cpuid4_info *this_leaf, char *buf, \ + unsigned int cpu) \ { \ return show_cache_disable(this_leaf, buf, slot); \ } SHOW_CACHE_DISABLE(0) SHOW_CACHE_DISABLE(1) -static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu, +static void amd_l3_disable_index(struct amd_northbridge *nb, int cpu, unsigned slot, unsigned long idx) { int i; @@ -454,10 +397,10 @@ static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu, for (i = 0; i < 4; i++) { u32 reg = idx | (i << 20); - if (!l3->subcaches[i]) + if (!nb->l3_cache.subcaches[i]) continue; - pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg); + pci_write_config_dword(nb->misc, 0x1BC + slot * 4, reg); /* * We need to WBINVD on a core on the node containing the L3 @@ -467,7 +410,7 @@ static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu, wbinvd_on_cpu(cpu); reg |= BIT(31); - pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg); + pci_write_config_dword(nb->misc, 0x1BC + slot * 4, reg); } } @@ -481,35 +424,24 @@ static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu, * * @return: 0 on success, error status on failure */ -int amd_set_l3_disable_slot(struct amd_l3_cache *l3, int cpu, unsigned slot, +int amd_set_l3_disable_slot(struct amd_northbridge *nb, int cpu, unsigned slot, unsigned long index) { int ret = 0; -#define SUBCACHE_MASK (3UL << 20) -#define SUBCACHE_INDEX 0xfff - - /* - * check whether this slot is already used or - * the index is already disabled - */ - ret = amd_get_l3_disable_slot(l3, slot); + /* check if @slot is already used or the index is already disabled */ + ret = amd_get_l3_disable_slot(nb, slot); if (ret >= 0) - return -EINVAL; + return -EEXIST; - /* - * check whether the other slot has disabled the - * same index already - */ - if (index == amd_get_l3_disable_slot(l3, !slot)) + if (index > nb->l3_cache.indices) return -EINVAL; - /* do not allow writes outside of allowed bits */ - if ((index & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) || - ((index & SUBCACHE_INDEX) > l3->indices)) - return -EINVAL; + /* check whether the other slot has disabled the same index already */ + if (index == amd_get_l3_disable_slot(nb, !slot)) + return -EEXIST; - amd_l3_disable_index(l3, cpu, slot, index); + amd_l3_disable_index(nb, cpu, slot, index); return 0; } @@ -524,7 +456,7 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!this_leaf->l3 || !this_leaf->l3->can_disable) + if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) return -EINVAL; cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); @@ -532,11 +464,11 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, if (strict_strtoul(buf, 10, &val) < 0) return -EINVAL; - err = amd_set_l3_disable_slot(this_leaf->l3, cpu, slot, val); + err = amd_set_l3_disable_slot(this_leaf->base.nb, cpu, slot, val); if (err) { if (err == -EEXIST) - printk(KERN_WARNING "L3 disable slot %d in use!\n", - slot); + pr_warning("L3 slot %d in use/index already disabled!\n", + slot); return err; } return count; @@ -545,7 +477,8 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, #define STORE_CACHE_DISABLE(slot) \ static ssize_t \ store_cache_disable_##slot(struct _cpuid4_info *this_leaf, \ - const char *buf, size_t count) \ + const char *buf, size_t count, \ + unsigned int cpu) \ { \ return store_cache_disable(this_leaf, buf, count, slot); \ } @@ -557,25 +490,58 @@ static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644, static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, show_cache_disable_1, store_cache_disable_1); -#else /* CONFIG_AMD_NB */ -static void __cpuinit -amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, int index) +static ssize_t +show_subcaches(struct _cpuid4_info *this_leaf, char *buf, unsigned int cpu) { -}; -#endif /* CONFIG_AMD_NB */ + if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + return -EINVAL; + + return sprintf(buf, "%x\n", amd_get_subcaches(cpu)); +} + +static ssize_t +store_subcaches(struct _cpuid4_info *this_leaf, const char *buf, size_t count, + unsigned int cpu) +{ + unsigned long val; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + return -EINVAL; + + if (strict_strtoul(buf, 16, &val) < 0) + return -EINVAL; + + if (amd_set_subcaches(cpu, val)) + return -EINVAL; + + return count; +} + +static struct _cache_attr subcaches = + __ATTR(subcaches, 0644, show_subcaches, store_subcaches); + +#else +#define amd_init_l3_cache(x, y) +#endif /* CONFIG_AMD_NB && CONFIG_SYSFS */ static int -__cpuinit cpuid4_cache_lookup_regs(int index, - struct _cpuid4_info_regs *this_leaf) +cpuid4_cache_lookup_regs(int index, struct _cpuid4_info_regs *this_leaf) { - union _cpuid4_leaf_eax eax; - union _cpuid4_leaf_ebx ebx; - union _cpuid4_leaf_ecx ecx; + union _cpuid4_leaf_eax eax; + union _cpuid4_leaf_ebx ebx; + union _cpuid4_leaf_ecx ecx; unsigned edx; if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { - amd_cpuid4(index, &eax, &ebx, &ecx); - amd_check_l3_disable(this_leaf, index); + if (cpu_has_topoext) + cpuid_count(0x8000001d, index, &eax.full, + &ebx.full, &ecx.full, &edx); + else + amd_cpuid4(index, &eax, &ebx, &ecx); + amd_init_l3_cache(this_leaf, index); } else { cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); } @@ -593,22 +559,40 @@ __cpuinit cpuid4_cache_lookup_regs(int index, return 0; } -static int __cpuinit find_num_cache_leaves(void) +static int find_num_cache_leaves(struct cpuinfo_x86 *c) { - unsigned int eax, ebx, ecx, edx; + unsigned int eax, ebx, ecx, edx, op; union _cpuid4_leaf_eax cache_eax; int i = -1; + if (c->x86_vendor == X86_VENDOR_AMD) + op = 0x8000001d; + else + op = 4; + do { ++i; - /* Do cpuid(4) loop to find out num_cache_leaves */ - cpuid_count(4, i, &eax, &ebx, &ecx, &edx); + /* Do cpuid(op) loop to find out num_cache_leaves */ + cpuid_count(op, i, &eax, &ebx, &ecx, &edx); cache_eax.full = eax; } while (cache_eax.split.type != CACHE_TYPE_NULL); return i; } -unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) +void init_amd_cacheinfo(struct cpuinfo_x86 *c) +{ + + if (cpu_has_topoext) { + num_cache_leaves = find_num_cache_leaves(c); + } else if (c->extended_cpuid_level >= 0x80000006) { + if (cpuid_edx(0x80000006) & 0xf000) + num_cache_leaves = 4; + else + num_cache_leaves = 3; + } +} + +unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c) { /* Cache sizes */ unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0; @@ -624,7 +608,7 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) if (is_initialized == 0) { /* Init num_cache_leaves from boot CPU */ - num_cache_leaves = find_num_cache_leaves(); + num_cache_leaves = find_num_cache_leaves(c); is_initialized++; } @@ -633,36 +617,34 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) * parameters cpuid leaf to find the cache details */ for (i = 0; i < num_cache_leaves; i++) { - struct _cpuid4_info_regs this_leaf; + struct _cpuid4_info_regs this_leaf = {}; int retval; retval = cpuid4_cache_lookup_regs(i, &this_leaf); - if (retval >= 0) { - switch (this_leaf.eax.split.level) { - case 1: - if (this_leaf.eax.split.type == - CACHE_TYPE_DATA) - new_l1d = this_leaf.size/1024; - else if (this_leaf.eax.split.type == - CACHE_TYPE_INST) - new_l1i = this_leaf.size/1024; - break; - case 2: - new_l2 = this_leaf.size/1024; - num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; - index_msb = get_count_order(num_threads_sharing); - l2_id = c->apicid >> index_msb; - break; - case 3: - new_l3 = this_leaf.size/1024; - num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; - index_msb = get_count_order( - num_threads_sharing); - l3_id = c->apicid >> index_msb; - break; - default: - break; - } + if (retval < 0) + continue; + + switch (this_leaf.eax.split.level) { + case 1: + if (this_leaf.eax.split.type == CACHE_TYPE_DATA) + new_l1d = this_leaf.size/1024; + else if (this_leaf.eax.split.type == CACHE_TYPE_INST) + new_l1i = this_leaf.size/1024; + break; + case 2: + new_l2 = this_leaf.size/1024; + num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; + index_msb = get_count_order(num_threads_sharing); + l2_id = c->apicid & ~((1 << index_msb) - 1); + break; + case 3: + new_l3 = this_leaf.size/1024; + num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; + index_msb = get_count_order(num_threads_sharing); + l3_id = c->apicid & ~((1 << index_msb) - 1); + break; + default: + break; } } } @@ -748,6 +730,18 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) #endif } +#ifdef CONFIG_X86_HT + /* + * If cpu_llc_id is not yet set, this means cpuid_level < 4 which in + * turns means that the only possibility is SMT (as indicated in + * cpuid1). Since cpuid2 doesn't specify shared caches, and we know + * that SMT shares all caches, we can unconditionally set cpu_llc_id to + * c->phys_proc_id. + */ + if (per_cpu(cpu_llc_id, cpu) == BAD_APICID) + per_cpu(cpu_llc_id, cpu) = c->phys_proc_id; +#endif + c->x86_cache_size = l3 ? l3 : (l2 ? l2 : (l1i+l1d)); return l2; @@ -760,28 +754,70 @@ static DEFINE_PER_CPU(struct _cpuid4_info *, ici_cpuid4_info); #define CPUID4_INFO_IDX(x, y) (&((per_cpu(ici_cpuid4_info, x))[y])) #ifdef CONFIG_SMP -static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) + +static int cache_shared_amd_cpu_map_setup(unsigned int cpu, int index) { - struct _cpuid4_info *this_leaf, *sibling_leaf; - unsigned long num_threads_sharing; - int index_msb, i, sibling; - struct cpuinfo_x86 *c = &cpu_data(cpu); + struct _cpuid4_info *this_leaf; + int i, sibling; - if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) { - for_each_cpu(i, c->llc_shared_map) { + if (cpu_has_topoext) { + unsigned int apicid, nshared, first, last; + + if (!per_cpu(ici_cpuid4_info, cpu)) + return 0; + + this_leaf = CPUID4_INFO_IDX(cpu, index); + nshared = this_leaf->base.eax.split.num_threads_sharing + 1; + apicid = cpu_data(cpu).apicid; + first = apicid - (apicid % nshared); + last = first + nshared - 1; + + for_each_online_cpu(i) { + apicid = cpu_data(i).apicid; + if ((apicid < first) || (apicid > last)) + continue; + if (!per_cpu(ici_cpuid4_info, i)) + continue; + this_leaf = CPUID4_INFO_IDX(i, index); + + for_each_online_cpu(sibling) { + apicid = cpu_data(sibling).apicid; + if ((apicid < first) || (apicid > last)) + continue; + set_bit(sibling, this_leaf->shared_cpu_map); + } + } + } else if (index == 3) { + for_each_cpu(i, cpu_llc_shared_mask(cpu)) { if (!per_cpu(ici_cpuid4_info, i)) continue; this_leaf = CPUID4_INFO_IDX(i, index); - for_each_cpu(sibling, c->llc_shared_map) { + for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) { if (!cpu_online(sibling)) continue; set_bit(sibling, this_leaf->shared_cpu_map); } } - return; + } else + return 0; + + return 1; +} + +static void cache_shared_cpu_map_setup(unsigned int cpu, int index) +{ + struct _cpuid4_info *this_leaf, *sibling_leaf; + unsigned long num_threads_sharing; + int index_msb, i; + struct cpuinfo_x86 *c = &cpu_data(cpu); + + if (c->x86_vendor == X86_VENDOR_AMD) { + if (cache_shared_amd_cpu_map_setup(cpu, index)) + return; } + this_leaf = CPUID4_INFO_IDX(cpu, index); - num_threads_sharing = 1 + this_leaf->eax.split.num_threads_sharing; + num_threads_sharing = 1 + this_leaf->base.eax.split.num_threads_sharing; if (num_threads_sharing == 1) cpumask_set_cpu(cpu, to_cpumask(this_leaf->shared_cpu_map)); @@ -803,7 +839,7 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) } } } -static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index) +static void cache_remove_shared_cpu_map(unsigned int cpu, int index) { struct _cpuid4_info *this_leaf, *sibling_leaf; int sibling; @@ -816,45 +852,35 @@ static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index) } } #else -static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) +static void cache_shared_cpu_map_setup(unsigned int cpu, int index) { } -static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index) +static void cache_remove_shared_cpu_map(unsigned int cpu, int index) { } #endif -static void __cpuinit free_cache_attributes(unsigned int cpu) +static void free_cache_attributes(unsigned int cpu) { int i; for (i = 0; i < num_cache_leaves; i++) cache_remove_shared_cpu_map(cpu, i); - kfree(per_cpu(ici_cpuid4_info, cpu)->l3); kfree(per_cpu(ici_cpuid4_info, cpu)); per_cpu(ici_cpuid4_info, cpu) = NULL; } -static int -__cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf) -{ - struct _cpuid4_info_regs *leaf_regs = - (struct _cpuid4_info_regs *)this_leaf; - - return cpuid4_cache_lookup_regs(index, leaf_regs); -} - -static void __cpuinit get_cpu_leaves(void *_retval) +static void get_cpu_leaves(void *_retval) { int j, *retval = _retval, cpu = smp_processor_id(); /* Do cpuid and store the results */ for (j = 0; j < num_cache_leaves; j++) { - struct _cpuid4_info *this_leaf; - this_leaf = CPUID4_INFO_IDX(cpu, j); - *retval = cpuid4_cache_lookup(j, this_leaf); + struct _cpuid4_info *this_leaf = CPUID4_INFO_IDX(cpu, j); + + *retval = cpuid4_cache_lookup_regs(j, &this_leaf->base); if (unlikely(*retval < 0)) { int i; @@ -866,7 +892,7 @@ static void __cpuinit get_cpu_leaves(void *_retval) } } -static int __cpuinit detect_cache_attributes(unsigned int cpu) +static int detect_cache_attributes(unsigned int cpu) { int retval; @@ -889,8 +915,7 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu) #include <linux/kobject.h> #include <linux/sysfs.h> - -extern struct sysdev_class cpu_sysdev_class; /* from drivers/base/cpu.c */ +#include <linux/cpu.h> /* pointer to kobject for cpuX/cache */ static DEFINE_PER_CPU(struct kobject *, ici_cache_kobject); @@ -906,21 +931,22 @@ static DEFINE_PER_CPU(struct _index_kobject *, ici_index_kobject); #define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(ici_index_kobject, x))[y])) #define show_one_plus(file_name, object, val) \ -static ssize_t show_##file_name \ - (struct _cpuid4_info *this_leaf, char *buf) \ +static ssize_t show_##file_name(struct _cpuid4_info *this_leaf, char *buf, \ + unsigned int cpu) \ { \ return sprintf(buf, "%lu\n", (unsigned long)this_leaf->object + val); \ } -show_one_plus(level, eax.split.level, 0); -show_one_plus(coherency_line_size, ebx.split.coherency_line_size, 1); -show_one_plus(physical_line_partition, ebx.split.physical_line_partition, 1); -show_one_plus(ways_of_associativity, ebx.split.ways_of_associativity, 1); -show_one_plus(number_of_sets, ecx.split.number_of_sets, 1); +show_one_plus(level, base.eax.split.level, 0); +show_one_plus(coherency_line_size, base.ebx.split.coherency_line_size, 1); +show_one_plus(physical_line_partition, base.ebx.split.physical_line_partition, 1); +show_one_plus(ways_of_associativity, base.ebx.split.ways_of_associativity, 1); +show_one_plus(number_of_sets, base.ecx.split.number_of_sets, 1); -static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf) +static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf, + unsigned int cpu) { - return sprintf(buf, "%luK\n", this_leaf->size / 1024); + return sprintf(buf, "%luK\n", this_leaf->base.size / 1024); } static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf, @@ -942,19 +968,22 @@ static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf, return n; } -static inline ssize_t show_shared_cpu_map(struct _cpuid4_info *leaf, char *buf) +static inline ssize_t show_shared_cpu_map(struct _cpuid4_info *leaf, char *buf, + unsigned int cpu) { return show_shared_cpu_map_func(leaf, 0, buf); } -static inline ssize_t show_shared_cpu_list(struct _cpuid4_info *leaf, char *buf) +static inline ssize_t show_shared_cpu_list(struct _cpuid4_info *leaf, char *buf, + unsigned int cpu) { return show_shared_cpu_map_func(leaf, 1, buf); } -static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) +static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf, + unsigned int cpu) { - switch (this_leaf->eax.split.type) { + switch (this_leaf->base.eax.split.type) { case CACHE_TYPE_DATA: return sprintf(buf, "Data\n"); case CACHE_TYPE_INST: @@ -983,30 +1012,54 @@ define_one_ro(size); define_one_ro(shared_cpu_map); define_one_ro(shared_cpu_list); -#define DEFAULT_SYSFS_CACHE_ATTRS \ - &type.attr, \ - &level.attr, \ - &coherency_line_size.attr, \ - &physical_line_partition.attr, \ - &ways_of_associativity.attr, \ - &number_of_sets.attr, \ - &size.attr, \ - &shared_cpu_map.attr, \ - &shared_cpu_list.attr - static struct attribute *default_attrs[] = { - DEFAULT_SYSFS_CACHE_ATTRS, + &type.attr, + &level.attr, + &coherency_line_size.attr, + &physical_line_partition.attr, + &ways_of_associativity.attr, + &number_of_sets.attr, + &size.attr, + &shared_cpu_map.attr, + &shared_cpu_list.attr, NULL }; -static struct attribute *default_l3_attrs[] = { - DEFAULT_SYSFS_CACHE_ATTRS, #ifdef CONFIG_AMD_NB - &cache_disable_0.attr, - &cache_disable_1.attr, +static struct attribute **amd_l3_attrs(void) +{ + static struct attribute **attrs; + int n; + + if (attrs) + return attrs; + + n = ARRAY_SIZE(default_attrs); + + if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) + n += 2; + + if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + n += 1; + + attrs = kzalloc(n * sizeof (struct attribute *), GFP_KERNEL); + if (attrs == NULL) + return attrs = default_attrs; + + for (n = 0; default_attrs[n]; n++) + attrs[n] = default_attrs[n]; + + if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) { + attrs[n++] = &cache_disable_0.attr; + attrs[n++] = &cache_disable_1.attr; + } + + if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + attrs[n++] = &subcaches.attr; + + return attrs; +} #endif - NULL -}; static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) { @@ -1016,7 +1069,7 @@ static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) ret = fattr->show ? fattr->show(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index), - buf) : + buf, this_leaf->cpu) : 0; return ret; } @@ -1030,7 +1083,7 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr, ret = fattr->store ? fattr->store(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index), - buf, count) : + buf, count, this_leaf->cpu) : 0; return ret; } @@ -1049,7 +1102,7 @@ static struct kobj_type ktype_percpu_entry = { .sysfs_ops = &sysfs_ops, }; -static void __cpuinit cpuid4_cache_sysfs_exit(unsigned int cpu) +static void cpuid4_cache_sysfs_exit(unsigned int cpu) { kfree(per_cpu(ici_cache_kobject, cpu)); kfree(per_cpu(ici_index_kobject, cpu)); @@ -1058,7 +1111,7 @@ static void __cpuinit cpuid4_cache_sysfs_exit(unsigned int cpu) free_cache_attributes(cpu); } -static int __cpuinit cpuid4_cache_sysfs_init(unsigned int cpu) +static int cpuid4_cache_sysfs_init(unsigned int cpu) { int err; @@ -1090,9 +1143,9 @@ err_out: static DECLARE_BITMAP(cache_dev_map, NR_CPUS); /* Add/Remove cache interface for CPU device */ -static int __cpuinit cache_add_dev(struct sys_device * sys_dev) +static int cache_add_dev(struct device *dev) { - unsigned int cpu = sys_dev->id; + unsigned int cpu = dev->id; unsigned long i, j; struct _index_kobject *this_object; struct _cpuid4_info *this_leaf; @@ -1104,7 +1157,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) retval = kobject_init_and_add(per_cpu(ici_cache_kobject, cpu), &ktype_percpu_entry, - &sys_dev->kobj, "%s", "cache"); + &dev->kobj, "%s", "cache"); if (retval < 0) { cpuid4_cache_sysfs_exit(cpu); return retval; @@ -1117,11 +1170,11 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) this_leaf = CPUID4_INFO_IDX(cpu, i); - if (this_leaf->l3 && this_leaf->l3->can_disable) - ktype_cache.default_attrs = default_l3_attrs; - else - ktype_cache.default_attrs = default_attrs; - + ktype_cache.default_attrs = default_attrs; +#ifdef CONFIG_AMD_NB + if (this_leaf->base.nb) + ktype_cache.default_attrs = amd_l3_attrs(); +#endif retval = kobject_init_and_add(&(this_object->kobj), &ktype_cache, per_cpu(ici_cache_kobject, cpu), @@ -1141,9 +1194,9 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) return 0; } -static void __cpuinit cache_remove_dev(struct sys_device * sys_dev) +static void cache_remove_dev(struct device *dev) { - unsigned int cpu = sys_dev->id; + unsigned int cpu = dev->id; unsigned long i; if (per_cpu(ici_cpuid4_info, cpu) == NULL) @@ -1158,47 +1211,50 @@ static void __cpuinit cache_remove_dev(struct sys_device * sys_dev) cpuid4_cache_sysfs_exit(cpu); } -static int __cpuinit cacheinfo_cpu_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) +static int cacheinfo_cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; - struct sys_device *sys_dev; + struct device *dev; - sys_dev = get_cpu_sysdev(cpu); + dev = get_cpu_device(cpu); switch (action) { case CPU_ONLINE: case CPU_ONLINE_FROZEN: - cache_add_dev(sys_dev); + cache_add_dev(dev); break; case CPU_DEAD: case CPU_DEAD_FROZEN: - cache_remove_dev(sys_dev); + cache_remove_dev(dev); break; } return NOTIFY_OK; } -static struct notifier_block __cpuinitdata cacheinfo_cpu_notifier = { +static struct notifier_block cacheinfo_cpu_notifier = { .notifier_call = cacheinfo_cpu_callback, }; -static int __cpuinit cache_sysfs_init(void) +static int __init cache_sysfs_init(void) { - int i; + int i, err = 0; if (num_cache_leaves == 0) return 0; + cpu_notifier_register_begin(); for_each_online_cpu(i) { - int err; - struct sys_device *sys_dev = get_cpu_sysdev(i); + struct device *dev = get_cpu_device(i); - err = cache_add_dev(sys_dev); + err = cache_add_dev(dev); if (err) - return err; + goto out; } - register_hotcpu_notifier(&cacheinfo_cpu_notifier); - return 0; + __register_hotcpu_notifier(&cacheinfo_cpu_notifier); + +out: + cpu_notifier_register_done(); + return err; } device_initcall(cache_sysfs_init); diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c new file mode 100644 index 00000000000..afa9f0d487e --- /dev/null +++ b/arch/x86/kernel/cpu/match.c @@ -0,0 +1,49 @@ +#include <asm/cpu_device_id.h> +#include <asm/processor.h> +#include <linux/cpu.h> +#include <linux/module.h> +#include <linux/slab.h> + +/** + * x86_match_cpu - match current CPU again an array of x86_cpu_ids + * @match: Pointer to array of x86_cpu_ids. Last entry terminated with + * {}. + * + * Return the entry if the current CPU matches the entries in the + * passed x86_cpu_id match table. Otherwise NULL. The match table + * contains vendor (X86_VENDOR_*), family, model and feature bits or + * respective wildcard entries. + * + * A typical table entry would be to match a specific CPU + * { X86_VENDOR_INTEL, 6, 0x12 } + * or to match a specific CPU feature + * { X86_FEATURE_MATCH(X86_FEATURE_FOOBAR) } + * + * Fields can be wildcarded with %X86_VENDOR_ANY, %X86_FAMILY_ANY, + * %X86_MODEL_ANY, %X86_FEATURE_ANY or 0 (except for vendor) + * + * Arrays used to match for this should also be declared using + * MODULE_DEVICE_TABLE(x86cpu, ...) + * + * This always matches against the boot cpu, assuming models and features are + * consistent over all CPUs. + */ +const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match) +{ + const struct x86_cpu_id *m; + struct cpuinfo_x86 *c = &boot_cpu_data; + + for (m = match; m->vendor | m->family | m->model | m->feature; m++) { + if (m->vendor != X86_VENDOR_ANY && c->x86_vendor != m->vendor) + continue; + if (m->family != X86_FAMILY_ANY && c->x86 != m->family) + continue; + if (m->model != X86_MODEL_ANY && c->x86_model != m->model) + continue; + if (m->feature != X86_FEATURE_ANY && !cpu_has(c, m->feature)) + continue; + return m; + } + return NULL; +} +EXPORT_SYMBOL(x86_match_cpu); diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c index 8209472b27a..a1aef953315 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-apei.c +++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c @@ -28,26 +28,33 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#include <linux/export.h> #include <linux/kernel.h> #include <linux/acpi.h> #include <linux/cper.h> #include <acpi/apei.h> +#include <acpi/ghes.h> #include <asm/mce.h> #include "mce-internal.h" -void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err) +void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err) { struct mce m; - /* Only corrected MC is reported */ - if (!corrected) + if (!(mem_err->validation_bits & CPER_MEM_VALID_PA)) return; mce_setup(&m); m.bank = 1; - /* Fake a memory read corrected error with unknown channel */ + /* Fake a memory read error with unknown channel */ m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f; + + if (severity >= GHES_SEV_RECOVERABLE) + m.status |= MCI_STATUS_UC; + if (severity >= GHES_SEV_PANIC) + m.status |= MCI_STATUS_PCC; + m.addr = mem_err->physical_addr; mce_log(&m); mce_notify_irq(); @@ -106,24 +113,34 @@ int apei_write_mce(struct mce *m) ssize_t apei_read_mce(struct mce *m, u64 *record_id) { struct cper_mce_record rcd; - ssize_t len; - - len = erst_read_next(&rcd.hdr, sizeof(rcd)); - if (len <= 0) - return len; - /* Can not skip other records in storage via ERST unless clear them */ - else if (len != sizeof(rcd) || - uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE)) { - if (printk_ratelimit()) - pr_warning( - "MCE-APEI: Can not skip the unknown record in ERST"); - return -EIO; - } - + int rc, pos; + + rc = erst_get_record_id_begin(&pos); + if (rc) + return rc; +retry: + rc = erst_get_record_id_next(&pos, record_id); + if (rc) + goto out; + /* no more record */ + if (*record_id == APEI_ERST_INVALID_RECORD_ID) + goto out; + rc = erst_read(*record_id, &rcd.hdr, sizeof(rcd)); + /* someone else has cleared the record, try next one */ + if (rc == -ENOENT) + goto retry; + else if (rc < 0) + goto out; + /* try to skip other type records in storage */ + else if (rc != sizeof(rcd) || + uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE)) + goto retry; memcpy(m, &rcd.mce, sizeof(*m)); - *record_id = rcd.hdr.record_id; + rc = sizeof(*m); +out: + erst_get_record_id_end(); - return sizeof(*m); + return rc; } /* Check whether there is record in ERST */ diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index e7dbde7bfed..5ac2d1fb28b 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -17,6 +17,7 @@ #include <linux/kernel.h> #include <linux/string.h> #include <linux/fs.h> +#include <linux/preempt.h> #include <linux/smp.h> #include <linux/notifier.h> #include <linux/kdebug.h> @@ -25,13 +26,14 @@ #include <linux/gfp.h> #include <asm/mce.h> #include <asm/apic.h> +#include <asm/nmi.h> /* Update fake mce registers on current CPU. */ static void inject_mce(struct mce *m) { struct mce *i = &per_cpu(injectm, m->extcpu); - /* Make sure noone reads partially written injectm */ + /* Make sure no one reads partially written injectm */ i->finished = 0; mb(); m->finished = 0; @@ -76,27 +78,33 @@ static void raise_exception(struct mce *m, struct pt_regs *pregs) } static cpumask_var_t mce_inject_cpumask; +static DEFINE_MUTEX(mce_inject_mutex); -static int mce_raise_notify(struct notifier_block *self, - unsigned long val, void *data) +static int mce_raise_notify(unsigned int cmd, struct pt_regs *regs) { - struct die_args *args = (struct die_args *)data; int cpu = smp_processor_id(); struct mce *m = &__get_cpu_var(injectm); - if (val != DIE_NMI_IPI || !cpumask_test_cpu(cpu, mce_inject_cpumask)) - return NOTIFY_DONE; + if (!cpumask_test_cpu(cpu, mce_inject_cpumask)) + return NMI_DONE; cpumask_clear_cpu(cpu, mce_inject_cpumask); if (m->inject_flags & MCJ_EXCEPTION) - raise_exception(m, args->regs); + raise_exception(m, regs); else if (m->status) raise_poll(m); - return NOTIFY_STOP; + return NMI_HANDLED; } -static struct notifier_block mce_raise_nb = { - .notifier_call = mce_raise_notify, - .priority = 1000, -}; +static void mce_irq_ipi(void *info) +{ + int cpu = smp_processor_id(); + struct mce *m = &__get_cpu_var(injectm); + + if (cpumask_test_cpu(cpu, mce_inject_cpumask) && + m->inject_flags & MCJ_EXCEPTION) { + cpumask_clear_cpu(cpu, mce_inject_cpumask); + raise_exception(m, NULL); + } +} /* Inject mce on current CPU */ static int raise_local(void) @@ -145,9 +153,10 @@ static void raise_mce(struct mce *m) return; #ifdef CONFIG_X86_LOCAL_APIC - if (m->inject_flags & MCJ_NMI_BROADCAST) { + if (m->inject_flags & (MCJ_IRQ_BROADCAST | MCJ_NMI_BROADCAST)) { unsigned long start; int cpu; + get_online_cpus(); cpumask_copy(mce_inject_cpumask, cpu_online_mask); cpumask_clear_cpu(get_cpu(), mce_inject_cpumask); @@ -157,13 +166,25 @@ static void raise_mce(struct mce *m) MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM) cpumask_clear_cpu(cpu, mce_inject_cpumask); } - if (!cpumask_empty(mce_inject_cpumask)) - apic->send_IPI_mask(mce_inject_cpumask, NMI_VECTOR); + if (!cpumask_empty(mce_inject_cpumask)) { + if (m->inject_flags & MCJ_IRQ_BROADCAST) { + /* + * don't wait because mce_irq_ipi is necessary + * to be sync with following raise_local + */ + preempt_disable(); + smp_call_function_many(mce_inject_cpumask, + mce_irq_ipi, NULL, 0); + preempt_enable(); + } else if (m->inject_flags & MCJ_NMI_BROADCAST) + apic->send_IPI_mask(mce_inject_cpumask, + NMI_VECTOR); + } start = jiffies; while (!cpumask_empty(mce_inject_cpumask)) { if (!time_before(jiffies, start + 2*HZ)) { printk(KERN_ERR - "Timeout waiting for mce inject NMI %lx\n", + "Timeout waiting for mce inject %lx\n", *cpumask_bits(mce_inject_cpumask)); break; } @@ -174,7 +195,11 @@ static void raise_mce(struct mce *m) put_online_cpus(); } else #endif + { + preempt_disable(); raise_local(); + preempt_enable(); + } } /* Error injection interface */ @@ -205,7 +230,10 @@ static ssize_t mce_write(struct file *filp, const char __user *ubuf, * so do it a jiffie or two later everywhere. */ schedule_timeout(2); + + mutex_lock(&mce_inject_mutex); raise_mce(&m); + mutex_unlock(&mce_inject_mutex); return usize; } @@ -214,8 +242,9 @@ static int inject_init(void) if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL)) return -ENOMEM; printk(KERN_INFO "Machine check injector initialized\n"); - mce_chrdev_ops.write = mce_write; - register_die_notifier(&mce_raise_nb); + register_mce_write_callback(mce_write); + register_nmi_handler(NMI_LOCAL, mce_raise_notify, 0, + "mce_notify"); return 0; } diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index fefcc69ee8b..09edd0b65fe 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -1,4 +1,4 @@ -#include <linux/sysdev.h> +#include <linux/device.h> #include <asm/mce.h> enum severity_level { @@ -17,16 +17,29 @@ enum severity_level { struct mce_bank { u64 ctl; /* subevents to enable */ unsigned char init; /* initialise bank? */ - struct sysdev_attribute attr; /* sysdev attribute */ + struct device_attribute attr; /* device attribute */ char attrname[ATTR_LEN]; /* attribute name */ }; int mce_severity(struct mce *a, int tolerant, char **msg); struct dentry *mce_get_debugfs_dir(void); -extern int mce_ser; - extern struct mce_bank *mce_banks; +extern mce_banks_t mce_banks_ce_disabled; + +#ifdef CONFIG_X86_MCE_INTEL +unsigned long mce_intel_adjust_timer(unsigned long interval); +void mce_intel_cmci_poll(void); +void mce_intel_hcpu_update(unsigned long cpu); +void cmci_disable_bank(int bank); +#else +# define mce_intel_adjust_timer mce_adjust_timer_default +static inline void mce_intel_cmci_poll(void) { } +static inline void mce_intel_hcpu_update(unsigned long cpu) { } +static inline void cmci_disable_bank(int bank) { } +#endif + +void mce_timer_kick(unsigned long interval); #ifdef CONFIG_ACPI_APEI int apei_write_mce(struct mce *m); diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 1e8d66c1336..c370e1c4468 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -43,88 +43,154 @@ static struct severity { unsigned char covered; char *msg; } severities[] = { -#define KERNEL .context = IN_KERNEL -#define USER .context = IN_USER -#define SER .ser = SER_REQUIRED -#define NOSER .ser = NO_SER -#define SEV(s) .sev = MCE_ ## s ## _SEVERITY -#define BITCLR(x, s, m, r...) { .mask = x, .result = 0, SEV(s), .msg = m, ## r } -#define BITSET(x, s, m, r...) { .mask = x, .result = x, SEV(s), .msg = m, ## r } -#define MCGMASK(x, res, s, m, r...) \ - { .mcgmask = x, .mcgres = res, SEV(s), .msg = m, ## r } -#define MASK(x, y, s, m, r...) \ - { .mask = x, .result = y, SEV(s), .msg = m, ## r } +#define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c } +#define KERNEL .context = IN_KERNEL +#define USER .context = IN_USER +#define SER .ser = SER_REQUIRED +#define NOSER .ser = NO_SER +#define BITCLR(x) .mask = x, .result = 0 +#define BITSET(x) .mask = x, .result = x +#define MCGMASK(x, y) .mcgmask = x, .mcgres = y +#define MASK(x, y) .mask = x, .result = y #define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S) #define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR) -#define MCACOD 0xffff +#define MCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV) - BITCLR(MCI_STATUS_VAL, NO, "Invalid"), - BITCLR(MCI_STATUS_EN, NO, "Not enabled"), - BITSET(MCI_STATUS_PCC, PANIC, "Processor context corrupt"), + MCESEV( + NO, "Invalid", + BITCLR(MCI_STATUS_VAL) + ), + MCESEV( + NO, "Not enabled", + BITCLR(MCI_STATUS_EN) + ), + MCESEV( + PANIC, "Processor context corrupt", + BITSET(MCI_STATUS_PCC) + ), /* When MCIP is not set something is very confused */ - MCGMASK(MCG_STATUS_MCIP, 0, PANIC, "MCIP not set in MCA handler"), + MCESEV( + PANIC, "MCIP not set in MCA handler", + MCGMASK(MCG_STATUS_MCIP, 0) + ), /* Neither return not error IP -- no chance to recover -> PANIC */ - MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0, PANIC, - "Neither restart nor error IP"), - MCGMASK(MCG_STATUS_RIPV, 0, PANIC, "In kernel and no restart IP", - KERNEL), - BITCLR(MCI_STATUS_UC, KEEP, "Corrected error", NOSER), - MASK(MCI_STATUS_OVER|MCI_STATUS_UC|MCI_STATUS_EN, MCI_STATUS_UC, SOME, - "Spurious not enabled", SER), + MCESEV( + PANIC, "Neither restart nor error IP", + MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0) + ), + MCESEV( + PANIC, "In kernel and no restart IP", + KERNEL, MCGMASK(MCG_STATUS_RIPV, 0) + ), + MCESEV( + KEEP, "Corrected error", + NOSER, BITCLR(MCI_STATUS_UC) + ), /* ignore OVER for UCNA */ - MASK(MCI_UC_SAR, MCI_STATUS_UC, KEEP, - "Uncorrected no action required", SER), - MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR, PANIC, - "Illegal combination (UCNA with AR=1)", SER), - MASK(MCI_STATUS_S, 0, KEEP, "Non signalled machine check", SER), - - /* AR add known MCACODs here */ - MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_SAR, PANIC, - "Action required with lost events", SER), - MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_SAR, PANIC, - "Action required; unknown MCACOD", SER), + MCESEV( + KEEP, "Uncorrected no action required", + SER, MASK(MCI_UC_SAR, MCI_STATUS_UC) + ), + MCESEV( + PANIC, "Illegal combination (UCNA with AR=1)", + SER, + MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR) + ), + MCESEV( + KEEP, "Non signalled machine check", + SER, BITCLR(MCI_STATUS_S) + ), + + MCESEV( + PANIC, "Action required with lost events", + SER, BITSET(MCI_STATUS_OVER|MCI_UC_SAR) + ), + + /* known AR MCACODs: */ +#ifdef CONFIG_MEMORY_FAILURE + MCESEV( + KEEP, "Action required but unaffected thread is continuable", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR, MCI_UC_SAR|MCI_ADDR), + MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, MCG_STATUS_RIPV) + ), + MCESEV( + AR, "Action required: data load error in a user process", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), + USER + ), + MCESEV( + AR, "Action required: instruction fetch error in a user process", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR), + USER + ), +#endif + MCESEV( + PANIC, "Action required: unknown MCACOD", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR) + ), /* known AO MCACODs: */ - MASK(MCI_UC_SAR|MCI_STATUS_OVER|0xfff0, MCI_UC_S|0xc0, AO, - "Action optional: memory scrubbing error", SER), - MASK(MCI_UC_SAR|MCI_STATUS_OVER|MCACOD, MCI_UC_S|0x17a, AO, - "Action optional: last level cache writeback error", SER), - - MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S, SOME, - "Action optional unknown MCACOD", SER), - MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S|MCI_STATUS_OVER, SOME, - "Action optional with lost events", SER), - BITSET(MCI_STATUS_UC|MCI_STATUS_OVER, PANIC, "Overflowed uncorrected"), - BITSET(MCI_STATUS_UC, UC, "Uncorrected"), - BITSET(0, SOME, "No match") /* always matches. keep at end */ + MCESEV( + AO, "Action optional: memory scrubbing error", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD_SCRUBMSK, MCI_UC_S|MCACOD_SCRUB) + ), + MCESEV( + AO, "Action optional: last level cache writeback error", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_S|MCACOD_L3WB) + ), + MCESEV( + SOME, "Action optional: unknown MCACOD", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S) + ), + MCESEV( + SOME, "Action optional with lost events", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_S) + ), + + MCESEV( + PANIC, "Overflowed uncorrected", + BITSET(MCI_STATUS_OVER|MCI_STATUS_UC) + ), + MCESEV( + UC, "Uncorrected", + BITSET(MCI_STATUS_UC) + ), + MCESEV( + SOME, "No match", + BITSET(0) + ) /* always matches. keep at end */ }; /* - * If the EIPV bit is set, it means the saved IP is the - * instruction which caused the MCE. + * If mcgstatus indicated that ip/cs on the stack were + * no good, then "m->cs" will be zero and we will have + * to assume the worst case (IN_KERNEL) as we actually + * have no idea what we were executing when the machine + * check hit. + * If we do have a good "m->cs" (or a faked one in the + * case we were executing in VM86 mode) we can use it to + * distinguish an exception taken in user from from one + * taken in the kernel. */ static int error_context(struct mce *m) { - if (m->mcgstatus & MCG_STATUS_EIPV) - return (m->ip && (m->cs & 3) == 3) ? IN_USER : IN_KERNEL; - /* Unknown, assume kernel */ - return IN_KERNEL; + return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL; } -int mce_severity(struct mce *a, int tolerant, char **msg) +int mce_severity(struct mce *m, int tolerant, char **msg) { - enum context ctx = error_context(a); + enum context ctx = error_context(m); struct severity *s; for (s = severities;; s++) { - if ((a->status & s->mask) != s->result) + if ((m->status & s->mask) != s->result) continue; - if ((a->mcgstatus & s->mcgmask) != s->mcgres) + if ((m->mcgstatus & s->mcgmask) != s->mcgres) continue; - if (s->ser == SER_REQUIRED && !mce_ser) + if (s->ser == SER_REQUIRED && !mca_cfg.ser) continue; - if (s->ser == NO_SER && mce_ser) + if (s->ser == NO_SER && mca_cfg.ser) continue; if (s->context && ctx != s->context) continue; @@ -197,15 +263,15 @@ static const struct file_operations severities_coverage_fops = { static int __init severities_debugfs_init(void) { - struct dentry *dmce = NULL, *fseverities_coverage = NULL; + struct dentry *dmce, *fsev; dmce = mce_get_debugfs_dir(); - if (dmce == NULL) + if (!dmce) goto err_out; - fseverities_coverage = debugfs_create_file("severities-coverage", - 0444, dmce, NULL, - &severities_coverage_fops); - if (fseverities_coverage == NULL) + + fsev = debugfs_create_file("severities-coverage", 0444, dmce, NULL, + &severities_coverage_fops); + if (!fsev) goto err_out; return 0; @@ -214,4 +280,4 @@ err_out: return -ENOMEM; } late_initcall(severities_debugfs_init); -#endif +#endif /* CONFIG_DEBUG_FS */ diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 7a35b72d7c0..9a79c8dbd8e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -7,10 +7,12 @@ * Copyright 2008 Intel Corporation * Author: Andi Kleen */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/thread_info.h> #include <linux/capability.h> #include <linux/miscdevice.h> -#include <linux/interrupt.h> #include <linux/ratelimit.h> #include <linux/kallsyms.h> #include <linux/rcupdate.h> @@ -20,7 +22,8 @@ #include <linux/kernel.h> #include <linux/percpu.h> #include <linux/string.h> -#include <linux/sysdev.h> +#include <linux/device.h> +#include <linux/syscore_ops.h> #include <linux/delay.h> #include <linux/ctype.h> #include <linux/sched.h> @@ -36,95 +39,84 @@ #include <linux/fs.h> #include <linux/mm.h> #include <linux/debugfs.h> -#include <linux/edac_mce.h> +#include <linux/irq_work.h> +#include <linux/export.h> #include <asm/processor.h> -#include <asm/hw_irq.h> -#include <asm/apic.h> -#include <asm/idle.h> -#include <asm/ipi.h> #include <asm/mce.h> #include <asm/msr.h> #include "mce-internal.h" -static DEFINE_MUTEX(mce_read_mutex); +static DEFINE_MUTEX(mce_chrdev_read_mutex); #define rcu_dereference_check_mce(p) \ rcu_dereference_index_check((p), \ rcu_read_lock_sched_held() || \ - lockdep_is_held(&mce_read_mutex)) + lockdep_is_held(&mce_chrdev_read_mutex)) #define CREATE_TRACE_POINTS #include <trace/events/mce.h> -int mce_disabled __read_mostly; - -#define MISC_MCELOG_MINOR 227 - #define SPINUNIT 100 /* 100ns */ -atomic_t mce_entry; - DEFINE_PER_CPU(unsigned, mce_exception_count); -/* - * Tolerant levels: - * 0: always panic on uncorrected errors, log corrected errors - * 1: panic or SIGBUS on uncorrected errors, log corrected errors - * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors - * 3: never panic or SIGBUS, log all errors (for testing only) - */ -static int tolerant __read_mostly = 1; -static int banks __read_mostly; -static int rip_msr __read_mostly; -static int mce_bootlog __read_mostly = -1; -static int monarch_timeout __read_mostly = -1; -static int mce_panic_timeout __read_mostly; -static int mce_dont_log_ce __read_mostly; -int mce_cmci_disabled __read_mostly; -int mce_ignore_ce __read_mostly; -int mce_ser __read_mostly; - -struct mce_bank *mce_banks __read_mostly; +struct mce_bank *mce_banks __read_mostly; + +struct mca_config mca_cfg __read_mostly = { + .bootlog = -1, + /* + * Tolerant levels: + * 0: always panic on uncorrected errors, log corrected errors + * 1: panic or SIGBUS on uncorrected errors, log corrected errors + * 2: SIGBUS or log uncorrected errors (if possible), log corr. errors + * 3: never panic or SIGBUS, log all errors (for testing only) + */ + .tolerant = 1, + .monarch_timeout = -1 +}; /* User mode helper program triggered by machine check event */ static unsigned long mce_need_notify; static char mce_helper[128]; static char *mce_helper_argv[2] = { mce_helper, NULL }; -static DECLARE_WAIT_QUEUE_HEAD(mce_wait); +static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait); + static DEFINE_PER_CPU(struct mce, mces_seen); static int cpu_missing; +/* CMCI storm detection filter */ +static DEFINE_PER_CPU(unsigned long, mce_polled_error); + /* - * CPU/chipset specific EDAC code can register a notifier call here to print - * MCE errors in a human-readable form. + * MCA banks polled by the period polling timer for corrected events. + * With Intel CMCI, this only has MCA banks which do not support CMCI (if any). */ -ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain); -EXPORT_SYMBOL_GPL(x86_mce_decoder_chain); - -static int default_decode_mce(struct notifier_block *nb, unsigned long val, - void *data) -{ - pr_emerg(HW_ERR "No human readable MCE decoding support on this CPU type.\n"); - pr_emerg(HW_ERR "Run the message through 'mcelog --ascii' to decode.\n"); - - return NOTIFY_STOP; -} - -static struct notifier_block mce_dec_nb = { - .notifier_call = default_decode_mce, - .priority = -1, -}; - -/* MCA banks polled by the period polling timer for corrected events */ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL }; +/* + * MCA banks controlled through firmware first for corrected errors. + * This is a global list of banks for which we won't enable CMCI and we + * won't poll. Firmware controls these banks and is responsible for + * reporting corrected errors through GHES. Uncorrected/recoverable + * errors are still notified through a machine check. + */ +mce_banks_t mce_banks_ce_disabled; + static DEFINE_PER_CPU(struct work_struct, mce_work); +static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs); + +/* + * CPU/chipset specific EDAC code can register a notifier call here to print + * MCE errors in a human-readable form. + */ +ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain); + /* Do initial initialization of a struct mce */ void mce_setup(struct mce *m) { @@ -135,9 +127,7 @@ void mce_setup(struct mce *m) m->time = get_seconds(); m->cpuvendor = boot_cpu_data.x86_vendor; m->cpuid = cpuid_eax(1); -#ifdef CONFIG_SMP m->socketid = cpu_data(m->extcpu).phys_proc_id; -#endif m->apicid = cpu_data(m->extcpu).initial_apicid; rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap); } @@ -160,23 +150,20 @@ static struct mce_log mcelog = { void mce_log(struct mce *mce) { unsigned next, entry; + int ret = 0; /* Emit the trace record: */ trace_mce_record(mce); + ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce); + if (ret == NOTIFY_STOP) + return; + mce->finished = 0; wmb(); for (;;) { entry = rcu_dereference_check_mce(mcelog.next); for (;;) { - /* - * If edac_mce is enabled, it will check the error type - * and will process it, if it is a known error. - * Otherwise, the error will be sent through mcelog - * interface - */ - if (edac_mce_parse(mce)) - return; /* * When the buffer fills up discard new entries. @@ -209,8 +196,61 @@ void mce_log(struct mce *mce) set_bit(0, &mce_need_notify); } +static void drain_mcelog_buffer(void) +{ + unsigned int next, i, prev = 0; + + next = ACCESS_ONCE(mcelog.next); + + do { + struct mce *m; + + /* drain what was logged during boot */ + for (i = prev; i < next; i++) { + unsigned long start = jiffies; + unsigned retries = 1; + + m = &mcelog.entry[i]; + + while (!m->finished) { + if (time_after_eq(jiffies, start + 2*retries)) + retries++; + + cpu_relax(); + + if (!m->finished && retries >= 4) { + pr_err("skipping error being logged currently!\n"); + break; + } + } + smp_rmb(); + atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); + } + + memset(mcelog.entry + prev, 0, (next - prev) * sizeof(*m)); + prev = next; + next = cmpxchg(&mcelog.next, prev, 0); + } while (next != prev); +} + + +void mce_register_decode_chain(struct notifier_block *nb) +{ + atomic_notifier_chain_register(&x86_mce_decoder_chain, nb); + drain_mcelog_buffer(); +} +EXPORT_SYMBOL_GPL(mce_register_decode_chain); + +void mce_unregister_decode_chain(struct notifier_block *nb) +{ + atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb); +} +EXPORT_SYMBOL_GPL(mce_unregister_decode_chain); + static void print_mce(struct mce *m) { + int ret = 0; + pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n", m->extcpu, m->mcgstatus, m->bank, m->status); @@ -231,14 +271,23 @@ static void print_mce(struct mce *m) pr_cont("MISC %llx ", m->misc); pr_cont("\n"); - pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", - m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid); + /* + * Note this output is parsed by external tools and old fields + * should not be changed. + */ + pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n", + m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid, + cpu_data(m->extcpu).microcode); /* * Print out human-readable details about the MCE error, * (if the CPU has an implementation for that) */ - atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); + ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); + if (ret == NOTIFY_STOP) + return; + + pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n"); } #define PANIC_TIMEOUT 5 /* 5 seconds */ @@ -258,7 +307,7 @@ static void wait_for_panic(void) while (timeout-- > 0) udelay(1); if (panic_timeout == 0) - panic_timeout = mce_panic_timeout; + panic_timeout = mca_cfg.panic_timeout; panic("Panicing machine check CPU died"); } @@ -316,7 +365,7 @@ static void mce_panic(char *msg, struct mce *final, char *exp) pr_emerg(HW_ERR "Machine check: %s\n", exp); if (!fake_panic) { if (panic_timeout == 0) - panic_timeout = mce_panic_timeout; + panic_timeout = mca_cfg.panic_timeout; panic(msg); } else pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg); @@ -326,9 +375,9 @@ static void mce_panic(char *msg, struct mce *final, char *exp) static int msr_to_offset(u32 msr) { - unsigned bank = __get_cpu_var(injectm.bank); + unsigned bank = __this_cpu_read(injectm.bank); - if (msr == rip_msr) + if (msr == mca_cfg.rip_msr) return offsetof(struct mce, ip); if (msr == MSR_IA32_MCx_STATUS(bank)) return offsetof(struct mce, status); @@ -346,7 +395,7 @@ static u64 mce_rdmsrl(u32 msr) { u64 v; - if (__get_cpu_var(injectm).finished) { + if (__this_cpu_read(injectm.finished)) { int offset = msr_to_offset(msr); if (offset < 0) @@ -369,7 +418,7 @@ static u64 mce_rdmsrl(u32 msr) static void mce_wrmsrl(u32 msr, u64 v) { - if (__get_cpu_var(injectm).finished) { + if (__this_cpu_read(injectm.finished)) { int offset = msr_to_offset(msr); if (offset >= 0) @@ -380,6 +429,39 @@ static void mce_wrmsrl(u32 msr, u64 v) } /* + * Collect all global (w.r.t. this processor) status about this machine + * check into our "mce" struct so that we can use it later to assess + * the severity of the problem as we read per-bank specific details. + */ +static inline void mce_gather_info(struct mce *m, struct pt_regs *regs) +{ + mce_setup(m); + + m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); + if (regs) { + /* + * Get the address of the instruction at the time of + * the machine check error. + */ + if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) { + m->ip = regs->ip; + m->cs = regs->cs; + + /* + * When in VM86 mode make the cs look like ring 3 + * always. This is a lie, but it's better than passing + * the additional vm86 bit around everywhere. + */ + if (v8086_mode(regs)) + m->cs |= 3; + } + /* Use accurate RIP reporting if available. */ + if (mca_cfg.rip_msr) + m->ip = mce_rdmsrl(mca_cfg.rip_msr); + } +} + +/* * Simple lockless ring to communicate PFNs from the exception handler with the * process context work function. This is vastly simplified because there's * only a single reader and a single writer. @@ -436,54 +518,24 @@ static int mce_ring_add(unsigned long pfn) int mce_available(struct cpuinfo_x86 *c) { - if (mce_disabled) + if (mca_cfg.disabled) return 0; return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); } static void mce_schedule_work(void) { - if (!mce_ring_empty()) { - struct work_struct *work = &__get_cpu_var(mce_work); - if (!work_pending(work)) - schedule_work(work); - } + if (!mce_ring_empty()) + schedule_work(&__get_cpu_var(mce_work)); } -/* - * Get the address of the instruction at the time of the machine check - * error. - */ -static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) -{ - - if (regs && (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV))) { - m->ip = regs->ip; - m->cs = regs->cs; - } else { - m->ip = 0; - m->cs = 0; - } - if (rip_msr) - m->ip = mce_rdmsrl(rip_msr); -} +DEFINE_PER_CPU(struct irq_work, mce_irq_work); -#ifdef CONFIG_X86_LOCAL_APIC -/* - * Called after interrupts have been reenabled again - * when a MCE happened during an interrupts off region - * in the kernel. - */ -asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs) +static void mce_irq_work_cb(struct irq_work *entry) { - ack_APIC_irq(); - exit_idle(); - irq_enter(); mce_notify_irq(); mce_schedule_work(); - irq_exit(); } -#endif static void mce_report_event(struct pt_regs *regs) { @@ -499,29 +551,28 @@ static void mce_report_event(struct pt_regs *regs) return; } -#ifdef CONFIG_X86_LOCAL_APIC - /* - * Without APIC do not notify. The event will be picked - * up eventually. - */ - if (!cpu_has_apic) - return; + irq_work_queue(&__get_cpu_var(mce_irq_work)); +} - /* - * When interrupts are disabled we cannot use - * kernel services safely. Trigger an self interrupt - * through the APIC to instead do the notification - * after interrupts are reenabled again. - */ - apic->send_IPI_self(MCE_SELF_VECTOR); +/* + * Read ADDR and MISC registers. + */ +static void mce_read_aux(struct mce *m, int i) +{ + if (m->status & MCI_STATUS_MISCV) + m->misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i)); + if (m->status & MCI_STATUS_ADDRV) { + m->addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i)); - /* - * Wait for idle afterwards again so that we don't leave the - * APIC in a non idle state because the normal APIC writes - * cannot exclude us. - */ - apic_wait_icr_idle(); -#endif + /* + * Mask the reported address by the reported granularity. + */ + if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) { + u8 shift = MCI_MISC_ADDR_LSB(m->misc); + m->addr >>= shift; + m->addr <<= shift; + } + } } DEFINE_PER_CPU(unsigned, mce_poll_count); @@ -546,12 +597,11 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) struct mce m; int i; - percpu_inc(mce_poll_count); + this_cpu_inc(mce_poll_count); - mce_setup(&m); + mce_gather_info(&m, NULL); - m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); - for (i = 0; i < banks; i++) { + for (i = 0; i < mca_cfg.banks; i++) { if (!mce_banks[i].ctl || !test_bit(i, *b)) continue; @@ -565,6 +615,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) if (!(m.status & MCI_STATUS_VAL)) continue; + this_cpu_write(mce_polled_error, 1); /* * Uncorrected or signalled events are handled by the exception * handler when it is enabled, so don't process those here. @@ -572,13 +623,10 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) * TBD do the same check for MCI_STATUS_EN here? */ if (!(flags & MCP_UC) && - (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC))) + (m.status & (mca_cfg.ser ? MCI_STATUS_S : MCI_STATUS_UC))) continue; - if (m.status & MCI_STATUS_MISCV) - m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i)); - if (m.status & MCI_STATUS_ADDRV) - m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i)); + mce_read_aux(&m, i); if (!(flags & MCP_TIMESTAMP)) m.tsc = 0; @@ -586,11 +634,8 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) * Don't get the IP here because it's unlikely to * have anything to do with the actual error location. */ - if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) { + if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce) mce_log(&m); - atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, &m); - add_taint(TAINT_MACHINE_CHECK); - } /* * Clear state for this bank. @@ -611,16 +656,22 @@ EXPORT_SYMBOL_GPL(machine_check_poll); * Do a quick check if any of the events requires a panic. * This decides if we keep the events around or clear them. */ -static int mce_no_way_out(struct mce *m, char **msg) +static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp, + struct pt_regs *regs) { - int i; + int i, ret = 0; - for (i = 0; i < banks; i++) { + for (i = 0; i < mca_cfg.banks; i++) { m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); - if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY) - return 1; + if (m->status & MCI_STATUS_VAL) { + __set_bit(i, validp); + if (quirk_no_way_out) + quirk_no_way_out(i, m, regs); + } + if (mce_severity(m, mca_cfg.tolerant, msg) >= MCE_PANIC_SEVERITY) + ret = 1; } - return 0; + return ret; } /* @@ -648,11 +699,10 @@ static int mce_timed_out(u64 *t) rmb(); if (atomic_read(&mce_paniced)) wait_for_panic(); - if (!monarch_timeout) + if (!mca_cfg.monarch_timeout) goto out; if ((s64)*t < SPINUNIT) { - /* CHECKME: Make panic default for 1 too? */ - if (tolerant < 1) + if (mca_cfg.tolerant <= 1) mce_panic("Timeout synchronizing machine check over CPUs", NULL, NULL); cpu_missing = 1; @@ -702,7 +752,8 @@ static void mce_reign(void) * Grade the severity of the errors of all the CPUs. */ for_each_possible_cpu(cpu) { - int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant, + int severity = mce_severity(&per_cpu(mces_seen, cpu), + mca_cfg.tolerant, &nmsg); if (severity > global_worst) { msg = nmsg; @@ -716,7 +767,7 @@ static void mce_reign(void) * This dumps all the mces in the log buffer and stops the * other CPUs. */ - if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3) + if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) mce_panic("Fatal Machine check", m, msg); /* @@ -729,7 +780,7 @@ static void mce_reign(void) * No machine check event found. Must be some external * source or one CPU is hung. Panic. */ - if (global_worst <= MCE_KEEP_SEVERITY && tolerant < 3) + if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3) mce_panic("Machine check from unknown source", NULL, NULL); /* @@ -753,7 +804,7 @@ static int mce_start(int *no_way_out) { int order; int cpus = num_online_cpus(); - u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC; + u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC; if (!timeout) return -1; @@ -817,7 +868,7 @@ static int mce_start(int *no_way_out) static int mce_end(int order) { int ret = -1; - u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC; + u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC; if (!timeout) goto reset; @@ -881,15 +932,15 @@ reset: * Check if the address reported by the CPU is in a format we can parse. * It would be possible to add code for most other cases, but all would * be somewhat complicated (e.g. segment offset would require an instruction - * parser). So only support physical addresses upto page granuality for now. + * parser). So only support physical addresses up to page granuality for now. */ static int mce_usable_address(struct mce *m) { if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV)) return 0; - if ((m->misc & 0x3f) > PAGE_SHIFT) + if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT) return 0; - if (((m->misc >> 6) & 7) != MCM_ADDR_PHYS) + if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS) return 0; return 1; } @@ -898,13 +949,58 @@ static void mce_clear_state(unsigned long *toclear) { int i; - for (i = 0; i < banks; i++) { + for (i = 0; i < mca_cfg.banks; i++) { if (test_bit(i, toclear)) mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0); } } /* + * Need to save faulting physical address associated with a process + * in the machine check handler some place where we can grab it back + * later in mce_notify_process() + */ +#define MCE_INFO_MAX 16 + +struct mce_info { + atomic_t inuse; + struct task_struct *t; + __u64 paddr; + int restartable; +} mce_info[MCE_INFO_MAX]; + +static void mce_save_info(__u64 addr, int c) +{ + struct mce_info *mi; + + for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) { + if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) { + mi->t = current; + mi->paddr = addr; + mi->restartable = c; + return; + } + } + + mce_panic("Too many concurrent recoverable errors", NULL, NULL); +} + +static struct mce_info *mce_find_info(void) +{ + struct mce_info *mi; + + for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) + if (atomic_read(&mi->inuse) && mi->t == current) + return mi; + return NULL; +} + +static void mce_clear_info(struct mce_info *mi) +{ + atomic_set(&mi->inuse, 0); +} + +/* * The actual machine check handler. This only handles real * exceptions when something got corrupted coming in through int 18. * @@ -918,6 +1014,7 @@ static void mce_clear_state(unsigned long *toclear) */ void do_machine_check(struct pt_regs *regs, long error_code) { + struct mca_config *cfg = &mca_cfg; struct mce m, *final; int i; int worst = 0; @@ -929,7 +1026,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) int order; /* * If no_way_out gets set, there is no safe way to recover from this - * MCE. If tolerant is cranked up, we'll try anyway. + * MCE. If mca_cfg.tolerant is cranked up, we'll try anyway. */ int no_way_out = 0; /* @@ -938,30 +1035,28 @@ void do_machine_check(struct pt_regs *regs, long error_code) */ int kill_it = 0; DECLARE_BITMAP(toclear, MAX_NR_BANKS); + DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); char *msg = "Unknown"; - atomic_inc(&mce_entry); + this_cpu_inc(mce_exception_count); - percpu_inc(mce_exception_count); - - if (notify_die(DIE_NMI, "machine check", regs, error_code, - 18, SIGKILL) == NOTIFY_STOP) - goto out; - if (!banks) + if (!cfg->banks) goto out; - mce_setup(&m); + mce_gather_info(&m, regs); - m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); final = &__get_cpu_var(mces_seen); *final = m; - no_way_out = mce_no_way_out(&m, &msg); + memset(valid_banks, 0, sizeof(valid_banks)); + no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs); barrier(); /* - * When no restart IP must always kill or panic. + * When no restart IP might need to kill or panic. + * Assume the worst for now, but if we find the + * severity is MCE_AR_SEVERITY we have other options. */ if (!(m.mcgstatus & MCG_STATUS_RIPV)) kill_it = 1; @@ -972,8 +1067,10 @@ void do_machine_check(struct pt_regs *regs, long error_code) * because the first one to see it will clear it. */ order = mce_start(&no_way_out); - for (i = 0; i < banks; i++) { + for (i = 0; i < cfg->banks; i++) { __clear_bit(i, toclear); + if (!test_bit(i, valid_banks)) + continue; if (!mce_banks[i].ctl) continue; @@ -989,16 +1086,16 @@ void do_machine_check(struct pt_regs *regs, long error_code) * Non uncorrected or non signaled errors are handled by * machine_check_poll. Leave them alone, unless this panics. */ - if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) && + if (!(m.status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) && !no_way_out) continue; /* * Set taint even when machine check was not enabled. */ - add_taint(TAINT_MACHINE_CHECK); + add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); - severity = mce_severity(&m, tolerant, NULL); + severity = mce_severity(&m, cfg->tolerant, NULL); /* * When machine check was for corrected handler don't touch, @@ -1015,28 +1112,18 @@ void do_machine_check(struct pt_regs *regs, long error_code) continue; } - /* - * Kill on action required. - */ - if (severity == MCE_AR_SEVERITY) - kill_it = 1; - - if (m.status & MCI_STATUS_MISCV) - m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i)); - if (m.status & MCI_STATUS_ADDRV) - m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i)); + mce_read_aux(&m, i); /* * Action optional error. Queue address for later processing. * When the ring overflows we just ignore the AO error. * RED-PEN add some logging mechanism when * usable_address or mce_add_ring fails. - * RED-PEN don't ignore overflow for tolerant == 0 + * RED-PEN don't ignore overflow for mca_cfg.tolerant == 0 */ if (severity == MCE_AO_SEVERITY && mce_usable_address(&m)) mce_ring_add(m.addr >> PAGE_SHIFT); - mce_get_rip(&m, regs); mce_log(&m); if (severity > worst) { @@ -1045,6 +1132,9 @@ void do_machine_check(struct pt_regs *regs, long error_code) } } + /* mce_clear_state will clear *final, save locally for use later */ + m = *final; + if (!no_way_out) mce_clear_state(toclear); @@ -1056,65 +1146,91 @@ void do_machine_check(struct pt_regs *regs, long error_code) no_way_out = worst >= MCE_PANIC_SEVERITY; /* - * If we have decided that we just CAN'T continue, and the user - * has not set tolerant to an insane level, give up and die. - * - * This is mainly used in the case when the system doesn't - * support MCE broadcasting or it has been disabled. + * At insane "tolerant" levels we take no action. Otherwise + * we only die if we have no other choice. For less serious + * issues we try to recover, or limit damage to the current + * process. */ - if (no_way_out && tolerant < 3) - mce_panic("Fatal machine check on current CPU", final, msg); - - /* - * If the error seems to be unrecoverable, something should be - * done. Try to kill as little as possible. If we can kill just - * one task, do that. If the user has set the tolerance very - * high, don't try to do anything at all. - */ - - if (kill_it && tolerant < 3) - force_sig(SIGBUS, current); - - /* notify userspace ASAP */ - set_thread_flag(TIF_MCE_NOTIFY); + if (cfg->tolerant < 3) { + if (no_way_out) + mce_panic("Fatal machine check on current CPU", &m, msg); + if (worst == MCE_AR_SEVERITY) { + /* schedule action before return to userland */ + mce_save_info(m.addr, m.mcgstatus & MCG_STATUS_RIPV); + set_thread_flag(TIF_MCE_NOTIFY); + } else if (kill_it) { + force_sig(SIGBUS, current); + } + } if (worst > 0) mce_report_event(regs); mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); out: - atomic_dec(&mce_entry); sync_core(); } EXPORT_SYMBOL_GPL(do_machine_check); -/* dummy to break dependency. actual code is in mm/memory-failure.c */ -void __attribute__((weak)) memory_failure(unsigned long pfn, int vector) +#ifndef CONFIG_MEMORY_FAILURE +int memory_failure(unsigned long pfn, int vector, int flags) { - printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn); + /* mce_severity() should not hand us an ACTION_REQUIRED error */ + BUG_ON(flags & MF_ACTION_REQUIRED); + pr_err("Uncorrected memory error in page 0x%lx ignored\n" + "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n", + pfn); + + return 0; } +#endif /* - * Called after mce notification in process context. This code - * is allowed to sleep. Call the high level VM handler to process - * any corrupted pages. - * Assume that the work queue code only calls this one at a time - * per CPU. - * Note we don't disable preemption, so this code might run on the wrong - * CPU. In this case the event is picked up by the scheduled work queue. - * This is merely a fast path to expedite processing in some common - * cases. + * Called in process context that interrupted by MCE and marked with + * TIF_MCE_NOTIFY, just before returning to erroneous userland. + * This code is allowed to sleep. + * Attempt possible recovery such as calling the high level VM handler to + * process any corrupted pages, and kill/signal current process if required. + * Action required errors are handled here. */ void mce_notify_process(void) { unsigned long pfn; - mce_notify_irq(); - while (mce_ring_get(&pfn)) - memory_failure(pfn, MCE_VECTOR); + struct mce_info *mi = mce_find_info(); + int flags = MF_ACTION_REQUIRED; + + if (!mi) + mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL); + pfn = mi->paddr >> PAGE_SHIFT; + + clear_thread_flag(TIF_MCE_NOTIFY); + + pr_err("Uncorrected hardware memory error in user-access at %llx", + mi->paddr); + /* + * We must call memory_failure() here even if the current process is + * doomed. We still need to mark the page as poisoned and alert any + * other users of the page. + */ + if (!mi->restartable) + flags |= MF_MUST_KILL; + if (memory_failure(pfn, MCE_VECTOR, flags) < 0) { + pr_err("Memory error not recovered"); + force_sig(SIGBUS, current); + } + mce_clear_info(mi); } +/* + * Action optional processing happens here (picking up + * from the list of faulting pages that do_machine_check() + * placed into the "ring"). + */ static void mce_process_work(struct work_struct *dummy) { - mce_notify_process(); + unsigned long pfn; + + while (mce_ring_get(&pfn)) + memory_failure(pfn, MCE_VECTOR, 0); } #ifdef CONFIG_X86_MCE_INTEL @@ -1147,35 +1263,88 @@ void mce_log_therm_throt_event(__u64 status) * poller finds an MCE, poll 2x faster. When the poller finds no more * errors, poll 2x slower (up to check_interval seconds). */ -static int check_interval = 5 * 60; /* 5 minutes */ +static unsigned long check_interval = 5 * 60; /* 5 minutes */ -static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */ +static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */ static DEFINE_PER_CPU(struct timer_list, mce_timer); -static void mce_start_timer(unsigned long data) +static unsigned long mce_adjust_timer_default(unsigned long interval) { - struct timer_list *t = &per_cpu(mce_timer, data); - int *n; + return interval; +} + +static unsigned long (*mce_adjust_timer)(unsigned long interval) = + mce_adjust_timer_default; + +static int cmc_error_seen(void) +{ + unsigned long *v = &__get_cpu_var(mce_polled_error); + + return test_and_clear_bit(0, v); +} + +static void mce_timer_fn(unsigned long data) +{ + struct timer_list *t = &__get_cpu_var(mce_timer); + unsigned long iv; + int notify; WARN_ON(smp_processor_id() != data); - if (mce_available(¤t_cpu_data)) { + if (mce_available(__this_cpu_ptr(&cpu_info))) { machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_poll_banks)); + mce_intel_cmci_poll(); } /* * Alert userspace if needed. If we logged an MCE, reduce the * polling interval, otherwise increase the polling interval. */ - n = &__get_cpu_var(mce_next_interval); - if (mce_notify_irq()) - *n = max(*n/2, HZ/100); - else - *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ)); + iv = __this_cpu_read(mce_next_interval); + notify = mce_notify_irq(); + notify |= cmc_error_seen(); + if (notify) { + iv = max(iv / 2, (unsigned long) HZ/100); + } else { + iv = min(iv * 2, round_jiffies_relative(check_interval * HZ)); + iv = mce_adjust_timer(iv); + } + __this_cpu_write(mce_next_interval, iv); + /* Might have become 0 after CMCI storm subsided */ + if (iv) { + t->expires = jiffies + iv; + add_timer_on(t, smp_processor_id()); + } +} + +/* + * Ensure that the timer is firing in @interval from now. + */ +void mce_timer_kick(unsigned long interval) +{ + struct timer_list *t = &__get_cpu_var(mce_timer); + unsigned long when = jiffies + interval; + unsigned long iv = __this_cpu_read(mce_next_interval); - t->expires = jiffies + *n; - add_timer_on(t, smp_processor_id()); + if (timer_pending(t)) { + if (time_before(when, t->expires)) + mod_timer_pinned(t, when); + } else { + t->expires = round_jiffies(when); + add_timer_on(t, smp_processor_id()); + } + if (interval < iv) + __this_cpu_write(mce_next_interval, interval); +} + +/* Must not be called in IRQ context where del_timer_sync() can deadlock */ +static void mce_timer_delete_all(void) +{ + int cpu; + + for_each_online_cpu(cpu) + del_timer_sync(&per_cpu(mce_timer, cpu)); } static void mce_do_trigger(struct work_struct *work) @@ -1195,17 +1364,11 @@ int mce_notify_irq(void) /* Not more than two messages every minute */ static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); - clear_thread_flag(TIF_MCE_NOTIFY); - if (test_and_clear_bit(0, &mce_need_notify)) { - wake_up_interruptible(&mce_wait); + /* wake processes polling /dev/mcelog */ + wake_up_interruptible(&mce_chrdev_wait); - /* - * There is no risk of missing notifications because - * work_pending is always cleared before the function is - * executed. - */ - if (mce_helper[0] && !work_pending(&mce_trigger_work)) + if (mce_helper[0]) schedule_work(&mce_trigger_work); if (__ratelimit(&ratelimit)) @@ -1217,14 +1380,16 @@ int mce_notify_irq(void) } EXPORT_SYMBOL_GPL(mce_notify_irq); -static int __cpuinit __mcheck_cpu_mce_banks_init(void) +static int __mcheck_cpu_mce_banks_init(void) { int i; + u8 num_banks = mca_cfg.banks; - mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL); + mce_banks = kzalloc(num_banks * sizeof(struct mce_bank), GFP_KERNEL); if (!mce_banks) return -ENOMEM; - for (i = 0; i < banks; i++) { + + for (i = 0; i < num_banks; i++) { struct mce_bank *b = &mce_banks[i]; b->ctl = -1ULL; @@ -1236,7 +1401,7 @@ static int __cpuinit __mcheck_cpu_mce_banks_init(void) /* * Initialize Machine Checks for a CPU. */ -static int __cpuinit __mcheck_cpu_cap_init(void) +static int __mcheck_cpu_cap_init(void) { unsigned b; u64 cap; @@ -1244,19 +1409,19 @@ static int __cpuinit __mcheck_cpu_cap_init(void) rdmsrl(MSR_IA32_MCG_CAP, cap); b = cap & MCG_BANKCNT_MASK; - if (!banks) - printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b); + if (!mca_cfg.banks) + pr_info("CPU supports %d MCE banks\n", b); if (b > MAX_NR_BANKS) { - printk(KERN_WARNING - "MCE: Using only %u machine check banks out of %u\n", + pr_warn("Using only %u machine check banks out of %u\n", MAX_NR_BANKS, b); b = MAX_NR_BANKS; } /* Don't support asymmetric configurations today */ - WARN_ON(banks != 0 && b != banks); - banks = b; + WARN_ON(mca_cfg.banks != 0 && b != mca_cfg.banks); + mca_cfg.banks = b; + if (!mce_banks) { int err = __mcheck_cpu_mce_banks_init(); @@ -1266,25 +1431,29 @@ static int __cpuinit __mcheck_cpu_cap_init(void) /* Use accurate RIP reporting if available. */ if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9) - rip_msr = MSR_IA32_MCG_EIP; + mca_cfg.rip_msr = MSR_IA32_MCG_EIP; if (cap & MCG_SER_P) - mce_ser = 1; + mca_cfg.ser = true; return 0; } static void __mcheck_cpu_init_generic(void) { + enum mcp_flags m_fl = 0; mce_banks_t all_banks; u64 cap; int i; + if (!mca_cfg.bootlog) + m_fl = MCP_DONTLOG; + /* * Log the machine checks left over from the previous reset. */ bitmap_fill(all_banks, MAX_NR_BANKS); - machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks); + machine_check_poll(MCP_UC | m_fl, &all_banks); set_in_cr4(X86_CR4_MCE); @@ -1292,7 +1461,7 @@ static void __mcheck_cpu_init_generic(void) if (cap & MCG_CTL_P) wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); - for (i = 0; i < banks; i++) { + for (i = 0; i < mca_cfg.banks; i++) { struct mce_bank *b = &mce_banks[i]; if (!b->init) @@ -1302,17 +1471,47 @@ static void __mcheck_cpu_init_generic(void) } } +/* + * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and + * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM + * Vol 3B Table 15-20). But this confuses both the code that determines + * whether the machine check occurred in kernel or user mode, and also + * the severity assessment code. Pretend that EIPV was set, and take the + * ip/cs values from the pt_regs that mce_gather_info() ignored earlier. + */ +static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs) +{ + if (bank != 0) + return; + if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0) + return; + if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC| + MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV| + MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR| + MCACOD)) != + (MCI_STATUS_UC|MCI_STATUS_EN| + MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S| + MCI_STATUS_AR|MCACOD_INSTR)) + return; + + m->mcgstatus |= MCG_STATUS_EIPV; + m->ip = regs->ip; + m->cs = regs->cs; +} + /* Add per CPU specific workarounds here */ -static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) +static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) { + struct mca_config *cfg = &mca_cfg; + if (c->x86_vendor == X86_VENDOR_UNKNOWN) { - pr_info("MCE: unknown CPU type - not enabling MCE support.\n"); + pr_info("unknown CPU type - not enabling MCE support\n"); return -EOPNOTSUPP; } /* This should be disabled by the BIOS, but isn't always */ if (c->x86_vendor == X86_VENDOR_AMD) { - if (c->x86 == 15 && banks > 4) { + if (c->x86 == 15 && cfg->banks > 4) { /* * disable GART TBL walk error reporting, which * trips off incorrectly with the IOMMU & 3ware @@ -1320,19 +1519,56 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) */ clear_bit(10, (unsigned long *)&mce_banks[4].ctl); } - if (c->x86 <= 17 && mce_bootlog < 0) { + if (c->x86 <= 17 && cfg->bootlog < 0) { /* * Lots of broken BIOS around that don't clear them * by default and leave crap in there. Don't log: */ - mce_bootlog = 0; + cfg->bootlog = 0; } /* * Various K7s with broken bank 0 around. Always disable * by default. */ - if (c->x86 == 6 && banks > 0) + if (c->x86 == 6 && cfg->banks > 0) mce_banks[0].ctl = 0; + + /* + * Turn off MC4_MISC thresholding banks on those models since + * they're not supported there. + */ + if (c->x86 == 0x15 && + (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) { + int i; + u64 val, hwcr; + bool need_toggle; + u32 msrs[] = { + 0x00000413, /* MC4_MISC0 */ + 0xc0000408, /* MC4_MISC1 */ + }; + + rdmsrl(MSR_K7_HWCR, hwcr); + + /* McStatusWrEn has to be set */ + need_toggle = !(hwcr & BIT(18)); + + if (need_toggle) + wrmsrl(MSR_K7_HWCR, hwcr | BIT(18)); + + for (i = 0; i < ARRAY_SIZE(msrs); i++) { + rdmsrl(msrs[i], val); + + /* CntP bit set? */ + if (val & BIT_64(62)) { + val &= ~BIT_64(62); + wrmsrl(msrs[i], val); + } + } + + /* restore old settings */ + if (need_toggle) + wrmsrl(MSR_K7_HWCR, hwcr); + } } if (c->x86_vendor == X86_VENDOR_INTEL) { @@ -1345,7 +1581,7 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) * valid event later, merely don't write CTL0. */ - if (c->x86 == 6 && c->x86_model < 0x1A && banks > 0) + if (c->x86 == 6 && c->x86_model < 0x1A && cfg->banks > 0) mce_banks[0].init = 0; /* @@ -1353,36 +1589,44 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) * synchronization with a one second timeout. */ if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) && - monarch_timeout < 0) - monarch_timeout = USEC_PER_SEC; + cfg->monarch_timeout < 0) + cfg->monarch_timeout = USEC_PER_SEC; /* * There are also broken BIOSes on some Pentium M and * earlier systems: */ - if (c->x86 == 6 && c->x86_model <= 13 && mce_bootlog < 0) - mce_bootlog = 0; + if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0) + cfg->bootlog = 0; + + if (c->x86 == 6 && c->x86_model == 45) + quirk_no_way_out = quirk_sandybridge_ifu; } - if (monarch_timeout < 0) - monarch_timeout = 0; - if (mce_bootlog != 0) - mce_panic_timeout = 30; + if (cfg->monarch_timeout < 0) + cfg->monarch_timeout = 0; + if (cfg->bootlog != 0) + cfg->panic_timeout = 30; return 0; } -static void __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) +static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) { if (c->x86 != 5) - return; + return 0; + switch (c->x86_vendor) { case X86_VENDOR_INTEL: intel_p5_mcheck_init(c); + return 1; break; case X86_VENDOR_CENTAUR: winchip_mcheck_init(c); + return 1; break; } + + return 0; } static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) @@ -1390,6 +1634,7 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) switch (c->x86_vendor) { case X86_VENDOR_INTEL: mce_intel_feature_init(c); + mce_adjust_timer = mce_intel_adjust_timer; break; case X86_VENDOR_AMD: mce_amd_feature_init(c); @@ -1399,27 +1644,32 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) } } -static void __mcheck_cpu_init_timer(void) +static void mce_start_timer(unsigned int cpu, struct timer_list *t) { - struct timer_list *t = &__get_cpu_var(mce_timer); - int *n = &__get_cpu_var(mce_next_interval); - - setup_timer(t, mce_start_timer, smp_processor_id()); + unsigned long iv = check_interval * HZ; - if (mce_ignore_ce) + if (mca_cfg.ignore_ce || !iv) return; - *n = check_interval * HZ; - if (!*n) - return; - t->expires = round_jiffies(jiffies + *n); - add_timer_on(t, smp_processor_id()); + per_cpu(mce_next_interval, cpu) = iv; + + t->expires = round_jiffies(jiffies + iv); + add_timer_on(t, cpu); +} + +static void __mcheck_cpu_init_timer(void) +{ + struct timer_list *t = &__get_cpu_var(mce_timer); + unsigned int cpu = smp_processor_id(); + + setup_timer(t, mce_timer_fn, cpu); + mce_start_timer(cpu, t); } /* Handle unconfigured int18 (should never happen) */ static void unexpected_machine_check(struct pt_regs *regs, long error_code) { - printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", + pr_err("CPU#%d: Unexpected int18 (Machine Check)\n", smp_processor_id()); } @@ -1431,18 +1681,19 @@ void (*machine_check_vector)(struct pt_regs *, long error_code) = * Called for each booted CPU to set up machine checks. * Must be called with preempt off: */ -void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c) +void mcheck_cpu_init(struct cpuinfo_x86 *c) { - if (mce_disabled) + if (mca_cfg.disabled) return; - __mcheck_cpu_ancient_init(c); + if (__mcheck_cpu_ancient_init(c)) + return; if (!mce_available(c)) return; if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) { - mce_disabled = 1; + mca_cfg.disabled = true; return; } @@ -1452,44 +1703,45 @@ void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c) __mcheck_cpu_init_vendor(c); __mcheck_cpu_init_timer(); INIT_WORK(&__get_cpu_var(mce_work), mce_process_work); - + init_irq_work(&__get_cpu_var(mce_irq_work), &mce_irq_work_cb); } /* - * Character device to read and clear the MCE log. + * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log. */ -static DEFINE_SPINLOCK(mce_state_lock); -static int open_count; /* #times opened */ -static int open_exclu; /* already open exclusive? */ +static DEFINE_SPINLOCK(mce_chrdev_state_lock); +static int mce_chrdev_open_count; /* #times opened */ +static int mce_chrdev_open_exclu; /* already open exclusive? */ -static int mce_open(struct inode *inode, struct file *file) +static int mce_chrdev_open(struct inode *inode, struct file *file) { - spin_lock(&mce_state_lock); + spin_lock(&mce_chrdev_state_lock); - if (open_exclu || (open_count && (file->f_flags & O_EXCL))) { - spin_unlock(&mce_state_lock); + if (mce_chrdev_open_exclu || + (mce_chrdev_open_count && (file->f_flags & O_EXCL))) { + spin_unlock(&mce_chrdev_state_lock); return -EBUSY; } if (file->f_flags & O_EXCL) - open_exclu = 1; - open_count++; + mce_chrdev_open_exclu = 1; + mce_chrdev_open_count++; - spin_unlock(&mce_state_lock); + spin_unlock(&mce_chrdev_state_lock); return nonseekable_open(inode, file); } -static int mce_release(struct inode *inode, struct file *file) +static int mce_chrdev_release(struct inode *inode, struct file *file) { - spin_lock(&mce_state_lock); + spin_lock(&mce_chrdev_state_lock); - open_count--; - open_exclu = 0; + mce_chrdev_open_count--; + mce_chrdev_open_exclu = 0; - spin_unlock(&mce_state_lock); + spin_unlock(&mce_chrdev_state_lock); return 0; } @@ -1517,6 +1769,12 @@ static int __mce_read_apei(char __user **ubuf, size_t usize) /* Error or no more MCE record */ if (rc <= 0) { mce_apei_read_done = 1; + /* + * When ERST is disabled, mce_chrdev_read() should return + * "no record" instead of "no device." + */ + if (rc == -ENODEV) + return 0; return rc; } rc = -EFAULT; @@ -1538,8 +1796,8 @@ static int __mce_read_apei(char __user **ubuf, size_t usize) return 0; } -static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, - loff_t *off) +static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf, + size_t usize, loff_t *off) { char __user *buf = ubuf; unsigned long *cpu_tsc; @@ -1550,7 +1808,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, if (!cpu_tsc) return -ENOMEM; - mutex_lock(&mce_read_mutex); + mutex_lock(&mce_chrdev_read_mutex); if (!mce_apei_read_done) { err = __mce_read_apei(&buf, usize); @@ -1570,19 +1828,18 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, do { for (i = prev; i < next; i++) { unsigned long start = jiffies; + struct mce *m = &mcelog.entry[i]; - while (!mcelog.entry[i].finished) { + while (!m->finished) { if (time_after_eq(jiffies, start + 2)) { - memset(mcelog.entry + i, 0, - sizeof(struct mce)); + memset(m, 0, sizeof(*m)); goto timeout; } cpu_relax(); } smp_rmb(); - err |= copy_to_user(buf, mcelog.entry + i, - sizeof(struct mce)); - buf += sizeof(struct mce); + err |= copy_to_user(buf, m, sizeof(*m)); + buf += sizeof(*m); timeout: ; } @@ -1602,13 +1859,13 @@ timeout: on_each_cpu(collect_tscs, cpu_tsc, 1); for (i = next; i < MCE_LOG_LEN; i++) { - if (mcelog.entry[i].finished && - mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) { - err |= copy_to_user(buf, mcelog.entry+i, - sizeof(struct mce)); + struct mce *m = &mcelog.entry[i]; + + if (m->finished && m->tsc < cpu_tsc[m->cpu]) { + err |= copy_to_user(buf, m, sizeof(*m)); smp_rmb(); - buf += sizeof(struct mce); - memset(&mcelog.entry[i], 0, sizeof(struct mce)); + buf += sizeof(*m); + memset(m, 0, sizeof(*m)); } } @@ -1616,23 +1873,24 @@ timeout: err = -EFAULT; out: - mutex_unlock(&mce_read_mutex); + mutex_unlock(&mce_chrdev_read_mutex); kfree(cpu_tsc); return err ? err : buf - ubuf; } -static unsigned int mce_poll(struct file *file, poll_table *wait) +static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait) { - poll_wait(file, &mce_wait, wait); - if (rcu_dereference_check_mce(mcelog.next)) + poll_wait(file, &mce_chrdev_wait, wait); + if (rcu_access_index(mcelog.next)) return POLLIN | POLLRDNORM; if (!mce_apei_read_done && apei_check_mce()) return POLLIN | POLLRDNORM; return 0; } -static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg) +static long mce_chrdev_ioctl(struct file *f, unsigned int cmd, + unsigned long arg) { int __user *p = (int __user *)arg; @@ -1658,23 +1916,61 @@ static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg) } } -/* Modified in mce-inject.c, so not static or const */ -struct file_operations mce_chrdev_ops = { - .open = mce_open, - .release = mce_release, - .read = mce_read, - .poll = mce_poll, - .unlocked_ioctl = mce_ioctl, - .llseek = no_llseek, +static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf, + size_t usize, loff_t *off); + +void register_mce_write_callback(ssize_t (*fn)(struct file *filp, + const char __user *ubuf, + size_t usize, loff_t *off)) +{ + mce_write = fn; +} +EXPORT_SYMBOL_GPL(register_mce_write_callback); + +ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf, + size_t usize, loff_t *off) +{ + if (mce_write) + return mce_write(filp, ubuf, usize, off); + else + return -EINVAL; +} + +static const struct file_operations mce_chrdev_ops = { + .open = mce_chrdev_open, + .release = mce_chrdev_release, + .read = mce_chrdev_read, + .write = mce_chrdev_write, + .poll = mce_chrdev_poll, + .unlocked_ioctl = mce_chrdev_ioctl, + .llseek = no_llseek, }; -EXPORT_SYMBOL_GPL(mce_chrdev_ops); -static struct miscdevice mce_log_device = { +static struct miscdevice mce_chrdev_device = { MISC_MCELOG_MINOR, "mcelog", &mce_chrdev_ops, }; +static void __mce_disable_bank(void *arg) +{ + int bank = *((int *)arg); + __clear_bit(bank, __get_cpu_var(mce_poll_banks)); + cmci_disable_bank(bank); +} + +void mce_disable_bank(int bank) +{ + if (bank >= mca_cfg.banks) { + pr_warn(FW_BUG + "Ignoring request to disable invalid MCA bank %d.\n", + bank); + return; + } + set_bit(bank, mce_banks_ce_disabled); + on_each_cpu(__mce_disable_bank, &bank, 1); +} + /* * mce=off Disables machine check * mce=no_cmci Disables CMCI @@ -1685,9 +1981,12 @@ static struct miscdevice mce_log_device = { * check, or 0 to not wait * mce=bootlog Log MCEs from before booting. Disabled by default on AMD. * mce=nobootlog Don't log MCEs from before booting. + * mce=bios_cmci_threshold Don't program the CMCI threshold */ static int __init mcheck_enable(char *str) { + struct mca_config *cfg = &mca_cfg; + if (*str == 0) { enable_p5_mce(); return 1; @@ -1695,24 +1994,25 @@ static int __init mcheck_enable(char *str) if (*str == '=') str++; if (!strcmp(str, "off")) - mce_disabled = 1; + cfg->disabled = true; else if (!strcmp(str, "no_cmci")) - mce_cmci_disabled = 1; + cfg->cmci_disabled = true; else if (!strcmp(str, "dont_log_ce")) - mce_dont_log_ce = 1; + cfg->dont_log_ce = true; else if (!strcmp(str, "ignore_ce")) - mce_ignore_ce = 1; + cfg->ignore_ce = true; else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) - mce_bootlog = (str[0] == 'b'); + cfg->bootlog = (str[0] == 'b'); + else if (!strcmp(str, "bios_cmci_threshold")) + cfg->bios_cmci_threshold = true; else if (isdigit(str[0])) { - get_option(&str, &tolerant); + get_option(&str, &(cfg->tolerant)); if (*str == ',') { ++str; - get_option(&str, &monarch_timeout); + get_option(&str, &(cfg->monarch_timeout)); } } else { - printk(KERN_INFO "mce argument %s ignored. Please use /sys\n", - str); + pr_info("mce argument %s ignored. Please use /sys\n", str); return 0; } return 1; @@ -1721,15 +2021,13 @@ __setup("mce", mcheck_enable); int __init mcheck_init(void) { - atomic_notifier_chain_register(&x86_mce_decoder_chain, &mce_dec_nb); - mcheck_intel_therm_init(); return 0; } /* - * Sysfs support + * mce_syscore: PM support */ /* @@ -1740,7 +2038,7 @@ static int mce_disable_error_reporting(void) { int i; - for (i = 0; i < banks; i++) { + for (i = 0; i < mca_cfg.banks; i++) { struct mce_bank *b = &mce_banks[i]; if (b->init) @@ -1749,14 +2047,14 @@ static int mce_disable_error_reporting(void) return 0; } -static int mce_suspend(struct sys_device *dev, pm_message_t state) +static int mce_syscore_suspend(void) { return mce_disable_error_reporting(); } -static int mce_shutdown(struct sys_device *dev) +static void mce_syscore_shutdown(void) { - return mce_disable_error_reporting(); + mce_disable_error_reporting(); } /* @@ -1764,18 +2062,25 @@ static int mce_shutdown(struct sys_device *dev) * Only one CPU is active at this time, the others get re-added later using * CPU hotplug: */ -static int mce_resume(struct sys_device *dev) +static void mce_syscore_resume(void) { __mcheck_cpu_init_generic(); - __mcheck_cpu_init_vendor(¤t_cpu_data); - - return 0; + __mcheck_cpu_init_vendor(__this_cpu_ptr(&cpu_info)); } +static struct syscore_ops mce_syscore_ops = { + .suspend = mce_syscore_suspend, + .shutdown = mce_syscore_shutdown, + .resume = mce_syscore_resume, +}; + +/* + * mce_device: Sysfs support + */ + static void mce_cpu_restart(void *data) { - del_timer_sync(&__get_cpu_var(mce_timer)); - if (!mce_available(¤t_cpu_data)) + if (!mce_available(__this_cpu_ptr(&cpu_info))) return; __mcheck_cpu_init_generic(); __mcheck_cpu_init_timer(); @@ -1784,22 +2089,21 @@ static void mce_cpu_restart(void *data) /* Reinit MCEs after user configuration changes */ static void mce_restart(void) { + mce_timer_delete_all(); on_each_cpu(mce_cpu_restart, NULL, 1); } /* Toggle features for corrected errors */ -static void mce_disable_ce(void *all) +static void mce_disable_cmci(void *data) { - if (!mce_available(¤t_cpu_data)) + if (!mce_available(__this_cpu_ptr(&cpu_info))) return; - if (all) - del_timer_sync(&__get_cpu_var(mce_timer)); cmci_clear(); } static void mce_enable_ce(void *all) { - if (!mce_available(¤t_cpu_data)) + if (!mce_available(__this_cpu_ptr(&cpu_info))) return; cmci_reenable(); cmci_recheck(); @@ -1807,30 +2111,27 @@ static void mce_enable_ce(void *all) __mcheck_cpu_init_timer(); } -static struct sysdev_class mce_sysclass = { - .suspend = mce_suspend, - .shutdown = mce_shutdown, - .resume = mce_resume, +static struct bus_type mce_subsys = { .name = "machinecheck", + .dev_name = "machinecheck", }; -DEFINE_PER_CPU(struct sys_device, mce_dev); +DEFINE_PER_CPU(struct device *, mce_device); -__cpuinitdata void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); -static inline struct mce_bank *attr_to_bank(struct sysdev_attribute *attr) +static inline struct mce_bank *attr_to_bank(struct device_attribute *attr) { return container_of(attr, struct mce_bank, attr); } -static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, +static ssize_t show_bank(struct device *s, struct device_attribute *attr, char *buf) { return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl); } -static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, +static ssize_t set_bank(struct device *s, struct device_attribute *attr, const char *buf, size_t size) { u64 new; @@ -1845,14 +2146,14 @@ static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, } static ssize_t -show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf) +show_trigger(struct device *s, struct device_attribute *attr, char *buf) { strcpy(buf, mce_helper); strcat(buf, "\n"); return strlen(mce_helper) + 1; } -static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, +static ssize_t set_trigger(struct device *s, struct device_attribute *attr, const char *buf, size_t siz) { char *p; @@ -1867,8 +2168,8 @@ static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, return strlen(mce_helper) + !!p; } -static ssize_t set_ignore_ce(struct sys_device *s, - struct sysdev_attribute *attr, +static ssize_t set_ignore_ce(struct device *s, + struct device_attribute *attr, const char *buf, size_t size) { u64 new; @@ -1876,22 +2177,23 @@ static ssize_t set_ignore_ce(struct sys_device *s, if (strict_strtoull(buf, 0, &new) < 0) return -EINVAL; - if (mce_ignore_ce ^ !!new) { + if (mca_cfg.ignore_ce ^ !!new) { if (new) { /* disable ce features */ - on_each_cpu(mce_disable_ce, (void *)1, 1); - mce_ignore_ce = 1; + mce_timer_delete_all(); + on_each_cpu(mce_disable_cmci, NULL, 1); + mca_cfg.ignore_ce = true; } else { /* enable ce features */ - mce_ignore_ce = 0; + mca_cfg.ignore_ce = false; on_each_cpu(mce_enable_ce, (void *)1, 1); } } return size; } -static ssize_t set_cmci_disabled(struct sys_device *s, - struct sysdev_attribute *attr, +static ssize_t set_cmci_disabled(struct device *s, + struct device_attribute *attr, const char *buf, size_t size) { u64 new; @@ -1899,135 +2201,147 @@ static ssize_t set_cmci_disabled(struct sys_device *s, if (strict_strtoull(buf, 0, &new) < 0) return -EINVAL; - if (mce_cmci_disabled ^ !!new) { + if (mca_cfg.cmci_disabled ^ !!new) { if (new) { /* disable cmci */ - on_each_cpu(mce_disable_ce, NULL, 1); - mce_cmci_disabled = 1; + on_each_cpu(mce_disable_cmci, NULL, 1); + mca_cfg.cmci_disabled = true; } else { /* enable cmci */ - mce_cmci_disabled = 0; + mca_cfg.cmci_disabled = false; on_each_cpu(mce_enable_ce, NULL, 1); } } return size; } -static ssize_t store_int_with_restart(struct sys_device *s, - struct sysdev_attribute *attr, +static ssize_t store_int_with_restart(struct device *s, + struct device_attribute *attr, const char *buf, size_t size) { - ssize_t ret = sysdev_store_int(s, attr, buf, size); + ssize_t ret = device_store_int(s, attr, buf, size); mce_restart(); return ret; } -static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); -static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); -static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout); -static SYSDEV_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce); +static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger); +static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant); +static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout); +static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce); -static struct sysdev_ext_attribute attr_check_interval = { - _SYSDEV_ATTR(check_interval, 0644, sysdev_show_int, - store_int_with_restart), +static struct dev_ext_attribute dev_attr_check_interval = { + __ATTR(check_interval, 0644, device_show_int, store_int_with_restart), &check_interval }; -static struct sysdev_ext_attribute attr_ignore_ce = { - _SYSDEV_ATTR(ignore_ce, 0644, sysdev_show_int, set_ignore_ce), - &mce_ignore_ce +static struct dev_ext_attribute dev_attr_ignore_ce = { + __ATTR(ignore_ce, 0644, device_show_bool, set_ignore_ce), + &mca_cfg.ignore_ce }; -static struct sysdev_ext_attribute attr_cmci_disabled = { - _SYSDEV_ATTR(cmci_disabled, 0644, sysdev_show_int, set_cmci_disabled), - &mce_cmci_disabled +static struct dev_ext_attribute dev_attr_cmci_disabled = { + __ATTR(cmci_disabled, 0644, device_show_bool, set_cmci_disabled), + &mca_cfg.cmci_disabled }; -static struct sysdev_attribute *mce_attrs[] = { - &attr_tolerant.attr, - &attr_check_interval.attr, - &attr_trigger, - &attr_monarch_timeout.attr, - &attr_dont_log_ce.attr, - &attr_ignore_ce.attr, - &attr_cmci_disabled.attr, +static struct device_attribute *mce_device_attrs[] = { + &dev_attr_tolerant.attr, + &dev_attr_check_interval.attr, + &dev_attr_trigger, + &dev_attr_monarch_timeout.attr, + &dev_attr_dont_log_ce.attr, + &dev_attr_ignore_ce.attr, + &dev_attr_cmci_disabled.attr, NULL }; -static cpumask_var_t mce_dev_initialized; +static cpumask_var_t mce_device_initialized; + +static void mce_device_release(struct device *dev) +{ + kfree(dev); +} -/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */ -static __cpuinit int mce_create_device(unsigned int cpu) +/* Per cpu device init. All of the cpus still share the same ctrl bank: */ +static int mce_device_create(unsigned int cpu) { + struct device *dev; int err; int i, j; if (!mce_available(&boot_cpu_data)) return -EIO; - memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject)); - per_cpu(mce_dev, cpu).id = cpu; - per_cpu(mce_dev, cpu).cls = &mce_sysclass; + dev = kzalloc(sizeof *dev, GFP_KERNEL); + if (!dev) + return -ENOMEM; + dev->id = cpu; + dev->bus = &mce_subsys; + dev->release = &mce_device_release; - err = sysdev_register(&per_cpu(mce_dev, cpu)); - if (err) + err = device_register(dev); + if (err) { + put_device(dev); return err; + } - for (i = 0; mce_attrs[i]; i++) { - err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); + for (i = 0; mce_device_attrs[i]; i++) { + err = device_create_file(dev, mce_device_attrs[i]); if (err) goto error; } - for (j = 0; j < banks; j++) { - err = sysdev_create_file(&per_cpu(mce_dev, cpu), - &mce_banks[j].attr); + for (j = 0; j < mca_cfg.banks; j++) { + err = device_create_file(dev, &mce_banks[j].attr); if (err) goto error2; } - cpumask_set_cpu(cpu, mce_dev_initialized); + cpumask_set_cpu(cpu, mce_device_initialized); + per_cpu(mce_device, cpu) = dev; return 0; error2: while (--j >= 0) - sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr); + device_remove_file(dev, &mce_banks[j].attr); error: while (--i >= 0) - sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); + device_remove_file(dev, mce_device_attrs[i]); - sysdev_unregister(&per_cpu(mce_dev, cpu)); + device_unregister(dev); return err; } -static __cpuinit void mce_remove_device(unsigned int cpu) +static void mce_device_remove(unsigned int cpu) { + struct device *dev = per_cpu(mce_device, cpu); int i; - if (!cpumask_test_cpu(cpu, mce_dev_initialized)) + if (!cpumask_test_cpu(cpu, mce_device_initialized)) return; - for (i = 0; mce_attrs[i]; i++) - sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); + for (i = 0; mce_device_attrs[i]; i++) + device_remove_file(dev, mce_device_attrs[i]); - for (i = 0; i < banks; i++) - sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr); + for (i = 0; i < mca_cfg.banks; i++) + device_remove_file(dev, &mce_banks[i].attr); - sysdev_unregister(&per_cpu(mce_dev, cpu)); - cpumask_clear_cpu(cpu, mce_dev_initialized); + device_unregister(dev); + cpumask_clear_cpu(cpu, mce_device_initialized); + per_cpu(mce_device, cpu) = NULL; } /* Make sure there are no machine checks on offlined CPUs. */ -static void __cpuinit mce_disable_cpu(void *h) +static void mce_disable_cpu(void *h) { unsigned long action = *(unsigned long *)h; int i; - if (!mce_available(¤t_cpu_data)) + if (!mce_available(__this_cpu_ptr(&cpu_info))) return; if (!(action & CPU_TASKS_FROZEN)) cmci_clear(); - for (i = 0; i < banks; i++) { + for (i = 0; i < mca_cfg.banks; i++) { struct mce_bank *b = &mce_banks[i]; if (b->init) @@ -2035,17 +2349,17 @@ static void __cpuinit mce_disable_cpu(void *h) } } -static void __cpuinit mce_reenable_cpu(void *h) +static void mce_reenable_cpu(void *h) { unsigned long action = *(unsigned long *)h; int i; - if (!mce_available(¤t_cpu_data)) + if (!mce_available(__this_cpu_ptr(&cpu_info))) return; if (!(action & CPU_TASKS_FROZEN)) cmci_reenable(); - for (i = 0; i < banks; i++) { + for (i = 0; i < mca_cfg.banks; i++) { struct mce_bank *b = &mce_banks[i]; if (b->init) @@ -2054,48 +2368,43 @@ static void __cpuinit mce_reenable_cpu(void *h) } /* Get notified when a cpu comes on/off. Be hotplug friendly. */ -static int __cpuinit +static int mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; struct timer_list *t = &per_cpu(mce_timer, cpu); - switch (action) { + switch (action & ~CPU_TASKS_FROZEN) { case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - mce_create_device(cpu); + mce_device_create(cpu); if (threshold_cpu_callback) threshold_cpu_callback(action, cpu); break; case CPU_DEAD: - case CPU_DEAD_FROZEN: if (threshold_cpu_callback) threshold_cpu_callback(action, cpu); - mce_remove_device(cpu); + mce_device_remove(cpu); + mce_intel_hcpu_update(cpu); break; case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - del_timer_sync(t); smp_call_function_single(cpu, mce_disable_cpu, &action, 1); + del_timer_sync(t); break; case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: - if (!mce_ignore_ce && check_interval) { - t->expires = round_jiffies(jiffies + - __get_cpu_var(mce_next_interval)); - add_timer_on(t, cpu); - } smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); + mce_start_timer(cpu, t); break; - case CPU_POST_DEAD: + } + + if (action == CPU_POST_DEAD) { /* intentionally ignoring frozen here */ - cmci_rediscover(cpu); - break; + cmci_rediscover(); } + return NOTIFY_OK; } -static struct notifier_block mce_cpu_notifier __cpuinitdata = { +static struct notifier_block mce_cpu_notifier = { .notifier_call = mce_cpu_callback, }; @@ -2103,9 +2412,9 @@ static __init void mce_init_banks(void) { int i; - for (i = 0; i < banks; i++) { + for (i = 0; i < mca_cfg.banks; i++) { struct mce_bank *b = &mce_banks[i]; - struct sysdev_attribute *a = &b->attr; + struct device_attribute *a = &b->attr; sysfs_attr_init(&a->attr); a->attr.name = b->attrname; @@ -2122,37 +2431,78 @@ static __init int mcheck_init_device(void) int err; int i = 0; - if (!mce_available(&boot_cpu_data)) - return -EIO; + if (!mce_available(&boot_cpu_data)) { + err = -EIO; + goto err_out; + } - zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL); + if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) { + err = -ENOMEM; + goto err_out; + } mce_init_banks(); - err = sysdev_class_register(&mce_sysclass); + err = subsys_system_register(&mce_subsys, NULL); if (err) - return err; + goto err_out_mem; + cpu_notifier_register_begin(); for_each_online_cpu(i) { - err = mce_create_device(i); - if (err) - return err; + err = mce_device_create(i); + if (err) { + /* + * Register notifier anyway (and do not unreg it) so + * that we don't leave undeleted timers, see notifier + * callback above. + */ + __register_hotcpu_notifier(&mce_cpu_notifier); + cpu_notifier_register_done(); + goto err_device_create; + } } - register_hotcpu_notifier(&mce_cpu_notifier); - misc_register(&mce_log_device); + __register_hotcpu_notifier(&mce_cpu_notifier); + cpu_notifier_register_done(); + + register_syscore_ops(&mce_syscore_ops); + + /* register character device /dev/mcelog */ + err = misc_register(&mce_chrdev_device); + if (err) + goto err_register; + + return 0; + +err_register: + unregister_syscore_ops(&mce_syscore_ops); + +err_device_create: + /* + * We didn't keep track of which devices were created above, but + * even if we had, the set of online cpus might have changed. + * Play safe and remove for every possible cpu, since + * mce_device_remove() will do the right thing. + */ + for_each_possible_cpu(i) + mce_device_remove(i); + +err_out_mem: + free_cpumask_var(mce_device_initialized); + +err_out: + pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err); return err; } - -device_initcall(mcheck_init_device); +device_initcall_sync(mcheck_init_device); /* * Old style boot options parsing. Only for compatibility. */ static int __init mcheck_disable(char *str) { - mce_disabled = 1; + mca_cfg.disabled = true; return 1; } __setup("nomce", mcheck_disable); diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 80c482382d5..603df4f7464 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -1,15 +1,17 @@ /* - * (c) 2005, 2006 Advanced Micro Devices, Inc. + * (c) 2005-2012 Advanced Micro Devices, Inc. * Your use of this code is subject to the terms and conditions of the * GNU general public license version 2. See "COPYING" or * http://www.gnu.org/licenses/gpl.html * * Written by Jacob Shin - AMD, Inc. * - * Support : jacob.shin@amd.com + * Maintained by: Borislav Petkov <bp@alien8.de> * * April 2006 * - added support for AMD Family 0x10 processors + * May 2012 + * - major scrubbing * * All MC4_MISCi registers are shared between multi-cores */ @@ -17,7 +19,6 @@ #include <linux/notifier.h> #include <linux/kobject.h> #include <linux/percpu.h> -#include <linux/sysdev.h> #include <linux/errno.h> #include <linux/sched.h> #include <linux/sysfs.h> @@ -26,14 +27,12 @@ #include <linux/cpu.h> #include <linux/smp.h> +#include <asm/amd_nb.h> #include <asm/apic.h> #include <asm/idle.h> #include <asm/mce.h> #include <asm/msr.h> -#define PFX "mce_threshold: " -#define VERSION "version 1.1.1" -#define NR_BANKS 6 #define NR_BLOCKS 9 #define THRESHOLD_MAX 0xFFF #define INT_TYPE_APIC 0x00020000 @@ -48,36 +47,16 @@ #define MASK_BLKPTR_LO 0xFF000000 #define MCG_XBLK_ADDR 0xC0000400 -struct threshold_block { - unsigned int block; - unsigned int bank; - unsigned int cpu; - u32 address; - u16 interrupt_enable; - u16 threshold_limit; - struct kobject kobj; - struct list_head miscj; +static const char * const th_names[] = { + "load_store", + "insn_fetch", + "combined_unit", + "", + "northbridge", + "execution_unit", }; -/* defaults used early on boot */ -static struct threshold_block threshold_defaults = { - .interrupt_enable = 0, - .threshold_limit = THRESHOLD_MAX, -}; - -struct threshold_bank { - struct kobject *kobj; - struct threshold_block *blocks; - cpumask_var_t cpus; -}; -static DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks); - -#ifdef CONFIG_SMP -static unsigned char shared_bank[NR_BANKS] = { - 0, 0, 0, 0, 1 -}; -#endif - +static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks); static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ static void amd_threshold_interrupt(void); @@ -89,52 +68,153 @@ static void amd_threshold_interrupt(void); struct thresh_restart { struct threshold_block *b; int reset; + int set_lvt_off; + int lvt_off; u16 old_limit; }; -/* must be called with correct cpu affinity */ -/* Called via smp_call_function_single() */ +static inline bool is_shared_bank(int bank) +{ + /* Bank 4 is for northbridge reporting and is thus shared */ + return (bank == 4); +} + +static const char * const bank4_names(struct threshold_block *b) +{ + switch (b->address) { + /* MSR4_MISC0 */ + case 0x00000413: + return "dram"; + + case 0xc0000408: + return "ht_links"; + + case 0xc0000409: + return "l3_cache"; + + default: + WARN(1, "Funny MSR: 0x%08x\n", b->address); + return ""; + } +}; + + +static bool lvt_interrupt_supported(unsigned int bank, u32 msr_high_bits) +{ + /* + * bank 4 supports APIC LVT interrupts implicitly since forever. + */ + if (bank == 4) + return true; + + /* + * IntP: interrupt present; if this bit is set, the thresholding + * bank can generate APIC LVT interrupts + */ + return msr_high_bits & BIT(28); +} + +static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi) +{ + int msr = (hi & MASK_LVTOFF_HI) >> 20; + + if (apic < 0) { + pr_err(FW_BUG "cpu %d, failed to setup threshold interrupt " + "for bank %d, block %d (MSR%08X=0x%x%08x)\n", b->cpu, + b->bank, b->block, b->address, hi, lo); + return 0; + } + + if (apic != msr) { + pr_err(FW_BUG "cpu %d, invalid threshold interrupt offset %d " + "for bank %d, block %d (MSR%08X=0x%x%08x)\n", + b->cpu, apic, b->bank, b->block, b->address, hi, lo); + return 0; + } + + return 1; +}; + +/* + * Called via smp_call_function_single(), must be called with correct + * cpu affinity. + */ static void threshold_restart_bank(void *_tr) { struct thresh_restart *tr = _tr; - u32 mci_misc_hi, mci_misc_lo; + u32 hi, lo; - rdmsr(tr->b->address, mci_misc_lo, mci_misc_hi); + rdmsr(tr->b->address, lo, hi); - if (tr->b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX)) + if (tr->b->threshold_limit < (hi & THRESHOLD_MAX)) tr->reset = 1; /* limit cannot be lower than err count */ if (tr->reset) { /* reset err count and overflow bit */ - mci_misc_hi = - (mci_misc_hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) | + hi = + (hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) | (THRESHOLD_MAX - tr->b->threshold_limit); } else if (tr->old_limit) { /* change limit w/o reset */ - int new_count = (mci_misc_hi & THRESHOLD_MAX) + + int new_count = (hi & THRESHOLD_MAX) + (tr->old_limit - tr->b->threshold_limit); - mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) | + hi = (hi & ~MASK_ERR_COUNT_HI) | (new_count & THRESHOLD_MAX); } - tr->b->interrupt_enable ? - (mci_misc_hi = (mci_misc_hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) : - (mci_misc_hi &= ~MASK_INT_TYPE_HI); + /* clear IntType */ + hi &= ~MASK_INT_TYPE_HI; + + if (!tr->b->interrupt_capable) + goto done; - mci_misc_hi |= MASK_COUNT_EN_HI; - wrmsr(tr->b->address, mci_misc_lo, mci_misc_hi); + if (tr->set_lvt_off) { + if (lvt_off_valid(tr->b, tr->lvt_off, lo, hi)) { + /* set new lvt offset */ + hi &= ~MASK_LVTOFF_HI; + hi |= tr->lvt_off << 20; + } + } + + if (tr->b->interrupt_enable) + hi |= INT_TYPE_APIC; + + done: + + hi |= MASK_COUNT_EN_HI; + wrmsr(tr->b->address, lo, hi); +} + +static void mce_threshold_block_init(struct threshold_block *b, int offset) +{ + struct thresh_restart tr = { + .b = b, + .set_lvt_off = 1, + .lvt_off = offset, + }; + + b->threshold_limit = THRESHOLD_MAX; + threshold_restart_bank(&tr); +}; + +static int setup_APIC_mce(int reserved, int new) +{ + if (reserved < 0 && !setup_APIC_eilvt(new, THRESHOLD_APIC_VECTOR, + APIC_EILVT_MSG_FIX, 0)) + return new; + + return reserved; } /* cpu init entry point, called from mce.c with preempt off */ void mce_amd_feature_init(struct cpuinfo_x86 *c) { + struct threshold_block b; unsigned int cpu = smp_processor_id(); u32 low = 0, high = 0, address = 0; unsigned int bank, block; - struct thresh_restart tr; - int lvt_off = -1; - u8 offset; + int offset = -1; - for (bank = 0; bank < NR_BANKS; ++bank) { + for (bank = 0; bank < mca_cfg.banks; ++bank) { for (block = 0; block < NR_BLOCKS; ++block) { if (block == 0) address = MSR_IA32_MC0_MISC + bank * 4; @@ -159,43 +239,20 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) if (!block) per_cpu(bank_map, cpu) |= (1 << bank); -#ifdef CONFIG_SMP - if (shared_bank[bank] && c->cpu_core_id) - break; -#endif - offset = (high & MASK_LVTOFF_HI) >> 20; - if (lvt_off < 0) { - if (setup_APIC_eilvt(offset, - THRESHOLD_APIC_VECTOR, - APIC_EILVT_MSG_FIX, 0)) { - pr_err(FW_BUG "cpu %d, failed to " - "setup threshold interrupt " - "for bank %d, block %d " - "(MSR%08X=0x%x%08x)", - smp_processor_id(), bank, block, - address, high, low); - continue; - } - lvt_off = offset; - } else if (lvt_off != offset) { - pr_err(FW_BUG "cpu %d, invalid threshold " - "interrupt offset %d for bank %d," - "block %d (MSR%08X=0x%x%08x)", - smp_processor_id(), lvt_off, bank, - block, address, high, low); - continue; - } - high &= ~MASK_LVTOFF_HI; - high |= lvt_off << 20; - wrmsr(address, low, high); + memset(&b, 0, sizeof(b)); + b.cpu = cpu; + b.bank = bank; + b.block = block; + b.address = address; + b.interrupt_capable = lvt_interrupt_supported(bank, high); - threshold_defaults.address = address; - tr.b = &threshold_defaults; - tr.reset = 0; - tr.old_limit = 0; - threshold_restart_bank(&tr); + if (b.interrupt_capable) { + int new = (high & MASK_LVTOFF_HI) >> 20; + offset = setup_APIC_mce(offset, new); + } + mce_threshold_block_init(&b, offset); mce_threshold_vector = amd_threshold_interrupt; } } @@ -219,7 +276,7 @@ static void amd_threshold_interrupt(void) mce_setup(&m); /* assume first bank caused it */ - for (bank = 0; bank < NR_BANKS; ++bank) { + for (bank = 0; bank < mca_cfg.banks; ++bank) { if (!(per_cpu(bank_map, m.cpu) & (1 << bank))) continue; for (block = 0; block < NR_BLOCKS; ++block) { @@ -282,7 +339,7 @@ struct threshold_attr { #define SHOW_FIELDS(name) \ static ssize_t show_ ## name(struct threshold_block *b, char *buf) \ { \ - return sprintf(buf, "%lx\n", (unsigned long) b->name); \ + return sprintf(buf, "%lu\n", (unsigned long) b->name); \ } SHOW_FIELDS(interrupt_enable) SHOW_FIELDS(threshold_limit) @@ -293,14 +350,16 @@ store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size) struct thresh_restart tr; unsigned long new; + if (!b->interrupt_capable) + return -EINVAL; + if (strict_strtoul(buf, 0, &new) < 0) return -EINVAL; b->interrupt_enable = !!new; + memset(&tr, 0, sizeof(tr)); tr.b = b; - tr.reset = 0; - tr.old_limit = 0; smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); @@ -321,48 +380,31 @@ store_threshold_limit(struct threshold_block *b, const char *buf, size_t size) if (new < 1) new = 1; + memset(&tr, 0, sizeof(tr)); tr.old_limit = b->threshold_limit; b->threshold_limit = new; tr.b = b; - tr.reset = 0; smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); return size; } -struct threshold_block_cross_cpu { - struct threshold_block *tb; - long retval; -}; - -static void local_error_count_handler(void *_tbcc) -{ - struct threshold_block_cross_cpu *tbcc = _tbcc; - struct threshold_block *b = tbcc->tb; - u32 low, high; - - rdmsr(b->address, low, high); - tbcc->retval = (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit); -} - static ssize_t show_error_count(struct threshold_block *b, char *buf) { - struct threshold_block_cross_cpu tbcc = { .tb = b, }; - - smp_call_function_single(b->cpu, local_error_count_handler, &tbcc, 1); - return sprintf(buf, "%lx\n", tbcc.retval); -} + u32 lo, hi; -static ssize_t store_error_count(struct threshold_block *b, - const char *buf, size_t count) -{ - struct thresh_restart tr = { .b = b, .reset = 1, .old_limit = 0 }; + rdmsr_on_cpu(b->cpu, b->address, &lo, &hi); - smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); - return 1; + return sprintf(buf, "%u\n", ((hi & THRESHOLD_MAX) - + (THRESHOLD_MAX - b->threshold_limit))); } +static struct threshold_attr error_count = { + .attr = {.name = __stringify(error_count), .mode = 0444 }, + .show = show_error_count, +}; + #define RW_ATTR(val) \ static struct threshold_attr val = { \ .attr = {.name = __stringify(val), .mode = 0644 }, \ @@ -372,13 +414,12 @@ static struct threshold_attr val = { \ RW_ATTR(interrupt_enable); RW_ATTR(threshold_limit); -RW_ATTR(error_count); static struct attribute *default_attrs[] = { - &interrupt_enable.attr, &threshold_limit.attr, &error_count.attr, - NULL + NULL, /* possibly interrupt_enable if supported, see below */ + NULL, }; #define to_block(k) container_of(k, struct threshold_block, kobj) @@ -417,16 +458,14 @@ static struct kobj_type threshold_ktype = { .default_attrs = default_attrs, }; -static __cpuinit int allocate_threshold_blocks(unsigned int cpu, - unsigned int bank, - unsigned int block, - u32 address) +static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank, + unsigned int block, u32 address) { struct threshold_block *b = NULL; u32 low, high; int err; - if ((bank >= NR_BANKS) || (block >= NR_BLOCKS)) + if ((bank >= mca_cfg.banks) || (block >= NR_BLOCKS)) return 0; if (rdmsr_safe_on_cpu(cpu, address, &low, &high)) @@ -452,8 +491,14 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu, b->cpu = cpu; b->address = address; b->interrupt_enable = 0; + b->interrupt_capable = lvt_interrupt_supported(bank, high); b->threshold_limit = THRESHOLD_MAX; + if (b->interrupt_capable) + threshold_ktype.default_attrs[2] = &interrupt_enable.attr; + else + threshold_ktype.default_attrs[2] = NULL; + INIT_LIST_HEAD(&b->miscj); if (per_cpu(threshold_banks, cpu)[bank]->blocks) { @@ -465,7 +510,7 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu, err = kobject_init_and_add(&b->kobj, &threshold_ktype, per_cpu(threshold_banks, cpu)[bank]->kobj, - "misc%i", block); + (bank == 4 ? bank4_names(b) : th_names[bank])); if (err) goto out_free; recurse: @@ -490,131 +535,125 @@ recurse: out_free: if (b) { kobject_put(&b->kobj); + list_del(&b->miscj); kfree(b); } return err; } -static __cpuinit long -local_allocate_threshold_blocks(int cpu, unsigned int bank) +static int __threshold_add_blocks(struct threshold_bank *b) { - return allocate_threshold_blocks(cpu, bank, 0, - MSR_IA32_MC0_MISC + bank * 4); -} + struct list_head *head = &b->blocks->miscj; + struct threshold_block *pos = NULL; + struct threshold_block *tmp = NULL; + int err = 0; -/* symlinks sibling shared banks to first core. first core owns dir/files. */ -static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) -{ - int i, err = 0; - struct threshold_bank *b = NULL; - char name[32]; -#ifdef CONFIG_SMP - struct cpuinfo_x86 *c = &cpu_data(cpu); -#endif + err = kobject_add(&b->blocks->kobj, b->kobj, b->blocks->kobj.name); + if (err) + return err; - sprintf(name, "threshold_bank%i", bank); + list_for_each_entry_safe(pos, tmp, head, miscj) { -#ifdef CONFIG_SMP - if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */ - i = cpumask_first(c->llc_shared_map); + err = kobject_add(&pos->kobj, b->kobj, pos->kobj.name); + if (err) { + list_for_each_entry_safe_reverse(pos, tmp, head, miscj) + kobject_del(&pos->kobj); - /* first core not up yet */ - if (cpu_data(i).cpu_core_id) - goto out; + return err; + } + } + return err; +} - /* already linked */ - if (per_cpu(threshold_banks, cpu)[bank]) - goto out; +static int threshold_create_bank(unsigned int cpu, unsigned int bank) +{ + struct device *dev = per_cpu(mce_device, cpu); + struct amd_northbridge *nb = NULL; + struct threshold_bank *b = NULL; + const char *name = th_names[bank]; + int err = 0; - b = per_cpu(threshold_banks, i)[bank]; + if (is_shared_bank(bank)) { + nb = node_to_amd_nb(amd_get_nb_id(cpu)); - if (!b) - goto out; + /* threshold descriptor already initialized on this node? */ + if (nb && nb->bank4) { + /* yes, use it */ + b = nb->bank4; + err = kobject_add(b->kobj, &dev->kobj, name); + if (err) + goto out; - err = sysfs_create_link(&per_cpu(mce_dev, cpu).kobj, - b->kobj, name); - if (err) - goto out; + per_cpu(threshold_banks, cpu)[bank] = b; + atomic_inc(&b->cpus); - cpumask_copy(b->cpus, c->llc_shared_map); - per_cpu(threshold_banks, cpu)[bank] = b; + err = __threshold_add_blocks(b); - goto out; + goto out; + } } -#endif b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL); if (!b) { err = -ENOMEM; goto out; } - if (!zalloc_cpumask_var(&b->cpus, GFP_KERNEL)) { - kfree(b); - err = -ENOMEM; - goto out; - } - b->kobj = kobject_create_and_add(name, &per_cpu(mce_dev, cpu).kobj); - if (!b->kobj) + b->kobj = kobject_create_and_add(name, &dev->kobj); + if (!b->kobj) { + err = -EINVAL; goto out_free; - -#ifndef CONFIG_SMP - cpumask_setall(b->cpus); -#else - cpumask_set_cpu(cpu, b->cpus); -#endif + } per_cpu(threshold_banks, cpu)[bank] = b; - err = local_allocate_threshold_blocks(cpu, bank); - if (err) - goto out_free; - - for_each_cpu(i, b->cpus) { - if (i == cpu) - continue; + if (is_shared_bank(bank)) { + atomic_set(&b->cpus, 1); - err = sysfs_create_link(&per_cpu(mce_dev, i).kobj, - b->kobj, name); - if (err) - goto out; - - per_cpu(threshold_banks, i)[bank] = b; + /* nb is already initialized, see above */ + if (nb) { + WARN_ON(nb->bank4); + nb->bank4 = b; + } } - goto out; + err = allocate_threshold_blocks(cpu, bank, 0, + MSR_IA32_MC0_MISC + bank * 4); + if (!err) + goto out; -out_free: - per_cpu(threshold_banks, cpu)[bank] = NULL; - free_cpumask_var(b->cpus); + out_free: kfree(b); -out: + + out: return err; } /* create dir/files for all valid threshold banks */ -static __cpuinit int threshold_create_device(unsigned int cpu) +static int threshold_create_device(unsigned int cpu) { unsigned int bank; + struct threshold_bank **bp; int err = 0; - for (bank = 0; bank < NR_BANKS; ++bank) { + bp = kzalloc(sizeof(struct threshold_bank *) * mca_cfg.banks, + GFP_KERNEL); + if (!bp) + return -ENOMEM; + + per_cpu(threshold_banks, cpu) = bp; + + for (bank = 0; bank < mca_cfg.banks; ++bank) { if (!(per_cpu(bank_map, cpu) & (1 << bank))) continue; err = threshold_create_bank(cpu, bank); if (err) - goto out; + return err; } -out: + return err; } -/* - * let's be hotplug friendly. - * in case of multiple core processors, the first core always takes ownership - * of shared sysfs dir/files, and rest of the cores will be symlinked to it. - */ - static void deallocate_threshold_block(unsigned int cpu, unsigned int bank) { @@ -635,37 +674,42 @@ static void deallocate_threshold_block(unsigned int cpu, per_cpu(threshold_banks, cpu)[bank]->blocks = NULL; } +static void __threshold_remove_blocks(struct threshold_bank *b) +{ + struct threshold_block *pos = NULL; + struct threshold_block *tmp = NULL; + + kobject_del(b->kobj); + + list_for_each_entry_safe(pos, tmp, &b->blocks->miscj, miscj) + kobject_del(&pos->kobj); +} + static void threshold_remove_bank(unsigned int cpu, int bank) { + struct amd_northbridge *nb; struct threshold_bank *b; - char name[32]; - int i = 0; b = per_cpu(threshold_banks, cpu)[bank]; if (!b) return; + if (!b->blocks) goto free_out; - sprintf(name, "threshold_bank%i", bank); - -#ifdef CONFIG_SMP - /* sibling symlink */ - if (shared_bank[bank] && b->blocks->cpu != cpu) { - sysfs_remove_link(&per_cpu(mce_dev, cpu).kobj, name); - per_cpu(threshold_banks, cpu)[bank] = NULL; - - return; - } -#endif - - /* remove all sibling symlinks before unregistering */ - for_each_cpu(i, b->cpus) { - if (i == cpu) - continue; - - sysfs_remove_link(&per_cpu(mce_dev, i).kobj, name); - per_cpu(threshold_banks, i)[bank] = NULL; + if (is_shared_bank(bank)) { + if (!atomic_dec_and_test(&b->cpus)) { + __threshold_remove_blocks(b); + per_cpu(threshold_banks, cpu)[bank] = NULL; + return; + } else { + /* + * the last CPU on this node using the shared bank is + * going away, remove that bank now. + */ + nb = node_to_amd_nb(amd_get_nb_id(cpu)); + nb->bank4 = NULL; + } } deallocate_threshold_block(cpu, bank); @@ -673,7 +717,6 @@ static void threshold_remove_bank(unsigned int cpu, int bank) free_out: kobject_del(b->kobj); kobject_put(b->kobj); - free_cpumask_var(b->cpus); kfree(b); per_cpu(threshold_banks, cpu)[bank] = NULL; } @@ -682,15 +725,16 @@ static void threshold_remove_device(unsigned int cpu) { unsigned int bank; - for (bank = 0; bank < NR_BANKS; ++bank) { + for (bank = 0; bank < mca_cfg.banks; ++bank) { if (!(per_cpu(bank_map, cpu) & (1 << bank))) continue; threshold_remove_bank(cpu, bank); } + kfree(per_cpu(threshold_banks, cpu)); } /* get notified when a cpu comes on/off */ -static void __cpuinit +static void amd_64_threshold_cpu_callback(unsigned long action, unsigned int cpu) { switch (action) { @@ -722,4 +766,24 @@ static __init int threshold_init_device(void) return 0; } -device_initcall(threshold_init_device); +/* + * there are 3 funcs which need to be _initcalled in a logic sequence: + * 1. xen_late_init_mcelog + * 2. mcheck_init_device + * 3. threshold_init_device + * + * xen_late_init_mcelog must register xen_mce_chrdev_device before + * native mce_chrdev_device registration if running under xen platform; + * + * mcheck_init_device should be inited before threshold_init_device to + * initialize mce_device, otherwise a NULL ptr dereference will cause panic. + * + * so we use following _initcalls + * 1. device_initcall(xen_late_init_mcelog); + * 2. device_initcall_sync(mcheck_init_device); + * 3. late_initcall(threshold_init_device); + * + * when running under xen, the initcall order is 1,2,3; + * on baremetal, we skip 1 and we do only 2 and 3. + */ +late_initcall(threshold_init_device); diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 6fcd0936194..9a316b21df8 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -6,15 +6,17 @@ */ #include <linux/gfp.h> -#include <linux/init.h> #include <linux/interrupt.h> #include <linux/percpu.h> #include <linux/sched.h> +#include <linux/cpumask.h> #include <asm/apic.h> #include <asm/processor.h> #include <asm/msr.h> #include <asm/mce.h> +#include "mce-internal.h" + /* * Support for Intel Correct Machine Check Interrupts. This allows * the CPU to raise an interrupt when a corrected machine check happened. @@ -22,6 +24,18 @@ * Also supports reliable discovery of shared banks. */ +/* + * CMCI can be delivered to multiple cpus that share a machine check bank + * so we need to designate a single cpu to process errors logged in each bank + * in the interrupt handler (otherwise we would have many races and potential + * double reporting of the same error). + * Note that this can change when a cpu is offlined or brought online since + * some MCA banks are shared across cpus. When a cpu is offlined, cmci_clear() + * disables CMCI on all banks owned by the cpu and clears this bitfield. At + * this point, cmci_rediscover() kicks in and a different cpu may end up + * taking ownership of some of the shared MCA banks that were previously + * owned by the offlined cpu. + */ static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned); /* @@ -30,13 +44,28 @@ static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned); */ static DEFINE_SPINLOCK(cmci_discover_lock); -#define CMCI_THRESHOLD 1 +#define CMCI_THRESHOLD 1 +#define CMCI_POLL_INTERVAL (30 * HZ) +#define CMCI_STORM_INTERVAL (1 * HZ) +#define CMCI_STORM_THRESHOLD 15 + +static DEFINE_PER_CPU(unsigned long, cmci_time_stamp); +static DEFINE_PER_CPU(unsigned int, cmci_storm_cnt); +static DEFINE_PER_CPU(unsigned int, cmci_storm_state); + +enum { + CMCI_STORM_NONE, + CMCI_STORM_ACTIVE, + CMCI_STORM_SUBSIDED, +}; + +static atomic_t cmci_storm_on_cpus; static int cmci_supported(int *banks) { u64 cap; - if (mce_cmci_disabled || mce_ignore_ce) + if (mca_cfg.cmci_disabled || mca_cfg.ignore_ce) return 0; /* @@ -53,6 +82,109 @@ static int cmci_supported(int *banks) return !!(cap & MCG_CMCI_P); } +void mce_intel_cmci_poll(void) +{ + if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE) + return; + machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); +} + +void mce_intel_hcpu_update(unsigned long cpu) +{ + if (per_cpu(cmci_storm_state, cpu) == CMCI_STORM_ACTIVE) + atomic_dec(&cmci_storm_on_cpus); + + per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE; +} + +unsigned long mce_intel_adjust_timer(unsigned long interval) +{ + int r; + + if (interval < CMCI_POLL_INTERVAL) + return interval; + + switch (__this_cpu_read(cmci_storm_state)) { + case CMCI_STORM_ACTIVE: + /* + * We switch back to interrupt mode once the poll timer has + * silenced itself. That means no events recorded and the + * timer interval is back to our poll interval. + */ + __this_cpu_write(cmci_storm_state, CMCI_STORM_SUBSIDED); + r = atomic_sub_return(1, &cmci_storm_on_cpus); + if (r == 0) + pr_notice("CMCI storm subsided: switching to interrupt mode\n"); + /* FALLTHROUGH */ + + case CMCI_STORM_SUBSIDED: + /* + * We wait for all cpus to go back to SUBSIDED + * state. When that happens we switch back to + * interrupt mode. + */ + if (!atomic_read(&cmci_storm_on_cpus)) { + __this_cpu_write(cmci_storm_state, CMCI_STORM_NONE); + cmci_reenable(); + cmci_recheck(); + } + return CMCI_POLL_INTERVAL; + default: + /* + * We have shiny weather. Let the poll do whatever it + * thinks. + */ + return interval; + } +} + +static void cmci_storm_disable_banks(void) +{ + unsigned long flags, *owned; + int bank; + u64 val; + + spin_lock_irqsave(&cmci_discover_lock, flags); + owned = __get_cpu_var(mce_banks_owned); + for_each_set_bit(bank, owned, MAX_NR_BANKS) { + rdmsrl(MSR_IA32_MCx_CTL2(bank), val); + val &= ~MCI_CTL2_CMCI_EN; + wrmsrl(MSR_IA32_MCx_CTL2(bank), val); + } + spin_unlock_irqrestore(&cmci_discover_lock, flags); +} + +static bool cmci_storm_detect(void) +{ + unsigned int cnt = __this_cpu_read(cmci_storm_cnt); + unsigned long ts = __this_cpu_read(cmci_time_stamp); + unsigned long now = jiffies; + int r; + + if (__this_cpu_read(cmci_storm_state) != CMCI_STORM_NONE) + return true; + + if (time_before_eq(now, ts + CMCI_STORM_INTERVAL)) { + cnt++; + } else { + cnt = 1; + __this_cpu_write(cmci_time_stamp, now); + } + __this_cpu_write(cmci_storm_cnt, cnt); + + if (cnt <= CMCI_STORM_THRESHOLD) + return false; + + cmci_storm_disable_banks(); + __this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE); + r = atomic_add_return(1, &cmci_storm_on_cpus); + mce_timer_kick(CMCI_POLL_INTERVAL); + + if (r == 1) + pr_notice("CMCI storm detected: switching to poll mode\n"); + return true; +} + /* * The interrupt handler. This is called on every event. * Just call the poller directly to log any events. @@ -61,64 +193,86 @@ static int cmci_supported(int *banks) */ static void intel_threshold_interrupt(void) { + if (cmci_storm_detect()) + return; machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); mce_notify_irq(); } -static void print_update(char *type, int *hdr, int num) -{ - if (*hdr == 0) - printk(KERN_INFO "CPU %d MCA banks", smp_processor_id()); - *hdr = 1; - printk(KERN_CONT " %s:%d", type, num); -} - /* * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks * on this CPU. Use the algorithm recommended in the SDM to discover shared * banks. */ -static void cmci_discover(int banks, int boot) +static void cmci_discover(int banks) { unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned); unsigned long flags; - int hdr = 0; int i; + int bios_wrong_thresh = 0; spin_lock_irqsave(&cmci_discover_lock, flags); for (i = 0; i < banks; i++) { u64 val; + int bios_zero_thresh = 0; if (test_bit(i, owned)) continue; + /* Skip banks in firmware first mode */ + if (test_bit(i, mce_banks_ce_disabled)) + continue; + rdmsrl(MSR_IA32_MCx_CTL2(i), val); /* Already owned by someone else? */ if (val & MCI_CTL2_CMCI_EN) { - if (test_and_clear_bit(i, owned) && !boot) - print_update("SHD", &hdr, i); + clear_bit(i, owned); __clear_bit(i, __get_cpu_var(mce_poll_banks)); continue; } - val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK; - val |= MCI_CTL2_CMCI_EN | CMCI_THRESHOLD; + if (!mca_cfg.bios_cmci_threshold) { + val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK; + val |= CMCI_THRESHOLD; + } else if (!(val & MCI_CTL2_CMCI_THRESHOLD_MASK)) { + /* + * If bios_cmci_threshold boot option was specified + * but the threshold is zero, we'll try to initialize + * it to 1. + */ + bios_zero_thresh = 1; + val |= CMCI_THRESHOLD; + } + + val |= MCI_CTL2_CMCI_EN; wrmsrl(MSR_IA32_MCx_CTL2(i), val); rdmsrl(MSR_IA32_MCx_CTL2(i), val); /* Did the enable bit stick? -- the bank supports CMCI */ if (val & MCI_CTL2_CMCI_EN) { - if (!test_and_set_bit(i, owned) && !boot) - print_update("CMCI", &hdr, i); + set_bit(i, owned); __clear_bit(i, __get_cpu_var(mce_poll_banks)); + /* + * We are able to set thresholds for some banks that + * had a threshold of 0. This means the BIOS has not + * set the thresholds properly or does not work with + * this boot option. Note down now and report later. + */ + if (mca_cfg.bios_cmci_threshold && bios_zero_thresh && + (val & MCI_CTL2_CMCI_THRESHOLD_MASK)) + bios_wrong_thresh = 1; } else { WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks))); } } spin_unlock_irqrestore(&cmci_discover_lock, flags); - if (hdr) - printk(KERN_CONT "\n"); + if (mca_cfg.bios_cmci_threshold && bios_wrong_thresh) { + pr_info_once( + "bios_cmci_threshold: Some banks do not have valid thresholds set\n"); + pr_info_once( + "bios_cmci_threshold: Make sure your BIOS supports this boot option\n"); + } } /* @@ -130,13 +284,26 @@ void cmci_recheck(void) unsigned long flags; int banks; - if (!mce_available(¤t_cpu_data) || !cmci_supported(&banks)) + if (!mce_available(__this_cpu_ptr(&cpu_info)) || !cmci_supported(&banks)) return; local_irq_save(flags); machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); local_irq_restore(flags); } +/* Caller must hold the lock on cmci_discover_lock */ +static void __cmci_disable_bank(int bank) +{ + u64 val; + + if (!test_bit(bank, __get_cpu_var(mce_banks_owned))) + return; + rdmsrl(MSR_IA32_MCx_CTL2(bank), val); + val &= ~MCI_CTL2_CMCI_EN; + wrmsrl(MSR_IA32_MCx_CTL2(bank), val); + __clear_bit(bank, __get_cpu_var(mce_banks_owned)); +} + /* * Disable CMCI on this CPU for all banks it owns when it goes down. * This allows other CPUs to claim the banks on rediscovery. @@ -146,51 +313,33 @@ void cmci_clear(void) unsigned long flags; int i; int banks; - u64 val; if (!cmci_supported(&banks)) return; spin_lock_irqsave(&cmci_discover_lock, flags); - for (i = 0; i < banks; i++) { - if (!test_bit(i, __get_cpu_var(mce_banks_owned))) - continue; - /* Disable CMCI */ - rdmsrl(MSR_IA32_MCx_CTL2(i), val); - val &= ~(MCI_CTL2_CMCI_EN|MCI_CTL2_CMCI_THRESHOLD_MASK); - wrmsrl(MSR_IA32_MCx_CTL2(i), val); - __clear_bit(i, __get_cpu_var(mce_banks_owned)); - } + for (i = 0; i < banks; i++) + __cmci_disable_bank(i); spin_unlock_irqrestore(&cmci_discover_lock, flags); } -/* - * After a CPU went down cycle through all the others and rediscover - * Must run in process context. - */ -void cmci_rediscover(int dying) +static void cmci_rediscover_work_func(void *arg) +{ + int banks; + + /* Recheck banks in case CPUs don't all have the same */ + if (cmci_supported(&banks)) + cmci_discover(banks); +} + +/* After a CPU went down cycle through all the others and rediscover */ +void cmci_rediscover(void) { int banks; - int cpu; - cpumask_var_t old; if (!cmci_supported(&banks)) return; - if (!alloc_cpumask_var(&old, GFP_KERNEL)) - return; - cpumask_copy(old, ¤t->cpus_allowed); - for_each_online_cpu(cpu) { - if (cpu == dying) - continue; - if (set_cpus_allowed_ptr(current, cpumask_of(cpu))) - continue; - /* Recheck banks in case CPUs don't all have the same */ - if (cmci_supported(&banks)) - cmci_discover(banks, 0); - } - - set_cpus_allowed_ptr(current, old); - free_cpumask_var(old); + on_each_cpu(cmci_rediscover_work_func, NULL, 1); } /* @@ -200,7 +349,20 @@ void cmci_reenable(void) { int banks; if (cmci_supported(&banks)) - cmci_discover(banks, 0); + cmci_discover(banks); +} + +void cmci_disable_bank(int bank) +{ + int banks; + unsigned long flags; + + if (!cmci_supported(&banks)) + return; + + spin_lock_irqsave(&cmci_discover_lock, flags); + __cmci_disable_bank(bank); + spin_unlock_irqrestore(&cmci_discover_lock, flags); } static void intel_init_cmci(void) @@ -211,7 +373,7 @@ static void intel_init_cmci(void) return; mce_threshold_vector = intel_threshold_interrupt; - cmci_discover(banks, 1); + cmci_discover(banks); /* * For CPU #0 this runs with still disabled APIC, but that's * ok because only the vector is set up. We still do another diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c index 5c0e6533d9b..a3042989398 100644 --- a/arch/x86/kernel/cpu/mcheck/p5.c +++ b/arch/x86/kernel/cpu/mcheck/p5.c @@ -5,11 +5,9 @@ #include <linux/interrupt.h> #include <linux/kernel.h> #include <linux/types.h> -#include <linux/init.h> #include <linux/smp.h> #include <asm/processor.h> -#include <asm/system.h> #include <asm/mce.h> #include <asm/msr.h> @@ -34,7 +32,7 @@ static void pentium_machine_check(struct pt_regs *regs, long error_code) smp_processor_id()); } - add_taint(TAINT_MACHINE_CHECK); + add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); } /* Set up machine check reporting for processors with Intel style MCE: */ diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 4b683267eca..36a1bb6d1ee 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -18,18 +18,18 @@ #include <linux/jiffies.h> #include <linux/kernel.h> #include <linux/percpu.h> -#include <linux/sysdev.h> +#include <linux/export.h> #include <linux/types.h> #include <linux/init.h> #include <linux/smp.h> #include <linux/cpu.h> #include <asm/processor.h> -#include <asm/system.h> #include <asm/apic.h> #include <asm/idle.h> #include <asm/mce.h> #include <asm/msr.h> +#include <asm/trace/irq_vectors.h> /* How long to wait between reporting thermal events */ #define CHECK_INTERVAL (300 * HZ) @@ -53,8 +53,26 @@ struct thermal_state { struct _thermal_state core_power_limit; struct _thermal_state package_throttle; struct _thermal_state package_power_limit; + struct _thermal_state core_thresh0; + struct _thermal_state core_thresh1; + struct _thermal_state pkg_thresh0; + struct _thermal_state pkg_thresh1; }; +/* Callback to handle core threshold interrupts */ +int (*platform_thermal_notify)(__u64 msr_val); +EXPORT_SYMBOL(platform_thermal_notify); + +/* Callback to handle core package threshold_interrupts */ +int (*platform_thermal_package_notify)(__u64 msr_val); +EXPORT_SYMBOL_GPL(platform_thermal_package_notify); + +/* Callback support of rate control, return true, if + * callback has rate control */ +bool (*platform_thermal_package_rate_control)(void); +EXPORT_SYMBOL_GPL(platform_thermal_package_rate_control); + + static DEFINE_PER_CPU(struct thermal_state, thermal_state); static atomic_t therm_throt_en = ATOMIC_INIT(0); @@ -62,16 +80,16 @@ static atomic_t therm_throt_en = ATOMIC_INIT(0); static u32 lvtthmr_init __read_mostly; #ifdef CONFIG_SYSFS -#define define_therm_throt_sysdev_one_ro(_name) \ - static SYSDEV_ATTR(_name, 0444, \ - therm_throt_sysdev_show_##_name, \ +#define define_therm_throt_device_one_ro(_name) \ + static DEVICE_ATTR(_name, 0444, \ + therm_throt_device_show_##_name, \ NULL) \ -#define define_therm_throt_sysdev_show_func(event, name) \ +#define define_therm_throt_device_show_func(event, name) \ \ -static ssize_t therm_throt_sysdev_show_##event##_##name( \ - struct sys_device *dev, \ - struct sysdev_attribute *attr, \ +static ssize_t therm_throt_device_show_##event##_##name( \ + struct device *dev, \ + struct device_attribute *attr, \ char *buf) \ { \ unsigned int cpu = dev->id; \ @@ -88,20 +106,20 @@ static ssize_t therm_throt_sysdev_show_##event##_##name( \ return ret; \ } -define_therm_throt_sysdev_show_func(core_throttle, count); -define_therm_throt_sysdev_one_ro(core_throttle_count); +define_therm_throt_device_show_func(core_throttle, count); +define_therm_throt_device_one_ro(core_throttle_count); -define_therm_throt_sysdev_show_func(core_power_limit, count); -define_therm_throt_sysdev_one_ro(core_power_limit_count); +define_therm_throt_device_show_func(core_power_limit, count); +define_therm_throt_device_one_ro(core_power_limit_count); -define_therm_throt_sysdev_show_func(package_throttle, count); -define_therm_throt_sysdev_one_ro(package_throttle_count); +define_therm_throt_device_show_func(package_throttle, count); +define_therm_throt_device_one_ro(package_throttle_count); -define_therm_throt_sysdev_show_func(package_power_limit, count); -define_therm_throt_sysdev_one_ro(package_power_limit_count); +define_therm_throt_device_show_func(package_power_limit, count); +define_therm_throt_device_one_ro(package_power_limit_count); static struct attribute *thermal_throttle_attrs[] = { - &attr_core_throttle_count.attr, + &dev_attr_core_throttle_count.attr, NULL }; @@ -176,13 +194,6 @@ static int therm_throt_process(bool new_event, int event, int level) this_cpu, level == CORE_LEVEL ? "Core" : "Package", state->count); - else - printk(KERN_CRIT "CPU%d: %s power limit notification (total events = %lu)\n", - this_cpu, - level == CORE_LEVEL ? "Core" : "Package", - state->count); - - add_taint(TAINT_MACHINE_CHECK); return 1; } if (old_event) { @@ -190,86 +201,105 @@ static int therm_throt_process(bool new_event, int event, int level) printk(KERN_INFO "CPU%d: %s temperature/speed normal\n", this_cpu, level == CORE_LEVEL ? "Core" : "Package"); - else - printk(KERN_INFO "CPU%d: %s power limit normal\n", - this_cpu, - level == CORE_LEVEL ? "Core" : "Package"); return 1; } return 0; } +static int thresh_event_valid(int level, int event) +{ + struct _thermal_state *state; + unsigned int this_cpu = smp_processor_id(); + struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu); + u64 now = get_jiffies_64(); + + if (level == PACKAGE_LEVEL) + state = (event == 0) ? &pstate->pkg_thresh0 : + &pstate->pkg_thresh1; + else + state = (event == 0) ? &pstate->core_thresh0 : + &pstate->core_thresh1; + + if (time_before64(now, state->next_check)) + return 0; + + state->next_check = now + CHECK_INTERVAL; + + return 1; +} + +static bool int_pln_enable; +static int __init int_pln_enable_setup(char *s) +{ + int_pln_enable = true; + + return 1; +} +__setup("int_pln_enable", int_pln_enable_setup); + #ifdef CONFIG_SYSFS /* Add/Remove thermal_throttle interface for CPU device: */ -static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev, - unsigned int cpu) +static int thermal_throttle_add_dev(struct device *dev, unsigned int cpu) { int err; struct cpuinfo_x86 *c = &cpu_data(cpu); - err = sysfs_create_group(&sys_dev->kobj, &thermal_attr_group); + err = sysfs_create_group(&dev->kobj, &thermal_attr_group); if (err) return err; - if (cpu_has(c, X86_FEATURE_PLN)) - err = sysfs_add_file_to_group(&sys_dev->kobj, - &attr_core_power_limit_count.attr, + if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) + err = sysfs_add_file_to_group(&dev->kobj, + &dev_attr_core_power_limit_count.attr, thermal_attr_group.name); if (cpu_has(c, X86_FEATURE_PTS)) { - err = sysfs_add_file_to_group(&sys_dev->kobj, - &attr_package_throttle_count.attr, + err = sysfs_add_file_to_group(&dev->kobj, + &dev_attr_package_throttle_count.attr, thermal_attr_group.name); - if (cpu_has(c, X86_FEATURE_PLN)) - err = sysfs_add_file_to_group(&sys_dev->kobj, - &attr_package_power_limit_count.attr, + if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) + err = sysfs_add_file_to_group(&dev->kobj, + &dev_attr_package_power_limit_count.attr, thermal_attr_group.name); } return err; } -static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) +static void thermal_throttle_remove_dev(struct device *dev) { - sysfs_remove_group(&sys_dev->kobj, &thermal_attr_group); + sysfs_remove_group(&dev->kobj, &thermal_attr_group); } -/* Mutex protecting device creation against CPU hotplug: */ -static DEFINE_MUTEX(therm_cpu_lock); - /* Get notified when a cpu comes on/off. Be hotplug friendly. */ -static __cpuinit int +static int thermal_throttle_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; - struct sys_device *sys_dev; + struct device *dev; int err = 0; - sys_dev = get_cpu_sysdev(cpu); + dev = get_cpu_device(cpu); switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - mutex_lock(&therm_cpu_lock); - err = thermal_throttle_add_dev(sys_dev, cpu); - mutex_unlock(&therm_cpu_lock); + err = thermal_throttle_add_dev(dev, cpu); WARN_ON(err); break; case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: case CPU_DEAD: case CPU_DEAD_FROZEN: - mutex_lock(&therm_cpu_lock); - thermal_throttle_remove_dev(sys_dev); - mutex_unlock(&therm_cpu_lock); + thermal_throttle_remove_dev(dev); break; } return notifier_from_errno(err); } -static struct notifier_block thermal_throttle_cpu_notifier __cpuinitdata = +static struct notifier_block thermal_throttle_cpu_notifier = { .notifier_call = thermal_throttle_cpu_callback, }; @@ -282,19 +312,16 @@ static __init int thermal_throttle_init_device(void) if (!atomic_read(&therm_throt_en)) return 0; - register_hotcpu_notifier(&thermal_throttle_cpu_notifier); + cpu_notifier_register_begin(); -#ifdef CONFIG_HOTPLUG_CPU - mutex_lock(&therm_cpu_lock); -#endif /* connect live CPUs to sysfs */ for_each_online_cpu(cpu) { - err = thermal_throttle_add_dev(get_cpu_sysdev(cpu), cpu); + err = thermal_throttle_add_dev(get_cpu_device(cpu), cpu); WARN_ON(err); } -#ifdef CONFIG_HOTPLUG_CPU - mutex_unlock(&therm_cpu_lock); -#endif + + __register_hotcpu_notifier(&thermal_throttle_cpu_notifier); + cpu_notifier_register_done(); return 0; } @@ -302,49 +329,89 @@ device_initcall(thermal_throttle_init_device); #endif /* CONFIG_SYSFS */ -/* - * Set up the most two significant bit to notify mce log that this thermal - * event type. - * This is a temp solution. May be changed in the future with mce log - * infrasture. - */ -#define CORE_THROTTLED (0) -#define CORE_POWER_LIMIT ((__u64)1 << 62) -#define PACKAGE_THROTTLED ((__u64)2 << 62) -#define PACKAGE_POWER_LIMIT ((__u64)3 << 62) +static void notify_package_thresholds(__u64 msr_val) +{ + bool notify_thres_0 = false; + bool notify_thres_1 = false; + + if (!platform_thermal_package_notify) + return; + + /* lower threshold check */ + if (msr_val & THERM_LOG_THRESHOLD0) + notify_thres_0 = true; + /* higher threshold check */ + if (msr_val & THERM_LOG_THRESHOLD1) + notify_thres_1 = true; + + if (!notify_thres_0 && !notify_thres_1) + return; + + if (platform_thermal_package_rate_control && + platform_thermal_package_rate_control()) { + /* Rate control is implemented in callback */ + platform_thermal_package_notify(msr_val); + return; + } + + /* lower threshold reached */ + if (notify_thres_0 && thresh_event_valid(PACKAGE_LEVEL, 0)) + platform_thermal_package_notify(msr_val); + /* higher threshold reached */ + if (notify_thres_1 && thresh_event_valid(PACKAGE_LEVEL, 1)) + platform_thermal_package_notify(msr_val); +} + +static void notify_thresholds(__u64 msr_val) +{ + /* check whether the interrupt handler is defined; + * otherwise simply return + */ + if (!platform_thermal_notify) + return; + + /* lower threshold reached */ + if ((msr_val & THERM_LOG_THRESHOLD0) && + thresh_event_valid(CORE_LEVEL, 0)) + platform_thermal_notify(msr_val); + /* higher threshold reached */ + if ((msr_val & THERM_LOG_THRESHOLD1) && + thresh_event_valid(CORE_LEVEL, 1)) + platform_thermal_notify(msr_val); +} /* Thermal transition interrupt handler */ static void intel_thermal_interrupt(void) { __u64 msr_val; - struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); rdmsrl(MSR_IA32_THERM_STATUS, msr_val); + /* Check for violation of core thermal thresholds*/ + notify_thresholds(msr_val); + if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT, THERMAL_THROTTLING_EVENT, CORE_LEVEL) != 0) - mce_log_therm_throt_event(CORE_THROTTLED | msr_val); + mce_log_therm_throt_event(msr_val); - if (cpu_has(c, X86_FEATURE_PLN)) - if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT, + if (this_cpu_has(X86_FEATURE_PLN) && int_pln_enable) + therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT, POWER_LIMIT_EVENT, - CORE_LEVEL) != 0) - mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val); + CORE_LEVEL); - if (cpu_has(c, X86_FEATURE_PTS)) { + if (this_cpu_has(X86_FEATURE_PTS)) { rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val); - if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT, + /* check violations of package thermal thresholds */ + notify_package_thresholds(msr_val); + therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT, THERMAL_THROTTLING_EVENT, - PACKAGE_LEVEL) != 0) - mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val); - if (cpu_has(c, X86_FEATURE_PLN)) - if (therm_throt_process(msr_val & + PACKAGE_LEVEL); + if (this_cpu_has(X86_FEATURE_PLN) && int_pln_enable) + therm_throt_process(msr_val & PACKAGE_THERM_STATUS_POWER_LIMIT, POWER_LIMIT_EVENT, - PACKAGE_LEVEL) != 0) - mce_log_therm_throt_event(PACKAGE_POWER_LIMIT - | msr_val); + PACKAGE_LEVEL); } } @@ -352,20 +419,30 @@ static void unexpected_thermal_interrupt(void) { printk(KERN_ERR "CPU%d: Unexpected LVT thermal interrupt!\n", smp_processor_id()); - add_taint(TAINT_MACHINE_CHECK); } static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt; -asmlinkage void smp_thermal_interrupt(struct pt_regs *regs) +static inline void __smp_thermal_interrupt(void) { - exit_idle(); - irq_enter(); inc_irq_stat(irq_thermal_count); smp_thermal_vector(); - irq_exit(); - /* Ack only at the end to avoid potential reentry */ - ack_APIC_irq(); +} + +asmlinkage __visible void smp_thermal_interrupt(struct pt_regs *regs) +{ + entering_irq(); + __smp_thermal_interrupt(); + exiting_ack_irq(); +} + +asmlinkage __visible void smp_trace_thermal_interrupt(struct pt_regs *regs) +{ + entering_irq(); + trace_thermal_apic_entry(THERMAL_APIC_VECTOR); + __smp_thermal_interrupt(); + trace_thermal_apic_exit(THERMAL_APIC_VECTOR); + exiting_ack_irq(); } /* Thermal monitoring depends on APIC, ACPI and clock modulation */ @@ -405,18 +482,20 @@ void intel_init_thermal(struct cpuinfo_x86 *c) */ rdmsr(MSR_IA32_MISC_ENABLE, l, h); + h = lvtthmr_init; /* * The initial value of thermal LVT entries on all APs always reads * 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI * sequence to them and LVT registers are reset to 0s except for * the mask bits which are set to 1s when APs receive INIT IPI. - * Always restore the value that BIOS has programmed on AP based on - * BSP's info we saved since BIOS is always setting the same value - * for all threads/cores + * If BIOS takes over the thermal interrupt and sets its interrupt + * delivery mode to SMI (not fixed), it restores the value that the + * BIOS has programmed on AP based on BSP's info we saved since BIOS + * is always setting the same value for all threads/cores. */ - apic_write(APIC_LVTTHMR, lvtthmr_init); + if ((h & APIC_DM_FIXED_MASK) != APIC_DM_FIXED) + apic_write(APIC_LVTTHMR, lvtthmr_init); - h = lvtthmr_init; if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { printk(KERN_DEBUG @@ -447,9 +526,13 @@ void intel_init_thermal(struct cpuinfo_x86 *c) apic_write(APIC_LVTTHMR, h); rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); - if (cpu_has(c, X86_FEATURE_PLN)) + if (cpu_has(c, X86_FEATURE_PLN) && !int_pln_enable) + wrmsr(MSR_IA32_THERM_INTERRUPT, + (l | (THERM_INT_LOW_ENABLE + | THERM_INT_HIGH_ENABLE)) & ~THERM_INT_PLN_ENABLE, h); + else if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) wrmsr(MSR_IA32_THERM_INTERRUPT, - l | (THERM_INT_LOW_ENABLE + l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE | THERM_INT_PLN_ENABLE), h); else wrmsr(MSR_IA32_THERM_INTERRUPT, @@ -457,9 +540,14 @@ void intel_init_thermal(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_PTS)) { rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); - if (cpu_has(c, X86_FEATURE_PLN)) + if (cpu_has(c, X86_FEATURE_PLN) && !int_pln_enable) wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, - l | (PACKAGE_THERM_INT_LOW_ENABLE + (l | (PACKAGE_THERM_INT_LOW_ENABLE + | PACKAGE_THERM_INT_HIGH_ENABLE)) + & ~PACKAGE_THERM_INT_PLN_ENABLE, h); + else if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) + wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, + l | (PACKAGE_THERM_INT_LOW_ENABLE | PACKAGE_THERM_INT_HIGH_ENABLE | PACKAGE_THERM_INT_PLN_ENABLE), h); else diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c index d746df2909c..7245980186e 100644 --- a/arch/x86/kernel/cpu/mcheck/threshold.c +++ b/arch/x86/kernel/cpu/mcheck/threshold.c @@ -8,6 +8,7 @@ #include <asm/apic.h> #include <asm/idle.h> #include <asm/mce.h> +#include <asm/trace/irq_vectors.h> static void default_threshold_interrupt(void) { @@ -17,13 +18,24 @@ static void default_threshold_interrupt(void) void (*mce_threshold_vector)(void) = default_threshold_interrupt; -asmlinkage void smp_threshold_interrupt(void) +static inline void __smp_threshold_interrupt(void) { - exit_idle(); - irq_enter(); inc_irq_stat(irq_threshold_count); mce_threshold_vector(); - irq_exit(); - /* Ack only at the end to avoid potential reentry */ - ack_APIC_irq(); +} + +asmlinkage __visible void smp_threshold_interrupt(void) +{ + entering_irq(); + __smp_threshold_interrupt(); + exiting_ack_irq(); +} + +asmlinkage __visible void smp_trace_threshold_interrupt(void) +{ + entering_irq(); + trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR); + __smp_threshold_interrupt(); + trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR); + exiting_ack_irq(); } diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c index 54060f56597..7dc5564d0cd 100644 --- a/arch/x86/kernel/cpu/mcheck/winchip.c +++ b/arch/x86/kernel/cpu/mcheck/winchip.c @@ -5,10 +5,8 @@ #include <linux/interrupt.h> #include <linux/kernel.h> #include <linux/types.h> -#include <linux/init.h> #include <asm/processor.h> -#include <asm/system.h> #include <asm/mce.h> #include <asm/msr.h> @@ -16,7 +14,7 @@ static void winchip_machine_check(struct pt_regs *regs, long error_code) { printk(KERN_EMERG "CPU0: Machine Check Exception.\n"); - add_taint(TAINT_MACHINE_CHECK); + add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); } /* Set up machine check reporting on the Winchip C6 series */ diff --git a/arch/x86/kernel/cpu/microcode/Makefile b/arch/x86/kernel/cpu/microcode/Makefile new file mode 100644 index 00000000000..285c85427c3 --- /dev/null +++ b/arch/x86/kernel/cpu/microcode/Makefile @@ -0,0 +1,7 @@ +microcode-y := core.o +obj-$(CONFIG_MICROCODE) += microcode.o +microcode-$(CONFIG_MICROCODE_INTEL) += intel.o intel_lib.o +microcode-$(CONFIG_MICROCODE_AMD) += amd.o +obj-$(CONFIG_MICROCODE_EARLY) += core_early.o +obj-$(CONFIG_MICROCODE_INTEL_EARLY) += intel_early.o +obj-$(CONFIG_MICROCODE_AMD_EARLY) += amd_early.o diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c new file mode 100644 index 00000000000..8fffd845e22 --- /dev/null +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -0,0 +1,492 @@ +/* + * AMD CPU Microcode Update Driver for Linux + * Copyright (C) 2008-2011 Advanced Micro Devices Inc. + * + * Author: Peter Oruba <peter.oruba@amd.com> + * + * Based on work by: + * Tigran Aivazian <tigran@aivazian.fsnet.co.uk> + * + * Maintainers: + * Andreas Herrmann <herrmann.der.user@googlemail.com> + * Borislav Petkov <bp@alien8.de> + * + * This driver allows to upgrade microcode on F10h AMD + * CPUs and later. + * + * Licensed under the terms of the GNU General Public + * License version 2. See file COPYING for details. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/firmware.h> +#include <linux/pci_ids.h> +#include <linux/uaccess.h> +#include <linux/vmalloc.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/pci.h> + +#include <asm/microcode.h> +#include <asm/processor.h> +#include <asm/msr.h> +#include <asm/microcode_amd.h> + +MODULE_DESCRIPTION("AMD Microcode Update Driver"); +MODULE_AUTHOR("Peter Oruba"); +MODULE_LICENSE("GPL v2"); + +static struct equiv_cpu_entry *equiv_cpu_table; + +struct ucode_patch { + struct list_head plist; + void *data; + u32 patch_id; + u16 equiv_cpu; +}; + +static LIST_HEAD(pcache); + +static u16 __find_equiv_id(unsigned int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + return find_equiv_id(equiv_cpu_table, uci->cpu_sig.sig); +} + +static u32 find_cpu_family_by_equiv_cpu(u16 equiv_cpu) +{ + int i = 0; + + BUG_ON(!equiv_cpu_table); + + while (equiv_cpu_table[i].equiv_cpu != 0) { + if (equiv_cpu == equiv_cpu_table[i].equiv_cpu) + return equiv_cpu_table[i].installed_cpu; + i++; + } + return 0; +} + +/* + * a small, trivial cache of per-family ucode patches + */ +static struct ucode_patch *cache_find_patch(u16 equiv_cpu) +{ + struct ucode_patch *p; + + list_for_each_entry(p, &pcache, plist) + if (p->equiv_cpu == equiv_cpu) + return p; + return NULL; +} + +static void update_cache(struct ucode_patch *new_patch) +{ + struct ucode_patch *p; + + list_for_each_entry(p, &pcache, plist) { + if (p->equiv_cpu == new_patch->equiv_cpu) { + if (p->patch_id >= new_patch->patch_id) + /* we already have the latest patch */ + return; + + list_replace(&p->plist, &new_patch->plist); + kfree(p->data); + kfree(p); + return; + } + } + /* no patch found, add it */ + list_add_tail(&new_patch->plist, &pcache); +} + +static void free_cache(void) +{ + struct ucode_patch *p, *tmp; + + list_for_each_entry_safe(p, tmp, &pcache, plist) { + __list_del(p->plist.prev, p->plist.next); + kfree(p->data); + kfree(p); + } +} + +static struct ucode_patch *find_patch(unsigned int cpu) +{ + u16 equiv_id; + + equiv_id = __find_equiv_id(cpu); + if (!equiv_id) + return NULL; + + return cache_find_patch(equiv_id); +} + +static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) +{ + struct cpuinfo_x86 *c = &cpu_data(cpu); + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + struct ucode_patch *p; + + csig->sig = cpuid_eax(0x00000001); + csig->rev = c->microcode; + + /* + * a patch could have been loaded early, set uci->mc so that + * mc_bp_resume() can call apply_microcode() + */ + p = find_patch(cpu); + if (p && (p->patch_id == csig->rev)) + uci->mc = p->data; + + pr_info("CPU%d: patch_level=0x%08x\n", cpu, csig->rev); + + return 0; +} + +static unsigned int verify_patch_size(u8 family, u32 patch_size, + unsigned int size) +{ + u32 max_size; + +#define F1XH_MPB_MAX_SIZE 2048 +#define F14H_MPB_MAX_SIZE 1824 +#define F15H_MPB_MAX_SIZE 4096 +#define F16H_MPB_MAX_SIZE 3458 + + switch (family) { + case 0x14: + max_size = F14H_MPB_MAX_SIZE; + break; + case 0x15: + max_size = F15H_MPB_MAX_SIZE; + break; + case 0x16: + max_size = F16H_MPB_MAX_SIZE; + break; + default: + max_size = F1XH_MPB_MAX_SIZE; + break; + } + + if (patch_size > min_t(u32, size, max_size)) { + pr_err("patch size mismatch\n"); + return 0; + } + + return patch_size; +} + +int __apply_microcode_amd(struct microcode_amd *mc_amd) +{ + u32 rev, dummy; + + native_wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code); + + /* verify patch application was successful */ + native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); + if (rev != mc_amd->hdr.patch_id) + return -1; + + return 0; +} + +int apply_microcode_amd(int cpu) +{ + struct cpuinfo_x86 *c = &cpu_data(cpu); + struct microcode_amd *mc_amd; + struct ucode_cpu_info *uci; + struct ucode_patch *p; + u32 rev, dummy; + + BUG_ON(raw_smp_processor_id() != cpu); + + uci = ucode_cpu_info + cpu; + + p = find_patch(cpu); + if (!p) + return 0; + + mc_amd = p->data; + uci->mc = p->data; + + rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); + + /* need to apply patch? */ + if (rev >= mc_amd->hdr.patch_id) { + c->microcode = rev; + uci->cpu_sig.rev = rev; + return 0; + } + + if (__apply_microcode_amd(mc_amd)) { + pr_err("CPU%d: update failed for patch_level=0x%08x\n", + cpu, mc_amd->hdr.patch_id); + return -1; + } + pr_info("CPU%d: new patch_level=0x%08x\n", cpu, + mc_amd->hdr.patch_id); + + uci->cpu_sig.rev = mc_amd->hdr.patch_id; + c->microcode = mc_amd->hdr.patch_id; + + return 0; +} + +static int install_equiv_cpu_table(const u8 *buf) +{ + unsigned int *ibuf = (unsigned int *)buf; + unsigned int type = ibuf[1]; + unsigned int size = ibuf[2]; + + if (type != UCODE_EQUIV_CPU_TABLE_TYPE || !size) { + pr_err("empty section/" + "invalid type field in container file section header\n"); + return -EINVAL; + } + + equiv_cpu_table = vmalloc(size); + if (!equiv_cpu_table) { + pr_err("failed to allocate equivalent CPU table\n"); + return -ENOMEM; + } + + memcpy(equiv_cpu_table, buf + CONTAINER_HDR_SZ, size); + + /* add header length */ + return size + CONTAINER_HDR_SZ; +} + +static void free_equiv_cpu_table(void) +{ + vfree(equiv_cpu_table); + equiv_cpu_table = NULL; +} + +static void cleanup(void) +{ + free_equiv_cpu_table(); + free_cache(); +} + +/* + * We return the current size even if some of the checks failed so that + * we can skip over the next patch. If we return a negative value, we + * signal a grave error like a memory allocation has failed and the + * driver cannot continue functioning normally. In such cases, we tear + * down everything we've used up so far and exit. + */ +static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover) +{ + struct microcode_header_amd *mc_hdr; + struct ucode_patch *patch; + unsigned int patch_size, crnt_size, ret; + u32 proc_fam; + u16 proc_id; + + patch_size = *(u32 *)(fw + 4); + crnt_size = patch_size + SECTION_HDR_SIZE; + mc_hdr = (struct microcode_header_amd *)(fw + SECTION_HDR_SIZE); + proc_id = mc_hdr->processor_rev_id; + + proc_fam = find_cpu_family_by_equiv_cpu(proc_id); + if (!proc_fam) { + pr_err("No patch family for equiv ID: 0x%04x\n", proc_id); + return crnt_size; + } + + /* check if patch is for the current family */ + proc_fam = ((proc_fam >> 8) & 0xf) + ((proc_fam >> 20) & 0xff); + if (proc_fam != family) + return crnt_size; + + if (mc_hdr->nb_dev_id || mc_hdr->sb_dev_id) { + pr_err("Patch-ID 0x%08x: chipset-specific code unsupported.\n", + mc_hdr->patch_id); + return crnt_size; + } + + ret = verify_patch_size(family, patch_size, leftover); + if (!ret) { + pr_err("Patch-ID 0x%08x: size mismatch.\n", mc_hdr->patch_id); + return crnt_size; + } + + patch = kzalloc(sizeof(*patch), GFP_KERNEL); + if (!patch) { + pr_err("Patch allocation failure.\n"); + return -EINVAL; + } + + patch->data = kzalloc(patch_size, GFP_KERNEL); + if (!patch->data) { + pr_err("Patch data allocation failure.\n"); + kfree(patch); + return -EINVAL; + } + + /* All looks ok, copy patch... */ + memcpy(patch->data, fw + SECTION_HDR_SIZE, patch_size); + INIT_LIST_HEAD(&patch->plist); + patch->patch_id = mc_hdr->patch_id; + patch->equiv_cpu = proc_id; + + pr_debug("%s: Added patch_id: 0x%08x, proc_id: 0x%04x\n", + __func__, patch->patch_id, proc_id); + + /* ... and add to cache. */ + update_cache(patch); + + return crnt_size; +} + +static enum ucode_state __load_microcode_amd(u8 family, const u8 *data, + size_t size) +{ + enum ucode_state ret = UCODE_ERROR; + unsigned int leftover; + u8 *fw = (u8 *)data; + int crnt_size = 0; + int offset; + + offset = install_equiv_cpu_table(data); + if (offset < 0) { + pr_err("failed to create equivalent cpu table\n"); + return ret; + } + fw += offset; + leftover = size - offset; + + if (*(u32 *)fw != UCODE_UCODE_TYPE) { + pr_err("invalid type field in container file section header\n"); + free_equiv_cpu_table(); + return ret; + } + + while (leftover) { + crnt_size = verify_and_add_patch(family, fw, leftover); + if (crnt_size < 0) + return ret; + + fw += crnt_size; + leftover -= crnt_size; + } + + return UCODE_OK; +} + +enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size) +{ + enum ucode_state ret; + + /* free old equiv table */ + free_equiv_cpu_table(); + + ret = __load_microcode_amd(family, data, size); + + if (ret != UCODE_OK) + cleanup(); + +#if defined(CONFIG_MICROCODE_AMD_EARLY) && defined(CONFIG_X86_32) + /* save BSP's matching patch for early load */ + if (cpu_data(smp_processor_id()).cpu_index == boot_cpu_data.cpu_index) { + struct ucode_patch *p = find_patch(smp_processor_id()); + if (p) { + memset(amd_ucode_patch, 0, PATCH_MAX_SIZE); + memcpy(amd_ucode_patch, p->data, min_t(u32, ksize(p->data), + PATCH_MAX_SIZE)); + } + } +#endif + return ret; +} + +/* + * AMD microcode firmware naming convention, up to family 15h they are in + * the legacy file: + * + * amd-ucode/microcode_amd.bin + * + * This legacy file is always smaller than 2K in size. + * + * Beginning with family 15h, they are in family-specific firmware files: + * + * amd-ucode/microcode_amd_fam15h.bin + * amd-ucode/microcode_amd_fam16h.bin + * ... + * + * These might be larger than 2K. + */ +static enum ucode_state request_microcode_amd(int cpu, struct device *device, + bool refresh_fw) +{ + char fw_name[36] = "amd-ucode/microcode_amd.bin"; + struct cpuinfo_x86 *c = &cpu_data(cpu); + enum ucode_state ret = UCODE_NFOUND; + const struct firmware *fw; + + /* reload ucode container only on the boot cpu */ + if (!refresh_fw || c->cpu_index != boot_cpu_data.cpu_index) + return UCODE_OK; + + if (c->x86 >= 0x15) + snprintf(fw_name, sizeof(fw_name), "amd-ucode/microcode_amd_fam%.2xh.bin", c->x86); + + if (request_firmware_direct(&fw, (const char *)fw_name, device)) { + pr_debug("failed to load file %s\n", fw_name); + goto out; + } + + ret = UCODE_ERROR; + if (*(u32 *)fw->data != UCODE_MAGIC) { + pr_err("invalid magic value (0x%08x)\n", *(u32 *)fw->data); + goto fw_release; + } + + ret = load_microcode_amd(c->x86, fw->data, fw->size); + + fw_release: + release_firmware(fw); + + out: + return ret; +} + +static enum ucode_state +request_microcode_user(int cpu, const void __user *buf, size_t size) +{ + return UCODE_ERROR; +} + +static void microcode_fini_cpu_amd(int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + uci->mc = NULL; +} + +static struct microcode_ops microcode_amd_ops = { + .request_microcode_user = request_microcode_user, + .request_microcode_fw = request_microcode_amd, + .collect_cpu_info = collect_cpu_info_amd, + .apply_microcode = apply_microcode_amd, + .microcode_fini_cpu = microcode_fini_cpu_amd, +}; + +struct microcode_ops * __init init_amd_microcode(void) +{ + struct cpuinfo_x86 *c = &cpu_data(0); + + if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) { + pr_warning("AMD CPU family 0x%x not supported\n", c->x86); + return NULL; + } + + return µcode_amd_ops; +} + +void __exit exit_amd_microcode(void) +{ + cleanup(); +} diff --git a/arch/x86/kernel/cpu/microcode/amd_early.c b/arch/x86/kernel/cpu/microcode/amd_early.c new file mode 100644 index 00000000000..617a9e28424 --- /dev/null +++ b/arch/x86/kernel/cpu/microcode/amd_early.c @@ -0,0 +1,395 @@ +/* + * Copyright (C) 2013 Advanced Micro Devices, Inc. + * + * Author: Jacob Shin <jacob.shin@amd.com> + * Fixes: Borislav Petkov <bp@suse.de> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/earlycpio.h> +#include <linux/initrd.h> + +#include <asm/cpu.h> +#include <asm/setup.h> +#include <asm/microcode_amd.h> + +/* + * This points to the current valid container of microcode patches which we will + * save from the initrd before jettisoning its contents. + */ +static u8 *container; +static size_t container_size; + +static u32 ucode_new_rev; +u8 amd_ucode_patch[PATCH_MAX_SIZE]; +static u16 this_equiv_id; + +struct cpio_data ucode_cpio; + +/* + * Microcode patch container file is prepended to the initrd in cpio format. + * See Documentation/x86/early-microcode.txt + */ +static __initdata char ucode_path[] = "kernel/x86/microcode/AuthenticAMD.bin"; + +static struct cpio_data __init find_ucode_in_initrd(void) +{ + long offset = 0; + char *path; + void *start; + size_t size; + +#ifdef CONFIG_X86_32 + struct boot_params *p; + + /* + * On 32-bit, early load occurs before paging is turned on so we need + * to use physical addresses. + */ + p = (struct boot_params *)__pa_nodebug(&boot_params); + path = (char *)__pa_nodebug(ucode_path); + start = (void *)p->hdr.ramdisk_image; + size = p->hdr.ramdisk_size; +#else + path = ucode_path; + start = (void *)(boot_params.hdr.ramdisk_image + PAGE_OFFSET); + size = boot_params.hdr.ramdisk_size; +#endif + + return find_cpio_data(path, start, size, &offset); +} + +static size_t compute_container_size(u8 *data, u32 total_size) +{ + size_t size = 0; + u32 *header = (u32 *)data; + + if (header[0] != UCODE_MAGIC || + header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */ + header[2] == 0) /* size */ + return size; + + size = header[2] + CONTAINER_HDR_SZ; + total_size -= size; + data += size; + + while (total_size) { + u16 patch_size; + + header = (u32 *)data; + + if (header[0] != UCODE_UCODE_TYPE) + break; + + /* + * Sanity-check patch size. + */ + patch_size = header[1]; + if (patch_size > PATCH_MAX_SIZE) + break; + + size += patch_size + SECTION_HDR_SIZE; + data += patch_size + SECTION_HDR_SIZE; + total_size -= patch_size + SECTION_HDR_SIZE; + } + + return size; +} + +/* + * Early load occurs before we can vmalloc(). So we look for the microcode + * patch container file in initrd, traverse equivalent cpu table, look for a + * matching microcode patch, and update, all in initrd memory in place. + * When vmalloc() is available for use later -- on 64-bit during first AP load, + * and on 32-bit during save_microcode_in_initrd_amd() -- we can call + * load_microcode_amd() to save equivalent cpu table and microcode patches in + * kernel heap memory. + */ +static void apply_ucode_in_initrd(void *ucode, size_t size) +{ + struct equiv_cpu_entry *eq; + size_t *cont_sz; + u32 *header; + u8 *data, **cont; + u16 eq_id = 0; + int offset, left; + u32 rev, eax, ebx, ecx, edx; + u32 *new_rev; + +#ifdef CONFIG_X86_32 + new_rev = (u32 *)__pa_nodebug(&ucode_new_rev); + cont_sz = (size_t *)__pa_nodebug(&container_size); + cont = (u8 **)__pa_nodebug(&container); +#else + new_rev = &ucode_new_rev; + cont_sz = &container_size; + cont = &container; +#endif + + data = ucode; + left = size; + header = (u32 *)data; + + /* find equiv cpu table */ + if (header[0] != UCODE_MAGIC || + header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */ + header[2] == 0) /* size */ + return; + + eax = 0x00000001; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + + while (left > 0) { + eq = (struct equiv_cpu_entry *)(data + CONTAINER_HDR_SZ); + + *cont = data; + + /* Advance past the container header */ + offset = header[2] + CONTAINER_HDR_SZ; + data += offset; + left -= offset; + + eq_id = find_equiv_id(eq, eax); + if (eq_id) { + this_equiv_id = eq_id; + *cont_sz = compute_container_size(*cont, left + offset); + + /* + * truncate how much we need to iterate over in the + * ucode update loop below + */ + left = *cont_sz - offset; + break; + } + + /* + * support multiple container files appended together. if this + * one does not have a matching equivalent cpu entry, we fast + * forward to the next container file. + */ + while (left > 0) { + header = (u32 *)data; + if (header[0] == UCODE_MAGIC && + header[1] == UCODE_EQUIV_CPU_TABLE_TYPE) + break; + + offset = header[1] + SECTION_HDR_SIZE; + data += offset; + left -= offset; + } + + /* mark where the next microcode container file starts */ + offset = data - (u8 *)ucode; + ucode = data; + } + + if (!eq_id) { + *cont = NULL; + *cont_sz = 0; + return; + } + + /* find ucode and update if needed */ + + native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax); + + while (left > 0) { + struct microcode_amd *mc; + + header = (u32 *)data; + if (header[0] != UCODE_UCODE_TYPE || /* type */ + header[1] == 0) /* size */ + break; + + mc = (struct microcode_amd *)(data + SECTION_HDR_SIZE); + + if (eq_id == mc->hdr.processor_rev_id && rev < mc->hdr.patch_id) { + + if (!__apply_microcode_amd(mc)) { + rev = mc->hdr.patch_id; + *new_rev = rev; + + /* save ucode patch */ + memcpy(amd_ucode_patch, mc, + min_t(u32, header[1], PATCH_MAX_SIZE)); + } + } + + offset = header[1] + SECTION_HDR_SIZE; + data += offset; + left -= offset; + } +} + +void __init load_ucode_amd_bsp(void) +{ + struct cpio_data cp; + void **data; + size_t *size; + +#ifdef CONFIG_X86_32 + data = (void **)__pa_nodebug(&ucode_cpio.data); + size = (size_t *)__pa_nodebug(&ucode_cpio.size); +#else + data = &ucode_cpio.data; + size = &ucode_cpio.size; +#endif + + cp = find_ucode_in_initrd(); + if (!cp.data) + return; + + *data = cp.data; + *size = cp.size; + + apply_ucode_in_initrd(cp.data, cp.size); +} + +#ifdef CONFIG_X86_32 +/* + * On 32-bit, since AP's early load occurs before paging is turned on, we + * cannot traverse cpu_equiv_table and pcache in kernel heap memory. So during + * cold boot, AP will apply_ucode_in_initrd() just like the BSP. During + * save_microcode_in_initrd_amd() BSP's patch is copied to amd_ucode_patch, + * which is used upon resume from suspend. + */ +void load_ucode_amd_ap(void) +{ + struct microcode_amd *mc; + size_t *usize; + void **ucode; + + mc = (struct microcode_amd *)__pa(amd_ucode_patch); + if (mc->hdr.patch_id && mc->hdr.processor_rev_id) { + __apply_microcode_amd(mc); + return; + } + + ucode = (void *)__pa_nodebug(&container); + usize = (size_t *)__pa_nodebug(&container_size); + + if (!*ucode || !*usize) + return; + + apply_ucode_in_initrd(*ucode, *usize); +} + +static void __init collect_cpu_sig_on_bsp(void *arg) +{ + unsigned int cpu = smp_processor_id(); + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + uci->cpu_sig.sig = cpuid_eax(0x00000001); +} + +static void __init get_bsp_sig(void) +{ + unsigned int bsp = boot_cpu_data.cpu_index; + struct ucode_cpu_info *uci = ucode_cpu_info + bsp; + + if (!uci->cpu_sig.sig) + smp_call_function_single(bsp, collect_cpu_sig_on_bsp, NULL, 1); +} +#else +void load_ucode_amd_ap(void) +{ + unsigned int cpu = smp_processor_id(); + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + struct equiv_cpu_entry *eq; + struct microcode_amd *mc; + u32 rev, eax; + u16 eq_id; + + /* Exit if called on the BSP. */ + if (!cpu) + return; + + if (!container) + return; + + rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax); + + uci->cpu_sig.rev = rev; + uci->cpu_sig.sig = eax; + + eax = cpuid_eax(0x00000001); + eq = (struct equiv_cpu_entry *)(container + CONTAINER_HDR_SZ); + + eq_id = find_equiv_id(eq, eax); + if (!eq_id) + return; + + if (eq_id == this_equiv_id) { + mc = (struct microcode_amd *)amd_ucode_patch; + + if (mc && rev < mc->hdr.patch_id) { + if (!__apply_microcode_amd(mc)) + ucode_new_rev = mc->hdr.patch_id; + } + + } else { + if (!ucode_cpio.data) + return; + + /* + * AP has a different equivalence ID than BSP, looks like + * mixed-steppings silicon so go through the ucode blob anew. + */ + apply_ucode_in_initrd(ucode_cpio.data, ucode_cpio.size); + } +} +#endif + +int __init save_microcode_in_initrd_amd(void) +{ + unsigned long cont; + enum ucode_state ret; + u32 eax; + + if (!container) + return -EINVAL; + +#ifdef CONFIG_X86_32 + get_bsp_sig(); + cont = (unsigned long)container; +#else + /* + * We need the physical address of the container for both bitness since + * boot_params.hdr.ramdisk_image is a physical address. + */ + cont = __pa(container); +#endif + + /* + * Take into account the fact that the ramdisk might get relocated and + * therefore we need to recompute the container's position in virtual + * memory space. + */ + if (relocated_ramdisk) + container = (u8 *)(__va(relocated_ramdisk) + + (cont - boot_params.hdr.ramdisk_image)); + + if (ucode_new_rev) + pr_info("microcode: updated early to new patch_level=0x%08x\n", + ucode_new_rev); + + eax = cpuid_eax(0x00000001); + eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); + + ret = load_microcode_amd(eax, container, container_size); + if (ret != UCODE_OK) + return -EINVAL; + + /* + * This will be freed any msec now, stash patches for the current + * family and switch to patch cache for cpu hotplug, etc later. + */ + container = NULL; + container_size = 0; + + return 0; +} diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c new file mode 100644 index 00000000000..dd9d6190b08 --- /dev/null +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -0,0 +1,651 @@ +/* + * Intel CPU Microcode Update Driver for Linux + * + * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk> + * 2006 Shaohua Li <shaohua.li@intel.com> + * + * This driver allows to upgrade microcode on Intel processors + * belonging to IA-32 family - PentiumPro, Pentium II, + * Pentium III, Xeon, Pentium 4, etc. + * + * Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture + * Software Developer's Manual + * Order Number 253668 or free download from: + * + * http://developer.intel.com/Assets/PDF/manual/253668.pdf + * + * For more information, go to http://www.urbanmyth.org/microcode + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * 1.0 16 Feb 2000, Tigran Aivazian <tigran@sco.com> + * Initial release. + * 1.01 18 Feb 2000, Tigran Aivazian <tigran@sco.com> + * Added read() support + cleanups. + * 1.02 21 Feb 2000, Tigran Aivazian <tigran@sco.com> + * Added 'device trimming' support. open(O_WRONLY) zeroes + * and frees the saved copy of applied microcode. + * 1.03 29 Feb 2000, Tigran Aivazian <tigran@sco.com> + * Made to use devfs (/dev/cpu/microcode) + cleanups. + * 1.04 06 Jun 2000, Simon Trimmer <simon@veritas.com> + * Added misc device support (now uses both devfs and misc). + * Added MICROCODE_IOCFREE ioctl to clear memory. + * 1.05 09 Jun 2000, Simon Trimmer <simon@veritas.com> + * Messages for error cases (non Intel & no suitable microcode). + * 1.06 03 Aug 2000, Tigran Aivazian <tigran@veritas.com> + * Removed ->release(). Removed exclusive open and status bitmap. + * Added microcode_rwsem to serialize read()/write()/ioctl(). + * Removed global kernel lock usage. + * 1.07 07 Sep 2000, Tigran Aivazian <tigran@veritas.com> + * Write 0 to 0x8B msr and then cpuid before reading revision, + * so that it works even if there were no update done by the + * BIOS. Otherwise, reading from 0x8B gives junk (which happened + * to be 0 on my machine which is why it worked even when I + * disabled update by the BIOS) + * Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix. + * 1.08 11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and + * Tigran Aivazian <tigran@veritas.com> + * Intel Pentium 4 processor support and bugfixes. + * 1.09 30 Oct 2001, Tigran Aivazian <tigran@veritas.com> + * Bugfix for HT (Hyper-Threading) enabled processors + * whereby processor resources are shared by all logical processors + * in a single CPU package. + * 1.10 28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and + * Tigran Aivazian <tigran@veritas.com>, + * Serialize updates as required on HT processors due to + * speculative nature of implementation. + * 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com> + * Fix the panic when writing zero-length microcode chunk. + * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>, + * Jun Nakajima <jun.nakajima@intel.com> + * Support for the microcode updates in the new format. + * 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com> + * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl + * because we no longer hold a copy of applied microcode + * in kernel memory. + * 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com> + * Fix sigmatch() macro to handle old CPUs with pf == 0. + * Thanks to Stuart Swales for pointing out this bug. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/platform_device.h> +#include <linux/miscdevice.h> +#include <linux/capability.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/cpu.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/syscore_ops.h> + +#include <asm/microcode.h> +#include <asm/processor.h> +#include <asm/cpu_device_id.h> +#include <asm/perf_event.h> + +MODULE_DESCRIPTION("Microcode Update Driver"); +MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>"); +MODULE_LICENSE("GPL"); + +#define MICROCODE_VERSION "2.00" + +static struct microcode_ops *microcode_ops; + +bool dis_ucode_ldr; +module_param(dis_ucode_ldr, bool, 0); + +/* + * Synchronization. + * + * All non cpu-hotplug-callback call sites use: + * + * - microcode_mutex to synchronize with each other; + * - get/put_online_cpus() to synchronize with + * the cpu-hotplug-callback call sites. + * + * We guarantee that only a single cpu is being + * updated at any particular moment of time. + */ +static DEFINE_MUTEX(microcode_mutex); + +struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; +EXPORT_SYMBOL_GPL(ucode_cpu_info); + +/* + * Operations that are run on a target cpu: + */ + +struct cpu_info_ctx { + struct cpu_signature *cpu_sig; + int err; +}; + +static void collect_cpu_info_local(void *arg) +{ + struct cpu_info_ctx *ctx = arg; + + ctx->err = microcode_ops->collect_cpu_info(smp_processor_id(), + ctx->cpu_sig); +} + +static int collect_cpu_info_on_target(int cpu, struct cpu_signature *cpu_sig) +{ + struct cpu_info_ctx ctx = { .cpu_sig = cpu_sig, .err = 0 }; + int ret; + + ret = smp_call_function_single(cpu, collect_cpu_info_local, &ctx, 1); + if (!ret) + ret = ctx.err; + + return ret; +} + +static int collect_cpu_info(int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + int ret; + + memset(uci, 0, sizeof(*uci)); + + ret = collect_cpu_info_on_target(cpu, &uci->cpu_sig); + if (!ret) + uci->valid = 1; + + return ret; +} + +struct apply_microcode_ctx { + int err; +}; + +static void apply_microcode_local(void *arg) +{ + struct apply_microcode_ctx *ctx = arg; + + ctx->err = microcode_ops->apply_microcode(smp_processor_id()); +} + +static int apply_microcode_on_target(int cpu) +{ + struct apply_microcode_ctx ctx = { .err = 0 }; + int ret; + + ret = smp_call_function_single(cpu, apply_microcode_local, &ctx, 1); + if (!ret) + ret = ctx.err; + + return ret; +} + +#ifdef CONFIG_MICROCODE_OLD_INTERFACE +static int do_microcode_update(const void __user *buf, size_t size) +{ + int error = 0; + int cpu; + + for_each_online_cpu(cpu) { + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + enum ucode_state ustate; + + if (!uci->valid) + continue; + + ustate = microcode_ops->request_microcode_user(cpu, buf, size); + if (ustate == UCODE_ERROR) { + error = -1; + break; + } else if (ustate == UCODE_OK) + apply_microcode_on_target(cpu); + } + + return error; +} + +static int microcode_open(struct inode *inode, struct file *file) +{ + return capable(CAP_SYS_RAWIO) ? nonseekable_open(inode, file) : -EPERM; +} + +static ssize_t microcode_write(struct file *file, const char __user *buf, + size_t len, loff_t *ppos) +{ + ssize_t ret = -EINVAL; + + if ((len >> PAGE_SHIFT) > totalram_pages) { + pr_err("too much data (max %ld pages)\n", totalram_pages); + return ret; + } + + get_online_cpus(); + mutex_lock(µcode_mutex); + + if (do_microcode_update(buf, len) == 0) + ret = (ssize_t)len; + + if (ret > 0) + perf_check_microcode(); + + mutex_unlock(µcode_mutex); + put_online_cpus(); + + return ret; +} + +static const struct file_operations microcode_fops = { + .owner = THIS_MODULE, + .write = microcode_write, + .open = microcode_open, + .llseek = no_llseek, +}; + +static struct miscdevice microcode_dev = { + .minor = MICROCODE_MINOR, + .name = "microcode", + .nodename = "cpu/microcode", + .fops = µcode_fops, +}; + +static int __init microcode_dev_init(void) +{ + int error; + + error = misc_register(µcode_dev); + if (error) { + pr_err("can't misc_register on minor=%d\n", MICROCODE_MINOR); + return error; + } + + return 0; +} + +static void __exit microcode_dev_exit(void) +{ + misc_deregister(µcode_dev); +} + +MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); +MODULE_ALIAS("devname:cpu/microcode"); +#else +#define microcode_dev_init() 0 +#define microcode_dev_exit() do { } while (0) +#endif + +/* fake device for request_firmware */ +static struct platform_device *microcode_pdev; + +static int reload_for_cpu(int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + enum ucode_state ustate; + int err = 0; + + if (!uci->valid) + return err; + + ustate = microcode_ops->request_microcode_fw(cpu, µcode_pdev->dev, true); + if (ustate == UCODE_OK) + apply_microcode_on_target(cpu); + else + if (ustate == UCODE_ERROR) + err = -EINVAL; + return err; +} + +static ssize_t reload_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t size) +{ + unsigned long val; + int cpu; + ssize_t ret = 0, tmp_ret; + + ret = kstrtoul(buf, 0, &val); + if (ret) + return ret; + + if (val != 1) + return size; + + get_online_cpus(); + mutex_lock(µcode_mutex); + for_each_online_cpu(cpu) { + tmp_ret = reload_for_cpu(cpu); + if (tmp_ret != 0) + pr_warn("Error reloading microcode on CPU %d\n", cpu); + + /* save retval of the first encountered reload error */ + if (!ret) + ret = tmp_ret; + } + if (!ret) + perf_check_microcode(); + mutex_unlock(µcode_mutex); + put_online_cpus(); + + if (!ret) + ret = size; + + return ret; +} + +static ssize_t version_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; + + return sprintf(buf, "0x%x\n", uci->cpu_sig.rev); +} + +static ssize_t pf_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; + + return sprintf(buf, "0x%x\n", uci->cpu_sig.pf); +} + +static DEVICE_ATTR(reload, 0200, NULL, reload_store); +static DEVICE_ATTR(version, 0400, version_show, NULL); +static DEVICE_ATTR(processor_flags, 0400, pf_show, NULL); + +static struct attribute *mc_default_attrs[] = { + &dev_attr_version.attr, + &dev_attr_processor_flags.attr, + NULL +}; + +static struct attribute_group mc_attr_group = { + .attrs = mc_default_attrs, + .name = "microcode", +}; + +static void microcode_fini_cpu(int cpu) +{ + microcode_ops->microcode_fini_cpu(cpu); +} + +static enum ucode_state microcode_resume_cpu(int cpu) +{ + pr_debug("CPU%d updated upon resume\n", cpu); + + if (apply_microcode_on_target(cpu)) + return UCODE_ERROR; + + return UCODE_OK; +} + +static enum ucode_state microcode_init_cpu(int cpu, bool refresh_fw) +{ + enum ucode_state ustate; + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + if (uci && uci->valid) + return UCODE_OK; + + if (collect_cpu_info(cpu)) + return UCODE_ERROR; + + /* --dimm. Trigger a delayed update? */ + if (system_state != SYSTEM_RUNNING) + return UCODE_NFOUND; + + ustate = microcode_ops->request_microcode_fw(cpu, µcode_pdev->dev, + refresh_fw); + + if (ustate == UCODE_OK) { + pr_debug("CPU%d updated upon init\n", cpu); + apply_microcode_on_target(cpu); + } + + return ustate; +} + +static enum ucode_state microcode_update_cpu(int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + if (uci->valid) + return microcode_resume_cpu(cpu); + + return microcode_init_cpu(cpu, false); +} + +static int mc_device_add(struct device *dev, struct subsys_interface *sif) +{ + int err, cpu = dev->id; + + if (!cpu_online(cpu)) + return 0; + + pr_debug("CPU%d added\n", cpu); + + err = sysfs_create_group(&dev->kobj, &mc_attr_group); + if (err) + return err; + + if (microcode_init_cpu(cpu, true) == UCODE_ERROR) + return -EINVAL; + + return err; +} + +static int mc_device_remove(struct device *dev, struct subsys_interface *sif) +{ + int cpu = dev->id; + + if (!cpu_online(cpu)) + return 0; + + pr_debug("CPU%d removed\n", cpu); + microcode_fini_cpu(cpu); + sysfs_remove_group(&dev->kobj, &mc_attr_group); + return 0; +} + +static struct subsys_interface mc_cpu_interface = { + .name = "microcode", + .subsys = &cpu_subsys, + .add_dev = mc_device_add, + .remove_dev = mc_device_remove, +}; + +/** + * mc_bp_resume - Update boot CPU microcode during resume. + */ +static void mc_bp_resume(void) +{ + int cpu = smp_processor_id(); + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + if (uci->valid && uci->mc) + microcode_ops->apply_microcode(cpu); +} + +static struct syscore_ops mc_syscore_ops = { + .resume = mc_bp_resume, +}; + +static int +mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + struct device *dev; + + dev = get_cpu_device(cpu); + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_ONLINE: + microcode_update_cpu(cpu); + pr_debug("CPU%d added\n", cpu); + /* + * "break" is missing on purpose here because we want to fall + * through in order to create the sysfs group. + */ + + case CPU_DOWN_FAILED: + if (sysfs_create_group(&dev->kobj, &mc_attr_group)) + pr_err("Failed to create group for CPU%d\n", cpu); + break; + + case CPU_DOWN_PREPARE: + /* Suspend is in progress, only remove the interface */ + sysfs_remove_group(&dev->kobj, &mc_attr_group); + pr_debug("CPU%d removed\n", cpu); + break; + + /* + * case CPU_DEAD: + * + * When a CPU goes offline, don't free up or invalidate the copy of + * the microcode in kernel memory, so that we can reuse it when the + * CPU comes back online without unnecessarily requesting the userspace + * for it again. + */ + } + + /* The CPU refused to come up during a system resume */ + if (action == CPU_UP_CANCELED_FROZEN) + microcode_fini_cpu(cpu); + + return NOTIFY_OK; +} + +static struct notifier_block __refdata mc_cpu_notifier = { + .notifier_call = mc_cpu_callback, +}; + +#ifdef MODULE +/* Autoload on Intel and AMD systems */ +static const struct x86_cpu_id __initconst microcode_id[] = { +#ifdef CONFIG_MICROCODE_INTEL + { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, }, +#endif +#ifdef CONFIG_MICROCODE_AMD + { X86_VENDOR_AMD, X86_FAMILY_ANY, X86_MODEL_ANY, }, +#endif + {} +}; +MODULE_DEVICE_TABLE(x86cpu, microcode_id); +#endif + +static struct attribute *cpu_root_microcode_attrs[] = { + &dev_attr_reload.attr, + NULL +}; + +static struct attribute_group cpu_root_microcode_group = { + .name = "microcode", + .attrs = cpu_root_microcode_attrs, +}; + +static int __init microcode_init(void) +{ + struct cpuinfo_x86 *c = &cpu_data(0); + int error; + + if (dis_ucode_ldr) + return 0; + + if (c->x86_vendor == X86_VENDOR_INTEL) + microcode_ops = init_intel_microcode(); + else if (c->x86_vendor == X86_VENDOR_AMD) + microcode_ops = init_amd_microcode(); + else + pr_err("no support for this CPU vendor\n"); + + if (!microcode_ops) + return -ENODEV; + + microcode_pdev = platform_device_register_simple("microcode", -1, + NULL, 0); + if (IS_ERR(microcode_pdev)) + return PTR_ERR(microcode_pdev); + + get_online_cpus(); + mutex_lock(µcode_mutex); + + error = subsys_interface_register(&mc_cpu_interface); + if (!error) + perf_check_microcode(); + mutex_unlock(µcode_mutex); + put_online_cpus(); + + if (error) + goto out_pdev; + + error = sysfs_create_group(&cpu_subsys.dev_root->kobj, + &cpu_root_microcode_group); + + if (error) { + pr_err("Error creating microcode group!\n"); + goto out_driver; + } + + error = microcode_dev_init(); + if (error) + goto out_ucode_group; + + register_syscore_ops(&mc_syscore_ops); + register_hotcpu_notifier(&mc_cpu_notifier); + + pr_info("Microcode Update Driver: v" MICROCODE_VERSION + " <tigran@aivazian.fsnet.co.uk>, Peter Oruba\n"); + + return 0; + + out_ucode_group: + sysfs_remove_group(&cpu_subsys.dev_root->kobj, + &cpu_root_microcode_group); + + out_driver: + get_online_cpus(); + mutex_lock(µcode_mutex); + + subsys_interface_unregister(&mc_cpu_interface); + + mutex_unlock(µcode_mutex); + put_online_cpus(); + + out_pdev: + platform_device_unregister(microcode_pdev); + return error; + +} +module_init(microcode_init); + +static void __exit microcode_exit(void) +{ + struct cpuinfo_x86 *c = &cpu_data(0); + + microcode_dev_exit(); + + unregister_hotcpu_notifier(&mc_cpu_notifier); + unregister_syscore_ops(&mc_syscore_ops); + + sysfs_remove_group(&cpu_subsys.dev_root->kobj, + &cpu_root_microcode_group); + + get_online_cpus(); + mutex_lock(µcode_mutex); + + subsys_interface_unregister(&mc_cpu_interface); + + mutex_unlock(µcode_mutex); + put_online_cpus(); + + platform_device_unregister(microcode_pdev); + + microcode_ops = NULL; + + if (c->x86_vendor == X86_VENDOR_AMD) + exit_amd_microcode(); + + pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n"); +} +module_exit(microcode_exit); diff --git a/arch/x86/kernel/cpu/microcode/core_early.c b/arch/x86/kernel/cpu/microcode/core_early.c new file mode 100644 index 00000000000..5f28a64e71e --- /dev/null +++ b/arch/x86/kernel/cpu/microcode/core_early.c @@ -0,0 +1,178 @@ +/* + * X86 CPU microcode early update for Linux + * + * Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com> + * H Peter Anvin" <hpa@zytor.com> + * + * This driver allows to early upgrade microcode on Intel processors + * belonging to IA-32 family - PentiumPro, Pentium II, + * Pentium III, Xeon, Pentium 4, etc. + * + * Reference: Section 9.11 of Volume 3, IA-32 Intel Architecture + * Software Developer's Manual. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <linux/module.h> +#include <asm/microcode.h> +#include <asm/microcode_intel.h> +#include <asm/microcode_amd.h> +#include <asm/processor.h> +#include <asm/cmdline.h> + +#define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24)) +#define CPUID_INTEL1 QCHAR('G', 'e', 'n', 'u') +#define CPUID_INTEL2 QCHAR('i', 'n', 'e', 'I') +#define CPUID_INTEL3 QCHAR('n', 't', 'e', 'l') +#define CPUID_AMD1 QCHAR('A', 'u', 't', 'h') +#define CPUID_AMD2 QCHAR('e', 'n', 't', 'i') +#define CPUID_AMD3 QCHAR('c', 'A', 'M', 'D') + +#define CPUID_IS(a, b, c, ebx, ecx, edx) \ + (!((ebx ^ (a))|(edx ^ (b))|(ecx ^ (c)))) + +/* + * In early loading microcode phase on BSP, boot_cpu_data is not set up yet. + * x86_vendor() gets vendor id for BSP. + * + * In 32 bit AP case, accessing boot_cpu_data needs linear address. To simplify + * coding, we still use x86_vendor() to get vendor id for AP. + * + * x86_vendor() gets vendor information directly through cpuid. + */ +static int x86_vendor(void) +{ + u32 eax = 0x00000000; + u32 ebx, ecx = 0, edx; + + native_cpuid(&eax, &ebx, &ecx, &edx); + + if (CPUID_IS(CPUID_INTEL1, CPUID_INTEL2, CPUID_INTEL3, ebx, ecx, edx)) + return X86_VENDOR_INTEL; + + if (CPUID_IS(CPUID_AMD1, CPUID_AMD2, CPUID_AMD3, ebx, ecx, edx)) + return X86_VENDOR_AMD; + + return X86_VENDOR_UNKNOWN; +} + +static int x86_family(void) +{ + u32 eax = 0x00000001; + u32 ebx, ecx = 0, edx; + int x86; + + native_cpuid(&eax, &ebx, &ecx, &edx); + + x86 = (eax >> 8) & 0xf; + if (x86 == 15) + x86 += (eax >> 20) & 0xff; + + return x86; +} + +static bool __init check_loader_disabled_bsp(void) +{ +#ifdef CONFIG_X86_32 + const char *cmdline = (const char *)__pa_nodebug(boot_command_line); + const char *opt = "dis_ucode_ldr"; + const char *option = (const char *)__pa_nodebug(opt); + bool *res = (bool *)__pa_nodebug(&dis_ucode_ldr); + +#else /* CONFIG_X86_64 */ + const char *cmdline = boot_command_line; + const char *option = "dis_ucode_ldr"; + bool *res = &dis_ucode_ldr; +#endif + + if (cmdline_find_option_bool(cmdline, option)) + *res = true; + + return *res; +} + +void __init load_ucode_bsp(void) +{ + int vendor, x86; + + if (check_loader_disabled_bsp()) + return; + + if (!have_cpuid_p()) + return; + + vendor = x86_vendor(); + x86 = x86_family(); + + switch (vendor) { + case X86_VENDOR_INTEL: + if (x86 >= 6) + load_ucode_intel_bsp(); + break; + case X86_VENDOR_AMD: + if (x86 >= 0x10) + load_ucode_amd_bsp(); + break; + default: + break; + } +} + +static bool check_loader_disabled_ap(void) +{ +#ifdef CONFIG_X86_32 + return __pa_nodebug(dis_ucode_ldr); +#else + return dis_ucode_ldr; +#endif +} + +void load_ucode_ap(void) +{ + int vendor, x86; + + if (check_loader_disabled_ap()) + return; + + if (!have_cpuid_p()) + return; + + vendor = x86_vendor(); + x86 = x86_family(); + + switch (vendor) { + case X86_VENDOR_INTEL: + if (x86 >= 6) + load_ucode_intel_ap(); + break; + case X86_VENDOR_AMD: + if (x86 >= 0x10) + load_ucode_amd_ap(); + break; + default: + break; + } +} + +int __init save_microcode_in_initrd(void) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + + switch (c->x86_vendor) { + case X86_VENDOR_INTEL: + if (c->x86 >= 6) + save_microcode_in_initrd_intel(); + break; + case X86_VENDOR_AMD: + if (c->x86 >= 0x10) + save_microcode_in_initrd_amd(); + break; + default: + break; + } + + return 0; +} diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c new file mode 100644 index 00000000000..a276fa75d9b --- /dev/null +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -0,0 +1,333 @@ +/* + * Intel CPU Microcode Update Driver for Linux + * + * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk> + * 2006 Shaohua Li <shaohua.li@intel.com> + * + * This driver allows to upgrade microcode on Intel processors + * belonging to IA-32 family - PentiumPro, Pentium II, + * Pentium III, Xeon, Pentium 4, etc. + * + * Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture + * Software Developer's Manual + * Order Number 253668 or free download from: + * + * http://developer.intel.com/Assets/PDF/manual/253668.pdf + * + * For more information, go to http://www.urbanmyth.org/microcode + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * 1.0 16 Feb 2000, Tigran Aivazian <tigran@sco.com> + * Initial release. + * 1.01 18 Feb 2000, Tigran Aivazian <tigran@sco.com> + * Added read() support + cleanups. + * 1.02 21 Feb 2000, Tigran Aivazian <tigran@sco.com> + * Added 'device trimming' support. open(O_WRONLY) zeroes + * and frees the saved copy of applied microcode. + * 1.03 29 Feb 2000, Tigran Aivazian <tigran@sco.com> + * Made to use devfs (/dev/cpu/microcode) + cleanups. + * 1.04 06 Jun 2000, Simon Trimmer <simon@veritas.com> + * Added misc device support (now uses both devfs and misc). + * Added MICROCODE_IOCFREE ioctl to clear memory. + * 1.05 09 Jun 2000, Simon Trimmer <simon@veritas.com> + * Messages for error cases (non Intel & no suitable microcode). + * 1.06 03 Aug 2000, Tigran Aivazian <tigran@veritas.com> + * Removed ->release(). Removed exclusive open and status bitmap. + * Added microcode_rwsem to serialize read()/write()/ioctl(). + * Removed global kernel lock usage. + * 1.07 07 Sep 2000, Tigran Aivazian <tigran@veritas.com> + * Write 0 to 0x8B msr and then cpuid before reading revision, + * so that it works even if there were no update done by the + * BIOS. Otherwise, reading from 0x8B gives junk (which happened + * to be 0 on my machine which is why it worked even when I + * disabled update by the BIOS) + * Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix. + * 1.08 11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and + * Tigran Aivazian <tigran@veritas.com> + * Intel Pentium 4 processor support and bugfixes. + * 1.09 30 Oct 2001, Tigran Aivazian <tigran@veritas.com> + * Bugfix for HT (Hyper-Threading) enabled processors + * whereby processor resources are shared by all logical processors + * in a single CPU package. + * 1.10 28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and + * Tigran Aivazian <tigran@veritas.com>, + * Serialize updates as required on HT processors due to + * speculative nature of implementation. + * 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com> + * Fix the panic when writing zero-length microcode chunk. + * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>, + * Jun Nakajima <jun.nakajima@intel.com> + * Support for the microcode updates in the new format. + * 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com> + * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl + * because we no longer hold a copy of applied microcode + * in kernel memory. + * 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com> + * Fix sigmatch() macro to handle old CPUs with pf == 0. + * Thanks to Stuart Swales for pointing out this bug. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/firmware.h> +#include <linux/uaccess.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/vmalloc.h> + +#include <asm/microcode_intel.h> +#include <asm/processor.h> +#include <asm/msr.h> + +MODULE_DESCRIPTION("Microcode Update Driver"); +MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>"); +MODULE_LICENSE("GPL"); + +static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) +{ + struct cpuinfo_x86 *c = &cpu_data(cpu_num); + unsigned int val[2]; + + memset(csig, 0, sizeof(*csig)); + + csig->sig = cpuid_eax(0x00000001); + + if ((c->x86_model >= 5) || (c->x86 > 6)) { + /* get processor flags from MSR 0x17 */ + rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); + csig->pf = 1 << ((val[1] >> 18) & 7); + } + + csig->rev = c->microcode; + pr_info("CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n", + cpu_num, csig->sig, csig->pf, csig->rev); + + return 0; +} + +/* + * return 0 - no update found + * return 1 - found update + */ +static int get_matching_mc(struct microcode_intel *mc_intel, int cpu) +{ + struct cpu_signature cpu_sig; + unsigned int csig, cpf, crev; + + collect_cpu_info(cpu, &cpu_sig); + + csig = cpu_sig.sig; + cpf = cpu_sig.pf; + crev = cpu_sig.rev; + + return get_matching_microcode(csig, cpf, mc_intel, crev); +} + +int apply_microcode(int cpu) +{ + struct microcode_intel *mc_intel; + struct ucode_cpu_info *uci; + unsigned int val[2]; + int cpu_num = raw_smp_processor_id(); + struct cpuinfo_x86 *c = &cpu_data(cpu_num); + + uci = ucode_cpu_info + cpu; + mc_intel = uci->mc; + + /* We should bind the task to the CPU */ + BUG_ON(cpu_num != cpu); + + if (mc_intel == NULL) + return 0; + + /* + * Microcode on this CPU could be updated earlier. Only apply the + * microcode patch in mc_intel when it is newer than the one on this + * CPU. + */ + if (get_matching_mc(mc_intel, cpu) == 0) + return 0; + + /* write microcode via MSR 0x79 */ + wrmsr(MSR_IA32_UCODE_WRITE, + (unsigned long) mc_intel->bits, + (unsigned long) mc_intel->bits >> 16 >> 16); + wrmsr(MSR_IA32_UCODE_REV, 0, 0); + + /* As documented in the SDM: Do a CPUID 1 here */ + sync_core(); + + /* get the current revision from MSR 0x8B */ + rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); + + if (val[1] != mc_intel->hdr.rev) { + pr_err("CPU%d update to revision 0x%x failed\n", + cpu_num, mc_intel->hdr.rev); + return -1; + } + pr_info("CPU%d updated to revision 0x%x, date = %04x-%02x-%02x\n", + cpu_num, val[1], + mc_intel->hdr.date & 0xffff, + mc_intel->hdr.date >> 24, + (mc_intel->hdr.date >> 16) & 0xff); + + uci->cpu_sig.rev = val[1]; + c->microcode = val[1]; + + return 0; +} + +static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, + int (*get_ucode_data)(void *, const void *, size_t)) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + u8 *ucode_ptr = data, *new_mc = NULL, *mc = NULL; + int new_rev = uci->cpu_sig.rev; + unsigned int leftover = size; + enum ucode_state state = UCODE_OK; + unsigned int curr_mc_size = 0; + unsigned int csig, cpf; + + while (leftover) { + struct microcode_header_intel mc_header; + unsigned int mc_size; + + if (get_ucode_data(&mc_header, ucode_ptr, sizeof(mc_header))) + break; + + mc_size = get_totalsize(&mc_header); + if (!mc_size || mc_size > leftover) { + pr_err("error! Bad data in microcode data file\n"); + break; + } + + /* For performance reasons, reuse mc area when possible */ + if (!mc || mc_size > curr_mc_size) { + vfree(mc); + mc = vmalloc(mc_size); + if (!mc) + break; + curr_mc_size = mc_size; + } + + if (get_ucode_data(mc, ucode_ptr, mc_size) || + microcode_sanity_check(mc, 1) < 0) { + break; + } + + csig = uci->cpu_sig.sig; + cpf = uci->cpu_sig.pf; + if (get_matching_microcode(csig, cpf, mc, new_rev)) { + vfree(new_mc); + new_rev = mc_header.rev; + new_mc = mc; + mc = NULL; /* trigger new vmalloc */ + } + + ucode_ptr += mc_size; + leftover -= mc_size; + } + + vfree(mc); + + if (leftover) { + vfree(new_mc); + state = UCODE_ERROR; + goto out; + } + + if (!new_mc) { + state = UCODE_NFOUND; + goto out; + } + + vfree(uci->mc); + uci->mc = (struct microcode_intel *)new_mc; + + /* + * If early loading microcode is supported, save this mc into + * permanent memory. So it will be loaded early when a CPU is hot added + * or resumes. + */ + save_mc_for_early(new_mc); + + pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n", + cpu, new_rev, uci->cpu_sig.rev); +out: + return state; +} + +static int get_ucode_fw(void *to, const void *from, size_t n) +{ + memcpy(to, from, n); + return 0; +} + +static enum ucode_state request_microcode_fw(int cpu, struct device *device, + bool refresh_fw) +{ + char name[30]; + struct cpuinfo_x86 *c = &cpu_data(cpu); + const struct firmware *firmware; + enum ucode_state ret; + + sprintf(name, "intel-ucode/%02x-%02x-%02x", + c->x86, c->x86_model, c->x86_mask); + + if (request_firmware_direct(&firmware, name, device)) { + pr_debug("data file %s load failed\n", name); + return UCODE_NFOUND; + } + + ret = generic_load_microcode(cpu, (void *)firmware->data, + firmware->size, &get_ucode_fw); + + release_firmware(firmware); + + return ret; +} + +static int get_ucode_user(void *to, const void *from, size_t n) +{ + return copy_from_user(to, from, n); +} + +static enum ucode_state +request_microcode_user(int cpu, const void __user *buf, size_t size) +{ + return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user); +} + +static void microcode_fini_cpu(int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + vfree(uci->mc); + uci->mc = NULL; +} + +static struct microcode_ops microcode_intel_ops = { + .request_microcode_user = request_microcode_user, + .request_microcode_fw = request_microcode_fw, + .collect_cpu_info = collect_cpu_info, + .apply_microcode = apply_microcode, + .microcode_fini_cpu = microcode_fini_cpu, +}; + +struct microcode_ops * __init init_intel_microcode(void) +{ + struct cpuinfo_x86 *c = &cpu_data(0); + + if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || + cpu_has(c, X86_FEATURE_IA64)) { + pr_err("Intel CPU family 0x%x not supported\n", c->x86); + return NULL; + } + + return µcode_intel_ops; +} + diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c new file mode 100644 index 00000000000..18f739129e7 --- /dev/null +++ b/arch/x86/kernel/cpu/microcode/intel_early.c @@ -0,0 +1,787 @@ +/* + * Intel CPU microcode early update for Linux + * + * Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com> + * H Peter Anvin" <hpa@zytor.com> + * + * This allows to early upgrade microcode on Intel processors + * belonging to IA-32 family - PentiumPro, Pentium II, + * Pentium III, Xeon, Pentium 4, etc. + * + * Reference: Section 9.11 of Volume 3, IA-32 Intel Architecture + * Software Developer's Manual. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/earlycpio.h> +#include <linux/initrd.h> +#include <linux/cpu.h> +#include <asm/msr.h> +#include <asm/microcode_intel.h> +#include <asm/processor.h> +#include <asm/tlbflush.h> +#include <asm/setup.h> + +unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT]; +struct mc_saved_data { + unsigned int mc_saved_count; + struct microcode_intel **mc_saved; +} mc_saved_data; + +static enum ucode_state +generic_load_microcode_early(struct microcode_intel **mc_saved_p, + unsigned int mc_saved_count, + struct ucode_cpu_info *uci) +{ + struct microcode_intel *ucode_ptr, *new_mc = NULL; + int new_rev = uci->cpu_sig.rev; + enum ucode_state state = UCODE_OK; + unsigned int mc_size; + struct microcode_header_intel *mc_header; + unsigned int csig = uci->cpu_sig.sig; + unsigned int cpf = uci->cpu_sig.pf; + int i; + + for (i = 0; i < mc_saved_count; i++) { + ucode_ptr = mc_saved_p[i]; + + mc_header = (struct microcode_header_intel *)ucode_ptr; + mc_size = get_totalsize(mc_header); + if (get_matching_microcode(csig, cpf, ucode_ptr, new_rev)) { + new_rev = mc_header->rev; + new_mc = ucode_ptr; + } + } + + if (!new_mc) { + state = UCODE_NFOUND; + goto out; + } + + uci->mc = (struct microcode_intel *)new_mc; +out: + return state; +} + +static void +microcode_pointer(struct microcode_intel **mc_saved, + unsigned long *mc_saved_in_initrd, + unsigned long initrd_start, int mc_saved_count) +{ + int i; + + for (i = 0; i < mc_saved_count; i++) + mc_saved[i] = (struct microcode_intel *) + (mc_saved_in_initrd[i] + initrd_start); +} + +#ifdef CONFIG_X86_32 +static void +microcode_phys(struct microcode_intel **mc_saved_tmp, + struct mc_saved_data *mc_saved_data) +{ + int i; + struct microcode_intel ***mc_saved; + + mc_saved = (struct microcode_intel ***) + __pa_nodebug(&mc_saved_data->mc_saved); + for (i = 0; i < mc_saved_data->mc_saved_count; i++) { + struct microcode_intel *p; + + p = *(struct microcode_intel **) + __pa_nodebug(mc_saved_data->mc_saved + i); + mc_saved_tmp[i] = (struct microcode_intel *)__pa_nodebug(p); + } +} +#endif + +static enum ucode_state +load_microcode(struct mc_saved_data *mc_saved_data, + unsigned long *mc_saved_in_initrd, + unsigned long initrd_start, + struct ucode_cpu_info *uci) +{ + struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; + unsigned int count = mc_saved_data->mc_saved_count; + + if (!mc_saved_data->mc_saved) { + microcode_pointer(mc_saved_tmp, mc_saved_in_initrd, + initrd_start, count); + + return generic_load_microcode_early(mc_saved_tmp, count, uci); + } else { +#ifdef CONFIG_X86_32 + microcode_phys(mc_saved_tmp, mc_saved_data); + return generic_load_microcode_early(mc_saved_tmp, count, uci); +#else + return generic_load_microcode_early(mc_saved_data->mc_saved, + count, uci); +#endif + } +} + +static u8 get_x86_family(unsigned long sig) +{ + u8 x86; + + x86 = (sig >> 8) & 0xf; + + if (x86 == 0xf) + x86 += (sig >> 20) & 0xff; + + return x86; +} + +static u8 get_x86_model(unsigned long sig) +{ + u8 x86, x86_model; + + x86 = get_x86_family(sig); + x86_model = (sig >> 4) & 0xf; + + if (x86 == 0x6 || x86 == 0xf) + x86_model += ((sig >> 16) & 0xf) << 4; + + return x86_model; +} + +/* + * Given CPU signature and a microcode patch, this function finds if the + * microcode patch has matching family and model with the CPU. + */ +static enum ucode_state +matching_model_microcode(struct microcode_header_intel *mc_header, + unsigned long sig) +{ + u8 x86, x86_model; + u8 x86_ucode, x86_model_ucode; + struct extended_sigtable *ext_header; + unsigned long total_size = get_totalsize(mc_header); + unsigned long data_size = get_datasize(mc_header); + int ext_sigcount, i; + struct extended_signature *ext_sig; + + x86 = get_x86_family(sig); + x86_model = get_x86_model(sig); + + x86_ucode = get_x86_family(mc_header->sig); + x86_model_ucode = get_x86_model(mc_header->sig); + + if (x86 == x86_ucode && x86_model == x86_model_ucode) + return UCODE_OK; + + /* Look for ext. headers: */ + if (total_size <= data_size + MC_HEADER_SIZE) + return UCODE_NFOUND; + + ext_header = (struct extended_sigtable *) + mc_header + data_size + MC_HEADER_SIZE; + ext_sigcount = ext_header->count; + ext_sig = (void *)ext_header + EXT_HEADER_SIZE; + + for (i = 0; i < ext_sigcount; i++) { + x86_ucode = get_x86_family(ext_sig->sig); + x86_model_ucode = get_x86_model(ext_sig->sig); + + if (x86 == x86_ucode && x86_model == x86_model_ucode) + return UCODE_OK; + + ext_sig++; + } + + return UCODE_NFOUND; +} + +static int +save_microcode(struct mc_saved_data *mc_saved_data, + struct microcode_intel **mc_saved_src, + unsigned int mc_saved_count) +{ + int i, j; + struct microcode_intel **mc_saved_p; + int ret; + + if (!mc_saved_count) + return -EINVAL; + + /* + * Copy new microcode data. + */ + mc_saved_p = kmalloc(mc_saved_count*sizeof(struct microcode_intel *), + GFP_KERNEL); + if (!mc_saved_p) + return -ENOMEM; + + for (i = 0; i < mc_saved_count; i++) { + struct microcode_intel *mc = mc_saved_src[i]; + struct microcode_header_intel *mc_header = &mc->hdr; + unsigned long mc_size = get_totalsize(mc_header); + mc_saved_p[i] = kmalloc(mc_size, GFP_KERNEL); + if (!mc_saved_p[i]) { + ret = -ENOMEM; + goto err; + } + if (!mc_saved_src[i]) { + ret = -EINVAL; + goto err; + } + memcpy(mc_saved_p[i], mc, mc_size); + } + + /* + * Point to newly saved microcode. + */ + mc_saved_data->mc_saved = mc_saved_p; + mc_saved_data->mc_saved_count = mc_saved_count; + + return 0; + +err: + for (j = 0; j <= i; j++) + kfree(mc_saved_p[j]); + kfree(mc_saved_p); + + return ret; +} + +/* + * A microcode patch in ucode_ptr is saved into mc_saved + * - if it has matching signature and newer revision compared to an existing + * patch mc_saved. + * - or if it is a newly discovered microcode patch. + * + * The microcode patch should have matching model with CPU. + */ +static void _save_mc(struct microcode_intel **mc_saved, u8 *ucode_ptr, + unsigned int *mc_saved_count_p) +{ + int i; + int found = 0; + unsigned int mc_saved_count = *mc_saved_count_p; + struct microcode_header_intel *mc_header; + + mc_header = (struct microcode_header_intel *)ucode_ptr; + for (i = 0; i < mc_saved_count; i++) { + unsigned int sig, pf; + unsigned int new_rev; + struct microcode_header_intel *mc_saved_header = + (struct microcode_header_intel *)mc_saved[i]; + sig = mc_saved_header->sig; + pf = mc_saved_header->pf; + new_rev = mc_header->rev; + + if (get_matching_sig(sig, pf, ucode_ptr, new_rev)) { + found = 1; + if (update_match_revision(mc_header, new_rev)) { + /* + * Found an older ucode saved before. + * Replace the older one with this newer + * one. + */ + mc_saved[i] = + (struct microcode_intel *)ucode_ptr; + break; + } + } + } + if (i >= mc_saved_count && !found) + /* + * This ucode is first time discovered in ucode file. + * Save it to memory. + */ + mc_saved[mc_saved_count++] = + (struct microcode_intel *)ucode_ptr; + + *mc_saved_count_p = mc_saved_count; +} + +/* + * Get microcode matching with BSP's model. Only CPUs with the same model as + * BSP can stay in the platform. + */ +static enum ucode_state __init +get_matching_model_microcode(int cpu, unsigned long start, + void *data, size_t size, + struct mc_saved_data *mc_saved_data, + unsigned long *mc_saved_in_initrd, + struct ucode_cpu_info *uci) +{ + u8 *ucode_ptr = data; + unsigned int leftover = size; + enum ucode_state state = UCODE_OK; + unsigned int mc_size; + struct microcode_header_intel *mc_header; + struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; + unsigned int mc_saved_count = mc_saved_data->mc_saved_count; + int i; + + while (leftover) { + mc_header = (struct microcode_header_intel *)ucode_ptr; + + mc_size = get_totalsize(mc_header); + if (!mc_size || mc_size > leftover || + microcode_sanity_check(ucode_ptr, 0) < 0) + break; + + leftover -= mc_size; + + /* + * Since APs with same family and model as the BSP may boot in + * the platform, we need to find and save microcode patches + * with the same family and model as the BSP. + */ + if (matching_model_microcode(mc_header, uci->cpu_sig.sig) != + UCODE_OK) { + ucode_ptr += mc_size; + continue; + } + + _save_mc(mc_saved_tmp, ucode_ptr, &mc_saved_count); + + ucode_ptr += mc_size; + } + + if (leftover) { + state = UCODE_ERROR; + goto out; + } + + if (mc_saved_count == 0) { + state = UCODE_NFOUND; + goto out; + } + + for (i = 0; i < mc_saved_count; i++) + mc_saved_in_initrd[i] = (unsigned long)mc_saved_tmp[i] - start; + + mc_saved_data->mc_saved_count = mc_saved_count; +out: + return state; +} + +static int collect_cpu_info_early(struct ucode_cpu_info *uci) +{ + unsigned int val[2]; + u8 x86, x86_model; + struct cpu_signature csig; + unsigned int eax, ebx, ecx, edx; + + csig.sig = 0; + csig.pf = 0; + csig.rev = 0; + + memset(uci, 0, sizeof(*uci)); + + eax = 0x00000001; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + csig.sig = eax; + + x86 = get_x86_family(csig.sig); + x86_model = get_x86_model(csig.sig); + + if ((x86_model >= 5) || (x86 > 6)) { + /* get processor flags from MSR 0x17 */ + native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); + csig.pf = 1 << ((val[1] >> 18) & 7); + } + native_wrmsr(MSR_IA32_UCODE_REV, 0, 0); + + /* As documented in the SDM: Do a CPUID 1 here */ + sync_core(); + + /* get the current revision from MSR 0x8B */ + native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); + + csig.rev = val[1]; + + uci->cpu_sig = csig; + uci->valid = 1; + + return 0; +} + +#ifdef DEBUG +static void __ref show_saved_mc(void) +{ + int i, j; + unsigned int sig, pf, rev, total_size, data_size, date; + struct ucode_cpu_info uci; + + if (mc_saved_data.mc_saved_count == 0) { + pr_debug("no micorcode data saved.\n"); + return; + } + pr_debug("Total microcode saved: %d\n", mc_saved_data.mc_saved_count); + + collect_cpu_info_early(&uci); + + sig = uci.cpu_sig.sig; + pf = uci.cpu_sig.pf; + rev = uci.cpu_sig.rev; + pr_debug("CPU%d: sig=0x%x, pf=0x%x, rev=0x%x\n", + smp_processor_id(), sig, pf, rev); + + for (i = 0; i < mc_saved_data.mc_saved_count; i++) { + struct microcode_header_intel *mc_saved_header; + struct extended_sigtable *ext_header; + int ext_sigcount; + struct extended_signature *ext_sig; + + mc_saved_header = (struct microcode_header_intel *) + mc_saved_data.mc_saved[i]; + sig = mc_saved_header->sig; + pf = mc_saved_header->pf; + rev = mc_saved_header->rev; + total_size = get_totalsize(mc_saved_header); + data_size = get_datasize(mc_saved_header); + date = mc_saved_header->date; + + pr_debug("mc_saved[%d]: sig=0x%x, pf=0x%x, rev=0x%x, toal size=0x%x, date = %04x-%02x-%02x\n", + i, sig, pf, rev, total_size, + date & 0xffff, + date >> 24, + (date >> 16) & 0xff); + + /* Look for ext. headers: */ + if (total_size <= data_size + MC_HEADER_SIZE) + continue; + + ext_header = (struct extended_sigtable *) + mc_saved_header + data_size + MC_HEADER_SIZE; + ext_sigcount = ext_header->count; + ext_sig = (void *)ext_header + EXT_HEADER_SIZE; + + for (j = 0; j < ext_sigcount; j++) { + sig = ext_sig->sig; + pf = ext_sig->pf; + + pr_debug("\tExtended[%d]: sig=0x%x, pf=0x%x\n", + j, sig, pf); + + ext_sig++; + } + + } +} +#else +static inline void show_saved_mc(void) +{ +} +#endif + +#if defined(CONFIG_MICROCODE_INTEL_EARLY) && defined(CONFIG_HOTPLUG_CPU) +static DEFINE_MUTEX(x86_cpu_microcode_mutex); +/* + * Save this mc into mc_saved_data. So it will be loaded early when a CPU is + * hot added or resumes. + * + * Please make sure this mc should be a valid microcode patch before calling + * this function. + */ +int save_mc_for_early(u8 *mc) +{ + struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; + unsigned int mc_saved_count_init; + unsigned int mc_saved_count; + struct microcode_intel **mc_saved; + int ret = 0; + int i; + + /* + * Hold hotplug lock so mc_saved_data is not accessed by a CPU in + * hotplug. + */ + mutex_lock(&x86_cpu_microcode_mutex); + + mc_saved_count_init = mc_saved_data.mc_saved_count; + mc_saved_count = mc_saved_data.mc_saved_count; + mc_saved = mc_saved_data.mc_saved; + + if (mc_saved && mc_saved_count) + memcpy(mc_saved_tmp, mc_saved, + mc_saved_count * sizeof(struct mirocode_intel *)); + /* + * Save the microcode patch mc in mc_save_tmp structure if it's a newer + * version. + */ + + _save_mc(mc_saved_tmp, mc, &mc_saved_count); + + /* + * Save the mc_save_tmp in global mc_saved_data. + */ + ret = save_microcode(&mc_saved_data, mc_saved_tmp, mc_saved_count); + if (ret) { + pr_err("Cannot save microcode patch.\n"); + goto out; + } + + show_saved_mc(); + + /* + * Free old saved microcod data. + */ + if (mc_saved) { + for (i = 0; i < mc_saved_count_init; i++) + kfree(mc_saved[i]); + kfree(mc_saved); + } + +out: + mutex_unlock(&x86_cpu_microcode_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(save_mc_for_early); +#endif + +static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin"; +static __init enum ucode_state +scan_microcode(unsigned long start, unsigned long end, + struct mc_saved_data *mc_saved_data, + unsigned long *mc_saved_in_initrd, + struct ucode_cpu_info *uci) +{ + unsigned int size = end - start + 1; + struct cpio_data cd; + long offset = 0; +#ifdef CONFIG_X86_32 + char *p = (char *)__pa_nodebug(ucode_name); +#else + char *p = ucode_name; +#endif + + cd.data = NULL; + cd.size = 0; + + cd = find_cpio_data(p, (void *)start, size, &offset); + if (!cd.data) + return UCODE_ERROR; + + + return get_matching_model_microcode(0, start, cd.data, cd.size, + mc_saved_data, mc_saved_in_initrd, + uci); +} + +/* + * Print ucode update info. + */ +static void +print_ucode_info(struct ucode_cpu_info *uci, unsigned int date) +{ + int cpu = smp_processor_id(); + + pr_info("CPU%d microcode updated early to revision 0x%x, date = %04x-%02x-%02x\n", + cpu, + uci->cpu_sig.rev, + date & 0xffff, + date >> 24, + (date >> 16) & 0xff); +} + +#ifdef CONFIG_X86_32 + +static int delay_ucode_info; +static int current_mc_date; + +/* + * Print early updated ucode info after printk works. This is delayed info dump. + */ +void show_ucode_info_early(void) +{ + struct ucode_cpu_info uci; + + if (delay_ucode_info) { + collect_cpu_info_early(&uci); + print_ucode_info(&uci, current_mc_date); + delay_ucode_info = 0; + } +} + +/* + * At this point, we can not call printk() yet. Keep microcode patch number in + * mc_saved_data.mc_saved and delay printing microcode info in + * show_ucode_info_early() until printk() works. + */ +static void print_ucode(struct ucode_cpu_info *uci) +{ + struct microcode_intel *mc_intel; + int *delay_ucode_info_p; + int *current_mc_date_p; + + mc_intel = uci->mc; + if (mc_intel == NULL) + return; + + delay_ucode_info_p = (int *)__pa_nodebug(&delay_ucode_info); + current_mc_date_p = (int *)__pa_nodebug(¤t_mc_date); + + *delay_ucode_info_p = 1; + *current_mc_date_p = mc_intel->hdr.date; +} +#else + +/* + * Flush global tlb. We only do this in x86_64 where paging has been enabled + * already and PGE should be enabled as well. + */ +static inline void flush_tlb_early(void) +{ + __native_flush_tlb_global_irq_disabled(); +} + +static inline void print_ucode(struct ucode_cpu_info *uci) +{ + struct microcode_intel *mc_intel; + + mc_intel = uci->mc; + if (mc_intel == NULL) + return; + + print_ucode_info(uci, mc_intel->hdr.date); +} +#endif + +static int apply_microcode_early(struct mc_saved_data *mc_saved_data, + struct ucode_cpu_info *uci) +{ + struct microcode_intel *mc_intel; + unsigned int val[2]; + + mc_intel = uci->mc; + if (mc_intel == NULL) + return 0; + + /* write microcode via MSR 0x79 */ + native_wrmsr(MSR_IA32_UCODE_WRITE, + (unsigned long) mc_intel->bits, + (unsigned long) mc_intel->bits >> 16 >> 16); + native_wrmsr(MSR_IA32_UCODE_REV, 0, 0); + + /* As documented in the SDM: Do a CPUID 1 here */ + sync_core(); + + /* get the current revision from MSR 0x8B */ + native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); + if (val[1] != mc_intel->hdr.rev) + return -1; + +#ifdef CONFIG_X86_64 + /* Flush global tlb. This is precaution. */ + flush_tlb_early(); +#endif + uci->cpu_sig.rev = val[1]; + + print_ucode(uci); + + return 0; +} + +/* + * This function converts microcode patch offsets previously stored in + * mc_saved_in_initrd to pointers and stores the pointers in mc_saved_data. + */ +int __init save_microcode_in_initrd_intel(void) +{ + unsigned int count = mc_saved_data.mc_saved_count; + struct microcode_intel *mc_saved[MAX_UCODE_COUNT]; + int ret = 0; + + if (count == 0) + return ret; + + microcode_pointer(mc_saved, mc_saved_in_initrd, initrd_start, count); + ret = save_microcode(&mc_saved_data, mc_saved, count); + if (ret) + pr_err("Cannot save microcode patches from initrd.\n"); + + show_saved_mc(); + + return ret; +} + +static void __init +_load_ucode_intel_bsp(struct mc_saved_data *mc_saved_data, + unsigned long *mc_saved_in_initrd, + unsigned long initrd_start_early, + unsigned long initrd_end_early, + struct ucode_cpu_info *uci) +{ + collect_cpu_info_early(uci); + scan_microcode(initrd_start_early, initrd_end_early, mc_saved_data, + mc_saved_in_initrd, uci); + load_microcode(mc_saved_data, mc_saved_in_initrd, + initrd_start_early, uci); + apply_microcode_early(mc_saved_data, uci); +} + +void __init +load_ucode_intel_bsp(void) +{ + u64 ramdisk_image, ramdisk_size; + unsigned long initrd_start_early, initrd_end_early; + struct ucode_cpu_info uci; +#ifdef CONFIG_X86_32 + struct boot_params *boot_params_p; + + boot_params_p = (struct boot_params *)__pa_nodebug(&boot_params); + ramdisk_image = boot_params_p->hdr.ramdisk_image; + ramdisk_size = boot_params_p->hdr.ramdisk_size; + initrd_start_early = ramdisk_image; + initrd_end_early = initrd_start_early + ramdisk_size; + + _load_ucode_intel_bsp( + (struct mc_saved_data *)__pa_nodebug(&mc_saved_data), + (unsigned long *)__pa_nodebug(&mc_saved_in_initrd), + initrd_start_early, initrd_end_early, &uci); +#else + ramdisk_image = boot_params.hdr.ramdisk_image; + ramdisk_size = boot_params.hdr.ramdisk_size; + initrd_start_early = ramdisk_image + PAGE_OFFSET; + initrd_end_early = initrd_start_early + ramdisk_size; + + _load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd, + initrd_start_early, initrd_end_early, &uci); +#endif +} + +void load_ucode_intel_ap(void) +{ + struct mc_saved_data *mc_saved_data_p; + struct ucode_cpu_info uci; + unsigned long *mc_saved_in_initrd_p; + unsigned long initrd_start_addr; +#ifdef CONFIG_X86_32 + unsigned long *initrd_start_p; + + mc_saved_in_initrd_p = + (unsigned long *)__pa_nodebug(mc_saved_in_initrd); + mc_saved_data_p = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data); + initrd_start_p = (unsigned long *)__pa_nodebug(&initrd_start); + initrd_start_addr = (unsigned long)__pa_nodebug(*initrd_start_p); +#else + mc_saved_data_p = &mc_saved_data; + mc_saved_in_initrd_p = mc_saved_in_initrd; + initrd_start_addr = initrd_start; +#endif + + /* + * If there is no valid ucode previously saved in memory, no need to + * update ucode on this AP. + */ + if (mc_saved_data_p->mc_saved_count == 0) + return; + + collect_cpu_info_early(&uci); + load_microcode(mc_saved_data_p, mc_saved_in_initrd_p, + initrd_start_addr, &uci); + apply_microcode_early(mc_saved_data_p, &uci); +} diff --git a/arch/x86/kernel/cpu/microcode/intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c new file mode 100644 index 00000000000..ce69320d017 --- /dev/null +++ b/arch/x86/kernel/cpu/microcode/intel_lib.c @@ -0,0 +1,174 @@ +/* + * Intel CPU Microcode Update Driver for Linux + * + * Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com> + * H Peter Anvin" <hpa@zytor.com> + * + * This driver allows to upgrade microcode on Intel processors + * belonging to IA-32 family - PentiumPro, Pentium II, + * Pentium III, Xeon, Pentium 4, etc. + * + * Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture + * Software Developer's Manual + * Order Number 253668 or free download from: + * + * http://developer.intel.com/Assets/PDF/manual/253668.pdf + * + * For more information, go to http://www.urbanmyth.org/microcode + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ +#include <linux/firmware.h> +#include <linux/uaccess.h> +#include <linux/kernel.h> +#include <linux/module.h> + +#include <asm/microcode_intel.h> +#include <asm/processor.h> +#include <asm/msr.h> + +static inline int +update_match_cpu(unsigned int csig, unsigned int cpf, + unsigned int sig, unsigned int pf) +{ + return (!sigmatch(sig, csig, pf, cpf)) ? 0 : 1; +} + +int +update_match_revision(struct microcode_header_intel *mc_header, int rev) +{ + return (mc_header->rev <= rev) ? 0 : 1; +} + +int microcode_sanity_check(void *mc, int print_err) +{ + unsigned long total_size, data_size, ext_table_size; + struct microcode_header_intel *mc_header = mc; + struct extended_sigtable *ext_header = NULL; + int sum, orig_sum, ext_sigcount = 0, i; + struct extended_signature *ext_sig; + + total_size = get_totalsize(mc_header); + data_size = get_datasize(mc_header); + + if (data_size + MC_HEADER_SIZE > total_size) { + if (print_err) + pr_err("error! Bad data size in microcode data file\n"); + return -EINVAL; + } + + if (mc_header->ldrver != 1 || mc_header->hdrver != 1) { + if (print_err) + pr_err("error! Unknown microcode update format\n"); + return -EINVAL; + } + ext_table_size = total_size - (MC_HEADER_SIZE + data_size); + if (ext_table_size) { + if ((ext_table_size < EXT_HEADER_SIZE) + || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) { + if (print_err) + pr_err("error! Small exttable size in microcode data file\n"); + return -EINVAL; + } + ext_header = mc + MC_HEADER_SIZE + data_size; + if (ext_table_size != exttable_size(ext_header)) { + if (print_err) + pr_err("error! Bad exttable size in microcode data file\n"); + return -EFAULT; + } + ext_sigcount = ext_header->count; + } + + /* check extended table checksum */ + if (ext_table_size) { + int ext_table_sum = 0; + int *ext_tablep = (int *)ext_header; + + i = ext_table_size / DWSIZE; + while (i--) + ext_table_sum += ext_tablep[i]; + if (ext_table_sum) { + if (print_err) + pr_warn("aborting, bad extended signature table checksum\n"); + return -EINVAL; + } + } + + /* calculate the checksum */ + orig_sum = 0; + i = (MC_HEADER_SIZE + data_size) / DWSIZE; + while (i--) + orig_sum += ((int *)mc)[i]; + if (orig_sum) { + if (print_err) + pr_err("aborting, bad checksum\n"); + return -EINVAL; + } + if (!ext_table_size) + return 0; + /* check extended signature checksum */ + for (i = 0; i < ext_sigcount; i++) { + ext_sig = (void *)ext_header + EXT_HEADER_SIZE + + EXT_SIGNATURE_SIZE * i; + sum = orig_sum + - (mc_header->sig + mc_header->pf + mc_header->cksum) + + (ext_sig->sig + ext_sig->pf + ext_sig->cksum); + if (sum) { + if (print_err) + pr_err("aborting, bad checksum\n"); + return -EINVAL; + } + } + return 0; +} +EXPORT_SYMBOL_GPL(microcode_sanity_check); + +/* + * return 0 - no update found + * return 1 - found update + */ +int get_matching_sig(unsigned int csig, int cpf, void *mc, int rev) +{ + struct microcode_header_intel *mc_header = mc; + struct extended_sigtable *ext_header; + unsigned long total_size = get_totalsize(mc_header); + int ext_sigcount, i; + struct extended_signature *ext_sig; + + if (update_match_cpu(csig, cpf, mc_header->sig, mc_header->pf)) + return 1; + + /* Look for ext. headers: */ + if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE) + return 0; + + ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE; + ext_sigcount = ext_header->count; + ext_sig = (void *)ext_header + EXT_HEADER_SIZE; + + for (i = 0; i < ext_sigcount; i++) { + if (update_match_cpu(csig, cpf, ext_sig->sig, ext_sig->pf)) + return 1; + ext_sig++; + } + return 0; +} + +/* + * return 0 - no update found + * return 1 - found update + */ +int get_matching_microcode(unsigned int csig, int cpf, void *mc, int rev) +{ + struct microcode_header_intel *mc_header = mc; + + if (!update_match_revision(mc_header, rev)) + return 0; + + return get_matching_sig(csig, cpf, mc, rev); +} +EXPORT_SYMBOL_GPL(get_matching_microcode); diff --git a/arch/x86/kernel/cpu/mkcapflags.pl b/arch/x86/kernel/cpu/mkcapflags.pl deleted file mode 100644 index dfea390e160..00000000000 --- a/arch/x86/kernel/cpu/mkcapflags.pl +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/perl -# -# Generate the x86_cap_flags[] array from include/asm-x86/cpufeature.h -# - -($in, $out) = @ARGV; - -open(IN, "< $in\0") or die "$0: cannot open: $in: $!\n"; -open(OUT, "> $out\0") or die "$0: cannot create: $out: $!\n"; - -print OUT "#include <asm/cpufeature.h>\n\n"; -print OUT "const char * const x86_cap_flags[NCAPINTS*32] = {\n"; - -while (defined($line = <IN>)) { - if ($line =~ /^\s*\#\s*define\s+(X86_FEATURE_(\S+))\s+(.*)$/) { - $macro = $1; - $feature = $2; - $tail = $3; - if ($tail =~ /\/\*\s*\"([^"]*)\".*\*\//) { - $feature = $1; - } - - if ($feature ne '') { - printf OUT "\t%-32s = \"%s\",\n", - "[$macro]", "\L$feature"; - } - } -} -print OUT "};\n"; - -close(IN); -close(OUT); diff --git a/arch/x86/kernel/cpu/mkcapflags.sh b/arch/x86/kernel/cpu/mkcapflags.sh new file mode 100644 index 00000000000..2bf61650549 --- /dev/null +++ b/arch/x86/kernel/cpu/mkcapflags.sh @@ -0,0 +1,41 @@ +#!/bin/sh +# +# Generate the x86_cap_flags[] array from include/asm/cpufeature.h +# + +IN=$1 +OUT=$2 + +TABS="$(printf '\t\t\t\t\t')" +trap 'rm "$OUT"' EXIT + +( + echo "#ifndef _ASM_X86_CPUFEATURE_H" + echo "#include <asm/cpufeature.h>" + echo "#endif" + echo "" + echo "const char * const x86_cap_flags[NCAPINTS*32] = {" + + # Iterate through any input lines starting with #define X86_FEATURE_ + sed -n -e 's/\t/ /g' -e 's/^ *# *define *X86_FEATURE_//p' $IN | + while read i + do + # Name is everything up to the first whitespace + NAME="$(echo "$i" | sed 's/ .*//')" + + # If the /* comment */ starts with a quote string, grab that. + VALUE="$(echo "$i" | sed -n 's@.*/\* *\("[^"]*"\).*\*/@\1@p')" + [ -z "$VALUE" ] && VALUE="\"$NAME\"" + [ "$VALUE" == '""' ] && continue + + # Name is uppercase, VALUE is all lowercase + VALUE="$(echo "$VALUE" | tr A-Z a-z)" + + TABCOUNT=$(( ( 5*8 - 14 - $(echo "$NAME" | wc -c) ) / 8 )) + printf "\t[%s]%.*s = %s,\n" \ + "X86_FEATURE_$NAME" "$TABCOUNT" "$TABS" "$VALUE" + done + echo "};" +) > $OUT + +trap - EXIT diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index d944bf6c50e..a450373e8e9 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -11,31 +11,104 @@ */ #include <linux/types.h> +#include <linux/time.h> +#include <linux/clocksource.h> #include <linux/module.h> +#include <linux/hardirq.h> +#include <linux/efi.h> +#include <linux/interrupt.h> +#include <linux/irq.h> #include <asm/processor.h> #include <asm/hypervisor.h> #include <asm/hyperv.h> #include <asm/mshyperv.h> +#include <asm/desc.h> +#include <asm/idle.h> +#include <asm/irq_regs.h> +#include <asm/i8259.h> +#include <asm/apic.h> +#include <asm/timer.h> struct ms_hyperv_info ms_hyperv; EXPORT_SYMBOL_GPL(ms_hyperv); -static bool __init ms_hyperv_platform(void) +#if IS_ENABLED(CONFIG_HYPERV) +static void (*vmbus_handler)(void); + +void hyperv_vector_handler(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + irq_enter(); + exit_idle(); + + inc_irq_stat(irq_hv_callback_count); + if (vmbus_handler) + vmbus_handler(); + + irq_exit(); + set_irq_regs(old_regs); +} + +void hv_setup_vmbus_irq(void (*handler)(void)) +{ + vmbus_handler = handler; + /* + * Setup the IDT for hypervisor callback. Prevent reallocation + * at module reload. + */ + if (!test_bit(HYPERVISOR_CALLBACK_VECTOR, used_vectors)) + alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, + hyperv_callback_vector); +} + +void hv_remove_vmbus_irq(void) +{ + /* We have no way to deallocate the interrupt gate */ + vmbus_handler = NULL; +} +EXPORT_SYMBOL_GPL(hv_setup_vmbus_irq); +EXPORT_SYMBOL_GPL(hv_remove_vmbus_irq); +#endif + +static uint32_t __init ms_hyperv_platform(void) { u32 eax; u32 hyp_signature[3]; if (!boot_cpu_has(X86_FEATURE_HYPERVISOR)) - return false; + return 0; cpuid(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS, &eax, &hyp_signature[0], &hyp_signature[1], &hyp_signature[2]); - return eax >= HYPERV_CPUID_MIN && - eax <= HYPERV_CPUID_MAX && - !memcmp("Microsoft Hv", hyp_signature, 12); + if (eax >= HYPERV_CPUID_MIN && + eax <= HYPERV_CPUID_MAX && + !memcmp("Microsoft Hv", hyp_signature, 12)) + return HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS; + + return 0; +} + +static cycle_t read_hv_clock(struct clocksource *arg) +{ + cycle_t current_tick; + /* + * Read the partition counter to get the current tick count. This count + * is set to 0 when the partition is created and is incremented in + * 100 nanosecond units. + */ + rdmsrl(HV_X64_MSR_TIME_REF_COUNT, current_tick); + return current_tick; } +static struct clocksource hyperv_cs = { + .name = "hyperv_clocksource", + .rating = 400, /* use this when running on Hyperv*/ + .read = read_hv_clock, + .mask = CLOCKSOURCE_MASK(64), +}; + static void __init ms_hyperv_init_platform(void) { /* @@ -46,6 +119,29 @@ static void __init ms_hyperv_init_platform(void) printk(KERN_INFO "HyperV: features 0x%x, hints 0x%x\n", ms_hyperv.features, ms_hyperv.hints); + +#ifdef CONFIG_X86_LOCAL_APIC + if (ms_hyperv.features & HV_X64_MSR_APIC_FREQUENCY_AVAILABLE) { + /* + * Get the APIC frequency. + */ + u64 hv_lapic_frequency; + + rdmsrl(HV_X64_MSR_APIC_FREQUENCY, hv_lapic_frequency); + hv_lapic_frequency = div_u64(hv_lapic_frequency, HZ); + lapic_timer_frequency = hv_lapic_frequency; + printk(KERN_INFO "HyperV: LAPIC Timer Frequency: %#x\n", + lapic_timer_frequency); + } +#endif + + if (ms_hyperv.features & HV_X64_MSR_TIME_REF_COUNT_AVAILABLE) + clocksource_register_hz(&hyperv_cs, NSEC_PER_SEC/100); + +#ifdef CONFIG_X86_IO_APIC + no_timer_check = 1; +#endif + } const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = { diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index ac140c7be39..5f90b85ff22 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -258,15 +258,15 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk, /* Compute the maximum size with which we can make a range: */ if (range_startk) - max_align = ffs(range_startk) - 1; + max_align = __ffs(range_startk); else - max_align = 32; + max_align = BITS_PER_LONG - 1; - align = fls(range_sizek) - 1; + align = __fls(range_sizek); if (align > max_align) align = max_align; - sizek = 1 << align; + sizek = 1UL << align; if (debug_print) { char start_factor = 'K', size_factor = 'K'; unsigned long start_base, size_base; @@ -714,15 +714,15 @@ int __init mtrr_cleanup(unsigned address_bits) if (mtrr_tom2) x_remove_size = (mtrr_tom2 >> PAGE_SHIFT) - x_remove_base; - nr_range = x86_get_mtrr_mem_range(range, 0, x_remove_base, x_remove_size); /* * [0, 1M) should always be covered by var mtrr with WB * and fixed mtrrs should take effect before var mtrr for it: */ - nr_range = add_range_with_merge(range, RANGE_NUM, nr_range, 0, + nr_range = add_range_with_merge(range, RANGE_NUM, 0, 0, 1ULL<<(20 - PAGE_SHIFT)); - /* Sort the ranges: */ - sort_range(range, nr_range); + /* add from var mtrr at last */ + nr_range = x86_get_mtrr_mem_range(range, nr_range, + x_remove_base, x_remove_size); range_sums = sum_ranges(range, nr_range); printk(KERN_INFO "total RAM covered: %ldM\n", diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c index 68a3343e579..9e451b0876b 100644 --- a/arch/x86/kernel/cpu/mtrr/cyrix.c +++ b/arch/x86/kernel/cpu/mtrr/cyrix.c @@ -167,7 +167,7 @@ static void post_set(void) setCx86(CX86_CCR3, ccr3); /* Enable caches */ - write_cr0(read_cr0() & 0xbfffffff); + write_cr0(read_cr0() & ~X86_CR0_CD); /* Restore value of CR4 */ if (cpu_has_pge) diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 9f27228ceff..0e25a1bc5ab 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -1,6 +1,6 @@ /* * This only handles 32bit MTRR on 32bit hosts. This is strictly wrong - * because MTRRs can span upto 40 bits (36bits on most modern x86) + * because MTRRs can span up to 40 bits (36bits on most modern x86) */ #define DEBUG @@ -12,7 +12,6 @@ #include <asm/processor-flags.h> #include <asm/cpufeature.h> #include <asm/tlbflush.h> -#include <asm/system.h> #include <asm/mtrr.h> #include <asm/msr.h> #include <asm/pat.h> @@ -362,11 +361,7 @@ static void __init print_mtrr_state(void) } pr_debug("MTRR variable ranges %sabled:\n", mtrr_state.enabled & 2 ? "en" : "dis"); - if (size_or_mask & 0xffffffffUL) - high_width = ffs(size_or_mask & 0xffffffffUL) - 1; - else - high_width = ffs(size_or_mask>>32) + 32 - 1; - high_width = (high_width - (32 - PAGE_SHIFT) + 3) / 4; + high_width = (__ffs64(size_or_mask) - (32 - PAGE_SHIFT) + 3) / 4; for (i = 0; i < num_var_ranges; ++i) { if (mtrr_state.var_ranges[i].mask_lo & (1 << 11)) @@ -515,8 +510,9 @@ generic_get_free_region(unsigned long base, unsigned long size, int replace_reg) static void generic_get_mtrr(unsigned int reg, unsigned long *base, unsigned long *size, mtrr_type *type) { - unsigned int mask_lo, mask_hi, base_lo, base_hi; - unsigned int tmp, hi; + u32 mask_lo, mask_hi, base_lo, base_hi; + unsigned int hi; + u64 tmp, mask; /* * get_mtrr doesn't need to update mtrr_state, also it could be called @@ -537,17 +533,18 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi); /* Work out the shifted address mask: */ - tmp = mask_hi << (32 - PAGE_SHIFT) | mask_lo >> PAGE_SHIFT; - mask_lo = size_or_mask | tmp; + tmp = (u64)mask_hi << (32 - PAGE_SHIFT) | mask_lo >> PAGE_SHIFT; + mask = size_or_mask | tmp; /* Expand tmp with high bits to all 1s: */ - hi = fls(tmp); + hi = fls64(tmp); if (hi > 0) { - tmp |= ~((1<<(hi - 1)) - 1); + tmp |= ~((1ULL<<(hi - 1)) - 1); - if (tmp != mask_lo) { + if (tmp != mask) { printk(KERN_WARNING "mtrr: your BIOS has configured an incorrect mask, fixing it.\n"); - mask_lo = tmp; + add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); + mask = tmp; } } @@ -555,8 +552,8 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, * This works correctly if size is a power of two, i.e. a * contiguous range: */ - *size = -mask_lo; - *base = base_hi << (32 - PAGE_SHIFT) | base_lo >> PAGE_SHIFT; + *size = -mask; + *base = (u64)base_hi << (32 - PAGE_SHIFT) | base_lo >> PAGE_SHIFT; *type = base_lo & 0xff; out_put_cpu: @@ -686,6 +683,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock) } /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */ + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); __flush_tlb(); /* Save MTRR state */ @@ -693,18 +691,20 @@ static void prepare_set(void) __acquires(set_atomicity_lock) /* Disable MTRRs, and set the default type to uncached */ mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi); + wbinvd(); } static void post_set(void) __releases(set_atomicity_lock) { /* Flush TLBs (no need to flush caches - they are disabled) */ + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); __flush_tlb(); /* Intel (P6) standard MTRRs */ mtrr_wrmsr(MSR_MTRRdefType, deftype_lo, deftype_hi); /* Enable caches */ - write_cr0(read_cr0() & 0xbfffffff); + write_cr0(read_cr0() & ~X86_CR0_CD); /* Restore value of CR4 */ if (cpu_has_pge) diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c index 79289632cb2..a041e094b8b 100644 --- a/arch/x86/kernel/cpu/mtrr/if.c +++ b/arch/x86/kernel/cpu/mtrr/if.c @@ -167,6 +167,7 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) { int err = 0; mtrr_type type; + unsigned long base; unsigned long size; struct mtrr_sentry sentry; struct mtrr_gentry gentry; @@ -267,14 +268,14 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) #endif if (gentry.regnum >= num_var_ranges) return -EINVAL; - mtrr_if->get(gentry.regnum, &gentry.base, &size, &type); + mtrr_if->get(gentry.regnum, &base, &size, &type); /* Hide entries that go above 4GB */ - if (gentry.base + size - 1 >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT)) + if (base + size - 1 >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT)) || size >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT))) gentry.base = gentry.size = gentry.type = 0; else { - gentry.base <<= PAGE_SHIFT; + gentry.base = base << PAGE_SHIFT; gentry.size = size << PAGE_SHIFT; gentry.type = type; } @@ -321,11 +322,12 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) #endif if (gentry.regnum >= num_var_ranges) return -EINVAL; - mtrr_if->get(gentry.regnum, &gentry.base, &size, &type); + mtrr_if->get(gentry.regnum, &base, &size, &type); /* Hide entries that would overflow */ if (size != (__typeof__(gentry.size))size) gentry.base = gentry.size = gentry.type = 0; else { + gentry.base = base; gentry.size = size; gentry.type = type; } diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 01c0f3ee6cc..f961de9964c 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -45,14 +45,19 @@ #include <linux/cpu.h> #include <linux/pci.h> #include <linux/smp.h> +#include <linux/syscore_ops.h> #include <asm/processor.h> #include <asm/e820.h> #include <asm/mtrr.h> #include <asm/msr.h> +#include <asm/pat.h> #include "mtrr.h" +/* arch_phys_wc_add returns an MTRR register index plus this offset. */ +#define MTRR_TO_PHYS_WC_OFFSET 1000 + u32 num_var_ranges; unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; @@ -78,7 +83,6 @@ void set_mtrr_ops(const struct mtrr_ops *ops) static int have_wrcomb(void) { struct pci_dev *dev; - u8 rev; dev = pci_get_class(PCI_CLASS_BRIDGE_HOST << 8, NULL); if (dev != NULL) { @@ -88,13 +92,11 @@ static int have_wrcomb(void) * chipsets to be tagged */ if (dev->vendor == PCI_VENDOR_ID_SERVERWORKS && - dev->device == PCI_DEVICE_ID_SERVERWORKS_LE) { - pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev); - if (rev <= 5) { - pr_info("mtrr: Serverworks LE rev < 6 detected. Write-combining disabled.\n"); - pci_dev_put(dev); - return 0; - } + dev->device == PCI_DEVICE_ID_SERVERWORKS_LE && + dev->revision <= 5) { + pr_info("mtrr: Serverworks LE rev < 6 detected. Write-combining disabled.\n"); + pci_dev_put(dev); + return 0; } /* * Intel 450NX errata # 23. Non ascending cacheline evictions to @@ -136,56 +138,42 @@ static void __init init_table(void) } struct set_mtrr_data { - atomic_t count; - atomic_t gate; unsigned long smp_base; unsigned long smp_size; unsigned int smp_reg; mtrr_type smp_type; }; -static DEFINE_PER_CPU(struct cpu_stop_work, mtrr_work); - /** - * mtrr_work_handler - Synchronisation handler. Executed by "other" CPUs. + * mtrr_rendezvous_handler - Work done in the synchronization handler. Executed + * by all the CPUs. * @info: pointer to mtrr configuration data * * Returns nothing. */ -static int mtrr_work_handler(void *info) +static int mtrr_rendezvous_handler(void *info) { -#ifdef CONFIG_SMP struct set_mtrr_data *data = info; - unsigned long flags; - atomic_dec(&data->count); - while (!atomic_read(&data->gate)) - cpu_relax(); - - local_irq_save(flags); - - atomic_dec(&data->count); - while (atomic_read(&data->gate)) - cpu_relax(); - - /* The master has cleared me to execute */ + /* + * We use this same function to initialize the mtrrs during boot, + * resume, runtime cpu online and on an explicit request to set a + * specific MTRR. + * + * During boot or suspend, the state of the boot cpu's mtrrs has been + * saved, and we want to replicate that across all the cpus that come + * online (either at the end of boot or resume or during a runtime cpu + * online). If we're doing that, @reg is set to something special and on + * all the cpu's we do mtrr_if->set_all() (On the logical cpu that + * started the boot/resume sequence, this might be a duplicate + * set_all()). + */ if (data->smp_reg != ~0U) { mtrr_if->set(data->smp_reg, data->smp_base, data->smp_size, data->smp_type); - } else if (mtrr_aps_delayed_init) { - /* - * Initialize the MTRRs inaddition to the synchronisation. - */ + } else if (mtrr_aps_delayed_init || !cpu_online(smp_processor_id())) { mtrr_if->set_all(); } - - atomic_dec(&data->count); - while (!atomic_read(&data->gate)) - cpu_relax(); - - atomic_dec(&data->count); - local_irq_restore(flags); -#endif return 0; } @@ -222,20 +210,11 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2) * 14. Wait for buddies to catch up * 15. Enable interrupts. * - * What does that mean for us? Well, first we set data.count to the number - * of CPUs. As each CPU announces that it started the rendezvous handler by - * decrementing the count, We reset data.count and set the data.gate flag - * allowing all the cpu's to proceed with the work. As each cpu disables - * interrupts, it'll decrement data.count once. We wait until it hits 0 and - * proceed. We clear the data.gate flag and reset data.count. Meanwhile, they - * are waiting for that flag to be cleared. Once it's cleared, each - * CPU goes through the transition of updating MTRRs. - * The CPU vendors may each do it differently, - * so we call mtrr_if->set() callback and let them take care of it. - * When they're done, they again decrement data->count and wait for data.gate - * to be set. - * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag - * Everyone then enables interrupts and we all continue on. + * What does that mean for us? Well, stop_machine() will ensure that + * the rendezvous handler is started on each CPU. And in lockstep they + * do the state transition of disabling interrupts, updating MTRR's + * (the CPU vendors may each do it differently, so we call mtrr_if->set() + * callback and let them take care of it.) and enabling interrupts. * * Note that the mechanism is the same for UP systems, too; all the SMP stuff * becomes nops. @@ -243,82 +222,26 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2) static void set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type) { - struct set_mtrr_data data; - unsigned long flags; - int cpu; - - preempt_disable(); - - data.smp_reg = reg; - data.smp_base = base; - data.smp_size = size; - data.smp_type = type; - atomic_set(&data.count, num_booting_cpus() - 1); - - /* Make sure data.count is visible before unleashing other CPUs */ - smp_wmb(); - atomic_set(&data.gate, 0); - - /* Start the ball rolling on other CPUs */ - for_each_online_cpu(cpu) { - struct cpu_stop_work *work = &per_cpu(mtrr_work, cpu); + struct set_mtrr_data data = { .smp_reg = reg, + .smp_base = base, + .smp_size = size, + .smp_type = type + }; - if (cpu == smp_processor_id()) - continue; - - stop_one_cpu_nowait(cpu, mtrr_work_handler, &data, work); - } - - - while (atomic_read(&data.count)) - cpu_relax(); - - /* Ok, reset count and toggle gate */ - atomic_set(&data.count, num_booting_cpus() - 1); - smp_wmb(); - atomic_set(&data.gate, 1); - - local_irq_save(flags); - - while (atomic_read(&data.count)) - cpu_relax(); - - /* Ok, reset count and toggle gate */ - atomic_set(&data.count, num_booting_cpus() - 1); - smp_wmb(); - atomic_set(&data.gate, 0); - - /* Do our MTRR business */ - - /* - * HACK! - * We use this same function to initialize the mtrrs on boot. - * The state of the boot cpu's mtrrs has been saved, and we want - * to replicate across all the APs. - * If we're doing that @reg is set to something special... - */ - if (reg != ~0U) - mtrr_if->set(reg, base, size, type); - else if (!mtrr_aps_delayed_init) - mtrr_if->set_all(); - - /* Wait for the others */ - while (atomic_read(&data.count)) - cpu_relax(); - - atomic_set(&data.count, num_booting_cpus() - 1); - smp_wmb(); - atomic_set(&data.gate, 1); - - /* - * Wait here for everyone to have seen the gate change - * So we're the last ones to touch 'data' - */ - while (atomic_read(&data.count)) - cpu_relax(); + stop_machine(mtrr_rendezvous_handler, &data, cpu_online_mask); +} - local_irq_restore(flags); - preempt_enable(); +static void set_mtrr_from_inactive_cpu(unsigned int reg, unsigned long base, + unsigned long size, mtrr_type type) +{ + struct set_mtrr_data data = { .smp_reg = reg, + .smp_base = base, + .smp_size = size, + .smp_type = type + }; + + stop_machine_from_inactive_cpu(mtrr_rendezvous_handler, &data, + cpu_callout_mask); } /** @@ -386,7 +309,8 @@ int mtrr_add_page(unsigned long base, unsigned long size, return -EINVAL; } - if (base & size_or_mask || size & size_or_mask) { + if ((base | (base + size - 1)) >> + (boot_cpu_data.x86_phys_bits - PAGE_SHIFT)) { pr_warning("mtrr: base or size exceeds the MTRR width\n"); return -EINVAL; } @@ -605,6 +529,73 @@ int mtrr_del(int reg, unsigned long base, unsigned long size) } EXPORT_SYMBOL(mtrr_del); +/** + * arch_phys_wc_add - add a WC MTRR and handle errors if PAT is unavailable + * @base: Physical base address + * @size: Size of region + * + * If PAT is available, this does nothing. If PAT is unavailable, it + * attempts to add a WC MTRR covering size bytes starting at base and + * logs an error if this fails. + * + * Drivers must store the return value to pass to mtrr_del_wc_if_needed, + * but drivers should not try to interpret that return value. + */ +int arch_phys_wc_add(unsigned long base, unsigned long size) +{ + int ret; + + if (pat_enabled) + return 0; /* Success! (We don't need to do anything.) */ + + ret = mtrr_add(base, size, MTRR_TYPE_WRCOMB, true); + if (ret < 0) { + pr_warn("Failed to add WC MTRR for [%p-%p]; performance may suffer.", + (void *)base, (void *)(base + size - 1)); + return ret; + } + return ret + MTRR_TO_PHYS_WC_OFFSET; +} +EXPORT_SYMBOL(arch_phys_wc_add); + +/* + * arch_phys_wc_del - undoes arch_phys_wc_add + * @handle: Return value from arch_phys_wc_add + * + * This cleans up after mtrr_add_wc_if_needed. + * + * The API guarantees that mtrr_del_wc_if_needed(error code) and + * mtrr_del_wc_if_needed(0) do nothing. + */ +void arch_phys_wc_del(int handle) +{ + if (handle >= 1) { + WARN_ON(handle < MTRR_TO_PHYS_WC_OFFSET); + mtrr_del(handle - MTRR_TO_PHYS_WC_OFFSET, 0, 0); + } +} +EXPORT_SYMBOL(arch_phys_wc_del); + +/* + * phys_wc_to_mtrr_index - translates arch_phys_wc_add's return value + * @handle: Return value from arch_phys_wc_add + * + * This will turn the return value from arch_phys_wc_add into an mtrr + * index suitable for debugging. + * + * Note: There is no legitimate use for this function, except possibly + * in printk line. Alas there is an illegitimate use in some ancient + * drm ioctls. + */ +int phys_wc_to_mtrr_index(int handle) +{ + if (handle < MTRR_TO_PHYS_WC_OFFSET) + return -1; + else + return handle - MTRR_TO_PHYS_WC_OFFSET; +} +EXPORT_SYMBOL_GPL(phys_wc_to_mtrr_index); + /* * HACK ALERT! * These should be called implicitly, but we can't yet until all the initcall @@ -630,7 +621,7 @@ struct mtrr_value { static struct mtrr_value mtrr_value[MTRR_MAX_VAR_RANGES]; -static int mtrr_save(struct sys_device *sysdev, pm_message_t state) +static int mtrr_save(void) { int i; @@ -642,7 +633,7 @@ static int mtrr_save(struct sys_device *sysdev, pm_message_t state) return 0; } -static int mtrr_restore(struct sys_device *sysdev) +static void mtrr_restore(void) { int i; @@ -653,18 +644,18 @@ static int mtrr_restore(struct sys_device *sysdev) mtrr_value[i].ltype); } } - return 0; } -static struct sysdev_driver mtrr_sysdev_driver = { +static struct syscore_ops mtrr_syscore_ops = { .suspend = mtrr_save, .resume = mtrr_restore, }; int __initdata changed_by_mtrr_cleanup; +#define SIZE_OR_MASK_BITS(n) (~((1ULL << ((n) - PAGE_SHIFT)) - 1)) /** * mtrr_bp_init - initialize mtrrs on the boot CPU * @@ -682,13 +673,13 @@ void __init mtrr_bp_init(void) if (cpu_has_mtrr) { mtrr_if = &generic_mtrr_ops; - size_or_mask = 0xff000000; /* 36 bits */ + size_or_mask = SIZE_OR_MASK_BITS(36); size_and_mask = 0x00f00000; phys_addr = 36; /* * This is an AMD specific MSR, but we assume(hope?) that - * Intel will implement it to when they extend the address + * Intel will implement it too when they extend the address * bus of the Xeon. */ if (cpuid_eax(0x80000000) >= 0x80000008) { @@ -701,7 +692,7 @@ void __init mtrr_bp_init(void) boot_cpu_data.x86_mask == 0x4)) phys_addr = 36; - size_or_mask = ~((1ULL << (phys_addr - PAGE_SHIFT)) - 1); + size_or_mask = SIZE_OR_MASK_BITS(phys_addr); size_and_mask = ~size_or_mask & 0xfffff00000ULL; } else if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR && boot_cpu_data.x86 == 6) { @@ -709,7 +700,7 @@ void __init mtrr_bp_init(void) * VIA C* family have Intel style MTRRs, * but don't support PAE */ - size_or_mask = 0xfff00000; /* 32 bits */ + size_or_mask = SIZE_OR_MASK_BITS(32); size_and_mask = 0; phys_addr = 32; } @@ -719,21 +710,21 @@ void __init mtrr_bp_init(void) if (cpu_has_k6_mtrr) { /* Pre-Athlon (K6) AMD CPU MTRRs */ mtrr_if = mtrr_ops[X86_VENDOR_AMD]; - size_or_mask = 0xfff00000; /* 32 bits */ + size_or_mask = SIZE_OR_MASK_BITS(32); size_and_mask = 0; } break; case X86_VENDOR_CENTAUR: if (cpu_has_centaur_mcr) { mtrr_if = mtrr_ops[X86_VENDOR_CENTAUR]; - size_or_mask = 0xfff00000; /* 32 bits */ + size_or_mask = SIZE_OR_MASK_BITS(32); size_and_mask = 0; } break; case X86_VENDOR_CYRIX: if (cpu_has_cyrix_arr) { mtrr_if = mtrr_ops[X86_VENDOR_CYRIX]; - size_or_mask = 0xfff00000; /* 32 bits */ + size_or_mask = SIZE_OR_MASK_BITS(32); size_and_mask = 0; } break; @@ -773,15 +764,20 @@ void mtrr_ap_init(void) * 2. cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug * lock to prevent mtrr entry changes */ - set_mtrr(~0U, 0, 0, 0); + set_mtrr_from_inactive_cpu(~0U, 0, 0, 0); } /** - * Save current fixed-range MTRR state of the BSP + * Save current fixed-range MTRR state of the first cpu in cpu_online_mask. */ void mtrr_save_state(void) { - smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1); + int first_cpu; + + get_online_cpus(); + first_cpu = cpumask_first(cpu_online_mask); + smp_call_function_single(first_cpu, mtrr_save_fixed_ranges, NULL, 1); + put_online_cpus(); } void set_mtrr_aps_delayed_init(void) @@ -793,13 +789,21 @@ void set_mtrr_aps_delayed_init(void) } /* - * MTRR initialization for all AP's + * Delayed MTRR initialization for all AP's */ void mtrr_aps_init(void) { if (!use_intel()) return; + /* + * Check if someone has requested the delay of AP MTRR initialization, + * by doing set_mtrr_aps_delayed_init(), prior to this point. If not, + * then we are done. + */ + if (!mtrr_aps_delayed_init) + return; + set_mtrr(~0U, 0, 0, 0); mtrr_aps_delayed_init = false; } @@ -831,7 +835,7 @@ static int __init mtrr_init_finialize(void) * TBD: is there any system with such CPU which supports * suspend/resume? If no, we should remove the code. */ - sysdev_driver_register(&cpu_sysdev_class, &mtrr_sysdev_driver); + register_syscore_ops(&mtrr_syscore_ops); return 0; } diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index ed6310183ef..2879ecdaac4 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -22,252 +22,32 @@ #include <linux/sched.h> #include <linux/uaccess.h> #include <linux/slab.h> -#include <linux/highmem.h> #include <linux/cpu.h> #include <linux/bitops.h> +#include <linux/device.h> #include <asm/apic.h> #include <asm/stacktrace.h> #include <asm/nmi.h> -#include <asm/compat.h> - -#if 0 -#undef wrmsrl -#define wrmsrl(msr, val) \ -do { \ - trace_printk("wrmsrl(%lx, %lx)\n", (unsigned long)(msr),\ - (unsigned long)(val)); \ - native_write_msr((msr), (u32)((u64)(val)), \ - (u32)((u64)(val) >> 32)); \ -} while (0) -#endif - -/* - * best effort, GUP based copy_from_user() that assumes IRQ or NMI context - */ -static unsigned long -copy_from_user_nmi(void *to, const void __user *from, unsigned long n) -{ - unsigned long offset, addr = (unsigned long)from; - unsigned long size, len = 0; - struct page *page; - void *map; - int ret; - - do { - ret = __get_user_pages_fast(addr, 1, 0, &page); - if (!ret) - break; - - offset = addr & (PAGE_SIZE - 1); - size = min(PAGE_SIZE - offset, n - len); - - map = kmap_atomic(page); - memcpy(to, map+offset, size); - kunmap_atomic(map); - put_page(page); - - len += size; - to += size; - addr += size; - - } while (len < n); - - return len; -} - -struct event_constraint { - union { - unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; - u64 idxmsk64; - }; - u64 code; - u64 cmask; - int weight; -}; - -struct amd_nb { - int nb_id; /* NorthBridge id */ - int refcnt; /* reference count */ - struct perf_event *owners[X86_PMC_IDX_MAX]; - struct event_constraint event_constraints[X86_PMC_IDX_MAX]; -}; - -#define MAX_LBR_ENTRIES 16 - -struct cpu_hw_events { - /* - * Generic x86 PMC bits - */ - struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */ - unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; - unsigned long running[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; - int enabled; - - int n_events; - int n_added; - int n_txn; - int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */ - u64 tags[X86_PMC_IDX_MAX]; - struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */ - - unsigned int group_flag; - - /* - * Intel DebugStore bits - */ - struct debug_store *ds; - u64 pebs_enabled; - - /* - * Intel LBR bits - */ - int lbr_users; - void *lbr_context; - struct perf_branch_stack lbr_stack; - struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; - - /* - * AMD specific bits - */ - struct amd_nb *amd_nb; -}; - -#define __EVENT_CONSTRAINT(c, n, m, w) {\ - { .idxmsk64 = (n) }, \ - .code = (c), \ - .cmask = (m), \ - .weight = (w), \ -} - -#define EVENT_CONSTRAINT(c, n, m) \ - __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n)) - -/* - * Constraint on the Event code. - */ -#define INTEL_EVENT_CONSTRAINT(c, n) \ - EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT) - -/* - * Constraint on the Event code + UMask + fixed-mask - * - * filter mask to validate fixed counter events. - * the following filters disqualify for fixed counters: - * - inv - * - edge - * - cnt-mask - * The other filters are supported by fixed counters. - * The any-thread option is supported starting with v3. - */ -#define FIXED_EVENT_CONSTRAINT(c, n) \ - EVENT_CONSTRAINT(c, (1ULL << (32+n)), X86_RAW_EVENT_MASK) +#include <asm/smp.h> +#include <asm/alternative.h> +#include <asm/timer.h> +#include <asm/desc.h> +#include <asm/ldt.h> -/* - * Constraint on the Event code + UMask - */ -#define PEBS_EVENT_CONSTRAINT(c, n) \ - EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK) - -#define EVENT_CONSTRAINT_END \ - EVENT_CONSTRAINT(0, 0, 0) - -#define for_each_event_constraint(e, c) \ - for ((e) = (c); (e)->weight; (e)++) - -union perf_capabilities { - struct { - u64 lbr_format : 6; - u64 pebs_trap : 1; - u64 pebs_arch_reg : 1; - u64 pebs_format : 4; - u64 smm_freeze : 1; - }; - u64 capabilities; -}; - -/* - * struct x86_pmu - generic x86 pmu - */ -struct x86_pmu { - /* - * Generic x86 PMC bits - */ - const char *name; - int version; - int (*handle_irq)(struct pt_regs *); - void (*disable_all)(void); - void (*enable_all)(int added); - void (*enable)(struct perf_event *); - void (*disable)(struct perf_event *); - int (*hw_config)(struct perf_event *event); - int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign); - unsigned eventsel; - unsigned perfctr; - u64 (*event_map)(int); - int max_events; - int num_counters; - int num_counters_fixed; - int cntval_bits; - u64 cntval_mask; - int apic; - u64 max_period; - struct event_constraint * - (*get_event_constraints)(struct cpu_hw_events *cpuc, - struct perf_event *event); - - void (*put_event_constraints)(struct cpu_hw_events *cpuc, - struct perf_event *event); - struct event_constraint *event_constraints; - void (*quirks)(void); - int perfctr_second_write; - - int (*cpu_prepare)(int cpu); - void (*cpu_starting)(int cpu); - void (*cpu_dying)(int cpu); - void (*cpu_dead)(int cpu); +#include "perf_event.h" - /* - * Intel Arch Perfmon v2+ - */ - u64 intel_ctrl; - union perf_capabilities intel_cap; - - /* - * Intel DebugStore bits - */ - int bts, pebs; - int bts_active, pebs_active; - int pebs_record_size; - void (*drain_pebs)(struct pt_regs *regs); - struct event_constraint *pebs_constraints; +struct x86_pmu x86_pmu __read_mostly; - /* - * Intel LBR - */ - unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */ - int lbr_nr; /* hardware stack size */ -}; - -static struct x86_pmu x86_pmu __read_mostly; - -static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { +DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { .enabled = 1, }; -static int x86_perf_event_set_period(struct perf_event *event); - -/* - * Generalized hw caching related hw_event table, filled - * in on a per model basis. A value of 0 means - * 'not supported', -1 means 'hw_event makes no sense on - * this CPU', any other value means the raw hw_event - * ID. - */ - -#define C(x) PERF_COUNT_HW_CACHE_##x - -static u64 __read_mostly hw_cache_event_ids +u64 __read_mostly hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX]; +u64 __read_mostly hw_cache_extra_regs [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX]; @@ -277,8 +57,7 @@ static u64 __read_mostly hw_cache_event_ids * Can only be executed on the CPU where the event is active. * Returns the delta events processed. */ -static u64 -x86_perf_event_update(struct perf_event *event) +u64 x86_perf_event_update(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; int shift = 64 - x86_pmu.cntval_bits; @@ -286,7 +65,7 @@ x86_perf_event_update(struct perf_event *event) int idx = hwc->idx; s64 delta; - if (idx == X86_PMC_IDX_FIXED_BTS) + if (idx == INTEL_PMC_IDX_FIXED_BTS) return 0; /* @@ -298,7 +77,7 @@ x86_perf_event_update(struct perf_event *event) */ again: prev_raw_count = local64_read(&hwc->prev_count); - rdmsrl(hwc->event_base + idx, new_raw_count); + rdpmcl(hwc->event_base_rdpmc, new_raw_count); if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, new_raw_count) != prev_raw_count) @@ -321,6 +100,36 @@ again: return new_raw_count; } +/* + * Find and validate any extra registers to set up. + */ +static int x86_pmu_extra_regs(u64 config, struct perf_event *event) +{ + struct hw_perf_event_extra *reg; + struct extra_reg *er; + + reg = &event->hw.extra_reg; + + if (!x86_pmu.extra_regs) + return 0; + + for (er = x86_pmu.extra_regs; er->msr; er++) { + if (er->event != (config & er->config_mask)) + continue; + if (event->attr.config1 & ~er->valid_mask) + return -EINVAL; + /* Check if the extra msrs can be safely accessed*/ + if (!er->extra_msr_access) + return -ENXIO; + + reg->idx = er->idx; + reg->config = event->attr.config1; + reg->reg = er->msr; + break; + } + return 0; +} + static atomic_t active_events; static DEFINE_MUTEX(pmc_reserve_mutex); @@ -330,16 +139,13 @@ static bool reserve_pmc_hardware(void) { int i; - if (nmi_watchdog == NMI_LOCAL_APIC) - disable_lapic_nmi_watchdog(); - for (i = 0; i < x86_pmu.num_counters; i++) { - if (!reserve_perfctr_nmi(x86_pmu.perfctr + i)) + if (!reserve_perfctr_nmi(x86_pmu_event_addr(i))) goto perfctr_fail; } for (i = 0; i < x86_pmu.num_counters; i++) { - if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) + if (!reserve_evntsel_nmi(x86_pmu_config_addr(i))) goto eventsel_fail; } @@ -347,16 +153,13 @@ static bool reserve_pmc_hardware(void) eventsel_fail: for (i--; i >= 0; i--) - release_evntsel_nmi(x86_pmu.eventsel + i); + release_evntsel_nmi(x86_pmu_config_addr(i)); i = x86_pmu.num_counters; perfctr_fail: for (i--; i >= 0; i--) - release_perfctr_nmi(x86_pmu.perfctr + i); - - if (nmi_watchdog == NMI_LOCAL_APIC) - enable_lapic_nmi_watchdog(); + release_perfctr_nmi(x86_pmu_event_addr(i)); return false; } @@ -366,12 +169,9 @@ static void release_pmc_hardware(void) int i; for (i = 0; i < x86_pmu.num_counters; i++) { - release_perfctr_nmi(x86_pmu.perfctr + i); - release_evntsel_nmi(x86_pmu.eventsel + i); + release_perfctr_nmi(x86_pmu_event_addr(i)); + release_evntsel_nmi(x86_pmu_config_addr(i)); } - - if (nmi_watchdog == NMI_LOCAL_APIC) - enable_lapic_nmi_watchdog(); } #else @@ -381,8 +181,72 @@ static void release_pmc_hardware(void) {} #endif -static void reserve_ds_buffers(void); -static void release_ds_buffers(void); +static bool check_hw_exists(void) +{ + u64 val, val_fail, val_new= ~0; + int i, reg, reg_fail, ret = 0; + int bios_fail = 0; + + /* + * Check to see if the BIOS enabled any of the counters, if so + * complain and bail. + */ + for (i = 0; i < x86_pmu.num_counters; i++) { + reg = x86_pmu_config_addr(i); + ret = rdmsrl_safe(reg, &val); + if (ret) + goto msr_fail; + if (val & ARCH_PERFMON_EVENTSEL_ENABLE) { + bios_fail = 1; + val_fail = val; + reg_fail = reg; + } + } + + if (x86_pmu.num_counters_fixed) { + reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; + ret = rdmsrl_safe(reg, &val); + if (ret) + goto msr_fail; + for (i = 0; i < x86_pmu.num_counters_fixed; i++) { + if (val & (0x03 << i*4)) { + bios_fail = 1; + val_fail = val; + reg_fail = reg; + } + } + } + + /* + * Read the current value, change it and read it back to see if it + * matches, this is needed to detect certain hardware emulators + * (qemu/kvm) that don't trap on the MSR access and always return 0s. + */ + reg = x86_pmu_event_addr(0); + if (rdmsrl_safe(reg, &val)) + goto msr_fail; + val ^= 0xffffUL; + ret = wrmsrl_safe(reg, val); + ret |= rdmsrl_safe(reg, &val_new); + if (ret || val != val_new) + goto msr_fail; + + /* + * We still allow the PMU driver to operate: + */ + if (bios_fail) { + printk(KERN_CONT "Broken BIOS detected, complain to your hardware vendor.\n"); + printk(KERN_ERR FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", reg_fail, val_fail); + } + + return true; + +msr_fail: + printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n"); + printk(KERN_ERR "Failed to access perfctr msr (MSR %x is %Lx)\n", reg, val_new); + + return false; +} static void hw_perf_event_destroy(struct perf_event *event) { @@ -399,8 +263,9 @@ static inline int x86_pmu_initialized(void) } static inline int -set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr) +set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event) { + struct perf_event_attr *attr = &event->attr; unsigned int cache_type, cache_op, cache_result; u64 config, val; @@ -427,36 +292,27 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr) return -EINVAL; hwc->config |= val; - - return 0; + attr->config1 = hw_cache_extra_regs[cache_type][cache_op][cache_result]; + return x86_pmu_extra_regs(val, event); } -static int x86_setup_perfctr(struct perf_event *event) +int x86_setup_perfctr(struct perf_event *event) { struct perf_event_attr *attr = &event->attr; struct hw_perf_event *hwc = &event->hw; u64 config; - if (!hwc->sample_period) { + if (!is_sampling_event(event)) { hwc->sample_period = x86_pmu.max_period; hwc->last_period = hwc->sample_period; local64_set(&hwc->period_left, hwc->sample_period); - } else { - /* - * If we have a PMU initialized but no APIC - * interrupts, we cannot sample hardware - * events (user-space has to fall back and - * sample via a hrtimer based software event): - */ - if (!x86_pmu.apic) - return -EOPNOTSUPP; } if (attr->type == PERF_TYPE_RAW) - return 0; + return x86_pmu_extra_regs(event->attr.config, event); if (attr->type == PERF_TYPE_HW_CACHE) - return set_ext_hw_attr(hwc, attr); + return set_ext_hw_attr(hwc, event); if (attr->config >= x86_pmu.max_events) return -EINVAL; @@ -475,8 +331,8 @@ static int x86_setup_perfctr(struct perf_event *event) /* * Branch tracing: */ - if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) && - (hwc->sample_period == 1)) { + if (attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS && + !attr->freq && hwc->sample_period == 1) { /* BTS is not supported by this architecture. */ if (!x86_pmu.bts_active) return -EOPNOTSUPP; @@ -491,13 +347,43 @@ static int x86_setup_perfctr(struct perf_event *event) return 0; } -static int x86_pmu_hw_config(struct perf_event *event) +/* + * check that branch_sample_type is compatible with + * settings needed for precise_ip > 1 which implies + * using the LBR to capture ALL taken branches at the + * priv levels of the measurement + */ +static inline int precise_br_compat(struct perf_event *event) +{ + u64 m = event->attr.branch_sample_type; + u64 b = 0; + + /* must capture all branches */ + if (!(m & PERF_SAMPLE_BRANCH_ANY)) + return 0; + + m &= PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_USER; + + if (!event->attr.exclude_user) + b |= PERF_SAMPLE_BRANCH_USER; + + if (!event->attr.exclude_kernel) + b |= PERF_SAMPLE_BRANCH_KERNEL; + + /* + * ignore PERF_SAMPLE_BRANCH_HV, not supported on x86 + */ + + return m == b; +} + +int x86_pmu_hw_config(struct perf_event *event) { if (event->attr.precise_ip) { int precise = 0; /* Support for constant skid */ - if (x86_pmu.pebs_active) { + if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) { precise++; /* Support for IP fixup */ @@ -507,6 +393,37 @@ static int x86_pmu_hw_config(struct perf_event *event) if (event->attr.precise_ip > precise) return -EOPNOTSUPP; + /* + * check that PEBS LBR correction does not conflict with + * whatever the user is asking with attr->branch_sample_type + */ + if (event->attr.precise_ip > 1 && + x86_pmu.intel_cap.pebs_format < 2) { + u64 *br_type = &event->attr.branch_sample_type; + + if (has_branch_stack(event)) { + if (!precise_br_compat(event)) + return -EOPNOTSUPP; + + /* branch_sample_type is compatible */ + + } else { + /* + * user did not specify branch_sample_type + * + * For PEBS fixups, we capture all + * the branches at the priv level of the + * event. + */ + *br_type = PERF_SAMPLE_BRANCH_ANY; + + if (!event->attr.exclude_user) + *br_type |= PERF_SAMPLE_BRANCH_USER; + + if (!event->attr.exclude_kernel) + *br_type |= PERF_SAMPLE_BRANCH_KERNEL; + } + } } /* @@ -561,10 +478,14 @@ static int __x86_pmu_event_init(struct perf_event *event) event->hw.last_cpu = -1; event->hw.last_tag = ~0ULL; + /* mark unused */ + event->hw.extra_reg.idx = EXTRA_REG_NONE; + event->hw.branch_reg.idx = EXTRA_REG_NONE; + return x86_pmu.hw_config(event); } -static void x86_pmu_disable_all(void) +void x86_pmu_disable_all(void) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); int idx; @@ -574,11 +495,11 @@ static void x86_pmu_disable_all(void) if (!test_bit(idx, cpuc->active_mask)) continue; - rdmsrl(x86_pmu.eventsel + idx, val); + rdmsrl(x86_pmu_config_addr(idx), val); if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE)) continue; val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; - wrmsrl(x86_pmu.eventsel + idx, val); + wrmsrl(x86_pmu_config_addr(idx), val); } } @@ -599,21 +520,18 @@ static void x86_pmu_disable(struct pmu *pmu) x86_pmu.disable_all(); } -static void x86_pmu_enable_all(int added) +void x86_pmu_enable_all(int added) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); int idx; for (idx = 0; idx < x86_pmu.num_counters; idx++) { - struct perf_event *event = cpuc->events[idx]; - u64 val; + struct hw_perf_event *hwc = &cpuc->events[idx]->hw; if (!test_bit(idx, cpuc->active_mask)) continue; - val = event->hw.config; - val |= ARCH_PERFMON_EVENTSEL_ENABLE; - wrmsrl(x86_pmu.eventsel + idx, val); + __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); } } @@ -624,18 +542,198 @@ static inline int is_x86_event(struct perf_event *event) return event->pmu == &pmu; } -static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) +/* + * Event scheduler state: + * + * Assign events iterating over all events and counters, beginning + * with events with least weights first. Keep the current iterator + * state in struct sched_state. + */ +struct sched_state { + int weight; + int event; /* event index */ + int counter; /* counter index */ + int unassigned; /* number of events to be assigned left */ + unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; +}; + +/* Total max is X86_PMC_IDX_MAX, but we are O(n!) limited */ +#define SCHED_STATES_MAX 2 + +struct perf_sched { + int max_weight; + int max_events; + struct perf_event **events; + struct sched_state state; + int saved_states; + struct sched_state saved[SCHED_STATES_MAX]; +}; + +/* + * Initialize interator that runs through all events and counters. + */ +static void perf_sched_init(struct perf_sched *sched, struct perf_event **events, + int num, int wmin, int wmax) +{ + int idx; + + memset(sched, 0, sizeof(*sched)); + sched->max_events = num; + sched->max_weight = wmax; + sched->events = events; + + for (idx = 0; idx < num; idx++) { + if (events[idx]->hw.constraint->weight == wmin) + break; + } + + sched->state.event = idx; /* start with min weight */ + sched->state.weight = wmin; + sched->state.unassigned = num; +} + +static void perf_sched_save_state(struct perf_sched *sched) +{ + if (WARN_ON_ONCE(sched->saved_states >= SCHED_STATES_MAX)) + return; + + sched->saved[sched->saved_states] = sched->state; + sched->saved_states++; +} + +static bool perf_sched_restore_state(struct perf_sched *sched) +{ + if (!sched->saved_states) + return false; + + sched->saved_states--; + sched->state = sched->saved[sched->saved_states]; + + /* continue with next counter: */ + clear_bit(sched->state.counter++, sched->state.used); + + return true; +} + +/* + * Select a counter for the current event to schedule. Return true on + * success. + */ +static bool __perf_sched_find_counter(struct perf_sched *sched) { - struct event_constraint *c, *constraints[X86_PMC_IDX_MAX]; + struct event_constraint *c; + int idx; + + if (!sched->state.unassigned) + return false; + + if (sched->state.event >= sched->max_events) + return false; + + c = sched->events[sched->state.event]->hw.constraint; + /* Prefer fixed purpose counters */ + if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) { + idx = INTEL_PMC_IDX_FIXED; + for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) { + if (!__test_and_set_bit(idx, sched->state.used)) + goto done; + } + } + /* Grab the first unused counter starting with idx */ + idx = sched->state.counter; + for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) { + if (!__test_and_set_bit(idx, sched->state.used)) + goto done; + } + + return false; + +done: + sched->state.counter = idx; + + if (c->overlap) + perf_sched_save_state(sched); + + return true; +} + +static bool perf_sched_find_counter(struct perf_sched *sched) +{ + while (!__perf_sched_find_counter(sched)) { + if (!perf_sched_restore_state(sched)) + return false; + } + + return true; +} + +/* + * Go through all unassigned events and find the next one to schedule. + * Take events with the least weight first. Return true on success. + */ +static bool perf_sched_next_event(struct perf_sched *sched) +{ + struct event_constraint *c; + + if (!sched->state.unassigned || !--sched->state.unassigned) + return false; + + do { + /* next event */ + sched->state.event++; + if (sched->state.event >= sched->max_events) { + /* next weight */ + sched->state.event = 0; + sched->state.weight++; + if (sched->state.weight > sched->max_weight) + return false; + } + c = sched->events[sched->state.event]->hw.constraint; + } while (c->weight != sched->state.weight); + + sched->state.counter = 0; /* start with first counter */ + + return true; +} + +/* + * Assign a counter for each event. + */ +int perf_assign_events(struct perf_event **events, int n, + int wmin, int wmax, int *assign) +{ + struct perf_sched sched; + + perf_sched_init(&sched, events, n, wmin, wmax); + + do { + if (!perf_sched_find_counter(&sched)) + break; /* failed */ + if (assign) + assign[sched.state.event] = sched.state.counter; + } while (perf_sched_next_event(&sched)); + + return sched.state.unassigned; +} +EXPORT_SYMBOL_GPL(perf_assign_events); + +int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) +{ + struct event_constraint *c; unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; - int i, j, w, wmax, num = 0; + struct perf_event *e; + int i, wmin, wmax, num = 0; struct hw_perf_event *hwc; bitmap_zero(used_mask, X86_PMC_IDX_MAX); - for (i = 0; i < n; i++) { + for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) { + hwc = &cpuc->event_list[i]->hw; c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]); - constraints[i] = c; + hwc->constraint = c; + + wmin = min(wmin, c->weight); + wmax = max(wmax, c->weight); } /* @@ -643,7 +741,7 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) */ for (i = 0; i < n; i++) { hwc = &cpuc->event_list[i]->hw; - c = constraints[i]; + c = hwc->constraint; /* never assigned */ if (hwc->idx == -1) @@ -661,70 +759,41 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) if (assign) assign[i] = hwc->idx; } - if (i == n) - goto done; - - /* - * begin slow path - */ - - bitmap_zero(used_mask, X86_PMC_IDX_MAX); - /* - * weight = number of possible counters - * - * 1 = most constrained, only works on one counter - * wmax = least constrained, works on any counter - * - * assign events to counters starting with most - * constrained events. - */ - wmax = x86_pmu.num_counters; + /* slow path */ + if (i != n) + num = perf_assign_events(cpuc->event_list, n, wmin, + wmax, assign); /* - * when fixed event counters are present, - * wmax is incremented by 1 to account - * for one more choice + * Mark the event as committed, so we do not put_constraint() + * in case new events are added and fail scheduling. */ - if (x86_pmu.num_counters_fixed) - wmax++; - - for (w = 1, num = n; num && w <= wmax; w++) { - /* for each event */ - for (i = 0; num && i < n; i++) { - c = constraints[i]; - hwc = &cpuc->event_list[i]->hw; - - if (c->weight != w) - continue; - - for_each_set_bit(j, c->idxmsk, X86_PMC_IDX_MAX) { - if (!test_bit(j, used_mask)) - break; - } - - if (j == X86_PMC_IDX_MAX) - break; - - __set_bit(j, used_mask); - - if (assign) - assign[i] = j; - num--; + if (!num && assign) { + for (i = 0; i < n; i++) { + e = cpuc->event_list[i]; + e->hw.flags |= PERF_X86_EVENT_COMMITTED; } } -done: /* * scheduling failed or is just a simulation, * free resources if necessary */ if (!assign || num) { for (i = 0; i < n; i++) { + e = cpuc->event_list[i]; + /* + * do not put_constraint() on comitted events, + * because they are good to go + */ + if ((e->hw.flags & PERF_X86_EVENT_COMMITTED)) + continue; + if (x86_pmu.put_event_constraints) - x86_pmu.put_event_constraints(cpuc, cpuc->event_list[i]); + x86_pmu.put_event_constraints(cpuc, e); } } - return num ? -ENOSPC : 0; + return num ? -EINVAL : 0; } /* @@ -743,7 +812,7 @@ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, if (is_x86_event(leader)) { if (n >= max_count) - return -ENOSPC; + return -EINVAL; cpuc->event_list[n] = leader; n++; } @@ -756,7 +825,7 @@ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, continue; if (n >= max_count) - return -ENOSPC; + return -EINVAL; cpuc->event_list[n] = event; n++; @@ -773,20 +842,17 @@ static inline void x86_assign_hw_event(struct perf_event *event, hwc->last_cpu = smp_processor_id(); hwc->last_tag = ++cpuc->tags[i]; - if (hwc->idx == X86_PMC_IDX_FIXED_BTS) { + if (hwc->idx == INTEL_PMC_IDX_FIXED_BTS) { hwc->config_base = 0; hwc->event_base = 0; - } else if (hwc->idx >= X86_PMC_IDX_FIXED) { + } else if (hwc->idx >= INTEL_PMC_IDX_FIXED) { hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; - /* - * We set it so that event_base + idx in wrmsr/rdmsr maps to - * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2: - */ - hwc->event_base = - MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED; + hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - INTEL_PMC_IDX_FIXED); + hwc->event_base_rdpmc = (hwc->idx - INTEL_PMC_IDX_FIXED) | 1<<30; } else { - hwc->config_base = x86_pmu.eventsel; - hwc->event_base = x86_pmu.perfctr; + hwc->config_base = x86_pmu_config_addr(hwc->idx); + hwc->event_base = x86_pmu_event_addr(hwc->idx); + hwc->event_base_rdpmc = x86_pmu_rdpmc_index(hwc->idx); } } @@ -800,7 +866,6 @@ static inline int match_prev_assignment(struct hw_perf_event *hwc, } static void x86_pmu_start(struct perf_event *event, int flags); -static void x86_pmu_stop(struct perf_event *event, int flags); static void x86_pmu_enable(struct pmu *pmu) { @@ -822,7 +887,6 @@ static void x86_pmu_enable(struct pmu *pmu) * hw_perf_group_sched_in() or x86_pmu_enable() * * step1: save events moving to new counters - * step2: reprogram moved events into new counters */ for (i = 0; i < n_running; i++) { event = cpuc->event_list[i]; @@ -848,6 +912,9 @@ static void x86_pmu_enable(struct pmu *pmu) x86_pmu_stop(event, PERF_EF_UPDATE); } + /* + * step2: reprogram moved events into new counters + */ for (i = 0; i < cpuc->n_events; i++) { event = cpuc->event_list[i]; hwc = &event->hw; @@ -872,34 +939,20 @@ static void x86_pmu_enable(struct pmu *pmu) x86_pmu.enable_all(added); } -static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, - u64 enable_mask) -{ - wrmsrl(hwc->config_base + hwc->idx, hwc->config | enable_mask); -} - -static inline void x86_pmu_disable_event(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - - wrmsrl(hwc->config_base + hwc->idx, hwc->config); -} - static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); /* * Set the next IRQ period, based on the hwc->period_left value. * To be called with the event disabled in hw: */ -static int -x86_perf_event_set_period(struct perf_event *event) +int x86_perf_event_set_period(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; s64 left = local64_read(&hwc->period_left); s64 period = hwc->sample_period; int ret = 0, idx = hwc->idx; - if (idx == X86_PMC_IDX_FIXED_BTS) + if (idx == INTEL_PMC_IDX_FIXED_BTS) return 0; /* @@ -935,7 +988,7 @@ x86_perf_event_set_period(struct perf_event *event) */ local64_set(&hwc->prev_count, (u64)-left); - wrmsrl(hwc->event_base + idx, (u64)(-left) & x86_pmu.cntval_mask); + wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask); /* * Due to erratum on certan cpu we need @@ -943,7 +996,7 @@ x86_perf_event_set_period(struct perf_event *event) * is updated properly */ if (x86_pmu.perfctr_second_write) { - wrmsrl(hwc->event_base + idx, + wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask); } @@ -952,10 +1005,9 @@ x86_perf_event_set_period(struct perf_event *event) return ret; } -static void x86_pmu_enable_event(struct perf_event *event) +void x86_pmu_enable_event(struct perf_event *event) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - if (cpuc->enabled) + if (__this_cpu_read(cpu_hw_events.enabled)) __x86_pmu_enable_event(&event->hw, ARCH_PERFMON_EVENTSEL_ENABLE); } @@ -987,8 +1039,8 @@ static int x86_pmu_add(struct perf_event *event, int flags) /* * If group events scheduling transaction was started, - * skip the schedulability test here, it will be peformed - * at commit time (->commit_txn) as a whole + * skip the schedulability test here, it will be performed + * at commit time (->commit_txn) as a whole. */ if (cpuc->group_flag & PERF_EVENT_TXN) goto done_collect; @@ -1003,6 +1055,10 @@ static int x86_pmu_add(struct perf_event *event, int flags) memcpy(cpuc->assign, assign, n*sizeof(int)); done_collect: + /* + * Commit the collect_events() state. See x86_pmu_del() and + * x86_pmu_*_txn(). + */ cpuc->n_events = n; cpuc->n_added += n - n0; cpuc->n_txn += n - n0; @@ -1071,8 +1127,8 @@ void perf_event_print_debug(void) pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask); for (idx = 0; idx < x86_pmu.num_counters; idx++) { - rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); - rdmsrl(x86_pmu.perfctr + idx, pmc_count); + rdmsrl(x86_pmu_config_addr(idx), pmc_ctrl); + rdmsrl(x86_pmu_event_addr(idx), pmc_count); prev_left = per_cpu(pmc_prev_left[idx], cpu); @@ -1092,7 +1148,7 @@ void perf_event_print_debug(void) local_irq_restore(flags); } -static void x86_pmu_stop(struct perf_event *event, int flags) +void x86_pmu_stop(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct hw_perf_event *hwc = &event->hw; @@ -1120,32 +1176,50 @@ static void x86_pmu_del(struct perf_event *event, int flags) int i; /* + * event is descheduled + */ + event->hw.flags &= ~PERF_X86_EVENT_COMMITTED; + + /* * If we're called during a txn, we don't need to do anything. * The events never got scheduled and ->cancel_txn will truncate * the event_list. + * + * XXX assumes any ->del() called during a TXN will only be on + * an event added during that same TXN. */ if (cpuc->group_flag & PERF_EVENT_TXN) return; + /* + * Not a TXN, therefore cleanup properly. + */ x86_pmu_stop(event, PERF_EF_UPDATE); for (i = 0; i < cpuc->n_events; i++) { - if (event == cpuc->event_list[i]) { + if (event == cpuc->event_list[i]) + break; + } - if (x86_pmu.put_event_constraints) - x86_pmu.put_event_constraints(cpuc, event); + if (WARN_ON_ONCE(i == cpuc->n_events)) /* called ->del() without ->add() ? */ + return; - while (++i < cpuc->n_events) - cpuc->event_list[i-1] = cpuc->event_list[i]; + /* If we have a newly added event; make sure to decrease n_added. */ + if (i >= cpuc->n_events - cpuc->n_added) + --cpuc->n_added; + + if (x86_pmu.put_event_constraints) + x86_pmu.put_event_constraints(cpuc, event); + + /* Delete the array entry. */ + while (++i < cpuc->n_events) + cpuc->event_list[i-1] = cpuc->event_list[i]; + --cpuc->n_events; - --cpuc->n_events; - break; - } - } perf_event_update_userpage(event); } -static int x86_pmu_handle_irq(struct pt_regs *regs) +int x86_pmu_handle_irq(struct pt_regs *regs) { struct perf_sample_data data; struct cpu_hw_events *cpuc; @@ -1153,10 +1227,18 @@ static int x86_pmu_handle_irq(struct pt_regs *regs) int idx, handled = 0; u64 val; - perf_sample_data_init(&data, 0); - cpuc = &__get_cpu_var(cpu_hw_events); + /* + * Some chipsets need to unmask the LVTPC in a particular spot + * inside the nmi handler. As a result, the unmasking was pushed + * into all the nmi handlers. + * + * This generic handler doesn't seem to have any issues where the + * unmasking occurs so it was left at the top. + */ + apic_write(APIC_LVTPC, APIC_DM_NMI); + for (idx = 0; idx < x86_pmu.num_counters; idx++) { if (!test_bit(idx, cpuc->active_mask)) { /* @@ -1179,12 +1261,12 @@ static int x86_pmu_handle_irq(struct pt_regs *regs) * event overflow */ handled++; - data.period = event->hw.last_period; + perf_sample_data_init(&data, 0, event->hw.last_period); if (!x86_perf_event_set_period(event)) continue; - if (perf_event_overflow(event, 1, &data, regs)) + if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); } @@ -1205,121 +1287,54 @@ void perf_events_lapic_init(void) apic_write(APIC_LVTPC, APIC_DM_NMI); } -struct pmu_nmi_state { - unsigned int marked; - int handled; -}; - -static DEFINE_PER_CPU(struct pmu_nmi_state, pmu_nmi); - -static int __kprobes -perf_event_nmi_handler(struct notifier_block *self, - unsigned long cmd, void *__args) +static int +perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs) { - struct die_args *args = __args; - unsigned int this_nmi; - int handled; + u64 start_clock; + u64 finish_clock; + int ret; if (!atomic_read(&active_events)) - return NOTIFY_DONE; - - switch (cmd) { - case DIE_NMI: - case DIE_NMI_IPI: - break; - case DIE_NMIUNKNOWN: - this_nmi = percpu_read(irq_stat.__nmi_count); - if (this_nmi != __get_cpu_var(pmu_nmi).marked) - /* let the kernel handle the unknown nmi */ - return NOTIFY_DONE; - /* - * This one is a PMU back-to-back nmi. Two events - * trigger 'simultaneously' raising two back-to-back - * NMIs. If the first NMI handles both, the latter - * will be empty and daze the CPU. So, we drop it to - * avoid false-positive 'unknown nmi' messages. - */ - return NOTIFY_STOP; - default: - return NOTIFY_DONE; - } - - apic_write(APIC_LVTPC, APIC_DM_NMI); - - handled = x86_pmu.handle_irq(args->regs); - if (!handled) - return NOTIFY_DONE; - - this_nmi = percpu_read(irq_stat.__nmi_count); - if ((handled > 1) || - /* the next nmi could be a back-to-back nmi */ - ((__get_cpu_var(pmu_nmi).marked == this_nmi) && - (__get_cpu_var(pmu_nmi).handled > 1))) { - /* - * We could have two subsequent back-to-back nmis: The - * first handles more than one counter, the 2nd - * handles only one counter and the 3rd handles no - * counter. - * - * This is the 2nd nmi because the previous was - * handling more than one counter. We will mark the - * next (3rd) and then drop it if unhandled. - */ - __get_cpu_var(pmu_nmi).marked = this_nmi + 1; - __get_cpu_var(pmu_nmi).handled = handled; - } - - return NOTIFY_STOP; -} + return NMI_DONE; -static __read_mostly struct notifier_block perf_event_nmi_notifier = { - .notifier_call = perf_event_nmi_handler, - .next = NULL, - .priority = 1 -}; - -static struct event_constraint unconstrained; -static struct event_constraint emptyconstraint; + start_clock = sched_clock(); + ret = x86_pmu.handle_irq(regs); + finish_clock = sched_clock(); -static struct event_constraint * -x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) -{ - struct event_constraint *c; + perf_sample_event_took(finish_clock - start_clock); - if (x86_pmu.event_constraints) { - for_each_event_constraint(c, x86_pmu.event_constraints) { - if ((event->hw.config & c->cmask) == c->code) - return c; - } - } - - return &unconstrained; + return ret; } +NOKPROBE_SYMBOL(perf_event_nmi_handler); -#include "perf_event_amd.c" -#include "perf_event_p6.c" -#include "perf_event_p4.c" -#include "perf_event_intel_lbr.c" -#include "perf_event_intel_ds.c" -#include "perf_event_intel.c" +struct event_constraint emptyconstraint; +struct event_constraint unconstrained; -static int __cpuinit +static int x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) { unsigned int cpu = (long)hcpu; + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); int ret = NOTIFY_OK; switch (action & ~CPU_TASKS_FROZEN) { case CPU_UP_PREPARE: + cpuc->kfree_on_online = NULL; if (x86_pmu.cpu_prepare) ret = x86_pmu.cpu_prepare(cpu); break; case CPU_STARTING: + if (x86_pmu.attr_rdpmc) + set_in_cr4(X86_CR4_PCE); if (x86_pmu.cpu_starting) x86_pmu.cpu_starting(cpu); break; + case CPU_ONLINE: + kfree(cpuc->kfree_on_online); + break; + case CPU_DYING: if (x86_pmu.cpu_dying) x86_pmu.cpu_dying(cpu); @@ -1346,11 +1361,163 @@ static void __init pmu_check_apic(void) x86_pmu.apic = 0; pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n"); pr_info("no hardware sampling interrupt available.\n"); + + /* + * If we have a PMU initialized but no APIC + * interrupts, we cannot sample hardware + * events (user-space has to fall back and + * sample via a hrtimer based software event): + */ + pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT; + } -void __init init_hw_perf_events(void) +static struct attribute_group x86_pmu_format_group = { + .name = "format", + .attrs = NULL, +}; + +/* + * Remove all undefined events (x86_pmu.event_map(id) == 0) + * out of events_attr attributes. + */ +static void __init filter_events(struct attribute **attrs) { - struct event_constraint *c; + struct device_attribute *d; + struct perf_pmu_events_attr *pmu_attr; + int i, j; + + for (i = 0; attrs[i]; i++) { + d = (struct device_attribute *)attrs[i]; + pmu_attr = container_of(d, struct perf_pmu_events_attr, attr); + /* str trumps id */ + if (pmu_attr->event_str) + continue; + if (x86_pmu.event_map(i)) + continue; + + for (j = i; attrs[j]; j++) + attrs[j] = attrs[j + 1]; + + /* Check the shifted attr. */ + i--; + } +} + +/* Merge two pointer arrays */ +static __init struct attribute **merge_attr(struct attribute **a, struct attribute **b) +{ + struct attribute **new; + int j, i; + + for (j = 0; a[j]; j++) + ; + for (i = 0; b[i]; i++) + j++; + j++; + + new = kmalloc(sizeof(struct attribute *) * j, GFP_KERNEL); + if (!new) + return NULL; + + j = 0; + for (i = 0; a[i]; i++) + new[j++] = a[i]; + for (i = 0; b[i]; i++) + new[j++] = b[i]; + new[j] = NULL; + + return new; +} + +ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, + char *page) +{ + struct perf_pmu_events_attr *pmu_attr = \ + container_of(attr, struct perf_pmu_events_attr, attr); + u64 config = x86_pmu.event_map(pmu_attr->id); + + /* string trumps id */ + if (pmu_attr->event_str) + return sprintf(page, "%s", pmu_attr->event_str); + + return x86_pmu.events_sysfs_show(page, config); +} + +EVENT_ATTR(cpu-cycles, CPU_CYCLES ); +EVENT_ATTR(instructions, INSTRUCTIONS ); +EVENT_ATTR(cache-references, CACHE_REFERENCES ); +EVENT_ATTR(cache-misses, CACHE_MISSES ); +EVENT_ATTR(branch-instructions, BRANCH_INSTRUCTIONS ); +EVENT_ATTR(branch-misses, BRANCH_MISSES ); +EVENT_ATTR(bus-cycles, BUS_CYCLES ); +EVENT_ATTR(stalled-cycles-frontend, STALLED_CYCLES_FRONTEND ); +EVENT_ATTR(stalled-cycles-backend, STALLED_CYCLES_BACKEND ); +EVENT_ATTR(ref-cycles, REF_CPU_CYCLES ); + +static struct attribute *empty_attrs; + +static struct attribute *events_attr[] = { + EVENT_PTR(CPU_CYCLES), + EVENT_PTR(INSTRUCTIONS), + EVENT_PTR(CACHE_REFERENCES), + EVENT_PTR(CACHE_MISSES), + EVENT_PTR(BRANCH_INSTRUCTIONS), + EVENT_PTR(BRANCH_MISSES), + EVENT_PTR(BUS_CYCLES), + EVENT_PTR(STALLED_CYCLES_FRONTEND), + EVENT_PTR(STALLED_CYCLES_BACKEND), + EVENT_PTR(REF_CPU_CYCLES), + NULL, +}; + +static struct attribute_group x86_pmu_events_group = { + .name = "events", + .attrs = events_attr, +}; + +ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event) +{ + u64 umask = (config & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; + u64 cmask = (config & ARCH_PERFMON_EVENTSEL_CMASK) >> 24; + bool edge = (config & ARCH_PERFMON_EVENTSEL_EDGE); + bool pc = (config & ARCH_PERFMON_EVENTSEL_PIN_CONTROL); + bool any = (config & ARCH_PERFMON_EVENTSEL_ANY); + bool inv = (config & ARCH_PERFMON_EVENTSEL_INV); + ssize_t ret; + + /* + * We have whole page size to spend and just little data + * to write, so we can safely use sprintf. + */ + ret = sprintf(page, "event=0x%02llx", event); + + if (umask) + ret += sprintf(page + ret, ",umask=0x%02llx", umask); + + if (edge) + ret += sprintf(page + ret, ",edge"); + + if (pc) + ret += sprintf(page + ret, ",pc"); + + if (any) + ret += sprintf(page + ret, ",any"); + + if (inv) + ret += sprintf(page + ret, ",inv"); + + if (cmask) + ret += sprintf(page + ret, ",cmask=0x%02llx", cmask); + + ret += sprintf(page + ret, "\n"); + + return ret; +} + +static int __init init_hw_perf_events(void) +{ + struct x86_pmu_quirk *quirk; int err; pr_info("Performance Events: "); @@ -1363,51 +1530,52 @@ void __init init_hw_perf_events(void) err = amd_pmu_init(); break; default: - return; + err = -ENOTSUPP; } if (err != 0) { pr_cont("no PMU driver, software events only.\n"); - return; + return 0; } pmu_check_apic(); + /* sanity check that the hardware exists or is emulated */ + if (!check_hw_exists()) + return 0; + pr_cont("%s PMU driver.\n", x86_pmu.name); - if (x86_pmu.quirks) - x86_pmu.quirks(); + x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */ - if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) { - WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", - x86_pmu.num_counters, X86_PMC_MAX_GENERIC); - x86_pmu.num_counters = X86_PMC_MAX_GENERIC; - } - x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1; + for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next) + quirk->func(); - if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) { - WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", - x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED); - x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED; - } - - x86_pmu.intel_ctrl |= - ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED; + if (!x86_pmu.intel_ctrl) + x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1; perf_events_lapic_init(); - register_die_notifier(&perf_event_nmi_notifier); + register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI"); unconstrained = (struct event_constraint) __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1, - 0, x86_pmu.num_counters); + 0, x86_pmu.num_counters, 0, 0); - if (x86_pmu.event_constraints) { - for_each_event_constraint(c, x86_pmu.event_constraints) { - if (c->cmask != X86_RAW_EVENT_MASK) - continue; + x86_pmu_format_group.attrs = x86_pmu.format_attrs; - c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1; - c->weight += x86_pmu.num_counters; - } + if (x86_pmu.event_attrs) + x86_pmu_events_group.attrs = x86_pmu.event_attrs; + + if (!x86_pmu.events_sysfs_show) + x86_pmu_events_group.attrs = &empty_attrs; + else + filter_events(x86_pmu_events_group.attrs); + + if (x86_pmu.cpu_events) { + struct attribute **tmp; + + tmp = merge_attr(x86_pmu_events_group.attrs, x86_pmu.cpu_events); + if (!WARN_ON(!tmp)) + x86_pmu_events_group.attrs = tmp; } pr_info("... version: %d\n", x86_pmu.version); @@ -1418,9 +1586,12 @@ void __init init_hw_perf_events(void) pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed); pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl); - perf_pmu_register(&pmu); + perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW); perf_cpu_notifier(x86_pmu_notifier); + + return 0; } +early_initcall(init_hw_perf_events); static inline void x86_pmu_read(struct perf_event *event) { @@ -1434,11 +1605,9 @@ static inline void x86_pmu_read(struct perf_event *event) */ static void x86_pmu_start_txn(struct pmu *pmu) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - perf_pmu_disable(pmu); - cpuc->group_flag |= PERF_EVENT_TXN; - cpuc->n_txn = 0; + __this_cpu_or(cpu_hw_events.group_flag, PERF_EVENT_TXN); + __this_cpu_write(cpu_hw_events.n_txn, 0); } /* @@ -1448,14 +1617,13 @@ static void x86_pmu_start_txn(struct pmu *pmu) */ static void x86_pmu_cancel_txn(struct pmu *pmu) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - - cpuc->group_flag &= ~PERF_EVENT_TXN; + __this_cpu_and(cpu_hw_events.group_flag, ~PERF_EVENT_TXN); /* - * Truncate the collected events. + * Truncate collected array by the number of events added in this + * transaction. See x86_pmu_add() and x86_pmu_*_txn(). */ - cpuc->n_added -= cpuc->n_txn; - cpuc->n_events -= cpuc->n_txn; + __this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn)); + __this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn)); perf_pmu_enable(pmu); } @@ -1463,6 +1631,8 @@ static void x86_pmu_cancel_txn(struct pmu *pmu) * Commit group events scheduling transaction * Perform the group schedulability test as a whole * Return 0 if success + * + * Does not cancel the transaction on failure; expects the caller to do this. */ static int x86_pmu_commit_txn(struct pmu *pmu) { @@ -1489,6 +1659,41 @@ static int x86_pmu_commit_txn(struct pmu *pmu) perf_pmu_enable(pmu); return 0; } +/* + * a fake_cpuc is used to validate event groups. Due to + * the extra reg logic, we need to also allocate a fake + * per_core and per_cpu structure. Otherwise, group events + * using extra reg may conflict without the kernel being + * able to catch this when the last event gets added to + * the group. + */ +static void free_fake_cpuc(struct cpu_hw_events *cpuc) +{ + kfree(cpuc->shared_regs); + kfree(cpuc); +} + +static struct cpu_hw_events *allocate_fake_cpuc(void) +{ + struct cpu_hw_events *cpuc; + int cpu = raw_smp_processor_id(); + + cpuc = kzalloc(sizeof(*cpuc), GFP_KERNEL); + if (!cpuc) + return ERR_PTR(-ENOMEM); + + /* only needed, if we have extra_regs */ + if (x86_pmu.extra_regs) { + cpuc->shared_regs = allocate_shared_regs(cpu); + if (!cpuc->shared_regs) + goto error; + } + cpuc->is_fake = 1; + return cpuc; +error: + free_fake_cpuc(cpuc); + return ERR_PTR(-ENOMEM); +} /* * validate that we can schedule this event @@ -1499,19 +1704,19 @@ static int validate_event(struct perf_event *event) struct event_constraint *c; int ret = 0; - fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO); - if (!fake_cpuc) - return -ENOMEM; + fake_cpuc = allocate_fake_cpuc(); + if (IS_ERR(fake_cpuc)) + return PTR_ERR(fake_cpuc); c = x86_pmu.get_event_constraints(fake_cpuc, event); if (!c || !c->weight) - ret = -ENOSPC; + ret = -EINVAL; if (x86_pmu.put_event_constraints) x86_pmu.put_event_constraints(fake_cpuc, event); - kfree(fake_cpuc); + free_fake_cpuc(fake_cpuc); return ret; } @@ -1531,40 +1736,36 @@ static int validate_group(struct perf_event *event) { struct perf_event *leader = event->group_leader; struct cpu_hw_events *fake_cpuc; - int ret, n; - - ret = -ENOMEM; - fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO); - if (!fake_cpuc) - goto out; + int ret = -EINVAL, n; + fake_cpuc = allocate_fake_cpuc(); + if (IS_ERR(fake_cpuc)) + return PTR_ERR(fake_cpuc); /* * the event is not yet connected with its * siblings therefore we must first collect * existing siblings, then add the new event * before we can simulate the scheduling */ - ret = -ENOSPC; n = collect_events(fake_cpuc, leader, true); if (n < 0) - goto out_free; + goto out; fake_cpuc->n_events = n; n = collect_events(fake_cpuc, event, false); if (n < 0) - goto out_free; + goto out; fake_cpuc->n_events = n; ret = x86_pmu.schedule_events(fake_cpuc, n, NULL); -out_free: - kfree(fake_cpuc); out: + free_fake_cpuc(fake_cpuc); return ret; } -int x86_pmu_event_init(struct perf_event *event) +static int x86_pmu_event_init(struct perf_event *event) { struct pmu *tmp; int err; @@ -1604,38 +1805,142 @@ int x86_pmu_event_init(struct perf_event *event) return err; } -static struct pmu pmu = { - .pmu_enable = x86_pmu_enable, - .pmu_disable = x86_pmu_disable, +static int x86_pmu_event_idx(struct perf_event *event) +{ + int idx = event->hw.idx; + + if (!x86_pmu.attr_rdpmc) + return 0; - .event_init = x86_pmu_event_init, + if (x86_pmu.num_counters_fixed && idx >= INTEL_PMC_IDX_FIXED) { + idx -= INTEL_PMC_IDX_FIXED; + idx |= 1 << 30; + } - .add = x86_pmu_add, - .del = x86_pmu_del, - .start = x86_pmu_start, - .stop = x86_pmu_stop, - .read = x86_pmu_read, + return idx + 1; +} - .start_txn = x86_pmu_start_txn, - .cancel_txn = x86_pmu_cancel_txn, - .commit_txn = x86_pmu_commit_txn, +static ssize_t get_attr_rdpmc(struct device *cdev, + struct device_attribute *attr, + char *buf) +{ + return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc); +} + +static void change_rdpmc(void *info) +{ + bool enable = !!(unsigned long)info; + + if (enable) + set_in_cr4(X86_CR4_PCE); + else + clear_in_cr4(X86_CR4_PCE); +} + +static ssize_t set_attr_rdpmc(struct device *cdev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + unsigned long val; + ssize_t ret; + + ret = kstrtoul(buf, 0, &val); + if (ret) + return ret; + + if (x86_pmu.attr_rdpmc_broken) + return -ENOTSUPP; + + if (!!val != !!x86_pmu.attr_rdpmc) { + x86_pmu.attr_rdpmc = !!val; + on_each_cpu(change_rdpmc, (void *)val, 1); + } + + return count; +} + +static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc); + +static struct attribute *x86_pmu_attrs[] = { + &dev_attr_rdpmc.attr, + NULL, }; -/* - * callchain support - */ +static struct attribute_group x86_pmu_attr_group = { + .attrs = x86_pmu_attrs, +}; + +static const struct attribute_group *x86_pmu_attr_groups[] = { + &x86_pmu_attr_group, + &x86_pmu_format_group, + &x86_pmu_events_group, + NULL, +}; -static void -backtrace_warning_symbol(void *data, char *msg, unsigned long symbol) +static void x86_pmu_flush_branch_stack(void) { - /* Ignore warnings */ + if (x86_pmu.flush_branch_stack) + x86_pmu.flush_branch_stack(); } -static void backtrace_warning(void *data, char *msg) +void perf_check_microcode(void) +{ + if (x86_pmu.check_microcode) + x86_pmu.check_microcode(); +} +EXPORT_SYMBOL_GPL(perf_check_microcode); + +static struct pmu pmu = { + .pmu_enable = x86_pmu_enable, + .pmu_disable = x86_pmu_disable, + + .attr_groups = x86_pmu_attr_groups, + + .event_init = x86_pmu_event_init, + + .add = x86_pmu_add, + .del = x86_pmu_del, + .start = x86_pmu_start, + .stop = x86_pmu_stop, + .read = x86_pmu_read, + + .start_txn = x86_pmu_start_txn, + .cancel_txn = x86_pmu_cancel_txn, + .commit_txn = x86_pmu_commit_txn, + + .event_idx = x86_pmu_event_idx, + .flush_branch_stack = x86_pmu_flush_branch_stack, +}; + +void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) { - /* Ignore warnings */ + struct cyc2ns_data *data; + + userpg->cap_user_time = 0; + userpg->cap_user_time_zero = 0; + userpg->cap_user_rdpmc = x86_pmu.attr_rdpmc; + userpg->pmc_width = x86_pmu.cntval_bits; + + if (!sched_clock_stable()) + return; + + data = cyc2ns_read_begin(); + + userpg->cap_user_time = 1; + userpg->time_mult = data->cyc2ns_mul; + userpg->time_shift = data->cyc2ns_shift; + userpg->time_offset = data->cyc2ns_offset - now; + + userpg->cap_user_time_zero = 1; + userpg->time_zero = data->cyc2ns_offset; + + cyc2ns_read_end(data); } +/* + * callchain support + */ + static int backtrace_stack(void *data, char *name) { return 0; @@ -1649,8 +1954,6 @@ static void backtrace_address(void *data, unsigned long addr, int reliable) } static const struct stacktrace_ops backtrace_ops = { - .warning = backtrace_warning, - .warning_symbol = backtrace_warning_symbol, .stack = backtrace_stack, .address = backtrace_address, .walk_stack = print_context_stack_bp, @@ -1666,35 +1969,71 @@ perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) perf_callchain_store(entry, regs->ip); - dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry); + dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry); +} + +static inline int +valid_user_frame(const void __user *fp, unsigned long size) +{ + return (__range_not_ok(fp, size, TASK_SIZE) == 0); +} + +static unsigned long get_segment_base(unsigned int segment) +{ + struct desc_struct *desc; + int idx = segment >> 3; + + if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) { + if (idx > LDT_ENTRIES) + return 0; + + if (idx > current->active_mm->context.size) + return 0; + + desc = current->active_mm->context.ldt; + } else { + if (idx > GDT_ENTRIES) + return 0; + + desc = __this_cpu_ptr(&gdt_page.gdt[0]); + } + + return get_desc_base(desc + idx); } #ifdef CONFIG_COMPAT + +#include <asm/compat.h> + static inline int perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) { /* 32-bit process in 64-bit kernel. */ + unsigned long ss_base, cs_base; struct stack_frame_ia32 frame; const void __user *fp; if (!test_thread_flag(TIF_IA32)) return 0; - fp = compat_ptr(regs->bp); + cs_base = get_segment_base(regs->cs); + ss_base = get_segment_base(regs->ss); + + fp = compat_ptr(ss_base + regs->bp); while (entry->nr < PERF_MAX_STACK_DEPTH) { unsigned long bytes; frame.next_frame = 0; frame.return_address = 0; bytes = copy_from_user_nmi(&frame, fp, sizeof(frame)); - if (bytes != sizeof(frame)) + if (bytes != 0) break; - if (fp < compat_ptr(regs->sp)) + if (!valid_user_frame(fp, sizeof(frame))) break; - perf_callchain_store(entry, frame.return_address); - fp = compat_ptr(frame.next_frame); + perf_callchain_store(entry, cs_base + frame.return_address); + fp = compat_ptr(ss_base + frame.next_frame); } return 1; } @@ -1717,10 +2056,19 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) return; } + /* + * We don't know what to do with VM86 stacks.. ignore them for now. + */ + if (regs->flags & (X86_VM_MASK | PERF_EFLAGS_VM)) + return; + fp = (void __user *)regs->bp; perf_callchain_store(entry, regs->ip); + if (!current->mm) + return; + if (perf_callchain_user32(regs, entry)) return; @@ -1730,10 +2078,10 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) frame.return_address = 0; bytes = copy_from_user_nmi(&frame, fp, sizeof(frame)); - if (bytes != sizeof(frame)) + if (bytes != 0) break; - if ((unsigned long)fp < regs->sp) + if (!valid_user_frame(fp, sizeof(frame))) break; perf_callchain_store(entry, frame.return_address); @@ -1741,16 +2089,50 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) } } -unsigned long perf_instruction_pointer(struct pt_regs *regs) +/* + * Deal with code segment offsets for the various execution modes: + * + * VM86 - the good olde 16 bit days, where the linear address is + * 20 bits and we use regs->ip + 0x10 * regs->cs. + * + * IA32 - Where we need to look at GDT/LDT segment descriptor tables + * to figure out what the 32bit base address is. + * + * X32 - has TIF_X32 set, but is running in x86_64 + * + * X86_64 - CS,DS,SS,ES are all zero based. + */ +static unsigned long code_segment_base(struct pt_regs *regs) { - unsigned long ip; + /* + * If we are in VM86 mode, add the segment offset to convert to a + * linear address. + */ + if (regs->flags & X86_VM_MASK) + return 0x10 * regs->cs; + /* + * For IA32 we look at the GDT/LDT segment base to convert the + * effective IP to a linear address. + */ +#ifdef CONFIG_X86_32 + if (user_mode(regs) && regs->cs != __USER_CS) + return get_segment_base(regs->cs); +#else + if (test_thread_flag(TIF_IA32)) { + if (user_mode(regs) && regs->cs != __USER32_CS) + return get_segment_base(regs->cs); + } +#endif + return 0; +} + +unsigned long perf_instruction_pointer(struct pt_regs *regs) +{ if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) - ip = perf_guest_cbs->get_guest_ip(); - else - ip = instruction_pointer(regs); + return perf_guest_cbs->get_guest_ip(); - return ip; + return regs->ip + code_segment_base(regs); } unsigned long perf_misc_flags(struct pt_regs *regs) @@ -1774,3 +2156,15 @@ unsigned long perf_misc_flags(struct pt_regs *regs) return misc; } + +void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) +{ + cap->version = x86_pmu.version; + cap->num_counters_gp = x86_pmu.num_counters; + cap->num_counters_fixed = x86_pmu.num_counters_fixed; + cap->bit_width_gp = x86_pmu.cntval_bits; + cap->bit_width_fixed = x86_pmu.cntval_bits; + cap->events_mask = (unsigned int)x86_pmu.events_maskl; + cap->events_mask_len = x86_pmu.events_mask_len; +} +EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability); diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h new file mode 100644 index 00000000000..8ade93111e0 --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event.h @@ -0,0 +1,739 @@ +/* + * Performance events x86 architecture header + * + * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> + * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2009 Jaswinder Singh Rajput + * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com> + * Copyright (C) 2009 Google, Inc., Stephane Eranian + * + * For licencing details see kernel-base/COPYING + */ + +#include <linux/perf_event.h> + +#if 0 +#undef wrmsrl +#define wrmsrl(msr, val) \ +do { \ + unsigned int _msr = (msr); \ + u64 _val = (val); \ + trace_printk("wrmsrl(%x, %Lx)\n", (unsigned int)(_msr), \ + (unsigned long long)(_val)); \ + native_write_msr((_msr), (u32)(_val), (u32)(_val >> 32)); \ +} while (0) +#endif + +/* + * | NHM/WSM | SNB | + * register ------------------------------- + * | HT | no HT | HT | no HT | + *----------------------------------------- + * offcore | core | core | cpu | core | + * lbr_sel | core | core | cpu | core | + * ld_lat | cpu | core | cpu | core | + *----------------------------------------- + * + * Given that there is a small number of shared regs, + * we can pre-allocate their slot in the per-cpu + * per-core reg tables. + */ +enum extra_reg_type { + EXTRA_REG_NONE = -1, /* not used */ + + EXTRA_REG_RSP_0 = 0, /* offcore_response_0 */ + EXTRA_REG_RSP_1 = 1, /* offcore_response_1 */ + EXTRA_REG_LBR = 2, /* lbr_select */ + EXTRA_REG_LDLAT = 3, /* ld_lat_threshold */ + + EXTRA_REG_MAX /* number of entries needed */ +}; + +struct event_constraint { + union { + unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + u64 idxmsk64; + }; + u64 code; + u64 cmask; + int weight; + int overlap; + int flags; +}; +/* + * struct hw_perf_event.flags flags + */ +#define PERF_X86_EVENT_PEBS_LDLAT 0x1 /* ld+ldlat data address sampling */ +#define PERF_X86_EVENT_PEBS_ST 0x2 /* st data address sampling */ +#define PERF_X86_EVENT_PEBS_ST_HSW 0x4 /* haswell style st data sampling */ +#define PERF_X86_EVENT_COMMITTED 0x8 /* event passed commit_txn */ + +struct amd_nb { + int nb_id; /* NorthBridge id */ + int refcnt; /* reference count */ + struct perf_event *owners[X86_PMC_IDX_MAX]; + struct event_constraint event_constraints[X86_PMC_IDX_MAX]; +}; + +/* The maximal number of PEBS events: */ +#define MAX_PEBS_EVENTS 8 + +/* + * A debug store configuration. + * + * We only support architectures that use 64bit fields. + */ +struct debug_store { + u64 bts_buffer_base; + u64 bts_index; + u64 bts_absolute_maximum; + u64 bts_interrupt_threshold; + u64 pebs_buffer_base; + u64 pebs_index; + u64 pebs_absolute_maximum; + u64 pebs_interrupt_threshold; + u64 pebs_event_reset[MAX_PEBS_EVENTS]; +}; + +/* + * Per register state. + */ +struct er_account { + raw_spinlock_t lock; /* per-core: protect structure */ + u64 config; /* extra MSR config */ + u64 reg; /* extra MSR number */ + atomic_t ref; /* reference count */ +}; + +/* + * Per core/cpu state + * + * Used to coordinate shared registers between HT threads or + * among events on a single PMU. + */ +struct intel_shared_regs { + struct er_account regs[EXTRA_REG_MAX]; + int refcnt; /* per-core: #HT threads */ + unsigned core_id; /* per-core: core id */ +}; + +#define MAX_LBR_ENTRIES 16 + +struct cpu_hw_events { + /* + * Generic x86 PMC bits + */ + struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */ + unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + unsigned long running[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + int enabled; + + int n_events; /* the # of events in the below arrays */ + int n_added; /* the # last events in the below arrays; + they've never been enabled yet */ + int n_txn; /* the # last events in the below arrays; + added in the current transaction */ + int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */ + u64 tags[X86_PMC_IDX_MAX]; + struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */ + + unsigned int group_flag; + int is_fake; + + /* + * Intel DebugStore bits + */ + struct debug_store *ds; + u64 pebs_enabled; + + /* + * Intel LBR bits + */ + int lbr_users; + void *lbr_context; + struct perf_branch_stack lbr_stack; + struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; + struct er_account *lbr_sel; + u64 br_sel; + + /* + * Intel host/guest exclude bits + */ + u64 intel_ctrl_guest_mask; + u64 intel_ctrl_host_mask; + struct perf_guest_switch_msr guest_switch_msrs[X86_PMC_IDX_MAX]; + + /* + * Intel checkpoint mask + */ + u64 intel_cp_status; + + /* + * manage shared (per-core, per-cpu) registers + * used on Intel NHM/WSM/SNB + */ + struct intel_shared_regs *shared_regs; + + /* + * AMD specific bits + */ + struct amd_nb *amd_nb; + /* Inverted mask of bits to clear in the perf_ctr ctrl registers */ + u64 perf_ctr_virt_mask; + + void *kfree_on_online; +}; + +#define __EVENT_CONSTRAINT(c, n, m, w, o, f) {\ + { .idxmsk64 = (n) }, \ + .code = (c), \ + .cmask = (m), \ + .weight = (w), \ + .overlap = (o), \ + .flags = f, \ +} + +#define EVENT_CONSTRAINT(c, n, m) \ + __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 0, 0) + +/* + * The overlap flag marks event constraints with overlapping counter + * masks. This is the case if the counter mask of such an event is not + * a subset of any other counter mask of a constraint with an equal or + * higher weight, e.g.: + * + * c_overlaps = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0); + * c_another1 = EVENT_CONSTRAINT(0, 0x07, 0); + * c_another2 = EVENT_CONSTRAINT(0, 0x38, 0); + * + * The event scheduler may not select the correct counter in the first + * cycle because it needs to know which subsequent events will be + * scheduled. It may fail to schedule the events then. So we set the + * overlap flag for such constraints to give the scheduler a hint which + * events to select for counter rescheduling. + * + * Care must be taken as the rescheduling algorithm is O(n!) which + * will increase scheduling cycles for an over-commited system + * dramatically. The number of such EVENT_CONSTRAINT_OVERLAP() macros + * and its counter masks must be kept at a minimum. + */ +#define EVENT_CONSTRAINT_OVERLAP(c, n, m) \ + __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 1, 0) + +/* + * Constraint on the Event code. + */ +#define INTEL_EVENT_CONSTRAINT(c, n) \ + EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT) + +/* + * Constraint on the Event code + UMask + fixed-mask + * + * filter mask to validate fixed counter events. + * the following filters disqualify for fixed counters: + * - inv + * - edge + * - cnt-mask + * - in_tx + * - in_tx_checkpointed + * The other filters are supported by fixed counters. + * The any-thread option is supported starting with v3. + */ +#define FIXED_EVENT_FLAGS (X86_RAW_EVENT_MASK|HSW_IN_TX|HSW_IN_TX_CHECKPOINTED) +#define FIXED_EVENT_CONSTRAINT(c, n) \ + EVENT_CONSTRAINT(c, (1ULL << (32+n)), FIXED_EVENT_FLAGS) + +/* + * Constraint on the Event code + UMask + */ +#define INTEL_UEVENT_CONSTRAINT(c, n) \ + EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK) + +#define INTEL_PLD_CONSTRAINT(c, n) \ + __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \ + HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LDLAT) + +#define INTEL_PST_CONSTRAINT(c, n) \ + __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \ + HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST) + +/* DataLA version of store sampling without extra enable bit. */ +#define INTEL_PST_HSW_CONSTRAINT(c, n) \ + __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \ + HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW) + +/* + * We define the end marker as having a weight of -1 + * to enable blacklisting of events using a counter bitmask + * of zero and thus a weight of zero. + * The end marker has a weight that cannot possibly be + * obtained from counting the bits in the bitmask. + */ +#define EVENT_CONSTRAINT_END { .weight = -1 } + +/* + * Check for end marker with weight == -1 + */ +#define for_each_event_constraint(e, c) \ + for ((e) = (c); (e)->weight != -1; (e)++) + +/* + * Extra registers for specific events. + * + * Some events need large masks and require external MSRs. + * Those extra MSRs end up being shared for all events on + * a PMU and sometimes between PMU of sibling HT threads. + * In either case, the kernel needs to handle conflicting + * accesses to those extra, shared, regs. The data structure + * to manage those registers is stored in cpu_hw_event. + */ +struct extra_reg { + unsigned int event; + unsigned int msr; + u64 config_mask; + u64 valid_mask; + int idx; /* per_xxx->regs[] reg index */ + bool extra_msr_access; +}; + +#define EVENT_EXTRA_REG(e, ms, m, vm, i) { \ + .event = (e), \ + .msr = (ms), \ + .config_mask = (m), \ + .valid_mask = (vm), \ + .idx = EXTRA_REG_##i, \ + .extra_msr_access = true, \ + } + +#define INTEL_EVENT_EXTRA_REG(event, msr, vm, idx) \ + EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm, idx) + +#define INTEL_UEVENT_EXTRA_REG(event, msr, vm, idx) \ + EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT | \ + ARCH_PERFMON_EVENTSEL_UMASK, vm, idx) + +#define INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(c) \ + INTEL_UEVENT_EXTRA_REG(c, \ + MSR_PEBS_LD_LAT_THRESHOLD, \ + 0xffff, \ + LDLAT) + +#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0, RSP_0) + +union perf_capabilities { + struct { + u64 lbr_format:6; + u64 pebs_trap:1; + u64 pebs_arch_reg:1; + u64 pebs_format:4; + u64 smm_freeze:1; + /* + * PMU supports separate counter range for writing + * values > 32bit. + */ + u64 full_width_write:1; + }; + u64 capabilities; +}; + +struct x86_pmu_quirk { + struct x86_pmu_quirk *next; + void (*func)(void); +}; + +union x86_pmu_config { + struct { + u64 event:8, + umask:8, + usr:1, + os:1, + edge:1, + pc:1, + interrupt:1, + __reserved1:1, + en:1, + inv:1, + cmask:8, + event2:4, + __reserved2:4, + go:1, + ho:1; + } bits; + u64 value; +}; + +#define X86_CONFIG(args...) ((union x86_pmu_config){.bits = {args}}).value + +/* + * struct x86_pmu - generic x86 pmu + */ +struct x86_pmu { + /* + * Generic x86 PMC bits + */ + const char *name; + int version; + int (*handle_irq)(struct pt_regs *); + void (*disable_all)(void); + void (*enable_all)(int added); + void (*enable)(struct perf_event *); + void (*disable)(struct perf_event *); + int (*hw_config)(struct perf_event *event); + int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign); + unsigned eventsel; + unsigned perfctr; + int (*addr_offset)(int index, bool eventsel); + int (*rdpmc_index)(int index); + u64 (*event_map)(int); + int max_events; + int num_counters; + int num_counters_fixed; + int cntval_bits; + u64 cntval_mask; + union { + unsigned long events_maskl; + unsigned long events_mask[BITS_TO_LONGS(ARCH_PERFMON_EVENTS_COUNT)]; + }; + int events_mask_len; + int apic; + u64 max_period; + struct event_constraint * + (*get_event_constraints)(struct cpu_hw_events *cpuc, + struct perf_event *event); + + void (*put_event_constraints)(struct cpu_hw_events *cpuc, + struct perf_event *event); + struct event_constraint *event_constraints; + struct x86_pmu_quirk *quirks; + int perfctr_second_write; + bool late_ack; + + /* + * sysfs attrs + */ + int attr_rdpmc_broken; + int attr_rdpmc; + struct attribute **format_attrs; + struct attribute **event_attrs; + + ssize_t (*events_sysfs_show)(char *page, u64 config); + struct attribute **cpu_events; + + /* + * CPU Hotplug hooks + */ + int (*cpu_prepare)(int cpu); + void (*cpu_starting)(int cpu); + void (*cpu_dying)(int cpu); + void (*cpu_dead)(int cpu); + + void (*check_microcode)(void); + void (*flush_branch_stack)(void); + + /* + * Intel Arch Perfmon v2+ + */ + u64 intel_ctrl; + union perf_capabilities intel_cap; + + /* + * Intel DebugStore bits + */ + unsigned int bts :1, + bts_active :1, + pebs :1, + pebs_active :1, + pebs_broken :1; + int pebs_record_size; + void (*drain_pebs)(struct pt_regs *regs); + struct event_constraint *pebs_constraints; + void (*pebs_aliases)(struct perf_event *event); + int max_pebs_events; + + /* + * Intel LBR + */ + unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */ + int lbr_nr; /* hardware stack size */ + u64 lbr_sel_mask; /* LBR_SELECT valid bits */ + const int *lbr_sel_map; /* lbr_select mappings */ + bool lbr_double_abort; /* duplicated lbr aborts */ + + /* + * Extra registers for events + */ + struct extra_reg *extra_regs; + unsigned int er_flags; + + /* + * Intel host/guest support (KVM) + */ + struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr); +}; + +#define x86_add_quirk(func_) \ +do { \ + static struct x86_pmu_quirk __quirk __initdata = { \ + .func = func_, \ + }; \ + __quirk.next = x86_pmu.quirks; \ + x86_pmu.quirks = &__quirk; \ +} while (0) + +#define ERF_NO_HT_SHARING 1 +#define ERF_HAS_RSP_1 2 + +#define EVENT_VAR(_id) event_attr_##_id +#define EVENT_PTR(_id) &event_attr_##_id.attr.attr + +#define EVENT_ATTR(_name, _id) \ +static struct perf_pmu_events_attr EVENT_VAR(_id) = { \ + .attr = __ATTR(_name, 0444, events_sysfs_show, NULL), \ + .id = PERF_COUNT_HW_##_id, \ + .event_str = NULL, \ +}; + +#define EVENT_ATTR_STR(_name, v, str) \ +static struct perf_pmu_events_attr event_attr_##v = { \ + .attr = __ATTR(_name, 0444, events_sysfs_show, NULL), \ + .id = 0, \ + .event_str = str, \ +}; + +extern struct x86_pmu x86_pmu __read_mostly; + +DECLARE_PER_CPU(struct cpu_hw_events, cpu_hw_events); + +int x86_perf_event_set_period(struct perf_event *event); + +/* + * Generalized hw caching related hw_event table, filled + * in on a per model basis. A value of 0 means + * 'not supported', -1 means 'hw_event makes no sense on + * this CPU', any other value means the raw hw_event + * ID. + */ + +#define C(x) PERF_COUNT_HW_CACHE_##x + +extern u64 __read_mostly hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX]; +extern u64 __read_mostly hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX]; + +u64 x86_perf_event_update(struct perf_event *event); + +static inline unsigned int x86_pmu_config_addr(int index) +{ + return x86_pmu.eventsel + (x86_pmu.addr_offset ? + x86_pmu.addr_offset(index, true) : index); +} + +static inline unsigned int x86_pmu_event_addr(int index) +{ + return x86_pmu.perfctr + (x86_pmu.addr_offset ? + x86_pmu.addr_offset(index, false) : index); +} + +static inline int x86_pmu_rdpmc_index(int index) +{ + return x86_pmu.rdpmc_index ? x86_pmu.rdpmc_index(index) : index; +} + +int x86_setup_perfctr(struct perf_event *event); + +int x86_pmu_hw_config(struct perf_event *event); + +void x86_pmu_disable_all(void); + +static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, + u64 enable_mask) +{ + u64 disable_mask = __this_cpu_read(cpu_hw_events.perf_ctr_virt_mask); + + if (hwc->extra_reg.reg) + wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config); + wrmsrl(hwc->config_base, (hwc->config | enable_mask) & ~disable_mask); +} + +void x86_pmu_enable_all(int added); + +int perf_assign_events(struct perf_event **events, int n, + int wmin, int wmax, int *assign); +int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign); + +void x86_pmu_stop(struct perf_event *event, int flags); + +static inline void x86_pmu_disable_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + wrmsrl(hwc->config_base, hwc->config); +} + +void x86_pmu_enable_event(struct perf_event *event); + +int x86_pmu_handle_irq(struct pt_regs *regs); + +extern struct event_constraint emptyconstraint; + +extern struct event_constraint unconstrained; + +static inline bool kernel_ip(unsigned long ip) +{ +#ifdef CONFIG_X86_32 + return ip > PAGE_OFFSET; +#else + return (long)ip < 0; +#endif +} + +/* + * Not all PMUs provide the right context information to place the reported IP + * into full context. Specifically segment registers are typically not + * supplied. + * + * Assuming the address is a linear address (it is for IBS), we fake the CS and + * vm86 mode using the known zero-based code segment and 'fix up' the registers + * to reflect this. + * + * Intel PEBS/LBR appear to typically provide the effective address, nothing + * much we can do about that but pray and treat it like a linear address. + */ +static inline void set_linear_ip(struct pt_regs *regs, unsigned long ip) +{ + regs->cs = kernel_ip(ip) ? __KERNEL_CS : __USER_CS; + if (regs->flags & X86_VM_MASK) + regs->flags ^= (PERF_EFLAGS_VM | X86_VM_MASK); + regs->ip = ip; +} + +ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event); +ssize_t intel_event_sysfs_show(char *page, u64 config); + +#ifdef CONFIG_CPU_SUP_AMD + +int amd_pmu_init(void); + +#else /* CONFIG_CPU_SUP_AMD */ + +static inline int amd_pmu_init(void) +{ + return 0; +} + +#endif /* CONFIG_CPU_SUP_AMD */ + +#ifdef CONFIG_CPU_SUP_INTEL + +int intel_pmu_save_and_restart(struct perf_event *event); + +struct event_constraint * +x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event); + +struct intel_shared_regs *allocate_shared_regs(int cpu); + +int intel_pmu_init(void); + +void init_debug_store_on_cpu(int cpu); + +void fini_debug_store_on_cpu(int cpu); + +void release_ds_buffers(void); + +void reserve_ds_buffers(void); + +extern struct event_constraint bts_constraint; + +void intel_pmu_enable_bts(u64 config); + +void intel_pmu_disable_bts(void); + +int intel_pmu_drain_bts_buffer(void); + +extern struct event_constraint intel_core2_pebs_event_constraints[]; + +extern struct event_constraint intel_atom_pebs_event_constraints[]; + +extern struct event_constraint intel_slm_pebs_event_constraints[]; + +extern struct event_constraint intel_nehalem_pebs_event_constraints[]; + +extern struct event_constraint intel_westmere_pebs_event_constraints[]; + +extern struct event_constraint intel_snb_pebs_event_constraints[]; + +extern struct event_constraint intel_ivb_pebs_event_constraints[]; + +extern struct event_constraint intel_hsw_pebs_event_constraints[]; + +struct event_constraint *intel_pebs_constraints(struct perf_event *event); + +void intel_pmu_pebs_enable(struct perf_event *event); + +void intel_pmu_pebs_disable(struct perf_event *event); + +void intel_pmu_pebs_enable_all(void); + +void intel_pmu_pebs_disable_all(void); + +void intel_ds_init(void); + +void intel_pmu_lbr_reset(void); + +void intel_pmu_lbr_enable(struct perf_event *event); + +void intel_pmu_lbr_disable(struct perf_event *event); + +void intel_pmu_lbr_enable_all(void); + +void intel_pmu_lbr_disable_all(void); + +void intel_pmu_lbr_read(void); + +void intel_pmu_lbr_init_core(void); + +void intel_pmu_lbr_init_nhm(void); + +void intel_pmu_lbr_init_atom(void); + +void intel_pmu_lbr_init_snb(void); + +int intel_pmu_setup_lbr_filter(struct perf_event *event); + +int p4_pmu_init(void); + +int p6_pmu_init(void); + +int knc_pmu_init(void); + +ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, + char *page); + +#else /* CONFIG_CPU_SUP_INTEL */ + +static inline void reserve_ds_buffers(void) +{ +} + +static inline void release_ds_buffers(void) +{ +} + +static inline int intel_pmu_init(void) +{ + return 0; +} + +static inline struct intel_shared_regs *allocate_shared_regs(int cpu) +{ + return NULL; +} + +#endif /* CONFIG_CPU_SUP_INTEL */ diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index e421b8cd694..beeb7cc0704 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -1,6 +1,11 @@ -#ifdef CONFIG_CPU_SUP_AMD +#include <linux/perf_event.h> +#include <linux/export.h> +#include <linux/types.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <asm/apicdef.h> -static DEFINE_RAW_SPINLOCK(amd_nb_lock); +#include "perf_event.h" static __initconst const u64 amd_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] @@ -10,7 +15,7 @@ static __initconst const u64 amd_hw_cache_event_ids [ C(L1D) ] = { [ C(OP_READ) ] = { [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ - [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */ + [ C(RESULT_MISS) ] = 0x0141, /* Data Cache Misses */ }, [ C(OP_WRITE) ] = { [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */ @@ -91,6 +96,20 @@ static __initconst const u64 amd_hw_cache_event_ids [ C(RESULT_MISS) ] = -1, }, }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0xb8e9, /* CPU Request to Memory, l+r */ + [ C(RESULT_MISS) ] = 0x98e9, /* CPU Request to Memory, r */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, }; /* @@ -98,12 +117,14 @@ static __initconst const u64 amd_hw_cache_event_ids */ static const u64 amd_perfmon_event_map[] = { - [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, - [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080, - [PERF_COUNT_HW_CACHE_MISSES] = 0x0081, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2, - [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3, + [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080, + [PERF_COUNT_HW_CACHE_MISSES] = 0x0081, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3, + [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00d0, /* "Decoder empty" event */ + [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x00d1, /* "Dispatch stalls" event */ }; static u64 amd_pmu_event_map(int hw_event) @@ -111,17 +132,61 @@ static u64 amd_pmu_event_map(int hw_event) return amd_perfmon_event_map[hw_event]; } -static int amd_pmu_hw_config(struct perf_event *event) +/* + * Previously calculated offsets + */ +static unsigned int event_offsets[X86_PMC_IDX_MAX] __read_mostly; +static unsigned int count_offsets[X86_PMC_IDX_MAX] __read_mostly; + +/* + * Legacy CPUs: + * 4 counters starting at 0xc0010000 each offset by 1 + * + * CPUs with core performance counter extensions: + * 6 counters starting at 0xc0010200 each offset by 2 + */ +static inline int amd_pmu_addr_offset(int index, bool eventsel) { - int ret = x86_pmu_hw_config(event); + int offset; - if (ret) - return ret; + if (!index) + return index; - if (event->attr.type != PERF_TYPE_RAW) - return 0; + if (eventsel) + offset = event_offsets[index]; + else + offset = count_offsets[index]; + + if (offset) + return offset; - event->hw.config |= event->attr.config & AMD64_RAW_EVENT_MASK; + if (!cpu_has_perfctr_core) + offset = index; + else + offset = index << 1; + + if (eventsel) + event_offsets[index] = offset; + else + count_offsets[index] = offset; + + return offset; +} + +static int amd_core_hw_config(struct perf_event *event) +{ + if (event->attr.exclude_host && event->attr.exclude_guest) + /* + * When HO == GO == 1 the hardware treats that as GO == HO == 0 + * and will count in both modes. We don't want to count in that + * case so we emulate no-counting by setting US = OS = 0. + */ + event->hw.config &= ~(ARCH_PERFMON_EVENTSEL_USR | + ARCH_PERFMON_EVENTSEL_OS); + else if (event->attr.exclude_host) + event->hw.config |= AMD64_EVENTSEL_GUESTONLY; + else if (event->attr.exclude_guest) + event->hw.config |= AMD64_EVENTSEL_HOSTONLY; return 0; } @@ -129,6 +194,11 @@ static int amd_pmu_hw_config(struct perf_event *event) /* * AMD64 events are detected based on their event codes. */ +static inline unsigned int amd_get_event_code(struct hw_perf_event *hwc) +{ + return ((hwc->config >> 24) & 0x0f00) | (hwc->config & 0x00ff); +} + static inline int amd_is_nb_event(struct hw_perf_event *hwc) { return (hwc->config & 0xe0) == 0xe0; @@ -141,20 +211,34 @@ static inline int amd_has_nb(struct cpu_hw_events *cpuc) return nb && nb->nb_id != -1; } -static void amd_put_event_constraints(struct cpu_hw_events *cpuc, - struct perf_event *event) +static int amd_pmu_hw_config(struct perf_event *event) +{ + int ret; + + /* pass precise event sampling to ibs: */ + if (event->attr.precise_ip && get_ibs_caps()) + return -ENOENT; + + if (has_branch_stack(event)) + return -EOPNOTSUPP; + + ret = x86_pmu_hw_config(event); + if (ret) + return ret; + + if (event->attr.type == PERF_TYPE_RAW) + event->hw.config |= event->attr.config & AMD64_RAW_EVENT_MASK; + + return amd_core_hw_config(event); +} + +static void __amd_put_nb_event_constraints(struct cpu_hw_events *cpuc, + struct perf_event *event) { - struct hw_perf_event *hwc = &event->hw; struct amd_nb *nb = cpuc->amd_nb; int i; /* - * only care about NB events - */ - if (!(amd_has_nb(cpuc) && amd_is_nb_event(hwc))) - return; - - /* * need to scan whole list because event may not have * been assigned during scheduling * @@ -163,10 +247,8 @@ static void amd_put_event_constraints(struct cpu_hw_events *cpuc, * when we come here */ for (i = 0; i < x86_pmu.num_counters; i++) { - if (nb->owners[i] == event) { - cmpxchg(nb->owners+i, event, NULL); + if (cmpxchg(nb->owners + i, event, NULL) == event) break; - } } } @@ -202,24 +284,24 @@ static void amd_put_event_constraints(struct cpu_hw_events *cpuc, * * Given that resources are allocated (cmpxchg), they must be * eventually freed for others to use. This is accomplished by - * calling amd_put_event_constraints(). + * calling __amd_put_nb_event_constraints() * * Non NB events are not impacted by this restriction. */ static struct event_constraint * -amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) +__amd_get_nb_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, + struct event_constraint *c) { struct hw_perf_event *hwc = &event->hw; struct amd_nb *nb = cpuc->amd_nb; - struct perf_event *old = NULL; - int max = x86_pmu.num_counters; - int i, j, k = -1; + struct perf_event *old; + int idx, new = -1; - /* - * if not NB event or no NB, then no constraints - */ - if (!(amd_has_nb(cpuc) && amd_is_nb_event(hwc))) - return &unconstrained; + if (!c) + c = &unconstrained; + + if (cpuc->is_fake) + return c; /* * detect if already present, if so reuse @@ -231,61 +313,45 @@ amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) * because of successive calls to x86_schedule_events() from * hw_perf_group_sched_in() without hw_perf_enable() */ - for (i = 0; i < max; i++) { - /* - * keep track of first free slot - */ - if (k == -1 && !nb->owners[i]) - k = i; + for_each_set_bit(idx, c->idxmsk, x86_pmu.num_counters) { + if (new == -1 || hwc->idx == idx) + /* assign free slot, prefer hwc->idx */ + old = cmpxchg(nb->owners + idx, NULL, event); + else if (nb->owners[idx] == event) + /* event already present */ + old = event; + else + continue; + + if (old && old != event) + continue; + + /* reassign to this slot */ + if (new != -1) + cmpxchg(nb->owners + new, event, NULL); + new = idx; /* already present, reuse */ - if (nb->owners[i] == event) - goto done; - } - /* - * not present, so grab a new slot - * starting either at: - */ - if (hwc->idx != -1) { - /* previous assignment */ - i = hwc->idx; - } else if (k != -1) { - /* start from free slot found */ - i = k; - } else { - /* - * event not found, no slot found in - * first pass, try again from the - * beginning - */ - i = 0; - } - j = i; - do { - old = cmpxchg(nb->owners+i, NULL, event); - if (!old) + if (old == event) break; - if (++i == max) - i = 0; - } while (i != j); -done: - if (!old) - return &nb->event_constraints[i]; - - return &emptyconstraint; + } + + if (new == -1) + return &emptyconstraint; + + return &nb->event_constraints[new]; } -static struct amd_nb *amd_alloc_nb(int cpu, int nb_id) +static struct amd_nb *amd_alloc_nb(int cpu) { struct amd_nb *nb; int i; - nb = kmalloc_node(sizeof(struct amd_nb), GFP_KERNEL | __GFP_ZERO, - cpu_to_node(cpu)); + nb = kzalloc_node(sizeof(struct amd_nb), GFP_KERNEL, cpu_to_node(cpu)); if (!nb) return NULL; - nb->nb_id = nb_id; + nb->nb_id = -1; /* * initialize all possible NB constraints @@ -306,7 +372,7 @@ static int amd_pmu_cpu_prepare(int cpu) if (boot_cpu_data.x86_max_cores < 2) return NOTIFY_OK; - cpuc->amd_nb = amd_alloc_nb(cpu, -1); + cpuc->amd_nb = amd_alloc_nb(cpu); if (!cpuc->amd_nb) return NOTIFY_BAD; @@ -319,21 +385,21 @@ static void amd_pmu_cpu_starting(int cpu) struct amd_nb *nb; int i, nb_id; + cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY; + if (boot_cpu_data.x86_max_cores < 2) return; nb_id = amd_get_nb_id(cpu); WARN_ON_ONCE(nb_id == BAD_APICID); - raw_spin_lock(&amd_nb_lock); - for_each_online_cpu(i) { nb = per_cpu(cpu_hw_events, i).amd_nb; if (WARN_ON_ONCE(!nb)) continue; if (nb->nb_id == nb_id) { - kfree(cpuc->amd_nb); + cpuc->kfree_on_online = cpuc->amd_nb; cpuc->amd_nb = nb; break; } @@ -341,8 +407,6 @@ static void amd_pmu_cpu_starting(int cpu) cpuc->amd_nb->nb_id = nb_id; cpuc->amd_nb->refcnt++; - - raw_spin_unlock(&amd_nb_lock); } static void amd_pmu_cpu_dead(int cpu) @@ -354,8 +418,6 @@ static void amd_pmu_cpu_dead(int cpu) cpuhw = &per_cpu(cpu_hw_events, cpu); - raw_spin_lock(&amd_nb_lock); - if (cpuhw->amd_nb) { struct amd_nb *nb = cpuhw->amd_nb; @@ -364,8 +426,194 @@ static void amd_pmu_cpu_dead(int cpu) cpuhw->amd_nb = NULL; } +} + +static struct event_constraint * +amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) +{ + /* + * if not NB event or no NB, then no constraints + */ + if (!(amd_has_nb(cpuc) && amd_is_nb_event(&event->hw))) + return &unconstrained; - raw_spin_unlock(&amd_nb_lock); + return __amd_get_nb_event_constraints(cpuc, event, NULL); +} + +static void amd_put_event_constraints(struct cpu_hw_events *cpuc, + struct perf_event *event) +{ + if (amd_has_nb(cpuc) && amd_is_nb_event(&event->hw)) + __amd_put_nb_event_constraints(cpuc, event); +} + +PMU_FORMAT_ATTR(event, "config:0-7,32-35"); +PMU_FORMAT_ATTR(umask, "config:8-15" ); +PMU_FORMAT_ATTR(edge, "config:18" ); +PMU_FORMAT_ATTR(inv, "config:23" ); +PMU_FORMAT_ATTR(cmask, "config:24-31" ); + +static struct attribute *amd_format_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_cmask.attr, + NULL, +}; + +/* AMD Family 15h */ + +#define AMD_EVENT_TYPE_MASK 0x000000F0ULL + +#define AMD_EVENT_FP 0x00000000ULL ... 0x00000010ULL +#define AMD_EVENT_LS 0x00000020ULL ... 0x00000030ULL +#define AMD_EVENT_DC 0x00000040ULL ... 0x00000050ULL +#define AMD_EVENT_CU 0x00000060ULL ... 0x00000070ULL +#define AMD_EVENT_IC_DE 0x00000080ULL ... 0x00000090ULL +#define AMD_EVENT_EX_LS 0x000000C0ULL +#define AMD_EVENT_DE 0x000000D0ULL +#define AMD_EVENT_NB 0x000000E0ULL ... 0x000000F0ULL + +/* + * AMD family 15h event code/PMC mappings: + * + * type = event_code & 0x0F0: + * + * 0x000 FP PERF_CTL[5:3] + * 0x010 FP PERF_CTL[5:3] + * 0x020 LS PERF_CTL[5:0] + * 0x030 LS PERF_CTL[5:0] + * 0x040 DC PERF_CTL[5:0] + * 0x050 DC PERF_CTL[5:0] + * 0x060 CU PERF_CTL[2:0] + * 0x070 CU PERF_CTL[2:0] + * 0x080 IC/DE PERF_CTL[2:0] + * 0x090 IC/DE PERF_CTL[2:0] + * 0x0A0 --- + * 0x0B0 --- + * 0x0C0 EX/LS PERF_CTL[5:0] + * 0x0D0 DE PERF_CTL[2:0] + * 0x0E0 NB NB_PERF_CTL[3:0] + * 0x0F0 NB NB_PERF_CTL[3:0] + * + * Exceptions: + * + * 0x000 FP PERF_CTL[3], PERF_CTL[5:3] (*) + * 0x003 FP PERF_CTL[3] + * 0x004 FP PERF_CTL[3], PERF_CTL[5:3] (*) + * 0x00B FP PERF_CTL[3] + * 0x00D FP PERF_CTL[3] + * 0x023 DE PERF_CTL[2:0] + * 0x02D LS PERF_CTL[3] + * 0x02E LS PERF_CTL[3,0] + * 0x031 LS PERF_CTL[2:0] (**) + * 0x043 CU PERF_CTL[2:0] + * 0x045 CU PERF_CTL[2:0] + * 0x046 CU PERF_CTL[2:0] + * 0x054 CU PERF_CTL[2:0] + * 0x055 CU PERF_CTL[2:0] + * 0x08F IC PERF_CTL[0] + * 0x187 DE PERF_CTL[0] + * 0x188 DE PERF_CTL[0] + * 0x0DB EX PERF_CTL[5:0] + * 0x0DC LS PERF_CTL[5:0] + * 0x0DD LS PERF_CTL[5:0] + * 0x0DE LS PERF_CTL[5:0] + * 0x0DF LS PERF_CTL[5:0] + * 0x1C0 EX PERF_CTL[5:3] + * 0x1D6 EX PERF_CTL[5:0] + * 0x1D8 EX PERF_CTL[5:0] + * + * (*) depending on the umask all FPU counters may be used + * (**) only one unitmask enabled at a time + */ + +static struct event_constraint amd_f15_PMC0 = EVENT_CONSTRAINT(0, 0x01, 0); +static struct event_constraint amd_f15_PMC20 = EVENT_CONSTRAINT(0, 0x07, 0); +static struct event_constraint amd_f15_PMC3 = EVENT_CONSTRAINT(0, 0x08, 0); +static struct event_constraint amd_f15_PMC30 = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0); +static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0); +static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0); + +static struct event_constraint * +amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + unsigned int event_code = amd_get_event_code(hwc); + + switch (event_code & AMD_EVENT_TYPE_MASK) { + case AMD_EVENT_FP: + switch (event_code) { + case 0x000: + if (!(hwc->config & 0x0000F000ULL)) + break; + if (!(hwc->config & 0x00000F00ULL)) + break; + return &amd_f15_PMC3; + case 0x004: + if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1) + break; + return &amd_f15_PMC3; + case 0x003: + case 0x00B: + case 0x00D: + return &amd_f15_PMC3; + } + return &amd_f15_PMC53; + case AMD_EVENT_LS: + case AMD_EVENT_DC: + case AMD_EVENT_EX_LS: + switch (event_code) { + case 0x023: + case 0x043: + case 0x045: + case 0x046: + case 0x054: + case 0x055: + return &amd_f15_PMC20; + case 0x02D: + return &amd_f15_PMC3; + case 0x02E: + return &amd_f15_PMC30; + case 0x031: + if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1) + return &amd_f15_PMC20; + return &emptyconstraint; + case 0x1C0: + return &amd_f15_PMC53; + default: + return &amd_f15_PMC50; + } + case AMD_EVENT_CU: + case AMD_EVENT_IC_DE: + case AMD_EVENT_DE: + switch (event_code) { + case 0x08F: + case 0x187: + case 0x188: + return &amd_f15_PMC0; + case 0x0DB ... 0x0DF: + case 0x1D6: + case 0x1D8: + return &amd_f15_PMC50; + default: + return &amd_f15_PMC20; + } + case AMD_EVENT_NB: + /* moved to perf_event_amd_uncore.c */ + return &emptyconstraint; + default: + return &emptyconstraint; + } +} + +static ssize_t amd_event_sysfs_show(char *page, u64 config) +{ + u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT) | + (config & AMD64_EVENTSEL_EVENT) >> 24; + + return x86_event_sysfs_show(page, config, event); } static __initconst const struct x86_pmu amd_pmu = { @@ -379,9 +627,10 @@ static __initconst const struct x86_pmu amd_pmu = { .schedule_events = x86_schedule_events, .eventsel = MSR_K7_EVNTSEL0, .perfctr = MSR_K7_PERFCTR0, + .addr_offset = amd_pmu_addr_offset, .event_map = amd_pmu_event_map, .max_events = ARRAY_SIZE(amd_perfmon_event_map), - .num_counters = 4, + .num_counters = AMD64_NUM_COUNTERS, .cntval_bits = 48, .cntval_mask = (1ULL << 48) - 1, .apic = 1, @@ -390,19 +639,57 @@ static __initconst const struct x86_pmu amd_pmu = { .get_event_constraints = amd_get_event_constraints, .put_event_constraints = amd_put_event_constraints, + .format_attrs = amd_format_attr, + .events_sysfs_show = amd_event_sysfs_show, + .cpu_prepare = amd_pmu_cpu_prepare, .cpu_starting = amd_pmu_cpu_starting, .cpu_dead = amd_pmu_cpu_dead, }; -static __init int amd_pmu_init(void) +static int __init amd_core_pmu_init(void) +{ + if (!cpu_has_perfctr_core) + return 0; + + switch (boot_cpu_data.x86) { + case 0x15: + pr_cont("Fam15h "); + x86_pmu.get_event_constraints = amd_get_event_constraints_f15h; + break; + + default: + pr_err("core perfctr but no constraints; unknown hardware!\n"); + return -ENODEV; + } + + /* + * If core performance counter extensions exists, we must use + * MSR_F15H_PERF_CTL/MSR_F15H_PERF_CTR msrs. See also + * amd_pmu_addr_offset(). + */ + x86_pmu.eventsel = MSR_F15H_PERF_CTL; + x86_pmu.perfctr = MSR_F15H_PERF_CTR; + x86_pmu.num_counters = AMD64_NUM_COUNTERS_CORE; + + pr_cont("core perfctr, "); + return 0; +} + +__init int amd_pmu_init(void) { + int ret; + /* Performance-monitoring supported from K7 and later: */ if (boot_cpu_data.x86 < 6) return -ENODEV; x86_pmu = amd_pmu; + ret = amd_core_pmu_init(); + if (ret) + return ret; + /* Events are common for all AMDs */ memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, sizeof(hw_cache_event_ids)); @@ -410,11 +697,32 @@ static __init int amd_pmu_init(void) return 0; } -#else /* CONFIG_CPU_SUP_AMD */ - -static int amd_pmu_init(void) +void amd_pmu_enable_virt(void) { - return 0; + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + + cpuc->perf_ctr_virt_mask = 0; + + /* Reload all events */ + x86_pmu_disable_all(); + x86_pmu_enable_all(0); } +EXPORT_SYMBOL_GPL(amd_pmu_enable_virt); + +void amd_pmu_disable_virt(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); -#endif + /* + * We only mask out the Host-only bit so that host-only counting works + * when SVM is disabled. If someone sets up a guest-only counter when + * SVM is disabled the Guest-only bits still gets set and the counter + * will not count anything. + */ + cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY; + + /* Reload all events */ + x86_pmu_disable_all(); + x86_pmu_enable_all(0); +} +EXPORT_SYMBOL_GPL(amd_pmu_disable_virt); diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c new file mode 100644 index 00000000000..cbb1be3ed9e --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -0,0 +1,946 @@ +/* + * Performance events - AMD IBS + * + * Copyright (C) 2011 Advanced Micro Devices, Inc., Robert Richter + * + * For licencing details see kernel-base/COPYING + */ + +#include <linux/perf_event.h> +#include <linux/module.h> +#include <linux/pci.h> +#include <linux/ptrace.h> +#include <linux/syscore_ops.h> + +#include <asm/apic.h> + +#include "perf_event.h" + +static u32 ibs_caps; + +#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) + +#include <linux/kprobes.h> +#include <linux/hardirq.h> + +#include <asm/nmi.h> + +#define IBS_FETCH_CONFIG_MASK (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT) +#define IBS_OP_CONFIG_MASK IBS_OP_MAX_CNT + +enum ibs_states { + IBS_ENABLED = 0, + IBS_STARTED = 1, + IBS_STOPPING = 2, + + IBS_MAX_STATES, +}; + +struct cpu_perf_ibs { + struct perf_event *event; + unsigned long state[BITS_TO_LONGS(IBS_MAX_STATES)]; +}; + +struct perf_ibs { + struct pmu pmu; + unsigned int msr; + u64 config_mask; + u64 cnt_mask; + u64 enable_mask; + u64 valid_mask; + u64 max_period; + unsigned long offset_mask[1]; + int offset_max; + struct cpu_perf_ibs __percpu *pcpu; + + struct attribute **format_attrs; + struct attribute_group format_group; + const struct attribute_group *attr_groups[2]; + + u64 (*get_count)(u64 config); +}; + +struct perf_ibs_data { + u32 size; + union { + u32 data[0]; /* data buffer starts here */ + u32 caps; + }; + u64 regs[MSR_AMD64_IBS_REG_COUNT_MAX]; +}; + +static int +perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *hw_period) +{ + s64 left = local64_read(&hwc->period_left); + s64 period = hwc->sample_period; + int overflow = 0; + + /* + * If we are way outside a reasonable range then just skip forward: + */ + if (unlikely(left <= -period)) { + left = period; + local64_set(&hwc->period_left, left); + hwc->last_period = period; + overflow = 1; + } + + if (unlikely(left < (s64)min)) { + left += period; + local64_set(&hwc->period_left, left); + hwc->last_period = period; + overflow = 1; + } + + /* + * If the hw period that triggers the sw overflow is too short + * we might hit the irq handler. This biases the results. + * Thus we shorten the next-to-last period and set the last + * period to the max period. + */ + if (left > max) { + left -= max; + if (left > max) + left = max; + else if (left < min) + left = min; + } + + *hw_period = (u64)left; + + return overflow; +} + +static int +perf_event_try_update(struct perf_event *event, u64 new_raw_count, int width) +{ + struct hw_perf_event *hwc = &event->hw; + int shift = 64 - width; + u64 prev_raw_count; + u64 delta; + + /* + * Careful: an NMI might modify the previous event value. + * + * Our tactic to handle this is to first atomically read and + * exchange a new raw count - then add that new-prev delta + * count to the generic event atomically: + */ + prev_raw_count = local64_read(&hwc->prev_count); + if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, + new_raw_count) != prev_raw_count) + return 0; + + /* + * Now we have the new raw value and have updated the prev + * timestamp already. We can now calculate the elapsed delta + * (event-)time and add that to the generic event. + * + * Careful, not all hw sign-extends above the physical width + * of the count. + */ + delta = (new_raw_count << shift) - (prev_raw_count << shift); + delta >>= shift; + + local64_add(delta, &event->count); + local64_sub(delta, &hwc->period_left); + + return 1; +} + +static struct perf_ibs perf_ibs_fetch; +static struct perf_ibs perf_ibs_op; + +static struct perf_ibs *get_ibs_pmu(int type) +{ + if (perf_ibs_fetch.pmu.type == type) + return &perf_ibs_fetch; + if (perf_ibs_op.pmu.type == type) + return &perf_ibs_op; + return NULL; +} + +/* + * Use IBS for precise event sampling: + * + * perf record -a -e cpu-cycles:p ... # use ibs op counting cycle count + * perf record -a -e r076:p ... # same as -e cpu-cycles:p + * perf record -a -e r0C1:p ... # use ibs op counting micro-ops + * + * IbsOpCntCtl (bit 19) of IBS Execution Control Register (IbsOpCtl, + * MSRC001_1033) is used to select either cycle or micro-ops counting + * mode. + * + * The rip of IBS samples has skid 0. Thus, IBS supports precise + * levels 1 and 2 and the PERF_EFLAGS_EXACT is set. In rare cases the + * rip is invalid when IBS was not able to record the rip correctly. + * We clear PERF_EFLAGS_EXACT and take the rip from pt_regs then. + * + */ +static int perf_ibs_precise_event(struct perf_event *event, u64 *config) +{ + switch (event->attr.precise_ip) { + case 0: + return -ENOENT; + case 1: + case 2: + break; + default: + return -EOPNOTSUPP; + } + + switch (event->attr.type) { + case PERF_TYPE_HARDWARE: + switch (event->attr.config) { + case PERF_COUNT_HW_CPU_CYCLES: + *config = 0; + return 0; + } + break; + case PERF_TYPE_RAW: + switch (event->attr.config) { + case 0x0076: + *config = 0; + return 0; + case 0x00C1: + *config = IBS_OP_CNT_CTL; + return 0; + } + break; + default: + return -ENOENT; + } + + return -EOPNOTSUPP; +} + +static const struct perf_event_attr ibs_notsupp = { + .exclude_user = 1, + .exclude_kernel = 1, + .exclude_hv = 1, + .exclude_idle = 1, + .exclude_host = 1, + .exclude_guest = 1, +}; + +static int perf_ibs_init(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct perf_ibs *perf_ibs; + u64 max_cnt, config; + int ret; + + perf_ibs = get_ibs_pmu(event->attr.type); + if (perf_ibs) { + config = event->attr.config; + } else { + perf_ibs = &perf_ibs_op; + ret = perf_ibs_precise_event(event, &config); + if (ret) + return ret; + } + + if (event->pmu != &perf_ibs->pmu) + return -ENOENT; + + if (perf_flags(&event->attr) & perf_flags(&ibs_notsupp)) + return -EINVAL; + + if (config & ~perf_ibs->config_mask) + return -EINVAL; + + if (hwc->sample_period) { + if (config & perf_ibs->cnt_mask) + /* raw max_cnt may not be set */ + return -EINVAL; + if (!event->attr.sample_freq && hwc->sample_period & 0x0f) + /* + * lower 4 bits can not be set in ibs max cnt, + * but allowing it in case we adjust the + * sample period to set a frequency. + */ + return -EINVAL; + hwc->sample_period &= ~0x0FULL; + if (!hwc->sample_period) + hwc->sample_period = 0x10; + } else { + max_cnt = config & perf_ibs->cnt_mask; + config &= ~perf_ibs->cnt_mask; + event->attr.sample_period = max_cnt << 4; + hwc->sample_period = event->attr.sample_period; + } + + if (!hwc->sample_period) + return -EINVAL; + + /* + * If we modify hwc->sample_period, we also need to update + * hwc->last_period and hwc->period_left. + */ + hwc->last_period = hwc->sample_period; + local64_set(&hwc->period_left, hwc->sample_period); + + hwc->config_base = perf_ibs->msr; + hwc->config = config; + + return 0; +} + +static int perf_ibs_set_period(struct perf_ibs *perf_ibs, + struct hw_perf_event *hwc, u64 *period) +{ + int overflow; + + /* ignore lower 4 bits in min count: */ + overflow = perf_event_set_period(hwc, 1<<4, perf_ibs->max_period, period); + local64_set(&hwc->prev_count, 0); + + return overflow; +} + +static u64 get_ibs_fetch_count(u64 config) +{ + return (config & IBS_FETCH_CNT) >> 12; +} + +static u64 get_ibs_op_count(u64 config) +{ + u64 count = 0; + + if (config & IBS_OP_VAL) + count += (config & IBS_OP_MAX_CNT) << 4; /* cnt rolled over */ + + if (ibs_caps & IBS_CAPS_RDWROPCNT) + count += (config & IBS_OP_CUR_CNT) >> 32; + + return count; +} + +static void +perf_ibs_event_update(struct perf_ibs *perf_ibs, struct perf_event *event, + u64 *config) +{ + u64 count = perf_ibs->get_count(*config); + + /* + * Set width to 64 since we do not overflow on max width but + * instead on max count. In perf_ibs_set_period() we clear + * prev count manually on overflow. + */ + while (!perf_event_try_update(event, count, 64)) { + rdmsrl(event->hw.config_base, *config); + count = perf_ibs->get_count(*config); + } +} + +static inline void perf_ibs_enable_event(struct perf_ibs *perf_ibs, + struct hw_perf_event *hwc, u64 config) +{ + wrmsrl(hwc->config_base, hwc->config | config | perf_ibs->enable_mask); +} + +/* + * Erratum #420 Instruction-Based Sampling Engine May Generate + * Interrupt that Cannot Be Cleared: + * + * Must clear counter mask first, then clear the enable bit. See + * Revision Guide for AMD Family 10h Processors, Publication #41322. + */ +static inline void perf_ibs_disable_event(struct perf_ibs *perf_ibs, + struct hw_perf_event *hwc, u64 config) +{ + config &= ~perf_ibs->cnt_mask; + wrmsrl(hwc->config_base, config); + config &= ~perf_ibs->enable_mask; + wrmsrl(hwc->config_base, config); +} + +/* + * We cannot restore the ibs pmu state, so we always needs to update + * the event while stopping it and then reset the state when starting + * again. Thus, ignoring PERF_EF_RELOAD and PERF_EF_UPDATE flags in + * perf_ibs_start()/perf_ibs_stop() and instead always do it. + */ +static void perf_ibs_start(struct perf_event *event, int flags) +{ + struct hw_perf_event *hwc = &event->hw; + struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); + struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); + u64 period; + + if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED))) + return; + + WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE)); + hwc->state = 0; + + perf_ibs_set_period(perf_ibs, hwc, &period); + set_bit(IBS_STARTED, pcpu->state); + perf_ibs_enable_event(perf_ibs, hwc, period >> 4); + + perf_event_update_userpage(event); +} + +static void perf_ibs_stop(struct perf_event *event, int flags) +{ + struct hw_perf_event *hwc = &event->hw; + struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); + struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); + u64 config; + int stopping; + + stopping = test_and_clear_bit(IBS_STARTED, pcpu->state); + + if (!stopping && (hwc->state & PERF_HES_UPTODATE)) + return; + + rdmsrl(hwc->config_base, config); + + if (stopping) { + set_bit(IBS_STOPPING, pcpu->state); + perf_ibs_disable_event(perf_ibs, hwc, config); + WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); + hwc->state |= PERF_HES_STOPPED; + } + + if (hwc->state & PERF_HES_UPTODATE) + return; + + /* + * Clear valid bit to not count rollovers on update, rollovers + * are only updated in the irq handler. + */ + config &= ~perf_ibs->valid_mask; + + perf_ibs_event_update(perf_ibs, event, &config); + hwc->state |= PERF_HES_UPTODATE; +} + +static int perf_ibs_add(struct perf_event *event, int flags) +{ + struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); + struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); + + if (test_and_set_bit(IBS_ENABLED, pcpu->state)) + return -ENOSPC; + + event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + + pcpu->event = event; + + if (flags & PERF_EF_START) + perf_ibs_start(event, PERF_EF_RELOAD); + + return 0; +} + +static void perf_ibs_del(struct perf_event *event, int flags) +{ + struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); + struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); + + if (!test_and_clear_bit(IBS_ENABLED, pcpu->state)) + return; + + perf_ibs_stop(event, PERF_EF_UPDATE); + + pcpu->event = NULL; + + perf_event_update_userpage(event); +} + +static void perf_ibs_read(struct perf_event *event) { } + +PMU_FORMAT_ATTR(rand_en, "config:57"); +PMU_FORMAT_ATTR(cnt_ctl, "config:19"); + +static struct attribute *ibs_fetch_format_attrs[] = { + &format_attr_rand_en.attr, + NULL, +}; + +static struct attribute *ibs_op_format_attrs[] = { + NULL, /* &format_attr_cnt_ctl.attr if IBS_CAPS_OPCNT */ + NULL, +}; + +static struct perf_ibs perf_ibs_fetch = { + .pmu = { + .task_ctx_nr = perf_invalid_context, + + .event_init = perf_ibs_init, + .add = perf_ibs_add, + .del = perf_ibs_del, + .start = perf_ibs_start, + .stop = perf_ibs_stop, + .read = perf_ibs_read, + }, + .msr = MSR_AMD64_IBSFETCHCTL, + .config_mask = IBS_FETCH_CONFIG_MASK, + .cnt_mask = IBS_FETCH_MAX_CNT, + .enable_mask = IBS_FETCH_ENABLE, + .valid_mask = IBS_FETCH_VAL, + .max_period = IBS_FETCH_MAX_CNT << 4, + .offset_mask = { MSR_AMD64_IBSFETCH_REG_MASK }, + .offset_max = MSR_AMD64_IBSFETCH_REG_COUNT, + .format_attrs = ibs_fetch_format_attrs, + + .get_count = get_ibs_fetch_count, +}; + +static struct perf_ibs perf_ibs_op = { + .pmu = { + .task_ctx_nr = perf_invalid_context, + + .event_init = perf_ibs_init, + .add = perf_ibs_add, + .del = perf_ibs_del, + .start = perf_ibs_start, + .stop = perf_ibs_stop, + .read = perf_ibs_read, + }, + .msr = MSR_AMD64_IBSOPCTL, + .config_mask = IBS_OP_CONFIG_MASK, + .cnt_mask = IBS_OP_MAX_CNT, + .enable_mask = IBS_OP_ENABLE, + .valid_mask = IBS_OP_VAL, + .max_period = IBS_OP_MAX_CNT << 4, + .offset_mask = { MSR_AMD64_IBSOP_REG_MASK }, + .offset_max = MSR_AMD64_IBSOP_REG_COUNT, + .format_attrs = ibs_op_format_attrs, + + .get_count = get_ibs_op_count, +}; + +static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) +{ + struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); + struct perf_event *event = pcpu->event; + struct hw_perf_event *hwc = &event->hw; + struct perf_sample_data data; + struct perf_raw_record raw; + struct pt_regs regs; + struct perf_ibs_data ibs_data; + int offset, size, check_rip, offset_max, throttle = 0; + unsigned int msr; + u64 *buf, *config, period; + + if (!test_bit(IBS_STARTED, pcpu->state)) { + /* + * Catch spurious interrupts after stopping IBS: After + * disabling IBS there could be still incoming NMIs + * with samples that even have the valid bit cleared. + * Mark all this NMIs as handled. + */ + return test_and_clear_bit(IBS_STOPPING, pcpu->state) ? 1 : 0; + } + + msr = hwc->config_base; + buf = ibs_data.regs; + rdmsrl(msr, *buf); + if (!(*buf++ & perf_ibs->valid_mask)) + return 0; + + config = &ibs_data.regs[0]; + perf_ibs_event_update(perf_ibs, event, config); + perf_sample_data_init(&data, 0, hwc->last_period); + if (!perf_ibs_set_period(perf_ibs, hwc, &period)) + goto out; /* no sw counter overflow */ + + ibs_data.caps = ibs_caps; + size = 1; + offset = 1; + check_rip = (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_RIPINVALIDCHK)); + if (event->attr.sample_type & PERF_SAMPLE_RAW) + offset_max = perf_ibs->offset_max; + else if (check_rip) + offset_max = 2; + else + offset_max = 1; + do { + rdmsrl(msr + offset, *buf++); + size++; + offset = find_next_bit(perf_ibs->offset_mask, + perf_ibs->offset_max, + offset + 1); + } while (offset < offset_max); + ibs_data.size = sizeof(u64) * size; + + regs = *iregs; + if (check_rip && (ibs_data.regs[2] & IBS_RIP_INVALID)) { + regs.flags &= ~PERF_EFLAGS_EXACT; + } else { + set_linear_ip(®s, ibs_data.regs[1]); + regs.flags |= PERF_EFLAGS_EXACT; + } + + if (event->attr.sample_type & PERF_SAMPLE_RAW) { + raw.size = sizeof(u32) + ibs_data.size; + raw.data = ibs_data.data; + data.raw = &raw; + } + + throttle = perf_event_overflow(event, &data, ®s); +out: + if (throttle) + perf_ibs_disable_event(perf_ibs, hwc, *config); + else + perf_ibs_enable_event(perf_ibs, hwc, period >> 4); + + perf_event_update_userpage(event); + + return 1; +} + +static int +perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs) +{ + int handled = 0; + + handled += perf_ibs_handle_irq(&perf_ibs_fetch, regs); + handled += perf_ibs_handle_irq(&perf_ibs_op, regs); + + if (handled) + inc_irq_stat(apic_perf_irqs); + + return handled; +} +NOKPROBE_SYMBOL(perf_ibs_nmi_handler); + +static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name) +{ + struct cpu_perf_ibs __percpu *pcpu; + int ret; + + pcpu = alloc_percpu(struct cpu_perf_ibs); + if (!pcpu) + return -ENOMEM; + + perf_ibs->pcpu = pcpu; + + /* register attributes */ + if (perf_ibs->format_attrs[0]) { + memset(&perf_ibs->format_group, 0, sizeof(perf_ibs->format_group)); + perf_ibs->format_group.name = "format"; + perf_ibs->format_group.attrs = perf_ibs->format_attrs; + + memset(&perf_ibs->attr_groups, 0, sizeof(perf_ibs->attr_groups)); + perf_ibs->attr_groups[0] = &perf_ibs->format_group; + perf_ibs->pmu.attr_groups = perf_ibs->attr_groups; + } + + ret = perf_pmu_register(&perf_ibs->pmu, name, -1); + if (ret) { + perf_ibs->pcpu = NULL; + free_percpu(pcpu); + } + + return ret; +} + +static __init int perf_event_ibs_init(void) +{ + struct attribute **attr = ibs_op_format_attrs; + + if (!ibs_caps) + return -ENODEV; /* ibs not supported by the cpu */ + + perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch"); + + if (ibs_caps & IBS_CAPS_OPCNT) { + perf_ibs_op.config_mask |= IBS_OP_CNT_CTL; + *attr++ = &format_attr_cnt_ctl.attr; + } + perf_ibs_pmu_init(&perf_ibs_op, "ibs_op"); + + register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs"); + printk(KERN_INFO "perf: AMD IBS detected (0x%08x)\n", ibs_caps); + + return 0; +} + +#else /* defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) */ + +static __init int perf_event_ibs_init(void) { return 0; } + +#endif + +/* IBS - apic initialization, for perf and oprofile */ + +static __init u32 __get_ibs_caps(void) +{ + u32 caps; + unsigned int max_level; + + if (!boot_cpu_has(X86_FEATURE_IBS)) + return 0; + + /* check IBS cpuid feature flags */ + max_level = cpuid_eax(0x80000000); + if (max_level < IBS_CPUID_FEATURES) + return IBS_CAPS_DEFAULT; + + caps = cpuid_eax(IBS_CPUID_FEATURES); + if (!(caps & IBS_CAPS_AVAIL)) + /* cpuid flags not valid */ + return IBS_CAPS_DEFAULT; + + return caps; +} + +u32 get_ibs_caps(void) +{ + return ibs_caps; +} + +EXPORT_SYMBOL(get_ibs_caps); + +static inline int get_eilvt(int offset) +{ + return !setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 1); +} + +static inline int put_eilvt(int offset) +{ + return !setup_APIC_eilvt(offset, 0, 0, 1); +} + +/* + * Check and reserve APIC extended interrupt LVT offset for IBS if available. + */ +static inline int ibs_eilvt_valid(void) +{ + int offset; + u64 val; + int valid = 0; + + preempt_disable(); + + rdmsrl(MSR_AMD64_IBSCTL, val); + offset = val & IBSCTL_LVT_OFFSET_MASK; + + if (!(val & IBSCTL_LVT_OFFSET_VALID)) { + pr_err(FW_BUG "cpu %d, invalid IBS interrupt offset %d (MSR%08X=0x%016llx)\n", + smp_processor_id(), offset, MSR_AMD64_IBSCTL, val); + goto out; + } + + if (!get_eilvt(offset)) { + pr_err(FW_BUG "cpu %d, IBS interrupt offset %d not available (MSR%08X=0x%016llx)\n", + smp_processor_id(), offset, MSR_AMD64_IBSCTL, val); + goto out; + } + + valid = 1; +out: + preempt_enable(); + + return valid; +} + +static int setup_ibs_ctl(int ibs_eilvt_off) +{ + struct pci_dev *cpu_cfg; + int nodes; + u32 value = 0; + + nodes = 0; + cpu_cfg = NULL; + do { + cpu_cfg = pci_get_device(PCI_VENDOR_ID_AMD, + PCI_DEVICE_ID_AMD_10H_NB_MISC, + cpu_cfg); + if (!cpu_cfg) + break; + ++nodes; + pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off + | IBSCTL_LVT_OFFSET_VALID); + pci_read_config_dword(cpu_cfg, IBSCTL, &value); + if (value != (ibs_eilvt_off | IBSCTL_LVT_OFFSET_VALID)) { + pci_dev_put(cpu_cfg); + printk(KERN_DEBUG "Failed to setup IBS LVT offset, " + "IBSCTL = 0x%08x\n", value); + return -EINVAL; + } + } while (1); + + if (!nodes) { + printk(KERN_DEBUG "No CPU node configured for IBS\n"); + return -ENODEV; + } + + return 0; +} + +/* + * This runs only on the current cpu. We try to find an LVT offset and + * setup the local APIC. For this we must disable preemption. On + * success we initialize all nodes with this offset. This updates then + * the offset in the IBS_CTL per-node msr. The per-core APIC setup of + * the IBS interrupt vector is handled by perf_ibs_cpu_notifier that + * is using the new offset. + */ +static int force_ibs_eilvt_setup(void) +{ + int offset; + int ret; + + preempt_disable(); + /* find the next free available EILVT entry, skip offset 0 */ + for (offset = 1; offset < APIC_EILVT_NR_MAX; offset++) { + if (get_eilvt(offset)) + break; + } + preempt_enable(); + + if (offset == APIC_EILVT_NR_MAX) { + printk(KERN_DEBUG "No EILVT entry available\n"); + return -EBUSY; + } + + ret = setup_ibs_ctl(offset); + if (ret) + goto out; + + if (!ibs_eilvt_valid()) { + ret = -EFAULT; + goto out; + } + + pr_info("IBS: LVT offset %d assigned\n", offset); + + return 0; +out: + preempt_disable(); + put_eilvt(offset); + preempt_enable(); + return ret; +} + +static void ibs_eilvt_setup(void) +{ + /* + * Force LVT offset assignment for family 10h: The offsets are + * not assigned by the BIOS for this family, so the OS is + * responsible for doing it. If the OS assignment fails, fall + * back to BIOS settings and try to setup this. + */ + if (boot_cpu_data.x86 == 0x10) + force_ibs_eilvt_setup(); +} + +static inline int get_ibs_lvt_offset(void) +{ + u64 val; + + rdmsrl(MSR_AMD64_IBSCTL, val); + if (!(val & IBSCTL_LVT_OFFSET_VALID)) + return -EINVAL; + + return val & IBSCTL_LVT_OFFSET_MASK; +} + +static void setup_APIC_ibs(void *dummy) +{ + int offset; + + offset = get_ibs_lvt_offset(); + if (offset < 0) + goto failed; + + if (!setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 0)) + return; +failed: + pr_warn("perf: IBS APIC setup failed on cpu #%d\n", + smp_processor_id()); +} + +static void clear_APIC_ibs(void *dummy) +{ + int offset; + + offset = get_ibs_lvt_offset(); + if (offset >= 0) + setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1); +} + +#ifdef CONFIG_PM + +static int perf_ibs_suspend(void) +{ + clear_APIC_ibs(NULL); + return 0; +} + +static void perf_ibs_resume(void) +{ + ibs_eilvt_setup(); + setup_APIC_ibs(NULL); +} + +static struct syscore_ops perf_ibs_syscore_ops = { + .resume = perf_ibs_resume, + .suspend = perf_ibs_suspend, +}; + +static void perf_ibs_pm_init(void) +{ + register_syscore_ops(&perf_ibs_syscore_ops); +} + +#else + +static inline void perf_ibs_pm_init(void) { } + +#endif + +static int +perf_ibs_cpu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) +{ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_STARTING: + setup_APIC_ibs(NULL); + break; + case CPU_DYING: + clear_APIC_ibs(NULL); + break; + default: + break; + } + + return NOTIFY_OK; +} + +static __init int amd_ibs_init(void) +{ + u32 caps; + int ret = -EINVAL; + + caps = __get_ibs_caps(); + if (!caps) + return -ENODEV; /* ibs not supported by the cpu */ + + ibs_eilvt_setup(); + + if (!ibs_eilvt_valid()) + goto out; + + perf_ibs_pm_init(); + cpu_notifier_register_begin(); + ibs_caps = caps; + /* make ibs_caps visible to other cpus: */ + smp_mb(); + smp_call_function(setup_APIC_ibs, NULL, 1); + __perf_cpu_notifier(perf_ibs_cpu_notifier); + cpu_notifier_register_done(); + + ret = perf_event_ibs_init(); +out: + if (ret) + pr_err("Failed to setup IBS, %d\n", ret); + return ret; +} + +/* Since we need the pci subsystem to init ibs we can't do this earlier: */ +device_initcall(amd_ibs_init); diff --git a/arch/x86/kernel/cpu/perf_event_amd_iommu.c b/arch/x86/kernel/cpu/perf_event_amd_iommu.c new file mode 100644 index 00000000000..639d1289b1b --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_amd_iommu.c @@ -0,0 +1,502 @@ +/* + * Copyright (C) 2013 Advanced Micro Devices, Inc. + * + * Author: Steven Kinney <Steven.Kinney@amd.com> + * Author: Suravee Suthikulpanit <Suraveee.Suthikulpanit@amd.com> + * + * Perf: amd_iommu - AMD IOMMU Performance Counter PMU implementation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/perf_event.h> +#include <linux/module.h> +#include <linux/cpumask.h> +#include <linux/slab.h> + +#include "perf_event.h" +#include "perf_event_amd_iommu.h" + +#define COUNTER_SHIFT 16 + +#define _GET_BANK(ev) ((u8)(ev->hw.extra_reg.reg >> 8)) +#define _GET_CNTR(ev) ((u8)(ev->hw.extra_reg.reg)) + +/* iommu pmu config masks */ +#define _GET_CSOURCE(ev) ((ev->hw.config & 0xFFULL)) +#define _GET_DEVID(ev) ((ev->hw.config >> 8) & 0xFFFFULL) +#define _GET_PASID(ev) ((ev->hw.config >> 24) & 0xFFFFULL) +#define _GET_DOMID(ev) ((ev->hw.config >> 40) & 0xFFFFULL) +#define _GET_DEVID_MASK(ev) ((ev->hw.extra_reg.config) & 0xFFFFULL) +#define _GET_PASID_MASK(ev) ((ev->hw.extra_reg.config >> 16) & 0xFFFFULL) +#define _GET_DOMID_MASK(ev) ((ev->hw.extra_reg.config >> 32) & 0xFFFFULL) + +static struct perf_amd_iommu __perf_iommu; + +struct perf_amd_iommu { + struct pmu pmu; + u8 max_banks; + u8 max_counters; + u64 cntr_assign_mask; + raw_spinlock_t lock; + const struct attribute_group *attr_groups[4]; +}; + +#define format_group attr_groups[0] +#define cpumask_group attr_groups[1] +#define events_group attr_groups[2] +#define null_group attr_groups[3] + +/*--------------------------------------------- + * sysfs format attributes + *---------------------------------------------*/ +PMU_FORMAT_ATTR(csource, "config:0-7"); +PMU_FORMAT_ATTR(devid, "config:8-23"); +PMU_FORMAT_ATTR(pasid, "config:24-39"); +PMU_FORMAT_ATTR(domid, "config:40-55"); +PMU_FORMAT_ATTR(devid_mask, "config1:0-15"); +PMU_FORMAT_ATTR(pasid_mask, "config1:16-31"); +PMU_FORMAT_ATTR(domid_mask, "config1:32-47"); + +static struct attribute *iommu_format_attrs[] = { + &format_attr_csource.attr, + &format_attr_devid.attr, + &format_attr_pasid.attr, + &format_attr_domid.attr, + &format_attr_devid_mask.attr, + &format_attr_pasid_mask.attr, + &format_attr_domid_mask.attr, + NULL, +}; + +static struct attribute_group amd_iommu_format_group = { + .name = "format", + .attrs = iommu_format_attrs, +}; + +/*--------------------------------------------- + * sysfs events attributes + *---------------------------------------------*/ +struct amd_iommu_event_desc { + struct kobj_attribute attr; + const char *event; +}; + +static ssize_t _iommu_event_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct amd_iommu_event_desc *event = + container_of(attr, struct amd_iommu_event_desc, attr); + return sprintf(buf, "%s\n", event->event); +} + +#define AMD_IOMMU_EVENT_DESC(_name, _event) \ +{ \ + .attr = __ATTR(_name, 0444, _iommu_event_show, NULL), \ + .event = _event, \ +} + +static struct amd_iommu_event_desc amd_iommu_v2_event_descs[] = { + AMD_IOMMU_EVENT_DESC(mem_pass_untrans, "csource=0x01"), + AMD_IOMMU_EVENT_DESC(mem_pass_pretrans, "csource=0x02"), + AMD_IOMMU_EVENT_DESC(mem_pass_excl, "csource=0x03"), + AMD_IOMMU_EVENT_DESC(mem_target_abort, "csource=0x04"), + AMD_IOMMU_EVENT_DESC(mem_trans_total, "csource=0x05"), + AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pte_hit, "csource=0x06"), + AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pte_mis, "csource=0x07"), + AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pde_hit, "csource=0x08"), + AMD_IOMMU_EVENT_DESC(mem_iommu_tlb_pde_mis, "csource=0x09"), + AMD_IOMMU_EVENT_DESC(mem_dte_hit, "csource=0x0a"), + AMD_IOMMU_EVENT_DESC(mem_dte_mis, "csource=0x0b"), + AMD_IOMMU_EVENT_DESC(page_tbl_read_tot, "csource=0x0c"), + AMD_IOMMU_EVENT_DESC(page_tbl_read_nst, "csource=0x0d"), + AMD_IOMMU_EVENT_DESC(page_tbl_read_gst, "csource=0x0e"), + AMD_IOMMU_EVENT_DESC(int_dte_hit, "csource=0x0f"), + AMD_IOMMU_EVENT_DESC(int_dte_mis, "csource=0x10"), + AMD_IOMMU_EVENT_DESC(cmd_processed, "csource=0x11"), + AMD_IOMMU_EVENT_DESC(cmd_processed_inv, "csource=0x12"), + AMD_IOMMU_EVENT_DESC(tlb_inv, "csource=0x13"), + { /* end: all zeroes */ }, +}; + +/*--------------------------------------------- + * sysfs cpumask attributes + *---------------------------------------------*/ +static cpumask_t iommu_cpumask; + +static ssize_t _iommu_cpumask_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + int n = cpulist_scnprintf(buf, PAGE_SIZE - 2, &iommu_cpumask); + buf[n++] = '\n'; + buf[n] = '\0'; + return n; +} +static DEVICE_ATTR(cpumask, S_IRUGO, _iommu_cpumask_show, NULL); + +static struct attribute *iommu_cpumask_attrs[] = { + &dev_attr_cpumask.attr, + NULL, +}; + +static struct attribute_group amd_iommu_cpumask_group = { + .attrs = iommu_cpumask_attrs, +}; + +/*---------------------------------------------*/ + +static int get_next_avail_iommu_bnk_cntr(struct perf_amd_iommu *perf_iommu) +{ + unsigned long flags; + int shift, bank, cntr, retval; + int max_banks = perf_iommu->max_banks; + int max_cntrs = perf_iommu->max_counters; + + raw_spin_lock_irqsave(&perf_iommu->lock, flags); + + for (bank = 0, shift = 0; bank < max_banks; bank++) { + for (cntr = 0; cntr < max_cntrs; cntr++) { + shift = bank + (bank*3) + cntr; + if (perf_iommu->cntr_assign_mask & (1ULL<<shift)) { + continue; + } else { + perf_iommu->cntr_assign_mask |= (1ULL<<shift); + retval = ((u16)((u16)bank<<8) | (u8)(cntr)); + goto out; + } + } + } + retval = -ENOSPC; +out: + raw_spin_unlock_irqrestore(&perf_iommu->lock, flags); + return retval; +} + +static int clear_avail_iommu_bnk_cntr(struct perf_amd_iommu *perf_iommu, + u8 bank, u8 cntr) +{ + unsigned long flags; + int max_banks, max_cntrs; + int shift = 0; + + max_banks = perf_iommu->max_banks; + max_cntrs = perf_iommu->max_counters; + + if ((bank > max_banks) || (cntr > max_cntrs)) + return -EINVAL; + + shift = bank + cntr + (bank*3); + + raw_spin_lock_irqsave(&perf_iommu->lock, flags); + perf_iommu->cntr_assign_mask &= ~(1ULL<<shift); + raw_spin_unlock_irqrestore(&perf_iommu->lock, flags); + + return 0; +} + +static int perf_iommu_event_init(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct perf_amd_iommu *perf_iommu; + u64 config, config1; + + /* test the event attr type check for PMU enumeration */ + if (event->attr.type != event->pmu->type) + return -ENOENT; + + /* + * IOMMU counters are shared across all cores. + * Therefore, it does not support per-process mode. + * Also, it does not support event sampling mode. + */ + if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK) + return -EINVAL; + + /* IOMMU counters do not have usr/os/guest/host bits */ + if (event->attr.exclude_user || event->attr.exclude_kernel || + event->attr.exclude_host || event->attr.exclude_guest) + return -EINVAL; + + if (event->cpu < 0) + return -EINVAL; + + perf_iommu = &__perf_iommu; + + if (event->pmu != &perf_iommu->pmu) + return -ENOENT; + + if (perf_iommu) { + config = event->attr.config; + config1 = event->attr.config1; + } else { + return -EINVAL; + } + + /* integrate with iommu base devid (0000), assume one iommu */ + perf_iommu->max_banks = + amd_iommu_pc_get_max_banks(IOMMU_BASE_DEVID); + perf_iommu->max_counters = + amd_iommu_pc_get_max_counters(IOMMU_BASE_DEVID); + if ((perf_iommu->max_banks == 0) || (perf_iommu->max_counters == 0)) + return -EINVAL; + + /* update the hw_perf_event struct with the iommu config data */ + hwc->config = config; + hwc->extra_reg.config = config1; + + return 0; +} + +static void perf_iommu_enable_event(struct perf_event *ev) +{ + u8 csource = _GET_CSOURCE(ev); + u16 devid = _GET_DEVID(ev); + u64 reg = 0ULL; + + reg = csource; + amd_iommu_pc_get_set_reg_val(devid, + _GET_BANK(ev), _GET_CNTR(ev) , + IOMMU_PC_COUNTER_SRC_REG, ®, true); + + reg = 0ULL | devid | (_GET_DEVID_MASK(ev) << 32); + if (reg) + reg |= (1UL << 31); + amd_iommu_pc_get_set_reg_val(devid, + _GET_BANK(ev), _GET_CNTR(ev) , + IOMMU_PC_DEVID_MATCH_REG, ®, true); + + reg = 0ULL | _GET_PASID(ev) | (_GET_PASID_MASK(ev) << 32); + if (reg) + reg |= (1UL << 31); + amd_iommu_pc_get_set_reg_val(devid, + _GET_BANK(ev), _GET_CNTR(ev) , + IOMMU_PC_PASID_MATCH_REG, ®, true); + + reg = 0ULL | _GET_DOMID(ev) | (_GET_DOMID_MASK(ev) << 32); + if (reg) + reg |= (1UL << 31); + amd_iommu_pc_get_set_reg_val(devid, + _GET_BANK(ev), _GET_CNTR(ev) , + IOMMU_PC_DOMID_MATCH_REG, ®, true); +} + +static void perf_iommu_disable_event(struct perf_event *event) +{ + u64 reg = 0ULL; + + amd_iommu_pc_get_set_reg_val(_GET_DEVID(event), + _GET_BANK(event), _GET_CNTR(event), + IOMMU_PC_COUNTER_SRC_REG, ®, true); +} + +static void perf_iommu_start(struct perf_event *event, int flags) +{ + struct hw_perf_event *hwc = &event->hw; + + pr_debug("perf: amd_iommu:perf_iommu_start\n"); + if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED))) + return; + + WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE)); + hwc->state = 0; + + if (flags & PERF_EF_RELOAD) { + u64 prev_raw_count = local64_read(&hwc->prev_count); + amd_iommu_pc_get_set_reg_val(_GET_DEVID(event), + _GET_BANK(event), _GET_CNTR(event), + IOMMU_PC_COUNTER_REG, &prev_raw_count, true); + } + + perf_iommu_enable_event(event); + perf_event_update_userpage(event); + +} + +static void perf_iommu_read(struct perf_event *event) +{ + u64 count = 0ULL; + u64 prev_raw_count = 0ULL; + u64 delta = 0ULL; + struct hw_perf_event *hwc = &event->hw; + pr_debug("perf: amd_iommu:perf_iommu_read\n"); + + amd_iommu_pc_get_set_reg_val(_GET_DEVID(event), + _GET_BANK(event), _GET_CNTR(event), + IOMMU_PC_COUNTER_REG, &count, false); + + /* IOMMU pc counter register is only 48 bits */ + count &= 0xFFFFFFFFFFFFULL; + + prev_raw_count = local64_read(&hwc->prev_count); + if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, + count) != prev_raw_count) + return; + + /* Handling 48-bit counter overflowing */ + delta = (count << COUNTER_SHIFT) - (prev_raw_count << COUNTER_SHIFT); + delta >>= COUNTER_SHIFT; + local64_add(delta, &event->count); + +} + +static void perf_iommu_stop(struct perf_event *event, int flags) +{ + struct hw_perf_event *hwc = &event->hw; + u64 config; + + pr_debug("perf: amd_iommu:perf_iommu_stop\n"); + + if (hwc->state & PERF_HES_UPTODATE) + return; + + perf_iommu_disable_event(event); + WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); + hwc->state |= PERF_HES_STOPPED; + + if (hwc->state & PERF_HES_UPTODATE) + return; + + config = hwc->config; + perf_iommu_read(event); + hwc->state |= PERF_HES_UPTODATE; +} + +static int perf_iommu_add(struct perf_event *event, int flags) +{ + int retval; + struct perf_amd_iommu *perf_iommu = + container_of(event->pmu, struct perf_amd_iommu, pmu); + + pr_debug("perf: amd_iommu:perf_iommu_add\n"); + event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + + /* request an iommu bank/counter */ + retval = get_next_avail_iommu_bnk_cntr(perf_iommu); + if (retval != -ENOSPC) + event->hw.extra_reg.reg = (u16)retval; + else + return retval; + + if (flags & PERF_EF_START) + perf_iommu_start(event, PERF_EF_RELOAD); + + return 0; +} + +static void perf_iommu_del(struct perf_event *event, int flags) +{ + struct perf_amd_iommu *perf_iommu = + container_of(event->pmu, struct perf_amd_iommu, pmu); + + pr_debug("perf: amd_iommu:perf_iommu_del\n"); + perf_iommu_stop(event, PERF_EF_UPDATE); + + /* clear the assigned iommu bank/counter */ + clear_avail_iommu_bnk_cntr(perf_iommu, + _GET_BANK(event), + _GET_CNTR(event)); + + perf_event_update_userpage(event); +} + +static __init int _init_events_attrs(struct perf_amd_iommu *perf_iommu) +{ + struct attribute **attrs; + struct attribute_group *attr_group; + int i = 0, j; + + while (amd_iommu_v2_event_descs[i].attr.attr.name) + i++; + + attr_group = kzalloc(sizeof(struct attribute *) + * (i + 1) + sizeof(*attr_group), GFP_KERNEL); + if (!attr_group) + return -ENOMEM; + + attrs = (struct attribute **)(attr_group + 1); + for (j = 0; j < i; j++) + attrs[j] = &amd_iommu_v2_event_descs[j].attr.attr; + + attr_group->name = "events"; + attr_group->attrs = attrs; + perf_iommu->events_group = attr_group; + + return 0; +} + +static __init void amd_iommu_pc_exit(void) +{ + if (__perf_iommu.events_group != NULL) { + kfree(__perf_iommu.events_group); + __perf_iommu.events_group = NULL; + } +} + +static __init int _init_perf_amd_iommu( + struct perf_amd_iommu *perf_iommu, char *name) +{ + int ret; + + raw_spin_lock_init(&perf_iommu->lock); + + /* Init format attributes */ + perf_iommu->format_group = &amd_iommu_format_group; + + /* Init cpumask attributes to only core 0 */ + cpumask_set_cpu(0, &iommu_cpumask); + perf_iommu->cpumask_group = &amd_iommu_cpumask_group; + + /* Init events attributes */ + if (_init_events_attrs(perf_iommu) != 0) + pr_err("perf: amd_iommu: Only support raw events.\n"); + + /* Init null attributes */ + perf_iommu->null_group = NULL; + perf_iommu->pmu.attr_groups = perf_iommu->attr_groups; + + ret = perf_pmu_register(&perf_iommu->pmu, name, -1); + if (ret) { + pr_err("perf: amd_iommu: Failed to initialized.\n"); + amd_iommu_pc_exit(); + } else { + pr_info("perf: amd_iommu: Detected. (%d banks, %d counters/bank)\n", + amd_iommu_pc_get_max_banks(IOMMU_BASE_DEVID), + amd_iommu_pc_get_max_counters(IOMMU_BASE_DEVID)); + } + + return ret; +} + +static struct perf_amd_iommu __perf_iommu = { + .pmu = { + .event_init = perf_iommu_event_init, + .add = perf_iommu_add, + .del = perf_iommu_del, + .start = perf_iommu_start, + .stop = perf_iommu_stop, + .read = perf_iommu_read, + }, + .max_banks = 0x00, + .max_counters = 0x00, + .cntr_assign_mask = 0ULL, + .format_group = NULL, + .cpumask_group = NULL, + .events_group = NULL, + .null_group = NULL, +}; + +static __init int amd_iommu_pc_init(void) +{ + /* Make sure the IOMMU PC resource is available */ + if (!amd_iommu_pc_supported()) + return -ENODEV; + + _init_perf_amd_iommu(&__perf_iommu, "amd_iommu"); + + return 0; +} + +device_initcall(amd_iommu_pc_init); diff --git a/arch/x86/kernel/cpu/perf_event_amd_iommu.h b/arch/x86/kernel/cpu/perf_event_amd_iommu.h new file mode 100644 index 00000000000..845d173278e --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_amd_iommu.h @@ -0,0 +1,40 @@ +/* + * Copyright (C) 2013 Advanced Micro Devices, Inc. + * + * Author: Steven Kinney <Steven.Kinney@amd.com> + * Author: Suravee Suthikulpanit <Suraveee.Suthikulpanit@amd.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _PERF_EVENT_AMD_IOMMU_H_ +#define _PERF_EVENT_AMD_IOMMU_H_ + +/* iommu pc mmio region register indexes */ +#define IOMMU_PC_COUNTER_REG 0x00 +#define IOMMU_PC_COUNTER_SRC_REG 0x08 +#define IOMMU_PC_PASID_MATCH_REG 0x10 +#define IOMMU_PC_DOMID_MATCH_REG 0x18 +#define IOMMU_PC_DEVID_MATCH_REG 0x20 +#define IOMMU_PC_COUNTER_REPORT_REG 0x28 + +/* maximun specified bank/counters */ +#define PC_MAX_SPEC_BNKS 64 +#define PC_MAX_SPEC_CNTRS 16 + +/* iommu pc reg masks*/ +#define IOMMU_BASE_DEVID 0x0000 + +/* amd_iommu_init.c external support functions */ +extern bool amd_iommu_pc_supported(void); + +extern u8 amd_iommu_pc_get_max_banks(u16 devid); + +extern u8 amd_iommu_pc_get_max_counters(u16 devid); + +extern int amd_iommu_pc_get_set_reg_val(u16 devid, u8 bank, u8 cntr, + u8 fxn, u64 *value, bool is_write); + +#endif /*_PERF_EVENT_AMD_IOMMU_H_*/ diff --git a/arch/x86/kernel/cpu/perf_event_amd_uncore.c b/arch/x86/kernel/cpu/perf_event_amd_uncore.c new file mode 100644 index 00000000000..3bbdf4cd38b --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_amd_uncore.c @@ -0,0 +1,547 @@ +/* + * Copyright (C) 2013 Advanced Micro Devices, Inc. + * + * Author: Jacob Shin <jacob.shin@amd.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/perf_event.h> +#include <linux/percpu.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/init.h> +#include <linux/cpu.h> +#include <linux/cpumask.h> + +#include <asm/cpufeature.h> +#include <asm/perf_event.h> +#include <asm/msr.h> + +#define NUM_COUNTERS_NB 4 +#define NUM_COUNTERS_L2 4 +#define MAX_COUNTERS NUM_COUNTERS_NB + +#define RDPMC_BASE_NB 6 +#define RDPMC_BASE_L2 10 + +#define COUNTER_SHIFT 16 + +struct amd_uncore { + int id; + int refcnt; + int cpu; + int num_counters; + int rdpmc_base; + u32 msr_base; + cpumask_t *active_mask; + struct pmu *pmu; + struct perf_event *events[MAX_COUNTERS]; + struct amd_uncore *free_when_cpu_online; +}; + +static struct amd_uncore * __percpu *amd_uncore_nb; +static struct amd_uncore * __percpu *amd_uncore_l2; + +static struct pmu amd_nb_pmu; +static struct pmu amd_l2_pmu; + +static cpumask_t amd_nb_active_mask; +static cpumask_t amd_l2_active_mask; + +static bool is_nb_event(struct perf_event *event) +{ + return event->pmu->type == amd_nb_pmu.type; +} + +static bool is_l2_event(struct perf_event *event) +{ + return event->pmu->type == amd_l2_pmu.type; +} + +static struct amd_uncore *event_to_amd_uncore(struct perf_event *event) +{ + if (is_nb_event(event) && amd_uncore_nb) + return *per_cpu_ptr(amd_uncore_nb, event->cpu); + else if (is_l2_event(event) && amd_uncore_l2) + return *per_cpu_ptr(amd_uncore_l2, event->cpu); + + return NULL; +} + +static void amd_uncore_read(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 prev, new; + s64 delta; + + /* + * since we do not enable counter overflow interrupts, + * we do not have to worry about prev_count changing on us + */ + + prev = local64_read(&hwc->prev_count); + rdpmcl(hwc->event_base_rdpmc, new); + local64_set(&hwc->prev_count, new); + delta = (new << COUNTER_SHIFT) - (prev << COUNTER_SHIFT); + delta >>= COUNTER_SHIFT; + local64_add(delta, &event->count); +} + +static void amd_uncore_start(struct perf_event *event, int flags) +{ + struct hw_perf_event *hwc = &event->hw; + + if (flags & PERF_EF_RELOAD) + wrmsrl(hwc->event_base, (u64)local64_read(&hwc->prev_count)); + + hwc->state = 0; + wrmsrl(hwc->config_base, (hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE)); + perf_event_update_userpage(event); +} + +static void amd_uncore_stop(struct perf_event *event, int flags) +{ + struct hw_perf_event *hwc = &event->hw; + + wrmsrl(hwc->config_base, hwc->config); + hwc->state |= PERF_HES_STOPPED; + + if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { + amd_uncore_read(event); + hwc->state |= PERF_HES_UPTODATE; + } +} + +static int amd_uncore_add(struct perf_event *event, int flags) +{ + int i; + struct amd_uncore *uncore = event_to_amd_uncore(event); + struct hw_perf_event *hwc = &event->hw; + + /* are we already assigned? */ + if (hwc->idx != -1 && uncore->events[hwc->idx] == event) + goto out; + + for (i = 0; i < uncore->num_counters; i++) { + if (uncore->events[i] == event) { + hwc->idx = i; + goto out; + } + } + + /* if not, take the first available counter */ + hwc->idx = -1; + for (i = 0; i < uncore->num_counters; i++) { + if (cmpxchg(&uncore->events[i], NULL, event) == NULL) { + hwc->idx = i; + break; + } + } + +out: + if (hwc->idx == -1) + return -EBUSY; + + hwc->config_base = uncore->msr_base + (2 * hwc->idx); + hwc->event_base = uncore->msr_base + 1 + (2 * hwc->idx); + hwc->event_base_rdpmc = uncore->rdpmc_base + hwc->idx; + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + + if (flags & PERF_EF_START) + amd_uncore_start(event, PERF_EF_RELOAD); + + return 0; +} + +static void amd_uncore_del(struct perf_event *event, int flags) +{ + int i; + struct amd_uncore *uncore = event_to_amd_uncore(event); + struct hw_perf_event *hwc = &event->hw; + + amd_uncore_stop(event, PERF_EF_UPDATE); + + for (i = 0; i < uncore->num_counters; i++) { + if (cmpxchg(&uncore->events[i], event, NULL) == event) + break; + } + + hwc->idx = -1; +} + +static int amd_uncore_event_init(struct perf_event *event) +{ + struct amd_uncore *uncore; + struct hw_perf_event *hwc = &event->hw; + + if (event->attr.type != event->pmu->type) + return -ENOENT; + + /* + * NB and L2 counters (MSRs) are shared across all cores that share the + * same NB / L2 cache. Interrupts can be directed to a single target + * core, however, event counts generated by processes running on other + * cores cannot be masked out. So we do not support sampling and + * per-thread events. + */ + if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK) + return -EINVAL; + + /* NB and L2 counters do not have usr/os/guest/host bits */ + if (event->attr.exclude_user || event->attr.exclude_kernel || + event->attr.exclude_host || event->attr.exclude_guest) + return -EINVAL; + + /* and we do not enable counter overflow interrupts */ + hwc->config = event->attr.config & AMD64_RAW_EVENT_MASK_NB; + hwc->idx = -1; + + if (event->cpu < 0) + return -EINVAL; + + uncore = event_to_amd_uncore(event); + if (!uncore) + return -ENODEV; + + /* + * since request can come in to any of the shared cores, we will remap + * to a single common cpu. + */ + event->cpu = uncore->cpu; + + return 0; +} + +static ssize_t amd_uncore_attr_show_cpumask(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + int n; + cpumask_t *active_mask; + struct pmu *pmu = dev_get_drvdata(dev); + + if (pmu->type == amd_nb_pmu.type) + active_mask = &amd_nb_active_mask; + else if (pmu->type == amd_l2_pmu.type) + active_mask = &amd_l2_active_mask; + else + return 0; + + n = cpulist_scnprintf(buf, PAGE_SIZE - 2, active_mask); + buf[n++] = '\n'; + buf[n] = '\0'; + return n; +} +static DEVICE_ATTR(cpumask, S_IRUGO, amd_uncore_attr_show_cpumask, NULL); + +static struct attribute *amd_uncore_attrs[] = { + &dev_attr_cpumask.attr, + NULL, +}; + +static struct attribute_group amd_uncore_attr_group = { + .attrs = amd_uncore_attrs, +}; + +PMU_FORMAT_ATTR(event, "config:0-7,32-35"); +PMU_FORMAT_ATTR(umask, "config:8-15"); + +static struct attribute *amd_uncore_format_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + NULL, +}; + +static struct attribute_group amd_uncore_format_group = { + .name = "format", + .attrs = amd_uncore_format_attr, +}; + +static const struct attribute_group *amd_uncore_attr_groups[] = { + &amd_uncore_attr_group, + &amd_uncore_format_group, + NULL, +}; + +static struct pmu amd_nb_pmu = { + .attr_groups = amd_uncore_attr_groups, + .name = "amd_nb", + .event_init = amd_uncore_event_init, + .add = amd_uncore_add, + .del = amd_uncore_del, + .start = amd_uncore_start, + .stop = amd_uncore_stop, + .read = amd_uncore_read, +}; + +static struct pmu amd_l2_pmu = { + .attr_groups = amd_uncore_attr_groups, + .name = "amd_l2", + .event_init = amd_uncore_event_init, + .add = amd_uncore_add, + .del = amd_uncore_del, + .start = amd_uncore_start, + .stop = amd_uncore_stop, + .read = amd_uncore_read, +}; + +static struct amd_uncore *amd_uncore_alloc(unsigned int cpu) +{ + return kzalloc_node(sizeof(struct amd_uncore), GFP_KERNEL, + cpu_to_node(cpu)); +} + +static void amd_uncore_cpu_up_prepare(unsigned int cpu) +{ + struct amd_uncore *uncore; + + if (amd_uncore_nb) { + uncore = amd_uncore_alloc(cpu); + uncore->cpu = cpu; + uncore->num_counters = NUM_COUNTERS_NB; + uncore->rdpmc_base = RDPMC_BASE_NB; + uncore->msr_base = MSR_F15H_NB_PERF_CTL; + uncore->active_mask = &amd_nb_active_mask; + uncore->pmu = &amd_nb_pmu; + *per_cpu_ptr(amd_uncore_nb, cpu) = uncore; + } + + if (amd_uncore_l2) { + uncore = amd_uncore_alloc(cpu); + uncore->cpu = cpu; + uncore->num_counters = NUM_COUNTERS_L2; + uncore->rdpmc_base = RDPMC_BASE_L2; + uncore->msr_base = MSR_F16H_L2I_PERF_CTL; + uncore->active_mask = &amd_l2_active_mask; + uncore->pmu = &amd_l2_pmu; + *per_cpu_ptr(amd_uncore_l2, cpu) = uncore; + } +} + +static struct amd_uncore * +amd_uncore_find_online_sibling(struct amd_uncore *this, + struct amd_uncore * __percpu *uncores) +{ + unsigned int cpu; + struct amd_uncore *that; + + for_each_online_cpu(cpu) { + that = *per_cpu_ptr(uncores, cpu); + + if (!that) + continue; + + if (this == that) + continue; + + if (this->id == that->id) { + that->free_when_cpu_online = this; + this = that; + break; + } + } + + this->refcnt++; + return this; +} + +static void amd_uncore_cpu_starting(unsigned int cpu) +{ + unsigned int eax, ebx, ecx, edx; + struct amd_uncore *uncore; + + if (amd_uncore_nb) { + uncore = *per_cpu_ptr(amd_uncore_nb, cpu); + cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); + uncore->id = ecx & 0xff; + + uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_nb); + *per_cpu_ptr(amd_uncore_nb, cpu) = uncore; + } + + if (amd_uncore_l2) { + unsigned int apicid = cpu_data(cpu).apicid; + unsigned int nshared; + + uncore = *per_cpu_ptr(amd_uncore_l2, cpu); + cpuid_count(0x8000001d, 2, &eax, &ebx, &ecx, &edx); + nshared = ((eax >> 14) & 0xfff) + 1; + uncore->id = apicid - (apicid % nshared); + + uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_l2); + *per_cpu_ptr(amd_uncore_l2, cpu) = uncore; + } +} + +static void uncore_online(unsigned int cpu, + struct amd_uncore * __percpu *uncores) +{ + struct amd_uncore *uncore = *per_cpu_ptr(uncores, cpu); + + kfree(uncore->free_when_cpu_online); + uncore->free_when_cpu_online = NULL; + + if (cpu == uncore->cpu) + cpumask_set_cpu(cpu, uncore->active_mask); +} + +static void amd_uncore_cpu_online(unsigned int cpu) +{ + if (amd_uncore_nb) + uncore_online(cpu, amd_uncore_nb); + + if (amd_uncore_l2) + uncore_online(cpu, amd_uncore_l2); +} + +static void uncore_down_prepare(unsigned int cpu, + struct amd_uncore * __percpu *uncores) +{ + unsigned int i; + struct amd_uncore *this = *per_cpu_ptr(uncores, cpu); + + if (this->cpu != cpu) + return; + + /* this cpu is going down, migrate to a shared sibling if possible */ + for_each_online_cpu(i) { + struct amd_uncore *that = *per_cpu_ptr(uncores, i); + + if (cpu == i) + continue; + + if (this == that) { + perf_pmu_migrate_context(this->pmu, cpu, i); + cpumask_clear_cpu(cpu, that->active_mask); + cpumask_set_cpu(i, that->active_mask); + that->cpu = i; + break; + } + } +} + +static void amd_uncore_cpu_down_prepare(unsigned int cpu) +{ + if (amd_uncore_nb) + uncore_down_prepare(cpu, amd_uncore_nb); + + if (amd_uncore_l2) + uncore_down_prepare(cpu, amd_uncore_l2); +} + +static void uncore_dead(unsigned int cpu, struct amd_uncore * __percpu *uncores) +{ + struct amd_uncore *uncore = *per_cpu_ptr(uncores, cpu); + + if (cpu == uncore->cpu) + cpumask_clear_cpu(cpu, uncore->active_mask); + + if (!--uncore->refcnt) + kfree(uncore); + *per_cpu_ptr(amd_uncore_nb, cpu) = NULL; +} + +static void amd_uncore_cpu_dead(unsigned int cpu) +{ + if (amd_uncore_nb) + uncore_dead(cpu, amd_uncore_nb); + + if (amd_uncore_l2) + uncore_dead(cpu, amd_uncore_l2); +} + +static int +amd_uncore_cpu_notifier(struct notifier_block *self, unsigned long action, + void *hcpu) +{ + unsigned int cpu = (long)hcpu; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_UP_PREPARE: + amd_uncore_cpu_up_prepare(cpu); + break; + + case CPU_STARTING: + amd_uncore_cpu_starting(cpu); + break; + + case CPU_ONLINE: + amd_uncore_cpu_online(cpu); + break; + + case CPU_DOWN_PREPARE: + amd_uncore_cpu_down_prepare(cpu); + break; + + case CPU_UP_CANCELED: + case CPU_DEAD: + amd_uncore_cpu_dead(cpu); + break; + + default: + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block amd_uncore_cpu_notifier_block = { + .notifier_call = amd_uncore_cpu_notifier, + .priority = CPU_PRI_PERF + 1, +}; + +static void __init init_cpu_already_online(void *dummy) +{ + unsigned int cpu = smp_processor_id(); + + amd_uncore_cpu_starting(cpu); + amd_uncore_cpu_online(cpu); +} + +static int __init amd_uncore_init(void) +{ + unsigned int cpu; + int ret = -ENODEV; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) + return -ENODEV; + + if (!cpu_has_topoext) + return -ENODEV; + + if (cpu_has_perfctr_nb) { + amd_uncore_nb = alloc_percpu(struct amd_uncore *); + perf_pmu_register(&amd_nb_pmu, amd_nb_pmu.name, -1); + + printk(KERN_INFO "perf: AMD NB counters detected\n"); + ret = 0; + } + + if (cpu_has_perfctr_l2) { + amd_uncore_l2 = alloc_percpu(struct amd_uncore *); + perf_pmu_register(&amd_l2_pmu, amd_l2_pmu.name, -1); + + printk(KERN_INFO "perf: AMD L2I counters detected\n"); + ret = 0; + } + + if (ret) + return -ENODEV; + + cpu_notifier_register_begin(); + + /* init cpus already online before registering for hotplug notifier */ + for_each_online_cpu(cpu) { + amd_uncore_cpu_up_prepare(cpu); + smp_call_function_single(cpu, init_cpu_already_online, NULL, 1); + } + + __register_cpu_notifier(&amd_uncore_cpu_notifier_block); + cpu_notifier_register_done(); + + return 0; +} +device_initcall(amd_uncore_init); diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index c8f5c088cad..2502d0d9d24 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1,20 +1,40 @@ -#ifdef CONFIG_CPU_SUP_INTEL +/* + * Per core/cpu state + * + * Used to coordinate shared registers between HT threads or + * among events on a single PMU. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/stddef.h> +#include <linux/types.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/export.h> + +#include <asm/cpufeature.h> +#include <asm/hardirq.h> +#include <asm/apic.h> + +#include "perf_event.h" /* * Intel PerfMon, used on Core and later. */ -static const u64 intel_perfmon_event_map[] = -{ - [PERF_COUNT_HW_CPU_CYCLES] = 0x003c, - [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e, - [PERF_COUNT_HW_CACHE_MISSES] = 0x412e, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, - [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, - [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, +static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly = +{ + [PERF_COUNT_HW_CPU_CYCLES] = 0x003c, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e, + [PERF_COUNT_HW_CACHE_MISSES] = 0x412e, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, + [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, + [PERF_COUNT_HW_REF_CPU_CYCLES] = 0x0300, /* pseudo-encoding */ }; -static struct event_constraint intel_core_event_constraints[] = +static struct event_constraint intel_core_event_constraints[] __read_mostly = { INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ @@ -25,16 +45,11 @@ static struct event_constraint intel_core_event_constraints[] = EVENT_CONSTRAINT_END }; -static struct event_constraint intel_core2_event_constraints[] = +static struct event_constraint intel_core2_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ - /* - * Core2 has Fixed Counter 2 listed as CPU_CLK_UNHALTED.REF and event - * 0x013c as CPU_CLK_UNHALTED.BUS and specifies there is a fixed - * ratio between these counters. - */ - /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ @@ -48,11 +63,11 @@ static struct event_constraint intel_core2_event_constraints[] = EVENT_CONSTRAINT_END }; -static struct event_constraint intel_nehalem_event_constraints[] = +static struct event_constraint intel_nehalem_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ - /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ INTEL_EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */ INTEL_EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */ INTEL_EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */ @@ -64,11 +79,19 @@ static struct event_constraint intel_nehalem_event_constraints[] = EVENT_CONSTRAINT_END }; -static struct event_constraint intel_westmere_event_constraints[] = +static struct extra_reg intel_nehalem_extra_regs[] __read_mostly = +{ + /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0), + INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x100b), + EVENT_EXTRA_END +}; + +static struct event_constraint intel_westmere_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ - /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */ INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */ INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */ @@ -76,11 +99,124 @@ static struct event_constraint intel_westmere_event_constraints[] = EVENT_CONSTRAINT_END }; -static struct event_constraint intel_gen_event_constraints[] = +static struct event_constraint intel_snb_event_constraints[] __read_mostly = +{ + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ + INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_DISPATCH */ + INTEL_UEVENT_CONSTRAINT(0x05a3, 0xf), /* CYCLE_ACTIVITY.STALLS_L2_PENDING */ + INTEL_UEVENT_CONSTRAINT(0x02a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */ + INTEL_UEVENT_CONSTRAINT(0x06a3, 0x4), /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */ + INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */ + INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */ + INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */ + INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_DISPATCH */ + INTEL_UEVENT_CONSTRAINT(0x02a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */ + EVENT_CONSTRAINT_END +}; + +static struct event_constraint intel_ivb_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ - /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ + INTEL_UEVENT_CONSTRAINT(0x0148, 0x4), /* L1D_PEND_MISS.PENDING */ + INTEL_UEVENT_CONSTRAINT(0x0279, 0xf), /* IDQ.EMTPY */ + INTEL_UEVENT_CONSTRAINT(0x019c, 0xf), /* IDQ_UOPS_NOT_DELIVERED.CORE */ + INTEL_UEVENT_CONSTRAINT(0x02a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_LDM_PENDING */ + INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */ + INTEL_UEVENT_CONSTRAINT(0x05a3, 0xf), /* CYCLE_ACTIVITY.STALLS_L2_PENDING */ + INTEL_UEVENT_CONSTRAINT(0x06a3, 0xf), /* CYCLE_ACTIVITY.STALLS_LDM_PENDING */ + INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */ + INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4), /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */ + INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */ + /* + * Errata BV98 -- MEM_*_RETIRED events can leak between counters of SMT + * siblings; disable these events because they can corrupt unrelated + * counters. + */ + INTEL_EVENT_CONSTRAINT(0xd0, 0x0), /* MEM_UOPS_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xd1, 0x0), /* MEM_LOAD_UOPS_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xd2, 0x0), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xd3, 0x0), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */ + EVENT_CONSTRAINT_END +}; + +static struct extra_reg intel_westmere_extra_regs[] __read_mostly = +{ + /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0xffff, RSP_1), + INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x100b), + EVENT_EXTRA_END +}; + +static struct event_constraint intel_v1_event_constraints[] __read_mostly = +{ + EVENT_CONSTRAINT_END +}; + +static struct event_constraint intel_gen_event_constraints[] __read_mostly = +{ + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ + EVENT_CONSTRAINT_END +}; + +static struct event_constraint intel_slm_event_constraints[] __read_mostly = +{ + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* pseudo CPU_CLK_UNHALTED.REF */ + EVENT_CONSTRAINT_END +}; + +static struct extra_reg intel_snb_extra_regs[] __read_mostly = { + /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3f807f8fffull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3f807f8fffull, RSP_1), + INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), + EVENT_EXTRA_END +}; + +static struct extra_reg intel_snbep_extra_regs[] __read_mostly = { + /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1), + INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), + EVENT_EXTRA_END +}; + +EVENT_ATTR_STR(mem-loads, mem_ld_nhm, "event=0x0b,umask=0x10,ldlat=3"); +EVENT_ATTR_STR(mem-loads, mem_ld_snb, "event=0xcd,umask=0x1,ldlat=3"); +EVENT_ATTR_STR(mem-stores, mem_st_snb, "event=0xcd,umask=0x2"); + +struct attribute *nhm_events_attrs[] = { + EVENT_PTR(mem_ld_nhm), + NULL, +}; + +struct attribute *snb_events_attrs[] = { + EVENT_PTR(mem_ld_snb), + EVENT_PTR(mem_st_snb), + NULL, +}; + +static struct event_constraint intel_hsw_event_constraints[] = { + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ + INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.* */ + INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */ + INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */ + /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */ + INTEL_EVENT_CONSTRAINT(0x08a3, 0x4), + /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */ + INTEL_EVENT_CONSTRAINT(0x0ca3, 0x4), + /* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */ + INTEL_EVENT_CONSTRAINT(0x04a3, 0xf), EVENT_CONSTRAINT_END }; @@ -89,6 +225,196 @@ static u64 intel_pmu_event_map(int hw_event) return intel_perfmon_event_map[hw_event]; } +#define SNB_DMND_DATA_RD (1ULL << 0) +#define SNB_DMND_RFO (1ULL << 1) +#define SNB_DMND_IFETCH (1ULL << 2) +#define SNB_DMND_WB (1ULL << 3) +#define SNB_PF_DATA_RD (1ULL << 4) +#define SNB_PF_RFO (1ULL << 5) +#define SNB_PF_IFETCH (1ULL << 6) +#define SNB_LLC_DATA_RD (1ULL << 7) +#define SNB_LLC_RFO (1ULL << 8) +#define SNB_LLC_IFETCH (1ULL << 9) +#define SNB_BUS_LOCKS (1ULL << 10) +#define SNB_STRM_ST (1ULL << 11) +#define SNB_OTHER (1ULL << 15) +#define SNB_RESP_ANY (1ULL << 16) +#define SNB_NO_SUPP (1ULL << 17) +#define SNB_LLC_HITM (1ULL << 18) +#define SNB_LLC_HITE (1ULL << 19) +#define SNB_LLC_HITS (1ULL << 20) +#define SNB_LLC_HITF (1ULL << 21) +#define SNB_LOCAL (1ULL << 22) +#define SNB_REMOTE (0xffULL << 23) +#define SNB_SNP_NONE (1ULL << 31) +#define SNB_SNP_NOT_NEEDED (1ULL << 32) +#define SNB_SNP_MISS (1ULL << 33) +#define SNB_NO_FWD (1ULL << 34) +#define SNB_SNP_FWD (1ULL << 35) +#define SNB_HITM (1ULL << 36) +#define SNB_NON_DRAM (1ULL << 37) + +#define SNB_DMND_READ (SNB_DMND_DATA_RD|SNB_LLC_DATA_RD) +#define SNB_DMND_WRITE (SNB_DMND_RFO|SNB_LLC_RFO) +#define SNB_DMND_PREFETCH (SNB_PF_DATA_RD|SNB_PF_RFO) + +#define SNB_SNP_ANY (SNB_SNP_NONE|SNB_SNP_NOT_NEEDED| \ + SNB_SNP_MISS|SNB_NO_FWD|SNB_SNP_FWD| \ + SNB_HITM) + +#define SNB_DRAM_ANY (SNB_LOCAL|SNB_REMOTE|SNB_SNP_ANY) +#define SNB_DRAM_REMOTE (SNB_REMOTE|SNB_SNP_ANY) + +#define SNB_L3_ACCESS SNB_RESP_ANY +#define SNB_L3_MISS (SNB_DRAM_ANY|SNB_NON_DRAM) + +static __initconst const u64 snb_hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = SNB_DMND_READ|SNB_L3_ACCESS, + [ C(RESULT_MISS) ] = SNB_DMND_READ|SNB_L3_MISS, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = SNB_DMND_WRITE|SNB_L3_ACCESS, + [ C(RESULT_MISS) ] = SNB_DMND_WRITE|SNB_L3_MISS, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = SNB_DMND_PREFETCH|SNB_L3_ACCESS, + [ C(RESULT_MISS) ] = SNB_DMND_PREFETCH|SNB_L3_MISS, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = SNB_DMND_READ|SNB_DRAM_ANY, + [ C(RESULT_MISS) ] = SNB_DMND_READ|SNB_DRAM_REMOTE, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = SNB_DMND_WRITE|SNB_DRAM_ANY, + [ C(RESULT_MISS) ] = SNB_DMND_WRITE|SNB_DRAM_REMOTE, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = SNB_DMND_PREFETCH|SNB_DRAM_ANY, + [ C(RESULT_MISS) ] = SNB_DMND_PREFETCH|SNB_DRAM_REMOTE, + }, + }, +}; + +static __initconst const u64 snb_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0xf1d0, /* MEM_UOP_RETIRED.LOADS */ + [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPLACEMENT */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0xf2d0, /* MEM_UOP_RETIRED.STORES */ + [ C(RESULT_MISS) ] = 0x0851, /* L1D.ALL_M_REPLACEMENT */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x024e, /* HW_PRE_REQ.DL1_MISS */ + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0280, /* ICACHE.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_WRITE) ] = { + /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_PREFETCH) ] = { + /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOP_RETIRED.ALL_LOADS */ + [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.CAUSES_A_WALK */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOP_RETIRED.ALL_STORES */ + [ C(RESULT_MISS) ] = 0x0149, /* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x1085, /* ITLB_MISSES.STLB_HIT */ + [ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISSES.CAUSES_A_WALK */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ + [ C(RESULT_MISS) ] = 0x00c5, /* BR_MISP_RETIRED.ALL_BRANCHES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + }, + +}; + static __initconst const u64 westmere_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] @@ -124,16 +450,26 @@ static __initconst const u64 westmere_hw_cache_event_ids }, [ C(LL ) ] = { [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */ - [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */ + /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, }, + /* + * Use RFO, not WRITEBACK, because a write miss would typically occur + * on RFO. + */ [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */ - [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */ + /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, }, [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */ - [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */ + /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, }, }, [ C(DTLB) ] = { @@ -178,6 +514,88 @@ static __initconst const u64 westmere_hw_cache_event_ids [ C(RESULT_MISS) ] = -1, }, }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + }, +}; + +/* + * Nehalem/Westmere MSR_OFFCORE_RESPONSE bits; + * See IA32 SDM Vol 3B 30.6.1.3 + */ + +#define NHM_DMND_DATA_RD (1 << 0) +#define NHM_DMND_RFO (1 << 1) +#define NHM_DMND_IFETCH (1 << 2) +#define NHM_DMND_WB (1 << 3) +#define NHM_PF_DATA_RD (1 << 4) +#define NHM_PF_DATA_RFO (1 << 5) +#define NHM_PF_IFETCH (1 << 6) +#define NHM_OFFCORE_OTHER (1 << 7) +#define NHM_UNCORE_HIT (1 << 8) +#define NHM_OTHER_CORE_HIT_SNP (1 << 9) +#define NHM_OTHER_CORE_HITM (1 << 10) + /* reserved */ +#define NHM_REMOTE_CACHE_FWD (1 << 12) +#define NHM_REMOTE_DRAM (1 << 13) +#define NHM_LOCAL_DRAM (1 << 14) +#define NHM_NON_DRAM (1 << 15) + +#define NHM_LOCAL (NHM_LOCAL_DRAM|NHM_REMOTE_CACHE_FWD) +#define NHM_REMOTE (NHM_REMOTE_DRAM) + +#define NHM_DMND_READ (NHM_DMND_DATA_RD) +#define NHM_DMND_WRITE (NHM_DMND_RFO|NHM_DMND_WB) +#define NHM_DMND_PREFETCH (NHM_PF_DATA_RD|NHM_PF_DATA_RFO) + +#define NHM_L3_HIT (NHM_UNCORE_HIT|NHM_OTHER_CORE_HIT_SNP|NHM_OTHER_CORE_HITM) +#define NHM_L3_MISS (NHM_NON_DRAM|NHM_LOCAL_DRAM|NHM_REMOTE_DRAM|NHM_REMOTE_CACHE_FWD) +#define NHM_L3_ACCESS (NHM_L3_HIT|NHM_L3_MISS) + +static __initconst const u64 nehalem_hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_L3_ACCESS, + [ C(RESULT_MISS) ] = NHM_DMND_READ|NHM_L3_MISS, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_L3_ACCESS, + [ C(RESULT_MISS) ] = NHM_DMND_WRITE|NHM_L3_MISS, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_L3_ACCESS, + [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_L3_MISS, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_LOCAL|NHM_REMOTE, + [ C(RESULT_MISS) ] = NHM_DMND_READ|NHM_REMOTE, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_LOCAL|NHM_REMOTE, + [ C(RESULT_MISS) ] = NHM_DMND_WRITE|NHM_REMOTE, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_LOCAL|NHM_REMOTE, + [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_REMOTE, + }, + }, }; static __initconst const u64 nehalem_hw_cache_event_ids @@ -187,12 +605,12 @@ static __initconst const u64 nehalem_hw_cache_event_ids { [ C(L1D) ] = { [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ - [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ + [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */ + [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPL */ }, [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ - [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ + [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */ + [ C(RESULT_MISS) ] = 0x0251, /* L1D.M_REPL */ }, [ C(OP_PREFETCH) ] = { [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */ @@ -215,16 +633,26 @@ static __initconst const u64 nehalem_hw_cache_event_ids }, [ C(LL ) ] = { [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */ - [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */ + /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, }, + /* + * Use RFO, not WRITEBACK, because a write miss would typically occur + * on RFO. + */ [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */ - [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */ + /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, }, [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */ - [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */ + /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, }, }, [ C(DTLB) ] = { @@ -269,6 +697,20 @@ static __initconst const u64 nehalem_hw_cache_event_ids [ C(RESULT_MISS) ] = -1, }, }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + }, }; static __initconst const u64 core2_hw_cache_event_ids @@ -453,13 +895,161 @@ static __initconst const u64 atom_hw_cache_event_ids }, }; +static struct extra_reg intel_slm_extra_regs[] __read_mostly = +{ + /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x768005ffffull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x768005ffffull, RSP_1), + EVENT_EXTRA_END +}; + +#define SLM_DMND_READ SNB_DMND_DATA_RD +#define SLM_DMND_WRITE SNB_DMND_RFO +#define SLM_DMND_PREFETCH (SNB_PF_DATA_RD|SNB_PF_RFO) + +#define SLM_SNP_ANY (SNB_SNP_NONE|SNB_SNP_MISS|SNB_NO_FWD|SNB_HITM) +#define SLM_LLC_ACCESS SNB_RESP_ANY +#define SLM_LLC_MISS (SLM_SNP_ANY|SNB_NON_DRAM) + +static __initconst const u64 slm_hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = SLM_DMND_READ|SLM_LLC_ACCESS, + [ C(RESULT_MISS) ] = SLM_DMND_READ|SLM_LLC_MISS, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = SLM_DMND_WRITE|SLM_LLC_ACCESS, + [ C(RESULT_MISS) ] = SLM_DMND_WRITE|SLM_LLC_MISS, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = SLM_DMND_PREFETCH|SLM_LLC_ACCESS, + [ C(RESULT_MISS) ] = SLM_DMND_PREFETCH|SLM_LLC_MISS, + }, + }, +}; + +static __initconst const u64 slm_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0x0104, /* LD_DCU_MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0380, /* ICACHE.ACCESSES */ + [ C(RESULT_MISS) ] = 0x0280, /* ICACGE.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_WRITE) ] = { + /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_PREFETCH) ] = { + /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0x0804, /* LD_DTLB_MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ + [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event) +{ + /* user explicitly requested branch sampling */ + if (has_branch_stack(event)) + return true; + + /* implicit branch sampling to correct PEBS skid */ + if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1 && + x86_pmu.intel_cap.pebs_format < 2) + return true; + + return false; +} + static void intel_pmu_disable_all(void) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); - if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) + if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) intel_pmu_disable_bts(); intel_pmu_pebs_disable_all(); @@ -472,11 +1062,12 @@ static void intel_pmu_enable_all(int added) intel_pmu_pebs_enable_all(); intel_pmu_lbr_enable_all(); - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, + x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask); - if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { + if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { struct perf_event *event = - cpuc->events[X86_PMC_IDX_FIXED_BTS]; + cpuc->events[INTEL_PMC_IDX_FIXED_BTS]; if (WARN_ON_ONCE(!event)) return; @@ -582,7 +1173,7 @@ static inline void intel_pmu_ack_status(u64 ack) static void intel_pmu_disable_fixed(struct hw_perf_event *hwc) { - int idx = hwc->idx - X86_PMC_IDX_FIXED; + int idx = hwc->idx - INTEL_PMC_IDX_FIXED; u64 ctrl_val, mask; mask = 0xfULL << (idx * 4); @@ -592,16 +1183,33 @@ static void intel_pmu_disable_fixed(struct hw_perf_event *hwc) wrmsrl(hwc->config_base, ctrl_val); } +static inline bool event_is_checkpointed(struct perf_event *event) +{ + return (event->hw.config & HSW_IN_TX_CHECKPOINTED) != 0; +} + static void intel_pmu_disable_event(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) { + if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) { intel_pmu_disable_bts(); intel_pmu_drain_bts_buffer(); return; } + cpuc->intel_ctrl_guest_mask &= ~(1ull << hwc->idx); + cpuc->intel_ctrl_host_mask &= ~(1ull << hwc->idx); + cpuc->intel_cp_status &= ~(1ull << hwc->idx); + + /* + * must disable before any actual event + * because any event may be combined with LBR + */ + if (intel_pmu_needs_lbr_smpl(event)) + intel_pmu_lbr_disable(event); + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { intel_pmu_disable_fixed(hwc); return; @@ -615,7 +1223,7 @@ static void intel_pmu_disable_event(struct perf_event *event) static void intel_pmu_enable_fixed(struct hw_perf_event *hwc) { - int idx = hwc->idx - X86_PMC_IDX_FIXED; + int idx = hwc->idx - INTEL_PMC_IDX_FIXED; u64 ctrl_val, bits, mask; /* @@ -647,14 +1255,29 @@ static void intel_pmu_enable_fixed(struct hw_perf_event *hwc) static void intel_pmu_enable_event(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) { - if (!__get_cpu_var(cpu_hw_events).enabled) + if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) { + if (!__this_cpu_read(cpu_hw_events.enabled)) return; intel_pmu_enable_bts(hwc->config); return; } + /* + * must enabled before any actual event + * because any event may be combined with LBR + */ + if (intel_pmu_needs_lbr_smpl(event)) + intel_pmu_lbr_enable(event); + + if (event->attr.exclude_host) + cpuc->intel_ctrl_guest_mask |= (1ull << hwc->idx); + if (event->attr.exclude_guest) + cpuc->intel_ctrl_host_mask |= (1ull << hwc->idx); + + if (unlikely(event_is_checkpointed(event))) + cpuc->intel_cp_status |= (1ull << hwc->idx); if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { intel_pmu_enable_fixed(hwc); @@ -671,15 +1294,26 @@ static void intel_pmu_enable_event(struct perf_event *event) * Save and restart an expired event. Called by NMI contexts, * so it has to be careful about preempting normal event ops: */ -static int intel_pmu_save_and_restart(struct perf_event *event) +int intel_pmu_save_and_restart(struct perf_event *event) { x86_perf_event_update(event); + /* + * For a checkpointed counter always reset back to 0. This + * avoids a situation where the counter overflows, aborts the + * transaction and is then set back to shortly before the + * overflow, and overflows and aborts again. + */ + if (unlikely(event_is_checkpointed(event))) { + /* No race with NMIs because the counter should not be armed */ + wrmsrl(event->hw.event_base, 0); + local64_set(&event->hw.prev_count, 0); + } return x86_perf_event_set_period(event); } static void intel_pmu_reset(void) { - struct debug_store *ds = __get_cpu_var(cpu_hw_events).ds; + struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds); unsigned long flags; int idx; @@ -688,14 +1322,14 @@ static void intel_pmu_reset(void) local_irq_save(flags); - printk("clearing PMU state on CPU#%d\n", smp_processor_id()); + pr_info("clearing PMU state on CPU#%d\n", smp_processor_id()); for (idx = 0; idx < x86_pmu.num_counters; idx++) { - checking_wrmsrl(x86_pmu.eventsel + idx, 0ull); - checking_wrmsrl(x86_pmu.perfctr + idx, 0ull); + wrmsrl_safe(x86_pmu_config_addr(idx), 0ull); + wrmsrl_safe(x86_pmu_event_addr(idx), 0ull); } for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) - checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); + wrmsrl_safe(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); if (ds) ds->bts_index = ds->bts_buffer_base; @@ -715,24 +1349,30 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) u64 status; int handled; - perf_sample_data_init(&data, 0); - cpuc = &__get_cpu_var(cpu_hw_events); + /* + * No known reason to not always do late ACK, + * but just in case do it opt-in. + */ + if (!x86_pmu.late_ack) + apic_write(APIC_LVTPC, APIC_DM_NMI); intel_pmu_disable_all(); handled = intel_pmu_drain_bts_buffer(); status = intel_pmu_get_status(); - if (!status) { - intel_pmu_enable_all(0); - return handled; - } + if (!status) + goto done; loops = 0; again: intel_pmu_ack_status(status); if (++loops > 100) { - WARN_ONCE(1, "perfevents: irq loop stuck!\n"); - perf_event_print_debug(); + static bool warned = false; + if (!warned) { + WARN(1, "perfevents: irq loop stuck!\n"); + perf_event_print_debug(); + warned = true; + } intel_pmu_reset(); goto done; } @@ -742,6 +1382,15 @@ again: intel_pmu_lbr_read(); /* + * CondChgd bit 63 doesn't mean any overflow status. Ignore + * and clear the bit. + */ + if (__test_and_clear_bit(63, (unsigned long *)&status)) { + if (!status) + goto done; + } + + /* * PEBS overflow sets bit 62 in the global status register */ if (__test_and_clear_bit(62, (unsigned long *)&status)) { @@ -749,6 +1398,13 @@ again: x86_pmu.drain_pebs(regs); } + /* + * Checkpointed counters can lead to 'spurious' PMIs because the + * rollback caused by the PMI will have cleared the overflow status + * bit. Therefore always force probe these counters. + */ + status |= cpuc->intel_cp_status; + for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { struct perf_event *event = cpuc->events[bit]; @@ -760,9 +1416,12 @@ again: if (!intel_pmu_save_and_restart(event)) continue; - data.period = event->hw.last_period; + perf_sample_data_init(&data, 0, event->hw.last_period); - if (perf_event_overflow(event, 1, &data, regs)) + if (has_branch_stack(event)) + data.br_stack = &cpuc->lbr_stack; + + if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); } @@ -775,6 +1434,13 @@ again: done: intel_pmu_enable_all(0); + /* + * Only unmask the NMI after the overflow counters + * have been reset. This avoids spurious NMIs on + * Haswell CPUs. + */ + if (x86_pmu.late_ack) + apic_write(APIC_LVTPC, APIC_DM_NMI); return handled; } @@ -784,6 +1450,9 @@ intel_bts_constraints(struct perf_event *event) struct hw_perf_event *hwc = &event->hw; unsigned int hw_event, bts_event; + if (event->attr.freq) + return NULL; + hw_event = hwc->config & INTEL_ARCH_EVENT_MASK; bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS); @@ -793,6 +1462,184 @@ intel_bts_constraints(struct perf_event *event) return NULL; } +static int intel_alt_er(int idx) +{ + if (!(x86_pmu.er_flags & ERF_HAS_RSP_1)) + return idx; + + if (idx == EXTRA_REG_RSP_0) + return EXTRA_REG_RSP_1; + + if (idx == EXTRA_REG_RSP_1) + return EXTRA_REG_RSP_0; + + return idx; +} + +static void intel_fixup_er(struct perf_event *event, int idx) +{ + event->hw.extra_reg.idx = idx; + + if (idx == EXTRA_REG_RSP_0) { + event->hw.config &= ~INTEL_ARCH_EVENT_MASK; + event->hw.config |= x86_pmu.extra_regs[EXTRA_REG_RSP_0].event; + event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0; + } else if (idx == EXTRA_REG_RSP_1) { + event->hw.config &= ~INTEL_ARCH_EVENT_MASK; + event->hw.config |= x86_pmu.extra_regs[EXTRA_REG_RSP_1].event; + event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1; + } +} + +/* + * manage allocation of shared extra msr for certain events + * + * sharing can be: + * per-cpu: to be shared between the various events on a single PMU + * per-core: per-cpu + shared by HT threads + */ +static struct event_constraint * +__intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc, + struct perf_event *event, + struct hw_perf_event_extra *reg) +{ + struct event_constraint *c = &emptyconstraint; + struct er_account *era; + unsigned long flags; + int idx = reg->idx; + + /* + * reg->alloc can be set due to existing state, so for fake cpuc we + * need to ignore this, otherwise we might fail to allocate proper fake + * state for this extra reg constraint. Also see the comment below. + */ + if (reg->alloc && !cpuc->is_fake) + return NULL; /* call x86_get_event_constraint() */ + +again: + era = &cpuc->shared_regs->regs[idx]; + /* + * we use spin_lock_irqsave() to avoid lockdep issues when + * passing a fake cpuc + */ + raw_spin_lock_irqsave(&era->lock, flags); + + if (!atomic_read(&era->ref) || era->config == reg->config) { + + /* + * If its a fake cpuc -- as per validate_{group,event}() we + * shouldn't touch event state and we can avoid doing so + * since both will only call get_event_constraints() once + * on each event, this avoids the need for reg->alloc. + * + * Not doing the ER fixup will only result in era->reg being + * wrong, but since we won't actually try and program hardware + * this isn't a problem either. + */ + if (!cpuc->is_fake) { + if (idx != reg->idx) + intel_fixup_er(event, idx); + + /* + * x86_schedule_events() can call get_event_constraints() + * multiple times on events in the case of incremental + * scheduling(). reg->alloc ensures we only do the ER + * allocation once. + */ + reg->alloc = 1; + } + + /* lock in msr value */ + era->config = reg->config; + era->reg = reg->reg; + + /* one more user */ + atomic_inc(&era->ref); + + /* + * need to call x86_get_event_constraint() + * to check if associated event has constraints + */ + c = NULL; + } else { + idx = intel_alt_er(idx); + if (idx != reg->idx) { + raw_spin_unlock_irqrestore(&era->lock, flags); + goto again; + } + } + raw_spin_unlock_irqrestore(&era->lock, flags); + + return c; +} + +static void +__intel_shared_reg_put_constraints(struct cpu_hw_events *cpuc, + struct hw_perf_event_extra *reg) +{ + struct er_account *era; + + /* + * Only put constraint if extra reg was actually allocated. Also takes + * care of event which do not use an extra shared reg. + * + * Also, if this is a fake cpuc we shouldn't touch any event state + * (reg->alloc) and we don't care about leaving inconsistent cpuc state + * either since it'll be thrown out. + */ + if (!reg->alloc || cpuc->is_fake) + return; + + era = &cpuc->shared_regs->regs[reg->idx]; + + /* one fewer user */ + atomic_dec(&era->ref); + + /* allocate again next time */ + reg->alloc = 0; +} + +static struct event_constraint * +intel_shared_regs_constraints(struct cpu_hw_events *cpuc, + struct perf_event *event) +{ + struct event_constraint *c = NULL, *d; + struct hw_perf_event_extra *xreg, *breg; + + xreg = &event->hw.extra_reg; + if (xreg->idx != EXTRA_REG_NONE) { + c = __intel_shared_reg_get_constraints(cpuc, event, xreg); + if (c == &emptyconstraint) + return c; + } + breg = &event->hw.branch_reg; + if (breg->idx != EXTRA_REG_NONE) { + d = __intel_shared_reg_get_constraints(cpuc, event, breg); + if (d == &emptyconstraint) { + __intel_shared_reg_put_constraints(cpuc, xreg); + c = d; + } + } + return c; +} + +struct event_constraint * +x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) +{ + struct event_constraint *c; + + if (x86_pmu.event_constraints) { + for_each_event_constraint(c, x86_pmu.event_constraints) { + if ((event->hw.config & c->cmask) == c->code) { + event->hw.flags |= c->flags; + return c; + } + } + } + + return &unconstrained; +} + static struct event_constraint * intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) { @@ -806,9 +1653,90 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event if (c) return c; + c = intel_shared_regs_constraints(cpuc, event); + if (c) + return c; + return x86_get_event_constraints(cpuc, event); } +static void +intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc, + struct perf_event *event) +{ + struct hw_perf_event_extra *reg; + + reg = &event->hw.extra_reg; + if (reg->idx != EXTRA_REG_NONE) + __intel_shared_reg_put_constraints(cpuc, reg); + + reg = &event->hw.branch_reg; + if (reg->idx != EXTRA_REG_NONE) + __intel_shared_reg_put_constraints(cpuc, reg); +} + +static void intel_put_event_constraints(struct cpu_hw_events *cpuc, + struct perf_event *event) +{ + intel_put_shared_regs_event_constraints(cpuc, event); +} + +static void intel_pebs_aliases_core2(struct perf_event *event) +{ + if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) { + /* + * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P + * (0x003c) so that we can use it with PEBS. + * + * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't + * PEBS capable. However we can use INST_RETIRED.ANY_P + * (0x00c0), which is a PEBS capable event, to get the same + * count. + * + * INST_RETIRED.ANY_P counts the number of cycles that retires + * CNTMASK instructions. By setting CNTMASK to a value (16) + * larger than the maximum number of instructions that can be + * retired per cycle (4) and then inverting the condition, we + * count all cycles that retire 16 or less instructions, which + * is every cycle. + * + * Thereby we gain a PEBS capable cycle counter. + */ + u64 alt_config = X86_CONFIG(.event=0xc0, .inv=1, .cmask=16); + + alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK); + event->hw.config = alt_config; + } +} + +static void intel_pebs_aliases_snb(struct perf_event *event) +{ + if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) { + /* + * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P + * (0x003c) so that we can use it with PEBS. + * + * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't + * PEBS capable. However we can use UOPS_RETIRED.ALL + * (0x01c2), which is a PEBS capable event, to get the same + * count. + * + * UOPS_RETIRED.ALL counts the number of cycles that retires + * CNTMASK micro-ops. By setting CNTMASK to a value (16) + * larger than the maximum number of micro-ops that can be + * retired per cycle (4) and then inverting the condition, we + * count all cycles that retire 16 or less micro-ops, which + * is every cycle. + * + * Thereby we gain a PEBS capable cycle counter. + */ + u64 alt_config = X86_CONFIG(.event=0xc2, .umask=0x01, .inv=1, .cmask=16); + + alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK); + event->hw.config = alt_config; + } +} + static int intel_pmu_hw_config(struct perf_event *event) { int ret = x86_pmu_hw_config(event); @@ -816,6 +1744,15 @@ static int intel_pmu_hw_config(struct perf_event *event) if (ret) return ret; + if (event->attr.precise_ip && x86_pmu.pebs_aliases) + x86_pmu.pebs_aliases(event); + + if (intel_pmu_needs_lbr_smpl(event)) { + ret = intel_pmu_setup_lbr_filter(event); + if (ret) + return ret; + } + if (event->attr.type != PERF_TYPE_RAW) return 0; @@ -833,12 +1770,174 @@ static int intel_pmu_hw_config(struct perf_event *event) return 0; } +struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr) +{ + if (x86_pmu.guest_get_msrs) + return x86_pmu.guest_get_msrs(nr); + *nr = 0; + return NULL; +} +EXPORT_SYMBOL_GPL(perf_guest_get_msrs); + +static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs; + + arr[0].msr = MSR_CORE_PERF_GLOBAL_CTRL; + arr[0].host = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask; + arr[0].guest = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_host_mask; + /* + * If PMU counter has PEBS enabled it is not enough to disable counter + * on a guest entry since PEBS memory write can overshoot guest entry + * and corrupt guest memory. Disabling PEBS solves the problem. + */ + arr[1].msr = MSR_IA32_PEBS_ENABLE; + arr[1].host = cpuc->pebs_enabled; + arr[1].guest = 0; + + *nr = 2; + return arr; +} + +static struct perf_guest_switch_msr *core_guest_get_msrs(int *nr) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs; + int idx; + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + struct perf_event *event = cpuc->events[idx]; + + arr[idx].msr = x86_pmu_config_addr(idx); + arr[idx].host = arr[idx].guest = 0; + + if (!test_bit(idx, cpuc->active_mask)) + continue; + + arr[idx].host = arr[idx].guest = + event->hw.config | ARCH_PERFMON_EVENTSEL_ENABLE; + + if (event->attr.exclude_host) + arr[idx].host &= ~ARCH_PERFMON_EVENTSEL_ENABLE; + else if (event->attr.exclude_guest) + arr[idx].guest &= ~ARCH_PERFMON_EVENTSEL_ENABLE; + } + + *nr = x86_pmu.num_counters; + return arr; +} + +static void core_pmu_enable_event(struct perf_event *event) +{ + if (!event->attr.exclude_host) + x86_pmu_enable_event(event); +} + +static void core_pmu_enable_all(int added) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + int idx; + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + struct hw_perf_event *hwc = &cpuc->events[idx]->hw; + + if (!test_bit(idx, cpuc->active_mask) || + cpuc->events[idx]->attr.exclude_host) + continue; + + __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); + } +} + +static int hsw_hw_config(struct perf_event *event) +{ + int ret = intel_pmu_hw_config(event); + + if (ret) + return ret; + if (!boot_cpu_has(X86_FEATURE_RTM) && !boot_cpu_has(X86_FEATURE_HLE)) + return 0; + event->hw.config |= event->attr.config & (HSW_IN_TX|HSW_IN_TX_CHECKPOINTED); + + /* + * IN_TX/IN_TX-CP filters are not supported by the Haswell PMU with + * PEBS or in ANY thread mode. Since the results are non-sensical forbid + * this combination. + */ + if ((event->hw.config & (HSW_IN_TX|HSW_IN_TX_CHECKPOINTED)) && + ((event->hw.config & ARCH_PERFMON_EVENTSEL_ANY) || + event->attr.precise_ip > 0)) + return -EOPNOTSUPP; + + if (event_is_checkpointed(event)) { + /* + * Sampling of checkpointed events can cause situations where + * the CPU constantly aborts because of a overflow, which is + * then checkpointed back and ignored. Forbid checkpointing + * for sampling. + * + * But still allow a long sampling period, so that perf stat + * from KVM works. + */ + if (event->attr.sample_period > 0 && + event->attr.sample_period < 0x7fffffff) + return -EOPNOTSUPP; + } + return 0; +} + +static struct event_constraint counter2_constraint = + EVENT_CONSTRAINT(0, 0x4, 0); + +static struct event_constraint * +hsw_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) +{ + struct event_constraint *c = intel_get_event_constraints(cpuc, event); + + /* Handle special quirk on in_tx_checkpointed only in counter 2 */ + if (event->hw.config & HSW_IN_TX_CHECKPOINTED) { + if (c->idxmsk64 & (1U << 2)) + return &counter2_constraint; + return &emptyconstraint; + } + + return c; +} + +PMU_FORMAT_ATTR(event, "config:0-7" ); +PMU_FORMAT_ATTR(umask, "config:8-15" ); +PMU_FORMAT_ATTR(edge, "config:18" ); +PMU_FORMAT_ATTR(pc, "config:19" ); +PMU_FORMAT_ATTR(any, "config:21" ); /* v3 + */ +PMU_FORMAT_ATTR(inv, "config:23" ); +PMU_FORMAT_ATTR(cmask, "config:24-31" ); +PMU_FORMAT_ATTR(in_tx, "config:32"); +PMU_FORMAT_ATTR(in_tx_cp, "config:33"); + +static struct attribute *intel_arch_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_pc.attr, + &format_attr_inv.attr, + &format_attr_cmask.attr, + NULL, +}; + +ssize_t intel_event_sysfs_show(char *page, u64 config) +{ + u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT); + + return x86_event_sysfs_show(page, config, event); +} + static __initconst const struct x86_pmu core_pmu = { .name = "core", .handle_irq = x86_pmu_handle_irq, .disable_all = x86_pmu_disable_all, - .enable_all = x86_pmu_enable_all, - .enable = x86_pmu_enable_event, + .enable_all = core_pmu_enable_all, + .enable = core_pmu_enable_event, .disable = x86_pmu_disable_event, .hw_config = x86_pmu_hw_config, .schedule_events = x86_schedule_events, @@ -854,23 +1953,129 @@ static __initconst const struct x86_pmu core_pmu = { */ .max_period = (1ULL << 31) - 1, .get_event_constraints = intel_get_event_constraints, + .put_event_constraints = intel_put_event_constraints, .event_constraints = intel_core_event_constraints, + .guest_get_msrs = core_guest_get_msrs, + .format_attrs = intel_arch_formats_attr, + .events_sysfs_show = intel_event_sysfs_show, }; +struct intel_shared_regs *allocate_shared_regs(int cpu) +{ + struct intel_shared_regs *regs; + int i; + + regs = kzalloc_node(sizeof(struct intel_shared_regs), + GFP_KERNEL, cpu_to_node(cpu)); + if (regs) { + /* + * initialize the locks to keep lockdep happy + */ + for (i = 0; i < EXTRA_REG_MAX; i++) + raw_spin_lock_init(®s->regs[i].lock); + + regs->core_id = -1; + } + return regs; +} + +static int intel_pmu_cpu_prepare(int cpu) +{ + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + + if (!(x86_pmu.extra_regs || x86_pmu.lbr_sel_map)) + return NOTIFY_OK; + + cpuc->shared_regs = allocate_shared_regs(cpu); + if (!cpuc->shared_regs) + return NOTIFY_BAD; + + return NOTIFY_OK; +} + static void intel_pmu_cpu_starting(int cpu) { + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + int core_id = topology_core_id(cpu); + int i; + init_debug_store_on_cpu(cpu); /* * Deal with CPUs that don't clear their LBRs on power-up. */ intel_pmu_lbr_reset(); + + cpuc->lbr_sel = NULL; + + if (!cpuc->shared_regs) + return; + + if (!(x86_pmu.er_flags & ERF_NO_HT_SHARING)) { + for_each_cpu(i, topology_thread_cpumask(cpu)) { + struct intel_shared_regs *pc; + + pc = per_cpu(cpu_hw_events, i).shared_regs; + if (pc && pc->core_id == core_id) { + cpuc->kfree_on_online = cpuc->shared_regs; + cpuc->shared_regs = pc; + break; + } + } + cpuc->shared_regs->core_id = core_id; + cpuc->shared_regs->refcnt++; + } + + if (x86_pmu.lbr_sel_map) + cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR]; } static void intel_pmu_cpu_dying(int cpu) { + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + struct intel_shared_regs *pc; + + pc = cpuc->shared_regs; + if (pc) { + if (pc->core_id == -1 || --pc->refcnt == 0) + kfree(pc); + cpuc->shared_regs = NULL; + } + fini_debug_store_on_cpu(cpu); } +static void intel_pmu_flush_branch_stack(void) +{ + /* + * Intel LBR does not tag entries with the + * PID of the current task, then we need to + * flush it on ctxsw + * For now, we simply reset it + */ + if (x86_pmu.lbr_nr) + intel_pmu_lbr_reset(); +} + +PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63"); + +PMU_FORMAT_ATTR(ldlat, "config1:0-15"); + +static struct attribute *intel_arch3_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_pc.attr, + &format_attr_any.attr, + &format_attr_inv.attr, + &format_attr_cmask.attr, + &format_attr_in_tx.attr, + &format_attr_in_tx_cp.attr, + + &format_attr_offcore_rsp.attr, /* XXX do NHM/WSM + SNB breakout */ + &format_attr_ldlat.attr, /* PEBS load latency */ + NULL, +}; + static __initconst const struct x86_pmu intel_pmu = { .name = "Intel", .handle_irq = intel_pmu_handle_irq, @@ -892,12 +2097,20 @@ static __initconst const struct x86_pmu intel_pmu = { */ .max_period = (1ULL << 31) - 1, .get_event_constraints = intel_get_event_constraints, + .put_event_constraints = intel_put_event_constraints, + .pebs_aliases = intel_pebs_aliases_core2, + + .format_attrs = intel_arch3_formats_attr, + .events_sysfs_show = intel_event_sysfs_show, + .cpu_prepare = intel_pmu_cpu_prepare, .cpu_starting = intel_pmu_cpu_starting, .cpu_dying = intel_pmu_cpu_dying, + .guest_get_msrs = intel_guest_get_msrs, + .flush_branch_stack = intel_pmu_flush_branch_stack, }; -static void intel_clovertown_quirks(void) +static __init void intel_clovertown_quirk(void) { /* * PEBS is unreliable due to: @@ -913,28 +2126,195 @@ static void intel_clovertown_quirks(void) * AJ106 could possibly be worked around by not allowing LBR * usage from PEBS, including the fixup. * AJ68 could possibly be worked around by always programming - * a pebs_event_reset[0] value and coping with the lost events. + * a pebs_event_reset[0] value and coping with the lost events. * * But taken together it might just make sense to not enable PEBS on * these chips. */ - printk(KERN_WARNING "PEBS disabled due to CPU errata.\n"); + pr_warn("PEBS disabled due to CPU errata\n"); x86_pmu.pebs = 0; x86_pmu.pebs_constraints = NULL; } -static __init int intel_pmu_init(void) +static int intel_snb_pebs_broken(int cpu) +{ + u32 rev = UINT_MAX; /* default to broken for unknown models */ + + switch (cpu_data(cpu).x86_model) { + case 42: /* SNB */ + rev = 0x28; + break; + + case 45: /* SNB-EP */ + switch (cpu_data(cpu).x86_mask) { + case 6: rev = 0x618; break; + case 7: rev = 0x70c; break; + } + } + + return (cpu_data(cpu).microcode < rev); +} + +static void intel_snb_check_microcode(void) +{ + int pebs_broken = 0; + int cpu; + + get_online_cpus(); + for_each_online_cpu(cpu) { + if ((pebs_broken = intel_snb_pebs_broken(cpu))) + break; + } + put_online_cpus(); + + if (pebs_broken == x86_pmu.pebs_broken) + return; + + /* + * Serialized by the microcode lock.. + */ + if (x86_pmu.pebs_broken) { + pr_info("PEBS enabled due to microcode update\n"); + x86_pmu.pebs_broken = 0; + } else { + pr_info("PEBS disabled due to CPU errata, please upgrade microcode\n"); + x86_pmu.pebs_broken = 1; + } +} + +/* + * Under certain circumstances, access certain MSR may cause #GP. + * The function tests if the input MSR can be safely accessed. + */ +static bool check_msr(unsigned long msr, u64 mask) +{ + u64 val_old, val_new, val_tmp; + + /* + * Read the current value, change it and read it back to see if it + * matches, this is needed to detect certain hardware emulators + * (qemu/kvm) that don't trap on the MSR access and always return 0s. + */ + if (rdmsrl_safe(msr, &val_old)) + return false; + + /* + * Only change the bits which can be updated by wrmsrl. + */ + val_tmp = val_old ^ mask; + if (wrmsrl_safe(msr, val_tmp) || + rdmsrl_safe(msr, &val_new)) + return false; + + if (val_new != val_tmp) + return false; + + /* Here it's sure that the MSR can be safely accessed. + * Restore the old value and return. + */ + wrmsrl(msr, val_old); + + return true; +} + +static __init void intel_sandybridge_quirk(void) +{ + x86_pmu.check_microcode = intel_snb_check_microcode; + intel_snb_check_microcode(); +} + +static const struct { int id; char *name; } intel_arch_events_map[] __initconst = { + { PERF_COUNT_HW_CPU_CYCLES, "cpu cycles" }, + { PERF_COUNT_HW_INSTRUCTIONS, "instructions" }, + { PERF_COUNT_HW_BUS_CYCLES, "bus cycles" }, + { PERF_COUNT_HW_CACHE_REFERENCES, "cache references" }, + { PERF_COUNT_HW_CACHE_MISSES, "cache misses" }, + { PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "branch instructions" }, + { PERF_COUNT_HW_BRANCH_MISSES, "branch misses" }, +}; + +static __init void intel_arch_events_quirk(void) +{ + int bit; + + /* disable event that reported as not presend by cpuid */ + for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(intel_arch_events_map)) { + intel_perfmon_event_map[intel_arch_events_map[bit].id] = 0; + pr_warn("CPUID marked event: \'%s\' unavailable\n", + intel_arch_events_map[bit].name); + } +} + +static __init void intel_nehalem_quirk(void) +{ + union cpuid10_ebx ebx; + + ebx.full = x86_pmu.events_maskl; + if (ebx.split.no_branch_misses_retired) { + /* + * Erratum AAJ80 detected, we work it around by using + * the BR_MISP_EXEC.ANY event. This will over-count + * branch-misses, but it's still much better than the + * architectural event which is often completely bogus: + */ + intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89; + ebx.split.no_branch_misses_retired = 0; + x86_pmu.events_maskl = ebx.full; + pr_info("CPU erratum AAJ80 worked around\n"); + } +} + +EVENT_ATTR_STR(mem-loads, mem_ld_hsw, "event=0xcd,umask=0x1,ldlat=3"); +EVENT_ATTR_STR(mem-stores, mem_st_hsw, "event=0xd0,umask=0x82") + +/* Haswell special events */ +EVENT_ATTR_STR(tx-start, tx_start, "event=0xc9,umask=0x1"); +EVENT_ATTR_STR(tx-commit, tx_commit, "event=0xc9,umask=0x2"); +EVENT_ATTR_STR(tx-abort, tx_abort, "event=0xc9,umask=0x4"); +EVENT_ATTR_STR(tx-capacity, tx_capacity, "event=0x54,umask=0x2"); +EVENT_ATTR_STR(tx-conflict, tx_conflict, "event=0x54,umask=0x1"); +EVENT_ATTR_STR(el-start, el_start, "event=0xc8,umask=0x1"); +EVENT_ATTR_STR(el-commit, el_commit, "event=0xc8,umask=0x2"); +EVENT_ATTR_STR(el-abort, el_abort, "event=0xc8,umask=0x4"); +EVENT_ATTR_STR(el-capacity, el_capacity, "event=0x54,umask=0x2"); +EVENT_ATTR_STR(el-conflict, el_conflict, "event=0x54,umask=0x1"); +EVENT_ATTR_STR(cycles-t, cycles_t, "event=0x3c,in_tx=1"); +EVENT_ATTR_STR(cycles-ct, cycles_ct, "event=0x3c,in_tx=1,in_tx_cp=1"); + +static struct attribute *hsw_events_attrs[] = { + EVENT_PTR(tx_start), + EVENT_PTR(tx_commit), + EVENT_PTR(tx_abort), + EVENT_PTR(tx_capacity), + EVENT_PTR(tx_conflict), + EVENT_PTR(el_start), + EVENT_PTR(el_commit), + EVENT_PTR(el_abort), + EVENT_PTR(el_capacity), + EVENT_PTR(el_conflict), + EVENT_PTR(cycles_t), + EVENT_PTR(cycles_ct), + EVENT_PTR(mem_ld_hsw), + EVENT_PTR(mem_st_hsw), + NULL +}; + +__init int intel_pmu_init(void) { union cpuid10_edx edx; union cpuid10_eax eax; + union cpuid10_ebx ebx; + struct event_constraint *c; unsigned int unused; - unsigned int ebx; - int version; + struct extra_reg *er; + int version, i; if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { switch (boot_cpu_data.x86) { case 0x6: return p6_pmu_init(); + case 0xb: + return knc_pmu_init(); case 0xf: return p4_pmu_init(); } @@ -945,8 +2325,8 @@ static __init int intel_pmu_init(void) * Check whether the Architectural PerfMon supports * Branch Misses Retired hw_event or not. */ - cpuid(10, &eax.full, &ebx, &unused, &edx.full); - if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) + cpuid(10, &eax.full, &ebx.full, &unused, &edx.full); + if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT) return -ENODEV; version = eax.split.version_id; @@ -960,6 +2340,11 @@ static __init int intel_pmu_init(void) x86_pmu.cntval_bits = eax.split.bit_width; x86_pmu.cntval_mask = (1ULL << eax.split.bit_width) - 1; + x86_pmu.events_maskl = ebx.full; + x86_pmu.events_mask_len = eax.split.mask_length; + + x86_pmu.max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, x86_pmu.num_counters); + /* * Quirk: v2 perfmon does not report fixed-purpose events, so * assume at least 3 events: @@ -967,10 +2352,7 @@ static __init int intel_pmu_init(void) if (version > 1) x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3); - /* - * v2 and above have a perf capabilities MSR - */ - if (version > 1) { + if (boot_cpu_has(X86_FEATURE_PDCM)) { u64 capabilities; rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities); @@ -979,6 +2361,8 @@ static __init int intel_pmu_init(void) intel_ds_init(); + x86_add_quirk(intel_arch_events_quirk); /* Install first, so it runs last */ + /* * Install the hw-cache-events table: */ @@ -988,7 +2372,7 @@ static __init int intel_pmu_init(void) break; case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ - x86_pmu.quirks = intel_clovertown_quirks; + x86_add_quirk(intel_clovertown_quirk); case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ case 29: /* six-core 45 nm xeon "Dunnington" */ @@ -998,6 +2382,7 @@ static __init int intel_pmu_init(void) intel_pmu_lbr_init_core(); x86_pmu.event_constraints = intel_core2_event_constraints; + x86_pmu.pebs_constraints = intel_core2_pebs_event_constraints; pr_cont("Core2 events, "); break; @@ -1006,51 +2391,262 @@ static __init int intel_pmu_init(void) case 46: /* 45 nm nehalem-ex, "Beckton" */ memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs, + sizeof(hw_cache_extra_regs)); intel_pmu_lbr_init_nhm(); x86_pmu.event_constraints = intel_nehalem_event_constraints; + x86_pmu.pebs_constraints = intel_nehalem_pebs_event_constraints; x86_pmu.enable_all = intel_pmu_nhm_enable_all; + x86_pmu.extra_regs = intel_nehalem_extra_regs; + + x86_pmu.cpu_events = nhm_events_attrs; + + /* UOPS_ISSUED.STALLED_CYCLES */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = + X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1); + /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = + X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1); + + x86_add_quirk(intel_nehalem_quirk); + pr_cont("Nehalem events, "); break; case 28: /* Atom */ + case 38: /* Lincroft */ + case 39: /* Penwell */ + case 53: /* Cloverview */ + case 54: /* Cedarview */ memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, sizeof(hw_cache_event_ids)); intel_pmu_lbr_init_atom(); x86_pmu.event_constraints = intel_gen_event_constraints; + x86_pmu.pebs_constraints = intel_atom_pebs_event_constraints; pr_cont("Atom events, "); break; + case 55: /* Atom 22nm "Silvermont" */ + case 77: /* Avoton "Silvermont" */ + memcpy(hw_cache_event_ids, slm_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, slm_hw_cache_extra_regs, + sizeof(hw_cache_extra_regs)); + + intel_pmu_lbr_init_atom(); + + x86_pmu.event_constraints = intel_slm_event_constraints; + x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints; + x86_pmu.extra_regs = intel_slm_extra_regs; + x86_pmu.er_flags |= ERF_HAS_RSP_1; + pr_cont("Silvermont events, "); + break; + case 37: /* 32 nm nehalem, "Clarkdale" */ case 44: /* 32 nm nehalem, "Gulftown" */ + case 47: /* 32 nm Xeon E7 */ memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs, + sizeof(hw_cache_extra_regs)); intel_pmu_lbr_init_nhm(); x86_pmu.event_constraints = intel_westmere_event_constraints; x86_pmu.enable_all = intel_pmu_nhm_enable_all; + x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints; + x86_pmu.extra_regs = intel_westmere_extra_regs; + x86_pmu.er_flags |= ERF_HAS_RSP_1; + + x86_pmu.cpu_events = nhm_events_attrs; + + /* UOPS_ISSUED.STALLED_CYCLES */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = + X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1); + /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = + X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1); + pr_cont("Westmere events, "); break; + case 42: /* SandyBridge */ + case 45: /* SandyBridge, "Romely-EP" */ + x86_add_quirk(intel_sandybridge_quirk); + memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, + sizeof(hw_cache_extra_regs)); + + intel_pmu_lbr_init_snb(); + + x86_pmu.event_constraints = intel_snb_event_constraints; + x86_pmu.pebs_constraints = intel_snb_pebs_event_constraints; + x86_pmu.pebs_aliases = intel_pebs_aliases_snb; + if (boot_cpu_data.x86_model == 45) + x86_pmu.extra_regs = intel_snbep_extra_regs; + else + x86_pmu.extra_regs = intel_snb_extra_regs; + /* all extra regs are per-cpu when HT is on */ + x86_pmu.er_flags |= ERF_HAS_RSP_1; + x86_pmu.er_flags |= ERF_NO_HT_SHARING; + + x86_pmu.cpu_events = snb_events_attrs; + + /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = + X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1); + /* UOPS_DISPATCHED.THREAD,c=1,i=1 to count stall cycles*/ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = + X86_CONFIG(.event=0xb1, .umask=0x01, .inv=1, .cmask=1); + + pr_cont("SandyBridge events, "); + break; + case 58: /* IvyBridge */ + case 62: /* IvyBridge EP */ + memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + /* dTLB-load-misses on IVB is different than SNB */ + hw_cache_event_ids[C(DTLB)][C(OP_READ)][C(RESULT_MISS)] = 0x8108; /* DTLB_LOAD_MISSES.DEMAND_LD_MISS_CAUSES_A_WALK */ + + memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, + sizeof(hw_cache_extra_regs)); + + intel_pmu_lbr_init_snb(); + + x86_pmu.event_constraints = intel_ivb_event_constraints; + x86_pmu.pebs_constraints = intel_ivb_pebs_event_constraints; + x86_pmu.pebs_aliases = intel_pebs_aliases_snb; + if (boot_cpu_data.x86_model == 62) + x86_pmu.extra_regs = intel_snbep_extra_regs; + else + x86_pmu.extra_regs = intel_snb_extra_regs; + /* all extra regs are per-cpu when HT is on */ + x86_pmu.er_flags |= ERF_HAS_RSP_1; + x86_pmu.er_flags |= ERF_NO_HT_SHARING; + + x86_pmu.cpu_events = snb_events_attrs; + + /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = + X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1); + + pr_cont("IvyBridge events, "); + break; + + + case 60: /* Haswell Client */ + case 70: + case 71: + case 63: + case 69: + x86_pmu.late_ack = true; + memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); + + intel_pmu_lbr_init_snb(); + + x86_pmu.event_constraints = intel_hsw_event_constraints; + x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints; + x86_pmu.extra_regs = intel_snb_extra_regs; + x86_pmu.pebs_aliases = intel_pebs_aliases_snb; + /* all extra regs are per-cpu when HT is on */ + x86_pmu.er_flags |= ERF_HAS_RSP_1; + x86_pmu.er_flags |= ERF_NO_HT_SHARING; + + x86_pmu.hw_config = hsw_hw_config; + x86_pmu.get_event_constraints = hsw_get_event_constraints; + x86_pmu.cpu_events = hsw_events_attrs; + x86_pmu.lbr_double_abort = true; + pr_cont("Haswell events, "); + break; + default: + switch (x86_pmu.version) { + case 1: + x86_pmu.event_constraints = intel_v1_event_constraints; + pr_cont("generic architected perfmon v1, "); + break; + default: + /* + * default constraints for v2 and up + */ + x86_pmu.event_constraints = intel_gen_event_constraints; + pr_cont("generic architected perfmon, "); + break; + } + } + + if (x86_pmu.num_counters > INTEL_PMC_MAX_GENERIC) { + WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", + x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC); + x86_pmu.num_counters = INTEL_PMC_MAX_GENERIC; + } + x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1; + + if (x86_pmu.num_counters_fixed > INTEL_PMC_MAX_FIXED) { + WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", + x86_pmu.num_counters_fixed, INTEL_PMC_MAX_FIXED); + x86_pmu.num_counters_fixed = INTEL_PMC_MAX_FIXED; + } + + x86_pmu.intel_ctrl |= + ((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED; + + if (x86_pmu.event_constraints) { /* - * default constraints for v2 and up + * event on fixed counter2 (REF_CYCLES) only works on this + * counter, so do not extend mask to generic counters */ - x86_pmu.event_constraints = intel_gen_event_constraints; - pr_cont("generic architected perfmon, "); + for_each_event_constraint(c, x86_pmu.event_constraints) { + if (c->cmask != FIXED_EVENT_FLAGS + || c->idxmsk64 == INTEL_PMC_MSK_FIXED_REF_CYCLES) { + continue; + } + + c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1; + c->weight += x86_pmu.num_counters; + } + } + + /* + * Access LBR MSR may cause #GP under certain circumstances. + * E.g. KVM doesn't support LBR MSR + * Check all LBT MSR here. + * Disable LBR access if any LBR MSRs can not be accessed. + */ + if (x86_pmu.lbr_nr && !check_msr(x86_pmu.lbr_tos, 0x3UL)) + x86_pmu.lbr_nr = 0; + for (i = 0; i < x86_pmu.lbr_nr; i++) { + if (!(check_msr(x86_pmu.lbr_from + i, 0xffffUL) && + check_msr(x86_pmu.lbr_to + i, 0xffffUL))) + x86_pmu.lbr_nr = 0; + } + + /* + * Access extra MSR may cause #GP under certain circumstances. + * E.g. KVM doesn't support offcore event + * Check all extra_regs here. + */ + if (x86_pmu.extra_regs) { + for (er = x86_pmu.extra_regs; er->msr; er++) { + er->extra_msr_access = check_msr(er->msr, 0x1ffUL); + /* Disable LBR select mapping */ + if ((er->idx == EXTRA_REG_LBR) && !er->extra_msr_access) + x86_pmu.lbr_sel_map = NULL; + } } - return 0; -} -#else /* CONFIG_CPU_SUP_INTEL */ + /* Support full width counters using alternative MSR range */ + if (x86_pmu.intel_cap.full_width_write) { + x86_pmu.max_period = x86_pmu.cntval_mask; + x86_pmu.perfctr = MSR_IA32_PMC0; + pr_cont("full-width counters, "); + } -static int intel_pmu_init(void) -{ return 0; } - -#endif /* CONFIG_CPU_SUP_INTEL */ diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index b7dcd9f2b8a..696ade311de 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -1,13 +1,18 @@ -#ifdef CONFIG_CPU_SUP_INTEL +#include <linux/bitops.h> +#include <linux/types.h> +#include <linux/slab.h> -/* The maximal number of PEBS events: */ -#define MAX_PEBS_EVENTS 4 +#include <asm/perf_event.h> +#include <asm/insn.h> + +#include "perf_event.h" /* The size of a BTS record in bytes: */ #define BTS_RECORD_SIZE 24 #define BTS_BUFFER_SIZE (PAGE_SIZE << 4) #define PEBS_BUFFER_SIZE PAGE_SIZE +#define PEBS_FIXUP_SIZE PAGE_SIZE /* * pebs_record_32 for p4 and core not supported @@ -20,6 +25,159 @@ struct pebs_record_32 { */ +union intel_x86_pebs_dse { + u64 val; + struct { + unsigned int ld_dse:4; + unsigned int ld_stlb_miss:1; + unsigned int ld_locked:1; + unsigned int ld_reserved:26; + }; + struct { + unsigned int st_l1d_hit:1; + unsigned int st_reserved1:3; + unsigned int st_stlb_miss:1; + unsigned int st_locked:1; + unsigned int st_reserved2:26; + }; +}; + + +/* + * Map PEBS Load Latency Data Source encodings to generic + * memory data source information + */ +#define P(a, b) PERF_MEM_S(a, b) +#define OP_LH (P(OP, LOAD) | P(LVL, HIT)) +#define SNOOP_NONE_MISS (P(SNOOP, NONE) | P(SNOOP, MISS)) + +static const u64 pebs_data_source[] = { + P(OP, LOAD) | P(LVL, MISS) | P(LVL, L3) | P(SNOOP, NA),/* 0x00:ukn L3 */ + OP_LH | P(LVL, L1) | P(SNOOP, NONE), /* 0x01: L1 local */ + OP_LH | P(LVL, LFB) | P(SNOOP, NONE), /* 0x02: LFB hit */ + OP_LH | P(LVL, L2) | P(SNOOP, NONE), /* 0x03: L2 hit */ + OP_LH | P(LVL, L3) | P(SNOOP, NONE), /* 0x04: L3 hit */ + OP_LH | P(LVL, L3) | P(SNOOP, MISS), /* 0x05: L3 hit, snoop miss */ + OP_LH | P(LVL, L3) | P(SNOOP, HIT), /* 0x06: L3 hit, snoop hit */ + OP_LH | P(LVL, L3) | P(SNOOP, HITM), /* 0x07: L3 hit, snoop hitm */ + OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HIT), /* 0x08: L3 miss snoop hit */ + OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HITM), /* 0x09: L3 miss snoop hitm*/ + OP_LH | P(LVL, LOC_RAM) | P(SNOOP, HIT), /* 0x0a: L3 miss, shared */ + OP_LH | P(LVL, REM_RAM1) | P(SNOOP, HIT), /* 0x0b: L3 miss, shared */ + OP_LH | P(LVL, LOC_RAM) | SNOOP_NONE_MISS,/* 0x0c: L3 miss, excl */ + OP_LH | P(LVL, REM_RAM1) | SNOOP_NONE_MISS,/* 0x0d: L3 miss, excl */ + OP_LH | P(LVL, IO) | P(SNOOP, NONE), /* 0x0e: I/O */ + OP_LH | P(LVL, UNC) | P(SNOOP, NONE), /* 0x0f: uncached */ +}; + +static u64 precise_store_data(u64 status) +{ + union intel_x86_pebs_dse dse; + u64 val = P(OP, STORE) | P(SNOOP, NA) | P(LVL, L1) | P(TLB, L2); + + dse.val = status; + + /* + * bit 4: TLB access + * 1 = stored missed 2nd level TLB + * + * so it either hit the walker or the OS + * otherwise hit 2nd level TLB + */ + if (dse.st_stlb_miss) + val |= P(TLB, MISS); + else + val |= P(TLB, HIT); + + /* + * bit 0: hit L1 data cache + * if not set, then all we know is that + * it missed L1D + */ + if (dse.st_l1d_hit) + val |= P(LVL, HIT); + else + val |= P(LVL, MISS); + + /* + * bit 5: Locked prefix + */ + if (dse.st_locked) + val |= P(LOCK, LOCKED); + + return val; +} + +static u64 precise_store_data_hsw(struct perf_event *event, u64 status) +{ + union perf_mem_data_src dse; + u64 cfg = event->hw.config & INTEL_ARCH_EVENT_MASK; + + dse.val = 0; + dse.mem_op = PERF_MEM_OP_STORE; + dse.mem_lvl = PERF_MEM_LVL_NA; + + /* + * L1 info only valid for following events: + * + * MEM_UOPS_RETIRED.STLB_MISS_STORES + * MEM_UOPS_RETIRED.LOCK_STORES + * MEM_UOPS_RETIRED.SPLIT_STORES + * MEM_UOPS_RETIRED.ALL_STORES + */ + if (cfg != 0x12d0 && cfg != 0x22d0 && cfg != 0x42d0 && cfg != 0x82d0) + return dse.mem_lvl; + + if (status & 1) + dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT; + else + dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_MISS; + + /* Nothing else supported. Sorry. */ + return dse.val; +} + +static u64 load_latency_data(u64 status) +{ + union intel_x86_pebs_dse dse; + u64 val; + int model = boot_cpu_data.x86_model; + int fam = boot_cpu_data.x86; + + dse.val = status; + + /* + * use the mapping table for bit 0-3 + */ + val = pebs_data_source[dse.ld_dse]; + + /* + * Nehalem models do not support TLB, Lock infos + */ + if (fam == 0x6 && (model == 26 || model == 30 + || model == 31 || model == 46)) { + val |= P(TLB, NA) | P(LOCK, NA); + return val; + } + /* + * bit 4: TLB access + * 0 = did not miss 2nd level TLB + * 1 = missed 2nd level TLB + */ + if (dse.ld_stlb_miss) + val |= P(TLB, MISS) | P(TLB, L2); + else + val |= P(TLB, HIT) | P(TLB, L1) | P(TLB, L2); + + /* + * bit 5: locked prefix + */ + if (dse.ld_locked) + val |= P(LOCK, LOCKED); + + return val; +} + struct pebs_record_core { u64 flags, ip; u64 ax, bx, cx, dx; @@ -38,23 +196,36 @@ struct pebs_record_nhm { }; /* - * A debug store configuration. - * - * We only support architectures that use 64bit fields. + * Same as pebs_record_nhm, with two additional fields. */ -struct debug_store { - u64 bts_buffer_base; - u64 bts_index; - u64 bts_absolute_maximum; - u64 bts_interrupt_threshold; - u64 pebs_buffer_base; - u64 pebs_index; - u64 pebs_absolute_maximum; - u64 pebs_interrupt_threshold; - u64 pebs_event_reset[MAX_PEBS_EVENTS]; +struct pebs_record_hsw { + u64 flags, ip; + u64 ax, bx, cx, dx; + u64 si, di, bp, sp; + u64 r8, r9, r10, r11; + u64 r12, r13, r14, r15; + u64 status, dla, dse, lat; + u64 real_ip, tsx_tuning; +}; + +union hsw_tsx_tuning { + struct { + u32 cycles_last_block : 32, + hle_abort : 1, + rtm_abort : 1, + instruction_abort : 1, + non_instruction_abort : 1, + retry : 1, + data_conflict : 1, + capacity_writes : 1, + capacity_reads : 1; + }; + u64 value; }; -static void init_debug_store_on_cpu(int cpu) +#define PEBS_HSW_TSX_FLAGS 0xff00000000ULL + +void init_debug_store_on_cpu(int cpu) { struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; @@ -66,7 +237,7 @@ static void init_debug_store_on_cpu(int cpu) (u32)((u64)(unsigned long)ds >> 32)); } -static void fini_debug_store_on_cpu(int cpu) +void fini_debug_store_on_cpu(int cpu) { if (!per_cpu(cpu_hw_events, cpu).ds) return; @@ -74,20 +245,35 @@ static void fini_debug_store_on_cpu(int cpu) wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0); } +static DEFINE_PER_CPU(void *, insn_buffer); + static int alloc_pebs_buffer(int cpu) { struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; int node = cpu_to_node(cpu); int max, thresh = 1; /* always use a single PEBS record */ - void *buffer; + void *buffer, *ibuffer; if (!x86_pmu.pebs) return 0; - buffer = kmalloc_node(PEBS_BUFFER_SIZE, GFP_KERNEL | __GFP_ZERO, node); + buffer = kzalloc_node(PEBS_BUFFER_SIZE, GFP_KERNEL, node); if (unlikely(!buffer)) return -ENOMEM; + /* + * HSW+ already provides us the eventing ip; no need to allocate this + * buffer then. + */ + if (x86_pmu.intel_cap.pebs_format < 2) { + ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node); + if (!ibuffer) { + kfree(buffer); + return -ENOMEM; + } + per_cpu(insn_buffer, cpu) = ibuffer; + } + max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size; ds->pebs_buffer_base = (u64)(unsigned long)buffer; @@ -108,6 +294,9 @@ static void release_pebs_buffer(int cpu) if (!ds || !x86_pmu.pebs) return; + kfree(per_cpu(insn_buffer, cpu)); + per_cpu(insn_buffer, cpu) = NULL; + kfree((void *)(unsigned long)ds->pebs_buffer_base); ds->pebs_buffer_base = 0; } @@ -122,9 +311,11 @@ static int alloc_bts_buffer(int cpu) if (!x86_pmu.bts) return 0; - buffer = kmalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_ZERO, node); - if (unlikely(!buffer)) + buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node); + if (unlikely(!buffer)) { + WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__); return -ENOMEM; + } max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE; thresh = max / 16; @@ -155,7 +346,7 @@ static int alloc_ds_buffer(int cpu) int node = cpu_to_node(cpu); struct debug_store *ds; - ds = kmalloc_node(sizeof(*ds), GFP_KERNEL | __GFP_ZERO, node); + ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node); if (unlikely(!ds)) return -ENOMEM; @@ -175,7 +366,7 @@ static void release_ds_buffer(int cpu) kfree(ds); } -static void release_ds_buffers(void) +void release_ds_buffers(void) { int cpu; @@ -194,7 +385,7 @@ static void release_ds_buffers(void) put_online_cpus(); } -static void reserve_ds_buffers(void) +void reserve_ds_buffers(void) { int bts_err = 0, pebs_err = 0; int cpu; @@ -260,10 +451,10 @@ static void reserve_ds_buffers(void) * BTS */ -static struct event_constraint bts_constraint = - EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0); +struct event_constraint bts_constraint = + EVENT_CONSTRAINT(0, 1ULL << INTEL_PMC_IDX_FIXED_BTS, 0); -static void intel_pmu_enable_bts(u64 config) +void intel_pmu_enable_bts(u64 config) { unsigned long debugctlmsr; @@ -282,7 +473,7 @@ static void intel_pmu_enable_bts(u64 config) update_debugctlmsr(debugctlmsr); } -static void intel_pmu_disable_bts(void) +void intel_pmu_disable_bts(void) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); unsigned long debugctlmsr; @@ -299,7 +490,7 @@ static void intel_pmu_disable_bts(void) update_debugctlmsr(debugctlmsr); } -static int intel_pmu_drain_bts_buffer(void) +int intel_pmu_drain_bts_buffer(void) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct debug_store *ds = cpuc->ds; @@ -308,7 +499,7 @@ static int intel_pmu_drain_bts_buffer(void) u64 to; u64 flags; }; - struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS]; + struct perf_event *event = cpuc->events[INTEL_PMC_IDX_FIXED_BTS]; struct bts_record *at, *top; struct perf_output_handle handle; struct perf_event_header header; @@ -327,11 +518,11 @@ static int intel_pmu_drain_bts_buffer(void) if (top <= at) return 0; + memset(®s, 0, sizeof(regs)); + ds->bts_index = ds->bts_buffer_base; - perf_sample_data_init(&data, 0); - data.period = event->hw.last_period; - regs.ip = 0; + perf_sample_data_init(&data, 0, event->hw.last_period); /* * Prepare a generic sample, i.e. fill in the invariant fields. @@ -340,7 +531,7 @@ static int intel_pmu_drain_bts_buffer(void) */ perf_prepare_sample(&header, &data, event, ®s); - if (perf_output_begin(&handle, event, header.size * (top - at), 1, 1)) + if (perf_output_begin(&handle, event, header.size * (top - at))) return 1; for (; at < top; at++) { @@ -361,35 +552,146 @@ static int intel_pmu_drain_bts_buffer(void) /* * PEBS */ +struct event_constraint intel_core2_pebs_event_constraints[] = { + INTEL_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */ + INTEL_UEVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */ + INTEL_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */ + INTEL_UEVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETURED.ANY */ + INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */ + EVENT_CONSTRAINT_END +}; -static struct event_constraint intel_core_pebs_events[] = { - PEBS_EVENT_CONSTRAINT(0x00c0, 0x1), /* INSTR_RETIRED.ANY */ - PEBS_EVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */ - PEBS_EVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */ - PEBS_EVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETURED.ANY */ - PEBS_EVENT_CONSTRAINT(0x01cb, 0x1), /* MEM_LOAD_RETIRED.L1D_MISS */ - PEBS_EVENT_CONSTRAINT(0x02cb, 0x1), /* MEM_LOAD_RETIRED.L1D_LINE_MISS */ - PEBS_EVENT_CONSTRAINT(0x04cb, 0x1), /* MEM_LOAD_RETIRED.L2_MISS */ - PEBS_EVENT_CONSTRAINT(0x08cb, 0x1), /* MEM_LOAD_RETIRED.L2_LINE_MISS */ - PEBS_EVENT_CONSTRAINT(0x10cb, 0x1), /* MEM_LOAD_RETIRED.DTLB_MISS */ +struct event_constraint intel_atom_pebs_event_constraints[] = { + INTEL_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */ + INTEL_UEVENT_CONSTRAINT(0x00c5, 0x1), /* MISPREDICTED_BRANCH_RETIRED */ + INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */ EVENT_CONSTRAINT_END }; -static struct event_constraint intel_nehalem_pebs_events[] = { - PEBS_EVENT_CONSTRAINT(0x00c0, 0xf), /* INSTR_RETIRED.ANY */ - PEBS_EVENT_CONSTRAINT(0xfec1, 0xf), /* X87_OPS_RETIRED.ANY */ - PEBS_EVENT_CONSTRAINT(0x00c5, 0xf), /* BR_INST_RETIRED.MISPRED */ - PEBS_EVENT_CONSTRAINT(0x1fc7, 0xf), /* SIMD_INST_RETURED.ANY */ - PEBS_EVENT_CONSTRAINT(0x01cb, 0xf), /* MEM_LOAD_RETIRED.L1D_MISS */ - PEBS_EVENT_CONSTRAINT(0x02cb, 0xf), /* MEM_LOAD_RETIRED.L1D_LINE_MISS */ - PEBS_EVENT_CONSTRAINT(0x04cb, 0xf), /* MEM_LOAD_RETIRED.L2_MISS */ - PEBS_EVENT_CONSTRAINT(0x08cb, 0xf), /* MEM_LOAD_RETIRED.L2_LINE_MISS */ - PEBS_EVENT_CONSTRAINT(0x10cb, 0xf), /* MEM_LOAD_RETIRED.DTLB_MISS */ +struct event_constraint intel_slm_pebs_event_constraints[] = { + INTEL_UEVENT_CONSTRAINT(0x0103, 0x1), /* REHABQ.LD_BLOCK_ST_FORWARD_PS */ + INTEL_UEVENT_CONSTRAINT(0x0803, 0x1), /* REHABQ.LD_SPLITS_PS */ + INTEL_UEVENT_CONSTRAINT(0x0204, 0x1), /* MEM_UOPS_RETIRED.L2_HIT_LOADS_PS */ + INTEL_UEVENT_CONSTRAINT(0x0404, 0x1), /* MEM_UOPS_RETIRED.L2_MISS_LOADS_PS */ + INTEL_UEVENT_CONSTRAINT(0x0804, 0x1), /* MEM_UOPS_RETIRED.DTLB_MISS_LOADS_PS */ + INTEL_UEVENT_CONSTRAINT(0x2004, 0x1), /* MEM_UOPS_RETIRED.HITM_PS */ + INTEL_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY_PS */ + INTEL_UEVENT_CONSTRAINT(0x00c4, 0x1), /* BR_INST_RETIRED.ALL_BRANCHES_PS */ + INTEL_UEVENT_CONSTRAINT(0x7ec4, 0x1), /* BR_INST_RETIRED.JCC_PS */ + INTEL_UEVENT_CONSTRAINT(0xbfc4, 0x1), /* BR_INST_RETIRED.FAR_BRANCH_PS */ + INTEL_UEVENT_CONSTRAINT(0xebc4, 0x1), /* BR_INST_RETIRED.NON_RETURN_IND_PS */ + INTEL_UEVENT_CONSTRAINT(0xf7c4, 0x1), /* BR_INST_RETIRED.RETURN_PS */ + INTEL_UEVENT_CONSTRAINT(0xf9c4, 0x1), /* BR_INST_RETIRED.CALL_PS */ + INTEL_UEVENT_CONSTRAINT(0xfbc4, 0x1), /* BR_INST_RETIRED.IND_CALL_PS */ + INTEL_UEVENT_CONSTRAINT(0xfdc4, 0x1), /* BR_INST_RETIRED.REL_CALL_PS */ + INTEL_UEVENT_CONSTRAINT(0xfec4, 0x1), /* BR_INST_RETIRED.TAKEN_JCC_PS */ + INTEL_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_MISP_RETIRED.ALL_BRANCHES_PS */ + INTEL_UEVENT_CONSTRAINT(0x7ec5, 0x1), /* BR_INST_MISP_RETIRED.JCC_PS */ + INTEL_UEVENT_CONSTRAINT(0xebc5, 0x1), /* BR_INST_MISP_RETIRED.NON_RETURN_IND_PS */ + INTEL_UEVENT_CONSTRAINT(0xf7c5, 0x1), /* BR_INST_MISP_RETIRED.RETURN_PS */ + INTEL_UEVENT_CONSTRAINT(0xfbc5, 0x1), /* BR_INST_MISP_RETIRED.IND_CALL_PS */ + INTEL_UEVENT_CONSTRAINT(0xfec5, 0x1), /* BR_INST_MISP_RETIRED.TAKEN_JCC_PS */ + EVENT_CONSTRAINT_END +}; + +struct event_constraint intel_nehalem_pebs_event_constraints[] = { + INTEL_PLD_CONSTRAINT(0x100b, 0xf), /* MEM_INST_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */ + INTEL_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */ + INTEL_EVENT_CONSTRAINT(0xc0, 0xf), /* INST_RETIRED.ANY */ + INTEL_EVENT_CONSTRAINT(0xc2, 0xf), /* UOPS_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ + INTEL_UEVENT_CONSTRAINT(0x02c5, 0xf), /* BR_MISP_RETIRED.NEAR_CALL */ + INTEL_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */ + INTEL_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */ + INTEL_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */ + EVENT_CONSTRAINT_END +}; + +struct event_constraint intel_westmere_pebs_event_constraints[] = { + INTEL_PLD_CONSTRAINT(0x100b, 0xf), /* MEM_INST_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */ + INTEL_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */ + INTEL_EVENT_CONSTRAINT(0xc0, 0xf), /* INSTR_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xc2, 0xf), /* UOPS_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */ + INTEL_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */ + INTEL_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */ + EVENT_CONSTRAINT_END +}; + +struct event_constraint intel_snb_pebs_event_constraints[] = { + INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ + INTEL_UEVENT_CONSTRAINT(0x01c2, 0xf), /* UOPS_RETIRED.ALL */ + INTEL_UEVENT_CONSTRAINT(0x02c2, 0xf), /* UOPS_RETIRED.RETIRE_SLOTS */ + INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */ + INTEL_PLD_CONSTRAINT(0x01cd, 0x8), /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */ + INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */ + INTEL_EVENT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */ + INTEL_UEVENT_CONSTRAINT(0x02d4, 0xf), /* MEM_LOAD_UOPS_MISC_RETIRED.LLC_MISS */ + EVENT_CONSTRAINT_END +}; + +struct event_constraint intel_ivb_pebs_event_constraints[] = { + INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ + INTEL_UEVENT_CONSTRAINT(0x01c2, 0xf), /* UOPS_RETIRED.ALL */ + INTEL_UEVENT_CONSTRAINT(0x02c2, 0xf), /* UOPS_RETIRED.RETIRE_SLOTS */ + INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */ + INTEL_PLD_CONSTRAINT(0x01cd, 0x8), /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */ + INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */ + INTEL_EVENT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */ + EVENT_CONSTRAINT_END +}; + +struct event_constraint intel_hsw_pebs_event_constraints[] = { + INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ + INTEL_PST_HSW_CONSTRAINT(0x01c2, 0xf), /* UOPS_RETIRED.ALL */ + INTEL_UEVENT_CONSTRAINT(0x02c2, 0xf), /* UOPS_RETIRED.RETIRE_SLOTS */ + INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ + INTEL_UEVENT_CONSTRAINT(0x01c5, 0xf), /* BR_MISP_RETIRED.CONDITIONAL */ + INTEL_UEVENT_CONSTRAINT(0x04c5, 0xf), /* BR_MISP_RETIRED.ALL_BRANCHES */ + INTEL_UEVENT_CONSTRAINT(0x20c5, 0xf), /* BR_MISP_RETIRED.NEAR_TAKEN */ + INTEL_PLD_CONSTRAINT(0x01cd, 0x8), /* MEM_TRANS_RETIRED.* */ + /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */ + INTEL_UEVENT_CONSTRAINT(0x11d0, 0xf), + /* MEM_UOPS_RETIRED.STLB_MISS_STORES */ + INTEL_UEVENT_CONSTRAINT(0x12d0, 0xf), + INTEL_UEVENT_CONSTRAINT(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */ + INTEL_UEVENT_CONSTRAINT(0x41d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_LOADS */ + /* MEM_UOPS_RETIRED.SPLIT_STORES */ + INTEL_UEVENT_CONSTRAINT(0x42d0, 0xf), + INTEL_UEVENT_CONSTRAINT(0x81d0, 0xf), /* MEM_UOPS_RETIRED.ALL_LOADS */ + INTEL_PST_HSW_CONSTRAINT(0x82d0, 0xf), /* MEM_UOPS_RETIRED.ALL_STORES */ + INTEL_UEVENT_CONSTRAINT(0x01d1, 0xf), /* MEM_LOAD_UOPS_RETIRED.L1_HIT */ + INTEL_UEVENT_CONSTRAINT(0x02d1, 0xf), /* MEM_LOAD_UOPS_RETIRED.L2_HIT */ + INTEL_UEVENT_CONSTRAINT(0x04d1, 0xf), /* MEM_LOAD_UOPS_RETIRED.L3_HIT */ + /* MEM_LOAD_UOPS_RETIRED.HIT_LFB */ + INTEL_UEVENT_CONSTRAINT(0x40d1, 0xf), + /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS */ + INTEL_UEVENT_CONSTRAINT(0x01d2, 0xf), + /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT */ + INTEL_UEVENT_CONSTRAINT(0x02d2, 0xf), + /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM */ + INTEL_UEVENT_CONSTRAINT(0x01d3, 0xf), + INTEL_UEVENT_CONSTRAINT(0x04c8, 0xf), /* HLE_RETIRED.Abort */ + INTEL_UEVENT_CONSTRAINT(0x04c9, 0xf), /* RTM_RETIRED.Abort */ + EVENT_CONSTRAINT_END }; -static struct event_constraint * -intel_pebs_constraints(struct perf_event *event) +struct event_constraint *intel_pebs_constraints(struct perf_event *event) { struct event_constraint *c; @@ -398,15 +700,17 @@ intel_pebs_constraints(struct perf_event *event) if (x86_pmu.pebs_constraints) { for_each_event_constraint(c, x86_pmu.pebs_constraints) { - if ((event->hw.config & c->cmask) == c->code) + if ((event->hw.config & c->cmask) == c->code) { + event->hw.flags |= c->flags; return c; + } } } return &emptyconstraint; } -static void intel_pmu_pebs_enable(struct perf_event *event) +void intel_pmu_pebs_enable(struct perf_event *event) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct hw_perf_event *hwc = &event->hw; @@ -414,28 +718,32 @@ static void intel_pmu_pebs_enable(struct perf_event *event) hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT; cpuc->pebs_enabled |= 1ULL << hwc->idx; - WARN_ON_ONCE(cpuc->enabled); - if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1) - intel_pmu_lbr_enable(event); + if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT) + cpuc->pebs_enabled |= 1ULL << (hwc->idx + 32); + else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST) + cpuc->pebs_enabled |= 1ULL << 63; } -static void intel_pmu_pebs_disable(struct perf_event *event) +void intel_pmu_pebs_disable(struct perf_event *event) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct hw_perf_event *hwc = &event->hw; cpuc->pebs_enabled &= ~(1ULL << hwc->idx); + + if (event->hw.constraint->flags & PERF_X86_EVENT_PEBS_LDLAT) + cpuc->pebs_enabled &= ~(1ULL << (hwc->idx + 32)); + else if (event->hw.constraint->flags & PERF_X86_EVENT_PEBS_ST) + cpuc->pebs_enabled &= ~(1ULL << 63); + if (cpuc->enabled) wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled); hwc->config |= ARCH_PERFMON_EVENTSEL_INT; - - if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1) - intel_pmu_lbr_disable(event); } -static void intel_pmu_pebs_enable_all(void) +void intel_pmu_pebs_enable_all(void) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); @@ -443,7 +751,7 @@ static void intel_pmu_pebs_enable_all(void) wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled); } -static void intel_pmu_pebs_disable_all(void) +void intel_pmu_pebs_disable_all(void) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); @@ -451,23 +759,14 @@ static void intel_pmu_pebs_disable_all(void) wrmsrl(MSR_IA32_PEBS_ENABLE, 0); } -#include <asm/insn.h> - -static inline bool kernel_ip(unsigned long ip) -{ -#ifdef CONFIG_X86_32 - return ip > PAGE_OFFSET; -#else - return (long)ip < 0; -#endif -} - static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); unsigned long from = cpuc->lbr_entries[0].from; unsigned long old_to, to = cpuc->lbr_entries[0].to; unsigned long ip = regs->ip; + int is_64bit = 0; + void *kaddr; /* * We don't need to fixup if the PEBS assist is fault like @@ -491,41 +790,48 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) * unsigned math, either ip is before the start (impossible) or * the basic block is larger than 1 page (sanity) */ - if ((ip - to) > PAGE_SIZE) + if ((ip - to) > PEBS_FIXUP_SIZE) return 0; /* * We sampled a branch insn, rewind using the LBR stack */ if (ip == to) { - regs->ip = from; + set_linear_ip(regs, from); return 1; } + if (!kernel_ip(ip)) { + int size, bytes; + u8 *buf = this_cpu_read(insn_buffer); + + size = ip - to; /* Must fit our buffer, see above */ + bytes = copy_from_user_nmi(buf, (void __user *)to, size); + if (bytes != 0) + return 0; + + kaddr = buf; + } else { + kaddr = (void *)to; + } + do { struct insn insn; - u8 buf[MAX_INSN_SIZE]; - void *kaddr; old_to = to; - if (!kernel_ip(ip)) { - int bytes, size = MAX_INSN_SIZE; - - bytes = copy_from_user_nmi(buf, (void __user *)to, size); - if (bytes != size) - return 0; - kaddr = buf; - } else - kaddr = (void *)to; - - kernel_insn_init(&insn, kaddr); +#ifdef CONFIG_X86_64 + is_64bit = kernel_ip(to) || !test_thread_flag(TIF_IA32); +#endif + insn_init(&insn, kaddr, is_64bit); insn_get_length(&insn); + to += insn.length; + kaddr += insn.length; } while (to < ip); if (to == ip) { - regs->ip = old_to; + set_linear_ip(regs, old_to); return 1; } @@ -536,25 +842,74 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) return 0; } -static int intel_pmu_save_and_restart(struct perf_event *event); +static inline u64 intel_hsw_weight(struct pebs_record_hsw *pebs) +{ + if (pebs->tsx_tuning) { + union hsw_tsx_tuning tsx = { .value = pebs->tsx_tuning }; + return tsx.cycles_last_block; + } + return 0; +} + +static inline u64 intel_hsw_transaction(struct pebs_record_hsw *pebs) +{ + u64 txn = (pebs->tsx_tuning & PEBS_HSW_TSX_FLAGS) >> 32; + + /* For RTM XABORTs also log the abort code from AX */ + if ((txn & PERF_TXN_TRANSACTION) && (pebs->ax & 1)) + txn |= ((pebs->ax >> 24) & 0xff) << PERF_TXN_ABORT_SHIFT; + return txn; +} static void __intel_pmu_pebs_event(struct perf_event *event, struct pt_regs *iregs, void *__pebs) { /* - * We cast to pebs_record_core since that is a subset of - * both formats and we don't use the other fields in this - * routine. + * We cast to the biggest pebs_record but are careful not to + * unconditionally access the 'extra' entries. */ - struct pebs_record_core *pebs = __pebs; + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct pebs_record_hsw *pebs = __pebs; struct perf_sample_data data; struct pt_regs regs; + u64 sample_type; + int fll, fst; if (!intel_pmu_save_and_restart(event)) return; - perf_sample_data_init(&data, 0); + fll = event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT; + fst = event->hw.flags & (PERF_X86_EVENT_PEBS_ST | + PERF_X86_EVENT_PEBS_ST_HSW); + + perf_sample_data_init(&data, 0, event->hw.last_period); + data.period = event->hw.last_period; + sample_type = event->attr.sample_type; + + /* + * if PEBS-LL or PreciseStore + */ + if (fll || fst) { + /* + * Use latency for weight (only avail with PEBS-LL) + */ + if (fll && (sample_type & PERF_SAMPLE_WEIGHT)) + data.weight = pebs->lat; + + /* + * data.data_src encodes the data source + */ + if (sample_type & PERF_SAMPLE_DATA_SRC) { + if (fll) + data.data_src.val = load_latency_data(pebs->dse); + else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW) + data.data_src.val = + precise_store_data_hsw(event, pebs->dse); + else + data.data_src.val = precise_store_data(pebs->dse); + } + } /* * We use the interrupt regs as a base because the PEBS record @@ -567,16 +922,36 @@ static void __intel_pmu_pebs_event(struct perf_event *event, * A possible PERF_SAMPLE_REGS will have to transfer all regs. */ regs = *iregs; - regs.ip = pebs->ip; + regs.flags = pebs->flags; + set_linear_ip(®s, pebs->ip); regs.bp = pebs->bp; regs.sp = pebs->sp; - if (event->attr.precise_ip > 1 && intel_pmu_pebs_fixup_ip(®s)) + if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format >= 2) { + regs.ip = pebs->real_ip; + regs.flags |= PERF_EFLAGS_EXACT; + } else if (event->attr.precise_ip > 1 && intel_pmu_pebs_fixup_ip(®s)) regs.flags |= PERF_EFLAGS_EXACT; else regs.flags &= ~PERF_EFLAGS_EXACT; - if (perf_event_overflow(event, 1, &data, ®s)) + if ((event->attr.sample_type & PERF_SAMPLE_ADDR) && + x86_pmu.intel_cap.pebs_format >= 1) + data.addr = pebs->dla; + + if (x86_pmu.intel_cap.pebs_format >= 2) { + /* Only set the TSX weight when no memory weight. */ + if ((event->attr.sample_type & PERF_SAMPLE_WEIGHT) && !fll) + data.weight = intel_hsw_weight(pebs); + + if (event->attr.sample_type & PERF_SAMPLE_TRANSACTION) + data.txn = intel_hsw_transaction(pebs); + } + + if (has_branch_stack(event)) + data.br_stack = &cpuc->lbr_stack; + + if (perf_event_overflow(event, &data, ®s)) x86_pmu_stop(event, 0); } @@ -615,7 +990,7 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs) * Should not happen, we program the threshold at 1 and do not * set a reset value. */ - WARN_ON_ONCE(n > 1); + WARN_ONCE(n > 1, "bad leftover pebs %d\n", n); at += n - 1; __intel_pmu_pebs_event(event, iregs, at); @@ -625,10 +1000,10 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct debug_store *ds = cpuc->ds; - struct pebs_record_nhm *at, *top; struct perf_event *event = NULL; + void *at, *top; u64 status = 0; - int bit, n; + int bit; if (!x86_pmu.pebs_active) return; @@ -638,18 +1013,22 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) ds->pebs_index = ds->pebs_buffer_base; - n = top - at; - if (n <= 0) + if (unlikely(at > top)) return; /* * Should not happen, we program the threshold at 1 and do not * set a reset value. */ - WARN_ON_ONCE(n > MAX_PEBS_EVENTS); + WARN_ONCE(top - at > x86_pmu.max_pebs_events * x86_pmu.pebs_record_size, + "Unexpected number of pebs records %ld\n", + (long)(top - at) / x86_pmu.pebs_record_size); + + for (; at < top; at += x86_pmu.pebs_record_size) { + struct pebs_record_nhm *p = at; - for ( ; at < top; at++) { - for_each_set_bit(bit, (unsigned long *)&at->status, MAX_PEBS_EVENTS) { + for_each_set_bit(bit, (unsigned long *)&p->status, + x86_pmu.max_pebs_events) { event = cpuc->events[bit]; if (!test_bit(bit, cpuc->active_mask)) continue; @@ -665,7 +1044,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) break; } - if (!event || bit >= MAX_PEBS_EVENTS) + if (!event || bit >= x86_pmu.max_pebs_events) continue; __intel_pmu_pebs_event(event, iregs, at); @@ -676,7 +1055,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) * BTS, PEBS probe and setup */ -static void intel_ds_init(void) +void intel_ds_init(void) { /* * No support for 32bit formats @@ -695,32 +1074,33 @@ static void intel_ds_init(void) printk(KERN_CONT "PEBS fmt0%c, ", pebs_type); x86_pmu.pebs_record_size = sizeof(struct pebs_record_core); x86_pmu.drain_pebs = intel_pmu_drain_pebs_core; - x86_pmu.pebs_constraints = intel_core_pebs_events; break; case 1: printk(KERN_CONT "PEBS fmt1%c, ", pebs_type); x86_pmu.pebs_record_size = sizeof(struct pebs_record_nhm); x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm; - x86_pmu.pebs_constraints = intel_nehalem_pebs_events; + break; + + case 2: + pr_cont("PEBS fmt2%c, ", pebs_type); + x86_pmu.pebs_record_size = sizeof(struct pebs_record_hsw); + x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm; break; default: printk(KERN_CONT "no PEBS fmt%d%c, ", format, pebs_type); x86_pmu.pebs = 0; - break; } } } -#else /* CONFIG_CPU_SUP_INTEL */ - -static void reserve_ds_buffers(void) +void perf_restore_debug_store(void) { -} + struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds); -static void release_ds_buffers(void) -{ -} + if (!x86_pmu.bts && !x86_pmu.pebs) + return; -#endif /* CONFIG_CPU_SUP_INTEL */ + wrmsrl(MSR_IA32_DS_AREA, (unsigned long)ds); +} diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index d202c1bece1..9dd2459a4c7 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -1,12 +1,130 @@ -#ifdef CONFIG_CPU_SUP_INTEL +#include <linux/perf_event.h> +#include <linux/types.h> + +#include <asm/perf_event.h> +#include <asm/msr.h> +#include <asm/insn.h> + +#include "perf_event.h" enum { LBR_FORMAT_32 = 0x00, LBR_FORMAT_LIP = 0x01, LBR_FORMAT_EIP = 0x02, LBR_FORMAT_EIP_FLAGS = 0x03, + LBR_FORMAT_EIP_FLAGS2 = 0x04, + LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_EIP_FLAGS2, }; +static enum { + LBR_EIP_FLAGS = 1, + LBR_TSX = 2, +} lbr_desc[LBR_FORMAT_MAX_KNOWN + 1] = { + [LBR_FORMAT_EIP_FLAGS] = LBR_EIP_FLAGS, + [LBR_FORMAT_EIP_FLAGS2] = LBR_EIP_FLAGS | LBR_TSX, +}; + +/* + * Intel LBR_SELECT bits + * Intel Vol3a, April 2011, Section 16.7 Table 16-10 + * + * Hardware branch filter (not available on all CPUs) + */ +#define LBR_KERNEL_BIT 0 /* do not capture at ring0 */ +#define LBR_USER_BIT 1 /* do not capture at ring > 0 */ +#define LBR_JCC_BIT 2 /* do not capture conditional branches */ +#define LBR_REL_CALL_BIT 3 /* do not capture relative calls */ +#define LBR_IND_CALL_BIT 4 /* do not capture indirect calls */ +#define LBR_RETURN_BIT 5 /* do not capture near returns */ +#define LBR_IND_JMP_BIT 6 /* do not capture indirect jumps */ +#define LBR_REL_JMP_BIT 7 /* do not capture relative jumps */ +#define LBR_FAR_BIT 8 /* do not capture far branches */ + +#define LBR_KERNEL (1 << LBR_KERNEL_BIT) +#define LBR_USER (1 << LBR_USER_BIT) +#define LBR_JCC (1 << LBR_JCC_BIT) +#define LBR_REL_CALL (1 << LBR_REL_CALL_BIT) +#define LBR_IND_CALL (1 << LBR_IND_CALL_BIT) +#define LBR_RETURN (1 << LBR_RETURN_BIT) +#define LBR_REL_JMP (1 << LBR_REL_JMP_BIT) +#define LBR_IND_JMP (1 << LBR_IND_JMP_BIT) +#define LBR_FAR (1 << LBR_FAR_BIT) + +#define LBR_PLM (LBR_KERNEL | LBR_USER) + +#define LBR_SEL_MASK 0x1ff /* valid bits in LBR_SELECT */ +#define LBR_NOT_SUPP -1 /* LBR filter not supported */ +#define LBR_IGN 0 /* ignored */ + +#define LBR_ANY \ + (LBR_JCC |\ + LBR_REL_CALL |\ + LBR_IND_CALL |\ + LBR_RETURN |\ + LBR_REL_JMP |\ + LBR_IND_JMP |\ + LBR_FAR) + +#define LBR_FROM_FLAG_MISPRED (1ULL << 63) +#define LBR_FROM_FLAG_IN_TX (1ULL << 62) +#define LBR_FROM_FLAG_ABORT (1ULL << 61) + +#define for_each_branch_sample_type(x) \ + for ((x) = PERF_SAMPLE_BRANCH_USER; \ + (x) < PERF_SAMPLE_BRANCH_MAX; (x) <<= 1) + +/* + * x86control flow change classification + * x86control flow changes include branches, interrupts, traps, faults + */ +enum { + X86_BR_NONE = 0, /* unknown */ + + X86_BR_USER = 1 << 0, /* branch target is user */ + X86_BR_KERNEL = 1 << 1, /* branch target is kernel */ + + X86_BR_CALL = 1 << 2, /* call */ + X86_BR_RET = 1 << 3, /* return */ + X86_BR_SYSCALL = 1 << 4, /* syscall */ + X86_BR_SYSRET = 1 << 5, /* syscall return */ + X86_BR_INT = 1 << 6, /* sw interrupt */ + X86_BR_IRET = 1 << 7, /* return from interrupt */ + X86_BR_JCC = 1 << 8, /* conditional */ + X86_BR_JMP = 1 << 9, /* jump */ + X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */ + X86_BR_IND_CALL = 1 << 11,/* indirect calls */ + X86_BR_ABORT = 1 << 12,/* transaction abort */ + X86_BR_IN_TX = 1 << 13,/* in transaction */ + X86_BR_NO_TX = 1 << 14,/* not in transaction */ +}; + +#define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL) +#define X86_BR_ANYTX (X86_BR_NO_TX | X86_BR_IN_TX) + +#define X86_BR_ANY \ + (X86_BR_CALL |\ + X86_BR_RET |\ + X86_BR_SYSCALL |\ + X86_BR_SYSRET |\ + X86_BR_INT |\ + X86_BR_IRET |\ + X86_BR_JCC |\ + X86_BR_JMP |\ + X86_BR_IRQ |\ + X86_BR_ABORT |\ + X86_BR_IND_CALL) + +#define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY) + +#define X86_BR_ANY_CALL \ + (X86_BR_CALL |\ + X86_BR_IND_CALL |\ + X86_BR_SYSCALL |\ + X86_BR_IRQ |\ + X86_BR_INT) + +static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc); + /* * We only support LBR implementations that have FREEZE_LBRS_ON_PMI * otherwise it becomes near impossible to get a reliable stack. @@ -15,6 +133,10 @@ enum { static void __intel_pmu_lbr_enable(void) { u64 debugctl; + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + + if (cpuc->lbr_sel) + wrmsrl(MSR_LBR_SELECT, cpuc->lbr_sel->config); rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); debugctl |= (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); @@ -48,7 +170,7 @@ static void intel_pmu_lbr_reset_64(void) } } -static void intel_pmu_lbr_reset(void) +void intel_pmu_lbr_reset(void) { if (!x86_pmu.lbr_nr) return; @@ -59,29 +181,27 @@ static void intel_pmu_lbr_reset(void) intel_pmu_lbr_reset_64(); } -static void intel_pmu_lbr_enable(struct perf_event *event) +void intel_pmu_lbr_enable(struct perf_event *event) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); if (!x86_pmu.lbr_nr) return; - WARN_ON_ONCE(cpuc->enabled); - /* * Reset the LBR stack if we changed task context to * avoid data leaks. */ - if (event->ctx->task && cpuc->lbr_context != event->ctx) { intel_pmu_lbr_reset(); cpuc->lbr_context = event->ctx; } + cpuc->br_sel = event->hw.branch_reg.reg; cpuc->lbr_users++; } -static void intel_pmu_lbr_disable(struct perf_event *event) +void intel_pmu_lbr_disable(struct perf_event *event) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); @@ -91,11 +211,14 @@ static void intel_pmu_lbr_disable(struct perf_event *event) cpuc->lbr_users--; WARN_ON_ONCE(cpuc->lbr_users < 0); - if (cpuc->enabled && !cpuc->lbr_users) + if (cpuc->enabled && !cpuc->lbr_users) { __intel_pmu_lbr_disable(); + /* avoid stale pointer */ + cpuc->lbr_context = NULL; + } } -static void intel_pmu_lbr_enable_all(void) +void intel_pmu_lbr_enable_all(void) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); @@ -103,7 +226,7 @@ static void intel_pmu_lbr_enable_all(void) __intel_pmu_lbr_enable(); } -static void intel_pmu_lbr_disable_all(void) +void intel_pmu_lbr_disable_all(void) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); @@ -111,6 +234,9 @@ static void intel_pmu_lbr_disable_all(void) __intel_pmu_lbr_disable(); } +/* + * TOS = most recently recorded branch + */ static inline u64 intel_pmu_lbr_tos(void) { u64 tos; @@ -138,15 +264,15 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr); - cpuc->lbr_entries[i].from = msr_lastbranch.from; - cpuc->lbr_entries[i].to = msr_lastbranch.to; - cpuc->lbr_entries[i].flags = 0; + cpuc->lbr_entries[i].from = msr_lastbranch.from; + cpuc->lbr_entries[i].to = msr_lastbranch.to; + cpuc->lbr_entries[i].mispred = 0; + cpuc->lbr_entries[i].predicted = 0; + cpuc->lbr_entries[i].reserved = 0; } cpuc->lbr_stack.nr = i; } -#define LBR_FROM_FLAG_MISPRED (1ULL << 63) - /* * Due to lack of segmentation in Linux the effective address (offset) * is the same as the linear address, allowing us to merge the LIP and EIP @@ -158,27 +284,53 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) int lbr_format = x86_pmu.intel_cap.lbr_format; u64 tos = intel_pmu_lbr_tos(); int i; + int out = 0; for (i = 0; i < x86_pmu.lbr_nr; i++) { unsigned long lbr_idx = (tos - i) & mask; - u64 from, to, flags = 0; + u64 from, to, mis = 0, pred = 0, in_tx = 0, abort = 0; + int skip = 0; + int lbr_flags = lbr_desc[lbr_format]; rdmsrl(x86_pmu.lbr_from + lbr_idx, from); rdmsrl(x86_pmu.lbr_to + lbr_idx, to); - if (lbr_format == LBR_FORMAT_EIP_FLAGS) { - flags = !!(from & LBR_FROM_FLAG_MISPRED); - from = (u64)((((s64)from) << 1) >> 1); + if (lbr_flags & LBR_EIP_FLAGS) { + mis = !!(from & LBR_FROM_FLAG_MISPRED); + pred = !mis; + skip = 1; } - - cpuc->lbr_entries[i].from = from; - cpuc->lbr_entries[i].to = to; - cpuc->lbr_entries[i].flags = flags; + if (lbr_flags & LBR_TSX) { + in_tx = !!(from & LBR_FROM_FLAG_IN_TX); + abort = !!(from & LBR_FROM_FLAG_ABORT); + skip = 3; + } + from = (u64)((((s64)from) << skip) >> skip); + + /* + * Some CPUs report duplicated abort records, + * with the second entry not having an abort bit set. + * Skip them here. This loop runs backwards, + * so we need to undo the previous record. + * If the abort just happened outside the window + * the extra entry cannot be removed. + */ + if (abort && x86_pmu.lbr_double_abort && out > 0) + out--; + + cpuc->lbr_entries[out].from = from; + cpuc->lbr_entries[out].to = to; + cpuc->lbr_entries[out].mispred = mis; + cpuc->lbr_entries[out].predicted = pred; + cpuc->lbr_entries[out].in_tx = in_tx; + cpuc->lbr_entries[out].abort = abort; + cpuc->lbr_entries[out].reserved = 0; + out++; } - cpuc->lbr_stack.nr = i; + cpuc->lbr_stack.nr = out; } -static void intel_pmu_lbr_read(void) +void intel_pmu_lbr_read(void) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); @@ -189,30 +341,439 @@ static void intel_pmu_lbr_read(void) intel_pmu_lbr_read_32(cpuc); else intel_pmu_lbr_read_64(cpuc); + + intel_pmu_lbr_filter(cpuc); +} + +/* + * SW filter is used: + * - in case there is no HW filter + * - in case the HW filter has errata or limitations + */ +static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event) +{ + u64 br_type = event->attr.branch_sample_type; + int mask = 0; + + if (br_type & PERF_SAMPLE_BRANCH_USER) + mask |= X86_BR_USER; + + if (br_type & PERF_SAMPLE_BRANCH_KERNEL) + mask |= X86_BR_KERNEL; + + /* we ignore BRANCH_HV here */ + + if (br_type & PERF_SAMPLE_BRANCH_ANY) + mask |= X86_BR_ANY; + + if (br_type & PERF_SAMPLE_BRANCH_ANY_CALL) + mask |= X86_BR_ANY_CALL; + + if (br_type & PERF_SAMPLE_BRANCH_ANY_RETURN) + mask |= X86_BR_RET | X86_BR_IRET | X86_BR_SYSRET; + + if (br_type & PERF_SAMPLE_BRANCH_IND_CALL) + mask |= X86_BR_IND_CALL; + + if (br_type & PERF_SAMPLE_BRANCH_ABORT_TX) + mask |= X86_BR_ABORT; + + if (br_type & PERF_SAMPLE_BRANCH_IN_TX) + mask |= X86_BR_IN_TX; + + if (br_type & PERF_SAMPLE_BRANCH_NO_TX) + mask |= X86_BR_NO_TX; + + if (br_type & PERF_SAMPLE_BRANCH_COND) + mask |= X86_BR_JCC; + + /* + * stash actual user request into reg, it may + * be used by fixup code for some CPU + */ + event->hw.branch_reg.reg = mask; } -static void intel_pmu_lbr_init_core(void) +/* + * setup the HW LBR filter + * Used only when available, may not be enough to disambiguate + * all branches, may need the help of the SW filter + */ +static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event) +{ + struct hw_perf_event_extra *reg; + u64 br_type = event->attr.branch_sample_type; + u64 mask = 0, m; + u64 v; + + for_each_branch_sample_type(m) { + if (!(br_type & m)) + continue; + + v = x86_pmu.lbr_sel_map[m]; + if (v == LBR_NOT_SUPP) + return -EOPNOTSUPP; + + if (v != LBR_IGN) + mask |= v; + } + reg = &event->hw.branch_reg; + reg->idx = EXTRA_REG_LBR; + + /* LBR_SELECT operates in suppress mode so invert mask */ + reg->config = ~mask & x86_pmu.lbr_sel_mask; + + return 0; +} + +int intel_pmu_setup_lbr_filter(struct perf_event *event) +{ + int ret = 0; + + /* + * no LBR on this PMU + */ + if (!x86_pmu.lbr_nr) + return -EOPNOTSUPP; + + /* + * setup SW LBR filter + */ + intel_pmu_setup_sw_lbr_filter(event); + + /* + * setup HW LBR filter, if any + */ + if (x86_pmu.lbr_sel_map) + ret = intel_pmu_setup_hw_lbr_filter(event); + + return ret; +} + +/* + * return the type of control flow change at address "from" + * intruction is not necessarily a branch (in case of interrupt). + * + * The branch type returned also includes the priv level of the + * target of the control flow change (X86_BR_USER, X86_BR_KERNEL). + * + * If a branch type is unknown OR the instruction cannot be + * decoded (e.g., text page not present), then X86_BR_NONE is + * returned. + */ +static int branch_type(unsigned long from, unsigned long to, int abort) +{ + struct insn insn; + void *addr; + int bytes, size = MAX_INSN_SIZE; + int ret = X86_BR_NONE; + int ext, to_plm, from_plm; + u8 buf[MAX_INSN_SIZE]; + int is64 = 0; + + to_plm = kernel_ip(to) ? X86_BR_KERNEL : X86_BR_USER; + from_plm = kernel_ip(from) ? X86_BR_KERNEL : X86_BR_USER; + + /* + * maybe zero if lbr did not fill up after a reset by the time + * we get a PMU interrupt + */ + if (from == 0 || to == 0) + return X86_BR_NONE; + + if (abort) + return X86_BR_ABORT | to_plm; + + if (from_plm == X86_BR_USER) { + /* + * can happen if measuring at the user level only + * and we interrupt in a kernel thread, e.g., idle. + */ + if (!current->mm) + return X86_BR_NONE; + + /* may fail if text not present */ + bytes = copy_from_user_nmi(buf, (void __user *)from, size); + if (bytes != 0) + return X86_BR_NONE; + + addr = buf; + } else { + /* + * The LBR logs any address in the IP, even if the IP just + * faulted. This means userspace can control the from address. + * Ensure we don't blindy read any address by validating it is + * a known text address. + */ + if (kernel_text_address(from)) + addr = (void *)from; + else + return X86_BR_NONE; + } + + /* + * decoder needs to know the ABI especially + * on 64-bit systems running 32-bit apps + */ +#ifdef CONFIG_X86_64 + is64 = kernel_ip((unsigned long)addr) || !test_thread_flag(TIF_IA32); +#endif + insn_init(&insn, addr, is64); + insn_get_opcode(&insn); + + switch (insn.opcode.bytes[0]) { + case 0xf: + switch (insn.opcode.bytes[1]) { + case 0x05: /* syscall */ + case 0x34: /* sysenter */ + ret = X86_BR_SYSCALL; + break; + case 0x07: /* sysret */ + case 0x35: /* sysexit */ + ret = X86_BR_SYSRET; + break; + case 0x80 ... 0x8f: /* conditional */ + ret = X86_BR_JCC; + break; + default: + ret = X86_BR_NONE; + } + break; + case 0x70 ... 0x7f: /* conditional */ + ret = X86_BR_JCC; + break; + case 0xc2: /* near ret */ + case 0xc3: /* near ret */ + case 0xca: /* far ret */ + case 0xcb: /* far ret */ + ret = X86_BR_RET; + break; + case 0xcf: /* iret */ + ret = X86_BR_IRET; + break; + case 0xcc ... 0xce: /* int */ + ret = X86_BR_INT; + break; + case 0xe8: /* call near rel */ + case 0x9a: /* call far absolute */ + ret = X86_BR_CALL; + break; + case 0xe0 ... 0xe3: /* loop jmp */ + ret = X86_BR_JCC; + break; + case 0xe9 ... 0xeb: /* jmp */ + ret = X86_BR_JMP; + break; + case 0xff: /* call near absolute, call far absolute ind */ + insn_get_modrm(&insn); + ext = (insn.modrm.bytes[0] >> 3) & 0x7; + switch (ext) { + case 2: /* near ind call */ + case 3: /* far ind call */ + ret = X86_BR_IND_CALL; + break; + case 4: + case 5: + ret = X86_BR_JMP; + break; + } + break; + default: + ret = X86_BR_NONE; + } + /* + * interrupts, traps, faults (and thus ring transition) may + * occur on any instructions. Thus, to classify them correctly, + * we need to first look at the from and to priv levels. If they + * are different and to is in the kernel, then it indicates + * a ring transition. If the from instruction is not a ring + * transition instr (syscall, systenter, int), then it means + * it was a irq, trap or fault. + * + * we have no way of detecting kernel to kernel faults. + */ + if (from_plm == X86_BR_USER && to_plm == X86_BR_KERNEL + && ret != X86_BR_SYSCALL && ret != X86_BR_INT) + ret = X86_BR_IRQ; + + /* + * branch priv level determined by target as + * is done by HW when LBR_SELECT is implemented + */ + if (ret != X86_BR_NONE) + ret |= to_plm; + + return ret; +} + +/* + * implement actual branch filter based on user demand. + * Hardware may not exactly satisfy that request, thus + * we need to inspect opcodes. Mismatched branches are + * discarded. Therefore, the number of branches returned + * in PERF_SAMPLE_BRANCH_STACK sample may vary. + */ +static void +intel_pmu_lbr_filter(struct cpu_hw_events *cpuc) +{ + u64 from, to; + int br_sel = cpuc->br_sel; + int i, j, type; + bool compress = false; + + /* if sampling all branches, then nothing to filter */ + if ((br_sel & X86_BR_ALL) == X86_BR_ALL) + return; + + for (i = 0; i < cpuc->lbr_stack.nr; i++) { + + from = cpuc->lbr_entries[i].from; + to = cpuc->lbr_entries[i].to; + + type = branch_type(from, to, cpuc->lbr_entries[i].abort); + if (type != X86_BR_NONE && (br_sel & X86_BR_ANYTX)) { + if (cpuc->lbr_entries[i].in_tx) + type |= X86_BR_IN_TX; + else + type |= X86_BR_NO_TX; + } + + /* if type does not correspond, then discard */ + if (type == X86_BR_NONE || (br_sel & type) != type) { + cpuc->lbr_entries[i].from = 0; + compress = true; + } + } + + if (!compress) + return; + + /* remove all entries with from=0 */ + for (i = 0; i < cpuc->lbr_stack.nr; ) { + if (!cpuc->lbr_entries[i].from) { + j = i; + while (++j < cpuc->lbr_stack.nr) + cpuc->lbr_entries[j-1] = cpuc->lbr_entries[j]; + cpuc->lbr_stack.nr--; + if (!cpuc->lbr_entries[i].from) + continue; + } + i++; + } +} + +/* + * Map interface branch filters onto LBR filters + */ +static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = { + [PERF_SAMPLE_BRANCH_ANY] = LBR_ANY, + [PERF_SAMPLE_BRANCH_USER] = LBR_USER, + [PERF_SAMPLE_BRANCH_KERNEL] = LBR_KERNEL, + [PERF_SAMPLE_BRANCH_HV] = LBR_IGN, + [PERF_SAMPLE_BRANCH_ANY_RETURN] = LBR_RETURN | LBR_REL_JMP + | LBR_IND_JMP | LBR_FAR, + /* + * NHM/WSM erratum: must include REL_JMP+IND_JMP to get CALL branches + */ + [PERF_SAMPLE_BRANCH_ANY_CALL] = + LBR_REL_CALL | LBR_IND_CALL | LBR_REL_JMP | LBR_IND_JMP | LBR_FAR, + /* + * NHM/WSM erratum: must include IND_JMP to capture IND_CALL + */ + [PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL | LBR_IND_JMP, + [PERF_SAMPLE_BRANCH_COND] = LBR_JCC, +}; + +static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = { + [PERF_SAMPLE_BRANCH_ANY] = LBR_ANY, + [PERF_SAMPLE_BRANCH_USER] = LBR_USER, + [PERF_SAMPLE_BRANCH_KERNEL] = LBR_KERNEL, + [PERF_SAMPLE_BRANCH_HV] = LBR_IGN, + [PERF_SAMPLE_BRANCH_ANY_RETURN] = LBR_RETURN | LBR_FAR, + [PERF_SAMPLE_BRANCH_ANY_CALL] = LBR_REL_CALL | LBR_IND_CALL + | LBR_FAR, + [PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL, + [PERF_SAMPLE_BRANCH_COND] = LBR_JCC, +}; + +/* core */ +void intel_pmu_lbr_init_core(void) { x86_pmu.lbr_nr = 4; - x86_pmu.lbr_tos = 0x01c9; - x86_pmu.lbr_from = 0x40; - x86_pmu.lbr_to = 0x60; + x86_pmu.lbr_tos = MSR_LBR_TOS; + x86_pmu.lbr_from = MSR_LBR_CORE_FROM; + x86_pmu.lbr_to = MSR_LBR_CORE_TO; + + /* + * SW branch filter usage: + * - compensate for lack of HW filter + */ + pr_cont("4-deep LBR, "); } -static void intel_pmu_lbr_init_nhm(void) +/* nehalem/westmere */ +void intel_pmu_lbr_init_nhm(void) { x86_pmu.lbr_nr = 16; - x86_pmu.lbr_tos = 0x01c9; - x86_pmu.lbr_from = 0x680; - x86_pmu.lbr_to = 0x6c0; + x86_pmu.lbr_tos = MSR_LBR_TOS; + x86_pmu.lbr_from = MSR_LBR_NHM_FROM; + x86_pmu.lbr_to = MSR_LBR_NHM_TO; + + x86_pmu.lbr_sel_mask = LBR_SEL_MASK; + x86_pmu.lbr_sel_map = nhm_lbr_sel_map; + + /* + * SW branch filter usage: + * - workaround LBR_SEL errata (see above) + * - support syscall, sysret capture. + * That requires LBR_FAR but that means far + * jmp need to be filtered out + */ + pr_cont("16-deep LBR, "); } -static void intel_pmu_lbr_init_atom(void) +/* sandy bridge */ +void intel_pmu_lbr_init_snb(void) { - x86_pmu.lbr_nr = 8; - x86_pmu.lbr_tos = 0x01c9; - x86_pmu.lbr_from = 0x40; - x86_pmu.lbr_to = 0x60; + x86_pmu.lbr_nr = 16; + x86_pmu.lbr_tos = MSR_LBR_TOS; + x86_pmu.lbr_from = MSR_LBR_NHM_FROM; + x86_pmu.lbr_to = MSR_LBR_NHM_TO; + + x86_pmu.lbr_sel_mask = LBR_SEL_MASK; + x86_pmu.lbr_sel_map = snb_lbr_sel_map; + + /* + * SW branch filter usage: + * - support syscall, sysret capture. + * That requires LBR_FAR but that means far + * jmp need to be filtered out + */ + pr_cont("16-deep LBR, "); } -#endif /* CONFIG_CPU_SUP_INTEL */ +/* atom */ +void intel_pmu_lbr_init_atom(void) +{ + /* + * only models starting at stepping 10 seems + * to have an operational LBR which can freeze + * on PMU interrupt + */ + if (boot_cpu_data.x86_model == 28 + && boot_cpu_data.x86_mask < 10) { + pr_cont("LBR disabled due to erratum"); + return; + } + + x86_pmu.lbr_nr = 8; + x86_pmu.lbr_tos = MSR_LBR_TOS; + x86_pmu.lbr_from = MSR_LBR_CORE_FROM; + x86_pmu.lbr_to = MSR_LBR_CORE_TO; + + /* + * SW branch filter usage: + * - compensate for lack of HW filter + */ + pr_cont("8-deep LBR, "); +} diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c new file mode 100644 index 00000000000..619f7699487 --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c @@ -0,0 +1,714 @@ +/* + * perf_event_intel_rapl.c: support Intel RAPL energy consumption counters + * Copyright (C) 2013 Google, Inc., Stephane Eranian + * + * Intel RAPL interface is specified in the IA-32 Manual Vol3b + * section 14.7.1 (September 2013) + * + * RAPL provides more controls than just reporting energy consumption + * however here we only expose the 3 energy consumption free running + * counters (pp0, pkg, dram). + * + * Each of those counters increments in a power unit defined by the + * RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules + * but it can vary. + * + * Counter to rapl events mappings: + * + * pp0 counter: consumption of all physical cores (power plane 0) + * event: rapl_energy_cores + * perf code: 0x1 + * + * pkg counter: consumption of the whole processor package + * event: rapl_energy_pkg + * perf code: 0x2 + * + * dram counter: consumption of the dram domain (servers only) + * event: rapl_energy_dram + * perf code: 0x3 + * + * dram counter: consumption of the builtin-gpu domain (client only) + * event: rapl_energy_gpu + * perf code: 0x4 + * + * We manage those counters as free running (read-only). They may be + * use simultaneously by other tools, such as turbostat. + * + * The events only support system-wide mode counting. There is no + * sampling support because it does not make sense and is not + * supported by the RAPL hardware. + * + * Because we want to avoid floating-point operations in the kernel, + * the events are all reported in fixed point arithmetic (32.32). + * Tools must adjust the counts to convert them to Watts using + * the duration of the measurement. Tools may use a function such as + * ldexp(raw_count, -32); + */ +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/perf_event.h> +#include <asm/cpu_device_id.h> +#include "perf_event.h" + +/* + * RAPL energy status counters + */ +#define RAPL_IDX_PP0_NRG_STAT 0 /* all cores */ +#define INTEL_RAPL_PP0 0x1 /* pseudo-encoding */ +#define RAPL_IDX_PKG_NRG_STAT 1 /* entire package */ +#define INTEL_RAPL_PKG 0x2 /* pseudo-encoding */ +#define RAPL_IDX_RAM_NRG_STAT 2 /* DRAM */ +#define INTEL_RAPL_RAM 0x3 /* pseudo-encoding */ +#define RAPL_IDX_PP1_NRG_STAT 3 /* gpu */ +#define INTEL_RAPL_PP1 0x4 /* pseudo-encoding */ + +/* Clients have PP0, PKG */ +#define RAPL_IDX_CLN (1<<RAPL_IDX_PP0_NRG_STAT|\ + 1<<RAPL_IDX_PKG_NRG_STAT|\ + 1<<RAPL_IDX_PP1_NRG_STAT) + +/* Servers have PP0, PKG, RAM */ +#define RAPL_IDX_SRV (1<<RAPL_IDX_PP0_NRG_STAT|\ + 1<<RAPL_IDX_PKG_NRG_STAT|\ + 1<<RAPL_IDX_RAM_NRG_STAT) + +/* Servers have PP0, PKG, RAM, PP1 */ +#define RAPL_IDX_HSW (1<<RAPL_IDX_PP0_NRG_STAT|\ + 1<<RAPL_IDX_PKG_NRG_STAT|\ + 1<<RAPL_IDX_RAM_NRG_STAT|\ + 1<<RAPL_IDX_PP1_NRG_STAT) + +/* + * event code: LSB 8 bits, passed in attr->config + * any other bit is reserved + */ +#define RAPL_EVENT_MASK 0xFFULL + +#define DEFINE_RAPL_FORMAT_ATTR(_var, _name, _format) \ +static ssize_t __rapl_##_var##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + char *page) \ +{ \ + BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ + return sprintf(page, _format "\n"); \ +} \ +static struct kobj_attribute format_attr_##_var = \ + __ATTR(_name, 0444, __rapl_##_var##_show, NULL) + +#define RAPL_EVENT_DESC(_name, _config) \ +{ \ + .attr = __ATTR(_name, 0444, rapl_event_show, NULL), \ + .config = _config, \ +} + +#define RAPL_CNTR_WIDTH 32 /* 32-bit rapl counters */ + +struct rapl_pmu { + spinlock_t lock; + int hw_unit; /* 1/2^hw_unit Joule */ + int n_active; /* number of active events */ + struct list_head active_list; + struct pmu *pmu; /* pointer to rapl_pmu_class */ + ktime_t timer_interval; /* in ktime_t unit */ + struct hrtimer hrtimer; +}; + +static struct pmu rapl_pmu_class; +static cpumask_t rapl_cpu_mask; +static int rapl_cntr_mask; + +static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu); +static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu_to_free); + +static inline u64 rapl_read_counter(struct perf_event *event) +{ + u64 raw; + rdmsrl(event->hw.event_base, raw); + return raw; +} + +static inline u64 rapl_scale(u64 v) +{ + /* + * scale delta to smallest unit (1/2^32) + * users must then scale back: count * 1/(1e9*2^32) to get Joules + * or use ldexp(count, -32). + * Watts = Joules/Time delta + */ + return v << (32 - __get_cpu_var(rapl_pmu)->hw_unit); +} + +static u64 rapl_event_update(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 prev_raw_count, new_raw_count; + s64 delta, sdelta; + int shift = RAPL_CNTR_WIDTH; + +again: + prev_raw_count = local64_read(&hwc->prev_count); + rdmsrl(event->hw.event_base, new_raw_count); + + if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, + new_raw_count) != prev_raw_count) { + cpu_relax(); + goto again; + } + + /* + * Now we have the new raw value and have updated the prev + * timestamp already. We can now calculate the elapsed delta + * (event-)time and add that to the generic event. + * + * Careful, not all hw sign-extends above the physical width + * of the count. + */ + delta = (new_raw_count << shift) - (prev_raw_count << shift); + delta >>= shift; + + sdelta = rapl_scale(delta); + + local64_add(sdelta, &event->count); + + return new_raw_count; +} + +static void rapl_start_hrtimer(struct rapl_pmu *pmu) +{ + __hrtimer_start_range_ns(&pmu->hrtimer, + pmu->timer_interval, 0, + HRTIMER_MODE_REL_PINNED, 0); +} + +static void rapl_stop_hrtimer(struct rapl_pmu *pmu) +{ + hrtimer_cancel(&pmu->hrtimer); +} + +static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer) +{ + struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu); + struct perf_event *event; + unsigned long flags; + + if (!pmu->n_active) + return HRTIMER_NORESTART; + + spin_lock_irqsave(&pmu->lock, flags); + + list_for_each_entry(event, &pmu->active_list, active_entry) { + rapl_event_update(event); + } + + spin_unlock_irqrestore(&pmu->lock, flags); + + hrtimer_forward_now(hrtimer, pmu->timer_interval); + + return HRTIMER_RESTART; +} + +static void rapl_hrtimer_init(struct rapl_pmu *pmu) +{ + struct hrtimer *hr = &pmu->hrtimer; + + hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hr->function = rapl_hrtimer_handle; +} + +static void __rapl_pmu_event_start(struct rapl_pmu *pmu, + struct perf_event *event) +{ + if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) + return; + + event->hw.state = 0; + + list_add_tail(&event->active_entry, &pmu->active_list); + + local64_set(&event->hw.prev_count, rapl_read_counter(event)); + + pmu->n_active++; + if (pmu->n_active == 1) + rapl_start_hrtimer(pmu); +} + +static void rapl_pmu_event_start(struct perf_event *event, int mode) +{ + struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu); + unsigned long flags; + + spin_lock_irqsave(&pmu->lock, flags); + __rapl_pmu_event_start(pmu, event); + spin_unlock_irqrestore(&pmu->lock, flags); +} + +static void rapl_pmu_event_stop(struct perf_event *event, int mode) +{ + struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu); + struct hw_perf_event *hwc = &event->hw; + unsigned long flags; + + spin_lock_irqsave(&pmu->lock, flags); + + /* mark event as deactivated and stopped */ + if (!(hwc->state & PERF_HES_STOPPED)) { + WARN_ON_ONCE(pmu->n_active <= 0); + pmu->n_active--; + if (pmu->n_active == 0) + rapl_stop_hrtimer(pmu); + + list_del(&event->active_entry); + + WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); + hwc->state |= PERF_HES_STOPPED; + } + + /* check if update of sw counter is necessary */ + if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { + /* + * Drain the remaining delta count out of a event + * that we are disabling: + */ + rapl_event_update(event); + hwc->state |= PERF_HES_UPTODATE; + } + + spin_unlock_irqrestore(&pmu->lock, flags); +} + +static int rapl_pmu_event_add(struct perf_event *event, int mode) +{ + struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu); + struct hw_perf_event *hwc = &event->hw; + unsigned long flags; + + spin_lock_irqsave(&pmu->lock, flags); + + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + + if (mode & PERF_EF_START) + __rapl_pmu_event_start(pmu, event); + + spin_unlock_irqrestore(&pmu->lock, flags); + + return 0; +} + +static void rapl_pmu_event_del(struct perf_event *event, int flags) +{ + rapl_pmu_event_stop(event, PERF_EF_UPDATE); +} + +static int rapl_pmu_event_init(struct perf_event *event) +{ + u64 cfg = event->attr.config & RAPL_EVENT_MASK; + int bit, msr, ret = 0; + + /* only look at RAPL events */ + if (event->attr.type != rapl_pmu_class.type) + return -ENOENT; + + /* check only supported bits are set */ + if (event->attr.config & ~RAPL_EVENT_MASK) + return -EINVAL; + + /* + * check event is known (determines counter) + */ + switch (cfg) { + case INTEL_RAPL_PP0: + bit = RAPL_IDX_PP0_NRG_STAT; + msr = MSR_PP0_ENERGY_STATUS; + break; + case INTEL_RAPL_PKG: + bit = RAPL_IDX_PKG_NRG_STAT; + msr = MSR_PKG_ENERGY_STATUS; + break; + case INTEL_RAPL_RAM: + bit = RAPL_IDX_RAM_NRG_STAT; + msr = MSR_DRAM_ENERGY_STATUS; + break; + case INTEL_RAPL_PP1: + bit = RAPL_IDX_PP1_NRG_STAT; + msr = MSR_PP1_ENERGY_STATUS; + break; + default: + return -EINVAL; + } + /* check event supported */ + if (!(rapl_cntr_mask & (1 << bit))) + return -EINVAL; + + /* unsupported modes and filters */ + if (event->attr.exclude_user || + event->attr.exclude_kernel || + event->attr.exclude_hv || + event->attr.exclude_idle || + event->attr.exclude_host || + event->attr.exclude_guest || + event->attr.sample_period) /* no sampling */ + return -EINVAL; + + /* must be done before validate_group */ + event->hw.event_base = msr; + event->hw.config = cfg; + event->hw.idx = bit; + + return ret; +} + +static void rapl_pmu_event_read(struct perf_event *event) +{ + rapl_event_update(event); +} + +static ssize_t rapl_get_attr_cpumask(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int n = cpulist_scnprintf(buf, PAGE_SIZE - 2, &rapl_cpu_mask); + + buf[n++] = '\n'; + buf[n] = '\0'; + return n; +} + +static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL); + +static struct attribute *rapl_pmu_attrs[] = { + &dev_attr_cpumask.attr, + NULL, +}; + +static struct attribute_group rapl_pmu_attr_group = { + .attrs = rapl_pmu_attrs, +}; + +EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01"); +EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02"); +EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03"); +EVENT_ATTR_STR(energy-gpu , rapl_gpu, "event=0x04"); + +EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules"); +EVENT_ATTR_STR(energy-pkg.unit , rapl_pkg_unit, "Joules"); +EVENT_ATTR_STR(energy-ram.unit , rapl_ram_unit, "Joules"); +EVENT_ATTR_STR(energy-gpu.unit , rapl_gpu_unit, "Joules"); + +/* + * we compute in 0.23 nJ increments regardless of MSR + */ +EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10"); +EVENT_ATTR_STR(energy-pkg.scale, rapl_pkg_scale, "2.3283064365386962890625e-10"); +EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10"); +EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10"); + +static struct attribute *rapl_events_srv_attr[] = { + EVENT_PTR(rapl_cores), + EVENT_PTR(rapl_pkg), + EVENT_PTR(rapl_ram), + + EVENT_PTR(rapl_cores_unit), + EVENT_PTR(rapl_pkg_unit), + EVENT_PTR(rapl_ram_unit), + + EVENT_PTR(rapl_cores_scale), + EVENT_PTR(rapl_pkg_scale), + EVENT_PTR(rapl_ram_scale), + NULL, +}; + +static struct attribute *rapl_events_cln_attr[] = { + EVENT_PTR(rapl_cores), + EVENT_PTR(rapl_pkg), + EVENT_PTR(rapl_gpu), + + EVENT_PTR(rapl_cores_unit), + EVENT_PTR(rapl_pkg_unit), + EVENT_PTR(rapl_gpu_unit), + + EVENT_PTR(rapl_cores_scale), + EVENT_PTR(rapl_pkg_scale), + EVENT_PTR(rapl_gpu_scale), + NULL, +}; + +static struct attribute *rapl_events_hsw_attr[] = { + EVENT_PTR(rapl_cores), + EVENT_PTR(rapl_pkg), + EVENT_PTR(rapl_gpu), + EVENT_PTR(rapl_ram), + + EVENT_PTR(rapl_cores_unit), + EVENT_PTR(rapl_pkg_unit), + EVENT_PTR(rapl_gpu_unit), + EVENT_PTR(rapl_ram_unit), + + EVENT_PTR(rapl_cores_scale), + EVENT_PTR(rapl_pkg_scale), + EVENT_PTR(rapl_gpu_scale), + EVENT_PTR(rapl_ram_scale), + NULL, +}; + +static struct attribute_group rapl_pmu_events_group = { + .name = "events", + .attrs = NULL, /* patched at runtime */ +}; + +DEFINE_RAPL_FORMAT_ATTR(event, event, "config:0-7"); +static struct attribute *rapl_formats_attr[] = { + &format_attr_event.attr, + NULL, +}; + +static struct attribute_group rapl_pmu_format_group = { + .name = "format", + .attrs = rapl_formats_attr, +}; + +const struct attribute_group *rapl_attr_groups[] = { + &rapl_pmu_attr_group, + &rapl_pmu_format_group, + &rapl_pmu_events_group, + NULL, +}; + +static struct pmu rapl_pmu_class = { + .attr_groups = rapl_attr_groups, + .task_ctx_nr = perf_invalid_context, /* system-wide only */ + .event_init = rapl_pmu_event_init, + .add = rapl_pmu_event_add, /* must have */ + .del = rapl_pmu_event_del, /* must have */ + .start = rapl_pmu_event_start, + .stop = rapl_pmu_event_stop, + .read = rapl_pmu_event_read, +}; + +static void rapl_cpu_exit(int cpu) +{ + struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); + int i, phys_id = topology_physical_package_id(cpu); + int target = -1; + + /* find a new cpu on same package */ + for_each_online_cpu(i) { + if (i == cpu) + continue; + if (phys_id == topology_physical_package_id(i)) { + target = i; + break; + } + } + /* + * clear cpu from cpumask + * if was set in cpumask and still some cpu on package, + * then move to new cpu + */ + if (cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask) && target >= 0) + cpumask_set_cpu(target, &rapl_cpu_mask); + + WARN_ON(cpumask_empty(&rapl_cpu_mask)); + /* + * migrate events and context to new cpu + */ + if (target >= 0) + perf_pmu_migrate_context(pmu->pmu, cpu, target); + + /* cancel overflow polling timer for CPU */ + rapl_stop_hrtimer(pmu); +} + +static void rapl_cpu_init(int cpu) +{ + int i, phys_id = topology_physical_package_id(cpu); + + /* check if phys_is is already covered */ + for_each_cpu(i, &rapl_cpu_mask) { + if (phys_id == topology_physical_package_id(i)) + return; + } + /* was not found, so add it */ + cpumask_set_cpu(cpu, &rapl_cpu_mask); +} + +static int rapl_cpu_prepare(int cpu) +{ + struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); + int phys_id = topology_physical_package_id(cpu); + u64 ms; + u64 msr_rapl_power_unit_bits; + + if (pmu) + return 0; + + if (phys_id < 0) + return -1; + + /* protect rdmsrl() to handle virtualization */ + if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits)) + return -1; + + pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu)); + if (!pmu) + return -1; + + spin_lock_init(&pmu->lock); + + INIT_LIST_HEAD(&pmu->active_list); + + /* + * grab power unit as: 1/2^unit Joules + * + * we cache in local PMU instance + */ + pmu->hw_unit = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; + pmu->pmu = &rapl_pmu_class; + + /* + * use reference of 200W for scaling the timeout + * to avoid missing counter overflows. + * 200W = 200 Joules/sec + * divide interval by 2 to avoid lockstep (2 * 100) + * if hw unit is 32, then we use 2 ms 1/200/2 + */ + if (pmu->hw_unit < 32) + ms = (1000 / (2 * 100)) * (1ULL << (32 - pmu->hw_unit - 1)); + else + ms = 2; + + pmu->timer_interval = ms_to_ktime(ms); + + rapl_hrtimer_init(pmu); + + /* set RAPL pmu for this cpu for now */ + per_cpu(rapl_pmu, cpu) = pmu; + per_cpu(rapl_pmu_to_free, cpu) = NULL; + + return 0; +} + +static void rapl_cpu_kfree(int cpu) +{ + struct rapl_pmu *pmu = per_cpu(rapl_pmu_to_free, cpu); + + kfree(pmu); + + per_cpu(rapl_pmu_to_free, cpu) = NULL; +} + +static int rapl_cpu_dying(int cpu) +{ + struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); + + if (!pmu) + return 0; + + per_cpu(rapl_pmu, cpu) = NULL; + + per_cpu(rapl_pmu_to_free, cpu) = pmu; + + return 0; +} + +static int rapl_cpu_notifier(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (long)hcpu; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_UP_PREPARE: + rapl_cpu_prepare(cpu); + break; + case CPU_STARTING: + rapl_cpu_init(cpu); + break; + case CPU_UP_CANCELED: + case CPU_DYING: + rapl_cpu_dying(cpu); + break; + case CPU_ONLINE: + case CPU_DEAD: + rapl_cpu_kfree(cpu); + break; + case CPU_DOWN_PREPARE: + rapl_cpu_exit(cpu); + break; + default: + break; + } + + return NOTIFY_OK; +} + +static const struct x86_cpu_id rapl_cpu_match[] = { + [0] = { .vendor = X86_VENDOR_INTEL, .family = 6 }, + [1] = {}, +}; + +static int __init rapl_pmu_init(void) +{ + struct rapl_pmu *pmu; + int cpu, ret; + + /* + * check for Intel processor family 6 + */ + if (!x86_match_cpu(rapl_cpu_match)) + return 0; + + /* check supported CPU */ + switch (boot_cpu_data.x86_model) { + case 42: /* Sandy Bridge */ + case 58: /* Ivy Bridge */ + rapl_cntr_mask = RAPL_IDX_CLN; + rapl_pmu_events_group.attrs = rapl_events_cln_attr; + break; + case 60: /* Haswell */ + case 69: /* Haswell-Celeron */ + rapl_cntr_mask = RAPL_IDX_HSW; + rapl_pmu_events_group.attrs = rapl_events_hsw_attr; + break; + case 45: /* Sandy Bridge-EP */ + case 62: /* IvyTown */ + rapl_cntr_mask = RAPL_IDX_SRV; + rapl_pmu_events_group.attrs = rapl_events_srv_attr; + break; + + default: + /* unsupported */ + return 0; + } + + cpu_notifier_register_begin(); + + for_each_online_cpu(cpu) { + ret = rapl_cpu_prepare(cpu); + if (ret) + goto out; + rapl_cpu_init(cpu); + } + + __perf_cpu_notifier(rapl_cpu_notifier); + + ret = perf_pmu_register(&rapl_pmu_class, "power", -1); + if (WARN_ON(ret)) { + pr_info("RAPL PMU detected, registration failed (%d), RAPL PMU disabled\n", ret); + cpu_notifier_register_done(); + return -1; + } + + pmu = __get_cpu_var(rapl_pmu); + + pr_info("RAPL PMU detected, hw unit 2^-%d Joules," + " API unit is 2^-32 Joules," + " %d fixed counters" + " %llu ms ovfl timer\n", + pmu->hw_unit, + hweight32(rapl_cntr_mask), + ktime_to_ms(pmu->timer_interval)); + +out: + cpu_notifier_register_done(); + + return 0; +} +device_initcall(rapl_pmu_init); diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c new file mode 100644 index 00000000000..ae6552a0701 --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -0,0 +1,4298 @@ +#include "perf_event_intel_uncore.h" + +static struct intel_uncore_type *empty_uncore[] = { NULL, }; +static struct intel_uncore_type **msr_uncores = empty_uncore; +static struct intel_uncore_type **pci_uncores = empty_uncore; +/* pci bus to socket mapping */ +static int pcibus_to_physid[256] = { [0 ... 255] = -1, }; + +static struct pci_dev *extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX]; + +static DEFINE_RAW_SPINLOCK(uncore_box_lock); + +/* mask of cpus that collect uncore events */ +static cpumask_t uncore_cpu_mask; + +/* constraint for the fixed counter */ +static struct event_constraint constraint_fixed = + EVENT_CONSTRAINT(~0ULL, 1 << UNCORE_PMC_IDX_FIXED, ~0ULL); +static struct event_constraint constraint_empty = + EVENT_CONSTRAINT(0, 0, 0); + +#define __BITS_VALUE(x, i, n) ((typeof(x))(((x) >> ((i) * (n))) & \ + ((1ULL << (n)) - 1))) + +DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7"); +DEFINE_UNCORE_FORMAT_ATTR(event_ext, event, "config:0-7,21"); +DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); +DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18"); +DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19"); +DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23"); +DEFINE_UNCORE_FORMAT_ATTR(cmask5, cmask, "config:24-28"); +DEFINE_UNCORE_FORMAT_ATTR(cmask8, cmask, "config:24-31"); +DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31"); +DEFINE_UNCORE_FORMAT_ATTR(thresh5, thresh, "config:24-28"); +DEFINE_UNCORE_FORMAT_ATTR(occ_sel, occ_sel, "config:14-15"); +DEFINE_UNCORE_FORMAT_ATTR(occ_invert, occ_invert, "config:30"); +DEFINE_UNCORE_FORMAT_ATTR(occ_edge, occ_edge, "config:14-51"); +DEFINE_UNCORE_FORMAT_ATTR(filter_tid, filter_tid, "config1:0-4"); +DEFINE_UNCORE_FORMAT_ATTR(filter_link, filter_link, "config1:5-8"); +DEFINE_UNCORE_FORMAT_ATTR(filter_nid, filter_nid, "config1:10-17"); +DEFINE_UNCORE_FORMAT_ATTR(filter_nid2, filter_nid, "config1:32-47"); +DEFINE_UNCORE_FORMAT_ATTR(filter_state, filter_state, "config1:18-22"); +DEFINE_UNCORE_FORMAT_ATTR(filter_state2, filter_state, "config1:17-22"); +DEFINE_UNCORE_FORMAT_ATTR(filter_opc, filter_opc, "config1:23-31"); +DEFINE_UNCORE_FORMAT_ATTR(filter_opc2, filter_opc, "config1:52-60"); +DEFINE_UNCORE_FORMAT_ATTR(filter_band0, filter_band0, "config1:0-7"); +DEFINE_UNCORE_FORMAT_ATTR(filter_band1, filter_band1, "config1:8-15"); +DEFINE_UNCORE_FORMAT_ATTR(filter_band2, filter_band2, "config1:16-23"); +DEFINE_UNCORE_FORMAT_ATTR(filter_band3, filter_band3, "config1:24-31"); +DEFINE_UNCORE_FORMAT_ATTR(match_rds, match_rds, "config1:48-51"); +DEFINE_UNCORE_FORMAT_ATTR(match_rnid30, match_rnid30, "config1:32-35"); +DEFINE_UNCORE_FORMAT_ATTR(match_rnid4, match_rnid4, "config1:31"); +DEFINE_UNCORE_FORMAT_ATTR(match_dnid, match_dnid, "config1:13-17"); +DEFINE_UNCORE_FORMAT_ATTR(match_mc, match_mc, "config1:9-12"); +DEFINE_UNCORE_FORMAT_ATTR(match_opc, match_opc, "config1:5-8"); +DEFINE_UNCORE_FORMAT_ATTR(match_vnw, match_vnw, "config1:3-4"); +DEFINE_UNCORE_FORMAT_ATTR(match0, match0, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(match1, match1, "config1:32-63"); +DEFINE_UNCORE_FORMAT_ATTR(mask_rds, mask_rds, "config2:48-51"); +DEFINE_UNCORE_FORMAT_ATTR(mask_rnid30, mask_rnid30, "config2:32-35"); +DEFINE_UNCORE_FORMAT_ATTR(mask_rnid4, mask_rnid4, "config2:31"); +DEFINE_UNCORE_FORMAT_ATTR(mask_dnid, mask_dnid, "config2:13-17"); +DEFINE_UNCORE_FORMAT_ATTR(mask_mc, mask_mc, "config2:9-12"); +DEFINE_UNCORE_FORMAT_ATTR(mask_opc, mask_opc, "config2:5-8"); +DEFINE_UNCORE_FORMAT_ATTR(mask_vnw, mask_vnw, "config2:3-4"); +DEFINE_UNCORE_FORMAT_ATTR(mask0, mask0, "config2:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(mask1, mask1, "config2:32-63"); + +static void uncore_pmu_start_hrtimer(struct intel_uncore_box *box); +static void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box); +static void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event); +static void uncore_pmu_event_read(struct perf_event *event); + +static struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event) +{ + return container_of(event->pmu, struct intel_uncore_pmu, pmu); +} + +static struct intel_uncore_box * +uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu) +{ + struct intel_uncore_box *box; + + box = *per_cpu_ptr(pmu->box, cpu); + if (box) + return box; + + raw_spin_lock(&uncore_box_lock); + list_for_each_entry(box, &pmu->box_list, list) { + if (box->phys_id == topology_physical_package_id(cpu)) { + atomic_inc(&box->refcnt); + *per_cpu_ptr(pmu->box, cpu) = box; + break; + } + } + raw_spin_unlock(&uncore_box_lock); + + return *per_cpu_ptr(pmu->box, cpu); +} + +static struct intel_uncore_box *uncore_event_to_box(struct perf_event *event) +{ + /* + * perf core schedules event on the basis of cpu, uncore events are + * collected by one of the cpus inside a physical package. + */ + return uncore_pmu_to_box(uncore_event_to_pmu(event), smp_processor_id()); +} + +static u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event) +{ + u64 count; + + rdmsrl(event->hw.event_base, count); + + return count; +} + +/* + * generic get constraint function for shared match/mask registers. + */ +static struct event_constraint * +uncore_get_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + struct intel_uncore_extra_reg *er; + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct hw_perf_event_extra *reg2 = &event->hw.branch_reg; + unsigned long flags; + bool ok = false; + + /* + * reg->alloc can be set due to existing state, so for fake box we + * need to ignore this, otherwise we might fail to allocate proper + * fake state for this extra reg constraint. + */ + if (reg1->idx == EXTRA_REG_NONE || + (!uncore_box_is_fake(box) && reg1->alloc)) + return NULL; + + er = &box->shared_regs[reg1->idx]; + raw_spin_lock_irqsave(&er->lock, flags); + if (!atomic_read(&er->ref) || + (er->config1 == reg1->config && er->config2 == reg2->config)) { + atomic_inc(&er->ref); + er->config1 = reg1->config; + er->config2 = reg2->config; + ok = true; + } + raw_spin_unlock_irqrestore(&er->lock, flags); + + if (ok) { + if (!uncore_box_is_fake(box)) + reg1->alloc = 1; + return NULL; + } + + return &constraint_empty; +} + +static void uncore_put_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + struct intel_uncore_extra_reg *er; + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + + /* + * Only put constraint if extra reg was actually allocated. Also + * takes care of event which do not use an extra shared reg. + * + * Also, if this is a fake box we shouldn't touch any event state + * (reg->alloc) and we don't care about leaving inconsistent box + * state either since it will be thrown out. + */ + if (uncore_box_is_fake(box) || !reg1->alloc) + return; + + er = &box->shared_regs[reg1->idx]; + atomic_dec(&er->ref); + reg1->alloc = 0; +} + +static u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx) +{ + struct intel_uncore_extra_reg *er; + unsigned long flags; + u64 config; + + er = &box->shared_regs[idx]; + + raw_spin_lock_irqsave(&er->lock, flags); + config = er->config; + raw_spin_unlock_irqrestore(&er->lock, flags); + + return config; +} + +/* Sandy Bridge-EP uncore support */ +static struct intel_uncore_type snbep_uncore_cbox; +static struct intel_uncore_type snbep_uncore_pcu; + +static void snbep_uncore_pci_disable_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + int box_ctl = uncore_pci_box_ctl(box); + u32 config = 0; + + if (!pci_read_config_dword(pdev, box_ctl, &config)) { + config |= SNBEP_PMON_BOX_CTL_FRZ; + pci_write_config_dword(pdev, box_ctl, config); + } +} + +static void snbep_uncore_pci_enable_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + int box_ctl = uncore_pci_box_ctl(box); + u32 config = 0; + + if (!pci_read_config_dword(pdev, box_ctl, &config)) { + config &= ~SNBEP_PMON_BOX_CTL_FRZ; + pci_write_config_dword(pdev, box_ctl, config); + } +} + +static void snbep_uncore_pci_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + + pci_write_config_dword(pdev, hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); +} + +static void snbep_uncore_pci_disable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + + pci_write_config_dword(pdev, hwc->config_base, hwc->config); +} + +static u64 snbep_uncore_pci_read_counter(struct intel_uncore_box *box, struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + u64 count = 0; + + pci_read_config_dword(pdev, hwc->event_base, (u32 *)&count); + pci_read_config_dword(pdev, hwc->event_base + 4, (u32 *)&count + 1); + + return count; +} + +static void snbep_uncore_pci_init_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + + pci_write_config_dword(pdev, SNBEP_PCI_PMON_BOX_CTL, SNBEP_PMON_BOX_CTL_INT); +} + +static void snbep_uncore_msr_disable_box(struct intel_uncore_box *box) +{ + u64 config; + unsigned msr; + + msr = uncore_msr_box_ctl(box); + if (msr) { + rdmsrl(msr, config); + config |= SNBEP_PMON_BOX_CTL_FRZ; + wrmsrl(msr, config); + } +} + +static void snbep_uncore_msr_enable_box(struct intel_uncore_box *box) +{ + u64 config; + unsigned msr; + + msr = uncore_msr_box_ctl(box); + if (msr) { + rdmsrl(msr, config); + config &= ~SNBEP_PMON_BOX_CTL_FRZ; + wrmsrl(msr, config); + } +} + +static void snbep_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + + if (reg1->idx != EXTRA_REG_NONE) + wrmsrl(reg1->reg, uncore_shared_reg_config(box, 0)); + + wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); +} + +static void snbep_uncore_msr_disable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + wrmsrl(hwc->config_base, hwc->config); +} + +static void snbep_uncore_msr_init_box(struct intel_uncore_box *box) +{ + unsigned msr = uncore_msr_box_ctl(box); + + if (msr) + wrmsrl(msr, SNBEP_PMON_BOX_CTL_INT); +} + +static struct attribute *snbep_uncore_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + NULL, +}; + +static struct attribute *snbep_uncore_ubox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh5.attr, + NULL, +}; + +static struct attribute *snbep_uncore_cbox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_tid_en.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + &format_attr_filter_tid.attr, + &format_attr_filter_nid.attr, + &format_attr_filter_state.attr, + &format_attr_filter_opc.attr, + NULL, +}; + +static struct attribute *snbep_uncore_pcu_formats_attr[] = { + &format_attr_event_ext.attr, + &format_attr_occ_sel.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh5.attr, + &format_attr_occ_invert.attr, + &format_attr_occ_edge.attr, + &format_attr_filter_band0.attr, + &format_attr_filter_band1.attr, + &format_attr_filter_band2.attr, + &format_attr_filter_band3.attr, + NULL, +}; + +static struct attribute *snbep_uncore_qpi_formats_attr[] = { + &format_attr_event_ext.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + &format_attr_match_rds.attr, + &format_attr_match_rnid30.attr, + &format_attr_match_rnid4.attr, + &format_attr_match_dnid.attr, + &format_attr_match_mc.attr, + &format_attr_match_opc.attr, + &format_attr_match_vnw.attr, + &format_attr_match0.attr, + &format_attr_match1.attr, + &format_attr_mask_rds.attr, + &format_attr_mask_rnid30.attr, + &format_attr_mask_rnid4.attr, + &format_attr_mask_dnid.attr, + &format_attr_mask_mc.attr, + &format_attr_mask_opc.attr, + &format_attr_mask_vnw.attr, + &format_attr_mask0.attr, + &format_attr_mask1.attr, + NULL, +}; + +static struct uncore_event_desc snbep_uncore_imc_events[] = { + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"), + INTEL_UNCORE_EVENT_DESC(cas_count_read, "event=0x04,umask=0x03"), + INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c"), + { /* end: all zeroes */ }, +}; + +static struct uncore_event_desc snbep_uncore_qpi_events[] = { + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0x14"), + INTEL_UNCORE_EVENT_DESC(txl_flits_active, "event=0x00,umask=0x06"), + INTEL_UNCORE_EVENT_DESC(drs_data, "event=0x102,umask=0x08"), + INTEL_UNCORE_EVENT_DESC(ncb_data, "event=0x103,umask=0x04"), + { /* end: all zeroes */ }, +}; + +static struct attribute_group snbep_uncore_format_group = { + .name = "format", + .attrs = snbep_uncore_formats_attr, +}; + +static struct attribute_group snbep_uncore_ubox_format_group = { + .name = "format", + .attrs = snbep_uncore_ubox_formats_attr, +}; + +static struct attribute_group snbep_uncore_cbox_format_group = { + .name = "format", + .attrs = snbep_uncore_cbox_formats_attr, +}; + +static struct attribute_group snbep_uncore_pcu_format_group = { + .name = "format", + .attrs = snbep_uncore_pcu_formats_attr, +}; + +static struct attribute_group snbep_uncore_qpi_format_group = { + .name = "format", + .attrs = snbep_uncore_qpi_formats_attr, +}; + +#define SNBEP_UNCORE_MSR_OPS_COMMON_INIT() \ + .init_box = snbep_uncore_msr_init_box, \ + .disable_box = snbep_uncore_msr_disable_box, \ + .enable_box = snbep_uncore_msr_enable_box, \ + .disable_event = snbep_uncore_msr_disable_event, \ + .enable_event = snbep_uncore_msr_enable_event, \ + .read_counter = uncore_msr_read_counter + +static struct intel_uncore_ops snbep_uncore_msr_ops = { + SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), +}; + +#define SNBEP_UNCORE_PCI_OPS_COMMON_INIT() \ + .init_box = snbep_uncore_pci_init_box, \ + .disable_box = snbep_uncore_pci_disable_box, \ + .enable_box = snbep_uncore_pci_enable_box, \ + .disable_event = snbep_uncore_pci_disable_event, \ + .read_counter = snbep_uncore_pci_read_counter + +static struct intel_uncore_ops snbep_uncore_pci_ops = { + SNBEP_UNCORE_PCI_OPS_COMMON_INIT(), + .enable_event = snbep_uncore_pci_enable_event, \ +}; + +static struct event_constraint snbep_uncore_cbox_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x01, 0x1), + UNCORE_EVENT_CONSTRAINT(0x02, 0x3), + UNCORE_EVENT_CONSTRAINT(0x04, 0x3), + UNCORE_EVENT_CONSTRAINT(0x05, 0x3), + UNCORE_EVENT_CONSTRAINT(0x07, 0x3), + UNCORE_EVENT_CONSTRAINT(0x09, 0x3), + UNCORE_EVENT_CONSTRAINT(0x11, 0x1), + UNCORE_EVENT_CONSTRAINT(0x12, 0x3), + UNCORE_EVENT_CONSTRAINT(0x13, 0x3), + UNCORE_EVENT_CONSTRAINT(0x1b, 0xc), + UNCORE_EVENT_CONSTRAINT(0x1c, 0xc), + UNCORE_EVENT_CONSTRAINT(0x1d, 0xc), + UNCORE_EVENT_CONSTRAINT(0x1e, 0xc), + EVENT_CONSTRAINT_OVERLAP(0x1f, 0xe, 0xff), + UNCORE_EVENT_CONSTRAINT(0x21, 0x3), + UNCORE_EVENT_CONSTRAINT(0x23, 0x3), + UNCORE_EVENT_CONSTRAINT(0x31, 0x3), + UNCORE_EVENT_CONSTRAINT(0x32, 0x3), + UNCORE_EVENT_CONSTRAINT(0x33, 0x3), + UNCORE_EVENT_CONSTRAINT(0x34, 0x3), + UNCORE_EVENT_CONSTRAINT(0x35, 0x3), + UNCORE_EVENT_CONSTRAINT(0x36, 0x1), + UNCORE_EVENT_CONSTRAINT(0x37, 0x3), + UNCORE_EVENT_CONSTRAINT(0x38, 0x3), + UNCORE_EVENT_CONSTRAINT(0x39, 0x3), + UNCORE_EVENT_CONSTRAINT(0x3b, 0x1), + EVENT_CONSTRAINT_END +}; + +static struct event_constraint snbep_uncore_r2pcie_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x10, 0x3), + UNCORE_EVENT_CONSTRAINT(0x11, 0x3), + UNCORE_EVENT_CONSTRAINT(0x12, 0x1), + UNCORE_EVENT_CONSTRAINT(0x23, 0x3), + UNCORE_EVENT_CONSTRAINT(0x24, 0x3), + UNCORE_EVENT_CONSTRAINT(0x25, 0x3), + UNCORE_EVENT_CONSTRAINT(0x26, 0x3), + UNCORE_EVENT_CONSTRAINT(0x32, 0x3), + UNCORE_EVENT_CONSTRAINT(0x33, 0x3), + UNCORE_EVENT_CONSTRAINT(0x34, 0x3), + EVENT_CONSTRAINT_END +}; + +static struct event_constraint snbep_uncore_r3qpi_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x10, 0x3), + UNCORE_EVENT_CONSTRAINT(0x11, 0x3), + UNCORE_EVENT_CONSTRAINT(0x12, 0x3), + UNCORE_EVENT_CONSTRAINT(0x13, 0x1), + UNCORE_EVENT_CONSTRAINT(0x20, 0x3), + UNCORE_EVENT_CONSTRAINT(0x21, 0x3), + UNCORE_EVENT_CONSTRAINT(0x22, 0x3), + UNCORE_EVENT_CONSTRAINT(0x23, 0x3), + UNCORE_EVENT_CONSTRAINT(0x24, 0x3), + UNCORE_EVENT_CONSTRAINT(0x25, 0x3), + UNCORE_EVENT_CONSTRAINT(0x26, 0x3), + UNCORE_EVENT_CONSTRAINT(0x28, 0x3), + UNCORE_EVENT_CONSTRAINT(0x29, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2a, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2b, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2c, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2d, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2e, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2f, 0x3), + UNCORE_EVENT_CONSTRAINT(0x30, 0x3), + UNCORE_EVENT_CONSTRAINT(0x31, 0x3), + UNCORE_EVENT_CONSTRAINT(0x32, 0x3), + UNCORE_EVENT_CONSTRAINT(0x33, 0x3), + UNCORE_EVENT_CONSTRAINT(0x34, 0x3), + UNCORE_EVENT_CONSTRAINT(0x36, 0x3), + UNCORE_EVENT_CONSTRAINT(0x37, 0x3), + UNCORE_EVENT_CONSTRAINT(0x38, 0x3), + UNCORE_EVENT_CONSTRAINT(0x39, 0x3), + EVENT_CONSTRAINT_END +}; + +static struct intel_uncore_type snbep_uncore_ubox = { + .name = "ubox", + .num_counters = 2, + .num_boxes = 1, + .perf_ctr_bits = 44, + .fixed_ctr_bits = 48, + .perf_ctr = SNBEP_U_MSR_PMON_CTR0, + .event_ctl = SNBEP_U_MSR_PMON_CTL0, + .event_mask = SNBEP_U_MSR_PMON_RAW_EVENT_MASK, + .fixed_ctr = SNBEP_U_MSR_PMON_UCLK_FIXED_CTR, + .fixed_ctl = SNBEP_U_MSR_PMON_UCLK_FIXED_CTL, + .ops = &snbep_uncore_msr_ops, + .format_group = &snbep_uncore_ubox_format_group, +}; + +static struct extra_reg snbep_uncore_cbox_extra_regs[] = { + SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN, + SNBEP_CBO_PMON_CTL_TID_EN, 0x1), + SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x4334, 0xffff, 0x6), + SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x4534, 0xffff, 0x6), + SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x4934, 0xffff, 0x6), + SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0x6), + SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0xa), + SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0xa), + SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x2), + SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x2), + SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x2), + SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x2), + SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0xa), + SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0xa), + SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x2), + SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x2), + SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x2), + SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x2), + EVENT_EXTRA_END +}; + +static void snbep_cbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct intel_uncore_extra_reg *er = &box->shared_regs[0]; + int i; + + if (uncore_box_is_fake(box)) + return; + + for (i = 0; i < 5; i++) { + if (reg1->alloc & (0x1 << i)) + atomic_sub(1 << (i * 6), &er->ref); + } + reg1->alloc = 0; +} + +static struct event_constraint * +__snbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event, + u64 (*cbox_filter_mask)(int fields)) +{ + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct intel_uncore_extra_reg *er = &box->shared_regs[0]; + int i, alloc = 0; + unsigned long flags; + u64 mask; + + if (reg1->idx == EXTRA_REG_NONE) + return NULL; + + raw_spin_lock_irqsave(&er->lock, flags); + for (i = 0; i < 5; i++) { + if (!(reg1->idx & (0x1 << i))) + continue; + if (!uncore_box_is_fake(box) && (reg1->alloc & (0x1 << i))) + continue; + + mask = cbox_filter_mask(0x1 << i); + if (!__BITS_VALUE(atomic_read(&er->ref), i, 6) || + !((reg1->config ^ er->config) & mask)) { + atomic_add(1 << (i * 6), &er->ref); + er->config &= ~mask; + er->config |= reg1->config & mask; + alloc |= (0x1 << i); + } else { + break; + } + } + raw_spin_unlock_irqrestore(&er->lock, flags); + if (i < 5) + goto fail; + + if (!uncore_box_is_fake(box)) + reg1->alloc |= alloc; + + return NULL; +fail: + for (; i >= 0; i--) { + if (alloc & (0x1 << i)) + atomic_sub(1 << (i * 6), &er->ref); + } + return &constraint_empty; +} + +static u64 snbep_cbox_filter_mask(int fields) +{ + u64 mask = 0; + + if (fields & 0x1) + mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_TID; + if (fields & 0x2) + mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_NID; + if (fields & 0x4) + mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_STATE; + if (fields & 0x8) + mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_OPC; + + return mask; +} + +static struct event_constraint * +snbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + return __snbep_cbox_get_constraint(box, event, snbep_cbox_filter_mask); +} + +static int snbep_cbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct extra_reg *er; + int idx = 0; + + for (er = snbep_uncore_cbox_extra_regs; er->msr; er++) { + if (er->event != (event->hw.config & er->config_mask)) + continue; + idx |= er->idx; + } + + if (idx) { + reg1->reg = SNBEP_C0_MSR_PMON_BOX_FILTER + + SNBEP_CBO_MSR_OFFSET * box->pmu->pmu_idx; + reg1->config = event->attr.config1 & snbep_cbox_filter_mask(idx); + reg1->idx = idx; + } + return 0; +} + +static struct intel_uncore_ops snbep_uncore_cbox_ops = { + SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), + .hw_config = snbep_cbox_hw_config, + .get_constraint = snbep_cbox_get_constraint, + .put_constraint = snbep_cbox_put_constraint, +}; + +static struct intel_uncore_type snbep_uncore_cbox = { + .name = "cbox", + .num_counters = 4, + .num_boxes = 8, + .perf_ctr_bits = 44, + .event_ctl = SNBEP_C0_MSR_PMON_CTL0, + .perf_ctr = SNBEP_C0_MSR_PMON_CTR0, + .event_mask = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_C0_MSR_PMON_BOX_CTL, + .msr_offset = SNBEP_CBO_MSR_OFFSET, + .num_shared_regs = 1, + .constraints = snbep_uncore_cbox_constraints, + .ops = &snbep_uncore_cbox_ops, + .format_group = &snbep_uncore_cbox_format_group, +}; + +static u64 snbep_pcu_alter_er(struct perf_event *event, int new_idx, bool modify) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + u64 config = reg1->config; + + if (new_idx > reg1->idx) + config <<= 8 * (new_idx - reg1->idx); + else + config >>= 8 * (reg1->idx - new_idx); + + if (modify) { + hwc->config += new_idx - reg1->idx; + reg1->config = config; + reg1->idx = new_idx; + } + return config; +} + +static struct event_constraint * +snbep_pcu_get_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct intel_uncore_extra_reg *er = &box->shared_regs[0]; + unsigned long flags; + int idx = reg1->idx; + u64 mask, config1 = reg1->config; + bool ok = false; + + if (reg1->idx == EXTRA_REG_NONE || + (!uncore_box_is_fake(box) && reg1->alloc)) + return NULL; +again: + mask = 0xffULL << (idx * 8); + raw_spin_lock_irqsave(&er->lock, flags); + if (!__BITS_VALUE(atomic_read(&er->ref), idx, 8) || + !((config1 ^ er->config) & mask)) { + atomic_add(1 << (idx * 8), &er->ref); + er->config &= ~mask; + er->config |= config1 & mask; + ok = true; + } + raw_spin_unlock_irqrestore(&er->lock, flags); + + if (!ok) { + idx = (idx + 1) % 4; + if (idx != reg1->idx) { + config1 = snbep_pcu_alter_er(event, idx, false); + goto again; + } + return &constraint_empty; + } + + if (!uncore_box_is_fake(box)) { + if (idx != reg1->idx) + snbep_pcu_alter_er(event, idx, true); + reg1->alloc = 1; + } + return NULL; +} + +static void snbep_pcu_put_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct intel_uncore_extra_reg *er = &box->shared_regs[0]; + + if (uncore_box_is_fake(box) || !reg1->alloc) + return; + + atomic_sub(1 << (reg1->idx * 8), &er->ref); + reg1->alloc = 0; +} + +static int snbep_pcu_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + int ev_sel = hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK; + + if (ev_sel >= 0xb && ev_sel <= 0xe) { + reg1->reg = SNBEP_PCU_MSR_PMON_BOX_FILTER; + reg1->idx = ev_sel - 0xb; + reg1->config = event->attr.config1 & (0xff << reg1->idx); + } + return 0; +} + +static struct intel_uncore_ops snbep_uncore_pcu_ops = { + SNBEP_UNCORE_MSR_OPS_COMMON_INIT(), + .hw_config = snbep_pcu_hw_config, + .get_constraint = snbep_pcu_get_constraint, + .put_constraint = snbep_pcu_put_constraint, +}; + +static struct intel_uncore_type snbep_uncore_pcu = { + .name = "pcu", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .perf_ctr = SNBEP_PCU_MSR_PMON_CTR0, + .event_ctl = SNBEP_PCU_MSR_PMON_CTL0, + .event_mask = SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_PCU_MSR_PMON_BOX_CTL, + .num_shared_regs = 1, + .ops = &snbep_uncore_pcu_ops, + .format_group = &snbep_uncore_pcu_format_group, +}; + +static struct intel_uncore_type *snbep_msr_uncores[] = { + &snbep_uncore_ubox, + &snbep_uncore_cbox, + &snbep_uncore_pcu, + NULL, +}; + +enum { + SNBEP_PCI_QPI_PORT0_FILTER, + SNBEP_PCI_QPI_PORT1_FILTER, +}; + +static int snbep_qpi_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + + if ((hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK) == 0x38) { + reg1->idx = 0; + reg1->reg = SNBEP_Q_Py_PCI_PMON_PKT_MATCH0; + reg1->config = event->attr.config1; + reg2->reg = SNBEP_Q_Py_PCI_PMON_PKT_MASK0; + reg2->config = event->attr.config2; + } + return 0; +} + +static void snbep_qpi_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + + if (reg1->idx != EXTRA_REG_NONE) { + int idx = box->pmu->pmu_idx + SNBEP_PCI_QPI_PORT0_FILTER; + struct pci_dev *filter_pdev = extra_pci_dev[box->phys_id][idx]; + WARN_ON_ONCE(!filter_pdev); + if (filter_pdev) { + pci_write_config_dword(filter_pdev, reg1->reg, + (u32)reg1->config); + pci_write_config_dword(filter_pdev, reg1->reg + 4, + (u32)(reg1->config >> 32)); + pci_write_config_dword(filter_pdev, reg2->reg, + (u32)reg2->config); + pci_write_config_dword(filter_pdev, reg2->reg + 4, + (u32)(reg2->config >> 32)); + } + } + + pci_write_config_dword(pdev, hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); +} + +static struct intel_uncore_ops snbep_uncore_qpi_ops = { + SNBEP_UNCORE_PCI_OPS_COMMON_INIT(), + .enable_event = snbep_qpi_enable_event, + .hw_config = snbep_qpi_hw_config, + .get_constraint = uncore_get_constraint, + .put_constraint = uncore_put_constraint, +}; + +#define SNBEP_UNCORE_PCI_COMMON_INIT() \ + .perf_ctr = SNBEP_PCI_PMON_CTR0, \ + .event_ctl = SNBEP_PCI_PMON_CTL0, \ + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, \ + .box_ctl = SNBEP_PCI_PMON_BOX_CTL, \ + .ops = &snbep_uncore_pci_ops, \ + .format_group = &snbep_uncore_format_group + +static struct intel_uncore_type snbep_uncore_ha = { + .name = "ha", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct intel_uncore_type snbep_uncore_imc = { + .name = "imc", + .num_counters = 4, + .num_boxes = 4, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR, + .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL, + .event_descs = snbep_uncore_imc_events, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct intel_uncore_type snbep_uncore_qpi = { + .name = "qpi", + .num_counters = 4, + .num_boxes = 2, + .perf_ctr_bits = 48, + .perf_ctr = SNBEP_PCI_PMON_CTR0, + .event_ctl = SNBEP_PCI_PMON_CTL0, + .event_mask = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_PCI_PMON_BOX_CTL, + .num_shared_regs = 1, + .ops = &snbep_uncore_qpi_ops, + .event_descs = snbep_uncore_qpi_events, + .format_group = &snbep_uncore_qpi_format_group, +}; + + +static struct intel_uncore_type snbep_uncore_r2pcie = { + .name = "r2pcie", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 44, + .constraints = snbep_uncore_r2pcie_constraints, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct intel_uncore_type snbep_uncore_r3qpi = { + .name = "r3qpi", + .num_counters = 3, + .num_boxes = 2, + .perf_ctr_bits = 44, + .constraints = snbep_uncore_r3qpi_constraints, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +enum { + SNBEP_PCI_UNCORE_HA, + SNBEP_PCI_UNCORE_IMC, + SNBEP_PCI_UNCORE_QPI, + SNBEP_PCI_UNCORE_R2PCIE, + SNBEP_PCI_UNCORE_R3QPI, +}; + +static struct intel_uncore_type *snbep_pci_uncores[] = { + [SNBEP_PCI_UNCORE_HA] = &snbep_uncore_ha, + [SNBEP_PCI_UNCORE_IMC] = &snbep_uncore_imc, + [SNBEP_PCI_UNCORE_QPI] = &snbep_uncore_qpi, + [SNBEP_PCI_UNCORE_R2PCIE] = &snbep_uncore_r2pcie, + [SNBEP_PCI_UNCORE_R3QPI] = &snbep_uncore_r3qpi, + NULL, +}; + +static DEFINE_PCI_DEVICE_TABLE(snbep_uncore_pci_ids) = { + { /* Home Agent */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_HA), + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_HA, 0), + }, + { /* MC Channel 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC0), + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 0), + }, + { /* MC Channel 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC1), + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 1), + }, + { /* MC Channel 2 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC2), + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 2), + }, + { /* MC Channel 3 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC3), + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 3), + }, + { /* QPI Port 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI0), + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_QPI, 0), + }, + { /* QPI Port 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI1), + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_QPI, 1), + }, + { /* R2PCIe */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R2PCIE), + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R2PCIE, 0), + }, + { /* R3QPI Link 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI0), + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R3QPI, 0), + }, + { /* R3QPI Link 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI1), + .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R3QPI, 1), + }, + { /* QPI Port 0 filter */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3c86), + .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, + SNBEP_PCI_QPI_PORT0_FILTER), + }, + { /* QPI Port 0 filter */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3c96), + .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, + SNBEP_PCI_QPI_PORT1_FILTER), + }, + { /* end: all zeroes */ } +}; + +static struct pci_driver snbep_uncore_pci_driver = { + .name = "snbep_uncore", + .id_table = snbep_uncore_pci_ids, +}; + +/* + * build pci bus to socket mapping + */ +static int snbep_pci2phy_map_init(int devid) +{ + struct pci_dev *ubox_dev = NULL; + int i, bus, nodeid; + int err = 0; + u32 config = 0; + + while (1) { + /* find the UBOX device */ + ubox_dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, ubox_dev); + if (!ubox_dev) + break; + bus = ubox_dev->bus->number; + /* get the Node ID of the local register */ + err = pci_read_config_dword(ubox_dev, 0x40, &config); + if (err) + break; + nodeid = config; + /* get the Node ID mapping */ + err = pci_read_config_dword(ubox_dev, 0x54, &config); + if (err) + break; + /* + * every three bits in the Node ID mapping register maps + * to a particular node. + */ + for (i = 0; i < 8; i++) { + if (nodeid == ((config >> (3 * i)) & 0x7)) { + pcibus_to_physid[bus] = i; + break; + } + } + } + + if (!err) { + /* + * For PCI bus with no UBOX device, find the next bus + * that has UBOX device and use its mapping. + */ + i = -1; + for (bus = 255; bus >= 0; bus--) { + if (pcibus_to_physid[bus] >= 0) + i = pcibus_to_physid[bus]; + else + pcibus_to_physid[bus] = i; + } + } + + if (ubox_dev) + pci_dev_put(ubox_dev); + + return err ? pcibios_err_to_errno(err) : 0; +} +/* end of Sandy Bridge-EP uncore support */ + +/* IvyTown uncore support */ +static void ivt_uncore_msr_init_box(struct intel_uncore_box *box) +{ + unsigned msr = uncore_msr_box_ctl(box); + if (msr) + wrmsrl(msr, IVT_PMON_BOX_CTL_INT); +} + +static void ivt_uncore_pci_init_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + + pci_write_config_dword(pdev, SNBEP_PCI_PMON_BOX_CTL, IVT_PMON_BOX_CTL_INT); +} + +#define IVT_UNCORE_MSR_OPS_COMMON_INIT() \ + .init_box = ivt_uncore_msr_init_box, \ + .disable_box = snbep_uncore_msr_disable_box, \ + .enable_box = snbep_uncore_msr_enable_box, \ + .disable_event = snbep_uncore_msr_disable_event, \ + .enable_event = snbep_uncore_msr_enable_event, \ + .read_counter = uncore_msr_read_counter + +static struct intel_uncore_ops ivt_uncore_msr_ops = { + IVT_UNCORE_MSR_OPS_COMMON_INIT(), +}; + +static struct intel_uncore_ops ivt_uncore_pci_ops = { + .init_box = ivt_uncore_pci_init_box, + .disable_box = snbep_uncore_pci_disable_box, + .enable_box = snbep_uncore_pci_enable_box, + .disable_event = snbep_uncore_pci_disable_event, + .enable_event = snbep_uncore_pci_enable_event, + .read_counter = snbep_uncore_pci_read_counter, +}; + +#define IVT_UNCORE_PCI_COMMON_INIT() \ + .perf_ctr = SNBEP_PCI_PMON_CTR0, \ + .event_ctl = SNBEP_PCI_PMON_CTL0, \ + .event_mask = IVT_PMON_RAW_EVENT_MASK, \ + .box_ctl = SNBEP_PCI_PMON_BOX_CTL, \ + .ops = &ivt_uncore_pci_ops, \ + .format_group = &ivt_uncore_format_group + +static struct attribute *ivt_uncore_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + NULL, +}; + +static struct attribute *ivt_uncore_ubox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh5.attr, + NULL, +}; + +static struct attribute *ivt_uncore_cbox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_tid_en.attr, + &format_attr_thresh8.attr, + &format_attr_filter_tid.attr, + &format_attr_filter_link.attr, + &format_attr_filter_state2.attr, + &format_attr_filter_nid2.attr, + &format_attr_filter_opc2.attr, + NULL, +}; + +static struct attribute *ivt_uncore_pcu_formats_attr[] = { + &format_attr_event_ext.attr, + &format_attr_occ_sel.attr, + &format_attr_edge.attr, + &format_attr_thresh5.attr, + &format_attr_occ_invert.attr, + &format_attr_occ_edge.attr, + &format_attr_filter_band0.attr, + &format_attr_filter_band1.attr, + &format_attr_filter_band2.attr, + &format_attr_filter_band3.attr, + NULL, +}; + +static struct attribute *ivt_uncore_qpi_formats_attr[] = { + &format_attr_event_ext.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_thresh8.attr, + &format_attr_match_rds.attr, + &format_attr_match_rnid30.attr, + &format_attr_match_rnid4.attr, + &format_attr_match_dnid.attr, + &format_attr_match_mc.attr, + &format_attr_match_opc.attr, + &format_attr_match_vnw.attr, + &format_attr_match0.attr, + &format_attr_match1.attr, + &format_attr_mask_rds.attr, + &format_attr_mask_rnid30.attr, + &format_attr_mask_rnid4.attr, + &format_attr_mask_dnid.attr, + &format_attr_mask_mc.attr, + &format_attr_mask_opc.attr, + &format_attr_mask_vnw.attr, + &format_attr_mask0.attr, + &format_attr_mask1.attr, + NULL, +}; + +static struct attribute_group ivt_uncore_format_group = { + .name = "format", + .attrs = ivt_uncore_formats_attr, +}; + +static struct attribute_group ivt_uncore_ubox_format_group = { + .name = "format", + .attrs = ivt_uncore_ubox_formats_attr, +}; + +static struct attribute_group ivt_uncore_cbox_format_group = { + .name = "format", + .attrs = ivt_uncore_cbox_formats_attr, +}; + +static struct attribute_group ivt_uncore_pcu_format_group = { + .name = "format", + .attrs = ivt_uncore_pcu_formats_attr, +}; + +static struct attribute_group ivt_uncore_qpi_format_group = { + .name = "format", + .attrs = ivt_uncore_qpi_formats_attr, +}; + +static struct intel_uncore_type ivt_uncore_ubox = { + .name = "ubox", + .num_counters = 2, + .num_boxes = 1, + .perf_ctr_bits = 44, + .fixed_ctr_bits = 48, + .perf_ctr = SNBEP_U_MSR_PMON_CTR0, + .event_ctl = SNBEP_U_MSR_PMON_CTL0, + .event_mask = IVT_U_MSR_PMON_RAW_EVENT_MASK, + .fixed_ctr = SNBEP_U_MSR_PMON_UCLK_FIXED_CTR, + .fixed_ctl = SNBEP_U_MSR_PMON_UCLK_FIXED_CTL, + .ops = &ivt_uncore_msr_ops, + .format_group = &ivt_uncore_ubox_format_group, +}; + +static struct extra_reg ivt_uncore_cbox_extra_regs[] = { + SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN, + SNBEP_CBO_PMON_CTL_TID_EN, 0x1), + SNBEP_CBO_EVENT_EXTRA_REG(0x1031, 0x10ff, 0x2), + + SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0xc), + SNBEP_CBO_EVENT_EXTRA_REG(0x5134, 0xffff, 0xc), + SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x4334, 0xffff, 0xc), + SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x4534, 0xffff, 0xc), + SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x4934, 0xffff, 0xc), + SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x2135, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x2335, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0x18), + SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0x18), + SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x8135, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x8335, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x2136, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x2336, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0x18), + SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0x18), + SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x5036, 0xffff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x8136, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x8336, 0xffff, 0x10), + SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x8), + EVENT_EXTRA_END +}; + +static u64 ivt_cbox_filter_mask(int fields) +{ + u64 mask = 0; + + if (fields & 0x1) + mask |= IVT_CB0_MSR_PMON_BOX_FILTER_TID; + if (fields & 0x2) + mask |= IVT_CB0_MSR_PMON_BOX_FILTER_LINK; + if (fields & 0x4) + mask |= IVT_CB0_MSR_PMON_BOX_FILTER_STATE; + if (fields & 0x8) + mask |= IVT_CB0_MSR_PMON_BOX_FILTER_NID; + if (fields & 0x10) + mask |= IVT_CB0_MSR_PMON_BOX_FILTER_OPC; + + return mask; +} + +static struct event_constraint * +ivt_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + return __snbep_cbox_get_constraint(box, event, ivt_cbox_filter_mask); +} + +static int ivt_cbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct extra_reg *er; + int idx = 0; + + for (er = ivt_uncore_cbox_extra_regs; er->msr; er++) { + if (er->event != (event->hw.config & er->config_mask)) + continue; + idx |= er->idx; + } + + if (idx) { + reg1->reg = SNBEP_C0_MSR_PMON_BOX_FILTER + + SNBEP_CBO_MSR_OFFSET * box->pmu->pmu_idx; + reg1->config = event->attr.config1 & ivt_cbox_filter_mask(idx); + reg1->idx = idx; + } + return 0; +} + +static void ivt_cbox_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + + if (reg1->idx != EXTRA_REG_NONE) { + u64 filter = uncore_shared_reg_config(box, 0); + wrmsrl(reg1->reg, filter & 0xffffffff); + wrmsrl(reg1->reg + 6, filter >> 32); + } + + wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); +} + +static struct intel_uncore_ops ivt_uncore_cbox_ops = { + .init_box = ivt_uncore_msr_init_box, + .disable_box = snbep_uncore_msr_disable_box, + .enable_box = snbep_uncore_msr_enable_box, + .disable_event = snbep_uncore_msr_disable_event, + .enable_event = ivt_cbox_enable_event, + .read_counter = uncore_msr_read_counter, + .hw_config = ivt_cbox_hw_config, + .get_constraint = ivt_cbox_get_constraint, + .put_constraint = snbep_cbox_put_constraint, +}; + +static struct intel_uncore_type ivt_uncore_cbox = { + .name = "cbox", + .num_counters = 4, + .num_boxes = 15, + .perf_ctr_bits = 44, + .event_ctl = SNBEP_C0_MSR_PMON_CTL0, + .perf_ctr = SNBEP_C0_MSR_PMON_CTR0, + .event_mask = IVT_CBO_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_C0_MSR_PMON_BOX_CTL, + .msr_offset = SNBEP_CBO_MSR_OFFSET, + .num_shared_regs = 1, + .constraints = snbep_uncore_cbox_constraints, + .ops = &ivt_uncore_cbox_ops, + .format_group = &ivt_uncore_cbox_format_group, +}; + +static struct intel_uncore_ops ivt_uncore_pcu_ops = { + IVT_UNCORE_MSR_OPS_COMMON_INIT(), + .hw_config = snbep_pcu_hw_config, + .get_constraint = snbep_pcu_get_constraint, + .put_constraint = snbep_pcu_put_constraint, +}; + +static struct intel_uncore_type ivt_uncore_pcu = { + .name = "pcu", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .perf_ctr = SNBEP_PCU_MSR_PMON_CTR0, + .event_ctl = SNBEP_PCU_MSR_PMON_CTL0, + .event_mask = IVT_PCU_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_PCU_MSR_PMON_BOX_CTL, + .num_shared_regs = 1, + .ops = &ivt_uncore_pcu_ops, + .format_group = &ivt_uncore_pcu_format_group, +}; + +static struct intel_uncore_type *ivt_msr_uncores[] = { + &ivt_uncore_ubox, + &ivt_uncore_cbox, + &ivt_uncore_pcu, + NULL, +}; + +static struct intel_uncore_type ivt_uncore_ha = { + .name = "ha", + .num_counters = 4, + .num_boxes = 2, + .perf_ctr_bits = 48, + IVT_UNCORE_PCI_COMMON_INIT(), +}; + +static struct intel_uncore_type ivt_uncore_imc = { + .name = "imc", + .num_counters = 4, + .num_boxes = 8, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR, + .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL, + IVT_UNCORE_PCI_COMMON_INIT(), +}; + +/* registers in IRP boxes are not properly aligned */ +static unsigned ivt_uncore_irp_ctls[] = {0xd8, 0xdc, 0xe0, 0xe4}; +static unsigned ivt_uncore_irp_ctrs[] = {0xa0, 0xb0, 0xb8, 0xc0}; + +static void ivt_uncore_irp_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + + pci_write_config_dword(pdev, ivt_uncore_irp_ctls[hwc->idx], + hwc->config | SNBEP_PMON_CTL_EN); +} + +static void ivt_uncore_irp_disable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + + pci_write_config_dword(pdev, ivt_uncore_irp_ctls[hwc->idx], hwc->config); +} + +static u64 ivt_uncore_irp_read_counter(struct intel_uncore_box *box, struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + u64 count = 0; + + pci_read_config_dword(pdev, ivt_uncore_irp_ctrs[hwc->idx], (u32 *)&count); + pci_read_config_dword(pdev, ivt_uncore_irp_ctrs[hwc->idx] + 4, (u32 *)&count + 1); + + return count; +} + +static struct intel_uncore_ops ivt_uncore_irp_ops = { + .init_box = ivt_uncore_pci_init_box, + .disable_box = snbep_uncore_pci_disable_box, + .enable_box = snbep_uncore_pci_enable_box, + .disable_event = ivt_uncore_irp_disable_event, + .enable_event = ivt_uncore_irp_enable_event, + .read_counter = ivt_uncore_irp_read_counter, +}; + +static struct intel_uncore_type ivt_uncore_irp = { + .name = "irp", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .event_mask = IVT_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_PCI_PMON_BOX_CTL, + .ops = &ivt_uncore_irp_ops, + .format_group = &ivt_uncore_format_group, +}; + +static struct intel_uncore_ops ivt_uncore_qpi_ops = { + .init_box = ivt_uncore_pci_init_box, + .disable_box = snbep_uncore_pci_disable_box, + .enable_box = snbep_uncore_pci_enable_box, + .disable_event = snbep_uncore_pci_disable_event, + .enable_event = snbep_qpi_enable_event, + .read_counter = snbep_uncore_pci_read_counter, + .hw_config = snbep_qpi_hw_config, + .get_constraint = uncore_get_constraint, + .put_constraint = uncore_put_constraint, +}; + +static struct intel_uncore_type ivt_uncore_qpi = { + .name = "qpi", + .num_counters = 4, + .num_boxes = 3, + .perf_ctr_bits = 48, + .perf_ctr = SNBEP_PCI_PMON_CTR0, + .event_ctl = SNBEP_PCI_PMON_CTL0, + .event_mask = IVT_QPI_PCI_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_PCI_PMON_BOX_CTL, + .num_shared_regs = 1, + .ops = &ivt_uncore_qpi_ops, + .format_group = &ivt_uncore_qpi_format_group, +}; + +static struct intel_uncore_type ivt_uncore_r2pcie = { + .name = "r2pcie", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 44, + .constraints = snbep_uncore_r2pcie_constraints, + IVT_UNCORE_PCI_COMMON_INIT(), +}; + +static struct intel_uncore_type ivt_uncore_r3qpi = { + .name = "r3qpi", + .num_counters = 3, + .num_boxes = 2, + .perf_ctr_bits = 44, + .constraints = snbep_uncore_r3qpi_constraints, + IVT_UNCORE_PCI_COMMON_INIT(), +}; + +enum { + IVT_PCI_UNCORE_HA, + IVT_PCI_UNCORE_IMC, + IVT_PCI_UNCORE_IRP, + IVT_PCI_UNCORE_QPI, + IVT_PCI_UNCORE_R2PCIE, + IVT_PCI_UNCORE_R3QPI, +}; + +static struct intel_uncore_type *ivt_pci_uncores[] = { + [IVT_PCI_UNCORE_HA] = &ivt_uncore_ha, + [IVT_PCI_UNCORE_IMC] = &ivt_uncore_imc, + [IVT_PCI_UNCORE_IRP] = &ivt_uncore_irp, + [IVT_PCI_UNCORE_QPI] = &ivt_uncore_qpi, + [IVT_PCI_UNCORE_R2PCIE] = &ivt_uncore_r2pcie, + [IVT_PCI_UNCORE_R3QPI] = &ivt_uncore_r3qpi, + NULL, +}; + +static DEFINE_PCI_DEVICE_TABLE(ivt_uncore_pci_ids) = { + { /* Home Agent 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe30), + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_HA, 0), + }, + { /* Home Agent 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe38), + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_HA, 1), + }, + { /* MC0 Channel 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb4), + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 0), + }, + { /* MC0 Channel 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb5), + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 1), + }, + { /* MC0 Channel 3 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb0), + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 2), + }, + { /* MC0 Channel 4 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb1), + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 3), + }, + { /* MC1 Channel 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef4), + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 4), + }, + { /* MC1 Channel 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef5), + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 5), + }, + { /* MC1 Channel 3 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef0), + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 6), + }, + { /* MC1 Channel 4 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef1), + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 7), + }, + { /* IRP */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe39), + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IRP, 0), + }, + { /* QPI0 Port 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe32), + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_QPI, 0), + }, + { /* QPI0 Port 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe33), + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_QPI, 1), + }, + { /* QPI1 Port 2 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3a), + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_QPI, 2), + }, + { /* R2PCIe */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe34), + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_R2PCIE, 0), + }, + { /* R3QPI0 Link 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe36), + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_R3QPI, 0), + }, + { /* R3QPI0 Link 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe37), + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_R3QPI, 1), + }, + { /* R3QPI1 Link 2 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3e), + .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_R3QPI, 2), + }, + { /* QPI Port 0 filter */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe86), + .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, + SNBEP_PCI_QPI_PORT0_FILTER), + }, + { /* QPI Port 0 filter */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe96), + .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, + SNBEP_PCI_QPI_PORT1_FILTER), + }, + { /* end: all zeroes */ } +}; + +static struct pci_driver ivt_uncore_pci_driver = { + .name = "ivt_uncore", + .id_table = ivt_uncore_pci_ids, +}; +/* end of IvyTown uncore support */ + +/* Sandy Bridge uncore support */ +static void snb_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (hwc->idx < UNCORE_PMC_IDX_FIXED) + wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN); + else + wrmsrl(hwc->config_base, SNB_UNC_CTL_EN); +} + +static void snb_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + wrmsrl(event->hw.config_base, 0); +} + +static void snb_uncore_msr_init_box(struct intel_uncore_box *box) +{ + if (box->pmu->pmu_idx == 0) { + wrmsrl(SNB_UNC_PERF_GLOBAL_CTL, + SNB_UNC_GLOBAL_CTL_EN | SNB_UNC_GLOBAL_CTL_CORE_ALL); + } +} + +static struct uncore_event_desc snb_uncore_events[] = { + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"), + { /* end: all zeroes */ }, +}; + +static struct attribute *snb_uncore_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_cmask5.attr, + NULL, +}; + +static struct attribute_group snb_uncore_format_group = { + .name = "format", + .attrs = snb_uncore_formats_attr, +}; + +static struct intel_uncore_ops snb_uncore_msr_ops = { + .init_box = snb_uncore_msr_init_box, + .disable_event = snb_uncore_msr_disable_event, + .enable_event = snb_uncore_msr_enable_event, + .read_counter = uncore_msr_read_counter, +}; + +static struct event_constraint snb_uncore_cbox_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x80, 0x1), + UNCORE_EVENT_CONSTRAINT(0x83, 0x1), + EVENT_CONSTRAINT_END +}; + +static struct intel_uncore_type snb_uncore_cbox = { + .name = "cbox", + .num_counters = 2, + .num_boxes = 4, + .perf_ctr_bits = 44, + .fixed_ctr_bits = 48, + .perf_ctr = SNB_UNC_CBO_0_PER_CTR0, + .event_ctl = SNB_UNC_CBO_0_PERFEVTSEL0, + .fixed_ctr = SNB_UNC_FIXED_CTR, + .fixed_ctl = SNB_UNC_FIXED_CTR_CTRL, + .single_fixed = 1, + .event_mask = SNB_UNC_RAW_EVENT_MASK, + .msr_offset = SNB_UNC_CBO_MSR_OFFSET, + .constraints = snb_uncore_cbox_constraints, + .ops = &snb_uncore_msr_ops, + .format_group = &snb_uncore_format_group, + .event_descs = snb_uncore_events, +}; + +static struct intel_uncore_type *snb_msr_uncores[] = { + &snb_uncore_cbox, + NULL, +}; + +enum { + SNB_PCI_UNCORE_IMC, +}; + +static struct uncore_event_desc snb_uncore_imc_events[] = { + INTEL_UNCORE_EVENT_DESC(data_reads, "event=0x01"), + INTEL_UNCORE_EVENT_DESC(data_reads.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(data_reads.unit, "MiB"), + + INTEL_UNCORE_EVENT_DESC(data_writes, "event=0x02"), + INTEL_UNCORE_EVENT_DESC(data_writes.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(data_writes.unit, "MiB"), + + { /* end: all zeroes */ }, +}; + +#define SNB_UNCORE_PCI_IMC_EVENT_MASK 0xff +#define SNB_UNCORE_PCI_IMC_BAR_OFFSET 0x48 + +/* page size multiple covering all config regs */ +#define SNB_UNCORE_PCI_IMC_MAP_SIZE 0x6000 + +#define SNB_UNCORE_PCI_IMC_DATA_READS 0x1 +#define SNB_UNCORE_PCI_IMC_DATA_READS_BASE 0x5050 +#define SNB_UNCORE_PCI_IMC_DATA_WRITES 0x2 +#define SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE 0x5054 +#define SNB_UNCORE_PCI_IMC_CTR_BASE SNB_UNCORE_PCI_IMC_DATA_READS_BASE + +static struct attribute *snb_uncore_imc_formats_attr[] = { + &format_attr_event.attr, + NULL, +}; + +static struct attribute_group snb_uncore_imc_format_group = { + .name = "format", + .attrs = snb_uncore_imc_formats_attr, +}; + +static void snb_uncore_imc_init_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + int where = SNB_UNCORE_PCI_IMC_BAR_OFFSET; + resource_size_t addr; + u32 pci_dword; + + pci_read_config_dword(pdev, where, &pci_dword); + addr = pci_dword; + +#ifdef CONFIG_PHYS_ADDR_T_64BIT + pci_read_config_dword(pdev, where + 4, &pci_dword); + addr |= ((resource_size_t)pci_dword << 32); +#endif + + addr &= ~(PAGE_SIZE - 1); + + box->io_addr = ioremap(addr, SNB_UNCORE_PCI_IMC_MAP_SIZE); + box->hrtimer_duration = UNCORE_SNB_IMC_HRTIMER_INTERVAL; +} + +static void snb_uncore_imc_enable_box(struct intel_uncore_box *box) +{} + +static void snb_uncore_imc_disable_box(struct intel_uncore_box *box) +{} + +static void snb_uncore_imc_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{} + +static void snb_uncore_imc_disable_event(struct intel_uncore_box *box, struct perf_event *event) +{} + +static u64 snb_uncore_imc_read_counter(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + return (u64)*(unsigned int *)(box->io_addr + hwc->event_base); +} + +/* + * custom event_init() function because we define our own fixed, free + * running counters, so we do not want to conflict with generic uncore + * logic. Also simplifies processing + */ +static int snb_uncore_imc_event_init(struct perf_event *event) +{ + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + struct hw_perf_event *hwc = &event->hw; + u64 cfg = event->attr.config & SNB_UNCORE_PCI_IMC_EVENT_MASK; + int idx, base; + + if (event->attr.type != event->pmu->type) + return -ENOENT; + + pmu = uncore_event_to_pmu(event); + /* no device found for this pmu */ + if (pmu->func_id < 0) + return -ENOENT; + + /* Sampling not supported yet */ + if (hwc->sample_period) + return -EINVAL; + + /* unsupported modes and filters */ + if (event->attr.exclude_user || + event->attr.exclude_kernel || + event->attr.exclude_hv || + event->attr.exclude_idle || + event->attr.exclude_host || + event->attr.exclude_guest || + event->attr.sample_period) /* no sampling */ + return -EINVAL; + + /* + * Place all uncore events for a particular physical package + * onto a single cpu + */ + if (event->cpu < 0) + return -EINVAL; + + /* check only supported bits are set */ + if (event->attr.config & ~SNB_UNCORE_PCI_IMC_EVENT_MASK) + return -EINVAL; + + box = uncore_pmu_to_box(pmu, event->cpu); + if (!box || box->cpu < 0) + return -EINVAL; + + event->cpu = box->cpu; + + event->hw.idx = -1; + event->hw.last_tag = ~0ULL; + event->hw.extra_reg.idx = EXTRA_REG_NONE; + event->hw.branch_reg.idx = EXTRA_REG_NONE; + /* + * check event is known (whitelist, determines counter) + */ + switch (cfg) { + case SNB_UNCORE_PCI_IMC_DATA_READS: + base = SNB_UNCORE_PCI_IMC_DATA_READS_BASE; + idx = UNCORE_PMC_IDX_FIXED; + break; + case SNB_UNCORE_PCI_IMC_DATA_WRITES: + base = SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE; + idx = UNCORE_PMC_IDX_FIXED + 1; + break; + default: + return -EINVAL; + } + + /* must be done before validate_group */ + event->hw.event_base = base; + event->hw.config = cfg; + event->hw.idx = idx; + + /* no group validation needed, we have free running counters */ + + return 0; +} + +static int snb_uncore_imc_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + return 0; +} + +static void snb_uncore_imc_event_start(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + u64 count; + + if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) + return; + + event->hw.state = 0; + box->n_active++; + + list_add_tail(&event->active_entry, &box->active_list); + + count = snb_uncore_imc_read_counter(box, event); + local64_set(&event->hw.prev_count, count); + + if (box->n_active == 1) + uncore_pmu_start_hrtimer(box); +} + +static void snb_uncore_imc_event_stop(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + struct hw_perf_event *hwc = &event->hw; + + if (!(hwc->state & PERF_HES_STOPPED)) { + box->n_active--; + + WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); + hwc->state |= PERF_HES_STOPPED; + + list_del(&event->active_entry); + + if (box->n_active == 0) + uncore_pmu_cancel_hrtimer(box); + } + + if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { + /* + * Drain the remaining delta count out of a event + * that we are disabling: + */ + uncore_perf_event_update(box, event); + hwc->state |= PERF_HES_UPTODATE; + } +} + +static int snb_uncore_imc_event_add(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + struct hw_perf_event *hwc = &event->hw; + + if (!box) + return -ENODEV; + + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + if (!(flags & PERF_EF_START)) + hwc->state |= PERF_HES_ARCH; + + snb_uncore_imc_event_start(event, 0); + + box->n_events++; + + return 0; +} + +static void snb_uncore_imc_event_del(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + int i; + + snb_uncore_imc_event_stop(event, PERF_EF_UPDATE); + + for (i = 0; i < box->n_events; i++) { + if (event == box->event_list[i]) { + --box->n_events; + break; + } + } +} + +static int snb_pci2phy_map_init(int devid) +{ + struct pci_dev *dev = NULL; + int bus; + + dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, dev); + if (!dev) + return -ENOTTY; + + bus = dev->bus->number; + + pcibus_to_physid[bus] = 0; + + pci_dev_put(dev); + + return 0; +} + +static struct pmu snb_uncore_imc_pmu = { + .task_ctx_nr = perf_invalid_context, + .event_init = snb_uncore_imc_event_init, + .add = snb_uncore_imc_event_add, + .del = snb_uncore_imc_event_del, + .start = snb_uncore_imc_event_start, + .stop = snb_uncore_imc_event_stop, + .read = uncore_pmu_event_read, +}; + +static struct intel_uncore_ops snb_uncore_imc_ops = { + .init_box = snb_uncore_imc_init_box, + .enable_box = snb_uncore_imc_enable_box, + .disable_box = snb_uncore_imc_disable_box, + .disable_event = snb_uncore_imc_disable_event, + .enable_event = snb_uncore_imc_enable_event, + .hw_config = snb_uncore_imc_hw_config, + .read_counter = snb_uncore_imc_read_counter, +}; + +static struct intel_uncore_type snb_uncore_imc = { + .name = "imc", + .num_counters = 2, + .num_boxes = 1, + .fixed_ctr_bits = 32, + .fixed_ctr = SNB_UNCORE_PCI_IMC_CTR_BASE, + .event_descs = snb_uncore_imc_events, + .format_group = &snb_uncore_imc_format_group, + .perf_ctr = SNB_UNCORE_PCI_IMC_DATA_READS_BASE, + .event_mask = SNB_UNCORE_PCI_IMC_EVENT_MASK, + .ops = &snb_uncore_imc_ops, + .pmu = &snb_uncore_imc_pmu, +}; + +static struct intel_uncore_type *snb_pci_uncores[] = { + [SNB_PCI_UNCORE_IMC] = &snb_uncore_imc, + NULL, +}; + +static DEFINE_PCI_DEVICE_TABLE(snb_uncore_pci_ids) = { + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SNB_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* end: all zeroes */ }, +}; + +static DEFINE_PCI_DEVICE_TABLE(ivb_uncore_pci_ids) = { + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* end: all zeroes */ }, +}; + +static DEFINE_PCI_DEVICE_TABLE(hsw_uncore_pci_ids) = { + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* end: all zeroes */ }, +}; + +static struct pci_driver snb_uncore_pci_driver = { + .name = "snb_uncore", + .id_table = snb_uncore_pci_ids, +}; + +static struct pci_driver ivb_uncore_pci_driver = { + .name = "ivb_uncore", + .id_table = ivb_uncore_pci_ids, +}; + +static struct pci_driver hsw_uncore_pci_driver = { + .name = "hsw_uncore", + .id_table = hsw_uncore_pci_ids, +}; + +/* end of Sandy Bridge uncore support */ + +/* Nehalem uncore support */ +static void nhm_uncore_msr_disable_box(struct intel_uncore_box *box) +{ + wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, 0); +} + +static void nhm_uncore_msr_enable_box(struct intel_uncore_box *box) +{ + wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, NHM_UNC_GLOBAL_CTL_EN_PC_ALL | NHM_UNC_GLOBAL_CTL_EN_FC); +} + +static void nhm_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (hwc->idx < UNCORE_PMC_IDX_FIXED) + wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN); + else + wrmsrl(hwc->config_base, NHM_UNC_FIXED_CTR_CTL_EN); +} + +static struct attribute *nhm_uncore_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_cmask8.attr, + NULL, +}; + +static struct attribute_group nhm_uncore_format_group = { + .name = "format", + .attrs = nhm_uncore_formats_attr, +}; + +static struct uncore_event_desc nhm_uncore_events[] = { + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"), + INTEL_UNCORE_EVENT_DESC(qmc_writes_full_any, "event=0x2f,umask=0x0f"), + INTEL_UNCORE_EVENT_DESC(qmc_normal_reads_any, "event=0x2c,umask=0x0f"), + INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_reads, "event=0x20,umask=0x01"), + INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_writes, "event=0x20,umask=0x02"), + INTEL_UNCORE_EVENT_DESC(qhl_request_remote_reads, "event=0x20,umask=0x04"), + INTEL_UNCORE_EVENT_DESC(qhl_request_remote_writes, "event=0x20,umask=0x08"), + INTEL_UNCORE_EVENT_DESC(qhl_request_local_reads, "event=0x20,umask=0x10"), + INTEL_UNCORE_EVENT_DESC(qhl_request_local_writes, "event=0x20,umask=0x20"), + { /* end: all zeroes */ }, +}; + +static struct intel_uncore_ops nhm_uncore_msr_ops = { + .disable_box = nhm_uncore_msr_disable_box, + .enable_box = nhm_uncore_msr_enable_box, + .disable_event = snb_uncore_msr_disable_event, + .enable_event = nhm_uncore_msr_enable_event, + .read_counter = uncore_msr_read_counter, +}; + +static struct intel_uncore_type nhm_uncore = { + .name = "", + .num_counters = 8, + .num_boxes = 1, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .event_ctl = NHM_UNC_PERFEVTSEL0, + .perf_ctr = NHM_UNC_UNCORE_PMC0, + .fixed_ctr = NHM_UNC_FIXED_CTR, + .fixed_ctl = NHM_UNC_FIXED_CTR_CTRL, + .event_mask = NHM_UNC_RAW_EVENT_MASK, + .event_descs = nhm_uncore_events, + .ops = &nhm_uncore_msr_ops, + .format_group = &nhm_uncore_format_group, +}; + +static struct intel_uncore_type *nhm_msr_uncores[] = { + &nhm_uncore, + NULL, +}; +/* end of Nehalem uncore support */ + +/* Nehalem-EX uncore support */ +DEFINE_UNCORE_FORMAT_ATTR(event5, event, "config:1-5"); +DEFINE_UNCORE_FORMAT_ATTR(counter, counter, "config:6-7"); +DEFINE_UNCORE_FORMAT_ATTR(match, match, "config1:0-63"); +DEFINE_UNCORE_FORMAT_ATTR(mask, mask, "config2:0-63"); + +static void nhmex_uncore_msr_init_box(struct intel_uncore_box *box) +{ + wrmsrl(NHMEX_U_MSR_PMON_GLOBAL_CTL, NHMEX_U_PMON_GLOBAL_EN_ALL); +} + +static void nhmex_uncore_msr_disable_box(struct intel_uncore_box *box) +{ + unsigned msr = uncore_msr_box_ctl(box); + u64 config; + + if (msr) { + rdmsrl(msr, config); + config &= ~((1ULL << uncore_num_counters(box)) - 1); + /* WBox has a fixed counter */ + if (uncore_msr_fixed_ctl(box)) + config &= ~NHMEX_W_PMON_GLOBAL_FIXED_EN; + wrmsrl(msr, config); + } +} + +static void nhmex_uncore_msr_enable_box(struct intel_uncore_box *box) +{ + unsigned msr = uncore_msr_box_ctl(box); + u64 config; + + if (msr) { + rdmsrl(msr, config); + config |= (1ULL << uncore_num_counters(box)) - 1; + /* WBox has a fixed counter */ + if (uncore_msr_fixed_ctl(box)) + config |= NHMEX_W_PMON_GLOBAL_FIXED_EN; + wrmsrl(msr, config); + } +} + +static void nhmex_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + wrmsrl(event->hw.config_base, 0); +} + +static void nhmex_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (hwc->idx >= UNCORE_PMC_IDX_FIXED) + wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0); + else if (box->pmu->type->event_mask & NHMEX_PMON_CTL_EN_BIT0) + wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22); + else + wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0); +} + +#define NHMEX_UNCORE_OPS_COMMON_INIT() \ + .init_box = nhmex_uncore_msr_init_box, \ + .disable_box = nhmex_uncore_msr_disable_box, \ + .enable_box = nhmex_uncore_msr_enable_box, \ + .disable_event = nhmex_uncore_msr_disable_event, \ + .read_counter = uncore_msr_read_counter + +static struct intel_uncore_ops nhmex_uncore_ops = { + NHMEX_UNCORE_OPS_COMMON_INIT(), + .enable_event = nhmex_uncore_msr_enable_event, +}; + +static struct attribute *nhmex_uncore_ubox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_edge.attr, + NULL, +}; + +static struct attribute_group nhmex_uncore_ubox_format_group = { + .name = "format", + .attrs = nhmex_uncore_ubox_formats_attr, +}; + +static struct intel_uncore_type nhmex_uncore_ubox = { + .name = "ubox", + .num_counters = 1, + .num_boxes = 1, + .perf_ctr_bits = 48, + .event_ctl = NHMEX_U_MSR_PMON_EV_SEL, + .perf_ctr = NHMEX_U_MSR_PMON_CTR, + .event_mask = NHMEX_U_PMON_RAW_EVENT_MASK, + .box_ctl = NHMEX_U_MSR_PMON_GLOBAL_CTL, + .ops = &nhmex_uncore_ops, + .format_group = &nhmex_uncore_ubox_format_group +}; + +static struct attribute *nhmex_uncore_cbox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + NULL, +}; + +static struct attribute_group nhmex_uncore_cbox_format_group = { + .name = "format", + .attrs = nhmex_uncore_cbox_formats_attr, +}; + +/* msr offset for each instance of cbox */ +static unsigned nhmex_cbox_msr_offsets[] = { + 0x0, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, 0x240, 0x2c0, +}; + +static struct intel_uncore_type nhmex_uncore_cbox = { + .name = "cbox", + .num_counters = 6, + .num_boxes = 10, + .perf_ctr_bits = 48, + .event_ctl = NHMEX_C0_MSR_PMON_EV_SEL0, + .perf_ctr = NHMEX_C0_MSR_PMON_CTR0, + .event_mask = NHMEX_PMON_RAW_EVENT_MASK, + .box_ctl = NHMEX_C0_MSR_PMON_GLOBAL_CTL, + .msr_offsets = nhmex_cbox_msr_offsets, + .pair_ctr_ctl = 1, + .ops = &nhmex_uncore_ops, + .format_group = &nhmex_uncore_cbox_format_group +}; + +static struct uncore_event_desc nhmex_uncore_wbox_events[] = { + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0"), + { /* end: all zeroes */ }, +}; + +static struct intel_uncore_type nhmex_uncore_wbox = { + .name = "wbox", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .event_ctl = NHMEX_W_MSR_PMON_CNT0, + .perf_ctr = NHMEX_W_MSR_PMON_EVT_SEL0, + .fixed_ctr = NHMEX_W_MSR_PMON_FIXED_CTR, + .fixed_ctl = NHMEX_W_MSR_PMON_FIXED_CTL, + .event_mask = NHMEX_PMON_RAW_EVENT_MASK, + .box_ctl = NHMEX_W_MSR_GLOBAL_CTL, + .pair_ctr_ctl = 1, + .event_descs = nhmex_uncore_wbox_events, + .ops = &nhmex_uncore_ops, + .format_group = &nhmex_uncore_cbox_format_group +}; + +static int nhmex_bbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + int ctr, ev_sel; + + ctr = (hwc->config & NHMEX_B_PMON_CTR_MASK) >> + NHMEX_B_PMON_CTR_SHIFT; + ev_sel = (hwc->config & NHMEX_B_PMON_CTL_EV_SEL_MASK) >> + NHMEX_B_PMON_CTL_EV_SEL_SHIFT; + + /* events that do not use the match/mask registers */ + if ((ctr == 0 && ev_sel > 0x3) || (ctr == 1 && ev_sel > 0x6) || + (ctr == 2 && ev_sel != 0x4) || ctr == 3) + return 0; + + if (box->pmu->pmu_idx == 0) + reg1->reg = NHMEX_B0_MSR_MATCH; + else + reg1->reg = NHMEX_B1_MSR_MATCH; + reg1->idx = 0; + reg1->config = event->attr.config1; + reg2->config = event->attr.config2; + return 0; +} + +static void nhmex_bbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + + if (reg1->idx != EXTRA_REG_NONE) { + wrmsrl(reg1->reg, reg1->config); + wrmsrl(reg1->reg + 1, reg2->config); + } + wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 | + (hwc->config & NHMEX_B_PMON_CTL_EV_SEL_MASK)); +} + +/* + * The Bbox has 4 counters, but each counter monitors different events. + * Use bits 6-7 in the event config to select counter. + */ +static struct event_constraint nhmex_uncore_bbox_constraints[] = { + EVENT_CONSTRAINT(0 , 1, 0xc0), + EVENT_CONSTRAINT(0x40, 2, 0xc0), + EVENT_CONSTRAINT(0x80, 4, 0xc0), + EVENT_CONSTRAINT(0xc0, 8, 0xc0), + EVENT_CONSTRAINT_END, +}; + +static struct attribute *nhmex_uncore_bbox_formats_attr[] = { + &format_attr_event5.attr, + &format_attr_counter.attr, + &format_attr_match.attr, + &format_attr_mask.attr, + NULL, +}; + +static struct attribute_group nhmex_uncore_bbox_format_group = { + .name = "format", + .attrs = nhmex_uncore_bbox_formats_attr, +}; + +static struct intel_uncore_ops nhmex_uncore_bbox_ops = { + NHMEX_UNCORE_OPS_COMMON_INIT(), + .enable_event = nhmex_bbox_msr_enable_event, + .hw_config = nhmex_bbox_hw_config, + .get_constraint = uncore_get_constraint, + .put_constraint = uncore_put_constraint, +}; + +static struct intel_uncore_type nhmex_uncore_bbox = { + .name = "bbox", + .num_counters = 4, + .num_boxes = 2, + .perf_ctr_bits = 48, + .event_ctl = NHMEX_B0_MSR_PMON_CTL0, + .perf_ctr = NHMEX_B0_MSR_PMON_CTR0, + .event_mask = NHMEX_B_PMON_RAW_EVENT_MASK, + .box_ctl = NHMEX_B0_MSR_PMON_GLOBAL_CTL, + .msr_offset = NHMEX_B_MSR_OFFSET, + .pair_ctr_ctl = 1, + .num_shared_regs = 1, + .constraints = nhmex_uncore_bbox_constraints, + .ops = &nhmex_uncore_bbox_ops, + .format_group = &nhmex_uncore_bbox_format_group +}; + +static int nhmex_sbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + + /* only TO_R_PROG_EV event uses the match/mask register */ + if ((hwc->config & NHMEX_PMON_CTL_EV_SEL_MASK) != + NHMEX_S_EVENT_TO_R_PROG_EV) + return 0; + + if (box->pmu->pmu_idx == 0) + reg1->reg = NHMEX_S0_MSR_MM_CFG; + else + reg1->reg = NHMEX_S1_MSR_MM_CFG; + reg1->idx = 0; + reg1->config = event->attr.config1; + reg2->config = event->attr.config2; + return 0; +} + +static void nhmex_sbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + + if (reg1->idx != EXTRA_REG_NONE) { + wrmsrl(reg1->reg, 0); + wrmsrl(reg1->reg + 1, reg1->config); + wrmsrl(reg1->reg + 2, reg2->config); + wrmsrl(reg1->reg, NHMEX_S_PMON_MM_CFG_EN); + } + wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22); +} + +static struct attribute *nhmex_uncore_sbox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + &format_attr_match.attr, + &format_attr_mask.attr, + NULL, +}; + +static struct attribute_group nhmex_uncore_sbox_format_group = { + .name = "format", + .attrs = nhmex_uncore_sbox_formats_attr, +}; + +static struct intel_uncore_ops nhmex_uncore_sbox_ops = { + NHMEX_UNCORE_OPS_COMMON_INIT(), + .enable_event = nhmex_sbox_msr_enable_event, + .hw_config = nhmex_sbox_hw_config, + .get_constraint = uncore_get_constraint, + .put_constraint = uncore_put_constraint, +}; + +static struct intel_uncore_type nhmex_uncore_sbox = { + .name = "sbox", + .num_counters = 4, + .num_boxes = 2, + .perf_ctr_bits = 48, + .event_ctl = NHMEX_S0_MSR_PMON_CTL0, + .perf_ctr = NHMEX_S0_MSR_PMON_CTR0, + .event_mask = NHMEX_PMON_RAW_EVENT_MASK, + .box_ctl = NHMEX_S0_MSR_PMON_GLOBAL_CTL, + .msr_offset = NHMEX_S_MSR_OFFSET, + .pair_ctr_ctl = 1, + .num_shared_regs = 1, + .ops = &nhmex_uncore_sbox_ops, + .format_group = &nhmex_uncore_sbox_format_group +}; + +enum { + EXTRA_REG_NHMEX_M_FILTER, + EXTRA_REG_NHMEX_M_DSP, + EXTRA_REG_NHMEX_M_ISS, + EXTRA_REG_NHMEX_M_MAP, + EXTRA_REG_NHMEX_M_MSC_THR, + EXTRA_REG_NHMEX_M_PGT, + EXTRA_REG_NHMEX_M_PLD, + EXTRA_REG_NHMEX_M_ZDP_CTL_FVC, +}; + +static struct extra_reg nhmex_uncore_mbox_extra_regs[] = { + MBOX_INC_SEL_EXTAR_REG(0x0, DSP), + MBOX_INC_SEL_EXTAR_REG(0x4, MSC_THR), + MBOX_INC_SEL_EXTAR_REG(0x5, MSC_THR), + MBOX_INC_SEL_EXTAR_REG(0x9, ISS), + /* event 0xa uses two extra registers */ + MBOX_INC_SEL_EXTAR_REG(0xa, ISS), + MBOX_INC_SEL_EXTAR_REG(0xa, PLD), + MBOX_INC_SEL_EXTAR_REG(0xb, PLD), + /* events 0xd ~ 0x10 use the same extra register */ + MBOX_INC_SEL_EXTAR_REG(0xd, ZDP_CTL_FVC), + MBOX_INC_SEL_EXTAR_REG(0xe, ZDP_CTL_FVC), + MBOX_INC_SEL_EXTAR_REG(0xf, ZDP_CTL_FVC), + MBOX_INC_SEL_EXTAR_REG(0x10, ZDP_CTL_FVC), + MBOX_INC_SEL_EXTAR_REG(0x16, PGT), + MBOX_SET_FLAG_SEL_EXTRA_REG(0x0, DSP), + MBOX_SET_FLAG_SEL_EXTRA_REG(0x1, ISS), + MBOX_SET_FLAG_SEL_EXTRA_REG(0x5, PGT), + MBOX_SET_FLAG_SEL_EXTRA_REG(0x6, MAP), + EVENT_EXTRA_END +}; + +/* Nehalem-EX or Westmere-EX ? */ +static bool uncore_nhmex; + +static bool nhmex_mbox_get_shared_reg(struct intel_uncore_box *box, int idx, u64 config) +{ + struct intel_uncore_extra_reg *er; + unsigned long flags; + bool ret = false; + u64 mask; + + if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) { + er = &box->shared_regs[idx]; + raw_spin_lock_irqsave(&er->lock, flags); + if (!atomic_read(&er->ref) || er->config == config) { + atomic_inc(&er->ref); + er->config = config; + ret = true; + } + raw_spin_unlock_irqrestore(&er->lock, flags); + + return ret; + } + /* + * The ZDP_CTL_FVC MSR has 4 fields which are used to control + * events 0xd ~ 0x10. Besides these 4 fields, there are additional + * fields which are shared. + */ + idx -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC; + if (WARN_ON_ONCE(idx >= 4)) + return false; + + /* mask of the shared fields */ + if (uncore_nhmex) + mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK; + else + mask = WSMEX_M_PMON_ZDP_CTL_FVC_MASK; + er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC]; + + raw_spin_lock_irqsave(&er->lock, flags); + /* add mask of the non-shared field if it's in use */ + if (__BITS_VALUE(atomic_read(&er->ref), idx, 8)) { + if (uncore_nhmex) + mask |= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); + else + mask |= WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); + } + + if (!atomic_read(&er->ref) || !((er->config ^ config) & mask)) { + atomic_add(1 << (idx * 8), &er->ref); + if (uncore_nhmex) + mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK | + NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); + else + mask = WSMEX_M_PMON_ZDP_CTL_FVC_MASK | + WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); + er->config &= ~mask; + er->config |= (config & mask); + ret = true; + } + raw_spin_unlock_irqrestore(&er->lock, flags); + + return ret; +} + +static void nhmex_mbox_put_shared_reg(struct intel_uncore_box *box, int idx) +{ + struct intel_uncore_extra_reg *er; + + if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) { + er = &box->shared_regs[idx]; + atomic_dec(&er->ref); + return; + } + + idx -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC; + er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC]; + atomic_sub(1 << (idx * 8), &er->ref); +} + +static u64 nhmex_mbox_alter_er(struct perf_event *event, int new_idx, bool modify) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + u64 idx, orig_idx = __BITS_VALUE(reg1->idx, 0, 8); + u64 config = reg1->config; + + /* get the non-shared control bits and shift them */ + idx = orig_idx - EXTRA_REG_NHMEX_M_ZDP_CTL_FVC; + if (uncore_nhmex) + config &= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); + else + config &= WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); + if (new_idx > orig_idx) { + idx = new_idx - orig_idx; + config <<= 3 * idx; + } else { + idx = orig_idx - new_idx; + config >>= 3 * idx; + } + + /* add the shared control bits back */ + if (uncore_nhmex) + config |= NHMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config; + else + config |= WSMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config; + config |= NHMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config; + if (modify) { + /* adjust the main event selector */ + if (new_idx > orig_idx) + hwc->config += idx << NHMEX_M_PMON_CTL_INC_SEL_SHIFT; + else + hwc->config -= idx << NHMEX_M_PMON_CTL_INC_SEL_SHIFT; + reg1->config = config; + reg1->idx = ~0xff | new_idx; + } + return config; +} + +static struct event_constraint * +nhmex_mbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct hw_perf_event_extra *reg2 = &event->hw.branch_reg; + int i, idx[2], alloc = 0; + u64 config1 = reg1->config; + + idx[0] = __BITS_VALUE(reg1->idx, 0, 8); + idx[1] = __BITS_VALUE(reg1->idx, 1, 8); +again: + for (i = 0; i < 2; i++) { + if (!uncore_box_is_fake(box) && (reg1->alloc & (0x1 << i))) + idx[i] = 0xff; + + if (idx[i] == 0xff) + continue; + + if (!nhmex_mbox_get_shared_reg(box, idx[i], + __BITS_VALUE(config1, i, 32))) + goto fail; + alloc |= (0x1 << i); + } + + /* for the match/mask registers */ + if (reg2->idx != EXTRA_REG_NONE && + (uncore_box_is_fake(box) || !reg2->alloc) && + !nhmex_mbox_get_shared_reg(box, reg2->idx, reg2->config)) + goto fail; + + /* + * If it's a fake box -- as per validate_{group,event}() we + * shouldn't touch event state and we can avoid doing so + * since both will only call get_event_constraints() once + * on each event, this avoids the need for reg->alloc. + */ + if (!uncore_box_is_fake(box)) { + if (idx[0] != 0xff && idx[0] != __BITS_VALUE(reg1->idx, 0, 8)) + nhmex_mbox_alter_er(event, idx[0], true); + reg1->alloc |= alloc; + if (reg2->idx != EXTRA_REG_NONE) + reg2->alloc = 1; + } + return NULL; +fail: + if (idx[0] != 0xff && !(alloc & 0x1) && + idx[0] >= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) { + /* + * events 0xd ~ 0x10 are functional identical, but are + * controlled by different fields in the ZDP_CTL_FVC + * register. If we failed to take one field, try the + * rest 3 choices. + */ + BUG_ON(__BITS_VALUE(reg1->idx, 1, 8) != 0xff); + idx[0] -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC; + idx[0] = (idx[0] + 1) % 4; + idx[0] += EXTRA_REG_NHMEX_M_ZDP_CTL_FVC; + if (idx[0] != __BITS_VALUE(reg1->idx, 0, 8)) { + config1 = nhmex_mbox_alter_er(event, idx[0], false); + goto again; + } + } + + if (alloc & 0x1) + nhmex_mbox_put_shared_reg(box, idx[0]); + if (alloc & 0x2) + nhmex_mbox_put_shared_reg(box, idx[1]); + return &constraint_empty; +} + +static void nhmex_mbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct hw_perf_event_extra *reg2 = &event->hw.branch_reg; + + if (uncore_box_is_fake(box)) + return; + + if (reg1->alloc & 0x1) + nhmex_mbox_put_shared_reg(box, __BITS_VALUE(reg1->idx, 0, 8)); + if (reg1->alloc & 0x2) + nhmex_mbox_put_shared_reg(box, __BITS_VALUE(reg1->idx, 1, 8)); + reg1->alloc = 0; + + if (reg2->alloc) { + nhmex_mbox_put_shared_reg(box, reg2->idx); + reg2->alloc = 0; + } +} + +static int nhmex_mbox_extra_reg_idx(struct extra_reg *er) +{ + if (er->idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) + return er->idx; + return er->idx + (er->event >> NHMEX_M_PMON_CTL_INC_SEL_SHIFT) - 0xd; +} + +static int nhmex_mbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct intel_uncore_type *type = box->pmu->type; + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct hw_perf_event_extra *reg2 = &event->hw.branch_reg; + struct extra_reg *er; + unsigned msr; + int reg_idx = 0; + /* + * The mbox events may require 2 extra MSRs at the most. But only + * the lower 32 bits in these MSRs are significant, so we can use + * config1 to pass two MSRs' config. + */ + for (er = nhmex_uncore_mbox_extra_regs; er->msr; er++) { + if (er->event != (event->hw.config & er->config_mask)) + continue; + if (event->attr.config1 & ~er->valid_mask) + return -EINVAL; + + msr = er->msr + type->msr_offset * box->pmu->pmu_idx; + if (WARN_ON_ONCE(msr >= 0xffff || er->idx >= 0xff)) + return -EINVAL; + + /* always use the 32~63 bits to pass the PLD config */ + if (er->idx == EXTRA_REG_NHMEX_M_PLD) + reg_idx = 1; + else if (WARN_ON_ONCE(reg_idx > 0)) + return -EINVAL; + + reg1->idx &= ~(0xff << (reg_idx * 8)); + reg1->reg &= ~(0xffff << (reg_idx * 16)); + reg1->idx |= nhmex_mbox_extra_reg_idx(er) << (reg_idx * 8); + reg1->reg |= msr << (reg_idx * 16); + reg1->config = event->attr.config1; + reg_idx++; + } + /* + * The mbox only provides ability to perform address matching + * for the PLD events. + */ + if (reg_idx == 2) { + reg2->idx = EXTRA_REG_NHMEX_M_FILTER; + if (event->attr.config2 & NHMEX_M_PMON_MM_CFG_EN) + reg2->config = event->attr.config2; + else + reg2->config = ~0ULL; + if (box->pmu->pmu_idx == 0) + reg2->reg = NHMEX_M0_MSR_PMU_MM_CFG; + else + reg2->reg = NHMEX_M1_MSR_PMU_MM_CFG; + } + return 0; +} + +static u64 nhmex_mbox_shared_reg_config(struct intel_uncore_box *box, int idx) +{ + struct intel_uncore_extra_reg *er; + unsigned long flags; + u64 config; + + if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) + return box->shared_regs[idx].config; + + er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC]; + raw_spin_lock_irqsave(&er->lock, flags); + config = er->config; + raw_spin_unlock_irqrestore(&er->lock, flags); + return config; +} + +static void nhmex_mbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + int idx; + + idx = __BITS_VALUE(reg1->idx, 0, 8); + if (idx != 0xff) + wrmsrl(__BITS_VALUE(reg1->reg, 0, 16), + nhmex_mbox_shared_reg_config(box, idx)); + idx = __BITS_VALUE(reg1->idx, 1, 8); + if (idx != 0xff) + wrmsrl(__BITS_VALUE(reg1->reg, 1, 16), + nhmex_mbox_shared_reg_config(box, idx)); + + if (reg2->idx != EXTRA_REG_NONE) { + wrmsrl(reg2->reg, 0); + if (reg2->config != ~0ULL) { + wrmsrl(reg2->reg + 1, + reg2->config & NHMEX_M_PMON_ADDR_MATCH_MASK); + wrmsrl(reg2->reg + 2, NHMEX_M_PMON_ADDR_MASK_MASK & + (reg2->config >> NHMEX_M_PMON_ADDR_MASK_SHIFT)); + wrmsrl(reg2->reg, NHMEX_M_PMON_MM_CFG_EN); + } + } + + wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0); +} + +DEFINE_UNCORE_FORMAT_ATTR(count_mode, count_mode, "config:2-3"); +DEFINE_UNCORE_FORMAT_ATTR(storage_mode, storage_mode, "config:4-5"); +DEFINE_UNCORE_FORMAT_ATTR(wrap_mode, wrap_mode, "config:6"); +DEFINE_UNCORE_FORMAT_ATTR(flag_mode, flag_mode, "config:7"); +DEFINE_UNCORE_FORMAT_ATTR(inc_sel, inc_sel, "config:9-13"); +DEFINE_UNCORE_FORMAT_ATTR(set_flag_sel, set_flag_sel, "config:19-21"); +DEFINE_UNCORE_FORMAT_ATTR(filter_cfg_en, filter_cfg_en, "config2:63"); +DEFINE_UNCORE_FORMAT_ATTR(filter_match, filter_match, "config2:0-33"); +DEFINE_UNCORE_FORMAT_ATTR(filter_mask, filter_mask, "config2:34-61"); +DEFINE_UNCORE_FORMAT_ATTR(dsp, dsp, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(thr, thr, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(fvc, fvc, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(pgt, pgt, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(map, map, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(iss, iss, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(pld, pld, "config1:32-63"); + +static struct attribute *nhmex_uncore_mbox_formats_attr[] = { + &format_attr_count_mode.attr, + &format_attr_storage_mode.attr, + &format_attr_wrap_mode.attr, + &format_attr_flag_mode.attr, + &format_attr_inc_sel.attr, + &format_attr_set_flag_sel.attr, + &format_attr_filter_cfg_en.attr, + &format_attr_filter_match.attr, + &format_attr_filter_mask.attr, + &format_attr_dsp.attr, + &format_attr_thr.attr, + &format_attr_fvc.attr, + &format_attr_pgt.attr, + &format_attr_map.attr, + &format_attr_iss.attr, + &format_attr_pld.attr, + NULL, +}; + +static struct attribute_group nhmex_uncore_mbox_format_group = { + .name = "format", + .attrs = nhmex_uncore_mbox_formats_attr, +}; + +static struct uncore_event_desc nhmex_uncore_mbox_events[] = { + INTEL_UNCORE_EVENT_DESC(bbox_cmds_read, "inc_sel=0xd,fvc=0x2800"), + INTEL_UNCORE_EVENT_DESC(bbox_cmds_write, "inc_sel=0xd,fvc=0x2820"), + { /* end: all zeroes */ }, +}; + +static struct uncore_event_desc wsmex_uncore_mbox_events[] = { + INTEL_UNCORE_EVENT_DESC(bbox_cmds_read, "inc_sel=0xd,fvc=0x5000"), + INTEL_UNCORE_EVENT_DESC(bbox_cmds_write, "inc_sel=0xd,fvc=0x5040"), + { /* end: all zeroes */ }, +}; + +static struct intel_uncore_ops nhmex_uncore_mbox_ops = { + NHMEX_UNCORE_OPS_COMMON_INIT(), + .enable_event = nhmex_mbox_msr_enable_event, + .hw_config = nhmex_mbox_hw_config, + .get_constraint = nhmex_mbox_get_constraint, + .put_constraint = nhmex_mbox_put_constraint, +}; + +static struct intel_uncore_type nhmex_uncore_mbox = { + .name = "mbox", + .num_counters = 6, + .num_boxes = 2, + .perf_ctr_bits = 48, + .event_ctl = NHMEX_M0_MSR_PMU_CTL0, + .perf_ctr = NHMEX_M0_MSR_PMU_CNT0, + .event_mask = NHMEX_M_PMON_RAW_EVENT_MASK, + .box_ctl = NHMEX_M0_MSR_GLOBAL_CTL, + .msr_offset = NHMEX_M_MSR_OFFSET, + .pair_ctr_ctl = 1, + .num_shared_regs = 8, + .event_descs = nhmex_uncore_mbox_events, + .ops = &nhmex_uncore_mbox_ops, + .format_group = &nhmex_uncore_mbox_format_group, +}; + +static void nhmex_rbox_alter_er(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + + /* adjust the main event selector and extra register index */ + if (reg1->idx % 2) { + reg1->idx--; + hwc->config -= 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT; + } else { + reg1->idx++; + hwc->config += 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT; + } + + /* adjust extra register config */ + switch (reg1->idx % 6) { + case 2: + /* shift the 8~15 bits to the 0~7 bits */ + reg1->config >>= 8; + break; + case 3: + /* shift the 0~7 bits to the 8~15 bits */ + reg1->config <<= 8; + break; + }; +} + +/* + * Each rbox has 4 event set which monitor PQI port 0~3 or 4~7. + * An event set consists of 6 events, the 3rd and 4th events in + * an event set use the same extra register. So an event set uses + * 5 extra registers. + */ +static struct event_constraint * +nhmex_rbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + struct intel_uncore_extra_reg *er; + unsigned long flags; + int idx, er_idx; + u64 config1; + bool ok = false; + + if (!uncore_box_is_fake(box) && reg1->alloc) + return NULL; + + idx = reg1->idx % 6; + config1 = reg1->config; +again: + er_idx = idx; + /* the 3rd and 4th events use the same extra register */ + if (er_idx > 2) + er_idx--; + er_idx += (reg1->idx / 6) * 5; + + er = &box->shared_regs[er_idx]; + raw_spin_lock_irqsave(&er->lock, flags); + if (idx < 2) { + if (!atomic_read(&er->ref) || er->config == reg1->config) { + atomic_inc(&er->ref); + er->config = reg1->config; + ok = true; + } + } else if (idx == 2 || idx == 3) { + /* + * these two events use different fields in a extra register, + * the 0~7 bits and the 8~15 bits respectively. + */ + u64 mask = 0xff << ((idx - 2) * 8); + if (!__BITS_VALUE(atomic_read(&er->ref), idx - 2, 8) || + !((er->config ^ config1) & mask)) { + atomic_add(1 << ((idx - 2) * 8), &er->ref); + er->config &= ~mask; + er->config |= config1 & mask; + ok = true; + } + } else { + if (!atomic_read(&er->ref) || + (er->config == (hwc->config >> 32) && + er->config1 == reg1->config && + er->config2 == reg2->config)) { + atomic_inc(&er->ref); + er->config = (hwc->config >> 32); + er->config1 = reg1->config; + er->config2 = reg2->config; + ok = true; + } + } + raw_spin_unlock_irqrestore(&er->lock, flags); + + if (!ok) { + /* + * The Rbox events are always in pairs. The paired + * events are functional identical, but use different + * extra registers. If we failed to take an extra + * register, try the alternative. + */ + if (idx % 2) + idx--; + else + idx++; + if (idx != reg1->idx % 6) { + if (idx == 2) + config1 >>= 8; + else if (idx == 3) + config1 <<= 8; + goto again; + } + } else { + if (!uncore_box_is_fake(box)) { + if (idx != reg1->idx % 6) + nhmex_rbox_alter_er(box, event); + reg1->alloc = 1; + } + return NULL; + } + return &constraint_empty; +} + +static void nhmex_rbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + struct intel_uncore_extra_reg *er; + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + int idx, er_idx; + + if (uncore_box_is_fake(box) || !reg1->alloc) + return; + + idx = reg1->idx % 6; + er_idx = idx; + if (er_idx > 2) + er_idx--; + er_idx += (reg1->idx / 6) * 5; + + er = &box->shared_regs[er_idx]; + if (idx == 2 || idx == 3) + atomic_sub(1 << ((idx - 2) * 8), &er->ref); + else + atomic_dec(&er->ref); + + reg1->alloc = 0; +} + +static int nhmex_rbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct hw_perf_event_extra *reg2 = &event->hw.branch_reg; + int idx; + + idx = (event->hw.config & NHMEX_R_PMON_CTL_EV_SEL_MASK) >> + NHMEX_R_PMON_CTL_EV_SEL_SHIFT; + if (idx >= 0x18) + return -EINVAL; + + reg1->idx = idx; + reg1->config = event->attr.config1; + + switch (idx % 6) { + case 4: + case 5: + hwc->config |= event->attr.config & (~0ULL << 32); + reg2->config = event->attr.config2; + break; + }; + return 0; +} + +static void nhmex_rbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + int idx, port; + + idx = reg1->idx; + port = idx / 6 + box->pmu->pmu_idx * 4; + + switch (idx % 6) { + case 0: + wrmsrl(NHMEX_R_MSR_PORTN_IPERF_CFG0(port), reg1->config); + break; + case 1: + wrmsrl(NHMEX_R_MSR_PORTN_IPERF_CFG1(port), reg1->config); + break; + case 2: + case 3: + wrmsrl(NHMEX_R_MSR_PORTN_QLX_CFG(port), + uncore_shared_reg_config(box, 2 + (idx / 6) * 5)); + break; + case 4: + wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(port), + hwc->config >> 32); + wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MATCH(port), reg1->config); + wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MASK(port), reg2->config); + break; + case 5: + wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(port), + hwc->config >> 32); + wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MATCH(port), reg1->config); + wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MASK(port), reg2->config); + break; + }; + + wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 | + (hwc->config & NHMEX_R_PMON_CTL_EV_SEL_MASK)); +} + +DEFINE_UNCORE_FORMAT_ATTR(xbr_mm_cfg, xbr_mm_cfg, "config:32-63"); +DEFINE_UNCORE_FORMAT_ATTR(xbr_match, xbr_match, "config1:0-63"); +DEFINE_UNCORE_FORMAT_ATTR(xbr_mask, xbr_mask, "config2:0-63"); +DEFINE_UNCORE_FORMAT_ATTR(qlx_cfg, qlx_cfg, "config1:0-15"); +DEFINE_UNCORE_FORMAT_ATTR(iperf_cfg, iperf_cfg, "config1:0-31"); + +static struct attribute *nhmex_uncore_rbox_formats_attr[] = { + &format_attr_event5.attr, + &format_attr_xbr_mm_cfg.attr, + &format_attr_xbr_match.attr, + &format_attr_xbr_mask.attr, + &format_attr_qlx_cfg.attr, + &format_attr_iperf_cfg.attr, + NULL, +}; + +static struct attribute_group nhmex_uncore_rbox_format_group = { + .name = "format", + .attrs = nhmex_uncore_rbox_formats_attr, +}; + +static struct uncore_event_desc nhmex_uncore_rbox_events[] = { + INTEL_UNCORE_EVENT_DESC(qpi0_flit_send, "event=0x0,iperf_cfg=0x80000000"), + INTEL_UNCORE_EVENT_DESC(qpi1_filt_send, "event=0x6,iperf_cfg=0x80000000"), + INTEL_UNCORE_EVENT_DESC(qpi0_idle_filt, "event=0x0,iperf_cfg=0x40000000"), + INTEL_UNCORE_EVENT_DESC(qpi1_idle_filt, "event=0x6,iperf_cfg=0x40000000"), + INTEL_UNCORE_EVENT_DESC(qpi0_date_response, "event=0x0,iperf_cfg=0xc4"), + INTEL_UNCORE_EVENT_DESC(qpi1_date_response, "event=0x6,iperf_cfg=0xc4"), + { /* end: all zeroes */ }, +}; + +static struct intel_uncore_ops nhmex_uncore_rbox_ops = { + NHMEX_UNCORE_OPS_COMMON_INIT(), + .enable_event = nhmex_rbox_msr_enable_event, + .hw_config = nhmex_rbox_hw_config, + .get_constraint = nhmex_rbox_get_constraint, + .put_constraint = nhmex_rbox_put_constraint, +}; + +static struct intel_uncore_type nhmex_uncore_rbox = { + .name = "rbox", + .num_counters = 8, + .num_boxes = 2, + .perf_ctr_bits = 48, + .event_ctl = NHMEX_R_MSR_PMON_CTL0, + .perf_ctr = NHMEX_R_MSR_PMON_CNT0, + .event_mask = NHMEX_R_PMON_RAW_EVENT_MASK, + .box_ctl = NHMEX_R_MSR_GLOBAL_CTL, + .msr_offset = NHMEX_R_MSR_OFFSET, + .pair_ctr_ctl = 1, + .num_shared_regs = 20, + .event_descs = nhmex_uncore_rbox_events, + .ops = &nhmex_uncore_rbox_ops, + .format_group = &nhmex_uncore_rbox_format_group +}; + +static struct intel_uncore_type *nhmex_msr_uncores[] = { + &nhmex_uncore_ubox, + &nhmex_uncore_cbox, + &nhmex_uncore_bbox, + &nhmex_uncore_sbox, + &nhmex_uncore_mbox, + &nhmex_uncore_rbox, + &nhmex_uncore_wbox, + NULL, +}; +/* end of Nehalem-EX uncore support */ + +static void uncore_assign_hw_event(struct intel_uncore_box *box, struct perf_event *event, int idx) +{ + struct hw_perf_event *hwc = &event->hw; + + hwc->idx = idx; + hwc->last_tag = ++box->tags[idx]; + + if (hwc->idx == UNCORE_PMC_IDX_FIXED) { + hwc->event_base = uncore_fixed_ctr(box); + hwc->config_base = uncore_fixed_ctl(box); + return; + } + + hwc->config_base = uncore_event_ctl(box, hwc->idx); + hwc->event_base = uncore_perf_ctr(box, hwc->idx); +} + +static void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event) +{ + u64 prev_count, new_count, delta; + int shift; + + if (event->hw.idx >= UNCORE_PMC_IDX_FIXED) + shift = 64 - uncore_fixed_ctr_bits(box); + else + shift = 64 - uncore_perf_ctr_bits(box); + + /* the hrtimer might modify the previous event value */ +again: + prev_count = local64_read(&event->hw.prev_count); + new_count = uncore_read_counter(box, event); + if (local64_xchg(&event->hw.prev_count, new_count) != prev_count) + goto again; + + delta = (new_count << shift) - (prev_count << shift); + delta >>= shift; + + local64_add(delta, &event->count); +} + +/* + * The overflow interrupt is unavailable for SandyBridge-EP, is broken + * for SandyBridge. So we use hrtimer to periodically poll the counter + * to avoid overflow. + */ +static enum hrtimer_restart uncore_pmu_hrtimer(struct hrtimer *hrtimer) +{ + struct intel_uncore_box *box; + struct perf_event *event; + unsigned long flags; + int bit; + + box = container_of(hrtimer, struct intel_uncore_box, hrtimer); + if (!box->n_active || box->cpu != smp_processor_id()) + return HRTIMER_NORESTART; + /* + * disable local interrupt to prevent uncore_pmu_event_start/stop + * to interrupt the update process + */ + local_irq_save(flags); + + /* + * handle boxes with an active event list as opposed to active + * counters + */ + list_for_each_entry(event, &box->active_list, active_entry) { + uncore_perf_event_update(box, event); + } + + for_each_set_bit(bit, box->active_mask, UNCORE_PMC_IDX_MAX) + uncore_perf_event_update(box, box->events[bit]); + + local_irq_restore(flags); + + hrtimer_forward_now(hrtimer, ns_to_ktime(box->hrtimer_duration)); + return HRTIMER_RESTART; +} + +static void uncore_pmu_start_hrtimer(struct intel_uncore_box *box) +{ + __hrtimer_start_range_ns(&box->hrtimer, + ns_to_ktime(box->hrtimer_duration), 0, + HRTIMER_MODE_REL_PINNED, 0); +} + +static void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box) +{ + hrtimer_cancel(&box->hrtimer); +} + +static void uncore_pmu_init_hrtimer(struct intel_uncore_box *box) +{ + hrtimer_init(&box->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + box->hrtimer.function = uncore_pmu_hrtimer; +} + +static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, int node) +{ + struct intel_uncore_box *box; + int i, size; + + size = sizeof(*box) + type->num_shared_regs * sizeof(struct intel_uncore_extra_reg); + + box = kzalloc_node(size, GFP_KERNEL, node); + if (!box) + return NULL; + + for (i = 0; i < type->num_shared_regs; i++) + raw_spin_lock_init(&box->shared_regs[i].lock); + + uncore_pmu_init_hrtimer(box); + atomic_set(&box->refcnt, 1); + box->cpu = -1; + box->phys_id = -1; + + /* set default hrtimer timeout */ + box->hrtimer_duration = UNCORE_PMU_HRTIMER_INTERVAL; + + INIT_LIST_HEAD(&box->active_list); + + return box; +} + +static int +uncore_collect_events(struct intel_uncore_box *box, struct perf_event *leader, bool dogrp) +{ + struct perf_event *event; + int n, max_count; + + max_count = box->pmu->type->num_counters; + if (box->pmu->type->fixed_ctl) + max_count++; + + if (box->n_events >= max_count) + return -EINVAL; + + n = box->n_events; + box->event_list[n] = leader; + n++; + if (!dogrp) + return n; + + list_for_each_entry(event, &leader->sibling_list, group_entry) { + if (event->state <= PERF_EVENT_STATE_OFF) + continue; + + if (n >= max_count) + return -EINVAL; + + box->event_list[n] = event; + n++; + } + return n; +} + +static struct event_constraint * +uncore_get_event_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + struct intel_uncore_type *type = box->pmu->type; + struct event_constraint *c; + + if (type->ops->get_constraint) { + c = type->ops->get_constraint(box, event); + if (c) + return c; + } + + if (event->attr.config == UNCORE_FIXED_EVENT) + return &constraint_fixed; + + if (type->constraints) { + for_each_event_constraint(c, type->constraints) { + if ((event->hw.config & c->cmask) == c->code) + return c; + } + } + + return &type->unconstrainted; +} + +static void uncore_put_event_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + if (box->pmu->type->ops->put_constraint) + box->pmu->type->ops->put_constraint(box, event); +} + +static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int n) +{ + unsigned long used_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)]; + struct event_constraint *c; + int i, wmin, wmax, ret = 0; + struct hw_perf_event *hwc; + + bitmap_zero(used_mask, UNCORE_PMC_IDX_MAX); + + for (i = 0, wmin = UNCORE_PMC_IDX_MAX, wmax = 0; i < n; i++) { + hwc = &box->event_list[i]->hw; + c = uncore_get_event_constraint(box, box->event_list[i]); + hwc->constraint = c; + wmin = min(wmin, c->weight); + wmax = max(wmax, c->weight); + } + + /* fastpath, try to reuse previous register */ + for (i = 0; i < n; i++) { + hwc = &box->event_list[i]->hw; + c = hwc->constraint; + + /* never assigned */ + if (hwc->idx == -1) + break; + + /* constraint still honored */ + if (!test_bit(hwc->idx, c->idxmsk)) + break; + + /* not already used */ + if (test_bit(hwc->idx, used_mask)) + break; + + __set_bit(hwc->idx, used_mask); + if (assign) + assign[i] = hwc->idx; + } + /* slow path */ + if (i != n) + ret = perf_assign_events(box->event_list, n, + wmin, wmax, assign); + + if (!assign || ret) { + for (i = 0; i < n; i++) + uncore_put_event_constraint(box, box->event_list[i]); + } + return ret ? -EINVAL : 0; +} + +static void uncore_pmu_event_start(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + int idx = event->hw.idx; + + if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) + return; + + if (WARN_ON_ONCE(idx == -1 || idx >= UNCORE_PMC_IDX_MAX)) + return; + + event->hw.state = 0; + box->events[idx] = event; + box->n_active++; + __set_bit(idx, box->active_mask); + + local64_set(&event->hw.prev_count, uncore_read_counter(box, event)); + uncore_enable_event(box, event); + + if (box->n_active == 1) { + uncore_enable_box(box); + uncore_pmu_start_hrtimer(box); + } +} + +static void uncore_pmu_event_stop(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + struct hw_perf_event *hwc = &event->hw; + + if (__test_and_clear_bit(hwc->idx, box->active_mask)) { + uncore_disable_event(box, event); + box->n_active--; + box->events[hwc->idx] = NULL; + WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); + hwc->state |= PERF_HES_STOPPED; + + if (box->n_active == 0) { + uncore_disable_box(box); + uncore_pmu_cancel_hrtimer(box); + } + } + + if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { + /* + * Drain the remaining delta count out of a event + * that we are disabling: + */ + uncore_perf_event_update(box, event); + hwc->state |= PERF_HES_UPTODATE; + } +} + +static int uncore_pmu_event_add(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + struct hw_perf_event *hwc = &event->hw; + int assign[UNCORE_PMC_IDX_MAX]; + int i, n, ret; + + if (!box) + return -ENODEV; + + ret = n = uncore_collect_events(box, event, false); + if (ret < 0) + return ret; + + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + if (!(flags & PERF_EF_START)) + hwc->state |= PERF_HES_ARCH; + + ret = uncore_assign_events(box, assign, n); + if (ret) + return ret; + + /* save events moving to new counters */ + for (i = 0; i < box->n_events; i++) { + event = box->event_list[i]; + hwc = &event->hw; + + if (hwc->idx == assign[i] && + hwc->last_tag == box->tags[assign[i]]) + continue; + /* + * Ensure we don't accidentally enable a stopped + * counter simply because we rescheduled. + */ + if (hwc->state & PERF_HES_STOPPED) + hwc->state |= PERF_HES_ARCH; + + uncore_pmu_event_stop(event, PERF_EF_UPDATE); + } + + /* reprogram moved events into new counters */ + for (i = 0; i < n; i++) { + event = box->event_list[i]; + hwc = &event->hw; + + if (hwc->idx != assign[i] || + hwc->last_tag != box->tags[assign[i]]) + uncore_assign_hw_event(box, event, assign[i]); + else if (i < box->n_events) + continue; + + if (hwc->state & PERF_HES_ARCH) + continue; + + uncore_pmu_event_start(event, 0); + } + box->n_events = n; + + return 0; +} + +static void uncore_pmu_event_del(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + int i; + + uncore_pmu_event_stop(event, PERF_EF_UPDATE); + + for (i = 0; i < box->n_events; i++) { + if (event == box->event_list[i]) { + uncore_put_event_constraint(box, event); + + while (++i < box->n_events) + box->event_list[i - 1] = box->event_list[i]; + + --box->n_events; + break; + } + } + + event->hw.idx = -1; + event->hw.last_tag = ~0ULL; +} + +static void uncore_pmu_event_read(struct perf_event *event) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + uncore_perf_event_update(box, event); +} + +/* + * validation ensures the group can be loaded onto the + * PMU if it was the only group available. + */ +static int uncore_validate_group(struct intel_uncore_pmu *pmu, + struct perf_event *event) +{ + struct perf_event *leader = event->group_leader; + struct intel_uncore_box *fake_box; + int ret = -EINVAL, n; + + fake_box = uncore_alloc_box(pmu->type, NUMA_NO_NODE); + if (!fake_box) + return -ENOMEM; + + fake_box->pmu = pmu; + /* + * the event is not yet connected with its + * siblings therefore we must first collect + * existing siblings, then add the new event + * before we can simulate the scheduling + */ + n = uncore_collect_events(fake_box, leader, true); + if (n < 0) + goto out; + + fake_box->n_events = n; + n = uncore_collect_events(fake_box, event, false); + if (n < 0) + goto out; + + fake_box->n_events = n; + + ret = uncore_assign_events(fake_box, NULL, n); +out: + kfree(fake_box); + return ret; +} + +static int uncore_pmu_event_init(struct perf_event *event) +{ + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + struct hw_perf_event *hwc = &event->hw; + int ret; + + if (event->attr.type != event->pmu->type) + return -ENOENT; + + pmu = uncore_event_to_pmu(event); + /* no device found for this pmu */ + if (pmu->func_id < 0) + return -ENOENT; + + /* + * Uncore PMU does measure at all privilege level all the time. + * So it doesn't make sense to specify any exclude bits. + */ + if (event->attr.exclude_user || event->attr.exclude_kernel || + event->attr.exclude_hv || event->attr.exclude_idle) + return -EINVAL; + + /* Sampling not supported yet */ + if (hwc->sample_period) + return -EINVAL; + + /* + * Place all uncore events for a particular physical package + * onto a single cpu + */ + if (event->cpu < 0) + return -EINVAL; + box = uncore_pmu_to_box(pmu, event->cpu); + if (!box || box->cpu < 0) + return -EINVAL; + event->cpu = box->cpu; + + event->hw.idx = -1; + event->hw.last_tag = ~0ULL; + event->hw.extra_reg.idx = EXTRA_REG_NONE; + event->hw.branch_reg.idx = EXTRA_REG_NONE; + + if (event->attr.config == UNCORE_FIXED_EVENT) { + /* no fixed counter */ + if (!pmu->type->fixed_ctl) + return -EINVAL; + /* + * if there is only one fixed counter, only the first pmu + * can access the fixed counter + */ + if (pmu->type->single_fixed && pmu->pmu_idx > 0) + return -EINVAL; + + /* fixed counters have event field hardcoded to zero */ + hwc->config = 0ULL; + } else { + hwc->config = event->attr.config & pmu->type->event_mask; + if (pmu->type->ops->hw_config) { + ret = pmu->type->ops->hw_config(box, event); + if (ret) + return ret; + } + } + + if (event->group_leader != event) + ret = uncore_validate_group(pmu, event); + else + ret = 0; + + return ret; +} + +static ssize_t uncore_get_attr_cpumask(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int n = cpulist_scnprintf(buf, PAGE_SIZE - 2, &uncore_cpu_mask); + + buf[n++] = '\n'; + buf[n] = '\0'; + return n; +} + +static DEVICE_ATTR(cpumask, S_IRUGO, uncore_get_attr_cpumask, NULL); + +static struct attribute *uncore_pmu_attrs[] = { + &dev_attr_cpumask.attr, + NULL, +}; + +static struct attribute_group uncore_pmu_attr_group = { + .attrs = uncore_pmu_attrs, +}; + +static int __init uncore_pmu_register(struct intel_uncore_pmu *pmu) +{ + int ret; + + if (!pmu->type->pmu) { + pmu->pmu = (struct pmu) { + .attr_groups = pmu->type->attr_groups, + .task_ctx_nr = perf_invalid_context, + .event_init = uncore_pmu_event_init, + .add = uncore_pmu_event_add, + .del = uncore_pmu_event_del, + .start = uncore_pmu_event_start, + .stop = uncore_pmu_event_stop, + .read = uncore_pmu_event_read, + }; + } else { + pmu->pmu = *pmu->type->pmu; + pmu->pmu.attr_groups = pmu->type->attr_groups; + } + + if (pmu->type->num_boxes == 1) { + if (strlen(pmu->type->name) > 0) + sprintf(pmu->name, "uncore_%s", pmu->type->name); + else + sprintf(pmu->name, "uncore"); + } else { + sprintf(pmu->name, "uncore_%s_%d", pmu->type->name, + pmu->pmu_idx); + } + + ret = perf_pmu_register(&pmu->pmu, pmu->name, -1); + return ret; +} + +static void __init uncore_type_exit(struct intel_uncore_type *type) +{ + int i; + + for (i = 0; i < type->num_boxes; i++) + free_percpu(type->pmus[i].box); + kfree(type->pmus); + type->pmus = NULL; + kfree(type->events_group); + type->events_group = NULL; +} + +static void __init uncore_types_exit(struct intel_uncore_type **types) +{ + int i; + for (i = 0; types[i]; i++) + uncore_type_exit(types[i]); +} + +static int __init uncore_type_init(struct intel_uncore_type *type) +{ + struct intel_uncore_pmu *pmus; + struct attribute_group *attr_group; + struct attribute **attrs; + int i, j; + + pmus = kzalloc(sizeof(*pmus) * type->num_boxes, GFP_KERNEL); + if (!pmus) + return -ENOMEM; + + type->pmus = pmus; + + type->unconstrainted = (struct event_constraint) + __EVENT_CONSTRAINT(0, (1ULL << type->num_counters) - 1, + 0, type->num_counters, 0, 0); + + for (i = 0; i < type->num_boxes; i++) { + pmus[i].func_id = -1; + pmus[i].pmu_idx = i; + pmus[i].type = type; + INIT_LIST_HEAD(&pmus[i].box_list); + pmus[i].box = alloc_percpu(struct intel_uncore_box *); + if (!pmus[i].box) + goto fail; + } + + if (type->event_descs) { + i = 0; + while (type->event_descs[i].attr.attr.name) + i++; + + attr_group = kzalloc(sizeof(struct attribute *) * (i + 1) + + sizeof(*attr_group), GFP_KERNEL); + if (!attr_group) + goto fail; + + attrs = (struct attribute **)(attr_group + 1); + attr_group->name = "events"; + attr_group->attrs = attrs; + + for (j = 0; j < i; j++) + attrs[j] = &type->event_descs[j].attr.attr; + + type->events_group = attr_group; + } + + type->pmu_group = &uncore_pmu_attr_group; + return 0; +fail: + uncore_type_exit(type); + return -ENOMEM; +} + +static int __init uncore_types_init(struct intel_uncore_type **types) +{ + int i, ret; + + for (i = 0; types[i]; i++) { + ret = uncore_type_init(types[i]); + if (ret) + goto fail; + } + return 0; +fail: + while (--i >= 0) + uncore_type_exit(types[i]); + return ret; +} + +static struct pci_driver *uncore_pci_driver; +static bool pcidrv_registered; + +/* + * add a pci uncore device + */ +static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) +{ + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + struct intel_uncore_type *type; + int phys_id; + + phys_id = pcibus_to_physid[pdev->bus->number]; + if (phys_id < 0) + return -ENODEV; + + if (UNCORE_PCI_DEV_TYPE(id->driver_data) == UNCORE_EXTRA_PCI_DEV) { + extra_pci_dev[phys_id][UNCORE_PCI_DEV_IDX(id->driver_data)] = pdev; + pci_set_drvdata(pdev, NULL); + return 0; + } + + type = pci_uncores[UNCORE_PCI_DEV_TYPE(id->driver_data)]; + box = uncore_alloc_box(type, NUMA_NO_NODE); + if (!box) + return -ENOMEM; + + /* + * for performance monitoring unit with multiple boxes, + * each box has a different function id. + */ + pmu = &type->pmus[UNCORE_PCI_DEV_IDX(id->driver_data)]; + if (pmu->func_id < 0) + pmu->func_id = pdev->devfn; + else + WARN_ON_ONCE(pmu->func_id != pdev->devfn); + + box->phys_id = phys_id; + box->pci_dev = pdev; + box->pmu = pmu; + uncore_box_init(box); + pci_set_drvdata(pdev, box); + + raw_spin_lock(&uncore_box_lock); + list_add_tail(&box->list, &pmu->box_list); + raw_spin_unlock(&uncore_box_lock); + + return 0; +} + +static void uncore_pci_remove(struct pci_dev *pdev) +{ + struct intel_uncore_box *box = pci_get_drvdata(pdev); + struct intel_uncore_pmu *pmu; + int i, cpu, phys_id = pcibus_to_physid[pdev->bus->number]; + + box = pci_get_drvdata(pdev); + if (!box) { + for (i = 0; i < UNCORE_EXTRA_PCI_DEV_MAX; i++) { + if (extra_pci_dev[phys_id][i] == pdev) { + extra_pci_dev[phys_id][i] = NULL; + break; + } + } + WARN_ON_ONCE(i >= UNCORE_EXTRA_PCI_DEV_MAX); + return; + } + + pmu = box->pmu; + if (WARN_ON_ONCE(phys_id != box->phys_id)) + return; + + pci_set_drvdata(pdev, NULL); + + raw_spin_lock(&uncore_box_lock); + list_del(&box->list); + raw_spin_unlock(&uncore_box_lock); + + for_each_possible_cpu(cpu) { + if (*per_cpu_ptr(pmu->box, cpu) == box) { + *per_cpu_ptr(pmu->box, cpu) = NULL; + atomic_dec(&box->refcnt); + } + } + + WARN_ON_ONCE(atomic_read(&box->refcnt) != 1); + kfree(box); +} + +static int __init uncore_pci_init(void) +{ + int ret; + + switch (boot_cpu_data.x86_model) { + case 45: /* Sandy Bridge-EP */ + ret = snbep_pci2phy_map_init(0x3ce0); + if (ret) + return ret; + pci_uncores = snbep_pci_uncores; + uncore_pci_driver = &snbep_uncore_pci_driver; + break; + case 62: /* IvyTown */ + ret = snbep_pci2phy_map_init(0x0e1e); + if (ret) + return ret; + pci_uncores = ivt_pci_uncores; + uncore_pci_driver = &ivt_uncore_pci_driver; + break; + case 42: /* Sandy Bridge */ + ret = snb_pci2phy_map_init(PCI_DEVICE_ID_INTEL_SNB_IMC); + if (ret) + return ret; + pci_uncores = snb_pci_uncores; + uncore_pci_driver = &snb_uncore_pci_driver; + break; + case 58: /* Ivy Bridge */ + ret = snb_pci2phy_map_init(PCI_DEVICE_ID_INTEL_IVB_IMC); + if (ret) + return ret; + pci_uncores = snb_pci_uncores; + uncore_pci_driver = &ivb_uncore_pci_driver; + break; + case 60: /* Haswell */ + case 69: /* Haswell Celeron */ + ret = snb_pci2phy_map_init(PCI_DEVICE_ID_INTEL_HSW_IMC); + if (ret) + return ret; + pci_uncores = snb_pci_uncores; + uncore_pci_driver = &hsw_uncore_pci_driver; + break; + default: + return 0; + } + + ret = uncore_types_init(pci_uncores); + if (ret) + return ret; + + uncore_pci_driver->probe = uncore_pci_probe; + uncore_pci_driver->remove = uncore_pci_remove; + + ret = pci_register_driver(uncore_pci_driver); + if (ret == 0) + pcidrv_registered = true; + else + uncore_types_exit(pci_uncores); + + return ret; +} + +static void __init uncore_pci_exit(void) +{ + if (pcidrv_registered) { + pcidrv_registered = false; + pci_unregister_driver(uncore_pci_driver); + uncore_types_exit(pci_uncores); + } +} + +/* CPU hot plug/unplug are serialized by cpu_add_remove_lock mutex */ +static LIST_HEAD(boxes_to_free); + +static void uncore_kfree_boxes(void) +{ + struct intel_uncore_box *box; + + while (!list_empty(&boxes_to_free)) { + box = list_entry(boxes_to_free.next, + struct intel_uncore_box, list); + list_del(&box->list); + kfree(box); + } +} + +static void uncore_cpu_dying(int cpu) +{ + struct intel_uncore_type *type; + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + int i, j; + + for (i = 0; msr_uncores[i]; i++) { + type = msr_uncores[i]; + for (j = 0; j < type->num_boxes; j++) { + pmu = &type->pmus[j]; + box = *per_cpu_ptr(pmu->box, cpu); + *per_cpu_ptr(pmu->box, cpu) = NULL; + if (box && atomic_dec_and_test(&box->refcnt)) + list_add(&box->list, &boxes_to_free); + } + } +} + +static int uncore_cpu_starting(int cpu) +{ + struct intel_uncore_type *type; + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box, *exist; + int i, j, k, phys_id; + + phys_id = topology_physical_package_id(cpu); + + for (i = 0; msr_uncores[i]; i++) { + type = msr_uncores[i]; + for (j = 0; j < type->num_boxes; j++) { + pmu = &type->pmus[j]; + box = *per_cpu_ptr(pmu->box, cpu); + /* called by uncore_cpu_init? */ + if (box && box->phys_id >= 0) { + uncore_box_init(box); + continue; + } + + for_each_online_cpu(k) { + exist = *per_cpu_ptr(pmu->box, k); + if (exist && exist->phys_id == phys_id) { + atomic_inc(&exist->refcnt); + *per_cpu_ptr(pmu->box, cpu) = exist; + if (box) { + list_add(&box->list, + &boxes_to_free); + box = NULL; + } + break; + } + } + + if (box) { + box->phys_id = phys_id; + uncore_box_init(box); + } + } + } + return 0; +} + +static int uncore_cpu_prepare(int cpu, int phys_id) +{ + struct intel_uncore_type *type; + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + int i, j; + + for (i = 0; msr_uncores[i]; i++) { + type = msr_uncores[i]; + for (j = 0; j < type->num_boxes; j++) { + pmu = &type->pmus[j]; + if (pmu->func_id < 0) + pmu->func_id = j; + + box = uncore_alloc_box(type, cpu_to_node(cpu)); + if (!box) + return -ENOMEM; + + box->pmu = pmu; + box->phys_id = phys_id; + *per_cpu_ptr(pmu->box, cpu) = box; + } + } + return 0; +} + +static void +uncore_change_context(struct intel_uncore_type **uncores, int old_cpu, int new_cpu) +{ + struct intel_uncore_type *type; + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + int i, j; + + for (i = 0; uncores[i]; i++) { + type = uncores[i]; + for (j = 0; j < type->num_boxes; j++) { + pmu = &type->pmus[j]; + if (old_cpu < 0) + box = uncore_pmu_to_box(pmu, new_cpu); + else + box = uncore_pmu_to_box(pmu, old_cpu); + if (!box) + continue; + + if (old_cpu < 0) { + WARN_ON_ONCE(box->cpu != -1); + box->cpu = new_cpu; + continue; + } + + WARN_ON_ONCE(box->cpu != old_cpu); + if (new_cpu >= 0) { + uncore_pmu_cancel_hrtimer(box); + perf_pmu_migrate_context(&pmu->pmu, + old_cpu, new_cpu); + box->cpu = new_cpu; + } else { + box->cpu = -1; + } + } + } +} + +static void uncore_event_exit_cpu(int cpu) +{ + int i, phys_id, target; + + /* if exiting cpu is used for collecting uncore events */ + if (!cpumask_test_and_clear_cpu(cpu, &uncore_cpu_mask)) + return; + + /* find a new cpu to collect uncore events */ + phys_id = topology_physical_package_id(cpu); + target = -1; + for_each_online_cpu(i) { + if (i == cpu) + continue; + if (phys_id == topology_physical_package_id(i)) { + target = i; + break; + } + } + + /* migrate uncore events to the new cpu */ + if (target >= 0) + cpumask_set_cpu(target, &uncore_cpu_mask); + + uncore_change_context(msr_uncores, cpu, target); + uncore_change_context(pci_uncores, cpu, target); +} + +static void uncore_event_init_cpu(int cpu) +{ + int i, phys_id; + + phys_id = topology_physical_package_id(cpu); + for_each_cpu(i, &uncore_cpu_mask) { + if (phys_id == topology_physical_package_id(i)) + return; + } + + cpumask_set_cpu(cpu, &uncore_cpu_mask); + + uncore_change_context(msr_uncores, -1, cpu); + uncore_change_context(pci_uncores, -1, cpu); +} + +static int uncore_cpu_notifier(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (long)hcpu; + + /* allocate/free data structure for uncore box */ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_UP_PREPARE: + uncore_cpu_prepare(cpu, -1); + break; + case CPU_STARTING: + uncore_cpu_starting(cpu); + break; + case CPU_UP_CANCELED: + case CPU_DYING: + uncore_cpu_dying(cpu); + break; + case CPU_ONLINE: + case CPU_DEAD: + uncore_kfree_boxes(); + break; + default: + break; + } + + /* select the cpu that collects uncore events */ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DOWN_FAILED: + case CPU_STARTING: + uncore_event_init_cpu(cpu); + break; + case CPU_DOWN_PREPARE: + uncore_event_exit_cpu(cpu); + break; + default: + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block uncore_cpu_nb = { + .notifier_call = uncore_cpu_notifier, + /* + * to migrate uncore events, our notifier should be executed + * before perf core's notifier. + */ + .priority = CPU_PRI_PERF + 1, +}; + +static void __init uncore_cpu_setup(void *dummy) +{ + uncore_cpu_starting(smp_processor_id()); +} + +static int __init uncore_cpu_init(void) +{ + int ret, max_cores; + + max_cores = boot_cpu_data.x86_max_cores; + switch (boot_cpu_data.x86_model) { + case 26: /* Nehalem */ + case 30: + case 37: /* Westmere */ + case 44: + msr_uncores = nhm_msr_uncores; + break; + case 42: /* Sandy Bridge */ + case 58: /* Ivy Bridge */ + if (snb_uncore_cbox.num_boxes > max_cores) + snb_uncore_cbox.num_boxes = max_cores; + msr_uncores = snb_msr_uncores; + break; + case 45: /* Sandy Bridge-EP */ + if (snbep_uncore_cbox.num_boxes > max_cores) + snbep_uncore_cbox.num_boxes = max_cores; + msr_uncores = snbep_msr_uncores; + break; + case 46: /* Nehalem-EX */ + uncore_nhmex = true; + case 47: /* Westmere-EX aka. Xeon E7 */ + if (!uncore_nhmex) + nhmex_uncore_mbox.event_descs = wsmex_uncore_mbox_events; + if (nhmex_uncore_cbox.num_boxes > max_cores) + nhmex_uncore_cbox.num_boxes = max_cores; + msr_uncores = nhmex_msr_uncores; + break; + case 62: /* IvyTown */ + if (ivt_uncore_cbox.num_boxes > max_cores) + ivt_uncore_cbox.num_boxes = max_cores; + msr_uncores = ivt_msr_uncores; + break; + + default: + return 0; + } + + ret = uncore_types_init(msr_uncores); + if (ret) + return ret; + + return 0; +} + +static int __init uncore_pmus_register(void) +{ + struct intel_uncore_pmu *pmu; + struct intel_uncore_type *type; + int i, j; + + for (i = 0; msr_uncores[i]; i++) { + type = msr_uncores[i]; + for (j = 0; j < type->num_boxes; j++) { + pmu = &type->pmus[j]; + uncore_pmu_register(pmu); + } + } + + for (i = 0; pci_uncores[i]; i++) { + type = pci_uncores[i]; + for (j = 0; j < type->num_boxes; j++) { + pmu = &type->pmus[j]; + uncore_pmu_register(pmu); + } + } + + return 0; +} + +static void __init uncore_cpumask_init(void) +{ + int cpu; + + /* + * ony invoke once from msr or pci init code + */ + if (!cpumask_empty(&uncore_cpu_mask)) + return; + + cpu_notifier_register_begin(); + + for_each_online_cpu(cpu) { + int i, phys_id = topology_physical_package_id(cpu); + + for_each_cpu(i, &uncore_cpu_mask) { + if (phys_id == topology_physical_package_id(i)) { + phys_id = -1; + break; + } + } + if (phys_id < 0) + continue; + + uncore_cpu_prepare(cpu, phys_id); + uncore_event_init_cpu(cpu); + } + on_each_cpu(uncore_cpu_setup, NULL, 1); + + __register_cpu_notifier(&uncore_cpu_nb); + + cpu_notifier_register_done(); +} + + +static int __init intel_uncore_init(void) +{ + int ret; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return -ENODEV; + + if (cpu_has_hypervisor) + return -ENODEV; + + ret = uncore_pci_init(); + if (ret) + goto fail; + ret = uncore_cpu_init(); + if (ret) { + uncore_pci_exit(); + goto fail; + } + uncore_cpumask_init(); + + uncore_pmus_register(); + return 0; +fail: + return ret; +} +device_initcall(intel_uncore_init); diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h new file mode 100644 index 00000000000..90236f0c94a --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h @@ -0,0 +1,696 @@ +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/pci.h> +#include <linux/perf_event.h> +#include "perf_event.h" + +#define UNCORE_PMU_NAME_LEN 32 +#define UNCORE_PMU_HRTIMER_INTERVAL (60LL * NSEC_PER_SEC) +#define UNCORE_SNB_IMC_HRTIMER_INTERVAL (5ULL * NSEC_PER_SEC) + +#define UNCORE_FIXED_EVENT 0xff +#define UNCORE_PMC_IDX_MAX_GENERIC 8 +#define UNCORE_PMC_IDX_FIXED UNCORE_PMC_IDX_MAX_GENERIC +#define UNCORE_PMC_IDX_MAX (UNCORE_PMC_IDX_FIXED + 1) + +#define UNCORE_PCI_DEV_DATA(type, idx) ((type << 8) | idx) +#define UNCORE_PCI_DEV_TYPE(data) ((data >> 8) & 0xff) +#define UNCORE_PCI_DEV_IDX(data) (data & 0xff) +#define UNCORE_EXTRA_PCI_DEV 0xff +#define UNCORE_EXTRA_PCI_DEV_MAX 2 + +/* support up to 8 sockets */ +#define UNCORE_SOCKET_MAX 8 + +#define UNCORE_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, 0xff) + +/* SNB event control */ +#define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff +#define SNB_UNC_CTL_UMASK_MASK 0x0000ff00 +#define SNB_UNC_CTL_EDGE_DET (1 << 18) +#define SNB_UNC_CTL_EN (1 << 22) +#define SNB_UNC_CTL_INVERT (1 << 23) +#define SNB_UNC_CTL_CMASK_MASK 0x1f000000 +#define NHM_UNC_CTL_CMASK_MASK 0xff000000 +#define NHM_UNC_FIXED_CTR_CTL_EN (1 << 0) + +#define SNB_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \ + SNB_UNC_CTL_UMASK_MASK | \ + SNB_UNC_CTL_EDGE_DET | \ + SNB_UNC_CTL_INVERT | \ + SNB_UNC_CTL_CMASK_MASK) + +#define NHM_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \ + SNB_UNC_CTL_UMASK_MASK | \ + SNB_UNC_CTL_EDGE_DET | \ + SNB_UNC_CTL_INVERT | \ + NHM_UNC_CTL_CMASK_MASK) + +/* SNB global control register */ +#define SNB_UNC_PERF_GLOBAL_CTL 0x391 +#define SNB_UNC_FIXED_CTR_CTRL 0x394 +#define SNB_UNC_FIXED_CTR 0x395 + +/* SNB uncore global control */ +#define SNB_UNC_GLOBAL_CTL_CORE_ALL ((1 << 4) - 1) +#define SNB_UNC_GLOBAL_CTL_EN (1 << 29) + +/* SNB Cbo register */ +#define SNB_UNC_CBO_0_PERFEVTSEL0 0x700 +#define SNB_UNC_CBO_0_PER_CTR0 0x706 +#define SNB_UNC_CBO_MSR_OFFSET 0x10 + +/* NHM global control register */ +#define NHM_UNC_PERF_GLOBAL_CTL 0x391 +#define NHM_UNC_FIXED_CTR 0x394 +#define NHM_UNC_FIXED_CTR_CTRL 0x395 + +/* NHM uncore global control */ +#define NHM_UNC_GLOBAL_CTL_EN_PC_ALL ((1ULL << 8) - 1) +#define NHM_UNC_GLOBAL_CTL_EN_FC (1ULL << 32) + +/* NHM uncore register */ +#define NHM_UNC_PERFEVTSEL0 0x3c0 +#define NHM_UNC_UNCORE_PMC0 0x3b0 + +/* SNB-EP Box level control */ +#define SNBEP_PMON_BOX_CTL_RST_CTRL (1 << 0) +#define SNBEP_PMON_BOX_CTL_RST_CTRS (1 << 1) +#define SNBEP_PMON_BOX_CTL_FRZ (1 << 8) +#define SNBEP_PMON_BOX_CTL_FRZ_EN (1 << 16) +#define SNBEP_PMON_BOX_CTL_INT (SNBEP_PMON_BOX_CTL_RST_CTRL | \ + SNBEP_PMON_BOX_CTL_RST_CTRS | \ + SNBEP_PMON_BOX_CTL_FRZ_EN) +/* SNB-EP event control */ +#define SNBEP_PMON_CTL_EV_SEL_MASK 0x000000ff +#define SNBEP_PMON_CTL_UMASK_MASK 0x0000ff00 +#define SNBEP_PMON_CTL_RST (1 << 17) +#define SNBEP_PMON_CTL_EDGE_DET (1 << 18) +#define SNBEP_PMON_CTL_EV_SEL_EXT (1 << 21) +#define SNBEP_PMON_CTL_EN (1 << 22) +#define SNBEP_PMON_CTL_INVERT (1 << 23) +#define SNBEP_PMON_CTL_TRESH_MASK 0xff000000 +#define SNBEP_PMON_RAW_EVENT_MASK (SNBEP_PMON_CTL_EV_SEL_MASK | \ + SNBEP_PMON_CTL_UMASK_MASK | \ + SNBEP_PMON_CTL_EDGE_DET | \ + SNBEP_PMON_CTL_INVERT | \ + SNBEP_PMON_CTL_TRESH_MASK) + +/* SNB-EP Ubox event control */ +#define SNBEP_U_MSR_PMON_CTL_TRESH_MASK 0x1f000000 +#define SNBEP_U_MSR_PMON_RAW_EVENT_MASK \ + (SNBEP_PMON_CTL_EV_SEL_MASK | \ + SNBEP_PMON_CTL_UMASK_MASK | \ + SNBEP_PMON_CTL_EDGE_DET | \ + SNBEP_PMON_CTL_INVERT | \ + SNBEP_U_MSR_PMON_CTL_TRESH_MASK) + +#define SNBEP_CBO_PMON_CTL_TID_EN (1 << 19) +#define SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK (SNBEP_PMON_RAW_EVENT_MASK | \ + SNBEP_CBO_PMON_CTL_TID_EN) + +/* SNB-EP PCU event control */ +#define SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK 0x0000c000 +#define SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK 0x1f000000 +#define SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT (1 << 30) +#define SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET (1 << 31) +#define SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK \ + (SNBEP_PMON_CTL_EV_SEL_MASK | \ + SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \ + SNBEP_PMON_CTL_EDGE_DET | \ + SNBEP_PMON_CTL_EV_SEL_EXT | \ + SNBEP_PMON_CTL_INVERT | \ + SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \ + SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \ + SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET) + +#define SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK \ + (SNBEP_PMON_RAW_EVENT_MASK | \ + SNBEP_PMON_CTL_EV_SEL_EXT) + +/* SNB-EP pci control register */ +#define SNBEP_PCI_PMON_BOX_CTL 0xf4 +#define SNBEP_PCI_PMON_CTL0 0xd8 +/* SNB-EP pci counter register */ +#define SNBEP_PCI_PMON_CTR0 0xa0 + +/* SNB-EP home agent register */ +#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH0 0x40 +#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH1 0x44 +#define SNBEP_HA_PCI_PMON_BOX_OPCODEMATCH 0x48 +/* SNB-EP memory controller register */ +#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTL 0xf0 +#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTR 0xd0 +/* SNB-EP QPI register */ +#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH0 0x228 +#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH1 0x22c +#define SNBEP_Q_Py_PCI_PMON_PKT_MASK0 0x238 +#define SNBEP_Q_Py_PCI_PMON_PKT_MASK1 0x23c + +/* SNB-EP Ubox register */ +#define SNBEP_U_MSR_PMON_CTR0 0xc16 +#define SNBEP_U_MSR_PMON_CTL0 0xc10 + +#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTL 0xc08 +#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTR 0xc09 + +/* SNB-EP Cbo register */ +#define SNBEP_C0_MSR_PMON_CTR0 0xd16 +#define SNBEP_C0_MSR_PMON_CTL0 0xd10 +#define SNBEP_C0_MSR_PMON_BOX_CTL 0xd04 +#define SNBEP_C0_MSR_PMON_BOX_FILTER 0xd14 +#define SNBEP_CBO_MSR_OFFSET 0x20 + +#define SNBEP_CB0_MSR_PMON_BOX_FILTER_TID 0x1f +#define SNBEP_CB0_MSR_PMON_BOX_FILTER_NID 0x3fc00 +#define SNBEP_CB0_MSR_PMON_BOX_FILTER_STATE 0x7c0000 +#define SNBEP_CB0_MSR_PMON_BOX_FILTER_OPC 0xff800000 + +#define SNBEP_CBO_EVENT_EXTRA_REG(e, m, i) { \ + .event = (e), \ + .msr = SNBEP_C0_MSR_PMON_BOX_FILTER, \ + .config_mask = (m), \ + .idx = (i) \ +} + +/* SNB-EP PCU register */ +#define SNBEP_PCU_MSR_PMON_CTR0 0xc36 +#define SNBEP_PCU_MSR_PMON_CTL0 0xc30 +#define SNBEP_PCU_MSR_PMON_BOX_CTL 0xc24 +#define SNBEP_PCU_MSR_PMON_BOX_FILTER 0xc34 +#define SNBEP_PCU_MSR_PMON_BOX_FILTER_MASK 0xffffffff +#define SNBEP_PCU_MSR_CORE_C3_CTR 0x3fc +#define SNBEP_PCU_MSR_CORE_C6_CTR 0x3fd + +/* IVT event control */ +#define IVT_PMON_BOX_CTL_INT (SNBEP_PMON_BOX_CTL_RST_CTRL | \ + SNBEP_PMON_BOX_CTL_RST_CTRS) +#define IVT_PMON_RAW_EVENT_MASK (SNBEP_PMON_CTL_EV_SEL_MASK | \ + SNBEP_PMON_CTL_UMASK_MASK | \ + SNBEP_PMON_CTL_EDGE_DET | \ + SNBEP_PMON_CTL_TRESH_MASK) +/* IVT Ubox */ +#define IVT_U_MSR_PMON_GLOBAL_CTL 0xc00 +#define IVT_U_PMON_GLOBAL_FRZ_ALL (1 << 31) +#define IVT_U_PMON_GLOBAL_UNFRZ_ALL (1 << 29) + +#define IVT_U_MSR_PMON_RAW_EVENT_MASK \ + (SNBEP_PMON_CTL_EV_SEL_MASK | \ + SNBEP_PMON_CTL_UMASK_MASK | \ + SNBEP_PMON_CTL_EDGE_DET | \ + SNBEP_U_MSR_PMON_CTL_TRESH_MASK) +/* IVT Cbo */ +#define IVT_CBO_MSR_PMON_RAW_EVENT_MASK (IVT_PMON_RAW_EVENT_MASK | \ + SNBEP_CBO_PMON_CTL_TID_EN) + +#define IVT_CB0_MSR_PMON_BOX_FILTER_TID (0x1fULL << 0) +#define IVT_CB0_MSR_PMON_BOX_FILTER_LINK (0xfULL << 5) +#define IVT_CB0_MSR_PMON_BOX_FILTER_STATE (0x3fULL << 17) +#define IVT_CB0_MSR_PMON_BOX_FILTER_NID (0xffffULL << 32) +#define IVT_CB0_MSR_PMON_BOX_FILTER_OPC (0x1ffULL << 52) +#define IVT_CB0_MSR_PMON_BOX_FILTER_C6 (0x1ULL << 61) +#define IVT_CB0_MSR_PMON_BOX_FILTER_NC (0x1ULL << 62) +#define IVT_CB0_MSR_PMON_BOX_FILTER_IOSC (0x1ULL << 63) + +/* IVT home agent */ +#define IVT_HA_PCI_PMON_CTL_Q_OCC_RST (1 << 16) +#define IVT_HA_PCI_PMON_RAW_EVENT_MASK \ + (IVT_PMON_RAW_EVENT_MASK | \ + IVT_HA_PCI_PMON_CTL_Q_OCC_RST) +/* IVT PCU */ +#define IVT_PCU_MSR_PMON_RAW_EVENT_MASK \ + (SNBEP_PMON_CTL_EV_SEL_MASK | \ + SNBEP_PMON_CTL_EV_SEL_EXT | \ + SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \ + SNBEP_PMON_CTL_EDGE_DET | \ + SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \ + SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \ + SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET) +/* IVT QPI */ +#define IVT_QPI_PCI_PMON_RAW_EVENT_MASK \ + (IVT_PMON_RAW_EVENT_MASK | \ + SNBEP_PMON_CTL_EV_SEL_EXT) + +/* NHM-EX event control */ +#define NHMEX_PMON_CTL_EV_SEL_MASK 0x000000ff +#define NHMEX_PMON_CTL_UMASK_MASK 0x0000ff00 +#define NHMEX_PMON_CTL_EN_BIT0 (1 << 0) +#define NHMEX_PMON_CTL_EDGE_DET (1 << 18) +#define NHMEX_PMON_CTL_PMI_EN (1 << 20) +#define NHMEX_PMON_CTL_EN_BIT22 (1 << 22) +#define NHMEX_PMON_CTL_INVERT (1 << 23) +#define NHMEX_PMON_CTL_TRESH_MASK 0xff000000 +#define NHMEX_PMON_RAW_EVENT_MASK (NHMEX_PMON_CTL_EV_SEL_MASK | \ + NHMEX_PMON_CTL_UMASK_MASK | \ + NHMEX_PMON_CTL_EDGE_DET | \ + NHMEX_PMON_CTL_INVERT | \ + NHMEX_PMON_CTL_TRESH_MASK) + +/* NHM-EX Ubox */ +#define NHMEX_U_MSR_PMON_GLOBAL_CTL 0xc00 +#define NHMEX_U_MSR_PMON_CTR 0xc11 +#define NHMEX_U_MSR_PMON_EV_SEL 0xc10 + +#define NHMEX_U_PMON_GLOBAL_EN (1 << 0) +#define NHMEX_U_PMON_GLOBAL_PMI_CORE_SEL 0x0000001e +#define NHMEX_U_PMON_GLOBAL_EN_ALL (1 << 28) +#define NHMEX_U_PMON_GLOBAL_RST_ALL (1 << 29) +#define NHMEX_U_PMON_GLOBAL_FRZ_ALL (1 << 31) + +#define NHMEX_U_PMON_RAW_EVENT_MASK \ + (NHMEX_PMON_CTL_EV_SEL_MASK | \ + NHMEX_PMON_CTL_EDGE_DET) + +/* NHM-EX Cbox */ +#define NHMEX_C0_MSR_PMON_GLOBAL_CTL 0xd00 +#define NHMEX_C0_MSR_PMON_CTR0 0xd11 +#define NHMEX_C0_MSR_PMON_EV_SEL0 0xd10 +#define NHMEX_C_MSR_OFFSET 0x20 + +/* NHM-EX Bbox */ +#define NHMEX_B0_MSR_PMON_GLOBAL_CTL 0xc20 +#define NHMEX_B0_MSR_PMON_CTR0 0xc31 +#define NHMEX_B0_MSR_PMON_CTL0 0xc30 +#define NHMEX_B_MSR_OFFSET 0x40 +#define NHMEX_B0_MSR_MATCH 0xe45 +#define NHMEX_B0_MSR_MASK 0xe46 +#define NHMEX_B1_MSR_MATCH 0xe4d +#define NHMEX_B1_MSR_MASK 0xe4e + +#define NHMEX_B_PMON_CTL_EN (1 << 0) +#define NHMEX_B_PMON_CTL_EV_SEL_SHIFT 1 +#define NHMEX_B_PMON_CTL_EV_SEL_MASK \ + (0x1f << NHMEX_B_PMON_CTL_EV_SEL_SHIFT) +#define NHMEX_B_PMON_CTR_SHIFT 6 +#define NHMEX_B_PMON_CTR_MASK \ + (0x3 << NHMEX_B_PMON_CTR_SHIFT) +#define NHMEX_B_PMON_RAW_EVENT_MASK \ + (NHMEX_B_PMON_CTL_EV_SEL_MASK | \ + NHMEX_B_PMON_CTR_MASK) + +/* NHM-EX Sbox */ +#define NHMEX_S0_MSR_PMON_GLOBAL_CTL 0xc40 +#define NHMEX_S0_MSR_PMON_CTR0 0xc51 +#define NHMEX_S0_MSR_PMON_CTL0 0xc50 +#define NHMEX_S_MSR_OFFSET 0x80 +#define NHMEX_S0_MSR_MM_CFG 0xe48 +#define NHMEX_S0_MSR_MATCH 0xe49 +#define NHMEX_S0_MSR_MASK 0xe4a +#define NHMEX_S1_MSR_MM_CFG 0xe58 +#define NHMEX_S1_MSR_MATCH 0xe59 +#define NHMEX_S1_MSR_MASK 0xe5a + +#define NHMEX_S_PMON_MM_CFG_EN (0x1ULL << 63) +#define NHMEX_S_EVENT_TO_R_PROG_EV 0 + +/* NHM-EX Mbox */ +#define NHMEX_M0_MSR_GLOBAL_CTL 0xca0 +#define NHMEX_M0_MSR_PMU_DSP 0xca5 +#define NHMEX_M0_MSR_PMU_ISS 0xca6 +#define NHMEX_M0_MSR_PMU_MAP 0xca7 +#define NHMEX_M0_MSR_PMU_MSC_THR 0xca8 +#define NHMEX_M0_MSR_PMU_PGT 0xca9 +#define NHMEX_M0_MSR_PMU_PLD 0xcaa +#define NHMEX_M0_MSR_PMU_ZDP_CTL_FVC 0xcab +#define NHMEX_M0_MSR_PMU_CTL0 0xcb0 +#define NHMEX_M0_MSR_PMU_CNT0 0xcb1 +#define NHMEX_M_MSR_OFFSET 0x40 +#define NHMEX_M0_MSR_PMU_MM_CFG 0xe54 +#define NHMEX_M1_MSR_PMU_MM_CFG 0xe5c + +#define NHMEX_M_PMON_MM_CFG_EN (1ULL << 63) +#define NHMEX_M_PMON_ADDR_MATCH_MASK 0x3ffffffffULL +#define NHMEX_M_PMON_ADDR_MASK_MASK 0x7ffffffULL +#define NHMEX_M_PMON_ADDR_MASK_SHIFT 34 + +#define NHMEX_M_PMON_CTL_EN (1 << 0) +#define NHMEX_M_PMON_CTL_PMI_EN (1 << 1) +#define NHMEX_M_PMON_CTL_COUNT_MODE_SHIFT 2 +#define NHMEX_M_PMON_CTL_COUNT_MODE_MASK \ + (0x3 << NHMEX_M_PMON_CTL_COUNT_MODE_SHIFT) +#define NHMEX_M_PMON_CTL_STORAGE_MODE_SHIFT 4 +#define NHMEX_M_PMON_CTL_STORAGE_MODE_MASK \ + (0x3 << NHMEX_M_PMON_CTL_STORAGE_MODE_SHIFT) +#define NHMEX_M_PMON_CTL_WRAP_MODE (1 << 6) +#define NHMEX_M_PMON_CTL_FLAG_MODE (1 << 7) +#define NHMEX_M_PMON_CTL_INC_SEL_SHIFT 9 +#define NHMEX_M_PMON_CTL_INC_SEL_MASK \ + (0x1f << NHMEX_M_PMON_CTL_INC_SEL_SHIFT) +#define NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT 19 +#define NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK \ + (0x7 << NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT) +#define NHMEX_M_PMON_RAW_EVENT_MASK \ + (NHMEX_M_PMON_CTL_COUNT_MODE_MASK | \ + NHMEX_M_PMON_CTL_STORAGE_MODE_MASK | \ + NHMEX_M_PMON_CTL_WRAP_MODE | \ + NHMEX_M_PMON_CTL_FLAG_MODE | \ + NHMEX_M_PMON_CTL_INC_SEL_MASK | \ + NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK) + +#define NHMEX_M_PMON_ZDP_CTL_FVC_MASK (((1 << 11) - 1) | (1 << 23)) +#define NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7ULL << (11 + 3 * (n))) + +#define WSMEX_M_PMON_ZDP_CTL_FVC_MASK (((1 << 12) - 1) | (1 << 24)) +#define WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7ULL << (12 + 3 * (n))) + +/* + * use the 9~13 bits to select event If the 7th bit is not set, + * otherwise use the 19~21 bits to select event. + */ +#define MBOX_INC_SEL(x) ((x) << NHMEX_M_PMON_CTL_INC_SEL_SHIFT) +#define MBOX_SET_FLAG_SEL(x) (((x) << NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT) | \ + NHMEX_M_PMON_CTL_FLAG_MODE) +#define MBOX_INC_SEL_MASK (NHMEX_M_PMON_CTL_INC_SEL_MASK | \ + NHMEX_M_PMON_CTL_FLAG_MODE) +#define MBOX_SET_FLAG_SEL_MASK (NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK | \ + NHMEX_M_PMON_CTL_FLAG_MODE) +#define MBOX_INC_SEL_EXTAR_REG(c, r) \ + EVENT_EXTRA_REG(MBOX_INC_SEL(c), NHMEX_M0_MSR_PMU_##r, \ + MBOX_INC_SEL_MASK, (u64)-1, NHMEX_M_##r) +#define MBOX_SET_FLAG_SEL_EXTRA_REG(c, r) \ + EVENT_EXTRA_REG(MBOX_SET_FLAG_SEL(c), NHMEX_M0_MSR_PMU_##r, \ + MBOX_SET_FLAG_SEL_MASK, \ + (u64)-1, NHMEX_M_##r) + +/* NHM-EX Rbox */ +#define NHMEX_R_MSR_GLOBAL_CTL 0xe00 +#define NHMEX_R_MSR_PMON_CTL0 0xe10 +#define NHMEX_R_MSR_PMON_CNT0 0xe11 +#define NHMEX_R_MSR_OFFSET 0x20 + +#define NHMEX_R_MSR_PORTN_QLX_CFG(n) \ + ((n) < 4 ? (0xe0c + (n)) : (0xe2c + (n) - 4)) +#define NHMEX_R_MSR_PORTN_IPERF_CFG0(n) (0xe04 + (n)) +#define NHMEX_R_MSR_PORTN_IPERF_CFG1(n) (0xe24 + (n)) +#define NHMEX_R_MSR_PORTN_XBR_OFFSET(n) \ + (((n) < 4 ? 0 : 0x10) + (n) * 4) +#define NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) \ + (0xe60 + NHMEX_R_MSR_PORTN_XBR_OFFSET(n)) +#define NHMEX_R_MSR_PORTN_XBR_SET1_MATCH(n) \ + (NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) + 1) +#define NHMEX_R_MSR_PORTN_XBR_SET1_MASK(n) \ + (NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) + 2) +#define NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) \ + (0xe70 + NHMEX_R_MSR_PORTN_XBR_OFFSET(n)) +#define NHMEX_R_MSR_PORTN_XBR_SET2_MATCH(n) \ + (NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) + 1) +#define NHMEX_R_MSR_PORTN_XBR_SET2_MASK(n) \ + (NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) + 2) + +#define NHMEX_R_PMON_CTL_EN (1 << 0) +#define NHMEX_R_PMON_CTL_EV_SEL_SHIFT 1 +#define NHMEX_R_PMON_CTL_EV_SEL_MASK \ + (0x1f << NHMEX_R_PMON_CTL_EV_SEL_SHIFT) +#define NHMEX_R_PMON_CTL_PMI_EN (1 << 6) +#define NHMEX_R_PMON_RAW_EVENT_MASK NHMEX_R_PMON_CTL_EV_SEL_MASK + +/* NHM-EX Wbox */ +#define NHMEX_W_MSR_GLOBAL_CTL 0xc80 +#define NHMEX_W_MSR_PMON_CNT0 0xc90 +#define NHMEX_W_MSR_PMON_EVT_SEL0 0xc91 +#define NHMEX_W_MSR_PMON_FIXED_CTR 0x394 +#define NHMEX_W_MSR_PMON_FIXED_CTL 0x395 + +#define NHMEX_W_PMON_GLOBAL_FIXED_EN (1ULL << 31) + +struct intel_uncore_ops; +struct intel_uncore_pmu; +struct intel_uncore_box; +struct uncore_event_desc; + +struct intel_uncore_type { + const char *name; + int num_counters; + int num_boxes; + int perf_ctr_bits; + int fixed_ctr_bits; + unsigned perf_ctr; + unsigned event_ctl; + unsigned event_mask; + unsigned fixed_ctr; + unsigned fixed_ctl; + unsigned box_ctl; + unsigned msr_offset; + unsigned num_shared_regs:8; + unsigned single_fixed:1; + unsigned pair_ctr_ctl:1; + unsigned *msr_offsets; + struct event_constraint unconstrainted; + struct event_constraint *constraints; + struct intel_uncore_pmu *pmus; + struct intel_uncore_ops *ops; + struct uncore_event_desc *event_descs; + const struct attribute_group *attr_groups[4]; + struct pmu *pmu; /* for custom pmu ops */ +}; + +#define pmu_group attr_groups[0] +#define format_group attr_groups[1] +#define events_group attr_groups[2] + +struct intel_uncore_ops { + void (*init_box)(struct intel_uncore_box *); + void (*disable_box)(struct intel_uncore_box *); + void (*enable_box)(struct intel_uncore_box *); + void (*disable_event)(struct intel_uncore_box *, struct perf_event *); + void (*enable_event)(struct intel_uncore_box *, struct perf_event *); + u64 (*read_counter)(struct intel_uncore_box *, struct perf_event *); + int (*hw_config)(struct intel_uncore_box *, struct perf_event *); + struct event_constraint *(*get_constraint)(struct intel_uncore_box *, + struct perf_event *); + void (*put_constraint)(struct intel_uncore_box *, struct perf_event *); +}; + +struct intel_uncore_pmu { + struct pmu pmu; + char name[UNCORE_PMU_NAME_LEN]; + int pmu_idx; + int func_id; + struct intel_uncore_type *type; + struct intel_uncore_box ** __percpu box; + struct list_head box_list; +}; + +struct intel_uncore_extra_reg { + raw_spinlock_t lock; + u64 config, config1, config2; + atomic_t ref; +}; + +struct intel_uncore_box { + int phys_id; + int n_active; /* number of active events */ + int n_events; + int cpu; /* cpu to collect events */ + unsigned long flags; + atomic_t refcnt; + struct perf_event *events[UNCORE_PMC_IDX_MAX]; + struct perf_event *event_list[UNCORE_PMC_IDX_MAX]; + unsigned long active_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)]; + u64 tags[UNCORE_PMC_IDX_MAX]; + struct pci_dev *pci_dev; + struct intel_uncore_pmu *pmu; + u64 hrtimer_duration; /* hrtimer timeout for this box */ + struct hrtimer hrtimer; + struct list_head list; + struct list_head active_list; + void *io_addr; + struct intel_uncore_extra_reg shared_regs[0]; +}; + +#define UNCORE_BOX_FLAG_INITIATED 0 + +struct uncore_event_desc { + struct kobj_attribute attr; + const char *config; +}; + +#define INTEL_UNCORE_EVENT_DESC(_name, _config) \ +{ \ + .attr = __ATTR(_name, 0444, uncore_event_show, NULL), \ + .config = _config, \ +} + +#define DEFINE_UNCORE_FORMAT_ATTR(_var, _name, _format) \ +static ssize_t __uncore_##_var##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + char *page) \ +{ \ + BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ + return sprintf(page, _format "\n"); \ +} \ +static struct kobj_attribute format_attr_##_var = \ + __ATTR(_name, 0444, __uncore_##_var##_show, NULL) + + +static ssize_t uncore_event_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct uncore_event_desc *event = + container_of(attr, struct uncore_event_desc, attr); + return sprintf(buf, "%s", event->config); +} + +static inline unsigned uncore_pci_box_ctl(struct intel_uncore_box *box) +{ + return box->pmu->type->box_ctl; +} + +static inline unsigned uncore_pci_fixed_ctl(struct intel_uncore_box *box) +{ + return box->pmu->type->fixed_ctl; +} + +static inline unsigned uncore_pci_fixed_ctr(struct intel_uncore_box *box) +{ + return box->pmu->type->fixed_ctr; +} + +static inline +unsigned uncore_pci_event_ctl(struct intel_uncore_box *box, int idx) +{ + return idx * 4 + box->pmu->type->event_ctl; +} + +static inline +unsigned uncore_pci_perf_ctr(struct intel_uncore_box *box, int idx) +{ + return idx * 8 + box->pmu->type->perf_ctr; +} + +static inline unsigned uncore_msr_box_offset(struct intel_uncore_box *box) +{ + struct intel_uncore_pmu *pmu = box->pmu; + return pmu->type->msr_offsets ? + pmu->type->msr_offsets[pmu->pmu_idx] : + pmu->type->msr_offset * pmu->pmu_idx; +} + +static inline unsigned uncore_msr_box_ctl(struct intel_uncore_box *box) +{ + if (!box->pmu->type->box_ctl) + return 0; + return box->pmu->type->box_ctl + uncore_msr_box_offset(box); +} + +static inline unsigned uncore_msr_fixed_ctl(struct intel_uncore_box *box) +{ + if (!box->pmu->type->fixed_ctl) + return 0; + return box->pmu->type->fixed_ctl + uncore_msr_box_offset(box); +} + +static inline unsigned uncore_msr_fixed_ctr(struct intel_uncore_box *box) +{ + return box->pmu->type->fixed_ctr + uncore_msr_box_offset(box); +} + +static inline +unsigned uncore_msr_event_ctl(struct intel_uncore_box *box, int idx) +{ + return box->pmu->type->event_ctl + + (box->pmu->type->pair_ctr_ctl ? 2 * idx : idx) + + uncore_msr_box_offset(box); +} + +static inline +unsigned uncore_msr_perf_ctr(struct intel_uncore_box *box, int idx) +{ + return box->pmu->type->perf_ctr + + (box->pmu->type->pair_ctr_ctl ? 2 * idx : idx) + + uncore_msr_box_offset(box); +} + +static inline +unsigned uncore_fixed_ctl(struct intel_uncore_box *box) +{ + if (box->pci_dev) + return uncore_pci_fixed_ctl(box); + else + return uncore_msr_fixed_ctl(box); +} + +static inline +unsigned uncore_fixed_ctr(struct intel_uncore_box *box) +{ + if (box->pci_dev) + return uncore_pci_fixed_ctr(box); + else + return uncore_msr_fixed_ctr(box); +} + +static inline +unsigned uncore_event_ctl(struct intel_uncore_box *box, int idx) +{ + if (box->pci_dev) + return uncore_pci_event_ctl(box, idx); + else + return uncore_msr_event_ctl(box, idx); +} + +static inline +unsigned uncore_perf_ctr(struct intel_uncore_box *box, int idx) +{ + if (box->pci_dev) + return uncore_pci_perf_ctr(box, idx); + else + return uncore_msr_perf_ctr(box, idx); +} + +static inline int uncore_perf_ctr_bits(struct intel_uncore_box *box) +{ + return box->pmu->type->perf_ctr_bits; +} + +static inline int uncore_fixed_ctr_bits(struct intel_uncore_box *box) +{ + return box->pmu->type->fixed_ctr_bits; +} + +static inline int uncore_num_counters(struct intel_uncore_box *box) +{ + return box->pmu->type->num_counters; +} + +static inline void uncore_disable_box(struct intel_uncore_box *box) +{ + if (box->pmu->type->ops->disable_box) + box->pmu->type->ops->disable_box(box); +} + +static inline void uncore_enable_box(struct intel_uncore_box *box) +{ + if (box->pmu->type->ops->enable_box) + box->pmu->type->ops->enable_box(box); +} + +static inline void uncore_disable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + box->pmu->type->ops->disable_event(box, event); +} + +static inline void uncore_enable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + box->pmu->type->ops->enable_event(box, event); +} + +static inline u64 uncore_read_counter(struct intel_uncore_box *box, + struct perf_event *event) +{ + return box->pmu->type->ops->read_counter(box, event); +} + +static inline void uncore_box_init(struct intel_uncore_box *box) +{ + if (!test_and_set_bit(UNCORE_BOX_FLAG_INITIATED, &box->flags)) { + if (box->pmu->type->ops->init_box) + box->pmu->type->ops->init_box(box); + } +} + +static inline bool uncore_box_is_fake(struct intel_uncore_box *box) +{ + return (box->phys_id < 0); +} diff --git a/arch/x86/kernel/cpu/perf_event_knc.c b/arch/x86/kernel/cpu/perf_event_knc.c new file mode 100644 index 00000000000..838fa8772c6 --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_knc.c @@ -0,0 +1,319 @@ +/* Driver for Intel Xeon Phi "Knights Corner" PMU */ + +#include <linux/perf_event.h> +#include <linux/types.h> + +#include <asm/hardirq.h> + +#include "perf_event.h" + +static const u64 knc_perfmon_event_map[] = +{ + [PERF_COUNT_HW_CPU_CYCLES] = 0x002a, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x0016, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0028, + [PERF_COUNT_HW_CACHE_MISSES] = 0x0029, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x0012, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x002b, +}; + +static const u64 __initconst knc_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + /* On Xeon Phi event "0" is a valid DATA_READ */ + /* (L1 Data Cache Reads) Instruction. */ + /* We code this as ARCH_PERFMON_EVENTSEL_INT as this */ + /* bit will always be set in x86_pmu_hw_config(). */ + [ C(RESULT_ACCESS) ] = ARCH_PERFMON_EVENTSEL_INT, + /* DATA_READ */ + [ C(RESULT_MISS) ] = 0x0003, /* DATA_READ_MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0001, /* DATA_WRITE */ + [ C(RESULT_MISS) ] = 0x0004, /* DATA_WRITE_MISS */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0011, /* L1_DATA_PF1 */ + [ C(RESULT_MISS) ] = 0x001c, /* L1_DATA_PF1_MISS */ + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x000c, /* CODE_READ */ + [ C(RESULT_MISS) ] = 0x000e, /* CODE_CACHE_MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0x10cb, /* L2_READ_MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x10cc, /* L2_WRITE_HIT */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x10fc, /* L2_DATA_PF2 */ + [ C(RESULT_MISS) ] = 0x10fe, /* L2_DATA_PF2_MISS */ + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = ARCH_PERFMON_EVENTSEL_INT, + /* DATA_READ */ + /* see note on L1 OP_READ */ + [ C(RESULT_MISS) ] = 0x0002, /* DATA_PAGE_WALK */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0001, /* DATA_WRITE */ + [ C(RESULT_MISS) ] = 0x0002, /* DATA_PAGE_WALK */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x000c, /* CODE_READ */ + [ C(RESULT_MISS) ] = 0x000d, /* CODE_PAGE_WALK */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0012, /* BRANCHES */ + [ C(RESULT_MISS) ] = 0x002b, /* BRANCHES_MISPREDICTED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + + +static u64 knc_pmu_event_map(int hw_event) +{ + return knc_perfmon_event_map[hw_event]; +} + +static struct event_constraint knc_event_constraints[] = +{ + INTEL_EVENT_CONSTRAINT(0xc3, 0x1), /* HWP_L2HIT */ + INTEL_EVENT_CONSTRAINT(0xc4, 0x1), /* HWP_L2MISS */ + INTEL_EVENT_CONSTRAINT(0xc8, 0x1), /* L2_READ_HIT_E */ + INTEL_EVENT_CONSTRAINT(0xc9, 0x1), /* L2_READ_HIT_M */ + INTEL_EVENT_CONSTRAINT(0xca, 0x1), /* L2_READ_HIT_S */ + INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* L2_READ_MISS */ + INTEL_EVENT_CONSTRAINT(0xcc, 0x1), /* L2_WRITE_HIT */ + INTEL_EVENT_CONSTRAINT(0xce, 0x1), /* L2_STRONGLY_ORDERED_STREAMING_VSTORES_MISS */ + INTEL_EVENT_CONSTRAINT(0xcf, 0x1), /* L2_WEAKLY_ORDERED_STREAMING_VSTORE_MISS */ + INTEL_EVENT_CONSTRAINT(0xd7, 0x1), /* L2_VICTIM_REQ_WITH_DATA */ + INTEL_EVENT_CONSTRAINT(0xe3, 0x1), /* SNP_HITM_BUNIT */ + INTEL_EVENT_CONSTRAINT(0xe6, 0x1), /* SNP_HIT_L2 */ + INTEL_EVENT_CONSTRAINT(0xe7, 0x1), /* SNP_HITM_L2 */ + INTEL_EVENT_CONSTRAINT(0xf1, 0x1), /* L2_DATA_READ_MISS_CACHE_FILL */ + INTEL_EVENT_CONSTRAINT(0xf2, 0x1), /* L2_DATA_WRITE_MISS_CACHE_FILL */ + INTEL_EVENT_CONSTRAINT(0xf6, 0x1), /* L2_DATA_READ_MISS_MEM_FILL */ + INTEL_EVENT_CONSTRAINT(0xf7, 0x1), /* L2_DATA_WRITE_MISS_MEM_FILL */ + INTEL_EVENT_CONSTRAINT(0xfc, 0x1), /* L2_DATA_PF2 */ + INTEL_EVENT_CONSTRAINT(0xfd, 0x1), /* L2_DATA_PF2_DROP */ + INTEL_EVENT_CONSTRAINT(0xfe, 0x1), /* L2_DATA_PF2_MISS */ + INTEL_EVENT_CONSTRAINT(0xff, 0x1), /* L2_DATA_HIT_INFLIGHT_PF2 */ + EVENT_CONSTRAINT_END +}; + +#define MSR_KNC_IA32_PERF_GLOBAL_STATUS 0x0000002d +#define MSR_KNC_IA32_PERF_GLOBAL_OVF_CONTROL 0x0000002e +#define MSR_KNC_IA32_PERF_GLOBAL_CTRL 0x0000002f + +#define KNC_ENABLE_COUNTER0 0x00000001 +#define KNC_ENABLE_COUNTER1 0x00000002 + +static void knc_pmu_disable_all(void) +{ + u64 val; + + rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val); + val &= ~(KNC_ENABLE_COUNTER0|KNC_ENABLE_COUNTER1); + wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val); +} + +static void knc_pmu_enable_all(int added) +{ + u64 val; + + rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val); + val |= (KNC_ENABLE_COUNTER0|KNC_ENABLE_COUNTER1); + wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val); +} + +static inline void +knc_pmu_disable_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 val; + + val = hwc->config; + val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; + + (void)wrmsrl_safe(hwc->config_base + hwc->idx, val); +} + +static void knc_pmu_enable_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 val; + + val = hwc->config; + val |= ARCH_PERFMON_EVENTSEL_ENABLE; + + (void)wrmsrl_safe(hwc->config_base + hwc->idx, val); +} + +static inline u64 knc_pmu_get_status(void) +{ + u64 status; + + rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_STATUS, status); + + return status; +} + +static inline void knc_pmu_ack_status(u64 ack) +{ + wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_OVF_CONTROL, ack); +} + +static int knc_pmu_handle_irq(struct pt_regs *regs) +{ + struct perf_sample_data data; + struct cpu_hw_events *cpuc; + int handled = 0; + int bit, loops; + u64 status; + + cpuc = &__get_cpu_var(cpu_hw_events); + + knc_pmu_disable_all(); + + status = knc_pmu_get_status(); + if (!status) { + knc_pmu_enable_all(0); + return handled; + } + + loops = 0; +again: + knc_pmu_ack_status(status); + if (++loops > 100) { + WARN_ONCE(1, "perf: irq loop stuck!\n"); + perf_event_print_debug(); + goto done; + } + + inc_irq_stat(apic_perf_irqs); + + for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { + struct perf_event *event = cpuc->events[bit]; + + handled++; + + if (!test_bit(bit, cpuc->active_mask)) + continue; + + if (!intel_pmu_save_and_restart(event)) + continue; + + perf_sample_data_init(&data, 0, event->hw.last_period); + + if (perf_event_overflow(event, &data, regs)) + x86_pmu_stop(event, 0); + } + + /* + * Repeat if there is more work to be done: + */ + status = knc_pmu_get_status(); + if (status) + goto again; + +done: + knc_pmu_enable_all(0); + + return handled; +} + + +PMU_FORMAT_ATTR(event, "config:0-7" ); +PMU_FORMAT_ATTR(umask, "config:8-15" ); +PMU_FORMAT_ATTR(edge, "config:18" ); +PMU_FORMAT_ATTR(inv, "config:23" ); +PMU_FORMAT_ATTR(cmask, "config:24-31" ); + +static struct attribute *intel_knc_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_cmask.attr, + NULL, +}; + +static const struct x86_pmu knc_pmu __initconst = { + .name = "knc", + .handle_irq = knc_pmu_handle_irq, + .disable_all = knc_pmu_disable_all, + .enable_all = knc_pmu_enable_all, + .enable = knc_pmu_enable_event, + .disable = knc_pmu_disable_event, + .hw_config = x86_pmu_hw_config, + .schedule_events = x86_schedule_events, + .eventsel = MSR_KNC_EVNTSEL0, + .perfctr = MSR_KNC_PERFCTR0, + .event_map = knc_pmu_event_map, + .max_events = ARRAY_SIZE(knc_perfmon_event_map), + .apic = 1, + .max_period = (1ULL << 39) - 1, + .version = 0, + .num_counters = 2, + .cntval_bits = 40, + .cntval_mask = (1ULL << 40) - 1, + .get_event_constraints = x86_get_event_constraints, + .event_constraints = knc_event_constraints, + .format_attrs = intel_knc_formats_attr, +}; + +__init int knc_pmu_init(void) +{ + x86_pmu = knc_pmu; + + memcpy(hw_cache_event_ids, knc_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + return 0; +} diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index 81400b93e69..5d466b7d860 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -1,5 +1,5 @@ /* - * Netburst Perfomance Events (P4, old Xeon) + * Netburst Performance Events (P4, old Xeon) * * Copyright (C) 2010 Parallels, Inc., Cyrill Gorcunov <gorcunov@openvz.org> * Copyright (C) 2010 Intel Corporation, Lin Ming <ming.m.lin@intel.com> @@ -7,9 +7,13 @@ * For licencing details see kernel-base/COPYING */ -#ifdef CONFIG_CPU_SUP_INTEL +#include <linux/perf_event.h> #include <asm/perf_event_p4.h> +#include <asm/hardirq.h> +#include <asm/apic.h> + +#include "perf_event.h" #define P4_CNTR_LIMIT 3 /* @@ -468,7 +472,7 @@ static struct p4_event_bind p4_event_bind_map[] = { .opcode = P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED), .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS), + P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS), .cntr = { {12, 13, 16}, {14, 15, 17} }, }, [P4_EVENT_X87_ASSIST] = { @@ -554,13 +558,102 @@ static __initconst const u64 p4_hw_cache_event_ids [ C(RESULT_MISS) ] = -1, }, }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, }; +/* + * Because of Netburst being quite restricted in how many + * identical events may run simultaneously, we introduce event aliases, + * ie the different events which have the same functionality but + * utilize non-intersected resources (ESCR/CCCR/counter registers). + * + * This allow us to relax restrictions a bit and run two or more + * identical events together. + * + * Never set any custom internal bits such as P4_CONFIG_HT, + * P4_CONFIG_ALIASABLE or bits for P4_PEBS_METRIC, they are + * either up to date automatically or not applicable at all. + */ +struct p4_event_alias { + u64 original; + u64 alternative; +} p4_event_aliases[] = { + { + /* + * Non-halted cycles can be substituted with non-sleeping cycles (see + * Intel SDM Vol3b for details). We need this alias to be able + * to run nmi-watchdog and 'perf top' (or any other user space tool + * which is interested in running PERF_COUNT_HW_CPU_CYCLES) + * simultaneously. + */ + .original = + p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) | + P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)), + .alternative = + p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_EXECUTION_EVENT) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0)| + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1)| + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2)| + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3)| + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3))| + p4_config_pack_cccr(P4_CCCR_THRESHOLD(15) | P4_CCCR_COMPLEMENT | + P4_CCCR_COMPARE), + }, +}; + +static u64 p4_get_alias_event(u64 config) +{ + u64 config_match; + int i; + + /* + * Only event with special mark is allowed, + * we're to be sure it didn't come as malformed + * RAW event. + */ + if (!(config & P4_CONFIG_ALIASABLE)) + return 0; + + config_match = config & P4_CONFIG_EVENT_ALIAS_MASK; + + for (i = 0; i < ARRAY_SIZE(p4_event_aliases); i++) { + if (config_match == p4_event_aliases[i].original) { + config_match = p4_event_aliases[i].alternative; + break; + } else if (config_match == p4_event_aliases[i].alternative) { + config_match = p4_event_aliases[i].original; + break; + } + } + + if (i >= ARRAY_SIZE(p4_event_aliases)) + return 0; + + return config_match | (config & P4_CONFIG_EVENT_ALIAS_IMMUTABLE_BITS); +} + static u64 p4_general_events[PERF_COUNT_HW_MAX] = { /* non-halted CPU clocks */ [PERF_COUNT_HW_CPU_CYCLES] = p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) | - P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)), + P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)) | + P4_CONFIG_ALIASABLE, /* * retired instructions @@ -679,10 +772,10 @@ static int p4_validate_raw_event(struct perf_event *event) */ /* - * if an event is shared accross the logical threads + * if an event is shared across the logical threads * the user needs special permissions to be able to use it */ - if (p4_event_bind_map[v].shared) { + if (p4_ht_active() && p4_event_bind_map[v].shared) { if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) return -EACCES; } @@ -727,7 +820,8 @@ static int p4_hw_config(struct perf_event *event) event->hw.config = p4_set_ht_bit(event->hw.config); if (event->attr.type == PERF_TYPE_RAW) { - + struct p4_event_bind *bind; + unsigned int esel; /* * Clear bits we reserve to be managed by kernel itself * and never allowed from a user space @@ -743,6 +837,13 @@ static int p4_hw_config(struct perf_event *event) * bits since we keep additional info here (for cache events and etc) */ event->hw.config |= event->attr.config; + bind = p4_config_get_bind(event->attr.config); + if (!bind) { + rc = -EINVAL; + goto out; + } + esel = P4_OPCODE_ESEL(bind->opcode); + event->hw.config |= p4_config_pack_cccr(P4_CCCR_ESEL(esel)); } rc = x86_setup_perfctr(event); @@ -753,19 +854,27 @@ out: static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc) { - int overflow = 0; - u32 low, high; - - rdmsr(hwc->config_base + hwc->idx, low, high); + u64 v; - /* we need to check high bit for unflagged overflows */ - if ((low & P4_CCCR_OVF) || !(high & (1 << 31))) { - overflow = 1; - (void)checking_wrmsrl(hwc->config_base + hwc->idx, - ((u64)low) & ~P4_CCCR_OVF); + /* an official way for overflow indication */ + rdmsrl(hwc->config_base, v); + if (v & P4_CCCR_OVF) { + wrmsrl(hwc->config_base, v & ~P4_CCCR_OVF); + return 1; } - return overflow; + /* + * In some circumstances the overflow might issue an NMI but did + * not set P4_CCCR_OVF bit. Because a counter holds a negative value + * we simply check for high bit being set, if it's cleared it means + * the counter has reached zero value and continued counting before + * real NMI signal was received: + */ + rdmsrl(hwc->event_base, v); + if (!(v & ARCH_P4_UNFLAGGED_BIT)) + return 1; + + return 0; } static void p4_pmu_disable_pebs(void) @@ -775,19 +884,19 @@ static void p4_pmu_disable_pebs(void) * * It's still allowed that two threads setup same cache * events so we can't simply clear metrics until we knew - * noone is depending on us, so we need kind of counter + * no one is depending on us, so we need kind of counter * for "ReplayEvent" users. * * What is more complex -- RAW events, if user (for some * reason) will pass some cache event metric with improper * event opcode -- it's fine from hardware point of view - * but completely nonsence from "meaning" of such action. + * but completely nonsense from "meaning" of such action. * * So at moment let leave metrics turned on forever -- it's * ok for now but need to be revisited! * - * (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)0); - * (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)0); + * (void)wrmsrl_safe(MSR_IA32_PEBS_ENABLE, 0); + * (void)wrmsrl_safe(MSR_P4_PEBS_MATRIX_VERT, 0); */ } @@ -800,9 +909,8 @@ static inline void p4_pmu_disable_event(struct perf_event *event) * state we need to clear P4_CCCR_OVF, otherwise interrupt get * asserted again and again */ - (void)checking_wrmsrl(hwc->config_base + hwc->idx, - (u64)(p4_config_unpack_cccr(hwc->config)) & - ~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED); + (void)wrmsrl_safe(hwc->config_base, + p4_config_unpack_cccr(hwc->config) & ~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED); } static void p4_pmu_disable_all(void) @@ -834,8 +942,8 @@ static void p4_pmu_enable_pebs(u64 config) bind = &p4_pebs_bind_map[idx]; - (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)bind->metric_pebs); - (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)bind->metric_vert); + (void)wrmsrl_safe(MSR_IA32_PEBS_ENABLE, (u64)bind->metric_pebs); + (void)wrmsrl_safe(MSR_P4_PEBS_MATRIX_VERT, (u64)bind->metric_vert); } static void p4_pmu_enable_event(struct perf_event *event) @@ -848,7 +956,7 @@ static void p4_pmu_enable_event(struct perf_event *event) u64 escr_addr, cccr; bind = &p4_event_bind_map[idx]; - escr_addr = (u64)bind->escr_msr[thread]; + escr_addr = bind->escr_msr[thread]; /* * - we dont support cascaded counters yet @@ -869,8 +977,8 @@ static void p4_pmu_enable_event(struct perf_event *event) */ p4_pmu_enable_pebs(hwc->config); - (void)checking_wrmsrl(escr_addr, escr_conf); - (void)checking_wrmsrl(hwc->config_base + hwc->idx, + (void)wrmsrl_safe(escr_addr, escr_conf); + (void)wrmsrl_safe(hwc->config_base, (cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE); } @@ -896,9 +1004,6 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) int idx, handled = 0; u64 val; - data.addr = 0; - data.raw = NULL; - cpuc = &__get_cpu_var(cpu_hw_events); for (idx = 0; idx < x86_pmu.num_counters; idx++) { @@ -926,19 +1031,30 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) handled += overflow; /* event overflow for sure */ - data.period = event->hw.last_period; + perf_sample_data_init(&data, 0, hwc->last_period); if (!x86_perf_event_set_period(event)) continue; - if (perf_event_overflow(event, 1, &data, regs)) - p4_pmu_disable_event(event); + + + if (perf_event_overflow(event, &data, regs)) + x86_pmu_stop(event, 0); } - if (handled) { - /* p4 quirk: unmask it again */ - apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED); + if (handled) inc_irq_stat(apic_perf_irqs); - } + + /* + * When dealing with the unmasking of the LVTPC on P4 perf hw, it has + * been observed that the OVF bit flag has to be cleared first _before_ + * the LVTPC can be unmasked. + * + * The reason is the NMI line will continue to be asserted while the OVF + * bit is set. This causes a second NMI to generate if the LVTPC is + * unmasked before the OVF bit is cleared, leading to unknown NMI + * messages. + */ + apic_write(APIC_LVTPC, APIC_DM_NMI); return handled; } @@ -1096,6 +1212,8 @@ static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign struct p4_event_bind *bind; unsigned int i, thread, num; int cntr_idx, escr_idx; + u64 config_alias; + int pass; bitmap_zero(used_mask, X86_PMC_IDX_MAX); bitmap_zero(escr_mask, P4_ESCR_MSR_TABLE_SIZE); @@ -1104,6 +1222,17 @@ static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign hwc = &cpuc->event_list[i]->hw; thread = p4_ht_thread(cpu); + pass = 0; + +again: + /* + * It's possible to hit a circular lock + * between original and alternative events + * if both are scheduled already. + */ + if (pass > 2) + goto done; + bind = p4_config_get_bind(hwc->config); escr_idx = p4_get_escr_idx(bind->escr_msr[thread]); if (unlikely(escr_idx == -1)) @@ -1117,9 +1246,35 @@ static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign } cntr_idx = p4_next_cntr(thread, used_mask, bind); - if (cntr_idx == -1 || test_bit(escr_idx, escr_mask)) - goto done; - + if (cntr_idx == -1 || test_bit(escr_idx, escr_mask)) { + /* + * Check whether an event alias is still available. + */ + config_alias = p4_get_alias_event(hwc->config); + if (!config_alias) + goto done; + hwc->config = config_alias; + pass++; + goto again; + } + /* + * Perf does test runs to see if a whole group can be assigned + * together succesfully. There can be multiple rounds of this. + * Unfortunately, p4_pmu_swap_config_ts touches the hwc->config + * bits, such that the next round of group assignments will + * cause the above p4_should_swap_ts to pass instead of fail. + * This leads to counters exclusive to thread0 being used by + * thread1. + * + * Solve this with a cheap hack, reset the idx back to -1 to + * force a new lookup (p4_next_cntr) to get the right counter + * for the right thread. + * + * This probably doesn't comply with the general spirit of how + * perf wants to work, but P4 is special. :-( + */ + if (p4_should_swap_ts(hwc->config, cpu)) + hwc->idx = -1; p4_pmu_swap_config_ts(hwc, cpu); if (assign) assign[i] = cntr_idx; @@ -1129,9 +1284,20 @@ reserve: } done: - return num ? -ENOSPC : 0; + return num ? -EINVAL : 0; } +PMU_FORMAT_ATTR(cccr, "config:0-31" ); +PMU_FORMAT_ATTR(escr, "config:32-62"); +PMU_FORMAT_ATTR(ht, "config:63" ); + +static struct attribute *intel_p4_formats_attr[] = { + &format_attr_cccr.attr, + &format_attr_escr.attr, + &format_attr_ht.attr, + NULL, +}; + static __initconst const struct x86_pmu p4_pmu = { .name = "Netburst P4/Xeon", .handle_irq = p4_pmu_handle_irq, @@ -1152,9 +1318,9 @@ static __initconst const struct x86_pmu p4_pmu = { */ .num_counters = ARCH_P4_MAX_CCCR, .apic = 1, - .cntval_bits = 40, - .cntval_mask = (1ULL << 40) - 1, - .max_period = (1ULL << 39) - 1, + .cntval_bits = ARCH_P4_CNTRVAL_BITS, + .cntval_mask = ARCH_P4_CNTRVAL_MASK, + .max_period = (1ULL << (ARCH_P4_CNTRVAL_BITS - 1)) - 1, .hw_config = p4_hw_config, .schedule_events = p4_pmu_schedule_events, /* @@ -1166,14 +1332,17 @@ static __initconst const struct x86_pmu p4_pmu = { * the former idea is taken from OProfile code */ .perfctr_second_write = 1, + + .format_attrs = intel_p4_formats_attr, }; -static __init int p4_pmu_init(void) +__init int p4_pmu_init(void) { unsigned int low, high; + int i, reg; - /* If we get stripped -- indexig fails */ - BUILD_BUG_ON(ARCH_P4_MAX_CCCR > X86_PMC_MAX_GENERIC); + /* If we get stripped -- indexing fails */ + BUILD_BUG_ON(ARCH_P4_MAX_CCCR > INTEL_PMC_MAX_GENERIC); rdmsr(MSR_IA32_MISC_ENABLE, low, high); if (!(low & (1 << 7))) { @@ -1189,7 +1358,19 @@ static __init int p4_pmu_init(void) x86_pmu = p4_pmu; + /* + * Even though the counters are configured to interrupt a particular + * logical processor when an overflow happens, testing has shown that + * on kdump kernels (which uses a single cpu), thread1's counter + * continues to run and will report an NMI on thread0. Due to the + * overflow bug, this leads to a stream of unknown NMIs. + * + * Solve this by zero'ing out the registers to mimic a reset. + */ + for (i = 0; i < x86_pmu.num_counters; i++) { + reg = x86_pmu_config_addr(i); + wrmsrl_safe(reg, 0ULL); + } + return 0; } - -#endif /* CONFIG_CPU_SUP_INTEL */ diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c index 34ba07be2cd..7c1a0c07b60 100644 --- a/arch/x86/kernel/cpu/perf_event_p6.c +++ b/arch/x86/kernel/cpu/perf_event_p6.c @@ -1,17 +1,113 @@ -#ifdef CONFIG_CPU_SUP_INTEL +#include <linux/perf_event.h> +#include <linux/types.h> + +#include "perf_event.h" /* * Not sure about some of these */ static const u64 p6_perfmon_event_map[] = { - [PERF_COUNT_HW_CPU_CYCLES] = 0x0079, - [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e, - [PERF_COUNT_HW_CACHE_MISSES] = 0x012e, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, - [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, - [PERF_COUNT_HW_BUS_CYCLES] = 0x0062, + [PERF_COUNT_HW_CPU_CYCLES] = 0x0079, /* CPU_CLK_UNHALTED */ + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, /* INST_RETIRED */ + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e, /* L2_RQSTS:M:E:S:I */ + [PERF_COUNT_HW_CACHE_MISSES] = 0x012e, /* L2_RQSTS:I */ + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, /* BR_INST_RETIRED */ + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, /* BR_MISS_PRED_RETIRED */ + [PERF_COUNT_HW_BUS_CYCLES] = 0x0062, /* BUS_DRDY_CLOCKS */ + [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00a2, /* RESOURCE_STALLS */ + +}; + +static const u64 __initconst p6_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0043, /* DATA_MEM_REFS */ + [ C(RESULT_MISS) ] = 0x0045, /* DCU_LINES_IN */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0x0f29, /* L2_LD:M:E:S:I */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* IFU_IFETCH */ + [ C(RESULT_MISS) ] = 0x0f28, /* L2_IFETCH:M:E:S:I */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0x0025, /* L2_M_LINES_INM */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0043, /* DATA_MEM_REFS */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* IFU_IFETCH */ + [ C(RESULT_MISS) ] = 0x0085, /* ITLB_MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED */ + [ C(RESULT_MISS) ] = 0x00c5, /* BR_MISS_PRED_RETIRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, }; static u64 p6_pmu_event_map(int hw_event) @@ -31,7 +127,7 @@ static struct event_constraint p6_event_constraints[] = { INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */ INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ - INTEL_EVENT_CONSTRAINT(0x11, 0x1), /* FP_ASSIST */ + INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ @@ -61,29 +157,46 @@ static void p6_pmu_enable_all(int added) static inline void p6_pmu_disable_event(struct perf_event *event) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct hw_perf_event *hwc = &event->hw; u64 val = P6_NOP_EVENT; - if (cpuc->enabled) - val |= ARCH_PERFMON_EVENTSEL_ENABLE; - - (void)checking_wrmsrl(hwc->config_base + hwc->idx, val); + (void)wrmsrl_safe(hwc->config_base, val); } static void p6_pmu_enable_event(struct perf_event *event) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct hw_perf_event *hwc = &event->hw; u64 val; val = hwc->config; - if (cpuc->enabled) - val |= ARCH_PERFMON_EVENTSEL_ENABLE; - (void)checking_wrmsrl(hwc->config_base + hwc->idx, val); + /* + * p6 only has a global event enable, set on PerfEvtSel0 + * We "disable" events by programming P6_NOP_EVENT + * and we rely on p6_pmu_enable_all() being called + * to actually enable the events. + */ + + (void)wrmsrl_safe(hwc->config_base, val); } +PMU_FORMAT_ATTR(event, "config:0-7" ); +PMU_FORMAT_ATTR(umask, "config:8-15" ); +PMU_FORMAT_ATTR(edge, "config:18" ); +PMU_FORMAT_ATTR(pc, "config:19" ); +PMU_FORMAT_ATTR(inv, "config:23" ); +PMU_FORMAT_ATTR(cmask, "config:24-31" ); + +static struct attribute *intel_p6_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_pc.attr, + &format_attr_inv.attr, + &format_attr_cmask.attr, + NULL, +}; + static __initconst const struct x86_pmu p6_pmu = { .name = "p6", .handle_irq = x86_pmu_handle_irq, @@ -112,31 +225,55 @@ static __initconst const struct x86_pmu p6_pmu = { .cntval_mask = (1ULL << 32) - 1, .get_event_constraints = x86_get_event_constraints, .event_constraints = p6_event_constraints, + + .format_attrs = intel_p6_formats_attr, + .events_sysfs_show = intel_event_sysfs_show, + }; -static __init int p6_pmu_init(void) +static __init void p6_pmu_rdpmc_quirk(void) { + if (boot_cpu_data.x86_mask < 9) { + /* + * PPro erratum 26; fixed in stepping 9 and above. + */ + pr_warn("Userspace RDPMC support disabled due to a CPU erratum\n"); + x86_pmu.attr_rdpmc_broken = 1; + x86_pmu.attr_rdpmc = 0; + } +} + +__init int p6_pmu_init(void) +{ + x86_pmu = p6_pmu; + switch (boot_cpu_data.x86_model) { - case 1: - case 3: /* Pentium Pro */ - case 5: - case 6: /* Pentium II */ - case 7: - case 8: - case 11: /* Pentium III */ - case 9: - case 13: - /* Pentium M */ + case 1: /* Pentium Pro */ + x86_add_quirk(p6_pmu_rdpmc_quirk); + break; + + case 3: /* Pentium II - Klamath */ + case 5: /* Pentium II - Deschutes */ + case 6: /* Pentium II - Mendocino */ break; + + case 7: /* Pentium III - Katmai */ + case 8: /* Pentium III - Coppermine */ + case 10: /* Pentium III Xeon */ + case 11: /* Pentium III - Tualatin */ + break; + + case 9: /* Pentium M - Banias */ + case 13: /* Pentium M - Dothan */ + break; + default: - pr_cont("unsupported p6 CPU model %d ", - boot_cpu_data.x86_model); + pr_cont("unsupported p6 CPU model %d ", boot_cpu_data.x86_model); return -ENODEV; } - x86_pmu = p6_pmu; + memcpy(hw_cache_event_ids, p6_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); return 0; } - -#endif /* CONFIG_CPU_SUP_INTEL */ diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index d9f4ff8fcd6..2e8caf03f59 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -16,32 +16,12 @@ #include <linux/kernel.h> #include <linux/bitops.h> #include <linux/smp.h> -#include <linux/nmi.h> +#include <asm/nmi.h> #include <linux/kprobes.h> #include <asm/apic.h> #include <asm/perf_event.h> -struct nmi_watchdog_ctlblk { - unsigned int cccr_msr; - unsigned int perfctr_msr; /* the MSR to reset in NMI handler */ - unsigned int evntsel_msr; /* the MSR to select the events to handle */ -}; - -/* Interface defining a CPU specific perfctr watchdog */ -struct wd_ops { - int (*reserve)(void); - void (*unreserve)(void); - int (*setup)(unsigned nmi_hz); - void (*rearm)(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz); - void (*stop)(void); - unsigned perfctr; - unsigned evntsel; - u64 checkbit; -}; - -static const struct wd_ops *wd_ops; - /* * this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's * offset from MSR_P4_BSU_ESCR0. @@ -60,14 +40,14 @@ static const struct wd_ops *wd_ops; static DECLARE_BITMAP(perfctr_nmi_owner, NMI_MAX_COUNTER_BITS); static DECLARE_BITMAP(evntsel_nmi_owner, NMI_MAX_COUNTER_BITS); -static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk); - /* converts an msr to an appropriate reservation bit */ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr) { /* returns the bit offset of the performance counter register */ switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: + if (msr >= MSR_F15H_PERF_CTR) + return (msr - MSR_F15H_PERF_CTR) >> 1; return msr - MSR_K7_PERFCTR0; case X86_VENDOR_INTEL: if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) @@ -76,6 +56,8 @@ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr) switch (boot_cpu_data.x86) { case 6: return msr - MSR_P6_PERFCTR0; + case 11: + return msr - MSR_KNC_PERFCTR0; case 15: return msr - MSR_P4_BPU_PERFCTR0; } @@ -92,6 +74,8 @@ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr) /* returns the bit offset of the event selection register */ switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: + if (msr >= MSR_F15H_PERF_CTL) + return (msr - MSR_F15H_PERF_CTL) >> 1; return msr - MSR_K7_EVNTSEL0; case X86_VENDOR_INTEL: if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) @@ -100,6 +84,8 @@ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr) switch (boot_cpu_data.x86) { case 6: return msr - MSR_P6_EVNTSEL0; + case 11: + return msr - MSR_KNC_EVNTSEL0; case 15: return msr - MSR_P4_BSU_ESCR0; } @@ -172,623 +158,3 @@ void release_evntsel_nmi(unsigned int msr) clear_bit(counter, evntsel_nmi_owner); } EXPORT_SYMBOL(release_evntsel_nmi); - -void disable_lapic_nmi_watchdog(void) -{ - BUG_ON(nmi_watchdog != NMI_LOCAL_APIC); - - if (atomic_read(&nmi_active) <= 0) - return; - - on_each_cpu(stop_apic_nmi_watchdog, NULL, 1); - - if (wd_ops) - wd_ops->unreserve(); - - BUG_ON(atomic_read(&nmi_active) != 0); -} - -void enable_lapic_nmi_watchdog(void) -{ - BUG_ON(nmi_watchdog != NMI_LOCAL_APIC); - - /* are we already enabled */ - if (atomic_read(&nmi_active) != 0) - return; - - /* are we lapic aware */ - if (!wd_ops) - return; - if (!wd_ops->reserve()) { - printk(KERN_ERR "NMI watchdog: cannot reserve perfctrs\n"); - return; - } - - on_each_cpu(setup_apic_nmi_watchdog, NULL, 1); - touch_nmi_watchdog(); -} - -/* - * Activate the NMI watchdog via the local APIC. - */ - -static unsigned int adjust_for_32bit_ctr(unsigned int hz) -{ - u64 counter_val; - unsigned int retval = hz; - - /* - * On Intel CPUs with P6/ARCH_PERFMON only 32 bits in the counter - * are writable, with higher bits sign extending from bit 31. - * So, we can only program the counter with 31 bit values and - * 32nd bit should be 1, for 33.. to be 1. - * Find the appropriate nmi_hz - */ - counter_val = (u64)cpu_khz * 1000; - do_div(counter_val, retval); - if (counter_val > 0x7fffffffULL) { - u64 count = (u64)cpu_khz * 1000; - do_div(count, 0x7fffffffUL); - retval = count + 1; - } - return retval; -} - -static void write_watchdog_counter(unsigned int perfctr_msr, - const char *descr, unsigned nmi_hz) -{ - u64 count = (u64)cpu_khz * 1000; - - do_div(count, nmi_hz); - if (descr) - pr_debug("setting %s to -0x%08Lx\n", descr, count); - wrmsrl(perfctr_msr, 0 - count); -} - -static void write_watchdog_counter32(unsigned int perfctr_msr, - const char *descr, unsigned nmi_hz) -{ - u64 count = (u64)cpu_khz * 1000; - - do_div(count, nmi_hz); - if (descr) - pr_debug("setting %s to -0x%08Lx\n", descr, count); - wrmsr(perfctr_msr, (u32)(-count), 0); -} - -/* - * AMD K7/K8/Family10h/Family11h support. - * AMD keeps this interface nicely stable so there is not much variety - */ -#define K7_EVNTSEL_ENABLE (1 << 22) -#define K7_EVNTSEL_INT (1 << 20) -#define K7_EVNTSEL_OS (1 << 17) -#define K7_EVNTSEL_USR (1 << 16) -#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76 -#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING - -static int setup_k7_watchdog(unsigned nmi_hz) -{ - unsigned int perfctr_msr, evntsel_msr; - unsigned int evntsel; - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - perfctr_msr = wd_ops->perfctr; - evntsel_msr = wd_ops->evntsel; - - wrmsrl(perfctr_msr, 0UL); - - evntsel = K7_EVNTSEL_INT - | K7_EVNTSEL_OS - | K7_EVNTSEL_USR - | K7_NMI_EVENT; - - /* setup the timer */ - wrmsr(evntsel_msr, evntsel, 0); - write_watchdog_counter(perfctr_msr, "K7_PERFCTR0", nmi_hz); - - /* initialize the wd struct before enabling */ - wd->perfctr_msr = perfctr_msr; - wd->evntsel_msr = evntsel_msr; - wd->cccr_msr = 0; /* unused */ - - /* ok, everything is initialized, announce that we're set */ - cpu_nmi_set_wd_enabled(); - - apic_write(APIC_LVTPC, APIC_DM_NMI); - evntsel |= K7_EVNTSEL_ENABLE; - wrmsr(evntsel_msr, evntsel, 0); - - return 1; -} - -static void single_msr_stop_watchdog(void) -{ - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - wrmsr(wd->evntsel_msr, 0, 0); -} - -static int single_msr_reserve(void) -{ - if (!reserve_perfctr_nmi(wd_ops->perfctr)) - return 0; - - if (!reserve_evntsel_nmi(wd_ops->evntsel)) { - release_perfctr_nmi(wd_ops->perfctr); - return 0; - } - return 1; -} - -static void single_msr_unreserve(void) -{ - release_evntsel_nmi(wd_ops->evntsel); - release_perfctr_nmi(wd_ops->perfctr); -} - -static void __kprobes -single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) -{ - /* start the cycle over again */ - write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz); -} - -static const struct wd_ops k7_wd_ops = { - .reserve = single_msr_reserve, - .unreserve = single_msr_unreserve, - .setup = setup_k7_watchdog, - .rearm = single_msr_rearm, - .stop = single_msr_stop_watchdog, - .perfctr = MSR_K7_PERFCTR0, - .evntsel = MSR_K7_EVNTSEL0, - .checkbit = 1ULL << 47, -}; - -/* - * Intel Model 6 (PPro+,P2,P3,P-M,Core1) - */ -#define P6_EVNTSEL0_ENABLE (1 << 22) -#define P6_EVNTSEL_INT (1 << 20) -#define P6_EVNTSEL_OS (1 << 17) -#define P6_EVNTSEL_USR (1 << 16) -#define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79 -#define P6_NMI_EVENT P6_EVENT_CPU_CLOCKS_NOT_HALTED - -static int setup_p6_watchdog(unsigned nmi_hz) -{ - unsigned int perfctr_msr, evntsel_msr; - unsigned int evntsel; - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - perfctr_msr = wd_ops->perfctr; - evntsel_msr = wd_ops->evntsel; - - /* KVM doesn't implement this MSR */ - if (wrmsr_safe(perfctr_msr, 0, 0) < 0) - return 0; - - evntsel = P6_EVNTSEL_INT - | P6_EVNTSEL_OS - | P6_EVNTSEL_USR - | P6_NMI_EVENT; - - /* setup the timer */ - wrmsr(evntsel_msr, evntsel, 0); - nmi_hz = adjust_for_32bit_ctr(nmi_hz); - write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0", nmi_hz); - - /* initialize the wd struct before enabling */ - wd->perfctr_msr = perfctr_msr; - wd->evntsel_msr = evntsel_msr; - wd->cccr_msr = 0; /* unused */ - - /* ok, everything is initialized, announce that we're set */ - cpu_nmi_set_wd_enabled(); - - apic_write(APIC_LVTPC, APIC_DM_NMI); - evntsel |= P6_EVNTSEL0_ENABLE; - wrmsr(evntsel_msr, evntsel, 0); - - return 1; -} - -static void __kprobes p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) -{ - /* - * P6 based Pentium M need to re-unmask - * the apic vector but it doesn't hurt - * other P6 variant. - * ArchPerfom/Core Duo also needs this - */ - apic_write(APIC_LVTPC, APIC_DM_NMI); - - /* P6/ARCH_PERFMON has 32 bit counter write */ - write_watchdog_counter32(wd->perfctr_msr, NULL, nmi_hz); -} - -static const struct wd_ops p6_wd_ops = { - .reserve = single_msr_reserve, - .unreserve = single_msr_unreserve, - .setup = setup_p6_watchdog, - .rearm = p6_rearm, - .stop = single_msr_stop_watchdog, - .perfctr = MSR_P6_PERFCTR0, - .evntsel = MSR_P6_EVNTSEL0, - .checkbit = 1ULL << 39, -}; - -/* - * Intel P4 performance counters. - * By far the most complicated of all. - */ -#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1 << 7) -#define P4_ESCR_EVENT_SELECT(N) ((N) << 25) -#define P4_ESCR_OS (1 << 3) -#define P4_ESCR_USR (1 << 2) -#define P4_CCCR_OVF_PMI0 (1 << 26) -#define P4_CCCR_OVF_PMI1 (1 << 27) -#define P4_CCCR_THRESHOLD(N) ((N) << 20) -#define P4_CCCR_COMPLEMENT (1 << 19) -#define P4_CCCR_COMPARE (1 << 18) -#define P4_CCCR_REQUIRED (3 << 16) -#define P4_CCCR_ESCR_SELECT(N) ((N) << 13) -#define P4_CCCR_ENABLE (1 << 12) -#define P4_CCCR_OVF (1 << 31) - -#define P4_CONTROLS 18 -static unsigned int p4_controls[18] = { - MSR_P4_BPU_CCCR0, - MSR_P4_BPU_CCCR1, - MSR_P4_BPU_CCCR2, - MSR_P4_BPU_CCCR3, - MSR_P4_MS_CCCR0, - MSR_P4_MS_CCCR1, - MSR_P4_MS_CCCR2, - MSR_P4_MS_CCCR3, - MSR_P4_FLAME_CCCR0, - MSR_P4_FLAME_CCCR1, - MSR_P4_FLAME_CCCR2, - MSR_P4_FLAME_CCCR3, - MSR_P4_IQ_CCCR0, - MSR_P4_IQ_CCCR1, - MSR_P4_IQ_CCCR2, - MSR_P4_IQ_CCCR3, - MSR_P4_IQ_CCCR4, - MSR_P4_IQ_CCCR5, -}; -/* - * Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter - * CRU_ESCR0 (with any non-null event selector) through a complemented - * max threshold. [IA32-Vol3, Section 14.9.9] - */ -static int setup_p4_watchdog(unsigned nmi_hz) -{ - unsigned int perfctr_msr, evntsel_msr, cccr_msr; - unsigned int evntsel, cccr_val; - unsigned int misc_enable, dummy; - unsigned int ht_num; - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy); - if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL)) - return 0; - -#ifdef CONFIG_SMP - /* detect which hyperthread we are on */ - if (smp_num_siblings == 2) { - unsigned int ebx, apicid; - - ebx = cpuid_ebx(1); - apicid = (ebx >> 24) & 0xff; - ht_num = apicid & 1; - } else -#endif - ht_num = 0; - - /* - * performance counters are shared resources - * assign each hyperthread its own set - * (re-use the ESCR0 register, seems safe - * and keeps the cccr_val the same) - */ - if (!ht_num) { - /* logical cpu 0 */ - perfctr_msr = MSR_P4_IQ_PERFCTR0; - evntsel_msr = MSR_P4_CRU_ESCR0; - cccr_msr = MSR_P4_IQ_CCCR0; - cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4); - - /* - * If we're on the kdump kernel or other situation, we may - * still have other performance counter registers set to - * interrupt and they'll keep interrupting forever because - * of the P4_CCCR_OVF quirk. So we need to ACK all the - * pending interrupts and disable all the registers here, - * before reenabling the NMI delivery. Refer to p4_rearm() - * about the P4_CCCR_OVF quirk. - */ - if (reset_devices) { - unsigned int low, high; - int i; - - for (i = 0; i < P4_CONTROLS; i++) { - rdmsr(p4_controls[i], low, high); - low &= ~(P4_CCCR_ENABLE | P4_CCCR_OVF); - wrmsr(p4_controls[i], low, high); - } - } - } else { - /* logical cpu 1 */ - perfctr_msr = MSR_P4_IQ_PERFCTR1; - evntsel_msr = MSR_P4_CRU_ESCR0; - cccr_msr = MSR_P4_IQ_CCCR1; - - /* Pentium 4 D processors don't support P4_CCCR_OVF_PMI1 */ - if (boot_cpu_data.x86_model == 4 && boot_cpu_data.x86_mask == 4) - cccr_val = P4_CCCR_OVF_PMI0; - else - cccr_val = P4_CCCR_OVF_PMI1; - cccr_val |= P4_CCCR_ESCR_SELECT(4); - } - - evntsel = P4_ESCR_EVENT_SELECT(0x3F) - | P4_ESCR_OS - | P4_ESCR_USR; - - cccr_val |= P4_CCCR_THRESHOLD(15) - | P4_CCCR_COMPLEMENT - | P4_CCCR_COMPARE - | P4_CCCR_REQUIRED; - - wrmsr(evntsel_msr, evntsel, 0); - wrmsr(cccr_msr, cccr_val, 0); - write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0", nmi_hz); - - wd->perfctr_msr = perfctr_msr; - wd->evntsel_msr = evntsel_msr; - wd->cccr_msr = cccr_msr; - - /* ok, everything is initialized, announce that we're set */ - cpu_nmi_set_wd_enabled(); - - apic_write(APIC_LVTPC, APIC_DM_NMI); - cccr_val |= P4_CCCR_ENABLE; - wrmsr(cccr_msr, cccr_val, 0); - return 1; -} - -static void stop_p4_watchdog(void) -{ - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - wrmsr(wd->cccr_msr, 0, 0); - wrmsr(wd->evntsel_msr, 0, 0); -} - -static int p4_reserve(void) -{ - if (!reserve_perfctr_nmi(MSR_P4_IQ_PERFCTR0)) - return 0; -#ifdef CONFIG_SMP - if (smp_num_siblings > 1 && !reserve_perfctr_nmi(MSR_P4_IQ_PERFCTR1)) - goto fail1; -#endif - if (!reserve_evntsel_nmi(MSR_P4_CRU_ESCR0)) - goto fail2; - /* RED-PEN why is ESCR1 not reserved here? */ - return 1; - fail2: -#ifdef CONFIG_SMP - if (smp_num_siblings > 1) - release_perfctr_nmi(MSR_P4_IQ_PERFCTR1); - fail1: -#endif - release_perfctr_nmi(MSR_P4_IQ_PERFCTR0); - return 0; -} - -static void p4_unreserve(void) -{ -#ifdef CONFIG_SMP - if (smp_num_siblings > 1) - release_perfctr_nmi(MSR_P4_IQ_PERFCTR1); -#endif - release_evntsel_nmi(MSR_P4_CRU_ESCR0); - release_perfctr_nmi(MSR_P4_IQ_PERFCTR0); -} - -static void __kprobes p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) -{ - unsigned dummy; - /* - * P4 quirks: - * - An overflown perfctr will assert its interrupt - * until the OVF flag in its CCCR is cleared. - * - LVTPC is masked on interrupt and must be - * unmasked by the LVTPC handler. - */ - rdmsrl(wd->cccr_msr, dummy); - dummy &= ~P4_CCCR_OVF; - wrmsrl(wd->cccr_msr, dummy); - apic_write(APIC_LVTPC, APIC_DM_NMI); - /* start the cycle over again */ - write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz); -} - -static const struct wd_ops p4_wd_ops = { - .reserve = p4_reserve, - .unreserve = p4_unreserve, - .setup = setup_p4_watchdog, - .rearm = p4_rearm, - .stop = stop_p4_watchdog, - /* RED-PEN this is wrong for the other sibling */ - .perfctr = MSR_P4_BPU_PERFCTR0, - .evntsel = MSR_P4_BSU_ESCR0, - .checkbit = 1ULL << 39, -}; - -/* - * Watchdog using the Intel architected PerfMon. - * Used for Core2 and hopefully all future Intel CPUs. - */ -#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL -#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK - -static struct wd_ops intel_arch_wd_ops; - -static int setup_intel_arch_watchdog(unsigned nmi_hz) -{ - unsigned int ebx; - union cpuid10_eax eax; - unsigned int unused; - unsigned int perfctr_msr, evntsel_msr; - unsigned int evntsel; - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - /* - * Check whether the Architectural PerfMon supports - * Unhalted Core Cycles Event or not. - * NOTE: Corresponding bit = 0 in ebx indicates event present. - */ - cpuid(10, &(eax.full), &ebx, &unused, &unused); - if ((eax.split.mask_length < - (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) || - (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) - return 0; - - perfctr_msr = wd_ops->perfctr; - evntsel_msr = wd_ops->evntsel; - - wrmsrl(perfctr_msr, 0UL); - - evntsel = ARCH_PERFMON_EVENTSEL_INT - | ARCH_PERFMON_EVENTSEL_OS - | ARCH_PERFMON_EVENTSEL_USR - | ARCH_PERFMON_NMI_EVENT_SEL - | ARCH_PERFMON_NMI_EVENT_UMASK; - - /* setup the timer */ - wrmsr(evntsel_msr, evntsel, 0); - nmi_hz = adjust_for_32bit_ctr(nmi_hz); - write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0", nmi_hz); - - wd->perfctr_msr = perfctr_msr; - wd->evntsel_msr = evntsel_msr; - wd->cccr_msr = 0; /* unused */ - - /* ok, everything is initialized, announce that we're set */ - cpu_nmi_set_wd_enabled(); - - apic_write(APIC_LVTPC, APIC_DM_NMI); - evntsel |= ARCH_PERFMON_EVENTSEL_ENABLE; - wrmsr(evntsel_msr, evntsel, 0); - intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1); - return 1; -} - -static struct wd_ops intel_arch_wd_ops __read_mostly = { - .reserve = single_msr_reserve, - .unreserve = single_msr_unreserve, - .setup = setup_intel_arch_watchdog, - .rearm = p6_rearm, - .stop = single_msr_stop_watchdog, - .perfctr = MSR_ARCH_PERFMON_PERFCTR1, - .evntsel = MSR_ARCH_PERFMON_EVENTSEL1, -}; - -static void probe_nmi_watchdog(void) -{ - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_AMD: - if (boot_cpu_data.x86 == 6 || - (boot_cpu_data.x86 >= 0xf && boot_cpu_data.x86 <= 0x15)) - wd_ops = &k7_wd_ops; - return; - case X86_VENDOR_INTEL: - /* Work around where perfctr1 doesn't have a working enable - * bit as described in the following errata: - * AE49 Core Duo and Intel Core Solo 65 nm - * AN49 Intel Pentium Dual-Core - * AF49 Dual-Core Intel Xeon Processor LV - */ - if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 14) || - ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 15 && - boot_cpu_data.x86_mask == 4))) { - intel_arch_wd_ops.perfctr = MSR_ARCH_PERFMON_PERFCTR0; - intel_arch_wd_ops.evntsel = MSR_ARCH_PERFMON_EVENTSEL0; - } - if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { - wd_ops = &intel_arch_wd_ops; - break; - } - switch (boot_cpu_data.x86) { - case 6: - if (boot_cpu_data.x86_model > 13) - return; - - wd_ops = &p6_wd_ops; - break; - case 15: - wd_ops = &p4_wd_ops; - break; - default: - return; - } - break; - } -} - -/* Interface to nmi.c */ - -int lapic_watchdog_init(unsigned nmi_hz) -{ - if (!wd_ops) { - probe_nmi_watchdog(); - if (!wd_ops) { - printk(KERN_INFO "NMI watchdog: CPU not supported\n"); - return -1; - } - - if (!wd_ops->reserve()) { - printk(KERN_ERR - "NMI watchdog: cannot reserve perfctrs\n"); - return -1; - } - } - - if (!(wd_ops->setup(nmi_hz))) { - printk(KERN_ERR "Cannot setup NMI watchdog on CPU %d\n", - raw_smp_processor_id()); - return -1; - } - - return 0; -} - -void lapic_watchdog_stop(void) -{ - if (wd_ops) - wd_ops->stop(); -} - -unsigned lapic_adjust_nmi_hz(unsigned hz) -{ - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - if (wd->perfctr_msr == MSR_P6_PERFCTR0 || - wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR1) - hz = adjust_for_32bit_ctr(hz); - return hz; -} - -int __kprobes lapic_wd_event(unsigned nmi_hz) -{ - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - u64 ctr; - - rdmsrl(wd->perfctr_msr, ctr); - if (ctr & wd_ops->checkbit) /* perfctr still running? */ - return 0; - - wd_ops->rearm(wd, nmi_hz); - return 1; -} diff --git a/arch/x86/kernel/cpu/powerflags.c b/arch/x86/kernel/cpu/powerflags.c index 5abbea297e0..31f0f335ed2 100644 --- a/arch/x86/kernel/cpu/powerflags.c +++ b/arch/x86/kernel/cpu/powerflags.c @@ -11,10 +11,11 @@ const char *const x86_power_flags[32] = { "fid", /* frequency id control */ "vid", /* voltage id control */ "ttp", /* thermal trip */ - "tm", - "stc", - "100mhzsteps", - "hwpstate", + "tm", /* hardware thermal control */ + "stc", /* software thermal control */ + "100mhzsteps", /* 100 MHz multiplier control */ + "hwpstate", /* hardware P-state control */ "", /* tsc invariant mapped to constant_tsc */ - /* nothing */ + "cpb", /* core performance boost */ + "eff_freq_ro", /* Readonly aperf/mperf */ }; diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 62ac8cb6ba2..06fe3ed8b85 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -11,41 +11,31 @@ static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c, unsigned int cpu) { #ifdef CONFIG_SMP - if (c->x86_max_cores * smp_num_siblings > 1) { - seq_printf(m, "physical id\t: %d\n", c->phys_proc_id); - seq_printf(m, "siblings\t: %d\n", - cpumask_weight(cpu_core_mask(cpu))); - seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id); - seq_printf(m, "cpu cores\t: %d\n", c->booted_cores); - seq_printf(m, "apicid\t\t: %d\n", c->apicid); - seq_printf(m, "initial apicid\t: %d\n", c->initial_apicid); - } + seq_printf(m, "physical id\t: %d\n", c->phys_proc_id); + seq_printf(m, "siblings\t: %d\n", cpumask_weight(cpu_core_mask(cpu))); + seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id); + seq_printf(m, "cpu cores\t: %d\n", c->booted_cores); + seq_printf(m, "apicid\t\t: %d\n", c->apicid); + seq_printf(m, "initial apicid\t: %d\n", c->initial_apicid); #endif } #ifdef CONFIG_X86_32 static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c) { - /* - * We use exception 16 if we have hardware math and we've either seen - * it or the CPU claims it is internal - */ - int fpu_exception = c->hard_math && (ignore_fpu_irq || cpu_has_fpu); seq_printf(m, "fdiv_bug\t: %s\n" - "hlt_bug\t\t: %s\n" "f00f_bug\t: %s\n" "coma_bug\t: %s\n" "fpu\t\t: %s\n" "fpu_exception\t: %s\n" "cpuid level\t: %d\n" "wp\t\t: %s\n", - c->fdiv_bug ? "yes" : "no", - c->hlt_works_ok ? "no" : "yes", - c->f00f_bug ? "yes" : "no", - c->coma_bug ? "yes" : "no", - c->hard_math ? "yes" : "no", - fpu_exception ? "yes" : "no", + static_cpu_has_bug(X86_BUG_FDIV) ? "yes" : "no", + static_cpu_has_bug(X86_BUG_F00F) ? "yes" : "no", + static_cpu_has_bug(X86_BUG_COMA) ? "yes" : "no", + static_cpu_has(X86_FEATURE_FPU) ? "yes" : "no", + static_cpu_has(X86_FEATURE_FPU) ? "yes" : "no", c->cpuid_level, c->wp_works_ok ? "yes" : "no"); } @@ -64,12 +54,10 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c) static int show_cpuinfo(struct seq_file *m, void *v) { struct cpuinfo_x86 *c = v; - unsigned int cpu = 0; + unsigned int cpu; int i; -#ifdef CONFIG_SMP cpu = c->cpu_index; -#endif seq_printf(m, "processor\t: %u\n" "vendor_id\t: %s\n" "cpu family\t: %d\n" @@ -85,6 +73,8 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, "stepping\t: %d\n", c->x86_mask); else seq_printf(m, "stepping\t: unknown\n"); + if (c->microcode) + seq_printf(m, "microcode\t: 0x%x\n", c->microcode); if (cpu_has(c, X86_FEATURE_TSC)) { unsigned int freq = cpufreq_quick_get(cpu); @@ -140,10 +130,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) static void *c_start(struct seq_file *m, loff_t *pos) { - if (*pos == 0) /* just in case, cpu 0 is not the first */ - *pos = cpumask_first(cpu_online_mask); - else - *pos = cpumask_next(*pos - 1, cpu_online_mask); + *pos = cpumask_next(*pos - 1, cpu_online_mask); if ((*pos) < nr_cpu_ids) return &cpu_data(*pos); return NULL; diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c new file mode 100644 index 00000000000..136ac74dee8 --- /dev/null +++ b/arch/x86/kernel/cpu/rdrand.c @@ -0,0 +1,60 @@ +/* + * This file is part of the Linux kernel. + * + * Copyright (c) 2011, Intel Corporation + * Authors: Fenghua Yu <fenghua.yu@intel.com>, + * H. Peter Anvin <hpa@linux.intel.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * + */ + +#include <asm/processor.h> +#include <asm/archrandom.h> +#include <asm/sections.h> + +static int __init x86_rdrand_setup(char *s) +{ + setup_clear_cpu_cap(X86_FEATURE_RDRAND); + setup_clear_cpu_cap(X86_FEATURE_RDSEED); + return 1; +} +__setup("nordrand", x86_rdrand_setup); + +/* + * Force a reseed cycle; we are architecturally guaranteed a reseed + * after no more than 512 128-bit chunks of random data. This also + * acts as a test of the CPU capability. + */ +#define RESEED_LOOP ((512*128)/sizeof(unsigned long)) + +void x86_init_rdrand(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_ARCH_RANDOM + unsigned long tmp; + int i, count, ok; + + if (!cpu_has(c, X86_FEATURE_RDRAND)) + return; /* Nothing to do */ + + for (count = i = 0; i < RESEED_LOOP; i++) { + ok = rdrand_long(&tmp); + if (ok) + count++; + } + + if (count != RESEED_LOOP) + clear_cpu_cap(c, X86_FEATURE_RDRAND); +#endif +} diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index c7f64e6f537..b6f794aa169 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -1,5 +1,5 @@ /* - * Routines to indentify additional cpu features that are scattered in + * Routines to identify additional cpu features that are scattered in * cpuid space. */ #include <linux/cpu.h> @@ -24,14 +24,14 @@ enum cpuid_regs { CR_EBX }; -void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) +void init_scattered_cpuid_features(struct cpuinfo_x86 *c) { u32 max_level; u32 regs[4]; const struct cpuid_bit *cb; - static const struct cpuid_bit __cpuinitconst cpuid_bits[] = { - { X86_FEATURE_DTS, CR_EAX, 0, 0x00000006, 0 }, + static const struct cpuid_bit cpuid_bits[] = { + { X86_FEATURE_DTHERM, CR_EAX, 0, 0x00000006, 0 }, { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006, 0 }, { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 }, { X86_FEATURE_PLN, CR_EAX, 4, 0x00000006, 0 }, @@ -39,7 +39,9 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 }, { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 }, { X86_FEATURE_XSAVEOPT, CR_EAX, 0, 0x0000000d, 1 }, + { X86_FEATURE_HW_PSTATE, CR_EDX, 7, 0x80000007, 0 }, { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 }, + { X86_FEATURE_PROC_FEEDBACK, CR_EDX,11, 0x80000007, 0 }, { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a, 0 }, { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 }, { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 }, diff --git a/arch/x86/kernel/cpu/sched.c b/arch/x86/kernel/cpu/sched.c deleted file mode 100644 index a640ae5ad20..00000000000 --- a/arch/x86/kernel/cpu/sched.c +++ /dev/null @@ -1,55 +0,0 @@ -#include <linux/sched.h> -#include <linux/math64.h> -#include <linux/percpu.h> -#include <linux/irqflags.h> - -#include <asm/cpufeature.h> -#include <asm/processor.h> - -#ifdef CONFIG_SMP - -static DEFINE_PER_CPU(struct aperfmperf, old_perf_sched); - -static unsigned long scale_aperfmperf(void) -{ - struct aperfmperf val, *old = &__get_cpu_var(old_perf_sched); - unsigned long ratio, flags; - - local_irq_save(flags); - get_aperfmperf(&val); - local_irq_restore(flags); - - ratio = calc_aperfmperf_ratio(old, &val); - *old = val; - - return ratio; -} - -unsigned long arch_scale_freq_power(struct sched_domain *sd, int cpu) -{ - /* - * do aperf/mperf on the cpu level because it includes things - * like turbo mode, which are relevant to full cores. - */ - if (boot_cpu_has(X86_FEATURE_APERFMPERF)) - return scale_aperfmperf(); - - /* - * maybe have something cpufreq here - */ - - return default_scale_freq_power(sd, cpu); -} - -unsigned long arch_scale_smt_power(struct sched_domain *sd, int cpu) -{ - /* - * aperf/mperf already includes the smt gain - */ - if (boot_cpu_has(X86_FEATURE_APERFMPERF)) - return SCHED_LOAD_SCALE; - - return default_scale_smt_power(sd, cpu); -} - -#endif diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index 4397e987a1c..4c60eaf0571 100644 --- a/arch/x86/kernel/cpu/topology.c +++ b/arch/x86/kernel/cpu/topology.c @@ -26,7 +26,7 @@ * exists, use it for populating initial_apicid and cpu topology * detection. */ -void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c) +void detect_extended_topology(struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP unsigned int eax, ebx, ecx, edx, sub_index; diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c index 28000743bbb..3fa0e5ad86b 100644 --- a/arch/x86/kernel/cpu/transmeta.c +++ b/arch/x86/kernel/cpu/transmeta.c @@ -1,11 +1,10 @@ #include <linux/kernel.h> #include <linux/mm.h> -#include <linux/init.h> #include <asm/processor.h> #include <asm/msr.h> #include "cpu.h" -static void __cpuinit early_init_transmeta(struct cpuinfo_x86 *c) +static void early_init_transmeta(struct cpuinfo_x86 *c) { u32 xlvl; @@ -17,7 +16,7 @@ static void __cpuinit early_init_transmeta(struct cpuinfo_x86 *c) } } -static void __cpuinit init_transmeta(struct cpuinfo_x86 *c) +static void init_transmeta(struct cpuinfo_x86 *c) { unsigned int cap_mask, uk, max, dummy; unsigned int cms_rev1, cms_rev2; @@ -98,7 +97,7 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c) #endif } -static const struct cpu_dev __cpuinitconst transmeta_cpu_dev = { +static const struct cpu_dev transmeta_cpu_dev = { .c_vendor = "Transmeta", .c_ident = { "GenuineTMx86", "TransmetaCPU" }, .c_early_init = early_init_transmeta, diff --git a/arch/x86/kernel/cpu/umc.c b/arch/x86/kernel/cpu/umc.c index fd2c37bf7ac..ef9c2a0078b 100644 --- a/arch/x86/kernel/cpu/umc.c +++ b/arch/x86/kernel/cpu/umc.c @@ -1,5 +1,4 @@ #include <linux/kernel.h> -#include <linux/init.h> #include <asm/processor.h> #include "cpu.h" @@ -8,11 +7,11 @@ * so no special init takes place. */ -static const struct cpu_dev __cpuinitconst umc_cpu_dev = { +static const struct cpu_dev umc_cpu_dev = { .c_vendor = "UMC", .c_ident = { "UMC UMC UMC" }, - .c_models = { - { .vendor = X86_VENDOR_UMC, .family = 4, .model_names = + .legacy_models = { + { .family = 4, .model_names = { [1] = "U5D", [2] = "U5S", diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 227b0448960..628a059a9a0 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -33,6 +33,9 @@ #define VMWARE_PORT_CMD_GETVERSION 10 #define VMWARE_PORT_CMD_GETHZ 45 +#define VMWARE_PORT_CMD_GETVCPU_INFO 68 +#define VMWARE_PORT_CMD_LEGACY_X2APIC 3 +#define VMWARE_PORT_CMD_VCPU_RESERVED 31 #define VMWARE_PORT(cmd, eax, ebx, ecx, edx) \ __asm__("inl (%%dx)" : \ @@ -86,11 +89,11 @@ static void __init vmware_platform_setup(void) } /* - * While checking the dmi string infomation, just checking the product + * While checking the dmi string information, just checking the product * serial key should be enough, as this will always have a VMware * specific string when running under VMware hypervisor. */ -static bool __init vmware_platform(void) +static uint32_t __init vmware_platform(void) { if (cpu_has_hypervisor) { unsigned int eax; @@ -99,12 +102,12 @@ static bool __init vmware_platform(void) cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &hyper_vendor_id[0], &hyper_vendor_id[1], &hyper_vendor_id[2]); if (!memcmp(hyper_vendor_id, "VMwareVMware", 12)) - return true; + return CPUID_VMWARE_INFO_LEAF; } else if (dmi_available && dmi_name_in_serial("VMware") && __vmware_platform()) - return true; + return 1; - return false; + return 0; } /* @@ -119,16 +122,26 @@ static bool __init vmware_platform(void) * so that the kernel could just trust the hypervisor with providing a * reliable virtual TSC that is suitable for timekeeping. */ -static void __cpuinit vmware_set_cpu_features(struct cpuinfo_x86 *c) +static void vmware_set_cpu_features(struct cpuinfo_x86 *c) { set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE); } +/* Checks if hypervisor supports x2apic without VT-D interrupt remapping. */ +static bool __init vmware_legacy_x2apic_available(void) +{ + uint32_t eax, ebx, ecx, edx; + VMWARE_PORT(GETVCPU_INFO, eax, ebx, ecx, edx); + return (eax & (1 << VMWARE_PORT_CMD_VCPU_RESERVED)) == 0 && + (eax & (1 << VMWARE_PORT_CMD_LEGACY_X2APIC)) != 0; +} + const __refconst struct hypervisor_x86 x86_hyper_vmware = { .name = "VMware", .detect = vmware_platform, .set_cpu_features = vmware_set_cpu_features, .init_platform = vmware_platform_setup, + .x2apic_available = vmware_legacy_x2apic_available, }; EXPORT_SYMBOL(x86_hyper_vmware); |
