Diffstat (limited to 'arch/x86/kernel/cpu')
43 files changed, 5286 insertions, 858 deletions
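A recurring conversion in the amd.c and intel.c hunks below replaces open-coded rdmsr/modify/wrmsr (or *_safe) sequences with the msr_set_bit()/msr_clear_bit() helpers. The sketch below is illustrative only and is not the in-tree implementation: it shows the read-modify-write such a helper stands for, with the return convention inferred from the call sites in this diff: negative on MSR access failure, 0 when the bit already had the requested value, positive when the MSR was actually written. The helper name msr_flip_bit_sketch() is invented for illustration.

#include <linux/types.h>	/* bool, u32, u64 */
#include <linux/bitops.h>	/* BIT_64() */
#include <asm/msr.h>		/* rdmsrl_safe(), wrmsrl_safe() */

/* Illustrative sketch only; not the kernel's msr_set_bit()/msr_clear_bit(). */
static int msr_flip_bit_sketch(u32 msr, u8 bit, bool set)
{
	u64 old, new;

	if (bit > 63)
		return -EINVAL;

	if (rdmsrl_safe(msr, &old))	/* don't #GP on a missing MSR */
		return -EIO;

	new = set ? (old | BIT_64(bit)) : (old & ~BIT_64(bit));
	if (new == old)
		return 0;		/* bit already has the requested value */

	if (wrmsrl_safe(msr, new))
		return -EIO;

	return 1;			/* MSR was actually written */
}

Call sites in the diff test the return value accordingly, e.g. "if (msr_set_bit(0xc0011005, 54) > 0)" in init_amd(), so the follow-up work runs only when the write really changed the register.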
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 47b56a7e99c..7fd54f09b01 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -36,12 +36,13 @@ obj-$(CONFIG_CPU_SUP_AMD)		+= perf_event_amd_iommu.o  endif  obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_p6.o perf_event_knc.o perf_event_p4.o  obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o -obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_intel_uncore.o +obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_intel_uncore.o perf_event_intel_rapl.o  endif  obj-$(CONFIG_X86_MCE)			+= mcheck/  obj-$(CONFIG_MTRR)			+= mtrr/ +obj-$(CONFIG_MICROCODE)			+= microcode/  obj-$(CONFIG_X86_LOCAL_APIC)		+= perfctr-watchdog.o perf_event_amd_ibs.o diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 903a264af98..ce8b8ff0e0e 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -1,5 +1,4 @@  #include <linux/export.h> -#include <linux/init.h>  #include <linux/bitops.h>  #include <linux/elf.h>  #include <linux/mm.h> @@ -219,7 +218,7 @@ static void amd_k7_smp_check(struct cpuinfo_x86 *c)  	 */  	WARN_ONCE(1, "WARNING: This combination of AMD"  		" processors is not suitable for SMP.\n"); -	add_taint(TAINT_UNSAFE_SMP, LOCKDEP_NOW_UNRELIABLE); +	add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_NOW_UNRELIABLE);  }  static void init_amd_k7(struct cpuinfo_x86 *c) @@ -234,9 +233,7 @@ static void init_amd_k7(struct cpuinfo_x86 *c)  	if (c->x86_model >= 6 && c->x86_model <= 10) {  		if (!cpu_has(c, X86_FEATURE_XMM)) {  			printk(KERN_INFO "Enabling disabled K7/SSE Support.\n"); -			rdmsr(MSR_K7_HWCR, l, h); -			l &= ~0x00008000; -			wrmsr(MSR_K7_HWCR, l, h); +			msr_clear_bit(MSR_K7_HWCR, 15);  			set_cpu_cap(c, X86_FEATURE_XMM);  		}  	} @@ -339,7 +336,7 @@ static void amd_get_topology(struct cpuinfo_x86 *c)  #endif  /* - * On a AMD dual core setup the lower bits of the APIC id distingush the cores. + * On a AMD dual core setup the lower bits of the APIC id distinguish the cores.   * Assumes number of cores is a power of two.   
*/  static void amd_detect_cmp(struct cpuinfo_x86 *c) @@ -487,7 +484,7 @@ static void early_init_amd(struct cpuinfo_x86 *c)  		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);  		set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);  		if (!check_tsc_unstable()) -			sched_clock_stable = 1; +			set_sched_clock_stable();  	}  #ifdef CONFIG_X86_64 @@ -508,6 +505,10 @@ static void early_init_amd(struct cpuinfo_x86 *c)  			set_cpu_cap(c, X86_FEATURE_EXTD_APICID);  	}  #endif + +	/* F16h erratum 793, CVE-2013-6885 */ +	if (c->x86 == 0x16 && c->x86_model <= 0xf) +		msr_set_bit(MSR_AMD64_LS_CFG, 15);  }  static const int amd_erratum_383[]; @@ -527,11 +528,8 @@ static void init_amd(struct cpuinfo_x86 *c)  	 * Errata 63 for SH-B3 steppings  	 * Errata 122 for all steppings (F+ have it disabled by default)  	 */ -	if (c->x86 == 0xf) { -		rdmsrl(MSR_K7_HWCR, value); -		value |= 1 << 6; -		wrmsrl(MSR_K7_HWCR, value); -	} +	if (c->x86 == 0xf) +		msr_set_bit(MSR_K7_HWCR, 6);  #endif  	early_init_amd(c); @@ -614,14 +612,11 @@ static void init_amd(struct cpuinfo_x86 *c)  	    (c->x86_model >= 0x10) && (c->x86_model <= 0x1f) &&  	    !cpu_has(c, X86_FEATURE_TOPOEXT)) { -		if (!rdmsrl_safe(0xc0011005, &value)) { -			value |= 1ULL << 54; -			wrmsrl_safe(0xc0011005, value); +		if (msr_set_bit(0xc0011005, 54) > 0) {  			rdmsrl(0xc0011005, value); -			if (value & (1ULL << 54)) { +			if (value & BIT_64(54)) {  				set_cpu_cap(c, X86_FEATURE_TOPOEXT); -				printk(KERN_INFO FW_INFO "CPU: Re-enabling " -				  "disabled Topology Extensions Support\n"); +				pr_info(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n");  			}  		}  	} @@ -700,19 +695,12 @@ static void init_amd(struct cpuinfo_x86 *c)  		 * Disable GART TLB Walk Errors on Fam10h. We do this here  		 * because this is always needed when GART is enabled, even in a  		 * kernel which has no MCE support built in. -		 * BIOS should disable GartTlbWlk Errors themself. If -		 * it doesn't do it here as suggested by the BKDG. +		 * BIOS should disable GartTlbWlk Errors already. If +		 * it doesn't, do it here as suggested by the BKDG.  		 *  		 * Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=33012  		 */ -		u64 mask; -		int err; - -		err = rdmsrl_safe(MSR_AMD64_MCx_MASK(4), &mask); -		if (err == 0) { -			mask |= (1 << 10); -			wrmsrl_safe(MSR_AMD64_MCx_MASK(4), mask); -		} +		msr_set_bit(MSR_AMD64_MCx_MASK(4), 10);  		/*  		 * On family 10h BIOS may not have properly enabled WC+ support, @@ -724,10 +712,7 @@ static void init_amd(struct cpuinfo_x86 *c)  		 * NOTE: we want to use the _safe accessors so as not to #GP kvm  		 * guests on older kvm hosts.  		 
*/ - -		rdmsrl_safe(MSR_AMD64_BU_CFG2, &value); -		value &= ~(1ULL << 24); -		wrmsrl_safe(MSR_AMD64_BU_CFG2, value); +		msr_clear_bit(MSR_AMD64_BU_CFG2, 24);  		if (cpu_has_amd_erratum(c, amd_erratum_383))  			set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH); @@ -758,10 +743,7 @@ static unsigned int amd_size_cache(struct cpuinfo_x86 *c, unsigned int size)  static void cpu_set_tlb_flushall_shift(struct cpuinfo_x86 *c)  { -	tlb_flushall_shift = 5; - -	if (c->x86 <= 0x11) -		tlb_flushall_shift = 4; +	tlb_flushall_shift = 6;  }  static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c) @@ -790,14 +772,10 @@ static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c)  	}  	/* Handle DTLB 2M and 4M sizes, fall back to L1 if L2 is disabled */ -	if (!((eax >> 16) & mask)) { -		u32 a, b, c, d; - -		cpuid(0x80000005, &a, &b, &c, &d); -		tlb_lld_2m[ENTRIES] = (a >> 16) & 0xff; -	} else { +	if (!((eax >> 16) & mask)) +		tlb_lld_2m[ENTRIES] = (cpuid_eax(0x80000005) >> 16) & 0xff; +	else  		tlb_lld_2m[ENTRIES] = (eax >> 16) & mask; -	}  	/* a 4M entry uses two 2M entries */  	tlb_lld_4m[ENTRIES] = tlb_lld_2m[ENTRIES] >> 1; @@ -823,8 +801,8 @@ static const struct cpu_dev amd_cpu_dev = {  	.c_vendor	= "AMD",  	.c_ident	= { "AuthenticAMD" },  #ifdef CONFIG_X86_32 -	.c_models = { -		{ .vendor = X86_VENDOR_AMD, .family = 4, .model_names = +	.legacy_models = { +		{ .family = 4, .model_names =  		  {  			  [3] = "486 DX/2",  			  [7] = "486 DX/2-WB", @@ -835,7 +813,7 @@ static const struct cpu_dev amd_cpu_dev = {  		  }  		},  	}, -	.c_size_cache	= amd_size_cache, +	.legacy_cache_size = amd_size_cache,  #endif  	.c_early_init   = early_init_amd,  	.c_detect_tlb	= cpu_detect_tlb_amd, diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index fbf6c3bc240..d8fba5c15fb 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -1,6 +1,5 @@  #include <linux/bitops.h>  #include <linux/kernel.h> -#include <linux/init.h>  #include <asm/processor.h>  #include <asm/e820.h> @@ -9,236 +8,6 @@  #include "cpu.h" -#ifdef CONFIG_X86_OOSTORE - -static u32 power2(u32 x) -{ -	u32 s = 1; - -	while (s <= x) -		s <<= 1; - -	return s >>= 1; -} - - -/* - * Set up an actual MCR - */ -static void centaur_mcr_insert(int reg, u32 base, u32 size, int key) -{ -	u32 lo, hi; - -	hi = base & ~0xFFF; -	lo = ~(size-1);		/* Size is a power of 2 so this makes a mask */ -	lo &= ~0xFFF;		/* Remove the ctrl value bits */ -	lo |= key;		/* Attribute we wish to set */ -	wrmsr(reg+MSR_IDT_MCR0, lo, hi); -	mtrr_centaur_report_mcr(reg, lo, hi);	/* Tell the mtrr driver */ -} - -/* - * Figure what we can cover with MCR's - * - * Shortcut: We know you can't put 4Gig of RAM on a winchip - */ -static u32 ramtop(void) -{ -	u32 clip = 0xFFFFFFFFUL; -	u32 top = 0; -	int i; - -	for (i = 0; i < e820.nr_map; i++) { -		unsigned long start, end; - -		if (e820.map[i].addr > 0xFFFFFFFFUL) -			continue; -		/* -		 * Don't MCR over reserved space. Ignore the ISA hole -		 * we frob around that catastrophe already -		 */ -		if (e820.map[i].type == E820_RESERVED) { -			if (e820.map[i].addr >= 0x100000UL && -			    e820.map[i].addr < clip) -				clip = e820.map[i].addr; -			continue; -		} -		start = e820.map[i].addr; -		end = e820.map[i].addr + e820.map[i].size; -		if (start >= end) -			continue; -		if (end > top) -			top = end; -	} -	/* -	 * Everything below 'top' should be RAM except for the ISA hole. 
-	 * Because of the limited MCR's we want to map NV/ACPI into our -	 * MCR range for gunk in RAM -	 * -	 * Clip might cause us to MCR insufficient RAM but that is an -	 * acceptable failure mode and should only bite obscure boxes with -	 * a VESA hole at 15Mb -	 * -	 * The second case Clip sometimes kicks in is when the EBDA is marked -	 * as reserved. Again we fail safe with reasonable results -	 */ -	if (top > clip) -		top = clip; - -	return top; -} - -/* - * Compute a set of MCR's to give maximum coverage - */ -static int centaur_mcr_compute(int nr, int key) -{ -	u32 mem = ramtop(); -	u32 root = power2(mem); -	u32 base = root; -	u32 top = root; -	u32 floor = 0; -	int ct = 0; - -	while (ct < nr) { -		u32 fspace = 0; -		u32 high; -		u32 low; - -		/* -		 * Find the largest block we will fill going upwards -		 */ -		high = power2(mem-top); - -		/* -		 * Find the largest block we will fill going downwards -		 */ -		low = base/2; - -		/* -		 * Don't fill below 1Mb going downwards as there -		 * is an ISA hole in the way. -		 */ -		if (base <= 1024*1024) -			low = 0; - -		/* -		 * See how much space we could cover by filling below -		 * the ISA hole -		 */ - -		if (floor == 0) -			fspace = 512*1024; -		else if (floor == 512*1024) -			fspace = 128*1024; - -		/* And forget ROM space */ - -		/* -		 * Now install the largest coverage we get -		 */ -		if (fspace > high && fspace > low) { -			centaur_mcr_insert(ct, floor, fspace, key); -			floor += fspace; -		} else if (high > low) { -			centaur_mcr_insert(ct, top, high, key); -			top += high; -		} else if (low > 0) { -			base -= low; -			centaur_mcr_insert(ct, base, low, key); -		} else -			break; -		ct++; -	} -	/* -	 * We loaded ct values. We now need to set the mask. The caller -	 * must do this bit. -	 */ -	return ct; -} - -static void centaur_create_optimal_mcr(void) -{ -	int used; -	int i; - -	/* -	 * Allocate up to 6 mcrs to mark as much of ram as possible -	 * as write combining and weak write ordered. -	 * -	 * To experiment with: Linux never uses stack operations for -	 * mmio spaces so we could globally enable stack operation wc -	 * -	 * Load the registers with type 31 - full write combining, all -	 * writes weakly ordered. -	 */ -	used = centaur_mcr_compute(6, 31); - -	/* -	 * Wipe unused MCRs -	 */ -	for (i = used; i < 8; i++) -		wrmsr(MSR_IDT_MCR0+i, 0, 0); -} - -static void winchip2_create_optimal_mcr(void) -{ -	u32 lo, hi; -	int used; -	int i; - -	/* -	 * Allocate up to 6 mcrs to mark as much of ram as possible -	 * as write combining, weak store ordered. -	 * -	 * Load the registers with type 25 -	 *	8	-	weak write ordering -	 *	16	-	weak read ordering -	 *	1	-	write combining -	 */ -	used = centaur_mcr_compute(6, 25); - -	/* -	 * Mark the registers we are using. -	 */ -	rdmsr(MSR_IDT_MCR_CTRL, lo, hi); -	for (i = 0; i < used; i++) -		lo |= 1<<(9+i); -	wrmsr(MSR_IDT_MCR_CTRL, lo, hi); - -	/* -	 * Wipe unused MCRs -	 */ - -	for (i = used; i < 8; i++) -		wrmsr(MSR_IDT_MCR0+i, 0, 0); -} - -/* - * Handle the MCR key on the Winchip 2. 
- */ -static void winchip2_unprotect_mcr(void) -{ -	u32 lo, hi; -	u32 key; - -	rdmsr(MSR_IDT_MCR_CTRL, lo, hi); -	lo &= ~0x1C0;	/* blank bits 8-6 */ -	key = (lo>>17) & 7; -	lo |= key<<6;	/* replace with unlock key */ -	wrmsr(MSR_IDT_MCR_CTRL, lo, hi); -} - -static void winchip2_protect_mcr(void) -{ -	u32 lo, hi; - -	rdmsr(MSR_IDT_MCR_CTRL, lo, hi); -	lo &= ~0x1C0;	/* blank bits 8-6 */ -	wrmsr(MSR_IDT_MCR_CTRL, lo, hi); -} -#endif /* CONFIG_X86_OOSTORE */ -  #define ACE_PRESENT	(1 << 6)  #define ACE_ENABLED	(1 << 7)  #define ACE_FCR		(1 << 28)	/* MSR_VIA_FCR */ @@ -363,20 +132,6 @@ static void init_centaur(struct cpuinfo_x86 *c)  			fcr_clr = DPDC;  			printk(KERN_NOTICE "Disabling bugged TSC.\n");  			clear_cpu_cap(c, X86_FEATURE_TSC); -#ifdef CONFIG_X86_OOSTORE -			centaur_create_optimal_mcr(); -			/* -			 * Enable: -			 *	write combining on non-stack, non-string -			 *	write combining on string, all types -			 *	weak write ordering -			 * -			 * The C6 original lacks weak read order -			 * -			 * Note 0x120 is write only on Winchip 1 -			 */ -			wrmsr(MSR_IDT_MCR_CTRL, 0x01F0001F, 0); -#endif  			break;  		case 8:  			switch (c->x86_mask) { @@ -393,40 +148,12 @@ static void init_centaur(struct cpuinfo_x86 *c)  			fcr_set = ECX8|DSMC|DTLOCK|EMMX|EBRPRED|ERETSTK|  				  E2MMX|EAMD3D;  			fcr_clr = DPDC; -#ifdef CONFIG_X86_OOSTORE -			winchip2_unprotect_mcr(); -			winchip2_create_optimal_mcr(); -			rdmsr(MSR_IDT_MCR_CTRL, lo, hi); -			/* -			 * Enable: -			 *	write combining on non-stack, non-string -			 *	write combining on string, all types -			 *	weak write ordering -			 */ -			lo |= 31; -			wrmsr(MSR_IDT_MCR_CTRL, lo, hi); -			winchip2_protect_mcr(); -#endif  			break;  		case 9:  			name = "3";  			fcr_set = ECX8|DSMC|DTLOCK|EMMX|EBRPRED|ERETSTK|  				  E2MMX|EAMD3D;  			fcr_clr = DPDC; -#ifdef CONFIG_X86_OOSTORE -			winchip2_unprotect_mcr(); -			winchip2_create_optimal_mcr(); -			rdmsr(MSR_IDT_MCR_CTRL, lo, hi); -			/* -			 * Enable: -			 *	write combining on non-stack, non-string -			 *	write combining on string, all types -			 *	weak write ordering -			 */ -			lo |= 31; -			wrmsr(MSR_IDT_MCR_CTRL, lo, hi); -			winchip2_protect_mcr(); -#endif  			break;  		default:  			name = "??"; @@ -468,10 +195,10 @@ static void init_centaur(struct cpuinfo_x86 *c)  #endif  } +#ifdef CONFIG_X86_32  static unsigned int  centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size)  { -#ifdef CONFIG_X86_32  	/* VIA C3 CPUs (670-68F) need further shifting. 
*/  	if ((c->x86 == 6) && ((c->x86_model == 7) || (c->x86_model == 8)))  		size >>= 8; @@ -484,16 +211,18 @@ centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size)  	if ((c->x86 == 6) && (c->x86_model == 9) &&  				(c->x86_mask == 1) && (size == 65))  		size -= 1; -#endif  	return size;  } +#endif  static const struct cpu_dev centaur_cpu_dev = {  	.c_vendor	= "Centaur",  	.c_ident	= { "CentaurHauls" },  	.c_early_init	= early_init_centaur,  	.c_init		= init_centaur, -	.c_size_cache	= centaur_size_cache, +#ifdef CONFIG_X86_32 +	.legacy_cache_size = centaur_size_cache, +#endif  	.c_x86_vendor	= X86_VENDOR_CENTAUR,  }; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 2793d1f095a..ef1b93f18ed 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -8,6 +8,7 @@  #include <linux/delay.h>  #include <linux/sched.h>  #include <linux/init.h> +#include <linux/kprobes.h>  #include <linux/kgdb.h>  #include <linux/smp.h>  #include <linux/io.h> @@ -20,6 +21,7 @@  #include <asm/processor.h>  #include <asm/debugreg.h>  #include <asm/sections.h> +#include <asm/vsyscall.h>  #include <linux/topology.h>  #include <linux/cpumask.h>  #include <asm/pgtable.h> @@ -284,8 +286,13 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c)  	raw_local_save_flags(eflags);  	BUG_ON(eflags & X86_EFLAGS_AC); -	if (cpu_has(c, X86_FEATURE_SMAP)) +	if (cpu_has(c, X86_FEATURE_SMAP)) { +#ifdef CONFIG_X86_SMAP  		set_in_cr4(X86_CR4_SMAP); +#else +		clear_in_cr4(X86_CR4_SMAP); +#endif +	}  }  /* @@ -346,7 +353,8 @@ static void filter_cpuid_features(struct cpuinfo_x86 *c, bool warn)  /* Look up CPU names by table lookup. */  static const char *table_lookup_model(struct cpuinfo_x86 *c)  { -	const struct cpu_model_info *info; +#ifdef CONFIG_X86_32 +	const struct legacy_cpu_model_info *info;  	if (c->x86_model >= 16)  		return NULL;	/* Range check */ @@ -354,13 +362,14 @@ static const char *table_lookup_model(struct cpuinfo_x86 *c)  	if (!this_cpu)  		return NULL; -	info = this_cpu->c_models; +	info = this_cpu->legacy_models; -	while (info && info->family) { +	while (info->family) {  		if (info->family == c->x86)  			return info->model_names[c->x86_model];  		info++;  	} +#endif  	return NULL;		/* Not found */  } @@ -450,8 +459,8 @@ void cpu_detect_cache_sizes(struct cpuinfo_x86 *c)  	c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);  #else  	/* do processor-specific cache resizing */ -	if (this_cpu->c_size_cache) -		l2size = this_cpu->c_size_cache(c, l2size); +	if (this_cpu->legacy_cache_size) +		l2size = this_cpu->legacy_cache_size(c, l2size);  	/* Allow user to override all this if necessary. 
*/  	if (cachesize_override != -1) @@ -470,6 +479,7 @@ u16 __read_mostly tlb_lli_4m[NR_INFO];  u16 __read_mostly tlb_lld_4k[NR_INFO];  u16 __read_mostly tlb_lld_2m[NR_INFO];  u16 __read_mostly tlb_lld_4m[NR_INFO]; +u16 __read_mostly tlb_lld_1g[NR_INFO];  /*   * tlb_flushall_shift shows the balance point in replacing cr3 write @@ -484,13 +494,13 @@ void cpu_detect_tlb(struct cpuinfo_x86 *c)  	if (this_cpu->c_detect_tlb)  		this_cpu->c_detect_tlb(c); -	printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n" \ -		"Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d\n"	     \ +	printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n" +		"Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n"  		"tlb_flushall_shift: %d\n",  		tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES],  		tlb_lli_4m[ENTRIES], tlb_lld_4k[ENTRIES],  		tlb_lld_2m[ENTRIES], tlb_lld_4m[ENTRIES], -		tlb_flushall_shift); +		tlb_lld_1g[ENTRIES], tlb_flushall_shift);  }  void detect_ht(struct cpuinfo_x86 *c) @@ -945,6 +955,38 @@ static void vgetcpu_set_mode(void)  	else  		vgetcpu_mode = VGETCPU_LSL;  } + +/* May not be __init: called during resume */ +static void syscall32_cpu_init(void) +{ +	/* Load these always in case some future AMD CPU supports +	   SYSENTER from compat mode too. */ +	wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); +	wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); +	wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target); + +	wrmsrl(MSR_CSTAR, ia32_cstar_target); +} +#endif + +#ifdef CONFIG_X86_32 +void enable_sep_cpu(void) +{ +	int cpu = get_cpu(); +	struct tss_struct *tss = &per_cpu(init_tss, cpu); + +	if (!boot_cpu_has(X86_FEATURE_SEP)) { +		put_cpu(); +		return; +	} + +	tss->x86_tss.ss1 = __KERNEL_CS; +	tss->x86_tss.sp1 = sizeof(struct tss_struct) + (unsigned long) tss; +	wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); +	wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.sp1, 0); +	wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) ia32_sysenter_target, 0); +	put_cpu(); +}  #endif  void __init identify_boot_cpu(void) @@ -1017,7 +1059,8 @@ __setup("show_msr=", setup_show_msr);  static __init int setup_noclflush(char *arg)  { -	setup_clear_cpu_cap(X86_FEATURE_CLFLSH); +	setup_clear_cpu_cap(X86_FEATURE_CLFLUSH); +	setup_clear_cpu_cap(X86_FEATURE_CLFLUSHOPT);  	return 1;  }  __setup("noclflush", setup_noclflush); @@ -1070,6 +1113,10 @@ static __init int setup_disablecpuid(char *arg)  }  __setup("clearcpuid=", setup_disablecpuid); +DEFINE_PER_CPU(unsigned long, kernel_stack) = +	(unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE; +EXPORT_PER_CPU_SYMBOL(kernel_stack); +  #ifdef CONFIG_X86_64  struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table };  struct desc_ptr debug_idt_descr = { NR_VECTORS * 16 - 1, @@ -1086,15 +1133,14 @@ DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned =  	&init_task;  EXPORT_PER_CPU_SYMBOL(current_task); -DEFINE_PER_CPU(unsigned long, kernel_stack) = -	(unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE; -EXPORT_PER_CPU_SYMBOL(kernel_stack); -  DEFINE_PER_CPU(char *, irq_stack_ptr) =  	init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64;  DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1; +DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; +EXPORT_PER_CPU_SYMBOL(__preempt_count); +  DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);  /* @@ -1148,6 +1194,7 @@ int is_debug_stack(unsigned long addr)  		(addr <= __get_cpu_var(debug_stack_addr) &&  		 addr > 
(__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ));  } +NOKPROBE_SYMBOL(is_debug_stack);  DEFINE_PER_CPU(u32, debug_idt_ctr); @@ -1156,6 +1203,7 @@ void debug_stack_set_zero(void)  	this_cpu_inc(debug_idt_ctr);  	load_current_idt();  } +NOKPROBE_SYMBOL(debug_stack_set_zero);  void debug_stack_reset(void)  { @@ -1164,11 +1212,14 @@ void debug_stack_reset(void)  	if (this_cpu_dec_return(debug_idt_ctr) == 0)  		load_current_idt();  } +NOKPROBE_SYMBOL(debug_stack_reset);  #else	/* CONFIG_X86_64 */  DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;  EXPORT_PER_CPU_SYMBOL(current_task); +DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; +EXPORT_PER_CPU_SYMBOL(__preempt_count);  DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);  #ifdef CONFIG_CC_STACKPROTECTOR diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 4041c24ae7d..c37dc37e831 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -1,12 +1,6 @@  #ifndef ARCH_X86_CPU_H  #define ARCH_X86_CPU_H -struct cpu_model_info { -	int		vendor; -	int		family; -	const char	*model_names[16]; -}; -  /* attempt to consolidate cpu attributes */  struct cpu_dev {  	const char	*c_vendor; @@ -14,15 +8,23 @@ struct cpu_dev {  	/* some have two possibilities for cpuid string */  	const char	*c_ident[2]; -	struct		cpu_model_info c_models[4]; -  	void            (*c_early_init)(struct cpuinfo_x86 *);  	void		(*c_bsp_init)(struct cpuinfo_x86 *);  	void		(*c_init)(struct cpuinfo_x86 *);  	void		(*c_identify)(struct cpuinfo_x86 *);  	void		(*c_detect_tlb)(struct cpuinfo_x86 *); -	unsigned int	(*c_size_cache)(struct cpuinfo_x86 *, unsigned int);  	int		c_x86_vendor; +#ifdef CONFIG_X86_32 +	/* Optional vendor specific routine to obtain the cache size. */ +	unsigned int	(*legacy_cache_size)(struct cpuinfo_x86 *, +					     unsigned int); + +	/* Family/stepping-based lookup table for model names. 
*/ +	struct legacy_cpu_model_info { +		int		family; +		const char	*model_names[16]; +	}		legacy_models[5]; +#endif  };  struct _tlb_table { diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index d0969c75ab5..aaf152e7963 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -1,4 +1,3 @@ -#include <linux/init.h>  #include <linux/bitops.h>  #include <linux/delay.h>  #include <linux/pci.h> diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index ec7299566f7..f9e4fdd3b87 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -1,4 +1,3 @@ -#include <linux/init.h>  #include <linux/kernel.h>  #include <linux/string.h> @@ -32,11 +31,8 @@ static void early_init_intel(struct cpuinfo_x86 *c)  	/* Unmask CPUID levels if masked: */  	if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) { -		rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); - -		if (misc_enable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID) { -			misc_enable &= ~MSR_IA32_MISC_ENABLE_LIMIT_CPUID; -			wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable); +		if (msr_clear_bit(MSR_IA32_MISC_ENABLE, +				  MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT) > 0) {  			c->cpuid_level = cpuid_eax(0);  			get_cpu_cap(c);  		} @@ -93,7 +89,7 @@ static void early_init_intel(struct cpuinfo_x86 *c)  		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);  		set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);  		if (!check_tsc_unstable()) -			sched_clock_stable = 1; +			set_sched_clock_stable();  	}  	/* Penwell and Cloverview have the TSC which doesn't sleep on S3 */ @@ -130,16 +126,10 @@ static void early_init_intel(struct cpuinfo_x86 *c)  	 * Ingo Molnar reported a Pentium D (model 6) and a Xeon  	 * (model 2) with the same problem.  	 */ -	if (c->x86 == 15) { -		rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); - -		if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) { -			printk(KERN_INFO "kmemcheck: Disabling fast string operations\n"); - -			misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING; -			wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable); -		} -	} +	if (c->x86 == 15) +		if (msr_clear_bit(MSR_IA32_MISC_ENABLE, +				  MSR_IA32_MISC_ENABLE_FAST_STRING_BIT) > 0) +			pr_info("kmemcheck: Disabling fast string operations\n");  #endif  	/* @@ -196,10 +186,16 @@ static void intel_smp_check(struct cpuinfo_x86 *c)  	}  } -static void intel_workarounds(struct cpuinfo_x86 *c) +static int forcepae; +static int __init forcepae_setup(char *__unused)  { -	unsigned long lo, hi; +	forcepae = 1; +	return 1; +} +__setup("forcepae", forcepae_setup); +static void intel_workarounds(struct cpuinfo_x86 *c) +{  #ifdef CONFIG_X86_F00F_BUG  	/*  	 * All current models of Pentium and Pentium with MMX technology CPUs @@ -226,16 +222,26 @@ static void intel_workarounds(struct cpuinfo_x86 *c)  		clear_cpu_cap(c, X86_FEATURE_SEP);  	/* +	 * PAE CPUID issue: many Pentium M report no PAE but may have a +	 * functionally usable PAE implementation. +	 * Forcefully enable PAE if kernel parameter "forcepae" is present. +	 */ +	if (forcepae) { +		printk(KERN_WARNING "PAE forced!\n"); +		set_cpu_cap(c, X86_FEATURE_PAE); +		add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_NOW_UNRELIABLE); +	} + +	/*  	 * P4 Xeon errata 037 workaround.  	 * Hardware prefetcher may cause stale data to be loaded into the cache.  	 
*/  	if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) { -		rdmsr(MSR_IA32_MISC_ENABLE, lo, hi); -		if ((lo & MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE) == 0) { -			printk (KERN_INFO "CPU: C0 stepping P4 Xeon detected.\n"); -			printk (KERN_INFO "CPU: Disabling hardware prefetching (Errata 037)\n"); -			lo |= MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE; -			wrmsr(MSR_IA32_MISC_ENABLE, lo, hi); +		if (msr_set_bit(MSR_IA32_MISC_ENABLE, +				MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE_BIT) +		    > 0) { +			pr_info("CPU: C0 stepping P4 Xeon detected.\n"); +			pr_info("CPU: Disabling hardware prefetching (Errata 037)\n");  		}  	} @@ -268,10 +274,6 @@ static void intel_workarounds(struct cpuinfo_x86 *c)  	}  #endif -#ifdef CONFIG_X86_NUMAQ -	numaq_tsc_disable(); -#endif -  	intel_smp_check(c);  }  #else @@ -368,6 +370,17 @@ static void init_intel(struct cpuinfo_x86 *c)  	 */  	detect_extended_topology(c); +	if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) { +		/* +		 * let's use the legacy cpuid vector 0x1 and 0x4 for topology +		 * detection. +		 */ +		c->x86_max_cores = intel_num_cpu_cores(c); +#ifdef CONFIG_X86_32 +		detect_ht(c); +#endif +	} +  	l2 = init_intel_cacheinfo(c);  	if (c->cpuid_level > 9) {  		unsigned eax = cpuid_eax(10); @@ -387,7 +400,8 @@ static void init_intel(struct cpuinfo_x86 *c)  			set_cpu_cap(c, X86_FEATURE_PEBS);  	} -	if (c->x86 == 6 && c->x86_model == 29 && cpu_has_clflush) +	if (c->x86 == 6 && cpu_has_clflush && +	    (c->x86_model == 29 || c->x86_model == 46 || c->x86_model == 47))  		set_cpu_cap(c, X86_FEATURE_CLFLUSH_MONITOR);  #ifdef CONFIG_X86_64 @@ -435,17 +449,6 @@ static void init_intel(struct cpuinfo_x86 *c)  		set_cpu_cap(c, X86_FEATURE_P3);  #endif -	if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) { -		/* -		 * let's use the legacy cpuid vector 0x1 and 0x4 for topology -		 * detection. 
-		 */ -		c->x86_max_cores = intel_num_cpu_cores(c); -#ifdef CONFIG_X86_32 -		detect_ht(c); -#endif -	} -  	/* Work around errata */  	srat_detect_node(c); @@ -505,6 +508,7 @@ static unsigned int intel_size_cache(struct cpuinfo_x86 *c, unsigned int size)  #define TLB_DATA0_2M_4M	0x23  #define STLB_4K		0x41 +#define STLB_4K_2M	0x42  static const struct _tlb_table intel_tlb_table[] = {  	{ 0x01, TLB_INST_4K,		32,	" TLB_INST 4 KByte pages, 4-way set associative" }, @@ -525,13 +529,20 @@ static const struct _tlb_table intel_tlb_table[] = {  	{ 0x5b, TLB_DATA_4K_4M,		64,	" TLB_DATA 4 KByte and 4 MByte pages" },  	{ 0x5c, TLB_DATA_4K_4M,		128,	" TLB_DATA 4 KByte and 4 MByte pages" },  	{ 0x5d, TLB_DATA_4K_4M,		256,	" TLB_DATA 4 KByte and 4 MByte pages" }, +	{ 0x61, TLB_INST_4K,		48,	" TLB_INST 4 KByte pages, full associative" }, +	{ 0x63, TLB_DATA_1G,		4,	" TLB_DATA 1 GByte pages, 4-way set associative" }, +	{ 0x76, TLB_INST_2M_4M,		8,	" TLB_INST 2-MByte or 4-MByte pages, fully associative" },  	{ 0xb0, TLB_INST_4K,		128,	" TLB_INST 4 KByte pages, 4-way set associative" },  	{ 0xb1, TLB_INST_2M_4M,		4,	" TLB_INST 2M pages, 4-way, 8 entries or 4M pages, 4-way entries" },  	{ 0xb2, TLB_INST_4K,		64,	" TLB_INST 4KByte pages, 4-way set associative" },  	{ 0xb3, TLB_DATA_4K,		128,	" TLB_DATA 4 KByte pages, 4-way set associative" },  	{ 0xb4, TLB_DATA_4K,		256,	" TLB_DATA 4 KByte pages, 4-way associative" }, +	{ 0xb5, TLB_INST_4K,		64,	" TLB_INST 4 KByte pages, 8-way set ssociative" }, +	{ 0xb6, TLB_INST_4K,		128,	" TLB_INST 4 KByte pages, 8-way set ssociative" },  	{ 0xba, TLB_DATA_4K,		64,	" TLB_DATA 4 KByte pages, 4-way associative" },  	{ 0xc0, TLB_DATA_4K_4M,		8,	" TLB_DATA 4 KByte and 4 MByte pages, 4-way associative" }, +	{ 0xc1, STLB_4K_2M,		1024,	" STLB 4 KByte and 2 MByte pages, 8-way associative" }, +	{ 0xc2, TLB_DATA_2M_4M,		16,	" DTLB 2 MByte/4MByte pages, 4-way associative" },  	{ 0xca, STLB_4K,		512,	" STLB 4 KByte pages, 4-way associative" },  	{ 0x00, 0, 0 }  }; @@ -557,6 +568,20 @@ static void intel_tlb_lookup(const unsigned char desc)  		if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)  			tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;  		break; +	case STLB_4K_2M: +		if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries) +			tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries; +		if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries) +			tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries; +		if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries) +			tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries; +		if (tlb_lld_2m[ENTRIES] < intel_tlb_table[k].entries) +			tlb_lld_2m[ENTRIES] = intel_tlb_table[k].entries; +		if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries) +			tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries; +		if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries) +			tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries; +		break;  	case TLB_INST_ALL:  		if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)  			tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries; @@ -602,6 +627,10 @@ static void intel_tlb_lookup(const unsigned char desc)  		if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)  			tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;  		break; +	case TLB_DATA_1G: +		if (tlb_lld_1g[ENTRIES] < intel_tlb_table[k].entries) +			tlb_lld_1g[ENTRIES] = intel_tlb_table[k].entries; +		break;  	}  } @@ -614,21 +643,17 @@ static void intel_tlb_flushall_shift_set(struct cpuinfo_x86 *c)  	case 0x61d: /* six-core 45 nm xeon "Dunnington" */  		tlb_flushall_shift = -1;  		break; +	case 
0x63a: /* Ivybridge */ +		tlb_flushall_shift = 2; +		break;  	case 0x61a: /* 45 nm nehalem, "Bloomfield" */  	case 0x61e: /* 45 nm nehalem, "Lynnfield" */  	case 0x625: /* 32 nm nehalem, "Clarkdale" */  	case 0x62c: /* 32 nm nehalem, "Gulftown" */  	case 0x62e: /* 45 nm nehalem-ex, "Beckton" */  	case 0x62f: /* 32 nm Xeon E7 */ -		tlb_flushall_shift = 6; -		break;  	case 0x62a: /* SandyBridge */  	case 0x62d: /* SandyBridge, "Romely-EP" */ -		tlb_flushall_shift = 5; -		break; -	case 0x63a: /* Ivybridge */ -		tlb_flushall_shift = 1; -		break;  	default:  		tlb_flushall_shift = 6;  	} @@ -665,8 +690,8 @@ static const struct cpu_dev intel_cpu_dev = {  	.c_vendor	= "Intel",  	.c_ident	= { "GenuineIntel" },  #ifdef CONFIG_X86_32 -	.c_models = { -		{ .vendor = X86_VENDOR_INTEL, .family = 4, .model_names = +	.legacy_models = { +		{ .family = 4, .model_names =  		  {  			  [0] = "486 DX-25/33",  			  [1] = "486 DX-50", @@ -679,7 +704,7 @@ static const struct cpu_dev intel_cpu_dev = {  			  [9] = "486 DX/4-WB"  		  }  		}, -		{ .vendor = X86_VENDOR_INTEL, .family = 5, .model_names = +		{ .family = 5, .model_names =  		  {  			  [0] = "Pentium 60/66 A-step",  			  [1] = "Pentium 60/66", @@ -690,7 +715,7 @@ static const struct cpu_dev intel_cpu_dev = {  			  [8] = "Mobile Pentium MMX"  		  }  		}, -		{ .vendor = X86_VENDOR_INTEL, .family = 6, .model_names = +		{ .family = 6, .model_names =  		  {  			  [0] = "Pentium Pro A-step",  			  [1] = "Pentium Pro", @@ -704,7 +729,7 @@ static const struct cpu_dev intel_cpu_dev = {  			  [11] = "Pentium III (Tualatin)",  		  }  		}, -		{ .vendor = X86_VENDOR_INTEL, .family = 15, .model_names = +		{ .family = 15, .model_names =  		  {  			  [0] = "Pentium 4 (Unknown)",  			  [1] = "Pentium 4 (Willamette)", @@ -714,7 +739,7 @@ static const struct cpu_dev intel_cpu_dev = {  		  }  		},  	}, -	.c_size_cache	= intel_size_cache, +	.legacy_cache_size = intel_size_cache,  #endif  	.c_detect_tlb	= intel_detect_tlb,  	.c_early_init   = early_init_intel, diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 1414c90feab..9c8f7394c61 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -1,5 +1,5 @@  /* - *	Routines to indentify caches on Intel CPU. + *	Routines to identify caches on Intel CPU.   *   *	Changes:   *	Venkatesh Pallipadi	: Adding cache identification through cpuid(4) @@ -730,6 +730,18 @@ unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c)  #endif  	} +#ifdef CONFIG_X86_HT +	/* +	 * If cpu_llc_id is not yet set, this means cpuid_level < 4 which in +	 * turns means that the only possibility is SMT (as indicated in +	 * cpuid1). Since cpuid2 doesn't specify shared caches, and we know +	 * that SMT shares all caches, we can unconditionally set cpu_llc_id to +	 * c->phys_proc_id. +	 */ +	if (per_cpu(cpu_llc_id, cpu) == BAD_APICID) +		per_cpu(cpu_llc_id, cpu) = c->phys_proc_id; +#endif +  	c->x86_cache_size = l3 ? l3 : (l2 ? 
l2 : (l1i+l1d));  	return l2; @@ -1225,21 +1237,24 @@ static struct notifier_block cacheinfo_cpu_notifier = {  static int __init cache_sysfs_init(void)  { -	int i; +	int i, err = 0;  	if (num_cache_leaves == 0)  		return 0; +	cpu_notifier_register_begin();  	for_each_online_cpu(i) { -		int err;  		struct device *dev = get_cpu_device(i);  		err = cache_add_dev(dev);  		if (err) -			return err; +			goto out;  	} -	register_hotcpu_notifier(&cacheinfo_cpu_notifier); -	return 0; +	__register_hotcpu_notifier(&cacheinfo_cpu_notifier); + +out: +	cpu_notifier_register_done(); +	return err;  }  device_initcall(cache_sysfs_init); diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c index 36565373af8..afa9f0d487e 100644 --- a/arch/x86/kernel/cpu/match.c +++ b/arch/x86/kernel/cpu/match.c @@ -47,45 +47,3 @@ const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match)  	return NULL;  }  EXPORT_SYMBOL(x86_match_cpu); - -ssize_t arch_print_cpu_modalias(struct device *dev, -				struct device_attribute *attr, -				char *bufptr) -{ -	int size = PAGE_SIZE; -	int i, n; -	char *buf = bufptr; - -	n = snprintf(buf, size, "x86cpu:vendor:%04X:family:%04X:" -		     "model:%04X:feature:", -		boot_cpu_data.x86_vendor, -		boot_cpu_data.x86, -		boot_cpu_data.x86_model); -	size -= n; -	buf += n; -	size -= 1; -	for (i = 0; i < NCAPINTS*32; i++) { -		if (boot_cpu_has(i)) { -			n = snprintf(buf, size, ",%04X", i); -			if (n >= size) { -				WARN(1, "x86 features overflow page\n"); -				break; -			} -			size -= n; -			buf += n; -		} -	} -	*buf++ = '\n'; -	return buf - bufptr; -} - -int arch_cpu_uevent(struct device *dev, struct kobj_uevent_env *env) -{ -	char *buf = kzalloc(PAGE_SIZE, GFP_KERNEL); -	if (buf) { -		arch_print_cpu_modalias(NULL, NULL, buf); -		add_uevent_var(env, "MODALIAS=%s", buf); -		kfree(buf); -	} -	return 0; -} diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c index cd8b166a173..a1aef953315 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-apei.c +++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c @@ -33,23 +33,28 @@  #include <linux/acpi.h>  #include <linux/cper.h>  #include <acpi/apei.h> +#include <acpi/ghes.h>  #include <asm/mce.h>  #include "mce-internal.h" -void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err) +void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err)  {  	struct mce m; -	/* Only corrected MC is reported */ -	if (!corrected || !(mem_err->validation_bits & -				CPER_MEM_VALID_PHYSICAL_ADDRESS)) +	if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))  		return;  	mce_setup(&m);  	m.bank = 1; -	/* Fake a memory read corrected error with unknown channel */ +	/* Fake a memory read error with unknown channel */  	m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f; + +	if (severity >= GHES_SEV_RECOVERABLE) +		m.status |= MCI_STATUS_UC; +	if (severity >= GHES_SEV_PANIC) +		m.status |= MCI_STATUS_PCC; +  	m.addr = mem_err->physical_addr;  	mce_log(&m);  	mce_notify_irq(); diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index b3218cdee95..9a79c8dbd8e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -60,8 +60,6 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex);  #define SPINUNIT 100	/* 100ns */ -atomic_t mce_entry; -  DEFINE_PER_CPU(unsigned, mce_exception_count);  struct mce_bank *mce_banks __read_mostly; @@ -89,6 +87,9 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);  static DEFINE_PER_CPU(struct mce, 
mces_seen);  static int			cpu_missing; +/* CMCI storm detection filter */ +static DEFINE_PER_CPU(unsigned long, mce_polled_error); +  /*   * MCA banks polled by the period polling timer for corrected events.   * With Intel CMCI, this only has MCA banks which do not support CMCI (if any). @@ -614,6 +615,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)  		if (!(m.status & MCI_STATUS_VAL))  			continue; +		this_cpu_write(mce_polled_error, 1);  		/*  		 * Uncorrected or signalled events are handled by the exception  		 * handler when it is enabled, so don't process those here. @@ -700,8 +702,7 @@ static int mce_timed_out(u64 *t)  	if (!mca_cfg.monarch_timeout)  		goto out;  	if ((s64)*t < SPINUNIT) { -		/* CHECKME: Make panic default for 1 too? */ -		if (mca_cfg.tolerant < 1) +		if (mca_cfg.tolerant <= 1)  			mce_panic("Timeout synchronizing machine check over CPUs",  				  NULL, NULL);  		cpu_missing = 1; @@ -1037,8 +1038,6 @@ void do_machine_check(struct pt_regs *regs, long error_code)  	DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);  	char *msg = "Unknown"; -	atomic_inc(&mce_entry); -  	this_cpu_inc(mce_exception_count);  	if (!cfg->banks) @@ -1168,7 +1167,6 @@ void do_machine_check(struct pt_regs *regs, long error_code)  		mce_report_event(regs);  	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);  out: -	atomic_dec(&mce_entry);  	sync_core();  }  EXPORT_SYMBOL_GPL(do_machine_check); @@ -1278,10 +1276,18 @@ static unsigned long mce_adjust_timer_default(unsigned long interval)  static unsigned long (*mce_adjust_timer)(unsigned long interval) =  	mce_adjust_timer_default; +static int cmc_error_seen(void) +{ +	unsigned long *v = &__get_cpu_var(mce_polled_error); + +	return test_and_clear_bit(0, v); +} +  static void mce_timer_fn(unsigned long data)  {  	struct timer_list *t = &__get_cpu_var(mce_timer);  	unsigned long iv; +	int notify;  	WARN_ON(smp_processor_id() != data); @@ -1296,7 +1302,9 @@ static void mce_timer_fn(unsigned long data)  	 * polling interval, otherwise increase the polling interval.  	 
*/  	iv = __this_cpu_read(mce_next_interval); -	if (mce_notify_irq()) { +	notify = mce_notify_irq(); +	notify |= cmc_error_seen(); +	if (notify) {  		iv = max(iv / 2, (unsigned long) HZ/100);  	} else {  		iv = min(iv * 2, round_jiffies_relative(check_interval * HZ)); @@ -1638,15 +1646,15 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)  static void mce_start_timer(unsigned int cpu, struct timer_list *t)  { -	unsigned long iv = mce_adjust_timer(check_interval * HZ); - -	__this_cpu_write(mce_next_interval, iv); +	unsigned long iv = check_interval * HZ;  	if (mca_cfg.ignore_ce || !iv)  		return; +	per_cpu(mce_next_interval, cpu) = iv; +  	t->expires = round_jiffies(jiffies + iv); -	add_timer_on(t, smp_processor_id()); +	add_timer_on(t, cpu);  }  static void __mcheck_cpu_init_timer(void) @@ -2272,8 +2280,10 @@ static int mce_device_create(unsigned int cpu)  	dev->release = &mce_device_release;  	err = device_register(dev); -	if (err) +	if (err) { +		put_device(dev);  		return err; +	}  	for (i = 0; mce_device_attrs[i]; i++) {  		err = device_create_file(dev, mce_device_attrs[i]); @@ -2421,28 +2431,67 @@ static __init int mcheck_init_device(void)  	int err;  	int i = 0; -	if (!mce_available(&boot_cpu_data)) -		return -EIO; +	if (!mce_available(&boot_cpu_data)) { +		err = -EIO; +		goto err_out; +	} -	zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL); +	if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) { +		err = -ENOMEM; +		goto err_out; +	}  	mce_init_banks();  	err = subsys_system_register(&mce_subsys, NULL);  	if (err) -		return err; +		goto err_out_mem; +	cpu_notifier_register_begin();  	for_each_online_cpu(i) {  		err = mce_device_create(i); -		if (err) -			return err; +		if (err) { +			/* +			 * Register notifier anyway (and do not unreg it) so +			 * that we don't leave undeleted timers, see notifier +			 * callback above. +			 */ +			__register_hotcpu_notifier(&mce_cpu_notifier); +			cpu_notifier_register_done(); +			goto err_device_create; +		}  	} +	__register_hotcpu_notifier(&mce_cpu_notifier); +	cpu_notifier_register_done(); +  	register_syscore_ops(&mce_syscore_ops); -	register_hotcpu_notifier(&mce_cpu_notifier);  	/* register character device /dev/mcelog */ -	misc_register(&mce_chrdev_device); +	err = misc_register(&mce_chrdev_device); +	if (err) +		goto err_register; + +	return 0; + +err_register: +	unregister_syscore_ops(&mce_syscore_ops); + +err_device_create: +	/* +	 * We didn't keep track of which devices were created above, but +	 * even if we had, the set of online cpus might have changed. +	 * Play safe and remove for every possible cpu, since +	 * mce_device_remove() will do the right thing. 
+	 */ +	for_each_possible_cpu(i) +		mce_device_remove(i); + +err_out_mem: +	free_cpumask_var(mce_device_initialized); + +err_out: +	pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err);  	return err;  } diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 4cfe0458ca6..9a316b21df8 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -6,10 +6,10 @@   */  #include <linux/gfp.h> -#include <linux/init.h>  #include <linux/interrupt.h>  #include <linux/percpu.h>  #include <linux/sched.h> +#include <linux/cpumask.h>  #include <asm/apic.h>  #include <asm/processor.h>  #include <asm/msr.h> @@ -42,7 +42,7 @@ static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);   * cmci_discover_lock protects against parallel discovery attempts   * which could race against each other.   */ -static DEFINE_RAW_SPINLOCK(cmci_discover_lock); +static DEFINE_SPINLOCK(cmci_discover_lock);  #define CMCI_THRESHOLD		1  #define CMCI_POLL_INTERVAL	(30 * HZ) @@ -138,6 +138,22 @@ unsigned long mce_intel_adjust_timer(unsigned long interval)  	}  } +static void cmci_storm_disable_banks(void) +{ +	unsigned long flags, *owned; +	int bank; +	u64 val; + +	spin_lock_irqsave(&cmci_discover_lock, flags); +	owned = __get_cpu_var(mce_banks_owned); +	for_each_set_bit(bank, owned, MAX_NR_BANKS) { +		rdmsrl(MSR_IA32_MCx_CTL2(bank), val); +		val &= ~MCI_CTL2_CMCI_EN; +		wrmsrl(MSR_IA32_MCx_CTL2(bank), val); +	} +	spin_unlock_irqrestore(&cmci_discover_lock, flags); +} +  static bool cmci_storm_detect(void)  {  	unsigned int cnt = __this_cpu_read(cmci_storm_cnt); @@ -159,7 +175,7 @@ static bool cmci_storm_detect(void)  	if (cnt <= CMCI_STORM_THRESHOLD)  		return false; -	cmci_clear(); +	cmci_storm_disable_banks();  	__this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE);  	r = atomic_add_return(1, &cmci_storm_on_cpus);  	mce_timer_kick(CMCI_POLL_INTERVAL); @@ -195,7 +211,7 @@ static void cmci_discover(int banks)  	int i;  	int bios_wrong_thresh = 0; -	raw_spin_lock_irqsave(&cmci_discover_lock, flags); +	spin_lock_irqsave(&cmci_discover_lock, flags);  	for (i = 0; i < banks; i++) {  		u64 val;  		int bios_zero_thresh = 0; @@ -250,7 +266,7 @@ static void cmci_discover(int banks)  			WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks)));  		}  	} -	raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); +	spin_unlock_irqrestore(&cmci_discover_lock, flags);  	if (mca_cfg.bios_cmci_threshold && bios_wrong_thresh) {  		pr_info_once(  			"bios_cmci_threshold: Some banks do not have valid thresholds set\n"); @@ -300,10 +316,10 @@ void cmci_clear(void)  	if (!cmci_supported(&banks))  		return; -	raw_spin_lock_irqsave(&cmci_discover_lock, flags); +	spin_lock_irqsave(&cmci_discover_lock, flags);  	for (i = 0; i < banks; i++)  		__cmci_disable_bank(i); -	raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); +	spin_unlock_irqrestore(&cmci_discover_lock, flags);  }  static void cmci_rediscover_work_func(void *arg) @@ -344,9 +360,9 @@ void cmci_disable_bank(int bank)  	if (!cmci_supported(&banks))  		return; -	raw_spin_lock_irqsave(&cmci_discover_lock, flags); +	spin_lock_irqsave(&cmci_discover_lock, flags);  	__cmci_disable_bank(bank); -	raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); +	spin_unlock_irqrestore(&cmci_discover_lock, flags);  }  static void intel_init_cmci(void) diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c index 1c044b1ccc5..a3042989398 100644 --- a/arch/x86/kernel/cpu/mcheck/p5.c +++ 
b/arch/x86/kernel/cpu/mcheck/p5.c @@ -5,7 +5,6 @@  #include <linux/interrupt.h>  #include <linux/kernel.h>  #include <linux/types.h> -#include <linux/init.h>  #include <linux/smp.h>  #include <asm/processor.h> diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 3eec7de76ef..36a1bb6d1ee 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -271,9 +271,6 @@ static void thermal_throttle_remove_dev(struct device *dev)  	sysfs_remove_group(&dev->kobj, &thermal_attr_group);  } -/* Mutex protecting device creation against CPU hotplug: */ -static DEFINE_MUTEX(therm_cpu_lock); -  /* Get notified when a cpu comes on/off. Be hotplug friendly. */  static int  thermal_throttle_cpu_callback(struct notifier_block *nfb, @@ -289,18 +286,14 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb,  	switch (action) {  	case CPU_UP_PREPARE:  	case CPU_UP_PREPARE_FROZEN: -		mutex_lock(&therm_cpu_lock);  		err = thermal_throttle_add_dev(dev, cpu); -		mutex_unlock(&therm_cpu_lock);  		WARN_ON(err);  		break;  	case CPU_UP_CANCELED:  	case CPU_UP_CANCELED_FROZEN:  	case CPU_DEAD:  	case CPU_DEAD_FROZEN: -		mutex_lock(&therm_cpu_lock);  		thermal_throttle_remove_dev(dev); -		mutex_unlock(&therm_cpu_lock);  		break;  	}  	return notifier_from_errno(err); @@ -319,19 +312,16 @@ static __init int thermal_throttle_init_device(void)  	if (!atomic_read(&therm_throt_en))  		return 0; -	register_hotcpu_notifier(&thermal_throttle_cpu_notifier); +	cpu_notifier_register_begin(); -#ifdef CONFIG_HOTPLUG_CPU -	mutex_lock(&therm_cpu_lock); -#endif  	/* connect live CPUs to sysfs */  	for_each_online_cpu(cpu) {  		err = thermal_throttle_add_dev(get_cpu_device(cpu), cpu);  		WARN_ON(err);  	} -#ifdef CONFIG_HOTPLUG_CPU -	mutex_unlock(&therm_cpu_lock); -#endif + +	__register_hotcpu_notifier(&thermal_throttle_cpu_notifier); +	cpu_notifier_register_done();  	return 0;  } @@ -439,14 +429,14 @@ static inline void __smp_thermal_interrupt(void)  	smp_thermal_vector();  } -asmlinkage void smp_thermal_interrupt(struct pt_regs *regs) +asmlinkage __visible void smp_thermal_interrupt(struct pt_regs *regs)  {  	entering_irq();  	__smp_thermal_interrupt();  	exiting_ack_irq();  } -asmlinkage void smp_trace_thermal_interrupt(struct pt_regs *regs) +asmlinkage __visible void smp_trace_thermal_interrupt(struct pt_regs *regs)  {  	entering_irq();  	trace_thermal_apic_entry(THERMAL_APIC_VECTOR); diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c index fe6b1c86645..7245980186e 100644 --- a/arch/x86/kernel/cpu/mcheck/threshold.c +++ b/arch/x86/kernel/cpu/mcheck/threshold.c @@ -24,14 +24,14 @@ static inline void __smp_threshold_interrupt(void)  	mce_threshold_vector();  } -asmlinkage void smp_threshold_interrupt(void) +asmlinkage __visible void smp_threshold_interrupt(void)  {  	entering_irq();  	__smp_threshold_interrupt();  	exiting_ack_irq();  } -asmlinkage void smp_trace_threshold_interrupt(void) +asmlinkage __visible void smp_trace_threshold_interrupt(void)  {  	entering_irq();  	trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR); diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c index e9a701aecaa..7dc5564d0cd 100644 --- a/arch/x86/kernel/cpu/mcheck/winchip.c +++ b/arch/x86/kernel/cpu/mcheck/winchip.c @@ -5,7 +5,6 @@  #include <linux/interrupt.h>  #include <linux/kernel.h>  #include <linux/types.h> -#include <linux/init.h>  #include <asm/processor.h>  #include 
<asm/mce.h> diff --git a/arch/x86/kernel/cpu/microcode/Makefile b/arch/x86/kernel/cpu/microcode/Makefile new file mode 100644 index 00000000000..285c85427c3 --- /dev/null +++ b/arch/x86/kernel/cpu/microcode/Makefile @@ -0,0 +1,7 @@ +microcode-y				:= core.o +obj-$(CONFIG_MICROCODE)			+= microcode.o +microcode-$(CONFIG_MICROCODE_INTEL)	+= intel.o intel_lib.o +microcode-$(CONFIG_MICROCODE_AMD)	+= amd.o +obj-$(CONFIG_MICROCODE_EARLY)		+= core_early.o +obj-$(CONFIG_MICROCODE_INTEL_EARLY)	+= intel_early.o +obj-$(CONFIG_MICROCODE_AMD_EARLY)	+= amd_early.o diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c new file mode 100644 index 00000000000..8fffd845e22 --- /dev/null +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -0,0 +1,492 @@ +/* + *  AMD CPU Microcode Update Driver for Linux + *  Copyright (C) 2008-2011 Advanced Micro Devices Inc. + * + *  Author: Peter Oruba <peter.oruba@amd.com> + * + *  Based on work by: + *  Tigran Aivazian <tigran@aivazian.fsnet.co.uk> + * + *  Maintainers: + *  Andreas Herrmann <herrmann.der.user@googlemail.com> + *  Borislav Petkov <bp@alien8.de> + * + *  This driver allows to upgrade microcode on F10h AMD + *  CPUs and later. + * + *  Licensed under the terms of the GNU General Public + *  License version 2. See file COPYING for details. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/firmware.h> +#include <linux/pci_ids.h> +#include <linux/uaccess.h> +#include <linux/vmalloc.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/pci.h> + +#include <asm/microcode.h> +#include <asm/processor.h> +#include <asm/msr.h> +#include <asm/microcode_amd.h> + +MODULE_DESCRIPTION("AMD Microcode Update Driver"); +MODULE_AUTHOR("Peter Oruba"); +MODULE_LICENSE("GPL v2"); + +static struct equiv_cpu_entry *equiv_cpu_table; + +struct ucode_patch { +	struct list_head plist; +	void *data; +	u32 patch_id; +	u16 equiv_cpu; +}; + +static LIST_HEAD(pcache); + +static u16 __find_equiv_id(unsigned int cpu) +{ +	struct ucode_cpu_info *uci = ucode_cpu_info + cpu; +	return find_equiv_id(equiv_cpu_table, uci->cpu_sig.sig); +} + +static u32 find_cpu_family_by_equiv_cpu(u16 equiv_cpu) +{ +	int i = 0; + +	BUG_ON(!equiv_cpu_table); + +	while (equiv_cpu_table[i].equiv_cpu != 0) { +		if (equiv_cpu == equiv_cpu_table[i].equiv_cpu) +			return equiv_cpu_table[i].installed_cpu; +		i++; +	} +	return 0; +} + +/* + * a small, trivial cache of per-family ucode patches + */ +static struct ucode_patch *cache_find_patch(u16 equiv_cpu) +{ +	struct ucode_patch *p; + +	list_for_each_entry(p, &pcache, plist) +		if (p->equiv_cpu == equiv_cpu) +			return p; +	return NULL; +} + +static void update_cache(struct ucode_patch *new_patch) +{ +	struct ucode_patch *p; + +	list_for_each_entry(p, &pcache, plist) { +		if (p->equiv_cpu == new_patch->equiv_cpu) { +			if (p->patch_id >= new_patch->patch_id) +				/* we already have the latest patch */ +				return; + +			list_replace(&p->plist, &new_patch->plist); +			kfree(p->data); +			kfree(p); +			return; +		} +	} +	/* no patch found, add it */ +	list_add_tail(&new_patch->plist, &pcache); +} + +static void free_cache(void) +{ +	struct ucode_patch *p, *tmp; + +	list_for_each_entry_safe(p, tmp, &pcache, plist) { +		__list_del(p->plist.prev, p->plist.next); +		kfree(p->data); +		kfree(p); +	} +} + +static struct ucode_patch *find_patch(unsigned int cpu) +{ +	u16 equiv_id; + +	equiv_id = __find_equiv_id(cpu); +	if (!equiv_id) +		return NULL; + +	return cache_find_patch(equiv_id); +} + +static int 
collect_cpu_info_amd(int cpu, struct cpu_signature *csig) +{ +	struct cpuinfo_x86 *c = &cpu_data(cpu); +	struct ucode_cpu_info *uci = ucode_cpu_info + cpu; +	struct ucode_patch *p; + +	csig->sig = cpuid_eax(0x00000001); +	csig->rev = c->microcode; + +	/* +	 * a patch could have been loaded early, set uci->mc so that +	 * mc_bp_resume() can call apply_microcode() +	 */ +	p = find_patch(cpu); +	if (p && (p->patch_id == csig->rev)) +		uci->mc = p->data; + +	pr_info("CPU%d: patch_level=0x%08x\n", cpu, csig->rev); + +	return 0; +} + +static unsigned int verify_patch_size(u8 family, u32 patch_size, +				      unsigned int size) +{ +	u32 max_size; + +#define F1XH_MPB_MAX_SIZE 2048 +#define F14H_MPB_MAX_SIZE 1824 +#define F15H_MPB_MAX_SIZE 4096 +#define F16H_MPB_MAX_SIZE 3458 + +	switch (family) { +	case 0x14: +		max_size = F14H_MPB_MAX_SIZE; +		break; +	case 0x15: +		max_size = F15H_MPB_MAX_SIZE; +		break; +	case 0x16: +		max_size = F16H_MPB_MAX_SIZE; +		break; +	default: +		max_size = F1XH_MPB_MAX_SIZE; +		break; +	} + +	if (patch_size > min_t(u32, size, max_size)) { +		pr_err("patch size mismatch\n"); +		return 0; +	} + +	return patch_size; +} + +int __apply_microcode_amd(struct microcode_amd *mc_amd) +{ +	u32 rev, dummy; + +	native_wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code); + +	/* verify patch application was successful */ +	native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); +	if (rev != mc_amd->hdr.patch_id) +		return -1; + +	return 0; +} + +int apply_microcode_amd(int cpu) +{ +	struct cpuinfo_x86 *c = &cpu_data(cpu); +	struct microcode_amd *mc_amd; +	struct ucode_cpu_info *uci; +	struct ucode_patch *p; +	u32 rev, dummy; + +	BUG_ON(raw_smp_processor_id() != cpu); + +	uci = ucode_cpu_info + cpu; + +	p = find_patch(cpu); +	if (!p) +		return 0; + +	mc_amd  = p->data; +	uci->mc = p->data; + +	rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); + +	/* need to apply patch? */ +	if (rev >= mc_amd->hdr.patch_id) { +		c->microcode = rev; +		uci->cpu_sig.rev = rev; +		return 0; +	} + +	if (__apply_microcode_amd(mc_amd)) { +		pr_err("CPU%d: update failed for patch_level=0x%08x\n", +			cpu, mc_amd->hdr.patch_id); +		return -1; +	} +	pr_info("CPU%d: new patch_level=0x%08x\n", cpu, +		mc_amd->hdr.patch_id); + +	uci->cpu_sig.rev = mc_amd->hdr.patch_id; +	c->microcode = mc_amd->hdr.patch_id; + +	return 0; +} + +static int install_equiv_cpu_table(const u8 *buf) +{ +	unsigned int *ibuf = (unsigned int *)buf; +	unsigned int type = ibuf[1]; +	unsigned int size = ibuf[2]; + +	if (type != UCODE_EQUIV_CPU_TABLE_TYPE || !size) { +		pr_err("empty section/" +		       "invalid type field in container file section header\n"); +		return -EINVAL; +	} + +	equiv_cpu_table = vmalloc(size); +	if (!equiv_cpu_table) { +		pr_err("failed to allocate equivalent CPU table\n"); +		return -ENOMEM; +	} + +	memcpy(equiv_cpu_table, buf + CONTAINER_HDR_SZ, size); + +	/* add header length */ +	return size + CONTAINER_HDR_SZ; +} + +static void free_equiv_cpu_table(void) +{ +	vfree(equiv_cpu_table); +	equiv_cpu_table = NULL; +} + +static void cleanup(void) +{ +	free_equiv_cpu_table(); +	free_cache(); +} + +/* + * We return the current size even if some of the checks failed so that + * we can skip over the next patch. If we return a negative value, we + * signal a grave error like a memory allocation has failed and the + * driver cannot continue functioning normally. In such cases, we tear + * down everything we've used up so far and exit. 
+ */ +static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover) +{ +	struct microcode_header_amd *mc_hdr; +	struct ucode_patch *patch; +	unsigned int patch_size, crnt_size, ret; +	u32 proc_fam; +	u16 proc_id; + +	patch_size  = *(u32 *)(fw + 4); +	crnt_size   = patch_size + SECTION_HDR_SIZE; +	mc_hdr	    = (struct microcode_header_amd *)(fw + SECTION_HDR_SIZE); +	proc_id	    = mc_hdr->processor_rev_id; + +	proc_fam = find_cpu_family_by_equiv_cpu(proc_id); +	if (!proc_fam) { +		pr_err("No patch family for equiv ID: 0x%04x\n", proc_id); +		return crnt_size; +	} + +	/* check if patch is for the current family */ +	proc_fam = ((proc_fam >> 8) & 0xf) + ((proc_fam >> 20) & 0xff); +	if (proc_fam != family) +		return crnt_size; + +	if (mc_hdr->nb_dev_id || mc_hdr->sb_dev_id) { +		pr_err("Patch-ID 0x%08x: chipset-specific code unsupported.\n", +			mc_hdr->patch_id); +		return crnt_size; +	} + +	ret = verify_patch_size(family, patch_size, leftover); +	if (!ret) { +		pr_err("Patch-ID 0x%08x: size mismatch.\n", mc_hdr->patch_id); +		return crnt_size; +	} + +	patch = kzalloc(sizeof(*patch), GFP_KERNEL); +	if (!patch) { +		pr_err("Patch allocation failure.\n"); +		return -EINVAL; +	} + +	patch->data = kzalloc(patch_size, GFP_KERNEL); +	if (!patch->data) { +		pr_err("Patch data allocation failure.\n"); +		kfree(patch); +		return -EINVAL; +	} + +	/* All looks ok, copy patch... */ +	memcpy(patch->data, fw + SECTION_HDR_SIZE, patch_size); +	INIT_LIST_HEAD(&patch->plist); +	patch->patch_id  = mc_hdr->patch_id; +	patch->equiv_cpu = proc_id; + +	pr_debug("%s: Added patch_id: 0x%08x, proc_id: 0x%04x\n", +		 __func__, patch->patch_id, proc_id); + +	/* ... and add to cache. */ +	update_cache(patch); + +	return crnt_size; +} + +static enum ucode_state __load_microcode_amd(u8 family, const u8 *data, +					     size_t size) +{ +	enum ucode_state ret = UCODE_ERROR; +	unsigned int leftover; +	u8 *fw = (u8 *)data; +	int crnt_size = 0; +	int offset; + +	offset = install_equiv_cpu_table(data); +	if (offset < 0) { +		pr_err("failed to create equivalent cpu table\n"); +		return ret; +	} +	fw += offset; +	leftover = size - offset; + +	if (*(u32 *)fw != UCODE_UCODE_TYPE) { +		pr_err("invalid type field in container file section header\n"); +		free_equiv_cpu_table(); +		return ret; +	} + +	while (leftover) { +		crnt_size = verify_and_add_patch(family, fw, leftover); +		if (crnt_size < 0) +			return ret; + +		fw	 += crnt_size; +		leftover -= crnt_size; +	} + +	return UCODE_OK; +} + +enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size) +{ +	enum ucode_state ret; + +	/* free old equiv table */ +	free_equiv_cpu_table(); + +	ret = __load_microcode_amd(family, data, size); + +	if (ret != UCODE_OK) +		cleanup(); + +#if defined(CONFIG_MICROCODE_AMD_EARLY) && defined(CONFIG_X86_32) +	/* save BSP's matching patch for early load */ +	if (cpu_data(smp_processor_id()).cpu_index == boot_cpu_data.cpu_index) { +		struct ucode_patch *p = find_patch(smp_processor_id()); +		if (p) { +			memset(amd_ucode_patch, 0, PATCH_MAX_SIZE); +			memcpy(amd_ucode_patch, p->data, min_t(u32, ksize(p->data), +							       PATCH_MAX_SIZE)); +		} +	} +#endif +	return ret; +} + +/* + * AMD microcode firmware naming convention, up to family 15h they are in + * the legacy file: + * + *    amd-ucode/microcode_amd.bin + * + * This legacy file is always smaller than 2K in size. 
+ * + * Beginning with family 15h, they are in family-specific firmware files: + * + *    amd-ucode/microcode_amd_fam15h.bin + *    amd-ucode/microcode_amd_fam16h.bin + *    ... + * + * These might be larger than 2K. + */ +static enum ucode_state request_microcode_amd(int cpu, struct device *device, +					      bool refresh_fw) +{ +	char fw_name[36] = "amd-ucode/microcode_amd.bin"; +	struct cpuinfo_x86 *c = &cpu_data(cpu); +	enum ucode_state ret = UCODE_NFOUND; +	const struct firmware *fw; + +	/* reload ucode container only on the boot cpu */ +	if (!refresh_fw || c->cpu_index != boot_cpu_data.cpu_index) +		return UCODE_OK; + +	if (c->x86 >= 0x15) +		snprintf(fw_name, sizeof(fw_name), "amd-ucode/microcode_amd_fam%.2xh.bin", c->x86); + +	if (request_firmware_direct(&fw, (const char *)fw_name, device)) { +		pr_debug("failed to load file %s\n", fw_name); +		goto out; +	} + +	ret = UCODE_ERROR; +	if (*(u32 *)fw->data != UCODE_MAGIC) { +		pr_err("invalid magic value (0x%08x)\n", *(u32 *)fw->data); +		goto fw_release; +	} + +	ret = load_microcode_amd(c->x86, fw->data, fw->size); + + fw_release: +	release_firmware(fw); + + out: +	return ret; +} + +static enum ucode_state +request_microcode_user(int cpu, const void __user *buf, size_t size) +{ +	return UCODE_ERROR; +} + +static void microcode_fini_cpu_amd(int cpu) +{ +	struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + +	uci->mc = NULL; +} + +static struct microcode_ops microcode_amd_ops = { +	.request_microcode_user           = request_microcode_user, +	.request_microcode_fw             = request_microcode_amd, +	.collect_cpu_info                 = collect_cpu_info_amd, +	.apply_microcode                  = apply_microcode_amd, +	.microcode_fini_cpu               = microcode_fini_cpu_amd, +}; + +struct microcode_ops * __init init_amd_microcode(void) +{ +	struct cpuinfo_x86 *c = &cpu_data(0); + +	if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) { +		pr_warning("AMD CPU family 0x%x not supported\n", c->x86); +		return NULL; +	} + +	return µcode_amd_ops; +} + +void __exit exit_amd_microcode(void) +{ +	cleanup(); +} diff --git a/arch/x86/kernel/cpu/microcode/amd_early.c b/arch/x86/kernel/cpu/microcode/amd_early.c new file mode 100644 index 00000000000..617a9e28424 --- /dev/null +++ b/arch/x86/kernel/cpu/microcode/amd_early.c @@ -0,0 +1,395 @@ +/* + * Copyright (C) 2013 Advanced Micro Devices, Inc. + * + * Author: Jacob Shin <jacob.shin@amd.com> + * Fixes: Borislav Petkov <bp@suse.de> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/earlycpio.h> +#include <linux/initrd.h> + +#include <asm/cpu.h> +#include <asm/setup.h> +#include <asm/microcode_amd.h> + +/* + * This points to the current valid container of microcode patches which we will + * save from the initrd before jettisoning its contents. + */ +static u8 *container; +static size_t container_size; + +static u32 ucode_new_rev; +u8 amd_ucode_patch[PATCH_MAX_SIZE]; +static u16 this_equiv_id; + +struct cpio_data ucode_cpio; + +/* + * Microcode patch container file is prepended to the initrd in cpio format. 
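+ * The loader locates it by the fixed cpio path assigned to ucode_path
+ * below, "kernel/x86/microcode/AuthenticAMD.bin"; such an initrd can be
+ * produced, for example, by prepending an uncompressed cpio archive
+ * containing that file to the regular initrd image.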
+ * See Documentation/x86/early-microcode.txt + */ +static __initdata char ucode_path[] = "kernel/x86/microcode/AuthenticAMD.bin"; + +static struct cpio_data __init find_ucode_in_initrd(void) +{ +	long offset = 0; +	char *path; +	void *start; +	size_t size; + +#ifdef CONFIG_X86_32 +	struct boot_params *p; + +	/* +	 * On 32-bit, early load occurs before paging is turned on so we need +	 * to use physical addresses. +	 */ +	p       = (struct boot_params *)__pa_nodebug(&boot_params); +	path    = (char *)__pa_nodebug(ucode_path); +	start   = (void *)p->hdr.ramdisk_image; +	size    = p->hdr.ramdisk_size; +#else +	path    = ucode_path; +	start   = (void *)(boot_params.hdr.ramdisk_image + PAGE_OFFSET); +	size    = boot_params.hdr.ramdisk_size; +#endif + +	return find_cpio_data(path, start, size, &offset); +} + +static size_t compute_container_size(u8 *data, u32 total_size) +{ +	size_t size = 0; +	u32 *header = (u32 *)data; + +	if (header[0] != UCODE_MAGIC || +	    header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */ +	    header[2] == 0)                            /* size */ +		return size; + +	size = header[2] + CONTAINER_HDR_SZ; +	total_size -= size; +	data += size; + +	while (total_size) { +		u16 patch_size; + +		header = (u32 *)data; + +		if (header[0] != UCODE_UCODE_TYPE) +			break; + +		/* +		 * Sanity-check patch size. +		 */ +		patch_size = header[1]; +		if (patch_size > PATCH_MAX_SIZE) +			break; + +		size	   += patch_size + SECTION_HDR_SIZE; +		data	   += patch_size + SECTION_HDR_SIZE; +		total_size -= patch_size + SECTION_HDR_SIZE; +	} + +	return size; +} + +/* + * Early load occurs before we can vmalloc(). So we look for the microcode + * patch container file in initrd, traverse equivalent cpu table, look for a + * matching microcode patch, and update, all in initrd memory in place. + * When vmalloc() is available for use later -- on 64-bit during first AP load, + * and on 32-bit during save_microcode_in_initrd_amd() -- we can call + * load_microcode_amd() to save equivalent cpu table and microcode patches in + * kernel heap memory. + */ +static void apply_ucode_in_initrd(void *ucode, size_t size) +{ +	struct equiv_cpu_entry *eq; +	size_t *cont_sz; +	u32 *header; +	u8  *data, **cont; +	u16 eq_id = 0; +	int offset, left; +	u32 rev, eax, ebx, ecx, edx; +	u32 *new_rev; + +#ifdef CONFIG_X86_32 +	new_rev = (u32 *)__pa_nodebug(&ucode_new_rev); +	cont_sz = (size_t *)__pa_nodebug(&container_size); +	cont	= (u8 **)__pa_nodebug(&container); +#else +	new_rev = &ucode_new_rev; +	cont_sz = &container_size; +	cont	= &container; +#endif + +	data   = ucode; +	left   = size; +	header = (u32 *)data; + +	/* find equiv cpu table */ +	if (header[0] != UCODE_MAGIC || +	    header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */ +	    header[2] == 0)                            /* size */ +		return; + +	eax = 0x00000001; +	ecx = 0; +	native_cpuid(&eax, &ebx, &ecx, &edx); + +	while (left > 0) { +		eq = (struct equiv_cpu_entry *)(data + CONTAINER_HDR_SZ); + +		*cont = data; + +		/* Advance past the container header */ +		offset = header[2] + CONTAINER_HDR_SZ; +		data  += offset; +		left  -= offset; + +		eq_id = find_equiv_id(eq, eax); +		if (eq_id) { +			this_equiv_id = eq_id; +			*cont_sz = compute_container_size(*cont, left + offset); + +			/* +			 * truncate how much we need to iterate over in the +			 * ucode update loop below +			 */ +			left = *cont_sz - offset; +			break; +		} + +		/* +		 * support multiple container files appended together. 
if this +		 * one does not have a matching equivalent cpu entry, we fast +		 * forward to the next container file. +		 */ +		while (left > 0) { +			header = (u32 *)data; +			if (header[0] == UCODE_MAGIC && +			    header[1] == UCODE_EQUIV_CPU_TABLE_TYPE) +				break; + +			offset = header[1] + SECTION_HDR_SIZE; +			data  += offset; +			left  -= offset; +		} + +		/* mark where the next microcode container file starts */ +		offset    = data - (u8 *)ucode; +		ucode     = data; +	} + +	if (!eq_id) { +		*cont = NULL; +		*cont_sz = 0; +		return; +	} + +	/* find ucode and update if needed */ + +	native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax); + +	while (left > 0) { +		struct microcode_amd *mc; + +		header = (u32 *)data; +		if (header[0] != UCODE_UCODE_TYPE || /* type */ +		    header[1] == 0)                  /* size */ +			break; + +		mc = (struct microcode_amd *)(data + SECTION_HDR_SIZE); + +		if (eq_id == mc->hdr.processor_rev_id && rev < mc->hdr.patch_id) { + +			if (!__apply_microcode_amd(mc)) { +				rev = mc->hdr.patch_id; +				*new_rev = rev; + +				/* save ucode patch */ +				memcpy(amd_ucode_patch, mc, +				       min_t(u32, header[1], PATCH_MAX_SIZE)); +			} +		} + +		offset  = header[1] + SECTION_HDR_SIZE; +		data   += offset; +		left   -= offset; +	} +} + +void __init load_ucode_amd_bsp(void) +{ +	struct cpio_data cp; +	void **data; +	size_t *size; + +#ifdef CONFIG_X86_32 +	data =  (void **)__pa_nodebug(&ucode_cpio.data); +	size = (size_t *)__pa_nodebug(&ucode_cpio.size); +#else +	data = &ucode_cpio.data; +	size = &ucode_cpio.size; +#endif + +	cp = find_ucode_in_initrd(); +	if (!cp.data) +		return; + +	*data = cp.data; +	*size = cp.size; + +	apply_ucode_in_initrd(cp.data, cp.size); +} + +#ifdef CONFIG_X86_32 +/* + * On 32-bit, since AP's early load occurs before paging is turned on, we + * cannot traverse cpu_equiv_table and pcache in kernel heap memory. So during + * cold boot, AP will apply_ucode_in_initrd() just like the BSP. During + * save_microcode_in_initrd_amd() BSP's patch is copied to amd_ucode_patch, + * which is used upon resume from suspend. + */ +void load_ucode_amd_ap(void) +{ +	struct microcode_amd *mc; +	size_t *usize; +	void **ucode; + +	mc = (struct microcode_amd *)__pa(amd_ucode_patch); +	if (mc->hdr.patch_id && mc->hdr.processor_rev_id) { +		__apply_microcode_amd(mc); +		return; +	} + +	ucode = (void *)__pa_nodebug(&container); +	usize = (size_t *)__pa_nodebug(&container_size); + +	if (!*ucode || !*usize) +		return; + +	apply_ucode_in_initrd(*ucode, *usize); +} + +static void __init collect_cpu_sig_on_bsp(void *arg) +{ +	unsigned int cpu = smp_processor_id(); +	struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + +	uci->cpu_sig.sig = cpuid_eax(0x00000001); +} + +static void __init get_bsp_sig(void) +{ +	unsigned int bsp = boot_cpu_data.cpu_index; +	struct ucode_cpu_info *uci = ucode_cpu_info + bsp; + +	if (!uci->cpu_sig.sig) +		smp_call_function_single(bsp, collect_cpu_sig_on_bsp, NULL, 1); +} +#else +void load_ucode_amd_ap(void) +{ +	unsigned int cpu = smp_processor_id(); +	struct ucode_cpu_info *uci = ucode_cpu_info + cpu; +	struct equiv_cpu_entry *eq; +	struct microcode_amd *mc; +	u32 rev, eax; +	u16 eq_id; + +	/* Exit if called on the BSP. 
*/ +	if (!cpu) +		return; + +	if (!container) +		return; + +	rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax); + +	uci->cpu_sig.rev = rev; +	uci->cpu_sig.sig = eax; + +	eax = cpuid_eax(0x00000001); +	eq  = (struct equiv_cpu_entry *)(container + CONTAINER_HDR_SZ); + +	eq_id = find_equiv_id(eq, eax); +	if (!eq_id) +		return; + +	if (eq_id == this_equiv_id) { +		mc = (struct microcode_amd *)amd_ucode_patch; + +		if (mc && rev < mc->hdr.patch_id) { +			if (!__apply_microcode_amd(mc)) +				ucode_new_rev = mc->hdr.patch_id; +		} + +	} else { +		if (!ucode_cpio.data) +			return; + +		/* +		 * AP has a different equivalence ID than BSP, looks like +		 * mixed-steppings silicon so go through the ucode blob anew. +		 */ +		apply_ucode_in_initrd(ucode_cpio.data, ucode_cpio.size); +	} +} +#endif + +int __init save_microcode_in_initrd_amd(void) +{ +	unsigned long cont; +	enum ucode_state ret; +	u32 eax; + +	if (!container) +		return -EINVAL; + +#ifdef CONFIG_X86_32 +	get_bsp_sig(); +	cont = (unsigned long)container; +#else +	/* +	 * We need the physical address of the container for both bitness since +	 * boot_params.hdr.ramdisk_image is a physical address. +	 */ +	cont = __pa(container); +#endif + +	/* +	 * Take into account the fact that the ramdisk might get relocated and +	 * therefore we need to recompute the container's position in virtual +	 * memory space. +	 */ +	if (relocated_ramdisk) +		container = (u8 *)(__va(relocated_ramdisk) + +			     (cont - boot_params.hdr.ramdisk_image)); + +	if (ucode_new_rev) +		pr_info("microcode: updated early to new patch_level=0x%08x\n", +			ucode_new_rev); + +	eax   = cpuid_eax(0x00000001); +	eax   = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); + +	ret = load_microcode_amd(eax, container, container_size); +	if (ret != UCODE_OK) +		return -EINVAL; + +	/* +	 * This will be freed any msec now, stash patches for the current +	 * family and switch to patch cache for cpu hotplug, etc later. +	 */ +	container = NULL; +	container_size = 0; + +	return 0; +} diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c new file mode 100644 index 00000000000..dd9d6190b08 --- /dev/null +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -0,0 +1,651 @@ +/* + *	Intel CPU Microcode Update Driver for Linux + * + *	Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk> + *		      2006	Shaohua Li <shaohua.li@intel.com> + * + *	This driver allows to upgrade microcode on Intel processors + *	belonging to IA-32 family - PentiumPro, Pentium II, + *	Pentium III, Xeon, Pentium 4, etc. + * + *	Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture + *	Software Developer's Manual + *	Order Number 253668 or free download from: + * + *	http://developer.intel.com/Assets/PDF/manual/253668.pdf	 + * + *	For more information, go to http://www.urbanmyth.org/microcode + * + *	This program is free software; you can redistribute it and/or + *	modify it under the terms of the GNU General Public License + *	as published by the Free Software Foundation; either version + *	2 of the License, or (at your option) any later version. + * + *	1.0	16 Feb 2000, Tigran Aivazian <tigran@sco.com> + *		Initial release. + *	1.01	18 Feb 2000, Tigran Aivazian <tigran@sco.com> + *		Added read() support + cleanups. + *	1.02	21 Feb 2000, Tigran Aivazian <tigran@sco.com> + *		Added 'device trimming' support. open(O_WRONLY) zeroes + *		and frees the saved copy of applied microcode. 
+ *	1.03	29 Feb 2000, Tigran Aivazian <tigran@sco.com> + *		Made to use devfs (/dev/cpu/microcode) + cleanups. + *	1.04	06 Jun 2000, Simon Trimmer <simon@veritas.com> + *		Added misc device support (now uses both devfs and misc). + *		Added MICROCODE_IOCFREE ioctl to clear memory. + *	1.05	09 Jun 2000, Simon Trimmer <simon@veritas.com> + *		Messages for error cases (non Intel & no suitable microcode). + *	1.06	03 Aug 2000, Tigran Aivazian <tigran@veritas.com> + *		Removed ->release(). Removed exclusive open and status bitmap. + *		Added microcode_rwsem to serialize read()/write()/ioctl(). + *		Removed global kernel lock usage. + *	1.07	07 Sep 2000, Tigran Aivazian <tigran@veritas.com> + *		Write 0 to 0x8B msr and then cpuid before reading revision, + *		so that it works even if there were no update done by the + *		BIOS. Otherwise, reading from 0x8B gives junk (which happened + *		to be 0 on my machine which is why it worked even when I + *		disabled update by the BIOS) + *		Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix. + *	1.08	11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and + *			     Tigran Aivazian <tigran@veritas.com> + *		Intel Pentium 4 processor support and bugfixes. + *	1.09	30 Oct 2001, Tigran Aivazian <tigran@veritas.com> + *		Bugfix for HT (Hyper-Threading) enabled processors + *		whereby processor resources are shared by all logical processors + *		in a single CPU package. + *	1.10	28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and + *		Tigran Aivazian <tigran@veritas.com>, + *		Serialize updates as required on HT processors due to + *		speculative nature of implementation. + *	1.11	22 Mar 2002 Tigran Aivazian <tigran@veritas.com> + *		Fix the panic when writing zero-length microcode chunk. + *	1.12	29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>, + *		Jun Nakajima <jun.nakajima@intel.com> + *		Support for the microcode updates in the new format. + *	1.13	10 Oct 2003 Tigran Aivazian <tigran@veritas.com> + *		Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl + *		because we no longer hold a copy of applied microcode + *		in kernel memory. + *	1.14	25 Jun 2004 Tigran Aivazian <tigran@veritas.com> + *		Fix sigmatch() macro to handle old CPUs with pf == 0. + *		Thanks to Stuart Swales for pointing out this bug. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/platform_device.h> +#include <linux/miscdevice.h> +#include <linux/capability.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/cpu.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/syscore_ops.h> + +#include <asm/microcode.h> +#include <asm/processor.h> +#include <asm/cpu_device_id.h> +#include <asm/perf_event.h> + +MODULE_DESCRIPTION("Microcode Update Driver"); +MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>"); +MODULE_LICENSE("GPL"); + +#define MICROCODE_VERSION	"2.00" + +static struct microcode_ops	*microcode_ops; + +bool dis_ucode_ldr; +module_param(dis_ucode_ldr, bool, 0); + +/* + * Synchronization. + * + * All non cpu-hotplug-callback call sites use: + * + * - microcode_mutex to synchronize with each other; + * - get/put_online_cpus() to synchronize with + *   the cpu-hotplug-callback call sites. + * + * We guarantee that only a single cpu is being + * updated at any particular moment of time. 
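+ *
+ * A typical non-hotplug call site (microcode_write(), reload_store(),
+ * microcode_init() below) therefore follows this sketch:
+ *
+ *	get_online_cpus();
+ *	mutex_lock(&microcode_mutex);
+ *	... request and/or apply microcode ...
+ *	mutex_unlock(&microcode_mutex);
+ *	put_online_cpus();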
+ */ +static DEFINE_MUTEX(microcode_mutex); + +struct ucode_cpu_info		ucode_cpu_info[NR_CPUS]; +EXPORT_SYMBOL_GPL(ucode_cpu_info); + +/* + * Operations that are run on a target cpu: + */ + +struct cpu_info_ctx { +	struct cpu_signature	*cpu_sig; +	int			err; +}; + +static void collect_cpu_info_local(void *arg) +{ +	struct cpu_info_ctx *ctx = arg; + +	ctx->err = microcode_ops->collect_cpu_info(smp_processor_id(), +						   ctx->cpu_sig); +} + +static int collect_cpu_info_on_target(int cpu, struct cpu_signature *cpu_sig) +{ +	struct cpu_info_ctx ctx = { .cpu_sig = cpu_sig, .err = 0 }; +	int ret; + +	ret = smp_call_function_single(cpu, collect_cpu_info_local, &ctx, 1); +	if (!ret) +		ret = ctx.err; + +	return ret; +} + +static int collect_cpu_info(int cpu) +{ +	struct ucode_cpu_info *uci = ucode_cpu_info + cpu; +	int ret; + +	memset(uci, 0, sizeof(*uci)); + +	ret = collect_cpu_info_on_target(cpu, &uci->cpu_sig); +	if (!ret) +		uci->valid = 1; + +	return ret; +} + +struct apply_microcode_ctx { +	int err; +}; + +static void apply_microcode_local(void *arg) +{ +	struct apply_microcode_ctx *ctx = arg; + +	ctx->err = microcode_ops->apply_microcode(smp_processor_id()); +} + +static int apply_microcode_on_target(int cpu) +{ +	struct apply_microcode_ctx ctx = { .err = 0 }; +	int ret; + +	ret = smp_call_function_single(cpu, apply_microcode_local, &ctx, 1); +	if (!ret) +		ret = ctx.err; + +	return ret; +} + +#ifdef CONFIG_MICROCODE_OLD_INTERFACE +static int do_microcode_update(const void __user *buf, size_t size) +{ +	int error = 0; +	int cpu; + +	for_each_online_cpu(cpu) { +		struct ucode_cpu_info *uci = ucode_cpu_info + cpu; +		enum ucode_state ustate; + +		if (!uci->valid) +			continue; + +		ustate = microcode_ops->request_microcode_user(cpu, buf, size); +		if (ustate == UCODE_ERROR) { +			error = -1; +			break; +		} else if (ustate == UCODE_OK) +			apply_microcode_on_target(cpu); +	} + +	return error; +} + +static int microcode_open(struct inode *inode, struct file *file) +{ +	return capable(CAP_SYS_RAWIO) ? 
nonseekable_open(inode, file) : -EPERM; +} + +static ssize_t microcode_write(struct file *file, const char __user *buf, +			       size_t len, loff_t *ppos) +{ +	ssize_t ret = -EINVAL; + +	if ((len >> PAGE_SHIFT) > totalram_pages) { +		pr_err("too much data (max %ld pages)\n", totalram_pages); +		return ret; +	} + +	get_online_cpus(); +	mutex_lock(µcode_mutex); + +	if (do_microcode_update(buf, len) == 0) +		ret = (ssize_t)len; + +	if (ret > 0) +		perf_check_microcode(); + +	mutex_unlock(µcode_mutex); +	put_online_cpus(); + +	return ret; +} + +static const struct file_operations microcode_fops = { +	.owner			= THIS_MODULE, +	.write			= microcode_write, +	.open			= microcode_open, +	.llseek		= no_llseek, +}; + +static struct miscdevice microcode_dev = { +	.minor			= MICROCODE_MINOR, +	.name			= "microcode", +	.nodename		= "cpu/microcode", +	.fops			= µcode_fops, +}; + +static int __init microcode_dev_init(void) +{ +	int error; + +	error = misc_register(µcode_dev); +	if (error) { +		pr_err("can't misc_register on minor=%d\n", MICROCODE_MINOR); +		return error; +	} + +	return 0; +} + +static void __exit microcode_dev_exit(void) +{ +	misc_deregister(µcode_dev); +} + +MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); +MODULE_ALIAS("devname:cpu/microcode"); +#else +#define microcode_dev_init()	0 +#define microcode_dev_exit()	do { } while (0) +#endif + +/* fake device for request_firmware */ +static struct platform_device	*microcode_pdev; + +static int reload_for_cpu(int cpu) +{ +	struct ucode_cpu_info *uci = ucode_cpu_info + cpu; +	enum ucode_state ustate; +	int err = 0; + +	if (!uci->valid) +		return err; + +	ustate = microcode_ops->request_microcode_fw(cpu, µcode_pdev->dev, true); +	if (ustate == UCODE_OK) +		apply_microcode_on_target(cpu); +	else +		if (ustate == UCODE_ERROR) +			err = -EINVAL; +	return err; +} + +static ssize_t reload_store(struct device *dev, +			    struct device_attribute *attr, +			    const char *buf, size_t size) +{ +	unsigned long val; +	int cpu; +	ssize_t ret = 0, tmp_ret; + +	ret = kstrtoul(buf, 0, &val); +	if (ret) +		return ret; + +	if (val != 1) +		return size; + +	get_online_cpus(); +	mutex_lock(µcode_mutex); +	for_each_online_cpu(cpu) { +		tmp_ret = reload_for_cpu(cpu); +		if (tmp_ret != 0) +			pr_warn("Error reloading microcode on CPU %d\n", cpu); + +		/* save retval of the first encountered reload error */ +		if (!ret) +			ret = tmp_ret; +	} +	if (!ret) +		perf_check_microcode(); +	mutex_unlock(µcode_mutex); +	put_online_cpus(); + +	if (!ret) +		ret = size; + +	return ret; +} + +static ssize_t version_show(struct device *dev, +			struct device_attribute *attr, char *buf) +{ +	struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; + +	return sprintf(buf, "0x%x\n", uci->cpu_sig.rev); +} + +static ssize_t pf_show(struct device *dev, +			struct device_attribute *attr, char *buf) +{ +	struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; + +	return sprintf(buf, "0x%x\n", uci->cpu_sig.pf); +} + +static DEVICE_ATTR(reload, 0200, NULL, reload_store); +static DEVICE_ATTR(version, 0400, version_show, NULL); +static DEVICE_ATTR(processor_flags, 0400, pf_show, NULL); + +static struct attribute *mc_default_attrs[] = { +	&dev_attr_version.attr, +	&dev_attr_processor_flags.attr, +	NULL +}; + +static struct attribute_group mc_attr_group = { +	.attrs			= mc_default_attrs, +	.name			= "microcode", +}; + +static void microcode_fini_cpu(int cpu) +{ +	microcode_ops->microcode_fini_cpu(cpu); +} + +static enum ucode_state microcode_resume_cpu(int cpu) +{ +	pr_debug("CPU%d updated upon 
resume\n", cpu); + +	if (apply_microcode_on_target(cpu)) +		return UCODE_ERROR; + +	return UCODE_OK; +} + +static enum ucode_state microcode_init_cpu(int cpu, bool refresh_fw) +{ +	enum ucode_state ustate; +	struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + +	if (uci && uci->valid) +		return UCODE_OK; + +	if (collect_cpu_info(cpu)) +		return UCODE_ERROR; + +	/* --dimm. Trigger a delayed update? */ +	if (system_state != SYSTEM_RUNNING) +		return UCODE_NFOUND; + +	ustate = microcode_ops->request_microcode_fw(cpu, µcode_pdev->dev, +						     refresh_fw); + +	if (ustate == UCODE_OK) { +		pr_debug("CPU%d updated upon init\n", cpu); +		apply_microcode_on_target(cpu); +	} + +	return ustate; +} + +static enum ucode_state microcode_update_cpu(int cpu) +{ +	struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + +	if (uci->valid) +		return microcode_resume_cpu(cpu); + +	return microcode_init_cpu(cpu, false); +} + +static int mc_device_add(struct device *dev, struct subsys_interface *sif) +{ +	int err, cpu = dev->id; + +	if (!cpu_online(cpu)) +		return 0; + +	pr_debug("CPU%d added\n", cpu); + +	err = sysfs_create_group(&dev->kobj, &mc_attr_group); +	if (err) +		return err; + +	if (microcode_init_cpu(cpu, true) == UCODE_ERROR) +		return -EINVAL; + +	return err; +} + +static int mc_device_remove(struct device *dev, struct subsys_interface *sif) +{ +	int cpu = dev->id; + +	if (!cpu_online(cpu)) +		return 0; + +	pr_debug("CPU%d removed\n", cpu); +	microcode_fini_cpu(cpu); +	sysfs_remove_group(&dev->kobj, &mc_attr_group); +	return 0; +} + +static struct subsys_interface mc_cpu_interface = { +	.name			= "microcode", +	.subsys			= &cpu_subsys, +	.add_dev		= mc_device_add, +	.remove_dev		= mc_device_remove, +}; + +/** + * mc_bp_resume - Update boot CPU microcode during resume. + */ +static void mc_bp_resume(void) +{ +	int cpu = smp_processor_id(); +	struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + +	if (uci->valid && uci->mc) +		microcode_ops->apply_microcode(cpu); +} + +static struct syscore_ops mc_syscore_ops = { +	.resume			= mc_bp_resume, +}; + +static int +mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) +{ +	unsigned int cpu = (unsigned long)hcpu; +	struct device *dev; + +	dev = get_cpu_device(cpu); + +	switch (action & ~CPU_TASKS_FROZEN) { +	case CPU_ONLINE: +		microcode_update_cpu(cpu); +		pr_debug("CPU%d added\n", cpu); +		/* +		 * "break" is missing on purpose here because we want to fall +		 * through in order to create the sysfs group. +		 */ + +	case CPU_DOWN_FAILED: +		if (sysfs_create_group(&dev->kobj, &mc_attr_group)) +			pr_err("Failed to create group for CPU%d\n", cpu); +		break; + +	case CPU_DOWN_PREPARE: +		/* Suspend is in progress, only remove the interface */ +		sysfs_remove_group(&dev->kobj, &mc_attr_group); +		pr_debug("CPU%d removed\n", cpu); +		break; + +	/* +	 * case CPU_DEAD: +	 * +	 * When a CPU goes offline, don't free up or invalidate the copy of +	 * the microcode in kernel memory, so that we can reuse it when the +	 * CPU comes back online without unnecessarily requesting the userspace +	 * for it again. 
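+	 * The cached copy is released by microcode_fini_cpu(), called e.g.
+	 * from mc_device_remove() and from the CPU_UP_CANCELED_FROZEN
+	 * check below.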
+	 */ +	} + +	/* The CPU refused to come up during a system resume */ +	if (action == CPU_UP_CANCELED_FROZEN) +		microcode_fini_cpu(cpu); + +	return NOTIFY_OK; +} + +static struct notifier_block __refdata mc_cpu_notifier = { +	.notifier_call	= mc_cpu_callback, +}; + +#ifdef MODULE +/* Autoload on Intel and AMD systems */ +static const struct x86_cpu_id __initconst microcode_id[] = { +#ifdef CONFIG_MICROCODE_INTEL +	{ X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, }, +#endif +#ifdef CONFIG_MICROCODE_AMD +	{ X86_VENDOR_AMD, X86_FAMILY_ANY, X86_MODEL_ANY, }, +#endif +	{} +}; +MODULE_DEVICE_TABLE(x86cpu, microcode_id); +#endif + +static struct attribute *cpu_root_microcode_attrs[] = { +	&dev_attr_reload.attr, +	NULL +}; + +static struct attribute_group cpu_root_microcode_group = { +	.name  = "microcode", +	.attrs = cpu_root_microcode_attrs, +}; + +static int __init microcode_init(void) +{ +	struct cpuinfo_x86 *c = &cpu_data(0); +	int error; + +	if (dis_ucode_ldr) +		return 0; + +	if (c->x86_vendor == X86_VENDOR_INTEL) +		microcode_ops = init_intel_microcode(); +	else if (c->x86_vendor == X86_VENDOR_AMD) +		microcode_ops = init_amd_microcode(); +	else +		pr_err("no support for this CPU vendor\n"); + +	if (!microcode_ops) +		return -ENODEV; + +	microcode_pdev = platform_device_register_simple("microcode", -1, +							 NULL, 0); +	if (IS_ERR(microcode_pdev)) +		return PTR_ERR(microcode_pdev); + +	get_online_cpus(); +	mutex_lock(µcode_mutex); + +	error = subsys_interface_register(&mc_cpu_interface); +	if (!error) +		perf_check_microcode(); +	mutex_unlock(µcode_mutex); +	put_online_cpus(); + +	if (error) +		goto out_pdev; + +	error = sysfs_create_group(&cpu_subsys.dev_root->kobj, +				   &cpu_root_microcode_group); + +	if (error) { +		pr_err("Error creating microcode group!\n"); +		goto out_driver; +	} + +	error = microcode_dev_init(); +	if (error) +		goto out_ucode_group; + +	register_syscore_ops(&mc_syscore_ops); +	register_hotcpu_notifier(&mc_cpu_notifier); + +	pr_info("Microcode Update Driver: v" MICROCODE_VERSION +		" <tigran@aivazian.fsnet.co.uk>, Peter Oruba\n"); + +	return 0; + + out_ucode_group: +	sysfs_remove_group(&cpu_subsys.dev_root->kobj, +			   &cpu_root_microcode_group); + + out_driver: +	get_online_cpus(); +	mutex_lock(µcode_mutex); + +	subsys_interface_unregister(&mc_cpu_interface); + +	mutex_unlock(µcode_mutex); +	put_online_cpus(); + + out_pdev: +	platform_device_unregister(microcode_pdev); +	return error; + +} +module_init(microcode_init); + +static void __exit microcode_exit(void) +{ +	struct cpuinfo_x86 *c = &cpu_data(0); + +	microcode_dev_exit(); + +	unregister_hotcpu_notifier(&mc_cpu_notifier); +	unregister_syscore_ops(&mc_syscore_ops); + +	sysfs_remove_group(&cpu_subsys.dev_root->kobj, +			   &cpu_root_microcode_group); + +	get_online_cpus(); +	mutex_lock(µcode_mutex); + +	subsys_interface_unregister(&mc_cpu_interface); + +	mutex_unlock(µcode_mutex); +	put_online_cpus(); + +	platform_device_unregister(microcode_pdev); + +	microcode_ops = NULL; + +	if (c->x86_vendor == X86_VENDOR_AMD) +		exit_amd_microcode(); + +	pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n"); +} +module_exit(microcode_exit); diff --git a/arch/x86/kernel/cpu/microcode/core_early.c b/arch/x86/kernel/cpu/microcode/core_early.c new file mode 100644 index 00000000000..5f28a64e71e --- /dev/null +++ b/arch/x86/kernel/cpu/microcode/core_early.c @@ -0,0 +1,178 @@ +/* + *	X86 CPU microcode early update for Linux + * + *	Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com> + *			   H Peter 
Anvin" <hpa@zytor.com> + * + *	This driver allows to early upgrade microcode on Intel processors + *	belonging to IA-32 family - PentiumPro, Pentium II, + *	Pentium III, Xeon, Pentium 4, etc. + * + *	Reference: Section 9.11 of Volume 3, IA-32 Intel Architecture + *	Software Developer's Manual. + * + *	This program is free software; you can redistribute it and/or + *	modify it under the terms of the GNU General Public License + *	as published by the Free Software Foundation; either version + *	2 of the License, or (at your option) any later version. + */ +#include <linux/module.h> +#include <asm/microcode.h> +#include <asm/microcode_intel.h> +#include <asm/microcode_amd.h> +#include <asm/processor.h> +#include <asm/cmdline.h> + +#define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24)) +#define CPUID_INTEL1 QCHAR('G', 'e', 'n', 'u') +#define CPUID_INTEL2 QCHAR('i', 'n', 'e', 'I') +#define CPUID_INTEL3 QCHAR('n', 't', 'e', 'l') +#define CPUID_AMD1 QCHAR('A', 'u', 't', 'h') +#define CPUID_AMD2 QCHAR('e', 'n', 't', 'i') +#define CPUID_AMD3 QCHAR('c', 'A', 'M', 'D') + +#define CPUID_IS(a, b, c, ebx, ecx, edx)	\ +		(!((ebx ^ (a))|(edx ^ (b))|(ecx ^ (c)))) + +/* + * In early loading microcode phase on BSP, boot_cpu_data is not set up yet. + * x86_vendor() gets vendor id for BSP. + * + * In 32 bit AP case, accessing boot_cpu_data needs linear address. To simplify + * coding, we still use x86_vendor() to get vendor id for AP. + * + * x86_vendor() gets vendor information directly through cpuid. + */ +static int x86_vendor(void) +{ +	u32 eax = 0x00000000; +	u32 ebx, ecx = 0, edx; + +	native_cpuid(&eax, &ebx, &ecx, &edx); + +	if (CPUID_IS(CPUID_INTEL1, CPUID_INTEL2, CPUID_INTEL3, ebx, ecx, edx)) +		return X86_VENDOR_INTEL; + +	if (CPUID_IS(CPUID_AMD1, CPUID_AMD2, CPUID_AMD3, ebx, ecx, edx)) +		return X86_VENDOR_AMD; + +	return X86_VENDOR_UNKNOWN; +} + +static int x86_family(void) +{ +	u32 eax = 0x00000001; +	u32 ebx, ecx = 0, edx; +	int x86; + +	native_cpuid(&eax, &ebx, &ecx, &edx); + +	x86 = (eax >> 8) & 0xf; +	if (x86 == 15) +		x86 += (eax >> 20) & 0xff; + +	return x86; +} + +static bool __init check_loader_disabled_bsp(void) +{ +#ifdef CONFIG_X86_32 +	const char *cmdline = (const char *)__pa_nodebug(boot_command_line); +	const char *opt	    = "dis_ucode_ldr"; +	const char *option  = (const char *)__pa_nodebug(opt); +	bool *res = (bool *)__pa_nodebug(&dis_ucode_ldr); + +#else /* CONFIG_X86_64 */ +	const char *cmdline = boot_command_line; +	const char *option  = "dis_ucode_ldr"; +	bool *res = &dis_ucode_ldr; +#endif + +	if (cmdline_find_option_bool(cmdline, option)) +		*res = true; + +	return *res; +} + +void __init load_ucode_bsp(void) +{ +	int vendor, x86; + +	if (check_loader_disabled_bsp()) +		return; + +	if (!have_cpuid_p()) +		return; + +	vendor = x86_vendor(); +	x86 = x86_family(); + +	switch (vendor) { +	case X86_VENDOR_INTEL: +		if (x86 >= 6) +			load_ucode_intel_bsp(); +		break; +	case X86_VENDOR_AMD: +		if (x86 >= 0x10) +			load_ucode_amd_bsp(); +		break; +	default: +		break; +	} +} + +static bool check_loader_disabled_ap(void) +{ +#ifdef CONFIG_X86_32 +	return __pa_nodebug(dis_ucode_ldr); +#else +	return dis_ucode_ldr; +#endif +} + +void load_ucode_ap(void) +{ +	int vendor, x86; + +	if (check_loader_disabled_ap()) +		return; + +	if (!have_cpuid_p()) +		return; + +	vendor = x86_vendor(); +	x86 = x86_family(); + +	switch (vendor) { +	case X86_VENDOR_INTEL: +		if (x86 >= 6) +			load_ucode_intel_ap(); +		break; +	case X86_VENDOR_AMD: +		if (x86 >= 0x10) +			load_ucode_amd_ap(); 
+		break; +	default: +		break; +	} +} + +int __init save_microcode_in_initrd(void) +{ +	struct cpuinfo_x86 *c = &boot_cpu_data; + +	switch (c->x86_vendor) { +	case X86_VENDOR_INTEL: +		if (c->x86 >= 6) +			save_microcode_in_initrd_intel(); +		break; +	case X86_VENDOR_AMD: +		if (c->x86 >= 0x10) +			save_microcode_in_initrd_amd(); +		break; +	default: +		break; +	} + +	return 0; +} diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c new file mode 100644 index 00000000000..a276fa75d9b --- /dev/null +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -0,0 +1,333 @@ +/* + *	Intel CPU Microcode Update Driver for Linux + * + *	Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk> + *		      2006	Shaohua Li <shaohua.li@intel.com> + * + *	This driver allows to upgrade microcode on Intel processors + *	belonging to IA-32 family - PentiumPro, Pentium II, + *	Pentium III, Xeon, Pentium 4, etc. + * + *	Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture + *	Software Developer's Manual + *	Order Number 253668 or free download from: + * + *	http://developer.intel.com/Assets/PDF/manual/253668.pdf	 + * + *	For more information, go to http://www.urbanmyth.org/microcode + * + *	This program is free software; you can redistribute it and/or + *	modify it under the terms of the GNU General Public License + *	as published by the Free Software Foundation; either version + *	2 of the License, or (at your option) any later version. + * + *	1.0	16 Feb 2000, Tigran Aivazian <tigran@sco.com> + *		Initial release. + *	1.01	18 Feb 2000, Tigran Aivazian <tigran@sco.com> + *		Added read() support + cleanups. + *	1.02	21 Feb 2000, Tigran Aivazian <tigran@sco.com> + *		Added 'device trimming' support. open(O_WRONLY) zeroes + *		and frees the saved copy of applied microcode. + *	1.03	29 Feb 2000, Tigran Aivazian <tigran@sco.com> + *		Made to use devfs (/dev/cpu/microcode) + cleanups. + *	1.04	06 Jun 2000, Simon Trimmer <simon@veritas.com> + *		Added misc device support (now uses both devfs and misc). + *		Added MICROCODE_IOCFREE ioctl to clear memory. + *	1.05	09 Jun 2000, Simon Trimmer <simon@veritas.com> + *		Messages for error cases (non Intel & no suitable microcode). + *	1.06	03 Aug 2000, Tigran Aivazian <tigran@veritas.com> + *		Removed ->release(). Removed exclusive open and status bitmap. + *		Added microcode_rwsem to serialize read()/write()/ioctl(). + *		Removed global kernel lock usage. + *	1.07	07 Sep 2000, Tigran Aivazian <tigran@veritas.com> + *		Write 0 to 0x8B msr and then cpuid before reading revision, + *		so that it works even if there were no update done by the + *		BIOS. Otherwise, reading from 0x8B gives junk (which happened + *		to be 0 on my machine which is why it worked even when I + *		disabled update by the BIOS) + *		Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix. + *	1.08	11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and + *			     Tigran Aivazian <tigran@veritas.com> + *		Intel Pentium 4 processor support and bugfixes. + *	1.09	30 Oct 2001, Tigran Aivazian <tigran@veritas.com> + *		Bugfix for HT (Hyper-Threading) enabled processors + *		whereby processor resources are shared by all logical processors + *		in a single CPU package. + *	1.10	28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and + *		Tigran Aivazian <tigran@veritas.com>, + *		Serialize updates as required on HT processors due to + *		speculative nature of implementation. 
+ *	1.11	22 Mar 2002 Tigran Aivazian <tigran@veritas.com> + *		Fix the panic when writing zero-length microcode chunk. + *	1.12	29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>, + *		Jun Nakajima <jun.nakajima@intel.com> + *		Support for the microcode updates in the new format. + *	1.13	10 Oct 2003 Tigran Aivazian <tigran@veritas.com> + *		Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl + *		because we no longer hold a copy of applied microcode + *		in kernel memory. + *	1.14	25 Jun 2004 Tigran Aivazian <tigran@veritas.com> + *		Fix sigmatch() macro to handle old CPUs with pf == 0. + *		Thanks to Stuart Swales for pointing out this bug. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/firmware.h> +#include <linux/uaccess.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/vmalloc.h> + +#include <asm/microcode_intel.h> +#include <asm/processor.h> +#include <asm/msr.h> + +MODULE_DESCRIPTION("Microcode Update Driver"); +MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>"); +MODULE_LICENSE("GPL"); + +static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) +{ +	struct cpuinfo_x86 *c = &cpu_data(cpu_num); +	unsigned int val[2]; + +	memset(csig, 0, sizeof(*csig)); + +	csig->sig = cpuid_eax(0x00000001); + +	if ((c->x86_model >= 5) || (c->x86 > 6)) { +		/* get processor flags from MSR 0x17 */ +		rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); +		csig->pf = 1 << ((val[1] >> 18) & 7); +	} + +	csig->rev = c->microcode; +	pr_info("CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n", +		cpu_num, csig->sig, csig->pf, csig->rev); + +	return 0; +} + +/* + * return 0 - no update found + * return 1 - found update + */ +static int get_matching_mc(struct microcode_intel *mc_intel, int cpu) +{ +	struct cpu_signature cpu_sig; +	unsigned int csig, cpf, crev; + +	collect_cpu_info(cpu, &cpu_sig); + +	csig = cpu_sig.sig; +	cpf = cpu_sig.pf; +	crev = cpu_sig.rev; + +	return get_matching_microcode(csig, cpf, mc_intel, crev); +} + +int apply_microcode(int cpu) +{ +	struct microcode_intel *mc_intel; +	struct ucode_cpu_info *uci; +	unsigned int val[2]; +	int cpu_num = raw_smp_processor_id(); +	struct cpuinfo_x86 *c = &cpu_data(cpu_num); + +	uci = ucode_cpu_info + cpu; +	mc_intel = uci->mc; + +	/* We should bind the task to the CPU */ +	BUG_ON(cpu_num != cpu); + +	if (mc_intel == NULL) +		return 0; + +	/* +	 * Microcode on this CPU could be updated earlier. Only apply the +	 * microcode patch in mc_intel when it is newer than the one on this +	 * CPU. 
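+	 * get_matching_mc() above re-reads sig/pf/rev via collect_cpu_info()
+	 * and reports an update only when mc_intel matches this CPU and
+	 * carries a higher revision than the one currently loaded.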
+	 */ +	if (get_matching_mc(mc_intel, cpu) == 0) +		return 0; + +	/* write microcode via MSR 0x79 */ +	wrmsr(MSR_IA32_UCODE_WRITE, +	      (unsigned long) mc_intel->bits, +	      (unsigned long) mc_intel->bits >> 16 >> 16); +	wrmsr(MSR_IA32_UCODE_REV, 0, 0); + +	/* As documented in the SDM: Do a CPUID 1 here */ +	sync_core(); + +	/* get the current revision from MSR 0x8B */ +	rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); + +	if (val[1] != mc_intel->hdr.rev) { +		pr_err("CPU%d update to revision 0x%x failed\n", +		       cpu_num, mc_intel->hdr.rev); +		return -1; +	} +	pr_info("CPU%d updated to revision 0x%x, date = %04x-%02x-%02x\n", +		cpu_num, val[1], +		mc_intel->hdr.date & 0xffff, +		mc_intel->hdr.date >> 24, +		(mc_intel->hdr.date >> 16) & 0xff); + +	uci->cpu_sig.rev = val[1]; +	c->microcode = val[1]; + +	return 0; +} + +static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, +				int (*get_ucode_data)(void *, const void *, size_t)) +{ +	struct ucode_cpu_info *uci = ucode_cpu_info + cpu; +	u8 *ucode_ptr = data, *new_mc = NULL, *mc = NULL; +	int new_rev = uci->cpu_sig.rev; +	unsigned int leftover = size; +	enum ucode_state state = UCODE_OK; +	unsigned int curr_mc_size = 0; +	unsigned int csig, cpf; + +	while (leftover) { +		struct microcode_header_intel mc_header; +		unsigned int mc_size; + +		if (get_ucode_data(&mc_header, ucode_ptr, sizeof(mc_header))) +			break; + +		mc_size = get_totalsize(&mc_header); +		if (!mc_size || mc_size > leftover) { +			pr_err("error! Bad data in microcode data file\n"); +			break; +		} + +		/* For performance reasons, reuse mc area when possible */ +		if (!mc || mc_size > curr_mc_size) { +			vfree(mc); +			mc = vmalloc(mc_size); +			if (!mc) +				break; +			curr_mc_size = mc_size; +		} + +		if (get_ucode_data(mc, ucode_ptr, mc_size) || +		    microcode_sanity_check(mc, 1) < 0) { +			break; +		} + +		csig = uci->cpu_sig.sig; +		cpf = uci->cpu_sig.pf; +		if (get_matching_microcode(csig, cpf, mc, new_rev)) { +			vfree(new_mc); +			new_rev = mc_header.rev; +			new_mc  = mc; +			mc = NULL;	/* trigger new vmalloc */ +		} + +		ucode_ptr += mc_size; +		leftover  -= mc_size; +	} + +	vfree(mc); + +	if (leftover) { +		vfree(new_mc); +		state = UCODE_ERROR; +		goto out; +	} + +	if (!new_mc) { +		state = UCODE_NFOUND; +		goto out; +	} + +	vfree(uci->mc); +	uci->mc = (struct microcode_intel *)new_mc; + +	/* +	 * If early loading microcode is supported, save this mc into +	 * permanent memory. So it will be loaded early when a CPU is hot added +	 * or resumes. 
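+	 * (save_mc_for_early() itself is implemented in intel_early.c,
+	 * guarded by CONFIG_MICROCODE_INTEL_EARLY && CONFIG_HOTPLUG_CPU.)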
+	 */ +	save_mc_for_early(new_mc); + +	pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n", +		 cpu, new_rev, uci->cpu_sig.rev); +out: +	return state; +} + +static int get_ucode_fw(void *to, const void *from, size_t n) +{ +	memcpy(to, from, n); +	return 0; +} + +static enum ucode_state request_microcode_fw(int cpu, struct device *device, +					     bool refresh_fw) +{ +	char name[30]; +	struct cpuinfo_x86 *c = &cpu_data(cpu); +	const struct firmware *firmware; +	enum ucode_state ret; + +	sprintf(name, "intel-ucode/%02x-%02x-%02x", +		c->x86, c->x86_model, c->x86_mask); + +	if (request_firmware_direct(&firmware, name, device)) { +		pr_debug("data file %s load failed\n", name); +		return UCODE_NFOUND; +	} + +	ret = generic_load_microcode(cpu, (void *)firmware->data, +				     firmware->size, &get_ucode_fw); + +	release_firmware(firmware); + +	return ret; +} + +static int get_ucode_user(void *to, const void *from, size_t n) +{ +	return copy_from_user(to, from, n); +} + +static enum ucode_state +request_microcode_user(int cpu, const void __user *buf, size_t size) +{ +	return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user); +} + +static void microcode_fini_cpu(int cpu) +{ +	struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + +	vfree(uci->mc); +	uci->mc = NULL; +} + +static struct microcode_ops microcode_intel_ops = { +	.request_microcode_user		  = request_microcode_user, +	.request_microcode_fw             = request_microcode_fw, +	.collect_cpu_info                 = collect_cpu_info, +	.apply_microcode                  = apply_microcode, +	.microcode_fini_cpu               = microcode_fini_cpu, +}; + +struct microcode_ops * __init init_intel_microcode(void) +{ +	struct cpuinfo_x86 *c = &cpu_data(0); + +	if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || +	    cpu_has(c, X86_FEATURE_IA64)) { +		pr_err("Intel CPU family 0x%x not supported\n", c->x86); +		return NULL; +	} + +	return µcode_intel_ops; +} + diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c new file mode 100644 index 00000000000..18f739129e7 --- /dev/null +++ b/arch/x86/kernel/cpu/microcode/intel_early.c @@ -0,0 +1,787 @@ +/* + *	Intel CPU microcode early update for Linux + * + *	Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com> + *			   H Peter Anvin" <hpa@zytor.com> + * + *	This allows to early upgrade microcode on Intel processors + *	belonging to IA-32 family - PentiumPro, Pentium II, + *	Pentium III, Xeon, Pentium 4, etc. + * + *	Reference: Section 9.11 of Volume 3, IA-32 Intel Architecture + *	Software Developer's Manual. + * + *	This program is free software; you can redistribute it and/or + *	modify it under the terms of the GNU General Public License + *	as published by the Free Software Foundation; either version + *	2 of the License, or (at your option) any later version. 
+ */ +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/earlycpio.h> +#include <linux/initrd.h> +#include <linux/cpu.h> +#include <asm/msr.h> +#include <asm/microcode_intel.h> +#include <asm/processor.h> +#include <asm/tlbflush.h> +#include <asm/setup.h> + +unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT]; +struct mc_saved_data { +	unsigned int mc_saved_count; +	struct microcode_intel **mc_saved; +} mc_saved_data; + +static enum ucode_state +generic_load_microcode_early(struct microcode_intel **mc_saved_p, +			     unsigned int mc_saved_count, +			     struct ucode_cpu_info *uci) +{ +	struct microcode_intel *ucode_ptr, *new_mc = NULL; +	int new_rev = uci->cpu_sig.rev; +	enum ucode_state state = UCODE_OK; +	unsigned int mc_size; +	struct microcode_header_intel *mc_header; +	unsigned int csig = uci->cpu_sig.sig; +	unsigned int cpf = uci->cpu_sig.pf; +	int i; + +	for (i = 0; i < mc_saved_count; i++) { +		ucode_ptr = mc_saved_p[i]; + +		mc_header = (struct microcode_header_intel *)ucode_ptr; +		mc_size = get_totalsize(mc_header); +		if (get_matching_microcode(csig, cpf, ucode_ptr, new_rev)) { +			new_rev = mc_header->rev; +			new_mc  = ucode_ptr; +		} +	} + +	if (!new_mc) { +		state = UCODE_NFOUND; +		goto out; +	} + +	uci->mc = (struct microcode_intel *)new_mc; +out: +	return state; +} + +static void +microcode_pointer(struct microcode_intel **mc_saved, +		  unsigned long *mc_saved_in_initrd, +		  unsigned long initrd_start, int mc_saved_count) +{ +	int i; + +	for (i = 0; i < mc_saved_count; i++) +		mc_saved[i] = (struct microcode_intel *) +			      (mc_saved_in_initrd[i] + initrd_start); +} + +#ifdef CONFIG_X86_32 +static void +microcode_phys(struct microcode_intel **mc_saved_tmp, +	       struct mc_saved_data *mc_saved_data) +{ +	int i; +	struct microcode_intel ***mc_saved; + +	mc_saved = (struct microcode_intel ***) +		   __pa_nodebug(&mc_saved_data->mc_saved); +	for (i = 0; i < mc_saved_data->mc_saved_count; i++) { +		struct microcode_intel *p; + +		p = *(struct microcode_intel **) +			__pa_nodebug(mc_saved_data->mc_saved + i); +		mc_saved_tmp[i] = (struct microcode_intel *)__pa_nodebug(p); +	} +} +#endif + +static enum ucode_state +load_microcode(struct mc_saved_data *mc_saved_data, +	       unsigned long *mc_saved_in_initrd, +	       unsigned long initrd_start, +	       struct ucode_cpu_info *uci) +{ +	struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; +	unsigned int count = mc_saved_data->mc_saved_count; + +	if (!mc_saved_data->mc_saved) { +		microcode_pointer(mc_saved_tmp, mc_saved_in_initrd, +				  initrd_start, count); + +		return generic_load_microcode_early(mc_saved_tmp, count, uci); +	} else { +#ifdef CONFIG_X86_32 +		microcode_phys(mc_saved_tmp, mc_saved_data); +		return generic_load_microcode_early(mc_saved_tmp, count, uci); +#else +		return generic_load_microcode_early(mc_saved_data->mc_saved, +						    count, uci); +#endif +	} +} + +static u8 get_x86_family(unsigned long sig) +{ +	u8 x86; + +	x86 = (sig >> 8) & 0xf; + +	if (x86 == 0xf) +		x86 += (sig >> 20) & 0xff; + +	return x86; +} + +static u8 get_x86_model(unsigned long sig) +{ +	u8 x86, x86_model; + +	x86 = get_x86_family(sig); +	x86_model = (sig >> 4) & 0xf; + +	if (x86 == 0x6 || x86 == 0xf) +		x86_model += ((sig >> 16) & 0xf) << 4; + +	return x86_model; +} + +/* + * Given CPU signature and a microcode patch, this function finds if the + * microcode patch has matching family and model with the CPU. 
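+ *
+ * For example, the illustrative signature 0x000306a9 decodes via
+ * get_x86_family()/get_x86_model() above to family 0x6, model 0x3a, so
+ * only patches whose header (or extended signature table) decodes to that
+ * same family/model pair are considered a match.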
+ */ +static enum ucode_state +matching_model_microcode(struct microcode_header_intel *mc_header, +			unsigned long sig) +{ +	u8 x86, x86_model; +	u8 x86_ucode, x86_model_ucode; +	struct extended_sigtable *ext_header; +	unsigned long total_size = get_totalsize(mc_header); +	unsigned long data_size = get_datasize(mc_header); +	int ext_sigcount, i; +	struct extended_signature *ext_sig; + +	x86 = get_x86_family(sig); +	x86_model = get_x86_model(sig); + +	x86_ucode = get_x86_family(mc_header->sig); +	x86_model_ucode = get_x86_model(mc_header->sig); + +	if (x86 == x86_ucode && x86_model == x86_model_ucode) +		return UCODE_OK; + +	/* Look for ext. headers: */ +	if (total_size <= data_size + MC_HEADER_SIZE) +		return UCODE_NFOUND; + +	ext_header = (struct extended_sigtable *) +		     mc_header + data_size + MC_HEADER_SIZE; +	ext_sigcount = ext_header->count; +	ext_sig = (void *)ext_header + EXT_HEADER_SIZE; + +	for (i = 0; i < ext_sigcount; i++) { +		x86_ucode = get_x86_family(ext_sig->sig); +		x86_model_ucode = get_x86_model(ext_sig->sig); + +		if (x86 == x86_ucode && x86_model == x86_model_ucode) +			return UCODE_OK; + +		ext_sig++; +	} + +	return UCODE_NFOUND; +} + +static int +save_microcode(struct mc_saved_data *mc_saved_data, +	       struct microcode_intel **mc_saved_src, +	       unsigned int mc_saved_count) +{ +	int i, j; +	struct microcode_intel **mc_saved_p; +	int ret; + +	if (!mc_saved_count) +		return -EINVAL; + +	/* +	 * Copy new microcode data. +	 */ +	mc_saved_p = kmalloc(mc_saved_count*sizeof(struct microcode_intel *), +			     GFP_KERNEL); +	if (!mc_saved_p) +		return -ENOMEM; + +	for (i = 0; i < mc_saved_count; i++) { +		struct microcode_intel *mc = mc_saved_src[i]; +		struct microcode_header_intel *mc_header = &mc->hdr; +		unsigned long mc_size = get_totalsize(mc_header); +		mc_saved_p[i] = kmalloc(mc_size, GFP_KERNEL); +		if (!mc_saved_p[i]) { +			ret = -ENOMEM; +			goto err; +		} +		if (!mc_saved_src[i]) { +			ret = -EINVAL; +			goto err; +		} +		memcpy(mc_saved_p[i], mc, mc_size); +	} + +	/* +	 * Point to newly saved microcode. +	 */ +	mc_saved_data->mc_saved = mc_saved_p; +	mc_saved_data->mc_saved_count = mc_saved_count; + +	return 0; + +err: +	for (j = 0; j <= i; j++) +		kfree(mc_saved_p[j]); +	kfree(mc_saved_p); + +	return ret; +} + +/* + * A microcode patch in ucode_ptr is saved into mc_saved + * - if it has matching signature and newer revision compared to an existing + *   patch mc_saved. + * - or if it is a newly discovered microcode patch. + * + * The microcode patch should have matching model with CPU. + */ +static void _save_mc(struct microcode_intel **mc_saved, u8 *ucode_ptr, +		     unsigned int *mc_saved_count_p) +{ +	int i; +	int found = 0; +	unsigned int mc_saved_count = *mc_saved_count_p; +	struct microcode_header_intel *mc_header; + +	mc_header = (struct microcode_header_intel *)ucode_ptr; +	for (i = 0; i < mc_saved_count; i++) { +		unsigned int sig, pf; +		unsigned int new_rev; +		struct microcode_header_intel *mc_saved_header = +			     (struct microcode_header_intel *)mc_saved[i]; +		sig = mc_saved_header->sig; +		pf = mc_saved_header->pf; +		new_rev = mc_header->rev; + +		if (get_matching_sig(sig, pf, ucode_ptr, new_rev)) { +			found = 1; +			if (update_match_revision(mc_header, new_rev)) { +				/* +				 * Found an older ucode saved before. +				 * Replace the older one with this newer +				 * one. 
+				 */ +				mc_saved[i] = +					(struct microcode_intel *)ucode_ptr; +				break; +			} +		} +	} +	if (i >= mc_saved_count && !found) +		/* +		 * This ucode is first time discovered in ucode file. +		 * Save it to memory. +		 */ +		mc_saved[mc_saved_count++] = +				 (struct microcode_intel *)ucode_ptr; + +	*mc_saved_count_p = mc_saved_count; +} + +/* + * Get microcode matching with BSP's model. Only CPUs with the same model as + * BSP can stay in the platform. + */ +static enum ucode_state __init +get_matching_model_microcode(int cpu, unsigned long start, +			     void *data, size_t size, +			     struct mc_saved_data *mc_saved_data, +			     unsigned long *mc_saved_in_initrd, +			     struct ucode_cpu_info *uci) +{ +	u8 *ucode_ptr = data; +	unsigned int leftover = size; +	enum ucode_state state = UCODE_OK; +	unsigned int mc_size; +	struct microcode_header_intel *mc_header; +	struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; +	unsigned int mc_saved_count = mc_saved_data->mc_saved_count; +	int i; + +	while (leftover) { +		mc_header = (struct microcode_header_intel *)ucode_ptr; + +		mc_size = get_totalsize(mc_header); +		if (!mc_size || mc_size > leftover || +			microcode_sanity_check(ucode_ptr, 0) < 0) +			break; + +		leftover -= mc_size; + +		/* +		 * Since APs with same family and model as the BSP may boot in +		 * the platform, we need to find and save microcode patches +		 * with the same family and model as the BSP. +		 */ +		if (matching_model_microcode(mc_header, uci->cpu_sig.sig) != +			 UCODE_OK) { +			ucode_ptr += mc_size; +			continue; +		} + +		_save_mc(mc_saved_tmp, ucode_ptr, &mc_saved_count); + +		ucode_ptr += mc_size; +	} + +	if (leftover) { +		state = UCODE_ERROR; +		goto out; +	} + +	if (mc_saved_count == 0) { +		state = UCODE_NFOUND; +		goto out; +	} + +	for (i = 0; i < mc_saved_count; i++) +		mc_saved_in_initrd[i] = (unsigned long)mc_saved_tmp[i] - start; + +	mc_saved_data->mc_saved_count = mc_saved_count; +out: +	return state; +} + +static int collect_cpu_info_early(struct ucode_cpu_info *uci) +{ +	unsigned int val[2]; +	u8 x86, x86_model; +	struct cpu_signature csig; +	unsigned int eax, ebx, ecx, edx; + +	csig.sig = 0; +	csig.pf = 0; +	csig.rev = 0; + +	memset(uci, 0, sizeof(*uci)); + +	eax = 0x00000001; +	ecx = 0; +	native_cpuid(&eax, &ebx, &ecx, &edx); +	csig.sig = eax; + +	x86 = get_x86_family(csig.sig); +	x86_model = get_x86_model(csig.sig); + +	if ((x86_model >= 5) || (x86 > 6)) { +		/* get processor flags from MSR 0x17 */ +		native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); +		csig.pf = 1 << ((val[1] >> 18) & 7); +	} +	native_wrmsr(MSR_IA32_UCODE_REV, 0, 0); + +	/* As documented in the SDM: Do a CPUID 1 here */ +	sync_core(); + +	/* get the current revision from MSR 0x8B */ +	native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); + +	csig.rev = val[1]; + +	uci->cpu_sig = csig; +	uci->valid = 1; + +	return 0; +} + +#ifdef DEBUG +static void __ref show_saved_mc(void) +{ +	int i, j; +	unsigned int sig, pf, rev, total_size, data_size, date; +	struct ucode_cpu_info uci; + +	if (mc_saved_data.mc_saved_count == 0) { +		pr_debug("no micorcode data saved.\n"); +		return; +	} +	pr_debug("Total microcode saved: %d\n", mc_saved_data.mc_saved_count); + +	collect_cpu_info_early(&uci); + +	sig = uci.cpu_sig.sig; +	pf = uci.cpu_sig.pf; +	rev = uci.cpu_sig.rev; +	pr_debug("CPU%d: sig=0x%x, pf=0x%x, rev=0x%x\n", +		 smp_processor_id(), sig, pf, rev); + +	for (i = 0; i < mc_saved_data.mc_saved_count; i++) { +		struct microcode_header_intel *mc_saved_header; +		struct 
extended_sigtable *ext_header; +		int ext_sigcount; +		struct extended_signature *ext_sig; + +		mc_saved_header = (struct microcode_header_intel *) +				  mc_saved_data.mc_saved[i]; +		sig = mc_saved_header->sig; +		pf = mc_saved_header->pf; +		rev = mc_saved_header->rev; +		total_size = get_totalsize(mc_saved_header); +		data_size = get_datasize(mc_saved_header); +		date = mc_saved_header->date; + +		pr_debug("mc_saved[%d]: sig=0x%x, pf=0x%x, rev=0x%x, toal size=0x%x, date = %04x-%02x-%02x\n", +			 i, sig, pf, rev, total_size, +			 date & 0xffff, +			 date >> 24, +			 (date >> 16) & 0xff); + +		/* Look for ext. headers: */ +		if (total_size <= data_size + MC_HEADER_SIZE) +			continue; + +		ext_header = (struct extended_sigtable *) +			     mc_saved_header + data_size + MC_HEADER_SIZE; +		ext_sigcount = ext_header->count; +		ext_sig = (void *)ext_header + EXT_HEADER_SIZE; + +		for (j = 0; j < ext_sigcount; j++) { +			sig = ext_sig->sig; +			pf = ext_sig->pf; + +			pr_debug("\tExtended[%d]: sig=0x%x, pf=0x%x\n", +				 j, sig, pf); + +			ext_sig++; +		} + +	} +} +#else +static inline void show_saved_mc(void) +{ +} +#endif + +#if defined(CONFIG_MICROCODE_INTEL_EARLY) && defined(CONFIG_HOTPLUG_CPU) +static DEFINE_MUTEX(x86_cpu_microcode_mutex); +/* + * Save this mc into mc_saved_data. So it will be loaded early when a CPU is + * hot added or resumes. + * + * Please make sure this mc should be a valid microcode patch before calling + * this function. + */ +int save_mc_for_early(u8 *mc) +{ +	struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; +	unsigned int mc_saved_count_init; +	unsigned int mc_saved_count; +	struct microcode_intel **mc_saved; +	int ret = 0; +	int i; + +	/* +	 * Hold hotplug lock so mc_saved_data is not accessed by a CPU in +	 * hotplug. +	 */ +	mutex_lock(&x86_cpu_microcode_mutex); + +	mc_saved_count_init = mc_saved_data.mc_saved_count; +	mc_saved_count = mc_saved_data.mc_saved_count; +	mc_saved = mc_saved_data.mc_saved; + +	if (mc_saved && mc_saved_count) +		memcpy(mc_saved_tmp, mc_saved, +		       mc_saved_count * sizeof(struct mirocode_intel *)); +	/* +	 * Save the microcode patch mc in mc_save_tmp structure if it's a newer +	 * version. +	 */ + +	_save_mc(mc_saved_tmp, mc, &mc_saved_count); + +	/* +	 * Save the mc_save_tmp in global mc_saved_data. +	 */ +	ret = save_microcode(&mc_saved_data, mc_saved_tmp, mc_saved_count); +	if (ret) { +		pr_err("Cannot save microcode patch.\n"); +		goto out; +	} + +	show_saved_mc(); + +	/* +	 * Free old saved microcod data. +	 */ +	if (mc_saved) { +		for (i = 0; i < mc_saved_count_init; i++) +			kfree(mc_saved[i]); +		kfree(mc_saved); +	} + +out: +	mutex_unlock(&x86_cpu_microcode_mutex); + +	return ret; +} +EXPORT_SYMBOL_GPL(save_mc_for_early); +#endif + +static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin"; +static __init enum ucode_state +scan_microcode(unsigned long start, unsigned long end, +		struct mc_saved_data *mc_saved_data, +		unsigned long *mc_saved_in_initrd, +		struct ucode_cpu_info *uci) +{ +	unsigned int size = end - start + 1; +	struct cpio_data cd; +	long offset = 0; +#ifdef CONFIG_X86_32 +	char *p = (char *)__pa_nodebug(ucode_name); +#else +	char *p = ucode_name; +#endif + +	cd.data = NULL; +	cd.size = 0; + +	cd = find_cpio_data(p, (void *)start, size, &offset); +	if (!cd.data) +		return UCODE_ERROR; + + +	return get_matching_model_microcode(0, start, cd.data, cd.size, +					    mc_saved_data, mc_saved_in_initrd, +					    uci); +} + +/* + * Print ucode update info. 
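+ *
+ * The header date field is packed as mmddyyyy in BCD, so an illustrative
+ * value of 0x06112013 is printed as "2013-06-11" by the format below.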
+ */ +static void +print_ucode_info(struct ucode_cpu_info *uci, unsigned int date) +{ +	int cpu = smp_processor_id(); + +	pr_info("CPU%d microcode updated early to revision 0x%x, date = %04x-%02x-%02x\n", +		cpu, +		uci->cpu_sig.rev, +		date & 0xffff, +		date >> 24, +		(date >> 16) & 0xff); +} + +#ifdef CONFIG_X86_32 + +static int delay_ucode_info; +static int current_mc_date; + +/* + * Print early updated ucode info after printk works. This is delayed info dump. + */ +void show_ucode_info_early(void) +{ +	struct ucode_cpu_info uci; + +	if (delay_ucode_info) { +		collect_cpu_info_early(&uci); +		print_ucode_info(&uci, current_mc_date); +		delay_ucode_info = 0; +	} +} + +/* + * At this point, we can not call printk() yet. Keep microcode patch number in + * mc_saved_data.mc_saved and delay printing microcode info in + * show_ucode_info_early() until printk() works. + */ +static void print_ucode(struct ucode_cpu_info *uci) +{ +	struct microcode_intel *mc_intel; +	int *delay_ucode_info_p; +	int *current_mc_date_p; + +	mc_intel = uci->mc; +	if (mc_intel == NULL) +		return; + +	delay_ucode_info_p = (int *)__pa_nodebug(&delay_ucode_info); +	current_mc_date_p = (int *)__pa_nodebug(¤t_mc_date); + +	*delay_ucode_info_p = 1; +	*current_mc_date_p = mc_intel->hdr.date; +} +#else + +/* + * Flush global tlb. We only do this in x86_64 where paging has been enabled + * already and PGE should be enabled as well. + */ +static inline void flush_tlb_early(void) +{ +	__native_flush_tlb_global_irq_disabled(); +} + +static inline void print_ucode(struct ucode_cpu_info *uci) +{ +	struct microcode_intel *mc_intel; + +	mc_intel = uci->mc; +	if (mc_intel == NULL) +		return; + +	print_ucode_info(uci, mc_intel->hdr.date); +} +#endif + +static int apply_microcode_early(struct mc_saved_data *mc_saved_data, +				 struct ucode_cpu_info *uci) +{ +	struct microcode_intel *mc_intel; +	unsigned int val[2]; + +	mc_intel = uci->mc; +	if (mc_intel == NULL) +		return 0; + +	/* write microcode via MSR 0x79 */ +	native_wrmsr(MSR_IA32_UCODE_WRITE, +	      (unsigned long) mc_intel->bits, +	      (unsigned long) mc_intel->bits >> 16 >> 16); +	native_wrmsr(MSR_IA32_UCODE_REV, 0, 0); + +	/* As documented in the SDM: Do a CPUID 1 here */ +	sync_core(); + +	/* get the current revision from MSR 0x8B */ +	native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); +	if (val[1] != mc_intel->hdr.rev) +		return -1; + +#ifdef CONFIG_X86_64 +	/* Flush global tlb. This is precaution. */ +	flush_tlb_early(); +#endif +	uci->cpu_sig.rev = val[1]; + +	print_ucode(uci); + +	return 0; +} + +/* + * This function converts microcode patch offsets previously stored in + * mc_saved_in_initrd to pointers and stores the pointers in mc_saved_data. 
+ */ +int __init save_microcode_in_initrd_intel(void) +{ +	unsigned int count = mc_saved_data.mc_saved_count; +	struct microcode_intel *mc_saved[MAX_UCODE_COUNT]; +	int ret = 0; + +	if (count == 0) +		return ret; + +	microcode_pointer(mc_saved, mc_saved_in_initrd, initrd_start, count); +	ret = save_microcode(&mc_saved_data, mc_saved, count); +	if (ret) +		pr_err("Cannot save microcode patches from initrd.\n"); + +	show_saved_mc(); + +	return ret; +} + +static void __init +_load_ucode_intel_bsp(struct mc_saved_data *mc_saved_data, +		      unsigned long *mc_saved_in_initrd, +		      unsigned long initrd_start_early, +		      unsigned long initrd_end_early, +		      struct ucode_cpu_info *uci) +{ +	collect_cpu_info_early(uci); +	scan_microcode(initrd_start_early, initrd_end_early, mc_saved_data, +		       mc_saved_in_initrd, uci); +	load_microcode(mc_saved_data, mc_saved_in_initrd, +		       initrd_start_early, uci); +	apply_microcode_early(mc_saved_data, uci); +} + +void __init +load_ucode_intel_bsp(void) +{ +	u64 ramdisk_image, ramdisk_size; +	unsigned long initrd_start_early, initrd_end_early; +	struct ucode_cpu_info uci; +#ifdef CONFIG_X86_32 +	struct boot_params *boot_params_p; + +	boot_params_p = (struct boot_params *)__pa_nodebug(&boot_params); +	ramdisk_image = boot_params_p->hdr.ramdisk_image; +	ramdisk_size  = boot_params_p->hdr.ramdisk_size; +	initrd_start_early = ramdisk_image; +	initrd_end_early = initrd_start_early + ramdisk_size; + +	_load_ucode_intel_bsp( +		(struct mc_saved_data *)__pa_nodebug(&mc_saved_data), +		(unsigned long *)__pa_nodebug(&mc_saved_in_initrd), +		initrd_start_early, initrd_end_early, &uci); +#else +	ramdisk_image = boot_params.hdr.ramdisk_image; +	ramdisk_size  = boot_params.hdr.ramdisk_size; +	initrd_start_early = ramdisk_image + PAGE_OFFSET; +	initrd_end_early = initrd_start_early + ramdisk_size; + +	_load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd, +			      initrd_start_early, initrd_end_early, &uci); +#endif +} + +void load_ucode_intel_ap(void) +{ +	struct mc_saved_data *mc_saved_data_p; +	struct ucode_cpu_info uci; +	unsigned long *mc_saved_in_initrd_p; +	unsigned long initrd_start_addr; +#ifdef CONFIG_X86_32 +	unsigned long *initrd_start_p; + +	mc_saved_in_initrd_p = +		(unsigned long *)__pa_nodebug(mc_saved_in_initrd); +	mc_saved_data_p = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data); +	initrd_start_p = (unsigned long *)__pa_nodebug(&initrd_start); +	initrd_start_addr = (unsigned long)__pa_nodebug(*initrd_start_p); +#else +	mc_saved_data_p = &mc_saved_data; +	mc_saved_in_initrd_p = mc_saved_in_initrd; +	initrd_start_addr = initrd_start; +#endif + +	/* +	 * If there is no valid ucode previously saved in memory, no need to +	 * update ucode on this AP. +	 */ +	if (mc_saved_data_p->mc_saved_count == 0) +		return; + +	collect_cpu_info_early(&uci); +	load_microcode(mc_saved_data_p, mc_saved_in_initrd_p, +		       initrd_start_addr, &uci); +	apply_microcode_early(mc_saved_data_p, &uci); +} diff --git a/arch/x86/kernel/cpu/microcode/intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c new file mode 100644 index 00000000000..ce69320d017 --- /dev/null +++ b/arch/x86/kernel/cpu/microcode/intel_lib.c @@ -0,0 +1,174 @@ +/* + *	Intel CPU Microcode Update Driver for Linux + * + *	Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com> + *			   H Peter Anvin" <hpa@zytor.com> + * + *	This driver allows to upgrade microcode on Intel processors + *	belonging to IA-32 family - PentiumPro, Pentium II, + *	Pentium III, Xeon, Pentium 4, etc. 
+ * + *	Reference: Section 8.11 of Volume 3a, IA-32 Intel® Architecture + *	Software Developer's Manual + *	Order Number 253668 or free download from: + * + *	http://developer.intel.com/Assets/PDF/manual/253668.pdf + * + *	For more information, go to http://www.urbanmyth.org/microcode + * + *	This program is free software; you can redistribute it and/or + *	modify it under the terms of the GNU General Public License + *	as published by the Free Software Foundation; either version + *	2 of the License, or (at your option) any later version. + * + */ +#include <linux/firmware.h> +#include <linux/uaccess.h> +#include <linux/kernel.h> +#include <linux/module.h> + +#include <asm/microcode_intel.h> +#include <asm/processor.h> +#include <asm/msr.h> + +static inline int +update_match_cpu(unsigned int csig, unsigned int cpf, +		 unsigned int sig, unsigned int pf) +{ +	return (!sigmatch(sig, csig, pf, cpf)) ? 0 : 1; +} + +int +update_match_revision(struct microcode_header_intel *mc_header, int rev) +{ +	return (mc_header->rev <= rev) ? 0 : 1; +} + +int microcode_sanity_check(void *mc, int print_err) +{ +	unsigned long total_size, data_size, ext_table_size; +	struct microcode_header_intel *mc_header = mc; +	struct extended_sigtable *ext_header = NULL; +	int sum, orig_sum, ext_sigcount = 0, i; +	struct extended_signature *ext_sig; + +	total_size = get_totalsize(mc_header); +	data_size = get_datasize(mc_header); + +	if (data_size + MC_HEADER_SIZE > total_size) { +		if (print_err) +			pr_err("error! Bad data size in microcode data file\n"); +		return -EINVAL; +	} + +	if (mc_header->ldrver != 1 || mc_header->hdrver != 1) { +		if (print_err) +			pr_err("error! Unknown microcode update format\n"); +		return -EINVAL; +	} +	ext_table_size = total_size - (MC_HEADER_SIZE + data_size); +	if (ext_table_size) { +		if ((ext_table_size < EXT_HEADER_SIZE) +		 || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) { +			if (print_err) +				pr_err("error! Small exttable size in microcode data file\n"); +			return -EINVAL; +		} +		ext_header = mc + MC_HEADER_SIZE + data_size; +		if (ext_table_size != exttable_size(ext_header)) { +			if (print_err) +				pr_err("error! 
Bad exttable size in microcode data file\n"); +			return -EFAULT; +		} +		ext_sigcount = ext_header->count; +	} + +	/* check extended table checksum */ +	if (ext_table_size) { +		int ext_table_sum = 0; +		int *ext_tablep = (int *)ext_header; + +		i = ext_table_size / DWSIZE; +		while (i--) +			ext_table_sum += ext_tablep[i]; +		if (ext_table_sum) { +			if (print_err) +				pr_warn("aborting, bad extended signature table checksum\n"); +			return -EINVAL; +		} +	} + +	/* calculate the checksum */ +	orig_sum = 0; +	i = (MC_HEADER_SIZE + data_size) / DWSIZE; +	while (i--) +		orig_sum += ((int *)mc)[i]; +	if (orig_sum) { +		if (print_err) +			pr_err("aborting, bad checksum\n"); +		return -EINVAL; +	} +	if (!ext_table_size) +		return 0; +	/* check extended signature checksum */ +	for (i = 0; i < ext_sigcount; i++) { +		ext_sig = (void *)ext_header + EXT_HEADER_SIZE + +			  EXT_SIGNATURE_SIZE * i; +		sum = orig_sum +			- (mc_header->sig + mc_header->pf + mc_header->cksum) +			+ (ext_sig->sig + ext_sig->pf + ext_sig->cksum); +		if (sum) { +			if (print_err) +				pr_err("aborting, bad checksum\n"); +			return -EINVAL; +		} +	} +	return 0; +} +EXPORT_SYMBOL_GPL(microcode_sanity_check); + +/* + * return 0 - no update found + * return 1 - found update + */ +int get_matching_sig(unsigned int csig, int cpf, void *mc, int rev) +{ +	struct microcode_header_intel *mc_header = mc; +	struct extended_sigtable *ext_header; +	unsigned long total_size = get_totalsize(mc_header); +	int ext_sigcount, i; +	struct extended_signature *ext_sig; + +	if (update_match_cpu(csig, cpf, mc_header->sig, mc_header->pf)) +		return 1; + +	/* Look for ext. headers: */ +	if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE) +		return 0; + +	ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE; +	ext_sigcount = ext_header->count; +	ext_sig = (void *)ext_header + EXT_HEADER_SIZE; + +	for (i = 0; i < ext_sigcount; i++) { +		if (update_match_cpu(csig, cpf, ext_sig->sig, ext_sig->pf)) +			return 1; +		ext_sig++; +	} +	return 0; +} + +/* + * return 0 - no update found + * return 1 - found update + */ +int get_matching_microcode(unsigned int csig, int cpf, void *mc, int rev) +{ +	struct microcode_header_intel *mc_header = mc; + +	if (!update_match_revision(mc_header, rev)) +		return 0; + +	return get_matching_sig(csig, cpf, mc, rev); +} +EXPORT_SYMBOL_GPL(get_matching_microcode); diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 71a39f3621b..a450373e8e9 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -15,7 +15,9 @@  #include <linux/clocksource.h>  #include <linux/module.h>  #include <linux/hardirq.h> +#include <linux/efi.h>  #include <linux/interrupt.h> +#include <linux/irq.h>  #include <asm/processor.h>  #include <asm/hypervisor.h>  #include <asm/hyperv.h> @@ -23,10 +25,52 @@  #include <asm/desc.h>  #include <asm/idle.h>  #include <asm/irq_regs.h> +#include <asm/i8259.h> +#include <asm/apic.h> +#include <asm/timer.h>  struct ms_hyperv_info ms_hyperv;  EXPORT_SYMBOL_GPL(ms_hyperv); +#if IS_ENABLED(CONFIG_HYPERV) +static void (*vmbus_handler)(void); + +void hyperv_vector_handler(struct pt_regs *regs) +{ +	struct pt_regs *old_regs = set_irq_regs(regs); + +	irq_enter(); +	exit_idle(); + +	inc_irq_stat(irq_hv_callback_count); +	if (vmbus_handler) +		vmbus_handler(); + +	irq_exit(); +	set_irq_regs(old_regs); +} + +void hv_setup_vmbus_irq(void (*handler)(void)) +{ +	vmbus_handler = handler; +	/* +	 * Setup the IDT for hypervisor callback. 
Prevent reallocation +	 * at module reload. +	 */ +	if (!test_bit(HYPERVISOR_CALLBACK_VECTOR, used_vectors)) +		alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, +				hyperv_callback_vector); +} + +void hv_remove_vmbus_irq(void) +{ +	/* We have no way to deallocate the interrupt gate */ +	vmbus_handler = NULL; +} +EXPORT_SYMBOL_GPL(hv_setup_vmbus_irq); +EXPORT_SYMBOL_GPL(hv_remove_vmbus_irq); +#endif +  static uint32_t  __init ms_hyperv_platform(void)  {  	u32 eax; @@ -76,8 +120,28 @@ static void __init ms_hyperv_init_platform(void)  	printk(KERN_INFO "HyperV: features 0x%x, hints 0x%x\n",  	       ms_hyperv.features, ms_hyperv.hints); +#ifdef CONFIG_X86_LOCAL_APIC +	if (ms_hyperv.features & HV_X64_MSR_APIC_FREQUENCY_AVAILABLE) { +		/* +		 * Get the APIC frequency. +		 */ +		u64	hv_lapic_frequency; + +		rdmsrl(HV_X64_MSR_APIC_FREQUENCY, hv_lapic_frequency); +		hv_lapic_frequency = div_u64(hv_lapic_frequency, HZ); +		lapic_timer_frequency = hv_lapic_frequency; +		printk(KERN_INFO "HyperV: LAPIC Timer Frequency: %#x\n", +				lapic_timer_frequency); +	} +#endif +  	if (ms_hyperv.features & HV_X64_MSR_TIME_REF_COUNT_AVAILABLE)  		clocksource_register_hz(&hyperv_cs, NSEC_PER_SEC/100); + +#ifdef CONFIG_X86_IO_APIC +	no_timer_check = 1; +#endif +  }  const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = { @@ -86,41 +150,3 @@ const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = {  	.init_platform		= ms_hyperv_init_platform,  };  EXPORT_SYMBOL(x86_hyper_ms_hyperv); - -#if IS_ENABLED(CONFIG_HYPERV) -static int vmbus_irq = -1; -static irq_handler_t vmbus_isr; - -void hv_register_vmbus_handler(int irq, irq_handler_t handler) -{ -	/* -	 * Setup the IDT for hypervisor callback. -	 */ -	alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, hyperv_callback_vector); - -	vmbus_irq = irq; -	vmbus_isr = handler; -} - -void hyperv_vector_handler(struct pt_regs *regs) -{ -	struct pt_regs *old_regs = set_irq_regs(regs); -	struct irq_desc *desc; - -	irq_enter(); -	exit_idle(); - -	desc = irq_to_desc(vmbus_irq); - -	if (desc) -		generic_handle_irq_desc(vmbus_irq, desc); - -	irq_exit(); -	set_irq_regs(old_regs); -} -#else -void hv_register_vmbus_handler(int irq, irq_handler_t handler) -{ -} -#endif -EXPORT_SYMBOL_GPL(hv_register_vmbus_handler); diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index ce2d0a2c3e4..0e25a1bc5ab 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -683,7 +683,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock)  	}  	/* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */ -	count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); +	count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);  	__flush_tlb();  	/* Save MTRR state */ @@ -697,7 +697,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock)  static void post_set(void) __releases(set_atomicity_lock)  {  	/* Flush TLBs (no need to flush caches - they are disabled) */ -	count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); +	count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);  	__flush_tlb();  	/* Intel (P6) standard MTRRs */ diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 897783b3302..2879ecdaac4 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -118,6 +118,9 @@ static int x86_pmu_extra_regs(u64 config, struct perf_event *event)  			continue;  		if (event->attr.config1 & ~er->valid_mask)  			return -EINVAL; +		/* Check if the extra msrs can be safely accessed*/ +		if (!er->extra_msr_access) +			return -ENXIO;  	
	reg->idx = er->idx;  		reg->config = event->attr.config1; @@ -303,15 +306,6 @@ int x86_setup_perfctr(struct perf_event *event)  		hwc->sample_period = x86_pmu.max_period;  		hwc->last_period = hwc->sample_period;  		local64_set(&hwc->period_left, hwc->sample_period); -	} else { -		/* -		 * If we have a PMU initialized but no APIC -		 * interrupts, we cannot sample hardware -		 * events (user-space has to fall back and -		 * sample via a hrtimer based software event): -		 */ -		if (!x86_pmu.apic) -			return -EOPNOTSUPP;  	}  	if (attr->type == PERF_TYPE_RAW) @@ -721,6 +715,7 @@ int perf_assign_events(struct perf_event **events, int n,  	return sched.state.unassigned;  } +EXPORT_SYMBOL_GPL(perf_assign_events);  int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)  { @@ -892,7 +887,6 @@ static void x86_pmu_enable(struct pmu *pmu)  		 * hw_perf_group_sched_in() or x86_pmu_enable()  		 *  		 * step1: save events moving to new counters -		 * step2: reprogram moved events into new counters  		 */  		for (i = 0; i < n_running; i++) {  			event = cpuc->event_list[i]; @@ -918,6 +912,9 @@ static void x86_pmu_enable(struct pmu *pmu)  			x86_pmu_stop(event, PERF_EF_UPDATE);  		} +		/* +		 * step2: reprogram moved events into new counters +		 */  		for (i = 0; i < cpuc->n_events; i++) {  			event = cpuc->event_list[i];  			hwc = &event->hw; @@ -1043,7 +1040,7 @@ static int x86_pmu_add(struct perf_event *event, int flags)  	/*  	 * If group events scheduling transaction was started,  	 * skip the schedulability test here, it will be performed -	 * at commit time (->commit_txn) as a whole +	 * at commit time (->commit_txn) as a whole.  	 */  	if (cpuc->group_flag & PERF_EVENT_TXN)  		goto done_collect; @@ -1058,6 +1055,10 @@ static int x86_pmu_add(struct perf_event *event, int flags)  	memcpy(cpuc->assign, assign, n*sizeof(int));  done_collect: +	/* +	 * Commit the collect_events() state. See x86_pmu_del() and +	 * x86_pmu_*_txn(). +	 */  	cpuc->n_events = n;  	cpuc->n_added += n - n0;  	cpuc->n_txn += n - n0; @@ -1183,25 +1184,38 @@ static void x86_pmu_del(struct perf_event *event, int flags)  	 * If we're called during a txn, we don't need to do anything.  	 * The events never got scheduled and ->cancel_txn will truncate  	 * the event_list. +	 * +	 * XXX assumes any ->del() called during a TXN will only be on +	 * an event added during that same TXN.  	 */  	if (cpuc->group_flag & PERF_EVENT_TXN)  		return; +	/* +	 * Not a TXN, therefore cleanup properly. +	 */  	x86_pmu_stop(event, PERF_EF_UPDATE);  	for (i = 0; i < cpuc->n_events; i++) { -		if (event == cpuc->event_list[i]) { +		if (event == cpuc->event_list[i]) +			break; +	} -			if (x86_pmu.put_event_constraints) -				x86_pmu.put_event_constraints(cpuc, event); +	if (WARN_ON_ONCE(i == cpuc->n_events)) /* called ->del() without ->add() ? */ +		return; -			while (++i < cpuc->n_events) -				cpuc->event_list[i-1] = cpuc->event_list[i]; +	/* If we have a newly added event; make sure to decrease n_added. */ +	if (i >= cpuc->n_events - cpuc->n_added) +		--cpuc->n_added; + +	if (x86_pmu.put_event_constraints) +		x86_pmu.put_event_constraints(cpuc, event); + +	/* Delete the array entry. 
*/ +	while (++i < cpuc->n_events) +		cpuc->event_list[i-1] = cpuc->event_list[i]; +	--cpuc->n_events; -			--cpuc->n_events; -			break; -		} -	}  	perf_event_update_userpage(event);  } @@ -1273,24 +1287,25 @@ void perf_events_lapic_init(void)  	apic_write(APIC_LVTPC, APIC_DM_NMI);  } -static int __kprobes +static int  perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs)  { -	int ret;  	u64 start_clock;  	u64 finish_clock; +	int ret;  	if (!atomic_read(&active_events))  		return NMI_DONE; -	start_clock = local_clock(); +	start_clock = sched_clock();  	ret = x86_pmu.handle_irq(regs); -	finish_clock = local_clock(); +	finish_clock = sched_clock();  	perf_sample_event_took(finish_clock - start_clock);  	return ret;  } +NOKPROBE_SYMBOL(perf_event_nmi_handler);  struct event_constraint emptyconstraint;  struct event_constraint unconstrained; @@ -1346,6 +1361,15 @@ static void __init pmu_check_apic(void)  	x86_pmu.apic = 0;  	pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");  	pr_info("no hardware sampling interrupt available.\n"); + +	/* +	 * If we have a PMU initialized but no APIC +	 * interrupts, we cannot sample hardware +	 * events (user-space has to fall back and +	 * sample via a hrtimer based software event): +	 */ +	pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT; +  }  static struct attribute_group x86_pmu_format_group = { @@ -1521,6 +1545,8 @@ static int __init init_hw_perf_events(void)  	pr_cont("%s PMU driver.\n", x86_pmu.name); +	x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */ +  	for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next)  		quirk->func(); @@ -1534,7 +1560,6 @@ static int __init init_hw_perf_events(void)  		__EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,  				   0, x86_pmu.num_counters, 0, 0); -	x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */  	x86_pmu_format_group.attrs = x86_pmu.format_attrs;  	if (x86_pmu.event_attrs) @@ -1594,7 +1619,8 @@ static void x86_pmu_cancel_txn(struct pmu *pmu)  {  	__this_cpu_and(cpu_hw_events.group_flag, ~PERF_EVENT_TXN);  	/* -	 * Truncate the collected events. +	 * Truncate collected array by the number of events added in this +	 * transaction. See x86_pmu_add() and x86_pmu_*_txn().  	 */  	__this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn));  	__this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn)); @@ -1605,6 +1631,8 @@ static void x86_pmu_cancel_txn(struct pmu *pmu)   * Commit group events scheduling transaction   * Perform the group schedulability test as a whole   * Return 0 if success + * + * Does not cancel the transaction on failure; expects the caller to do this.   
*/  static int x86_pmu_commit_txn(struct pmu *pmu)  { @@ -1820,9 +1848,12 @@ static ssize_t set_attr_rdpmc(struct device *cdev,  	if (ret)  		return ret; +	if (x86_pmu.attr_rdpmc_broken) +		return -ENOTSUPP; +  	if (!!val != !!x86_pmu.attr_rdpmc) {  		x86_pmu.attr_rdpmc = !!val; -		smp_call_function(change_rdpmc, (void *)val, 1); +		on_each_cpu(change_rdpmc, (void *)val, 1);  	}  	return count; @@ -1883,26 +1914,27 @@ static struct pmu pmu = {  void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)  { +	struct cyc2ns_data *data; +  	userpg->cap_user_time = 0;  	userpg->cap_user_time_zero = 0;  	userpg->cap_user_rdpmc = x86_pmu.attr_rdpmc;  	userpg->pmc_width = x86_pmu.cntval_bits; -	if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) +	if (!sched_clock_stable())  		return; -	if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) -		return; +	data = cyc2ns_read_begin();  	userpg->cap_user_time = 1; -	userpg->time_mult = this_cpu_read(cyc2ns); -	userpg->time_shift = CYC2NS_SCALE_FACTOR; -	userpg->time_offset = this_cpu_read(cyc2ns_offset) - now; +	userpg->time_mult = data->cyc2ns_mul; +	userpg->time_shift = data->cyc2ns_shift; +	userpg->time_offset = data->cyc2ns_offset - now; -	if (sched_clock_stable && !check_tsc_disabled()) { -		userpg->cap_user_time_zero = 1; -		userpg->time_zero = this_cpu_read(cyc2ns_offset); -	} +	userpg->cap_user_time_zero = 1; +	userpg->time_zero = data->cyc2ns_offset; + +	cyc2ns_read_end(data);  }  /* @@ -1994,7 +2026,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)  		frame.return_address = 0;  		bytes = copy_from_user_nmi(&frame, fp, sizeof(frame)); -		if (bytes != sizeof(frame)) +		if (bytes != 0)  			break;  		if (!valid_user_frame(fp, sizeof(frame))) @@ -2046,7 +2078,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)  		frame.return_address = 0;  		bytes = copy_from_user_nmi(&frame, fp, sizeof(frame)); -		if (bytes != sizeof(frame)) +		if (bytes != 0)  			break;  		if (!valid_user_frame(fp, sizeof(frame))) diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index cc16faae053..8ade93111e0 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -130,9 +130,11 @@ struct cpu_hw_events {  	unsigned long		running[BITS_TO_LONGS(X86_PMC_IDX_MAX)];  	int			enabled; -	int			n_events; -	int			n_added; -	int			n_txn; +	int			n_events; /* the # of events in the below arrays */ +	int			n_added;  /* the # last events in the below arrays; +					     they've never been enabled yet */ +	int			n_txn;    /* the # last events in the below arrays; +					     added in the current transaction */  	int			assign[X86_PMC_IDX_MAX]; /* event to counter assignment */  	u64			tags[X86_PMC_IDX_MAX];  	struct perf_event	*event_list[X86_PMC_IDX_MAX]; /* in enabled order */ @@ -164,6 +166,11 @@ struct cpu_hw_events {  	struct perf_guest_switch_msr	guest_switch_msrs[X86_PMC_IDX_MAX];  	/* +	 * Intel checkpoint mask +	 */ +	u64				intel_cp_status; + +	/*  	 * manage shared (per-core, per-cpu) registers  	 * used on Intel NHM/WSM/SNB  	 */ @@ -257,11 +264,20 @@ struct cpu_hw_events {  	__EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \  			  HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW) -#define EVENT_CONSTRAINT_END		\ -	EVENT_CONSTRAINT(0, 0, 0) +/* + * We define the end marker as having a weight of -1 + * to enable blacklisting of events using a counter bitmask + * of zero and thus a weight of zero. 
+ * The end marker has a weight that cannot possibly be + * obtained from counting the bits in the bitmask. + */ +#define EVENT_CONSTRAINT_END { .weight = -1 } +/* + * Check for end marker with weight == -1 + */  #define for_each_event_constraint(e, c)	\ -	for ((e) = (c); (e)->weight; (e)++) +	for ((e) = (c); (e)->weight != -1; (e)++)  /*   * Extra registers for specific events. @@ -279,14 +295,16 @@ struct extra_reg {  	u64			config_mask;  	u64			valid_mask;  	int			idx;  /* per_xxx->regs[] reg index */ +	bool			extra_msr_access;  };  #define EVENT_EXTRA_REG(e, ms, m, vm, i) {	\ -	.event = (e),		\ -	.msr = (ms),		\ -	.config_mask = (m),	\ -	.valid_mask = (vm),	\ -	.idx = EXTRA_REG_##i,	\ +	.event = (e),			\ +	.msr = (ms),			\ +	.config_mask = (m),		\ +	.valid_mask = (vm),		\ +	.idx = EXTRA_REG_##i,		\ +	.extra_msr_access = true,	\  	}  #define INTEL_EVENT_EXTRA_REG(event, msr, vm, idx)	\ @@ -395,6 +413,7 @@ struct x86_pmu {  	/*  	 * sysfs attrs  	 */ +	int		attr_rdpmc_broken;  	int		attr_rdpmc;  	struct attribute **format_attrs;  	struct attribute **event_attrs; @@ -440,6 +459,7 @@ struct x86_pmu {  	int		lbr_nr;			   /* hardware stack size */  	u64		lbr_sel_mask;		   /* LBR_SELECT valid bits */  	const int	*lbr_sel_map;		   /* lbr_select mappings */ +	bool		lbr_double_abort;	   /* duplicated lbr aborts */  	/*  	 * Extra registers for events diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index e09f0bfb7b8..cbb1be3ed9e 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -10,6 +10,7 @@  #include <linux/module.h>  #include <linux/pci.h>  #include <linux/ptrace.h> +#include <linux/syscore_ops.h>  #include <asm/apic.h> @@ -592,7 +593,7 @@ out:  	return 1;  } -static int __kprobes +static int  perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs)  {  	int handled = 0; @@ -605,6 +606,7 @@ perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs)  	return handled;  } +NOKPROBE_SYMBOL(perf_ibs_nmi_handler);  static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name)  { @@ -816,6 +818,18 @@ out:  	return ret;  } +static void ibs_eilvt_setup(void) +{ +	/* +	 * Force LVT offset assignment for family 10h: The offsets are +	 * not assigned by the BIOS for this family, so the OS is +	 * responsible for doing it. If the OS assignment fails, fall +	 * back to BIOS settings and try to setup this. +	 */ +	if (boot_cpu_data.x86 == 0x10) +		force_ibs_eilvt_setup(); +} +  static inline int get_ibs_lvt_offset(void)  {  	u64 val; @@ -851,6 +865,36 @@ static void clear_APIC_ibs(void *dummy)  		setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1);  } +#ifdef CONFIG_PM + +static int perf_ibs_suspend(void) +{ +	clear_APIC_ibs(NULL); +	return 0; +} + +static void perf_ibs_resume(void) +{ +	ibs_eilvt_setup(); +	setup_APIC_ibs(NULL); +} + +static struct syscore_ops perf_ibs_syscore_ops = { +	.resume		= perf_ibs_resume, +	.suspend	= perf_ibs_suspend, +}; + +static void perf_ibs_pm_init(void) +{ +	register_syscore_ops(&perf_ibs_syscore_ops); +} + +#else + +static inline void perf_ibs_pm_init(void) { } + +#endif +  static int  perf_ibs_cpu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)  { @@ -877,25 +921,19 @@ static __init int amd_ibs_init(void)  	if (!caps)  		return -ENODEV;	/* ibs not supported by the cpu */ -	/* -	 * Force LVT offset assignment for family 10h: The offsets are -	 * not assigned by the BIOS for this family, so the OS is -	 * responsible for doing it. 
If the OS assignment fails, fall -	 * back to BIOS settings and try to setup this. -	 */ -	if (boot_cpu_data.x86 == 0x10) -		force_ibs_eilvt_setup(); +	ibs_eilvt_setup();  	if (!ibs_eilvt_valid())  		goto out; -	get_online_cpus(); +	perf_ibs_pm_init(); +	cpu_notifier_register_begin();  	ibs_caps = caps;  	/* make ibs_caps visible to other cpus: */  	smp_mb(); -	perf_cpu_notifier(perf_ibs_cpu_notifier);  	smp_call_function(setup_APIC_ibs, NULL, 1); -	put_online_cpus(); +	__perf_cpu_notifier(perf_ibs_cpu_notifier); +	cpu_notifier_register_done();  	ret = perf_event_ibs_init();  out: diff --git a/arch/x86/kernel/cpu/perf_event_amd_uncore.c b/arch/x86/kernel/cpu/perf_event_amd_uncore.c index 754291adec3..3bbdf4cd38b 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_amd_uncore.c @@ -531,15 +531,16 @@ static int __init amd_uncore_init(void)  	if (ret)  		return -ENODEV; -	get_online_cpus(); +	cpu_notifier_register_begin(); +  	/* init cpus already online before registering for hotplug notifier */  	for_each_online_cpu(cpu) {  		amd_uncore_cpu_up_prepare(cpu);  		smp_call_function_single(cpu, init_cpu_already_online, NULL, 1);  	} -	register_cpu_notifier(&amd_uncore_cpu_notifier_block); -	put_online_cpus(); +	__register_cpu_notifier(&amd_uncore_cpu_notifier_block); +	cpu_notifier_register_done();  	return 0;  } diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index f31a1655d1f..2502d0d9d24 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -169,7 +169,6 @@ static struct event_constraint intel_slm_event_constraints[] __read_mostly =  {  	FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */  	FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ -	FIXED_EVENT_CONSTRAINT(0x013c, 2), /* CPU_CLK_UNHALTED.REF */  	FIXED_EVENT_CONSTRAINT(0x0300, 2), /* pseudo CPU_CLK_UNHALTED.REF */  	EVENT_CONSTRAINT_END  }; @@ -190,9 +189,9 @@ static struct extra_reg intel_snbep_extra_regs[] __read_mostly = {  	EVENT_EXTRA_END  }; -EVENT_ATTR_STR(mem-loads, mem_ld_nhm, "event=0x0b,umask=0x10,ldlat=3"); -EVENT_ATTR_STR(mem-loads, mem_ld_snb, "event=0xcd,umask=0x1,ldlat=3"); -EVENT_ATTR_STR(mem-stores, mem_st_snb, "event=0xcd,umask=0x2"); +EVENT_ATTR_STR(mem-loads,	mem_ld_nhm,	"event=0x0b,umask=0x10,ldlat=3"); +EVENT_ATTR_STR(mem-loads,	mem_ld_snb,	"event=0xcd,umask=0x1,ldlat=3"); +EVENT_ATTR_STR(mem-stores,	mem_st_snb,	"event=0xcd,umask=0x2");  struct attribute *nhm_events_attrs[] = {  	EVENT_PTR(mem_ld_nhm), @@ -1184,6 +1183,11 @@ static void intel_pmu_disable_fixed(struct hw_perf_event *hwc)  	wrmsrl(hwc->config_base, ctrl_val);  } +static inline bool event_is_checkpointed(struct perf_event *event) +{ +	return (event->hw.config & HSW_IN_TX_CHECKPOINTED) != 0; +} +  static void intel_pmu_disable_event(struct perf_event *event)  {  	struct hw_perf_event *hwc = &event->hw; @@ -1197,6 +1201,7 @@ static void intel_pmu_disable_event(struct perf_event *event)  	cpuc->intel_ctrl_guest_mask &= ~(1ull << hwc->idx);  	cpuc->intel_ctrl_host_mask &= ~(1ull << hwc->idx); +	cpuc->intel_cp_status &= ~(1ull << hwc->idx);  	/*  	 * must disable before any actual event @@ -1271,6 +1276,9 @@ static void intel_pmu_enable_event(struct perf_event *event)  	if (event->attr.exclude_guest)  		cpuc->intel_ctrl_host_mask |= (1ull << hwc->idx); +	if (unlikely(event_is_checkpointed(event))) +		cpuc->intel_cp_status |= (1ull << hwc->idx); +  	if (unlikely(hwc->config_base == 
MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {  		intel_pmu_enable_fixed(hwc);  		return; @@ -1289,6 +1297,17 @@ static void intel_pmu_enable_event(struct perf_event *event)  int intel_pmu_save_and_restart(struct perf_event *event)  {  	x86_perf_event_update(event); +	/* +	 * For a checkpointed counter always reset back to 0.  This +	 * avoids a situation where the counter overflows, aborts the +	 * transaction and is then set back to shortly before the +	 * overflow, and overflows and aborts again. +	 */ +	if (unlikely(event_is_checkpointed(event))) { +		/* No race with NMIs because the counter should not be armed */ +		wrmsrl(event->hw.event_base, 0); +		local64_set(&event->hw.prev_count, 0); +	}  	return x86_perf_event_set_period(event);  } @@ -1341,10 +1360,8 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)  	intel_pmu_disable_all();  	handled = intel_pmu_drain_bts_buffer();  	status = intel_pmu_get_status(); -	if (!status) { -		intel_pmu_enable_all(0); -		return handled; -	} +	if (!status) +		goto done;  	loops = 0;  again: @@ -1365,6 +1382,15 @@ again:  	intel_pmu_lbr_read();  	/* +	 * CondChgd bit 63 doesn't mean any overflow status. Ignore +	 * and clear the bit. +	 */ +	if (__test_and_clear_bit(63, (unsigned long *)&status)) { +		if (!status) +			goto done; +	} + +	/*  	 * PEBS overflow sets bit 62 in the global status register  	 */  	if (__test_and_clear_bit(62, (unsigned long *)&status)) { @@ -1372,6 +1398,13 @@ again:  		x86_pmu.drain_pebs(regs);  	} +	/* +	 * Checkpointed counters can lead to 'spurious' PMIs because the +	 * rollback caused by the PMI will have cleared the overflow status +	 * bit. Therefore always force probe these counters. +	 */ +	status |= cpuc->intel_cp_status; +  	for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {  		struct perf_event *event = cpuc->events[bit]; @@ -1837,6 +1870,20 @@ static int hsw_hw_config(struct perf_event *event)  	      event->attr.precise_ip > 0))  		return -EOPNOTSUPP; +	if (event_is_checkpointed(event)) { +		/* +		 * Sampling of checkpointed events can cause situations where +		 * the CPU constantly aborts because of a overflow, which is +		 * then checkpointed back and ignored. Forbid checkpointing +		 * for sampling. +		 * +		 * But still allow a long sampling period, so that perf stat +		 * from KVM works. +		 */ +		if (event->attr.sample_period > 0 && +		    event->attr.sample_period < 0x7fffffff) +			return -EOPNOTSUPP; +	}  	return 0;  } @@ -2135,6 +2182,41 @@ static void intel_snb_check_microcode(void)  	}  } +/* + * Under certain circumstances, access certain MSR may cause #GP. + * The function tests if the input MSR can be safely accessed. + */ +static bool check_msr(unsigned long msr, u64 mask) +{ +	u64 val_old, val_new, val_tmp; + +	/* +	 * Read the current value, change it and read it back to see if it +	 * matches, this is needed to detect certain hardware emulators +	 * (qemu/kvm) that don't trap on the MSR access and always return 0s. +	 */ +	if (rdmsrl_safe(msr, &val_old)) +		return false; + +	/* +	 * Only change the bits which can be updated by wrmsrl. +	 */ +	val_tmp = val_old ^ mask; +	if (wrmsrl_safe(msr, val_tmp) || +	    rdmsrl_safe(msr, &val_new)) +		return false; + +	if (val_new != val_tmp) +		return false; + +	/* Here it's sure that the MSR can be safely accessed. +	 * Restore the old value and return. 
+	 */ +	wrmsrl(msr, val_old); + +	return true; +} +  static __init void intel_sandybridge_quirk(void)  {  	x86_pmu.check_microcode = intel_snb_check_microcode; @@ -2182,10 +2264,36 @@ static __init void intel_nehalem_quirk(void)  	}  } -EVENT_ATTR_STR(mem-loads,      mem_ld_hsw,     "event=0xcd,umask=0x1,ldlat=3"); -EVENT_ATTR_STR(mem-stores,     mem_st_hsw,     "event=0xd0,umask=0x82") +EVENT_ATTR_STR(mem-loads,	mem_ld_hsw,	"event=0xcd,umask=0x1,ldlat=3"); +EVENT_ATTR_STR(mem-stores,	mem_st_hsw,	"event=0xd0,umask=0x82") + +/* Haswell special events */ +EVENT_ATTR_STR(tx-start,	tx_start,	"event=0xc9,umask=0x1"); +EVENT_ATTR_STR(tx-commit,	tx_commit,	"event=0xc9,umask=0x2"); +EVENT_ATTR_STR(tx-abort,	tx_abort,	"event=0xc9,umask=0x4"); +EVENT_ATTR_STR(tx-capacity,	tx_capacity,	"event=0x54,umask=0x2"); +EVENT_ATTR_STR(tx-conflict,	tx_conflict,	"event=0x54,umask=0x1"); +EVENT_ATTR_STR(el-start,	el_start,	"event=0xc8,umask=0x1"); +EVENT_ATTR_STR(el-commit,	el_commit,	"event=0xc8,umask=0x2"); +EVENT_ATTR_STR(el-abort,	el_abort,	"event=0xc8,umask=0x4"); +EVENT_ATTR_STR(el-capacity,	el_capacity,	"event=0x54,umask=0x2"); +EVENT_ATTR_STR(el-conflict,	el_conflict,	"event=0x54,umask=0x1"); +EVENT_ATTR_STR(cycles-t,	cycles_t,	"event=0x3c,in_tx=1"); +EVENT_ATTR_STR(cycles-ct,	cycles_ct,	"event=0x3c,in_tx=1,in_tx_cp=1");  static struct attribute *hsw_events_attrs[] = { +	EVENT_PTR(tx_start), +	EVENT_PTR(tx_commit), +	EVENT_PTR(tx_abort), +	EVENT_PTR(tx_capacity), +	EVENT_PTR(tx_conflict), +	EVENT_PTR(el_start), +	EVENT_PTR(el_commit), +	EVENT_PTR(el_abort), +	EVENT_PTR(el_capacity), +	EVENT_PTR(el_conflict), +	EVENT_PTR(cycles_t), +	EVENT_PTR(cycles_ct),  	EVENT_PTR(mem_ld_hsw),  	EVENT_PTR(mem_st_hsw),  	NULL @@ -2198,7 +2306,8 @@ __init int intel_pmu_init(void)  	union cpuid10_ebx ebx;  	struct event_constraint *c;  	unsigned int unused; -	int version; +	struct extra_reg *er; +	int version, i;  	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {  		switch (boot_cpu_data.x86) { @@ -2243,10 +2352,7 @@ __init int intel_pmu_init(void)  	if (version > 1)  		x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3); -	/* -	 * v2 and above have a perf capabilities MSR -	 */ -	if (version > 1) { +	if (boot_cpu_has(X86_FEATURE_PDCM)) {  		u64 capabilities;  		rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities); @@ -2404,6 +2510,9 @@ __init int intel_pmu_init(void)  	case 62: /* IvyBridge EP */  		memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,  		       sizeof(hw_cache_event_ids)); +		/* dTLB-load-misses on IVB is different than SNB */ +		hw_cache_event_ids[C(DTLB)][C(OP_READ)][C(RESULT_MISS)] = 0x8108; /* DTLB_LOAD_MISSES.DEMAND_LD_MISS_CAUSES_A_WALK */ +  		memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs,  		       sizeof(hw_cache_extra_regs)); @@ -2452,6 +2561,7 @@ __init int intel_pmu_init(void)  		x86_pmu.hw_config = hsw_hw_config;  		x86_pmu.get_event_constraints = hsw_get_event_constraints;  		x86_pmu.cpu_events = hsw_events_attrs; +		x86_pmu.lbr_double_abort = true;  		pr_cont("Haswell events, ");  		break; @@ -2503,6 +2613,34 @@ __init int intel_pmu_init(void)  		}  	} +	/* +	 * Access LBR MSR may cause #GP under certain circumstances. +	 * E.g. KVM doesn't support LBR MSR +	 * Check all LBT MSR here. +	 * Disable LBR access if any LBR MSRs can not be accessed. 
+	 */ +	if (x86_pmu.lbr_nr && !check_msr(x86_pmu.lbr_tos, 0x3UL)) +		x86_pmu.lbr_nr = 0; +	for (i = 0; i < x86_pmu.lbr_nr; i++) { +		if (!(check_msr(x86_pmu.lbr_from + i, 0xffffUL) && +		      check_msr(x86_pmu.lbr_to + i, 0xffffUL))) +			x86_pmu.lbr_nr = 0; +	} + +	/* +	 * Access extra MSR may cause #GP under certain circumstances. +	 * E.g. KVM doesn't support offcore event +	 * Check all extra_regs here. +	 */ +	if (x86_pmu.extra_regs) { +		for (er = x86_pmu.extra_regs; er->msr; er++) { +			er->extra_msr_access = check_msr(er->msr, 0x1ffUL); +			/* Disable LBR select mapping */ +			if ((er->idx == EXTRA_REG_LBR) && !er->extra_msr_access) +				x86_pmu.lbr_sel_map = NULL; +		} +	} +  	/* Support full width counters using alternative MSR range */  	if (x86_pmu.intel_cap.full_width_write) {  		x86_pmu.max_period = x86_pmu.cntval_mask; diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index ab3ba1c1b7d..696ade311de 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -12,6 +12,7 @@  #define BTS_BUFFER_SIZE		(PAGE_SIZE << 4)  #define PEBS_BUFFER_SIZE	PAGE_SIZE +#define PEBS_FIXUP_SIZE		PAGE_SIZE  /*   * pebs_record_32 for p4 and core not supported @@ -107,15 +108,31 @@ static u64 precise_store_data(u64 status)  	return val;  } -static u64 precise_store_data_hsw(u64 status) +static u64 precise_store_data_hsw(struct perf_event *event, u64 status)  {  	union perf_mem_data_src dse; +	u64 cfg = event->hw.config & INTEL_ARCH_EVENT_MASK;  	dse.val = 0;  	dse.mem_op = PERF_MEM_OP_STORE;  	dse.mem_lvl = PERF_MEM_LVL_NA; + +	/* +	 * L1 info only valid for following events: +	 * +	 * MEM_UOPS_RETIRED.STLB_MISS_STORES +	 * MEM_UOPS_RETIRED.LOCK_STORES +	 * MEM_UOPS_RETIRED.SPLIT_STORES +	 * MEM_UOPS_RETIRED.ALL_STORES +	 */ +	if (cfg != 0x12d0 && cfg != 0x22d0 && cfg != 0x42d0 && cfg != 0x82d0) +		return dse.mem_lvl; +  	if (status & 1) -		dse.mem_lvl = PERF_MEM_LVL_L1; +		dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT; +	else +		dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_MISS; +  	/* Nothing else supported. Sorry. */  	return dse.val;  } @@ -182,18 +199,32 @@ struct pebs_record_nhm {   * Same as pebs_record_nhm, with two additional fields.   */  struct pebs_record_hsw { -	struct pebs_record_nhm nhm; -	/* -	 * Real IP of the event. In the Intel documentation this -	 * is called eventingrip. -	 */ -	u64 real_ip; -	/* -	 * TSX tuning information field: abort cycles and abort flags. 
-	 */ -	u64 tsx_tuning; +	u64 flags, ip; +	u64 ax, bx, cx, dx; +	u64 si, di, bp, sp; +	u64 r8,  r9,  r10, r11; +	u64 r12, r13, r14, r15; +	u64 status, dla, dse, lat; +	u64 real_ip, tsx_tuning; +}; + +union hsw_tsx_tuning { +	struct { +		u32 cycles_last_block     : 32, +		    hle_abort		  : 1, +		    rtm_abort		  : 1, +		    instruction_abort     : 1, +		    non_instruction_abort : 1, +		    retry		  : 1, +		    data_conflict	  : 1, +		    capacity_writes	  : 1, +		    capacity_reads	  : 1; +	}; +	u64	    value;  }; +#define PEBS_HSW_TSX_FLAGS	0xff00000000ULL +  void init_debug_store_on_cpu(int cpu)  {  	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; @@ -214,12 +245,14 @@ void fini_debug_store_on_cpu(int cpu)  	wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);  } +static DEFINE_PER_CPU(void *, insn_buffer); +  static int alloc_pebs_buffer(int cpu)  {  	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;  	int node = cpu_to_node(cpu);  	int max, thresh = 1; /* always use a single PEBS record */ -	void *buffer; +	void *buffer, *ibuffer;  	if (!x86_pmu.pebs)  		return 0; @@ -228,6 +261,19 @@ static int alloc_pebs_buffer(int cpu)  	if (unlikely(!buffer))  		return -ENOMEM; +	/* +	 * HSW+ already provides us the eventing ip; no need to allocate this +	 * buffer then. +	 */ +	if (x86_pmu.intel_cap.pebs_format < 2) { +		ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node); +		if (!ibuffer) { +			kfree(buffer); +			return -ENOMEM; +		} +		per_cpu(insn_buffer, cpu) = ibuffer; +	} +  	max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size;  	ds->pebs_buffer_base = (u64)(unsigned long)buffer; @@ -248,6 +294,9 @@ static void release_pebs_buffer(int cpu)  	if (!ds || !x86_pmu.pebs)  		return; +	kfree(per_cpu(insn_buffer, cpu)); +	per_cpu(insn_buffer, cpu) = NULL; +  	kfree((void *)(unsigned long)ds->pebs_buffer_base);  	ds->pebs_buffer_base = 0;  } @@ -262,9 +311,11 @@ static int alloc_bts_buffer(int cpu)  	if (!x86_pmu.bts)  		return 0; -	buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL, node); -	if (unlikely(!buffer)) +	buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node); +	if (unlikely(!buffer)) { +		WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__);  		return -ENOMEM; +	}  	max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;  	thresh = max / 16; @@ -715,6 +766,7 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)  	unsigned long old_to, to = cpuc->lbr_entries[0].to;  	unsigned long ip = regs->ip;  	int is_64bit = 0; +	void *kaddr;  	/*  	 * We don't need to fixup if the PEBS assist is fault like @@ -738,7 +790,7 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)  	 * unsigned math, either ip is before the start (impossible) or  	 * the basic block is larger than 1 page (sanity)  	 */ -	if ((ip - to) > PAGE_SIZE) +	if ((ip - to) > PEBS_FIXUP_SIZE)  		return 0;  	/* @@ -749,29 +801,33 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)  		return 1;  	} +	if (!kernel_ip(ip)) { +		int size, bytes; +		u8 *buf = this_cpu_read(insn_buffer); + +		size = ip - to; /* Must fit our buffer, see above */ +		bytes = copy_from_user_nmi(buf, (void __user *)to, size); +		if (bytes != 0) +			return 0; + +		kaddr = buf; +	} else { +		kaddr = (void *)to; +	} +  	do {  		struct insn insn; -		u8 buf[MAX_INSN_SIZE]; -		void *kaddr;  		old_to = to; -		if (!kernel_ip(ip)) { -			int bytes, size = MAX_INSN_SIZE; - -			bytes = copy_from_user_nmi(buf, (void __user *)to, size); -			if (bytes != size) -				return 0; - -			kaddr = buf; -		} else -			kaddr = (void *)to;  #ifdef 
CONFIG_X86_64  		is_64bit = kernel_ip(to) || !test_thread_flag(TIF_IA32);  #endif  		insn_init(&insn, kaddr, is_64bit);  		insn_get_length(&insn); +  		to += insn.length; +		kaddr += insn.length;  	} while (to < ip);  	if (to == ip) { @@ -786,16 +842,34 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)  	return 0;  } +static inline u64 intel_hsw_weight(struct pebs_record_hsw *pebs) +{ +	if (pebs->tsx_tuning) { +		union hsw_tsx_tuning tsx = { .value = pebs->tsx_tuning }; +		return tsx.cycles_last_block; +	} +	return 0; +} + +static inline u64 intel_hsw_transaction(struct pebs_record_hsw *pebs) +{ +	u64 txn = (pebs->tsx_tuning & PEBS_HSW_TSX_FLAGS) >> 32; + +	/* For RTM XABORTs also log the abort code from AX */ +	if ((txn & PERF_TXN_TRANSACTION) && (pebs->ax & 1)) +		txn |= ((pebs->ax >> 24) & 0xff) << PERF_TXN_ABORT_SHIFT; +	return txn; +} +  static void __intel_pmu_pebs_event(struct perf_event *event,  				   struct pt_regs *iregs, void *__pebs)  {  	/* -	 * We cast to pebs_record_nhm to get the load latency data -	 * if extra_reg MSR_PEBS_LD_LAT_THRESHOLD used +	 * We cast to the biggest pebs_record but are careful not to +	 * unconditionally access the 'extra' entries.  	 */  	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); -	struct pebs_record_nhm *pebs = __pebs; -	struct pebs_record_hsw *pebs_hsw = __pebs; +	struct pebs_record_hsw *pebs = __pebs;  	struct perf_sample_data data;  	struct pt_regs regs;  	u64 sample_type; @@ -831,7 +905,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event,  				data.data_src.val = load_latency_data(pebs->dse);  			else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW)  				data.data_src.val = -					precise_store_data_hsw(pebs->dse); +					precise_store_data_hsw(event, pebs->dse);  			else  				data.data_src.val = precise_store_data(pebs->dse);  		} @@ -854,7 +928,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event,  	regs.sp = pebs->sp;  	if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format >= 2) { -		regs.ip = pebs_hsw->real_ip; +		regs.ip = pebs->real_ip;  		regs.flags |= PERF_EFLAGS_EXACT;  	} else if (event->attr.precise_ip > 1 && intel_pmu_pebs_fixup_ip(®s))  		regs.flags |= PERF_EFLAGS_EXACT; @@ -862,9 +936,18 @@ static void __intel_pmu_pebs_event(struct perf_event *event,  		regs.flags &= ~PERF_EFLAGS_EXACT;  	if ((event->attr.sample_type & PERF_SAMPLE_ADDR) && -		x86_pmu.intel_cap.pebs_format >= 1) +	    x86_pmu.intel_cap.pebs_format >= 1)  		data.addr = pebs->dla; +	if (x86_pmu.intel_cap.pebs_format >= 2) { +		/* Only set the TSX weight when no memory weight. 
*/ +		if ((event->attr.sample_type & PERF_SAMPLE_WEIGHT) && !fll) +			data.weight = intel_hsw_weight(pebs); + +		if (event->attr.sample_type & PERF_SAMPLE_TRANSACTION) +			data.txn = intel_hsw_transaction(pebs); +	} +  	if (has_branch_stack(event))  		data.br_stack = &cpuc->lbr_stack; @@ -913,17 +996,34 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)  	__intel_pmu_pebs_event(event, iregs, at);  } -static void __intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, void *at, -					void *top) +static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)  {  	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);  	struct debug_store *ds = cpuc->ds;  	struct perf_event *event = NULL; +	void *at, *top;  	u64 status = 0;  	int bit; +	if (!x86_pmu.pebs_active) +		return; + +	at  = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base; +	top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index; +  	ds->pebs_index = ds->pebs_buffer_base; +	if (unlikely(at > top)) +		return; + +	/* +	 * Should not happen, we program the threshold at 1 and do not +	 * set a reset value. +	 */ +	WARN_ONCE(top - at > x86_pmu.max_pebs_events * x86_pmu.pebs_record_size, +		  "Unexpected number of pebs records %ld\n", +		  (long)(top - at) / x86_pmu.pebs_record_size); +  	for (; at < top; at += x86_pmu.pebs_record_size) {  		struct pebs_record_nhm *p = at; @@ -951,61 +1051,6 @@ static void __intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, void *at,  	}  } -static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) -{ -	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); -	struct debug_store *ds = cpuc->ds; -	struct pebs_record_nhm *at, *top; -	int n; - -	if (!x86_pmu.pebs_active) -		return; - -	at  = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base; -	top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index; - -	ds->pebs_index = ds->pebs_buffer_base; - -	n = top - at; -	if (n <= 0) -		return; - -	/* -	 * Should not happen, we program the threshold at 1 and do not -	 * set a reset value. -	 */ -	WARN_ONCE(n > x86_pmu.max_pebs_events, -		  "Unexpected number of pebs records %d\n", n); - -	return __intel_pmu_drain_pebs_nhm(iregs, at, top); -} - -static void intel_pmu_drain_pebs_hsw(struct pt_regs *iregs) -{ -	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); -	struct debug_store *ds = cpuc->ds; -	struct pebs_record_hsw *at, *top; -	int n; - -	if (!x86_pmu.pebs_active) -		return; - -	at  = (struct pebs_record_hsw *)(unsigned long)ds->pebs_buffer_base; -	top = (struct pebs_record_hsw *)(unsigned long)ds->pebs_index; - -	n = top - at; -	if (n <= 0) -		return; -	/* -	 * Should not happen, we program the threshold at 1 and do not -	 * set a reset value. 
-	 */ -	WARN_ONCE(n > x86_pmu.max_pebs_events, -		  "Unexpected number of pebs records %d\n", n); - -	return __intel_pmu_drain_pebs_nhm(iregs, at, top); -} -  /*   * BTS, PEBS probe and setup   */ @@ -1040,7 +1085,7 @@ void intel_ds_init(void)  		case 2:  			pr_cont("PEBS fmt2%c, ", pebs_type);  			x86_pmu.pebs_record_size = sizeof(struct pebs_record_hsw); -			x86_pmu.drain_pebs = intel_pmu_drain_pebs_hsw; +			x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;  			break;  		default: diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index d5be06a5005..9dd2459a4c7 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -284,6 +284,7 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)  	int lbr_format = x86_pmu.intel_cap.lbr_format;  	u64 tos = intel_pmu_lbr_tos();  	int i; +	int out = 0;  	for (i = 0; i < x86_pmu.lbr_nr; i++) {  		unsigned long lbr_idx = (tos - i) & mask; @@ -306,15 +307,27 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)  		}  		from = (u64)((((s64)from) << skip) >> skip); -		cpuc->lbr_entries[i].from	= from; -		cpuc->lbr_entries[i].to		= to; -		cpuc->lbr_entries[i].mispred	= mis; -		cpuc->lbr_entries[i].predicted	= pred; -		cpuc->lbr_entries[i].in_tx	= in_tx; -		cpuc->lbr_entries[i].abort	= abort; -		cpuc->lbr_entries[i].reserved	= 0; +		/* +		 * Some CPUs report duplicated abort records, +		 * with the second entry not having an abort bit set. +		 * Skip them here. This loop runs backwards, +		 * so we need to undo the previous record. +		 * If the abort just happened outside the window +		 * the extra entry cannot be removed. +		 */ +		if (abort && x86_pmu.lbr_double_abort && out > 0) +			out--; + +		cpuc->lbr_entries[out].from	 = from; +		cpuc->lbr_entries[out].to	 = to; +		cpuc->lbr_entries[out].mispred	 = mis; +		cpuc->lbr_entries[out].predicted = pred; +		cpuc->lbr_entries[out].in_tx	 = in_tx; +		cpuc->lbr_entries[out].abort	 = abort; +		cpuc->lbr_entries[out].reserved	 = 0; +		out++;  	} -	cpuc->lbr_stack.nr = i; +	cpuc->lbr_stack.nr = out;  }  void intel_pmu_lbr_read(void) @@ -371,6 +384,9 @@ static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event)  	if (br_type & PERF_SAMPLE_BRANCH_NO_TX)  		mask |= X86_BR_NO_TX; +	if (br_type & PERF_SAMPLE_BRANCH_COND) +		mask |= X86_BR_JCC; +  	/*  	 * stash actual user request into reg, it may  	 * be used by fixup code for some CPU @@ -478,7 +494,7 @@ static int branch_type(unsigned long from, unsigned long to, int abort)  		/* may fail if text not present */  		bytes = copy_from_user_nmi(buf, (void __user *)from, size); -		if (bytes != size) +		if (bytes != 0)  			return X86_BR_NONE;  		addr = buf; @@ -665,6 +681,7 @@ static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = {  	 * NHM/WSM erratum: must include IND_JMP to capture IND_CALL  	 */  	[PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL | LBR_IND_JMP, +	[PERF_SAMPLE_BRANCH_COND]     = LBR_JCC,  };  static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = { @@ -676,6 +693,7 @@ static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = {  	[PERF_SAMPLE_BRANCH_ANY_CALL]	= LBR_REL_CALL | LBR_IND_CALL  					| LBR_FAR,  	[PERF_SAMPLE_BRANCH_IND_CALL]	= LBR_IND_CALL, +	[PERF_SAMPLE_BRANCH_COND]       = LBR_JCC,  };  /* core */ diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c new file mode 100644 index 00000000000..619f7699487 --- /dev/null +++ 
b/arch/x86/kernel/cpu/perf_event_intel_rapl.c @@ -0,0 +1,714 @@ +/* + * perf_event_intel_rapl.c: support Intel RAPL energy consumption counters + * Copyright (C) 2013 Google, Inc., Stephane Eranian + * + * Intel RAPL interface is specified in the IA-32 Manual Vol3b + * section 14.7.1 (September 2013) + * + * RAPL provides more controls than just reporting energy consumption + * however here we only expose the 3 energy consumption free running + * counters (pp0, pkg, dram). + * + * Each of those counters increments in a power unit defined by the + * RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules + * but it can vary. + * + * Counter to rapl events mappings: + * + *  pp0 counter: consumption of all physical cores (power plane 0) + * 	  event: rapl_energy_cores + *    perf code: 0x1 + * + *  pkg counter: consumption of the whole processor package + *	  event: rapl_energy_pkg + *    perf code: 0x2 + * + * dram counter: consumption of the dram domain (servers only) + *	  event: rapl_energy_dram + *    perf code: 0x3 + * + * dram counter: consumption of the builtin-gpu domain (client only) + *	  event: rapl_energy_gpu + *    perf code: 0x4 + * + * We manage those counters as free running (read-only). They may be + * use simultaneously by other tools, such as turbostat. + * + * The events only support system-wide mode counting. There is no + * sampling support because it does not make sense and is not + * supported by the RAPL hardware. + * + * Because we want to avoid floating-point operations in the kernel, + * the events are all reported in fixed point arithmetic (32.32). + * Tools must adjust the counts to convert them to Watts using + * the duration of the measurement. Tools may use a function such as + * ldexp(raw_count, -32); + */ +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/perf_event.h> +#include <asm/cpu_device_id.h> +#include "perf_event.h" + +/* + * RAPL energy status counters + */ +#define RAPL_IDX_PP0_NRG_STAT	0	/* all cores */ +#define INTEL_RAPL_PP0		0x1	/* pseudo-encoding */ +#define RAPL_IDX_PKG_NRG_STAT	1	/* entire package */ +#define INTEL_RAPL_PKG		0x2	/* pseudo-encoding */ +#define RAPL_IDX_RAM_NRG_STAT	2	/* DRAM */ +#define INTEL_RAPL_RAM		0x3	/* pseudo-encoding */ +#define RAPL_IDX_PP1_NRG_STAT	3	/* gpu */ +#define INTEL_RAPL_PP1		0x4	/* pseudo-encoding */ + +/* Clients have PP0, PKG */ +#define RAPL_IDX_CLN	(1<<RAPL_IDX_PP0_NRG_STAT|\ +			 1<<RAPL_IDX_PKG_NRG_STAT|\ +			 1<<RAPL_IDX_PP1_NRG_STAT) + +/* Servers have PP0, PKG, RAM */ +#define RAPL_IDX_SRV	(1<<RAPL_IDX_PP0_NRG_STAT|\ +			 1<<RAPL_IDX_PKG_NRG_STAT|\ +			 1<<RAPL_IDX_RAM_NRG_STAT) + +/* Servers have PP0, PKG, RAM, PP1 */ +#define RAPL_IDX_HSW	(1<<RAPL_IDX_PP0_NRG_STAT|\ +			 1<<RAPL_IDX_PKG_NRG_STAT|\ +			 1<<RAPL_IDX_RAM_NRG_STAT|\ +			 1<<RAPL_IDX_PP1_NRG_STAT) + +/* + * event code: LSB 8 bits, passed in attr->config + * any other bit is reserved + */ +#define RAPL_EVENT_MASK	0xFFULL + +#define DEFINE_RAPL_FORMAT_ATTR(_var, _name, _format)		\ +static ssize_t __rapl_##_var##_show(struct kobject *kobj,	\ +				struct kobj_attribute *attr,	\ +				char *page)			\ +{								\ +	BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE);		\ +	return sprintf(page, _format "\n");			\ +}								\ +static struct kobj_attribute format_attr_##_var =		\ +	__ATTR(_name, 0444, __rapl_##_var##_show, NULL) + +#define RAPL_EVENT_DESC(_name, _config)				\ +{								\ +	.attr	= __ATTR(_name, 0444, rapl_event_show, NULL),	\ +	.config	= _config,					\ +} + +#define RAPL_CNTR_WIDTH 32 /* 32-bit rapl 
counters */ + +struct rapl_pmu { +	spinlock_t	 lock; +	int		 hw_unit;  /* 1/2^hw_unit Joule */ +	int		 n_active; /* number of active events */ +	struct list_head active_list; +	struct pmu	 *pmu; /* pointer to rapl_pmu_class */ +	ktime_t		 timer_interval; /* in ktime_t unit */ +	struct hrtimer   hrtimer; +}; + +static struct pmu rapl_pmu_class; +static cpumask_t rapl_cpu_mask; +static int rapl_cntr_mask; + +static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu); +static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu_to_free); + +static inline u64 rapl_read_counter(struct perf_event *event) +{ +	u64 raw; +	rdmsrl(event->hw.event_base, raw); +	return raw; +} + +static inline u64 rapl_scale(u64 v) +{ +	/* +	 * scale delta to smallest unit (1/2^32) +	 * users must then scale back: count * 1/(1e9*2^32) to get Joules +	 * or use ldexp(count, -32). +	 * Watts = Joules/Time delta +	 */ +	return v << (32 - __get_cpu_var(rapl_pmu)->hw_unit); +} + +static u64 rapl_event_update(struct perf_event *event) +{ +	struct hw_perf_event *hwc = &event->hw; +	u64 prev_raw_count, new_raw_count; +	s64 delta, sdelta; +	int shift = RAPL_CNTR_WIDTH; + +again: +	prev_raw_count = local64_read(&hwc->prev_count); +	rdmsrl(event->hw.event_base, new_raw_count); + +	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, +			    new_raw_count) != prev_raw_count) { +		cpu_relax(); +		goto again; +	} + +	/* +	 * Now we have the new raw value and have updated the prev +	 * timestamp already. We can now calculate the elapsed delta +	 * (event-)time and add that to the generic event. +	 * +	 * Careful, not all hw sign-extends above the physical width +	 * of the count. +	 */ +	delta = (new_raw_count << shift) - (prev_raw_count << shift); +	delta >>= shift; + +	sdelta = rapl_scale(delta); + +	local64_add(sdelta, &event->count); + +	return new_raw_count; +} + +static void rapl_start_hrtimer(struct rapl_pmu *pmu) +{ +	__hrtimer_start_range_ns(&pmu->hrtimer, +			pmu->timer_interval, 0, +			HRTIMER_MODE_REL_PINNED, 0); +} + +static void rapl_stop_hrtimer(struct rapl_pmu *pmu) +{ +	hrtimer_cancel(&pmu->hrtimer); +} + +static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer) +{ +	struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu); +	struct perf_event *event; +	unsigned long flags; + +	if (!pmu->n_active) +		return HRTIMER_NORESTART; + +	spin_lock_irqsave(&pmu->lock, flags); + +	list_for_each_entry(event, &pmu->active_list, active_entry) { +		rapl_event_update(event); +	} + +	spin_unlock_irqrestore(&pmu->lock, flags); + +	hrtimer_forward_now(hrtimer, pmu->timer_interval); + +	return HRTIMER_RESTART; +} + +static void rapl_hrtimer_init(struct rapl_pmu *pmu) +{ +	struct hrtimer *hr = &pmu->hrtimer; + +	hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL); +	hr->function = rapl_hrtimer_handle; +} + +static void __rapl_pmu_event_start(struct rapl_pmu *pmu, +				   struct perf_event *event) +{ +	if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) +		return; + +	event->hw.state = 0; + +	list_add_tail(&event->active_entry, &pmu->active_list); + +	local64_set(&event->hw.prev_count, rapl_read_counter(event)); + +	pmu->n_active++; +	if (pmu->n_active == 1) +		rapl_start_hrtimer(pmu); +} + +static void rapl_pmu_event_start(struct perf_event *event, int mode) +{ +	struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu); +	unsigned long flags; + +	spin_lock_irqsave(&pmu->lock, flags); +	__rapl_pmu_event_start(pmu, event); +	spin_unlock_irqrestore(&pmu->lock, flags); +} + +static void rapl_pmu_event_stop(struct perf_event *event, int mode) +{ +	struct 
rapl_pmu *pmu = __get_cpu_var(rapl_pmu); +	struct hw_perf_event *hwc = &event->hw; +	unsigned long flags; + +	spin_lock_irqsave(&pmu->lock, flags); + +	/* mark event as deactivated and stopped */ +	if (!(hwc->state & PERF_HES_STOPPED)) { +		WARN_ON_ONCE(pmu->n_active <= 0); +		pmu->n_active--; +		if (pmu->n_active == 0) +			rapl_stop_hrtimer(pmu); + +		list_del(&event->active_entry); + +		WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); +		hwc->state |= PERF_HES_STOPPED; +	} + +	/* check if update of sw counter is necessary */ +	if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { +		/* +		 * Drain the remaining delta count out of a event +		 * that we are disabling: +		 */ +		rapl_event_update(event); +		hwc->state |= PERF_HES_UPTODATE; +	} + +	spin_unlock_irqrestore(&pmu->lock, flags); +} + +static int rapl_pmu_event_add(struct perf_event *event, int mode) +{ +	struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu); +	struct hw_perf_event *hwc = &event->hw; +	unsigned long flags; + +	spin_lock_irqsave(&pmu->lock, flags); + +	hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + +	if (mode & PERF_EF_START) +		__rapl_pmu_event_start(pmu, event); + +	spin_unlock_irqrestore(&pmu->lock, flags); + +	return 0; +} + +static void rapl_pmu_event_del(struct perf_event *event, int flags) +{ +	rapl_pmu_event_stop(event, PERF_EF_UPDATE); +} + +static int rapl_pmu_event_init(struct perf_event *event) +{ +	u64 cfg = event->attr.config & RAPL_EVENT_MASK; +	int bit, msr, ret = 0; + +	/* only look at RAPL events */ +	if (event->attr.type != rapl_pmu_class.type) +		return -ENOENT; + +	/* check only supported bits are set */ +	if (event->attr.config & ~RAPL_EVENT_MASK) +		return -EINVAL; + +	/* +	 * check event is known (determines counter) +	 */ +	switch (cfg) { +	case INTEL_RAPL_PP0: +		bit = RAPL_IDX_PP0_NRG_STAT; +		msr = MSR_PP0_ENERGY_STATUS; +		break; +	case INTEL_RAPL_PKG: +		bit = RAPL_IDX_PKG_NRG_STAT; +		msr = MSR_PKG_ENERGY_STATUS; +		break; +	case INTEL_RAPL_RAM: +		bit = RAPL_IDX_RAM_NRG_STAT; +		msr = MSR_DRAM_ENERGY_STATUS; +		break; +	case INTEL_RAPL_PP1: +		bit = RAPL_IDX_PP1_NRG_STAT; +		msr = MSR_PP1_ENERGY_STATUS; +		break; +	default: +		return -EINVAL; +	} +	/* check event supported */ +	if (!(rapl_cntr_mask & (1 << bit))) +		return -EINVAL; + +	/* unsupported modes and filters */ +	if (event->attr.exclude_user   || +	    event->attr.exclude_kernel || +	    event->attr.exclude_hv     || +	    event->attr.exclude_idle   || +	    event->attr.exclude_host   || +	    event->attr.exclude_guest  || +	    event->attr.sample_period) /* no sampling */ +		return -EINVAL; + +	/* must be done before validate_group */ +	event->hw.event_base = msr; +	event->hw.config = cfg; +	event->hw.idx = bit; + +	return ret; +} + +static void rapl_pmu_event_read(struct perf_event *event) +{ +	rapl_event_update(event); +} + +static ssize_t rapl_get_attr_cpumask(struct device *dev, +				struct device_attribute *attr, char *buf) +{ +	int n = cpulist_scnprintf(buf, PAGE_SIZE - 2, &rapl_cpu_mask); + +	buf[n++] = '\n'; +	buf[n] = '\0'; +	return n; +} + +static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL); + +static struct attribute *rapl_pmu_attrs[] = { +	&dev_attr_cpumask.attr, +	NULL, +}; + +static struct attribute_group rapl_pmu_attr_group = { +	.attrs = rapl_pmu_attrs, +}; + +EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01"); +EVENT_ATTR_STR(energy-pkg  ,   rapl_pkg, "event=0x02"); +EVENT_ATTR_STR(energy-ram  ,   rapl_ram, "event=0x03"); +EVENT_ATTR_STR(energy-gpu  ,   rapl_gpu, "event=0x04"); 
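
Note (not part of the patch): the .scale strings defined just below are 2.3283064365386962890625e-10, i.e. 2^-32, which matches the shift in rapl_scale() above — every delta is normalized to 1/2^32 Joule units so tools never need to read RAPL_POWER_UNIT themselves. A minimal user-space sketch of the conversion, using only the ldexp() approach suggested in the file's header comment (the helper names are illustrative):

	#include <math.h>
	#include <stdint.h>

	/* A count read from the "power" PMU is in units of 2^-32 Joules. */
	static double rapl_count_to_joules(uint64_t count)
	{
		return ldexp((double)count, -32);
	}

	/* Average power over a measurement interval follows directly. */
	static double rapl_watts(uint64_t count, double seconds)
	{
		return rapl_count_to_joules(count) / seconds;
	}
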
+ +EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules"); +EVENT_ATTR_STR(energy-pkg.unit  ,   rapl_pkg_unit, "Joules"); +EVENT_ATTR_STR(energy-ram.unit  ,   rapl_ram_unit, "Joules"); +EVENT_ATTR_STR(energy-gpu.unit  ,   rapl_gpu_unit, "Joules"); + +/* + * we compute in 0.23 nJ increments regardless of MSR + */ +EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10"); +EVENT_ATTR_STR(energy-pkg.scale,     rapl_pkg_scale, "2.3283064365386962890625e-10"); +EVENT_ATTR_STR(energy-ram.scale,     rapl_ram_scale, "2.3283064365386962890625e-10"); +EVENT_ATTR_STR(energy-gpu.scale,     rapl_gpu_scale, "2.3283064365386962890625e-10"); + +static struct attribute *rapl_events_srv_attr[] = { +	EVENT_PTR(rapl_cores), +	EVENT_PTR(rapl_pkg), +	EVENT_PTR(rapl_ram), + +	EVENT_PTR(rapl_cores_unit), +	EVENT_PTR(rapl_pkg_unit), +	EVENT_PTR(rapl_ram_unit), + +	EVENT_PTR(rapl_cores_scale), +	EVENT_PTR(rapl_pkg_scale), +	EVENT_PTR(rapl_ram_scale), +	NULL, +}; + +static struct attribute *rapl_events_cln_attr[] = { +	EVENT_PTR(rapl_cores), +	EVENT_PTR(rapl_pkg), +	EVENT_PTR(rapl_gpu), + +	EVENT_PTR(rapl_cores_unit), +	EVENT_PTR(rapl_pkg_unit), +	EVENT_PTR(rapl_gpu_unit), + +	EVENT_PTR(rapl_cores_scale), +	EVENT_PTR(rapl_pkg_scale), +	EVENT_PTR(rapl_gpu_scale), +	NULL, +}; + +static struct attribute *rapl_events_hsw_attr[] = { +	EVENT_PTR(rapl_cores), +	EVENT_PTR(rapl_pkg), +	EVENT_PTR(rapl_gpu), +	EVENT_PTR(rapl_ram), + +	EVENT_PTR(rapl_cores_unit), +	EVENT_PTR(rapl_pkg_unit), +	EVENT_PTR(rapl_gpu_unit), +	EVENT_PTR(rapl_ram_unit), + +	EVENT_PTR(rapl_cores_scale), +	EVENT_PTR(rapl_pkg_scale), +	EVENT_PTR(rapl_gpu_scale), +	EVENT_PTR(rapl_ram_scale), +	NULL, +}; + +static struct attribute_group rapl_pmu_events_group = { +	.name = "events", +	.attrs = NULL, /* patched at runtime */ +}; + +DEFINE_RAPL_FORMAT_ATTR(event, event, "config:0-7"); +static struct attribute *rapl_formats_attr[] = { +	&format_attr_event.attr, +	NULL, +}; + +static struct attribute_group rapl_pmu_format_group = { +	.name = "format", +	.attrs = rapl_formats_attr, +}; + +const struct attribute_group *rapl_attr_groups[] = { +	&rapl_pmu_attr_group, +	&rapl_pmu_format_group, +	&rapl_pmu_events_group, +	NULL, +}; + +static struct pmu rapl_pmu_class = { +	.attr_groups	= rapl_attr_groups, +	.task_ctx_nr	= perf_invalid_context, /* system-wide only */ +	.event_init	= rapl_pmu_event_init, +	.add		= rapl_pmu_event_add, /* must have */ +	.del		= rapl_pmu_event_del, /* must have */ +	.start		= rapl_pmu_event_start, +	.stop		= rapl_pmu_event_stop, +	.read		= rapl_pmu_event_read, +}; + +static void rapl_cpu_exit(int cpu) +{ +	struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); +	int i, phys_id = topology_physical_package_id(cpu); +	int target = -1; + +	/* find a new cpu on same package */ +	for_each_online_cpu(i) { +		if (i == cpu) +			continue; +		if (phys_id == topology_physical_package_id(i)) { +			target = i; +			break; +		} +	} +	/* +	 * clear cpu from cpumask +	 * if was set in cpumask and still some cpu on package, +	 * then move to new cpu +	 */ +	if (cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask) && target >= 0) +		cpumask_set_cpu(target, &rapl_cpu_mask); + +	WARN_ON(cpumask_empty(&rapl_cpu_mask)); +	/* +	 * migrate events and context to new cpu +	 */ +	if (target >= 0) +		perf_pmu_migrate_context(pmu->pmu, cpu, target); + +	/* cancel overflow polling timer for CPU */ +	rapl_stop_hrtimer(pmu); +} + +static void rapl_cpu_init(int cpu) +{ +	int i, phys_id = topology_physical_package_id(cpu); + +	/* check if phys_is is 
already covered */ +	for_each_cpu(i, &rapl_cpu_mask) { +		if (phys_id == topology_physical_package_id(i)) +			return; +	} +	/* was not found, so add it */ +	cpumask_set_cpu(cpu, &rapl_cpu_mask); +} + +static int rapl_cpu_prepare(int cpu) +{ +	struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); +	int phys_id = topology_physical_package_id(cpu); +	u64 ms; +	u64 msr_rapl_power_unit_bits; + +	if (pmu) +		return 0; + +	if (phys_id < 0) +		return -1; + +	/* protect rdmsrl() to handle virtualization */ +	if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits)) +		return -1; + +	pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu)); +	if (!pmu) +		return -1; + +	spin_lock_init(&pmu->lock); + +	INIT_LIST_HEAD(&pmu->active_list); + +	/* +	 * grab power unit as: 1/2^unit Joules +	 * +	 * we cache in local PMU instance +	 */ +	pmu->hw_unit = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; +	pmu->pmu = &rapl_pmu_class; + +	/* +	 * use reference of 200W for scaling the timeout +	 * to avoid missing counter overflows. +	 * 200W = 200 Joules/sec +	 * divide interval by 2 to avoid lockstep (2 * 100) +	 * if hw unit is 32, then we use 2 ms 1/200/2 +	 */ +	if (pmu->hw_unit < 32) +		ms = (1000 / (2 * 100)) * (1ULL << (32 - pmu->hw_unit - 1)); +	else +		ms = 2; + +	pmu->timer_interval = ms_to_ktime(ms); + +	rapl_hrtimer_init(pmu); + +	/* set RAPL pmu for this cpu for now */ +	per_cpu(rapl_pmu, cpu) = pmu; +	per_cpu(rapl_pmu_to_free, cpu) = NULL; + +	return 0; +} + +static void rapl_cpu_kfree(int cpu) +{ +	struct rapl_pmu *pmu = per_cpu(rapl_pmu_to_free, cpu); + +	kfree(pmu); + +	per_cpu(rapl_pmu_to_free, cpu) = NULL; +} + +static int rapl_cpu_dying(int cpu) +{ +	struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); + +	if (!pmu) +		return 0; + +	per_cpu(rapl_pmu, cpu) = NULL; + +	per_cpu(rapl_pmu_to_free, cpu) = pmu; + +	return 0; +} + +static int rapl_cpu_notifier(struct notifier_block *self, +			     unsigned long action, void *hcpu) +{ +	unsigned int cpu = (long)hcpu; + +	switch (action & ~CPU_TASKS_FROZEN) { +	case CPU_UP_PREPARE: +		rapl_cpu_prepare(cpu); +		break; +	case CPU_STARTING: +		rapl_cpu_init(cpu); +		break; +	case CPU_UP_CANCELED: +	case CPU_DYING: +		rapl_cpu_dying(cpu); +		break; +	case CPU_ONLINE: +	case CPU_DEAD: +		rapl_cpu_kfree(cpu); +		break; +	case CPU_DOWN_PREPARE: +		rapl_cpu_exit(cpu); +		break; +	default: +		break; +	} + +	return NOTIFY_OK; +} + +static const struct x86_cpu_id rapl_cpu_match[] = { +	[0] = { .vendor = X86_VENDOR_INTEL, .family = 6 }, +	[1] = {}, +}; + +static int __init rapl_pmu_init(void) +{ +	struct rapl_pmu *pmu; +	int cpu, ret; + +	/* +	 * check for Intel processor family 6 +	 */ +	if (!x86_match_cpu(rapl_cpu_match)) +		return 0; + +	/* check supported CPU */ +	switch (boot_cpu_data.x86_model) { +	case 42: /* Sandy Bridge */ +	case 58: /* Ivy Bridge */ +		rapl_cntr_mask = RAPL_IDX_CLN; +		rapl_pmu_events_group.attrs = rapl_events_cln_attr; +		break; +	case 60: /* Haswell */ +	case 69: /* Haswell-Celeron */ +		rapl_cntr_mask = RAPL_IDX_HSW; +		rapl_pmu_events_group.attrs = rapl_events_hsw_attr; +		break; +	case 45: /* Sandy Bridge-EP */ +	case 62: /* IvyTown */ +		rapl_cntr_mask = RAPL_IDX_SRV; +		rapl_pmu_events_group.attrs = rapl_events_srv_attr; +		break; + +	default: +		/* unsupported */ +		return 0; +	} + +	cpu_notifier_register_begin(); + +	for_each_online_cpu(cpu) { +		ret = rapl_cpu_prepare(cpu); +		if (ret) +			goto out; +		rapl_cpu_init(cpu); +	} + +	__perf_cpu_notifier(rapl_cpu_notifier); + +	ret = perf_pmu_register(&rapl_pmu_class, "power", -1); +	if 
(WARN_ON(ret)) { +		pr_info("RAPL PMU detected, registration failed (%d), RAPL PMU disabled\n", ret); +		cpu_notifier_register_done(); +		return -1; +	} + +	pmu = __get_cpu_var(rapl_pmu); + +	pr_info("RAPL PMU detected, hw unit 2^-%d Joules," +		" API unit is 2^-32 Joules," +		" %d fixed counters" +		" %llu ms ovfl timer\n", +		pmu->hw_unit, +		hweight32(rapl_cntr_mask), +		ktime_to_ms(pmu->timer_interval)); + +out: +	cpu_notifier_register_done(); + +	return 0; +} +device_initcall(rapl_pmu_init); diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index 4118f9f6831..ae6552a0701 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -66,6 +66,47 @@ DEFINE_UNCORE_FORMAT_ATTR(mask_vnw, mask_vnw, "config2:3-4");  DEFINE_UNCORE_FORMAT_ATTR(mask0, mask0, "config2:0-31");  DEFINE_UNCORE_FORMAT_ATTR(mask1, mask1, "config2:32-63"); +static void uncore_pmu_start_hrtimer(struct intel_uncore_box *box); +static void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box); +static void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event); +static void uncore_pmu_event_read(struct perf_event *event); + +static struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event) +{ +	return container_of(event->pmu, struct intel_uncore_pmu, pmu); +} + +static struct intel_uncore_box * +uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu) +{ +	struct intel_uncore_box *box; + +	box = *per_cpu_ptr(pmu->box, cpu); +	if (box) +		return box; + +	raw_spin_lock(&uncore_box_lock); +	list_for_each_entry(box, &pmu->box_list, list) { +		if (box->phys_id == topology_physical_package_id(cpu)) { +			atomic_inc(&box->refcnt); +			*per_cpu_ptr(pmu->box, cpu) = box; +			break; +		} +	} +	raw_spin_unlock(&uncore_box_lock); + +	return *per_cpu_ptr(pmu->box, cpu); +} + +static struct intel_uncore_box *uncore_event_to_box(struct perf_event *event) +{ +	/* +	 * perf core schedules event on the basis of cpu, uncore events are +	 * collected by one of the cpus inside a physical package. 
+	 */ +	return uncore_pmu_to_box(uncore_event_to_pmu(event), smp_processor_id()); +} +  static u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event)  {  	u64 count; @@ -501,21 +542,24 @@ static struct extra_reg snbep_uncore_cbox_extra_regs[] = {  	SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,  				  SNBEP_CBO_PMON_CTL_TID_EN, 0x1),  	SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4), +	SNBEP_CBO_EVENT_EXTRA_REG(0x4334, 0xffff, 0x6),  	SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4), +	SNBEP_CBO_EVENT_EXTRA_REG(0x4534, 0xffff, 0x6),  	SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4), +	SNBEP_CBO_EVENT_EXTRA_REG(0x4934, 0xffff, 0x6),  	SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0x6),  	SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x8),  	SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x8), -	SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0xc), -	SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0xc), +	SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0xa), +	SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0xa),  	SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x2),  	SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x2),  	SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x2),  	SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x2),  	SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x8),  	SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x8), -	SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0xc), -	SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0xc), +	SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0xa), +	SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0xa),  	SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x2),  	SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x2),  	SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x2), @@ -997,6 +1041,20 @@ static int snbep_pci2phy_map_init(int devid)  		}  	} +	if (!err) { +		/* +		 * For PCI bus with no UBOX device, find the next bus +		 * that has UBOX device and use its mapping. 
+		 */ +		i = -1; +		for (bus = 255; bus >= 0; bus--) { +			if (pcibus_to_physid[bus] >= 0) +				i = pcibus_to_physid[bus]; +			else +				pcibus_to_physid[bus] = i; +		} +	} +  	if (ubox_dev)  		pci_dev_put(ubox_dev); @@ -1099,6 +1157,24 @@ static struct attribute *ivt_uncore_qpi_formats_attr[] = {  	&format_attr_umask.attr,  	&format_attr_edge.attr,  	&format_attr_thresh8.attr, +	&format_attr_match_rds.attr, +	&format_attr_match_rnid30.attr, +	&format_attr_match_rnid4.attr, +	&format_attr_match_dnid.attr, +	&format_attr_match_mc.attr, +	&format_attr_match_opc.attr, +	&format_attr_match_vnw.attr, +	&format_attr_match0.attr, +	&format_attr_match1.attr, +	&format_attr_mask_rds.attr, +	&format_attr_mask_rnid30.attr, +	&format_attr_mask_rnid4.attr, +	&format_attr_mask_dnid.attr, +	&format_attr_mask_mc.attr, +	&format_attr_mask_opc.attr, +	&format_attr_mask_vnw.attr, +	&format_attr_mask0.attr, +	&format_attr_mask1.attr,  	NULL,  }; @@ -1146,10 +1222,16 @@ static struct extra_reg ivt_uncore_cbox_extra_regs[] = {  	SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,  				  SNBEP_CBO_PMON_CTL_TID_EN, 0x1),  	SNBEP_CBO_EVENT_EXTRA_REG(0x1031, 0x10ff, 0x2), + +	SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4), +	SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0xc), +	SNBEP_CBO_EVENT_EXTRA_REG(0x5134, 0xffff, 0xc),  	SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4), +	SNBEP_CBO_EVENT_EXTRA_REG(0x4334, 0xffff, 0xc),  	SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4), +	SNBEP_CBO_EVENT_EXTRA_REG(0x4534, 0xffff, 0xc),  	SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4), -	SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0xc), +	SNBEP_CBO_EVENT_EXTRA_REG(0x4934, 0xffff, 0xc),  	SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x10),  	SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x10),  	SNBEP_CBO_EVENT_EXTRA_REG(0x2135, 0xffff, 0x10), @@ -1164,7 +1246,7 @@ static struct extra_reg ivt_uncore_cbox_extra_regs[] = {  	SNBEP_CBO_EVENT_EXTRA_REG(0x8335, 0xffff, 0x10),  	SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x10),  	SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x10), -	SNBEP_CBO_EVENT_EXTRA_REG(0x2336, 0xffff, 0x10), +	SNBEP_CBO_EVENT_EXTRA_REG(0x2136, 0xffff, 0x10),  	SNBEP_CBO_EVENT_EXTRA_REG(0x2336, 0xffff, 0x10),  	SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0x18),  	SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0x18), @@ -1312,17 +1394,83 @@ static struct intel_uncore_type ivt_uncore_imc = {  	IVT_UNCORE_PCI_COMMON_INIT(),  }; +/* registers in IRP boxes are not properly aligned */ +static unsigned ivt_uncore_irp_ctls[] = {0xd8, 0xdc, 0xe0, 0xe4}; +static unsigned ivt_uncore_irp_ctrs[] = {0xa0, 0xb0, 0xb8, 0xc0}; + +static void ivt_uncore_irp_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ +	struct pci_dev *pdev = box->pci_dev; +	struct hw_perf_event *hwc = &event->hw; + +	pci_write_config_dword(pdev, ivt_uncore_irp_ctls[hwc->idx], +			       hwc->config | SNBEP_PMON_CTL_EN); +} + +static void ivt_uncore_irp_disable_event(struct intel_uncore_box *box, struct perf_event *event) +{ +	struct pci_dev *pdev = box->pci_dev; +	struct hw_perf_event *hwc = &event->hw; + +	pci_write_config_dword(pdev, ivt_uncore_irp_ctls[hwc->idx], hwc->config); +} + +static u64 ivt_uncore_irp_read_counter(struct intel_uncore_box *box, struct perf_event *event) +{ +	struct pci_dev *pdev = box->pci_dev; +	struct hw_perf_event *hwc = &event->hw; +	u64 count = 0; + +	pci_read_config_dword(pdev, ivt_uncore_irp_ctrs[hwc->idx], (u32 *)&count); +	pci_read_config_dword(pdev, ivt_uncore_irp_ctrs[hwc->idx] + 4, (u32 *)&count + 1); + +	return count; +} + 
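
Note (not part of the patch): the IRP and QPI uncore types declared next advertise .perf_ctr_bits = 48, so counter deltas have to ignore the unimplemented high bits. It is the same shift trick rapl_event_update() uses earlier in this diff; a stand-alone sketch with an illustrative helper name:

	#include <stdint.h>

	/*
	 * Wrap-safe delta for an N-bit free-running counter (N = 48 for the
	 * IRP/QPI boxes below). Shifting both samples up by 64 - N discards
	 * the unimplemented bits, the 64-bit subtraction then wraps for free,
	 * and shifting back down yields the true increment modulo 2^N.
	 */
	static uint64_t counter_delta(uint64_t prev, uint64_t now, int width)
	{
		int shift = 64 - width;

		return ((now << shift) - (prev << shift)) >> shift;
	}

With width = 48, a counter that wrapped from 0xffffffffffff to 0x1 gives a delta of 2, which a plain 64-bit subtraction would get wrong.
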
+static struct intel_uncore_ops ivt_uncore_irp_ops = { +	.init_box	= ivt_uncore_pci_init_box, +	.disable_box	= snbep_uncore_pci_disable_box, +	.enable_box	= snbep_uncore_pci_enable_box, +	.disable_event	= ivt_uncore_irp_disable_event, +	.enable_event	= ivt_uncore_irp_enable_event, +	.read_counter	= ivt_uncore_irp_read_counter, +}; + +static struct intel_uncore_type ivt_uncore_irp = { +	.name			= "irp", +	.num_counters		= 4, +	.num_boxes		= 1, +	.perf_ctr_bits		= 48, +	.event_mask		= IVT_PMON_RAW_EVENT_MASK, +	.box_ctl		= SNBEP_PCI_PMON_BOX_CTL, +	.ops			= &ivt_uncore_irp_ops, +	.format_group		= &ivt_uncore_format_group, +}; + +static struct intel_uncore_ops ivt_uncore_qpi_ops = { +	.init_box	= ivt_uncore_pci_init_box, +	.disable_box	= snbep_uncore_pci_disable_box, +	.enable_box	= snbep_uncore_pci_enable_box, +	.disable_event	= snbep_uncore_pci_disable_event, +	.enable_event	= snbep_qpi_enable_event, +	.read_counter	= snbep_uncore_pci_read_counter, +	.hw_config	= snbep_qpi_hw_config, +	.get_constraint	= uncore_get_constraint, +	.put_constraint	= uncore_put_constraint, +}; +  static struct intel_uncore_type ivt_uncore_qpi = { -	.name		= "qpi", -	.num_counters   = 4, -	.num_boxes	= 3, -	.perf_ctr_bits	= 48, -	.perf_ctr	= SNBEP_PCI_PMON_CTR0, -	.event_ctl	= SNBEP_PCI_PMON_CTL0, -	.event_mask	= IVT_QPI_PCI_PMON_RAW_EVENT_MASK, -	.box_ctl	= SNBEP_PCI_PMON_BOX_CTL, -	.ops		= &ivt_uncore_pci_ops, -	.format_group	= &ivt_uncore_qpi_format_group, +	.name			= "qpi", +	.num_counters		= 4, +	.num_boxes		= 3, +	.perf_ctr_bits		= 48, +	.perf_ctr		= SNBEP_PCI_PMON_CTR0, +	.event_ctl		= SNBEP_PCI_PMON_CTL0, +	.event_mask		= IVT_QPI_PCI_PMON_RAW_EVENT_MASK, +	.box_ctl		= SNBEP_PCI_PMON_BOX_CTL, +	.num_shared_regs	= 1, +	.ops			= &ivt_uncore_qpi_ops, +	.format_group		= &ivt_uncore_qpi_format_group,  };  static struct intel_uncore_type ivt_uncore_r2pcie = { @@ -1346,6 +1494,7 @@ static struct intel_uncore_type ivt_uncore_r3qpi = {  enum {  	IVT_PCI_UNCORE_HA,  	IVT_PCI_UNCORE_IMC, +	IVT_PCI_UNCORE_IRP,  	IVT_PCI_UNCORE_QPI,  	IVT_PCI_UNCORE_R2PCIE,  	IVT_PCI_UNCORE_R3QPI, @@ -1354,6 +1503,7 @@ enum {  static struct intel_uncore_type *ivt_pci_uncores[] = {  	[IVT_PCI_UNCORE_HA]	= &ivt_uncore_ha,  	[IVT_PCI_UNCORE_IMC]	= &ivt_uncore_imc, +	[IVT_PCI_UNCORE_IRP]	= &ivt_uncore_irp,  	[IVT_PCI_UNCORE_QPI]	= &ivt_uncore_qpi,  	[IVT_PCI_UNCORE_R2PCIE]	= &ivt_uncore_r2pcie,  	[IVT_PCI_UNCORE_R3QPI]	= &ivt_uncore_r3qpi, @@ -1401,6 +1551,10 @@ static DEFINE_PCI_DEVICE_TABLE(ivt_uncore_pci_ids) = {  		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef1),  		.driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 7),  	}, +	{ /* IRP */ +		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe39), +		.driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IRP, 0), +	},  	{ /* QPI0 Port 0 */  		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe32),  		.driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_QPI, 0), @@ -1429,6 +1583,16 @@ static DEFINE_PCI_DEVICE_TABLE(ivt_uncore_pci_ids) = {  		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3e),  		.driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_R3QPI, 2),  	}, +	{ /* QPI Port 0 filter  */ +		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe86), +		.driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, +						   SNBEP_PCI_QPI_PORT0_FILTER), +	}, +	{ /* QPI Port 0 filter  */ +		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe96), +		.driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, +						   SNBEP_PCI_QPI_PORT1_FILTER), +	},  	{ /* end: all zeroes */ }  }; @@ -1517,6 +1681,349 @@ static struct intel_uncore_type *snb_msr_uncores[] = {  	&snb_uncore_cbox,  	NULL,  
}; + +enum { +	SNB_PCI_UNCORE_IMC, +}; + +static struct uncore_event_desc snb_uncore_imc_events[] = { +	INTEL_UNCORE_EVENT_DESC(data_reads,  "event=0x01"), +	INTEL_UNCORE_EVENT_DESC(data_reads.scale, "6.103515625e-5"), +	INTEL_UNCORE_EVENT_DESC(data_reads.unit, "MiB"), + +	INTEL_UNCORE_EVENT_DESC(data_writes, "event=0x02"), +	INTEL_UNCORE_EVENT_DESC(data_writes.scale, "6.103515625e-5"), +	INTEL_UNCORE_EVENT_DESC(data_writes.unit, "MiB"), + +	{ /* end: all zeroes */ }, +}; + +#define SNB_UNCORE_PCI_IMC_EVENT_MASK		0xff +#define SNB_UNCORE_PCI_IMC_BAR_OFFSET		0x48 + +/* page size multiple covering all config regs */ +#define SNB_UNCORE_PCI_IMC_MAP_SIZE		0x6000 + +#define SNB_UNCORE_PCI_IMC_DATA_READS		0x1 +#define SNB_UNCORE_PCI_IMC_DATA_READS_BASE	0x5050 +#define SNB_UNCORE_PCI_IMC_DATA_WRITES		0x2 +#define SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE	0x5054 +#define SNB_UNCORE_PCI_IMC_CTR_BASE		SNB_UNCORE_PCI_IMC_DATA_READS_BASE + +static struct attribute *snb_uncore_imc_formats_attr[] = { +	&format_attr_event.attr, +	NULL, +}; + +static struct attribute_group snb_uncore_imc_format_group = { +	.name = "format", +	.attrs = snb_uncore_imc_formats_attr, +}; + +static void snb_uncore_imc_init_box(struct intel_uncore_box *box) +{ +	struct pci_dev *pdev = box->pci_dev; +	int where = SNB_UNCORE_PCI_IMC_BAR_OFFSET; +	resource_size_t addr; +	u32 pci_dword; + +	pci_read_config_dword(pdev, where, &pci_dword); +	addr = pci_dword; + +#ifdef CONFIG_PHYS_ADDR_T_64BIT +	pci_read_config_dword(pdev, where + 4, &pci_dword); +	addr |= ((resource_size_t)pci_dword << 32); +#endif + +	addr &= ~(PAGE_SIZE - 1); + +	box->io_addr = ioremap(addr, SNB_UNCORE_PCI_IMC_MAP_SIZE); +	box->hrtimer_duration = UNCORE_SNB_IMC_HRTIMER_INTERVAL; +} + +static void snb_uncore_imc_enable_box(struct intel_uncore_box *box) +{} + +static void snb_uncore_imc_disable_box(struct intel_uncore_box *box) +{} + +static void snb_uncore_imc_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{} + +static void snb_uncore_imc_disable_event(struct intel_uncore_box *box, struct perf_event *event) +{} + +static u64 snb_uncore_imc_read_counter(struct intel_uncore_box *box, struct perf_event *event) +{ +	struct hw_perf_event *hwc = &event->hw; + +	return (u64)*(unsigned int *)(box->io_addr + hwc->event_base); +} + +/* + * custom event_init() function because we define our own fixed, free + * running counters, so we do not want to conflict with generic uncore + * logic. 
Also simplifies processing + */ +static int snb_uncore_imc_event_init(struct perf_event *event) +{ +	struct intel_uncore_pmu *pmu; +	struct intel_uncore_box *box; +	struct hw_perf_event *hwc = &event->hw; +	u64 cfg = event->attr.config & SNB_UNCORE_PCI_IMC_EVENT_MASK; +	int idx, base; + +	if (event->attr.type != event->pmu->type) +		return -ENOENT; + +	pmu = uncore_event_to_pmu(event); +	/* no device found for this pmu */ +	if (pmu->func_id < 0) +		return -ENOENT; + +	/* Sampling not supported yet */ +	if (hwc->sample_period) +		return -EINVAL; + +	/* unsupported modes and filters */ +	if (event->attr.exclude_user   || +	    event->attr.exclude_kernel || +	    event->attr.exclude_hv     || +	    event->attr.exclude_idle   || +	    event->attr.exclude_host   || +	    event->attr.exclude_guest  || +	    event->attr.sample_period) /* no sampling */ +		return -EINVAL; + +	/* +	 * Place all uncore events for a particular physical package +	 * onto a single cpu +	 */ +	if (event->cpu < 0) +		return -EINVAL; + +	/* check only supported bits are set */ +	if (event->attr.config & ~SNB_UNCORE_PCI_IMC_EVENT_MASK) +		return -EINVAL; + +	box = uncore_pmu_to_box(pmu, event->cpu); +	if (!box || box->cpu < 0) +		return -EINVAL; + +	event->cpu = box->cpu; + +	event->hw.idx = -1; +	event->hw.last_tag = ~0ULL; +	event->hw.extra_reg.idx = EXTRA_REG_NONE; +	event->hw.branch_reg.idx = EXTRA_REG_NONE; +	/* +	 * check event is known (whitelist, determines counter) +	 */ +	switch (cfg) { +	case SNB_UNCORE_PCI_IMC_DATA_READS: +		base = SNB_UNCORE_PCI_IMC_DATA_READS_BASE; +		idx = UNCORE_PMC_IDX_FIXED; +		break; +	case SNB_UNCORE_PCI_IMC_DATA_WRITES: +		base = SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE; +		idx = UNCORE_PMC_IDX_FIXED + 1; +		break; +	default: +		return -EINVAL; +	} + +	/* must be done before validate_group */ +	event->hw.event_base = base; +	event->hw.config = cfg; +	event->hw.idx = idx; + +	/* no group validation needed, we have free running counters */ + +	return 0; +} + +static int snb_uncore_imc_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ +	return 0; +} + +static void snb_uncore_imc_event_start(struct perf_event *event, int flags) +{ +	struct intel_uncore_box *box = uncore_event_to_box(event); +	u64 count; + +	if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) +		return; + +	event->hw.state = 0; +	box->n_active++; + +	list_add_tail(&event->active_entry, &box->active_list); + +	count = snb_uncore_imc_read_counter(box, event); +	local64_set(&event->hw.prev_count, count); + +	if (box->n_active == 1) +		uncore_pmu_start_hrtimer(box); +} + +static void snb_uncore_imc_event_stop(struct perf_event *event, int flags) +{ +	struct intel_uncore_box *box = uncore_event_to_box(event); +	struct hw_perf_event *hwc = &event->hw; + +	if (!(hwc->state & PERF_HES_STOPPED)) { +		box->n_active--; + +		WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); +		hwc->state |= PERF_HES_STOPPED; + +		list_del(&event->active_entry); + +		if (box->n_active == 0) +			uncore_pmu_cancel_hrtimer(box); +	} + +	if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { +		/* +		 * Drain the remaining delta count out of a event +		 * that we are disabling: +		 */ +		uncore_perf_event_update(box, event); +		hwc->state |= PERF_HES_UPTODATE; +	} +} + +static int snb_uncore_imc_event_add(struct perf_event *event, int flags) +{ +	struct intel_uncore_box *box = uncore_event_to_box(event); +	struct hw_perf_event *hwc = &event->hw; + +	if (!box) +		return -ENODEV; + +	hwc->state = PERF_HES_UPTODATE | 
PERF_HES_STOPPED; +	if (!(flags & PERF_EF_START)) +		hwc->state |= PERF_HES_ARCH; + +	snb_uncore_imc_event_start(event, 0); + +	box->n_events++; + +	return 0; +} + +static void snb_uncore_imc_event_del(struct perf_event *event, int flags) +{ +	struct intel_uncore_box *box = uncore_event_to_box(event); +	int i; + +	snb_uncore_imc_event_stop(event, PERF_EF_UPDATE); + +	for (i = 0; i < box->n_events; i++) { +		if (event == box->event_list[i]) { +			--box->n_events; +			break; +		} +	} +} + +static int snb_pci2phy_map_init(int devid) +{ +	struct pci_dev *dev = NULL; +	int bus; + +	dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, dev); +	if (!dev) +		return -ENOTTY; + +	bus = dev->bus->number; + +	pcibus_to_physid[bus] = 0; + +	pci_dev_put(dev); + +	return 0; +} + +static struct pmu snb_uncore_imc_pmu = { +	.task_ctx_nr	= perf_invalid_context, +	.event_init	= snb_uncore_imc_event_init, +	.add		= snb_uncore_imc_event_add, +	.del		= snb_uncore_imc_event_del, +	.start		= snb_uncore_imc_event_start, +	.stop		= snb_uncore_imc_event_stop, +	.read		= uncore_pmu_event_read, +}; + +static struct intel_uncore_ops snb_uncore_imc_ops = { +	.init_box	= snb_uncore_imc_init_box, +	.enable_box	= snb_uncore_imc_enable_box, +	.disable_box	= snb_uncore_imc_disable_box, +	.disable_event	= snb_uncore_imc_disable_event, +	.enable_event	= snb_uncore_imc_enable_event, +	.hw_config	= snb_uncore_imc_hw_config, +	.read_counter	= snb_uncore_imc_read_counter, +}; + +static struct intel_uncore_type snb_uncore_imc = { +	.name		= "imc", +	.num_counters   = 2, +	.num_boxes	= 1, +	.fixed_ctr_bits	= 32, +	.fixed_ctr	= SNB_UNCORE_PCI_IMC_CTR_BASE, +	.event_descs	= snb_uncore_imc_events, +	.format_group	= &snb_uncore_imc_format_group, +	.perf_ctr	= SNB_UNCORE_PCI_IMC_DATA_READS_BASE, +	.event_mask	= SNB_UNCORE_PCI_IMC_EVENT_MASK, +	.ops		= &snb_uncore_imc_ops, +	.pmu		= &snb_uncore_imc_pmu, +}; + +static struct intel_uncore_type *snb_pci_uncores[] = { +	[SNB_PCI_UNCORE_IMC]	= &snb_uncore_imc, +	NULL, +}; + +static DEFINE_PCI_DEVICE_TABLE(snb_uncore_pci_ids) = { +	{ /* IMC */ +		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SNB_IMC), +		.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), +	}, +	{ /* end: all zeroes */ }, +}; + +static DEFINE_PCI_DEVICE_TABLE(ivb_uncore_pci_ids) = { +	{ /* IMC */ +		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_IMC), +		.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), +	}, +	{ /* end: all zeroes */ }, +}; + +static DEFINE_PCI_DEVICE_TABLE(hsw_uncore_pci_ids) = { +	{ /* IMC */ +		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_IMC), +		.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), +	}, +	{ /* end: all zeroes */ }, +}; + +static struct pci_driver snb_uncore_pci_driver = { +	.name		= "snb_uncore", +	.id_table	= snb_uncore_pci_ids, +}; + +static struct pci_driver ivb_uncore_pci_driver = { +	.name		= "ivb_uncore", +	.id_table	= ivb_uncore_pci_ids, +}; + +static struct pci_driver hsw_uncore_pci_driver = { +	.name		= "hsw_uncore", +	.id_table	= hsw_uncore_pci_ids, +}; +  /* end of Sandy Bridge uncore support */  /* Nehalem uncore support */ @@ -2667,6 +3174,7 @@ again:  static enum hrtimer_restart uncore_pmu_hrtimer(struct hrtimer *hrtimer)  {  	struct intel_uncore_box *box; +	struct perf_event *event;  	unsigned long flags;  	int bit; @@ -2679,19 +3187,27 @@ static enum hrtimer_restart uncore_pmu_hrtimer(struct hrtimer *hrtimer)  	 */  	local_irq_save(flags); +	/* +	 * handle boxes with an active event list as opposed to active +	 * counters +	 */ +	
list_for_each_entry(event, &box->active_list, active_entry) { +		uncore_perf_event_update(box, event); +	} +  	for_each_set_bit(bit, box->active_mask, UNCORE_PMC_IDX_MAX)  		uncore_perf_event_update(box, box->events[bit]);  	local_irq_restore(flags); -	hrtimer_forward_now(hrtimer, ns_to_ktime(UNCORE_PMU_HRTIMER_INTERVAL)); +	hrtimer_forward_now(hrtimer, ns_to_ktime(box->hrtimer_duration));  	return HRTIMER_RESTART;  }  static void uncore_pmu_start_hrtimer(struct intel_uncore_box *box)  {  	__hrtimer_start_range_ns(&box->hrtimer, -			ns_to_ktime(UNCORE_PMU_HRTIMER_INTERVAL), 0, +			ns_to_ktime(box->hrtimer_duration), 0,  			HRTIMER_MODE_REL_PINNED, 0);  } @@ -2725,43 +3241,12 @@ static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type,  	box->cpu = -1;  	box->phys_id = -1; -	return box; -} - -static struct intel_uncore_box * -uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu) -{ -	struct intel_uncore_box *box; - -	box = *per_cpu_ptr(pmu->box, cpu); -	if (box) -		return box; - -	raw_spin_lock(&uncore_box_lock); -	list_for_each_entry(box, &pmu->box_list, list) { -		if (box->phys_id == topology_physical_package_id(cpu)) { -			atomic_inc(&box->refcnt); -			*per_cpu_ptr(pmu->box, cpu) = box; -			break; -		} -	} -	raw_spin_unlock(&uncore_box_lock); - -	return *per_cpu_ptr(pmu->box, cpu); -} +	/* set default hrtimer timeout */ +	box->hrtimer_duration = UNCORE_PMU_HRTIMER_INTERVAL; -static struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event) -{ -	return container_of(event->pmu, struct intel_uncore_pmu, pmu); -} +	INIT_LIST_HEAD(&box->active_list); -static struct intel_uncore_box *uncore_event_to_box(struct perf_event *event) -{ -	/* -	 * perf core schedules event on the basis of cpu, uncore events are -	 * collected by one of the cpus inside a physical package. 
-	 */ -	return uncore_pmu_to_box(uncore_event_to_pmu(event), smp_processor_id()); +	return box;  }  static int @@ -3157,16 +3642,21 @@ static int __init uncore_pmu_register(struct intel_uncore_pmu *pmu)  {  	int ret; -	pmu->pmu = (struct pmu) { -		.attr_groups	= pmu->type->attr_groups, -		.task_ctx_nr	= perf_invalid_context, -		.event_init	= uncore_pmu_event_init, -		.add		= uncore_pmu_event_add, -		.del		= uncore_pmu_event_del, -		.start		= uncore_pmu_event_start, -		.stop		= uncore_pmu_event_stop, -		.read		= uncore_pmu_event_read, -	}; +	if (!pmu->type->pmu) { +		pmu->pmu = (struct pmu) { +			.attr_groups	= pmu->type->attr_groups, +			.task_ctx_nr	= perf_invalid_context, +			.event_init	= uncore_pmu_event_init, +			.add		= uncore_pmu_event_add, +			.del		= uncore_pmu_event_del, +			.start		= uncore_pmu_event_start, +			.stop		= uncore_pmu_event_stop, +			.read		= uncore_pmu_event_read, +		}; +	} else { +		pmu->pmu = *pmu->type->pmu; +		pmu->pmu.attr_groups = pmu->type->attr_groups; +	}  	if (pmu->type->num_boxes == 1) {  		if (strlen(pmu->type->name) > 0) @@ -3212,6 +3702,8 @@ static int __init uncore_type_init(struct intel_uncore_type *type)  	if (!pmus)  		return -ENOMEM; +	type->pmus = pmus; +  	type->unconstrainted = (struct event_constraint)  		__EVENT_CONSTRAINT(0, (1ULL << type->num_counters) - 1,  				0, type->num_counters, 0, 0); @@ -3247,7 +3739,6 @@ static int __init uncore_type_init(struct intel_uncore_type *type)  	}  	type->pmu_group = &uncore_pmu_attr_group; -	type->pmus = pmus;  	return 0;  fail:  	uncore_type_exit(type); @@ -3379,6 +3870,28 @@ static int __init uncore_pci_init(void)  		pci_uncores = ivt_pci_uncores;  		uncore_pci_driver = &ivt_uncore_pci_driver;  		break; +	case 42: /* Sandy Bridge */ +		ret = snb_pci2phy_map_init(PCI_DEVICE_ID_INTEL_SNB_IMC); +		if (ret) +			return ret; +		pci_uncores = snb_pci_uncores; +		uncore_pci_driver = &snb_uncore_pci_driver; +		break; +	case 58: /* Ivy Bridge */ +		ret = snb_pci2phy_map_init(PCI_DEVICE_ID_INTEL_IVB_IMC); +		if (ret) +			return ret; +		pci_uncores = snb_pci_uncores; +		uncore_pci_driver = &ivb_uncore_pci_driver; +		break; +	case 60: /* Haswell */ +	case 69: /* Haswell Celeron */ +		ret = snb_pci2phy_map_init(PCI_DEVICE_ID_INTEL_HSW_IMC); +		if (ret) +			return ret; +		pci_uncores = snb_pci_uncores; +		uncore_pci_driver = &hsw_uncore_pci_driver; +		break;  	default:  		return 0;  	} @@ -3650,7 +4163,7 @@ static void __init uncore_cpu_setup(void *dummy)  static int __init uncore_cpu_init(void)  { -	int ret, cpu, max_cores; +	int ret, max_cores;  	max_cores = boot_cpu_data.x86_max_cores;  	switch (boot_cpu_data.x86_model) { @@ -3694,29 +4207,6 @@ static int __init uncore_cpu_init(void)  	if (ret)  		return ret; -	get_online_cpus(); - -	for_each_online_cpu(cpu) { -		int i, phys_id = topology_physical_package_id(cpu); - -		for_each_cpu(i, &uncore_cpu_mask) { -			if (phys_id == topology_physical_package_id(i)) { -				phys_id = -1; -				break; -			} -		} -		if (phys_id < 0) -			continue; - -		uncore_cpu_prepare(cpu, phys_id); -		uncore_event_init_cpu(cpu); -	} -	on_each_cpu(uncore_cpu_setup, NULL, 1); - -	register_cpu_notifier(&uncore_cpu_nb); - -	put_online_cpus(); -  	return 0;  } @@ -3745,6 +4235,41 @@ static int __init uncore_pmus_register(void)  	return 0;  } +static void __init uncore_cpumask_init(void) +{ +	int cpu; + +	/* +	 * ony invoke once from msr or pci init code +	 */ +	if (!cpumask_empty(&uncore_cpu_mask)) +		return; + +	cpu_notifier_register_begin(); + +	for_each_online_cpu(cpu) { +		int i, phys_id = 
topology_physical_package_id(cpu); + +		for_each_cpu(i, &uncore_cpu_mask) { +			if (phys_id == topology_physical_package_id(i)) { +				phys_id = -1; +				break; +			} +		} +		if (phys_id < 0) +			continue; + +		uncore_cpu_prepare(cpu, phys_id); +		uncore_event_init_cpu(cpu); +	} +	on_each_cpu(uncore_cpu_setup, NULL, 1); + +	__register_cpu_notifier(&uncore_cpu_nb); + +	cpu_notifier_register_done(); +} + +  static int __init intel_uncore_init(void)  {  	int ret; @@ -3763,6 +4288,7 @@ static int __init intel_uncore_init(void)  		uncore_pci_exit();  		goto fail;  	} +	uncore_cpumask_init();  	uncore_pmus_register();  	return 0; diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h index a80ab71a883..90236f0c94a 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h @@ -6,6 +6,7 @@  #define UNCORE_PMU_NAME_LEN		32  #define UNCORE_PMU_HRTIMER_INTERVAL	(60LL * NSEC_PER_SEC) +#define UNCORE_SNB_IMC_HRTIMER_INTERVAL (5ULL * NSEC_PER_SEC)  #define UNCORE_FIXED_EVENT		0xff  #define UNCORE_PMC_IDX_MAX_GENERIC	8 @@ -440,6 +441,7 @@ struct intel_uncore_type {  	struct intel_uncore_ops *ops;  	struct uncore_event_desc *event_descs;  	const struct attribute_group *attr_groups[4]; +	struct pmu *pmu; /* for custom pmu ops */  };  #define pmu_group attr_groups[0] @@ -488,8 +490,11 @@ struct intel_uncore_box {  	u64 tags[UNCORE_PMC_IDX_MAX];  	struct pci_dev *pci_dev;  	struct intel_uncore_pmu *pmu; +	u64 hrtimer_duration; /* hrtimer timeout for this box */  	struct hrtimer hrtimer;  	struct list_head list; +	struct list_head active_list; +	void *io_addr;  	struct intel_uncore_extra_reg shared_regs[0];  }; diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index 3486e666035..5d466b7d860 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -1257,7 +1257,24 @@ again:  			pass++;  			goto again;  		} - +		/* +		 * Perf does test runs to see if a whole group can be assigned +		 * together succesfully.  There can be multiple rounds of this. +		 * Unfortunately, p4_pmu_swap_config_ts touches the hwc->config +		 * bits, such that the next round of group assignments will +		 * cause the above p4_should_swap_ts to pass instead of fail. +		 * This leads to counters exclusive to thread0 being used by +		 * thread1. +		 * +		 * Solve this with a cheap hack, reset the idx back to -1 to +		 * force a new lookup (p4_next_cntr) to get the right counter +		 * for the right thread. +		 * +		 * This probably doesn't comply with the general spirit of how +		 * perf wants to work, but P4 is special. :-( +		 */ +		if (p4_should_swap_ts(hwc->config, cpu)) +			hwc->idx = -1;  		p4_pmu_swap_config_ts(hwc, cpu);  		if (assign)  			assign[i] = cntr_idx; @@ -1322,6 +1339,7 @@ static __initconst const struct x86_pmu p4_pmu = {  __init int p4_pmu_init(void)  {  	unsigned int low, high; +	int i, reg;  	/* If we get stripped -- indexing fails */  	BUILD_BUG_ON(ARCH_P4_MAX_CCCR > INTEL_PMC_MAX_GENERIC); @@ -1340,5 +1358,19 @@ __init int p4_pmu_init(void)  	x86_pmu = p4_pmu; +	/* +	 * Even though the counters are configured to interrupt a particular +	 * logical processor when an overflow happens, testing has shown that +	 * on kdump kernels (which uses a single cpu), thread1's counter +	 * continues to run and will report an NMI on thread0.  Due to the +	 * overflow bug, this leads to a stream of unknown NMIs. 
+	 * +	 * Solve this by zero'ing out the registers to mimic a reset. +	 */ +	for (i = 0; i < x86_pmu.num_counters; i++) { +		reg = x86_pmu_config_addr(i); +		wrmsrl_safe(reg, 0ULL); +	} +  	return 0;  } diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c index b1e2fe11532..7c1a0c07b60 100644 --- a/arch/x86/kernel/cpu/perf_event_p6.c +++ b/arch/x86/kernel/cpu/perf_event_p6.c @@ -231,31 +231,49 @@ static __initconst const struct x86_pmu p6_pmu = {  }; +static __init void p6_pmu_rdpmc_quirk(void) +{ +	if (boot_cpu_data.x86_mask < 9) { +		/* +		 * PPro erratum 26; fixed in stepping 9 and above. +		 */ +		pr_warn("Userspace RDPMC support disabled due to a CPU erratum\n"); +		x86_pmu.attr_rdpmc_broken = 1; +		x86_pmu.attr_rdpmc = 0; +	} +} +  __init int p6_pmu_init(void)  { +	x86_pmu = p6_pmu; +  	switch (boot_cpu_data.x86_model) { -	case 1: -	case 3:  /* Pentium Pro */ -	case 5: -	case 6:  /* Pentium II */ -	case 7: -	case 8: -	case 11: /* Pentium III */ -	case 9: -	case 13: -		/* Pentium M */ +	case  1: /* Pentium Pro */ +		x86_add_quirk(p6_pmu_rdpmc_quirk); +		break; + +	case  3: /* Pentium II - Klamath */ +	case  5: /* Pentium II - Deschutes */ +	case  6: /* Pentium II - Mendocino */  		break; + +	case  7: /* Pentium III - Katmai */ +	case  8: /* Pentium III - Coppermine */ +	case 10: /* Pentium III Xeon */ +	case 11: /* Pentium III - Tualatin */ +		break; + +	case  9: /* Pentium M - Banias */ +	case 13: /* Pentium M - Dothan */ +		break; +  	default: -		pr_cont("unsupported p6 CPU model %d ", -			boot_cpu_data.x86_model); +		pr_cont("unsupported p6 CPU model %d ", boot_cpu_data.x86_model);  		return -ENODEV;  	} -	x86_pmu = p6_pmu; -  	memcpy(hw_cache_event_ids, p6_hw_cache_event_ids,  		sizeof(hw_cache_event_ids)); -  	return 0;  } diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index aee6317b902..06fe3ed8b85 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -11,15 +11,12 @@ static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c,  			      unsigned int cpu)  {  #ifdef CONFIG_SMP -	if (c->x86_max_cores * smp_num_siblings > 1) { -		seq_printf(m, "physical id\t: %d\n", c->phys_proc_id); -		seq_printf(m, "siblings\t: %d\n", -			   cpumask_weight(cpu_core_mask(cpu))); -		seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id); -		seq_printf(m, "cpu cores\t: %d\n", c->booted_cores); -		seq_printf(m, "apicid\t\t: %d\n", c->apicid); -		seq_printf(m, "initial apicid\t: %d\n", c->initial_apicid); -	} +	seq_printf(m, "physical id\t: %d\n", c->phys_proc_id); +	seq_printf(m, "siblings\t: %d\n", cpumask_weight(cpu_core_mask(cpu))); +	seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id); +	seq_printf(m, "cpu cores\t: %d\n", c->booted_cores); +	seq_printf(m, "apicid\t\t: %d\n", c->apicid); +	seq_printf(m, "initial apicid\t: %d\n", c->initial_apicid);  #endif  } diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c index 88db010845c..136ac74dee8 100644 --- a/arch/x86/kernel/cpu/rdrand.c +++ b/arch/x86/kernel/cpu/rdrand.c @@ -27,24 +27,11 @@  static int __init x86_rdrand_setup(char *s)  {  	setup_clear_cpu_cap(X86_FEATURE_RDRAND); +	setup_clear_cpu_cap(X86_FEATURE_RDSEED);  	return 1;  }  __setup("nordrand", x86_rdrand_setup); -/* We can't use arch_get_random_long() here since alternatives haven't run */ -static inline int rdrand_long(unsigned long *v) -{ -	int ok; -	asm volatile("1: " RDRAND_LONG "\n\t" -		     "jc 2f\n\t" -		     "decl %0\n\t" -		     "jnz 1b\n\t" -		     "2:" -		     : "=r" 
(ok), "=a" (*v) -		     : "0" (RDRAND_RETRY_LOOPS)); -	return ok; -} -  /*   * Force a reseed cycle; we are architecturally guaranteed a reseed   * after no more than 512 128-bit chunks of random data.  This also diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index f2cc63e9cf0..b6f794aa169 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -1,5 +1,5 @@  /* - *	Routines to indentify additional cpu features that are scattered in + *	Routines to identify additional cpu features that are scattered in   *	cpuid space.   */  #include <linux/cpu.h> diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c index aa0430d69b9..3fa0e5ad86b 100644 --- a/arch/x86/kernel/cpu/transmeta.c +++ b/arch/x86/kernel/cpu/transmeta.c @@ -1,6 +1,5 @@  #include <linux/kernel.h>  #include <linux/mm.h> -#include <linux/init.h>  #include <asm/processor.h>  #include <asm/msr.h>  #include "cpu.h" diff --git a/arch/x86/kernel/cpu/umc.c b/arch/x86/kernel/cpu/umc.c index 202759a1412..ef9c2a0078b 100644 --- a/arch/x86/kernel/cpu/umc.c +++ b/arch/x86/kernel/cpu/umc.c @@ -1,5 +1,4 @@  #include <linux/kernel.h> -#include <linux/init.h>  #include <asm/processor.h>  #include "cpu.h" @@ -11,8 +10,8 @@  static const struct cpu_dev umc_cpu_dev = {  	.c_vendor	= "UMC",  	.c_ident	= { "UMC UMC UMC" }, -	.c_models = { -		{ .vendor = X86_VENDOR_UMC, .family = 4, .model_names = +	.legacy_models	= { +		{ .family = 4, .model_names =  		  {  			  [1] = "U5D",  			  [2] = "U5S",  | 
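
Note (not part of the patch): taken together, the RAPL driver added earlier in this diff registers a system-wide PMU named "power" whose events are free-running, read-only energy counters scaled to 2^-32 Joules. A hedged user-space sketch of reading the package counter via perf_event_open(2); the sysfs path, the choice of CPU 0 and the bare-bones error handling are assumptions, while config 0x2 (rapl_energy_pkg) and the scaling come from the file's header comment:

	#include <linux/perf_event.h>
	#include <sys/syscall.h>
	#include <unistd.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>
	#include <math.h>

	int main(void)
	{
		struct perf_event_attr attr;
		uint64_t count;
		int type, fd;
		FILE *f = fopen("/sys/bus/event_source/devices/power/type", "r");

		if (!f || fscanf(f, "%d", &type) != 1)
			return 1;
		fclose(f);

		memset(&attr, 0, sizeof(attr));
		attr.size   = sizeof(attr);
		attr.type   = type;	/* dynamic type id of the "power" PMU */
		attr.config = 0x2;	/* rapl_energy_pkg pseudo-encoding */

		/* system-wide counting, pinned to CPU 0 (one CPU per package) */
		fd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, 0);
		if (fd < 0)
			return 1;

		sleep(1);
		if (read(fd, &count, sizeof(count)) != sizeof(count))
			return 1;

		/* counts are exported in 2^-32 Joule units (see the .scale attrs) */
		printf("energy-pkg: ~%.3f J over ~1 s\n", ldexp((double)count, -32));
		return 0;
	}

This is roughly what "perf stat -a -e power/energy-pkg/" does once the events group is registered; link with -lm for ldexp().
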
