diff options
Diffstat (limited to 'arch/x86/kernel')
86 files changed, 2462 insertions, 1448 deletions
| diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 8baca3c4871..5369059c07a 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -25,7 +25,8 @@ obj-$(CONFIG_IRQ_WORK)  += irq_work.o  obj-y			+= probe_roms.o  obj-$(CONFIG_X86_32)	+= sys_i386_32.o i386_ksyms_32.o  obj-$(CONFIG_X86_64)	+= sys_x86_64.o x8664_ksyms_64.o -obj-$(CONFIG_X86_64)	+= syscall_64.o vsyscall_64.o +obj-y			+= syscall_$(BITS).o +obj-$(CONFIG_X86_64)	+= vsyscall_64.o  obj-$(CONFIG_X86_64)	+= vsyscall_emu_64.o  obj-y			+= bootflag.o e820.o  obj-y			+= pci-dma.o quirks.o topology.o kdebugfs.o @@ -80,6 +81,7 @@ obj-$(CONFIG_APB_TIMER)		+= apb_timer.o  obj-$(CONFIG_AMD_NB)		+= amd_nb.o  obj-$(CONFIG_DEBUG_RODATA_TEST)	+= test_rodata.o  obj-$(CONFIG_DEBUG_NX_TEST)	+= test_nx.o +obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o  obj-$(CONFIG_KVM_GUEST)		+= kvm.o  obj-$(CONFIG_KVM_CLOCK)		+= kvmclock.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 4558f0d0822..ce664f33ea8 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -219,6 +219,8 @@ static int __init  acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end)  {  	struct acpi_madt_local_x2apic *processor = NULL; +	int apic_id; +	u8 enabled;  	processor = (struct acpi_madt_local_x2apic *)header; @@ -227,6 +229,8 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end)  	acpi_table_print_madt_entry(header); +	apic_id = processor->local_apic_id; +	enabled = processor->lapic_flags & ACPI_MADT_ENABLED;  #ifdef CONFIG_X86_X2APIC  	/*  	 * We need to register disabled CPU as well to permit @@ -235,8 +239,10 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end)  	 * to not preallocating memory for all NR_CPUS  	 * when we use CPU hotplug.  	 
*/ -	acpi_register_lapic(processor->local_apic_id,	/* APIC ID */ -			    processor->lapic_flags & ACPI_MADT_ENABLED); +	if (!cpu_has_x2apic && (apic_id >= 0xff) && enabled) +		printk(KERN_WARNING PREFIX "x2apic entry ignored\n"); +	else +		acpi_register_lapic(apic_id, enabled);  #else  	printk(KERN_WARNING PREFIX "x2apic entry ignored\n");  #endif diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 4c39baa8fac..be16854591c 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -119,20 +119,49 @@ bool __init early_is_amd_nb(u32 device)  	return false;  } +struct resource *amd_get_mmconfig_range(struct resource *res) +{ +	u32 address; +	u64 base, msr; +	unsigned segn_busn_bits; + +	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) +		return NULL; + +	/* assume all cpus from fam10h have mmconfig */ +        if (boot_cpu_data.x86 < 0x10) +		return NULL; + +	address = MSR_FAM10H_MMIO_CONF_BASE; +	rdmsrl(address, msr); + +	/* mmconfig is not enabled */ +	if (!(msr & FAM10H_MMIO_CONF_ENABLE)) +		return NULL; + +	base = msr & (FAM10H_MMIO_CONF_BASE_MASK<<FAM10H_MMIO_CONF_BASE_SHIFT); + +	segn_busn_bits = (msr >> FAM10H_MMIO_CONF_BUSRANGE_SHIFT) & +			 FAM10H_MMIO_CONF_BUSRANGE_MASK; + +	res->flags = IORESOURCE_MEM; +	res->start = base; +	res->end = base + (1ULL<<(segn_busn_bits + 20)) - 1; +	return res; +} +  int amd_get_subcaches(int cpu)  {  	struct pci_dev *link = node_to_amd_nb(amd_get_nb_id(cpu))->link;  	unsigned int mask; -	int cuid = 0; +	int cuid;  	if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING))  		return 0;  	pci_read_config_dword(link, 0x1d4, &mask); -#ifdef CONFIG_SMP  	cuid = cpu_data(cpu).compute_unit_id; -#endif  	return (mask >> (4 * cuid)) & 0xf;  } @@ -141,7 +170,7 @@ int amd_set_subcaches(int cpu, int mask)  	static unsigned int reset, ban;  	struct amd_northbridge *nb = node_to_amd_nb(amd_get_nb_id(cpu));  	unsigned int reg; -	int cuid = 0; +	int cuid;  	if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING) || mask > 0xf)  	
	return -EINVAL; @@ -159,9 +188,7 @@ int amd_set_subcaches(int cpu, int mask)  		pci_write_config_dword(nb->misc, 0x1b8, reg & ~0x180000);  	} -#ifdef CONFIG_SMP  	cuid = cpu_data(cpu).compute_unit_id; -#endif  	mask <<= 4 * cuid;  	mask |= (0xf ^ (1 << cuid)) << 26; diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 3d2661ca654..6e76c191a83 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -88,13 +88,13 @@ static u32 __init allocate_aperture(void)  	 */  	addr = memblock_find_in_range(GART_MIN_ADDR, GART_MAX_ADDR,  				      aper_size, aper_size); -	if (addr == MEMBLOCK_ERROR || addr + aper_size > GART_MAX_ADDR) { +	if (!addr || addr + aper_size > GART_MAX_ADDR) {  		printk(KERN_ERR  			"Cannot allocate aperture memory hole (%lx,%uK)\n",  				addr, aper_size>>10);  		return 0;  	} -	memblock_x86_reserve_range(addr, addr + aper_size, "aperture64"); +	memblock_reserve(addr, aper_size);  	/*  	 * Kmemleak should not scan this block as it may not be mapped via the  	 * kernel direct mapping. 
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index 767fd04f284..0ae0323b1f9 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -10,6 +10,7 @@ obj-$(CONFIG_SMP)		+= ipi.o  ifeq ($(CONFIG_X86_64),y)  # APIC probe will depend on the listing order here +obj-$(CONFIG_X86_NUMACHIP)	+= apic_numachip.o  obj-$(CONFIG_X86_UV)		+= x2apic_uv_x.o  obj-$(CONFIG_X86_X2APIC)	+= x2apic_phys.o  obj-$(CONFIG_X86_X2APIC)	+= x2apic_cluster.o diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index f98d84caf94..2eec05b6d1b 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -146,16 +146,26 @@ __setup("apicpmtimer", setup_apicpmtimer);  int x2apic_mode;  #ifdef CONFIG_X86_X2APIC  /* x2apic enabled before OS handover */ -static int x2apic_preenabled; +int x2apic_preenabled; +static int x2apic_disabled; +static int nox2apic;  static __init int setup_nox2apic(char *str)  {  	if (x2apic_enabled()) { -		pr_warning("Bios already enabled x2apic, " -			   "can't enforce nox2apic"); -		return 0; -	} +		int apicid = native_apic_msr_read(APIC_ID); + +		if (apicid >= 255) { +			pr_warning("Apicid: %08x, cannot enforce nox2apic\n", +				   apicid); +			return 0; +		} + +		pr_warning("x2apic already enabled. will disable it\n"); +	} else +		setup_clear_cpu_cap(X86_FEATURE_X2APIC); + +	nox2apic = 1; -	setup_clear_cpu_cap(X86_FEATURE_X2APIC);  	return 0;  }  early_param("nox2apic", setup_nox2apic); @@ -250,6 +260,7 @@ u32 native_safe_apic_wait_icr_idle(void)  		send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;  		if (!send_status)  			break; +		inc_irq_stat(icr_read_retry_count);  		udelay(100);  	} while (timeout++ < 1000); @@ -876,8 +887,8 @@ void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs)  	 * Besides, if we don't timer interrupts ignore the global  	 * interrupt lock, which is the WrongThing (tm) to do.  	 
*/ -	exit_idle();  	irq_enter(); +	exit_idle();  	local_apic_timer_interrupt();  	irq_exit(); @@ -1431,6 +1442,45 @@ void __init bsp_end_local_APIC_setup(void)  }  #ifdef CONFIG_X86_X2APIC +/* + * Need to disable xapic and x2apic at the same time and then enable xapic mode + */ +static inline void __disable_x2apic(u64 msr) +{ +	wrmsrl(MSR_IA32_APICBASE, +	       msr & ~(X2APIC_ENABLE | XAPIC_ENABLE)); +	wrmsrl(MSR_IA32_APICBASE, msr & ~X2APIC_ENABLE); +} + +static __init void disable_x2apic(void) +{ +	u64 msr; + +	if (!cpu_has_x2apic) +		return; + +	rdmsrl(MSR_IA32_APICBASE, msr); +	if (msr & X2APIC_ENABLE) { +		u32 x2apic_id = read_apic_id(); + +		if (x2apic_id >= 255) +			panic("Cannot disable x2apic, id: %08x\n", x2apic_id); + +		pr_info("Disabling x2apic\n"); +		__disable_x2apic(msr); + +		if (nox2apic) { +			clear_cpu_cap(&cpu_data(0), X86_FEATURE_X2APIC); +			setup_clear_cpu_cap(X86_FEATURE_X2APIC); +		} + +		x2apic_disabled = 1; +		x2apic_mode = 0; + +		register_lapic_address(mp_lapic_addr); +	} +} +  void check_x2apic(void)  {  	if (x2apic_enabled()) { @@ -1441,15 +1491,20 @@ void check_x2apic(void)  void enable_x2apic(void)  { -	int msr, msr2; +	u64 msr; + +	rdmsrl(MSR_IA32_APICBASE, msr); +	if (x2apic_disabled) { +		__disable_x2apic(msr); +		return; +	}  	if (!x2apic_mode)  		return; -	rdmsr(MSR_IA32_APICBASE, msr, msr2);  	if (!(msr & X2APIC_ENABLE)) {  		printk_once(KERN_INFO "Enabling x2apic\n"); -		wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, msr2); +		wrmsrl(MSR_IA32_APICBASE, msr | X2APIC_ENABLE);  	}  }  #endif /* CONFIG_X86_X2APIC */ @@ -1486,25 +1541,34 @@ void __init enable_IR_x2apic(void)  	ret = save_ioapic_entries();  	if (ret) {  		pr_info("Saving IO-APIC state failed: %d\n", ret); -		goto out; +		return;  	}  	local_irq_save(flags);  	legacy_pic->mask_all();  	mask_ioapic_entries(); +	if (x2apic_preenabled && nox2apic) +		disable_x2apic(); +  	if (dmar_table_init_ret)  		ret = -1;  	else  		ret = enable_IR(); +	if (!x2apic_supported()) +		
goto skip_x2apic; +  	if (ret < 0) {  		/* IR is required if there is APIC ID > 255 even when running  		 * under KVM  		 */  		if (max_physical_apicid > 255 || -		    !hypervisor_x2apic_available()) -			goto nox2apic; +		    !hypervisor_x2apic_available()) { +			if (x2apic_preenabled) +				disable_x2apic(); +			goto skip_x2apic; +		}  		/*  		 * without IR all CPUs can be addressed by IOAPIC/MSI  		 * only in physical mode @@ -1512,8 +1576,10 @@ void __init enable_IR_x2apic(void)  		x2apic_force_phys();  	} -	if (ret == IRQ_REMAP_XAPIC_MODE) -		goto nox2apic; +	if (ret == IRQ_REMAP_XAPIC_MODE) { +		pr_info("x2apic not enabled, IRQ remapping is in xapic mode\n"); +		goto skip_x2apic; +	}  	x2apic_enabled = 1; @@ -1523,22 +1589,11 @@ void __init enable_IR_x2apic(void)  		pr_info("Enabled x2apic\n");  	} -nox2apic: +skip_x2apic:  	if (ret < 0) /* IR enabling failed */  		restore_ioapic_entries();  	legacy_pic->restore_mask();  	local_irq_restore(flags); - -out: -	if (x2apic_enabled || !x2apic_supported()) -		return; - -	if (x2apic_preenabled) -		panic("x2apic: enabled by BIOS but kernel init failed."); -	else if (ret == IRQ_REMAP_XAPIC_MODE) -		pr_info("x2apic not enabled, IRQ remapping is in xapic mode\n"); -	else if (ret < 0) -		pr_info("x2apic not enabled, IRQ remapping init failed\n");  }  #ifdef CONFIG_X86_64 @@ -1809,8 +1864,8 @@ void smp_spurious_interrupt(struct pt_regs *regs)  {  	u32 v; -	exit_idle();  	irq_enter(); +	exit_idle();  	/*  	 * Check if this really is a spurious interrupt and ACK it  	 * if it is a vectored one.  Just in case... @@ -1846,8 +1901,8 @@ void smp_error_interrupt(struct pt_regs *regs)  		"Illegal register address",	/* APIC Error Bit 7 */  	}; -	exit_idle();  	irq_enter(); +	exit_idle();  	/* First tickle the hardware, only then report what went on. 
-- REW */  	v0 = apic_read(APIC_ESR);  	apic_write(APIC_ESR, 0); diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index f7a41e4cae4..8c3cdded6f2 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -62,7 +62,7 @@ static void flat_vector_allocation_domain(int cpu, struct cpumask *retmask)   * an APIC.  See e.g. "AP-388 82489DX User's Manual" (Intel   * document number 292116).  So here it goes...   */ -static void flat_init_apic_ldr(void) +void flat_init_apic_ldr(void)  {  	unsigned long val;  	unsigned long num, id; @@ -171,9 +171,14 @@ static int flat_phys_pkg_id(int initial_apic_id, int index_msb)  	return initial_apic_id >> index_msb;  } +static int flat_probe(void) +{ +	return 1; +} +  static struct apic apic_flat =  {  	.name				= "flat", -	.probe				= NULL, +	.probe				= flat_probe,  	.acpi_madt_oem_check		= flat_acpi_madt_oem_check,  	.apic_id_registered		= flat_apic_id_registered, diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c new file mode 100644 index 00000000000..09d3d8c1cd9 --- /dev/null +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -0,0 +1,294 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License.  See the file "COPYING" in the main directory of this archive + * for more details. + * + * Numascale NumaConnect-Specific APIC Code + * + * Copyright (C) 2011 Numascale AS. All rights reserved. 
+ * + * Send feedback to <support@numascale.com> + * + */ + +#include <linux/errno.h> +#include <linux/threads.h> +#include <linux/cpumask.h> +#include <linux/string.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/ctype.h> +#include <linux/init.h> +#include <linux/hardirq.h> +#include <linux/delay.h> + +#include <asm/numachip/numachip_csr.h> +#include <asm/smp.h> +#include <asm/apic.h> +#include <asm/ipi.h> +#include <asm/apic_flat_64.h> + +static int numachip_system __read_mostly; + +static struct apic apic_numachip __read_mostly; + +static unsigned int get_apic_id(unsigned long x) +{ +	unsigned long value; +	unsigned int id; + +	rdmsrl(MSR_FAM10H_NODE_ID, value); +	id = ((x >> 24) & 0xffU) | ((value << 2) & 0x3f00U); + +	return id; +} + +static unsigned long set_apic_id(unsigned int id) +{ +	unsigned long x; + +	x = ((id & 0xffU) << 24); +	return x; +} + +static unsigned int read_xapic_id(void) +{ +	return get_apic_id(apic_read(APIC_ID)); +} + +static int numachip_apic_id_registered(void) +{ +	return physid_isset(read_xapic_id(), phys_cpu_present_map); +} + +static int numachip_phys_pkg_id(int initial_apic_id, int index_msb) +{ +	return initial_apic_id >> index_msb; +} + +static const struct cpumask *numachip_target_cpus(void) +{ +	return cpu_online_mask; +} + +static void numachip_vector_allocation_domain(int cpu, struct cpumask *retmask) +{ +	cpumask_clear(retmask); +	cpumask_set_cpu(cpu, retmask); +} + +static int __cpuinit numachip_wakeup_secondary(int phys_apicid, unsigned long start_rip) +{ +	union numachip_csr_g3_ext_irq_gen int_gen; + +	int_gen.s._destination_apic_id = phys_apicid; +	int_gen.s._vector = 0; +	int_gen.s._msgtype = APIC_DM_INIT >> 8; +	int_gen.s._index = 0; + +	write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v); + +	int_gen.s._msgtype = APIC_DM_STARTUP >> 8; +	int_gen.s._vector = start_rip >> 12; + +	write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v); + +	atomic_set(&init_deasserted, 1); +	return 0; +} + +static void 
numachip_send_IPI_one(int cpu, int vector) +{ +	union numachip_csr_g3_ext_irq_gen int_gen; +	int apicid = per_cpu(x86_cpu_to_apicid, cpu); + +	int_gen.s._destination_apic_id = apicid; +	int_gen.s._vector = vector; +	int_gen.s._msgtype = (vector == NMI_VECTOR ? APIC_DM_NMI : APIC_DM_FIXED) >> 8; +	int_gen.s._index = 0; + +	write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v); +} + +static void numachip_send_IPI_mask(const struct cpumask *mask, int vector) +{ +	unsigned int cpu; + +	for_each_cpu(cpu, mask) +		numachip_send_IPI_one(cpu, vector); +} + +static void numachip_send_IPI_mask_allbutself(const struct cpumask *mask, +						int vector) +{ +	unsigned int this_cpu = smp_processor_id(); +	unsigned int cpu; + +	for_each_cpu(cpu, mask) { +		if (cpu != this_cpu) +			numachip_send_IPI_one(cpu, vector); +	} +} + +static void numachip_send_IPI_allbutself(int vector) +{ +	unsigned int this_cpu = smp_processor_id(); +	unsigned int cpu; + +	for_each_online_cpu(cpu) { +		if (cpu != this_cpu) +			numachip_send_IPI_one(cpu, vector); +	} +} + +static void numachip_send_IPI_all(int vector) +{ +	numachip_send_IPI_mask(cpu_online_mask, vector); +} + +static void numachip_send_IPI_self(int vector) +{ +	__default_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL); +} + +static unsigned int numachip_cpu_mask_to_apicid(const struct cpumask *cpumask) +{ +	int cpu; + +	/* +	 * We're using fixed IRQ delivery, can only return one phys APIC ID. +	 * May as well be the first. +	 */ +	cpu = cpumask_first(cpumask); +	if (likely((unsigned)cpu < nr_cpu_ids)) +		return per_cpu(x86_cpu_to_apicid, cpu); + +	return BAD_APICID; +} + +static unsigned int +numachip_cpu_mask_to_apicid_and(const struct cpumask *cpumask, +				const struct cpumask *andmask) +{ +	int cpu; + +	/* +	 * We're using fixed IRQ delivery, can only return one phys APIC ID. +	 * May as well be the first. 
+	 */ +	for_each_cpu_and(cpu, cpumask, andmask) { +		if (cpumask_test_cpu(cpu, cpu_online_mask)) +			break; +	} +	return per_cpu(x86_cpu_to_apicid, cpu); +} + +static int __init numachip_probe(void) +{ +	return apic == &apic_numachip; +} + +static void __init map_csrs(void) +{ +	printk(KERN_INFO "NumaChip: Mapping local CSR space (%016llx - %016llx)\n", +		NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_BASE + NUMACHIP_LCSR_SIZE - 1); +	init_extra_mapping_uc(NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_SIZE); + +	printk(KERN_INFO "NumaChip: Mapping global CSR space (%016llx - %016llx)\n", +		NUMACHIP_GCSR_BASE, NUMACHIP_GCSR_BASE + NUMACHIP_GCSR_SIZE - 1); +	init_extra_mapping_uc(NUMACHIP_GCSR_BASE, NUMACHIP_GCSR_SIZE); +} + +static void fixup_cpu_id(struct cpuinfo_x86 *c, int node) +{ +	c->phys_proc_id = node; +	per_cpu(cpu_llc_id, smp_processor_id()) = node; +} + +static int __init numachip_system_init(void) +{ +	unsigned int val; + +	if (!numachip_system) +		return 0; + +	x86_cpuinit.fixup_cpu_id = fixup_cpu_id; + +	map_csrs(); + +	val = read_lcsr(CSR_G0_NODE_IDS); +	printk(KERN_INFO "NumaChip: Local NodeID = %08x\n", val); + +	return 0; +} +early_initcall(numachip_system_init); + +static int numachip_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ +	if (!strncmp(oem_id, "NUMASC", 6)) { +		numachip_system = 1; +		return 1; +	} + +	return 0; +} + +static struct apic apic_numachip __refconst = { + +	.name				= "NumaConnect system", +	.probe				= numachip_probe, +	.acpi_madt_oem_check		= numachip_acpi_madt_oem_check, +	.apic_id_registered		= numachip_apic_id_registered, + +	.irq_delivery_mode		= dest_Fixed, +	.irq_dest_mode			= 0, /* physical */ + +	.target_cpus			= numachip_target_cpus, +	.disable_esr			= 0, +	.dest_logical			= 0, +	.check_apicid_used		= NULL, +	.check_apicid_present		= NULL, + +	.vector_allocation_domain	= numachip_vector_allocation_domain, +	.init_apic_ldr			= flat_init_apic_ldr, + +	.ioapic_phys_id_map		= NULL, +	.setup_apic_routing		= NULL, +	
.multi_timer_check		= NULL, +	.cpu_present_to_apicid		= default_cpu_present_to_apicid, +	.apicid_to_cpu_present		= NULL, +	.setup_portio_remap		= NULL, +	.check_phys_apicid_present	= default_check_phys_apicid_present, +	.enable_apic_mode		= NULL, +	.phys_pkg_id			= numachip_phys_pkg_id, +	.mps_oem_check			= NULL, + +	.get_apic_id			= get_apic_id, +	.set_apic_id			= set_apic_id, +	.apic_id_mask			= 0xffU << 24, + +	.cpu_mask_to_apicid		= numachip_cpu_mask_to_apicid, +	.cpu_mask_to_apicid_and		= numachip_cpu_mask_to_apicid_and, + +	.send_IPI_mask			= numachip_send_IPI_mask, +	.send_IPI_mask_allbutself	= numachip_send_IPI_mask_allbutself, +	.send_IPI_allbutself		= numachip_send_IPI_allbutself, +	.send_IPI_all			= numachip_send_IPI_all, +	.send_IPI_self			= numachip_send_IPI_self, + +	.wakeup_secondary_cpu		= numachip_wakeup_secondary, +	.trampoline_phys_low		= DEFAULT_TRAMPOLINE_PHYS_LOW, +	.trampoline_phys_high		= DEFAULT_TRAMPOLINE_PHYS_HIGH, +	.wait_for_init_deassert		= NULL, +	.smp_callin_clear_local_apic	= NULL, +	.inquire_remote_apic		= NULL, /* REMRD not supported */ + +	.read				= native_apic_mem_read, +	.write				= native_apic_mem_write, +	.icr_read			= native_apic_icr_read, +	.icr_write			= native_apic_icr_write, +	.wait_icr_idle			= native_apic_wait_icr_idle, +	.safe_wait_icr_idle		= native_safe_apic_wait_icr_idle, +}; +apic_driver(apic_numachip); + diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 6d939d7847e..fb072754bc1 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2421,8 +2421,8 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)  	unsigned vector, me;  	ack_APIC_irq(); -	exit_idle();  	irq_enter(); +	exit_idle();  	me = smp_processor_id();  	for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { @@ -2948,6 +2948,10 @@ static inline void __init check_timer(void)  	}  	local_irq_disable();  	apic_printk(APIC_QUIET, KERN_INFO "..... 
failed :(.\n"); +	if (x2apic_preenabled) +		apic_printk(APIC_QUIET, KERN_INFO +			    "Perhaps problem with the pre-enabled x2apic mode\n" +			    "Try booting with x2apic and interrupt-remapping disabled in the bios.\n");  	panic("IO-APIC + timer doesn't work!  Boot with apic=debug and send a "  		"report.  Then try booting with the 'noapic' option.\n");  out: diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 62ae3001ae0..79b05b88aa1 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -93,6 +93,8 @@ static int __init early_get_pnodeid(void)  	if (node_id.s.part_number == UV2_HUB_PART_NUMBER)  		uv_min_hub_revision_id += UV2_HUB_REVISION_BASE - 1; +	if (node_id.s.part_number == UV2_HUB_PART_NUMBER_X) +		uv_min_hub_revision_id += UV2_HUB_REVISION_BASE - 1;  	uv_hub_info->hub_revision = uv_min_hub_revision_id;  	pnode = (node_id.s.node_id >> 1) & ((1 << m_n_config.s.n_skt) - 1); @@ -767,7 +769,12 @@ void __init uv_system_init(void)  	for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++)  		uv_possible_blades +=  		  hweight64(uv_read_local_mmr( UVH_NODE_PRESENT_TABLE + i * 8)); -	printk(KERN_DEBUG "UV: Found %d blades\n", uv_num_possible_blades()); + +	/* uv_num_possible_blades() is really the hub count */ +	printk(KERN_INFO "UV: Found %d blades, %d hubs\n", +			is_uv1_hub() ? 
uv_num_possible_blades() : +			(uv_num_possible_blades() + 1) / 2, +			uv_num_possible_blades());  	bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades();  	uv_blade_info = kzalloc(bytes, GFP_KERNEL); diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index a46bd383953..f76623cbe26 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -383,21 +383,21 @@ static int ignore_sys_suspend;  static int ignore_normal_resume;  static int bounce_interval __read_mostly = DEFAULT_BOUNCE_INTERVAL; -static int debug __read_mostly; -static int smp __read_mostly; +static bool debug __read_mostly; +static bool smp __read_mostly;  static int apm_disabled = -1;  #ifdef CONFIG_SMP -static int power_off; +static bool power_off;  #else -static int power_off = 1; +static bool power_off = 1;  #endif -static int realmode_power_off; +static bool realmode_power_off;  #ifdef CONFIG_APM_ALLOW_INTS -static int allow_ints = 1; +static bool allow_ints = 1;  #else -static int allow_ints; +static bool allow_ints;  #endif -static int broken_psr; +static bool broken_psr;  static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue);  static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue); diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 4f13fafc526..68de2dc962e 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -67,4 +67,6 @@ void common(void) {  	OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);  	OFFSET(BP_version, boot_params, hdr.version);  	OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment); +	OFFSET(BP_pref_address, boot_params, hdr.pref_address); +	OFFSET(BP_code32_start, boot_params, hdr.code32_start);  } diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 395a10e6806..85d98ab15cd 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -3,6 +3,11 @@  #include <linux/lguest.h>  #include 
"../../../drivers/lguest/lg.h" +#define __SYSCALL_I386(nr, sym, compat) [nr] = 1, +static char syscalls[] = { +#include <asm/syscalls_32.h> +}; +  /* workaround for a warning with -Wmissing-prototypes */  void foo(void); @@ -76,4 +81,7 @@ void foo(void)  	OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode);  	OFFSET(LGUEST_PAGES_regs, lguest_pages, regs);  #endif +	BLANK(); +	DEFINE(__NR_syscall_max, sizeof(syscalls) - 1); +	DEFINE(NR_syscalls, sizeof(syscalls));  } diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index e72a1194af2..834e897b1e2 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -1,11 +1,12 @@  #include <asm/ia32.h> -#define __NO_STUBS 1 -#undef __SYSCALL -#undef _ASM_X86_UNISTD_64_H -#define __SYSCALL(nr, sym) [nr] = 1, -static char syscalls[] = { -#include <asm/unistd.h> +#define __SYSCALL_64(nr, sym, compat) [nr] = 1, +static char syscalls_64[] = { +#include <asm/syscalls_64.h> +}; +#define __SYSCALL_I386(nr, sym, compat) [nr] = 1, +static char syscalls_ia32[] = { +#include <asm/syscalls_32.h>  };  int main(void) @@ -72,7 +73,11 @@ int main(void)  	OFFSET(TSS_ist, tss_struct, x86_tss.ist);  	BLANK(); -	DEFINE(__NR_syscall_max, sizeof(syscalls) - 1); +	DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1); +	DEFINE(NR_syscalls, sizeof(syscalls_64)); + +	DEFINE(__NR_ia32_syscall_max, sizeof(syscalls_ia32) - 1); +	DEFINE(IA32_NR_syscalls, sizeof(syscalls_ia32));  	return 0;  } diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c index 452932d3473..5da1269e8dd 100644 --- a/arch/x86/kernel/check.c +++ b/arch/x86/kernel/check.c @@ -62,7 +62,8 @@ early_param("memory_corruption_check_size", set_corruption_check_size);  void __init setup_bios_corruption_check(void)  { -	u64 addr = PAGE_SIZE;	/* assume first page is reserved anyway */ +	phys_addr_t start, end; +	u64 i;  	if (memory_corruption_check == -1) {  		memory_corruption_check = @@ -82,28 +83,23 @@ void __init 
setup_bios_corruption_check(void)  	corruption_check_size = round_up(corruption_check_size, PAGE_SIZE); -	while (addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) { -		u64 size; -		addr = memblock_x86_find_in_range_size(addr, &size, PAGE_SIZE); +	for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) { +		start = clamp_t(phys_addr_t, round_up(start, PAGE_SIZE), +				PAGE_SIZE, corruption_check_size); +		end = clamp_t(phys_addr_t, round_down(end, PAGE_SIZE), +			      PAGE_SIZE, corruption_check_size); +		if (start >= end) +			continue; -		if (addr == MEMBLOCK_ERROR) -			break; - -		if (addr >= corruption_check_size) -			break; - -		if ((addr + size) > corruption_check_size) -			size = corruption_check_size - addr; - -		memblock_x86_reserve_range(addr, addr + size, "SCAN RAM"); -		scan_areas[num_scan_areas].addr = addr; -		scan_areas[num_scan_areas].size = size; -		num_scan_areas++; +		memblock_reserve(start, end - start); +		scan_areas[num_scan_areas].addr = start; +		scan_areas[num_scan_areas].size = end - start;  		/* Assume we've already mapped this early memory */ -		memset(__va(addr), 0, size); +		memset(__va(start), 0, end - start); -		addr += size; +		if (++num_scan_areas >= MAX_SCAN_AREAS) +			break;  	}  	if (num_scan_areas) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index c7e46cb3532..f4773f4aae3 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -148,7 +148,6 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c)  static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)  { -#ifdef CONFIG_SMP  	/* calling is from identify_secondary_cpu() ? 
*/  	if (!c->cpu_index)  		return; @@ -192,7 +191,6 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)  valid_k7:  	; -#endif  }  static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c) @@ -353,6 +351,13 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)  	if (node == NUMA_NO_NODE)  		node = per_cpu(cpu_llc_id, cpu); +	/* +	 * If core numbers are inconsistent, it's likely a multi-fabric platform, +	 * so invoke platform-specific handler +	 */ +	if (c->phys_proc_id != node) +		x86_cpuinit.fixup_cpu_id(c, node); +  	if (!node_online(node)) {  		/*  		 * Two possibilities here: @@ -442,8 +447,6 @@ static void __cpuinit bsp_init_amd(struct cpuinfo_x86 *c)  static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)  { -	u32 dummy; -  	early_init_amd_mc(c);  	/* @@ -473,12 +476,12 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)  			set_cpu_cap(c, X86_FEATURE_EXTD_APICID);  	}  #endif - -	rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);  }  static void __cpuinit init_amd(struct cpuinfo_x86 *c)  { +	u32 dummy; +  #ifdef CONFIG_SMP  	unsigned long long value; @@ -657,6 +660,8 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)  			checking_wrmsrl(MSR_AMD64_MCx_MASK(4), mask);  		}  	} + +	rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);  }  #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index e58d978e075..159103c0b1f 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -278,7 +278,7 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c)  	}  #ifdef CONFIG_X86_32  	/* Cyrix III family needs CX8 & PGE explicitly enabled. 
*/ -	if (c->x86_model >= 6 && c->x86_model <= 9) { +	if (c->x86_model >= 6 && c->x86_model <= 13) {  		rdmsr(MSR_VIA_FCR, lo, hi);  		lo |= (1<<1 | 1<<7);  		wrmsr(MSR_VIA_FCR, lo, hi); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index aa003b13a83..c0f7d68d318 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -676,9 +676,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)  	if (this_cpu->c_early_init)  		this_cpu->c_early_init(c); -#ifdef CONFIG_SMP  	c->cpu_index = 0; -#endif  	filter_cpuid_features(c, false);  	setup_smep(c); @@ -764,10 +762,7 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c)  		c->apicid = c->initial_apicid;  # endif  #endif - -#ifdef CONFIG_X86_HT  		c->phys_proc_id = c->initial_apicid; -#endif  	}  	setup_smep(c); @@ -1026,6 +1021,8 @@ __setup("clearcpuid=", setup_disablecpuid);  #ifdef CONFIG_X86_64  struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table }; +struct desc_ptr nmi_idt_descr = { NR_VECTORS * 16 - 1, +				    (unsigned long) nmi_idt_table };  DEFINE_PER_CPU_FIRST(union irq_stack_union,  		     irq_stack_union) __aligned(PAGE_SIZE); @@ -1047,6 +1044,9 @@ DEFINE_PER_CPU(char *, irq_stack_ptr) =  DEFINE_PER_CPU(unsigned int, irq_count) = -1; +DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); +EXPORT_PER_CPU_SYMBOL(fpu_owner_task); +  /*   * Special IST stacks which the CPU switches to when it calls   * an IST-marked descriptor entry. 
Up to 7 stacks (hardware @@ -1090,10 +1090,32 @@ unsigned long kernel_eflags;   */  DEFINE_PER_CPU(struct orig_ist, orig_ist); +static DEFINE_PER_CPU(unsigned long, debug_stack_addr); +DEFINE_PER_CPU(int, debug_stack_usage); + +int is_debug_stack(unsigned long addr) +{ +	return __get_cpu_var(debug_stack_usage) || +		(addr <= __get_cpu_var(debug_stack_addr) && +		 addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ)); +} + +void debug_stack_set_zero(void) +{ +	load_idt((const struct desc_ptr *)&nmi_idt_descr); +} + +void debug_stack_reset(void) +{ +	load_idt((const struct desc_ptr *)&idt_descr); +} +  #else	/* CONFIG_X86_64 */  DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;  EXPORT_PER_CPU_SYMBOL(current_task); +DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); +EXPORT_PER_CPU_SYMBOL(fpu_owner_task);  #ifdef CONFIG_CC_STACKPROTECTOR  DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); @@ -1141,6 +1163,15 @@ static void dbg_restore_debug_regs(void)  #endif /* ! CONFIG_KGDB */  /* + * Prints an error where the NUMA and configured core-number mismatch and the + * platform didn't override this to fix it up + */ +void __cpuinit x86_default_fixup_cpu_id(struct cpuinfo_x86 *c, int node) +{ +	pr_err("NUMA core number %d differs from configured core number %d\n", node, c->phys_proc_id); +} + +/*   * cpu_init() initializes state that is per-CPU. Some data is already   * initialized (naturally) in the bootstrap process, such as the GDT   * and IDT. 
We reload them nevertheless, this function acts as a @@ -1208,6 +1239,8 @@ void __cpuinit cpu_init(void)  			estacks += exception_stack_sizes[v];  			oist->ist[v] = t->x86_tss.ist[v] =  					(unsigned long)estacks; +			if (v == DEBUG_STACK-1) +				per_cpu(debug_stack_addr, cpu) = (unsigned long)estacks;  		}  	} diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 1b22dcc51af..8bacc7826fb 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -1,5 +1,4 @@  #ifndef ARCH_X86_CPU_H -  #define ARCH_X86_CPU_H  struct cpu_model_info { @@ -35,6 +34,4 @@ extern const struct cpu_dev *const __x86_cpu_dev_start[],  extern void get_cpu_cap(struct cpuinfo_x86 *c);  extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c); -extern void get_cpu_cap(struct cpuinfo_x86 *c); - -#endif +#endif /* ARCH_X86_CPU_H */ diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 523131213f0..3e6ff6cbf42 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -181,7 +181,6 @@ static void __cpuinit trap_init_f00f_bug(void)  static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c)  { -#ifdef CONFIG_SMP  	/* calling is from identify_secondary_cpu() ? 
*/  	if (!c->cpu_index)  		return; @@ -198,7 +197,6 @@ static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c)  		WARN_ONCE(1, "WARNING: SMP operation may be unreliable"  				    "with B stepping processors.\n");  	} -#endif  }  static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index a3b0811693c..73d08ed98a6 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -326,8 +326,7 @@ static void __cpuinit amd_calc_l3_indices(struct amd_northbridge *nb)  	l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1;  } -static void __cpuinit amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, -					int index) +static void __cpuinit amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index)  {  	int node; @@ -725,14 +724,16 @@ static DEFINE_PER_CPU(struct _cpuid4_info *, ici_cpuid4_info);  #define CPUID4_INFO_IDX(x, y)	(&((per_cpu(ici_cpuid4_info, x))[y]))  #ifdef CONFIG_SMP -static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) + +static int __cpuinit cache_shared_amd_cpu_map_setup(unsigned int cpu, int index)  { -	struct _cpuid4_info	*this_leaf, *sibling_leaf; -	unsigned long num_threads_sharing; -	int index_msb, i, sibling; +	struct _cpuid4_info *this_leaf; +	int ret, i, sibling;  	struct cpuinfo_x86 *c = &cpu_data(cpu); -	if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) { +	ret = 0; +	if (index == 3) { +		ret = 1;  		for_each_cpu(i, cpu_llc_shared_mask(cpu)) {  			if (!per_cpu(ici_cpuid4_info, i))  				continue; @@ -743,8 +744,35 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)  				set_bit(sibling, this_leaf->shared_cpu_map);  			}  		} -		return; +	} else if ((c->x86 == 0x15) && ((index == 1) || (index == 2))) { +		ret = 1; +		for_each_cpu(i, cpu_sibling_mask(cpu)) { +			if (!per_cpu(ici_cpuid4_info, i)) +				continue; +			this_leaf = 
CPUID4_INFO_IDX(i, index); +			for_each_cpu(sibling, cpu_sibling_mask(cpu)) { +				if (!cpu_online(sibling)) +					continue; +				set_bit(sibling, this_leaf->shared_cpu_map); +			} +		}  	} + +	return ret; +} + +static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) +{ +	struct _cpuid4_info *this_leaf, *sibling_leaf; +	unsigned long num_threads_sharing; +	int index_msb, i; +	struct cpuinfo_x86 *c = &cpu_data(cpu); + +	if (c->x86_vendor == X86_VENDOR_AMD) { +		if (cache_shared_amd_cpu_map_setup(cpu, index)) +			return; +	} +  	this_leaf = CPUID4_INFO_IDX(cpu, index);  	num_threads_sharing = 1 + this_leaf->base.eax.split.num_threads_sharing; @@ -844,8 +872,7 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu)  #include <linux/kobject.h>  #include <linux/sysfs.h> - -extern struct sysdev_class cpu_sysdev_class; /* from drivers/base/cpu.c */ +#include <linux/cpu.h>  /* pointer to kobject for cpuX/cache */  static DEFINE_PER_CPU(struct kobject *, ici_cache_kobject); @@ -1073,9 +1100,9 @@ err_out:  static DECLARE_BITMAP(cache_dev_map, NR_CPUS);  /* Add/Remove cache interface for CPU device */ -static int __cpuinit cache_add_dev(struct sys_device * sys_dev) +static int __cpuinit cache_add_dev(struct device *dev)  { -	unsigned int cpu = sys_dev->id; +	unsigned int cpu = dev->id;  	unsigned long i, j;  	struct _index_kobject *this_object;  	struct _cpuid4_info   *this_leaf; @@ -1087,7 +1114,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)  	retval = kobject_init_and_add(per_cpu(ici_cache_kobject, cpu),  				      &ktype_percpu_entry, -				      &sys_dev->kobj, "%s", "cache"); +				      &dev->kobj, "%s", "cache");  	if (retval < 0) {  		cpuid4_cache_sysfs_exit(cpu);  		return retval; @@ -1124,9 +1151,9 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)  	return 0;  } -static void __cpuinit cache_remove_dev(struct sys_device * sys_dev) +static void __cpuinit cache_remove_dev(struct device *dev)  { -	
unsigned int cpu = sys_dev->id; +	unsigned int cpu = dev->id;  	unsigned long i;  	if (per_cpu(ici_cpuid4_info, cpu) == NULL) @@ -1145,17 +1172,17 @@ static int __cpuinit cacheinfo_cpu_callback(struct notifier_block *nfb,  					unsigned long action, void *hcpu)  {  	unsigned int cpu = (unsigned long)hcpu; -	struct sys_device *sys_dev; +	struct device *dev; -	sys_dev = get_cpu_sysdev(cpu); +	dev = get_cpu_device(cpu);  	switch (action) {  	case CPU_ONLINE:  	case CPU_ONLINE_FROZEN: -		cache_add_dev(sys_dev); +		cache_add_dev(dev);  		break;  	case CPU_DEAD:  	case CPU_DEAD_FROZEN: -		cache_remove_dev(sys_dev); +		cache_remove_dev(dev);  		break;  	}  	return NOTIFY_OK; @@ -1174,9 +1201,9 @@ static int __cpuinit cache_sysfs_init(void)  	for_each_online_cpu(i) {  		int err; -		struct sys_device *sys_dev = get_cpu_sysdev(i); +		struct device *dev = get_cpu_device(i); -		err = cache_add_dev(sys_dev); +		err = cache_add_dev(dev);  		if (err)  			return err;  	} diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index 319882ef848..fc4beb39357 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -17,6 +17,7 @@  #include <linux/kernel.h>  #include <linux/string.h>  #include <linux/fs.h> +#include <linux/preempt.h>  #include <linux/smp.h>  #include <linux/notifier.h>  #include <linux/kdebug.h> @@ -92,6 +93,18 @@ static int mce_raise_notify(unsigned int cmd, struct pt_regs *regs)  	return NMI_HANDLED;  } +static void mce_irq_ipi(void *info) +{ +	int cpu = smp_processor_id(); +	struct mce *m = &__get_cpu_var(injectm); + +	if (cpumask_test_cpu(cpu, mce_inject_cpumask) && +			m->inject_flags & MCJ_EXCEPTION) { +		cpumask_clear_cpu(cpu, mce_inject_cpumask); +		raise_exception(m, NULL); +	} +} +  /* Inject mce on current CPU */  static int raise_local(void)  { @@ -139,9 +152,10 @@ static void raise_mce(struct mce *m)  		return;  #ifdef CONFIG_X86_LOCAL_APIC -	if (m->inject_flags & 
MCJ_NMI_BROADCAST) { +	if (m->inject_flags & (MCJ_IRQ_BRAODCAST | MCJ_NMI_BROADCAST)) {  		unsigned long start;  		int cpu; +  		get_online_cpus();  		cpumask_copy(mce_inject_cpumask, cpu_online_mask);  		cpumask_clear_cpu(get_cpu(), mce_inject_cpumask); @@ -151,13 +165,25 @@ static void raise_mce(struct mce *m)  			    MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM)  				cpumask_clear_cpu(cpu, mce_inject_cpumask);  		} -		if (!cpumask_empty(mce_inject_cpumask)) -			apic->send_IPI_mask(mce_inject_cpumask, NMI_VECTOR); +		if (!cpumask_empty(mce_inject_cpumask)) { +			if (m->inject_flags & MCJ_IRQ_BRAODCAST) { +				/* +				 * don't wait because mce_irq_ipi is necessary +				 * to be sync with following raise_local +				 */ +				preempt_disable(); +				smp_call_function_many(mce_inject_cpumask, +					mce_irq_ipi, NULL, 0); +				preempt_enable(); +			} else if (m->inject_flags & MCJ_NMI_BROADCAST) +				apic->send_IPI_mask(mce_inject_cpumask, +						NMI_VECTOR); +		}  		start = jiffies;  		while (!cpumask_empty(mce_inject_cpumask)) {  			if (!time_before(jiffies, start + 2*HZ)) {  				printk(KERN_ERR -				"Timeout waiting for mce inject NMI %lx\n", +				"Timeout waiting for mce inject %lx\n",  					*cpumask_bits(mce_inject_cpumask));  				break;  			} diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index fefcc69ee8b..ed44c8a6585 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -1,4 +1,4 @@ -#include <linux/sysdev.h> +#include <linux/device.h>  #include <asm/mce.h>  enum severity_level { @@ -17,7 +17,7 @@ enum severity_level {  struct mce_bank {  	u64			ctl;			/* subevents to enable */  	unsigned char init;				/* initialise bank? 
*/ -	struct sysdev_attribute attr;			/* sysdev attribute */ +	struct device_attribute attr;			/* device attribute */  	char			attrname[ATTR_LEN];	/* attribute name */  }; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 2af127d4c3d..5a11ae2e9e9 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -19,7 +19,7 @@  #include <linux/kernel.h>  #include <linux/percpu.h>  #include <linux/string.h> -#include <linux/sysdev.h> +#include <linux/device.h>  #include <linux/syscore_ops.h>  #include <linux/delay.h>  #include <linux/ctype.h> @@ -95,13 +95,6 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);  static DEFINE_PER_CPU(struct mce, mces_seen);  static int			cpu_missing; -/* - * CPU/chipset specific EDAC code can register a notifier call here to print - * MCE errors in a human-readable form. - */ -ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain); -EXPORT_SYMBOL_GPL(x86_mce_decoder_chain); -  /* MCA banks polled by the period polling timer for corrected events */  DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {  	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL @@ -109,6 +102,12 @@ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {  static DEFINE_PER_CPU(struct work_struct, mce_work); +/* + * CPU/chipset specific EDAC code can register a notifier call here to print + * MCE errors in a human-readable form. 
+ */ +ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain); +  /* Do initial initialization of a struct mce */  void mce_setup(struct mce *m)  { @@ -119,9 +118,7 @@ void mce_setup(struct mce *m)  	m->time = get_seconds();  	m->cpuvendor = boot_cpu_data.x86_vendor;  	m->cpuid = cpuid_eax(1); -#ifdef CONFIG_SMP  	m->socketid = cpu_data(m->extcpu).phys_proc_id; -#endif  	m->apicid = cpu_data(m->extcpu).initial_apicid;  	rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);  } @@ -190,6 +187,57 @@ void mce_log(struct mce *mce)  	set_bit(0, &mce_need_notify);  } +static void drain_mcelog_buffer(void) +{ +	unsigned int next, i, prev = 0; + +	next = rcu_dereference_check_mce(mcelog.next); + +	do { +		struct mce *m; + +		/* drain what was logged during boot */ +		for (i = prev; i < next; i++) { +			unsigned long start = jiffies; +			unsigned retries = 1; + +			m = &mcelog.entry[i]; + +			while (!m->finished) { +				if (time_after_eq(jiffies, start + 2*retries)) +					retries++; + +				cpu_relax(); + +				if (!m->finished && retries >= 4) { +					pr_err("MCE: skipping error being logged currently!\n"); +					break; +				} +			} +			smp_rmb(); +			atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); +		} + +		memset(mcelog.entry + prev, 0, (next - prev) * sizeof(*m)); +		prev = next; +		next = cmpxchg(&mcelog.next, prev, 0); +	} while (next != prev); +} + + +void mce_register_decode_chain(struct notifier_block *nb) +{ +	atomic_notifier_chain_register(&x86_mce_decoder_chain, nb); +	drain_mcelog_buffer(); +} +EXPORT_SYMBOL_GPL(mce_register_decode_chain); + +void mce_unregister_decode_chain(struct notifier_block *nb) +{ +	atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb); +} +EXPORT_SYMBOL_GPL(mce_unregister_decode_chain); +  static void print_mce(struct mce *m)  {  	int ret = 0; @@ -1770,7 +1818,7 @@ static struct syscore_ops mce_syscore_ops = {  };  /* - * mce_sysdev: Sysfs support + * mce_device: Sysfs support   */  static void mce_cpu_restart(void *data) @@ -1806,27 +1854,28 @@ 
static void mce_enable_ce(void *all)  		__mcheck_cpu_init_timer();  } -static struct sysdev_class mce_sysdev_class = { +static struct bus_type mce_subsys = {  	.name		= "machinecheck", +	.dev_name	= "machinecheck",  }; -DEFINE_PER_CPU(struct sys_device, mce_sysdev); +struct device *mce_device[CONFIG_NR_CPUS];  __cpuinitdata  void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); -static inline struct mce_bank *attr_to_bank(struct sysdev_attribute *attr) +static inline struct mce_bank *attr_to_bank(struct device_attribute *attr)  {  	return container_of(attr, struct mce_bank, attr);  } -static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, +static ssize_t show_bank(struct device *s, struct device_attribute *attr,  			 char *buf)  {  	return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);  } -static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, +static ssize_t set_bank(struct device *s, struct device_attribute *attr,  			const char *buf, size_t size)  {  	u64 new; @@ -1841,14 +1890,14 @@ static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,  }  static ssize_t -show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf) +show_trigger(struct device *s, struct device_attribute *attr, char *buf)  {  	strcpy(buf, mce_helper);  	strcat(buf, "\n");  	return strlen(mce_helper) + 1;  } -static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, +static ssize_t set_trigger(struct device *s, struct device_attribute *attr,  				const char *buf, size_t siz)  {  	char *p; @@ -1863,8 +1912,8 @@ static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,  	return strlen(mce_helper) + !!p;  } -static ssize_t set_ignore_ce(struct sys_device *s, -			     struct sysdev_attribute *attr, +static ssize_t set_ignore_ce(struct device *s, +			     struct device_attribute *attr,  			     const char *buf, size_t size)  {  	u64 new; @@ -1887,8 +1936,8 
@@ static ssize_t set_ignore_ce(struct sys_device *s,  	return size;  } -static ssize_t set_cmci_disabled(struct sys_device *s, -				 struct sysdev_attribute *attr, +static ssize_t set_cmci_disabled(struct device *s, +				 struct device_attribute *attr,  				 const char *buf, size_t size)  {  	u64 new; @@ -1910,108 +1959,117 @@ static ssize_t set_cmci_disabled(struct sys_device *s,  	return size;  } -static ssize_t store_int_with_restart(struct sys_device *s, -				      struct sysdev_attribute *attr, +static ssize_t store_int_with_restart(struct device *s, +				      struct device_attribute *attr,  				      const char *buf, size_t size)  { -	ssize_t ret = sysdev_store_int(s, attr, buf, size); +	ssize_t ret = device_store_int(s, attr, buf, size);  	mce_restart();  	return ret;  } -static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); -static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); -static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout); -static SYSDEV_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce); +static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger); +static DEVICE_INT_ATTR(tolerant, 0644, tolerant); +static DEVICE_INT_ATTR(monarch_timeout, 0644, monarch_timeout); +static DEVICE_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce); -static struct sysdev_ext_attribute attr_check_interval = { -	_SYSDEV_ATTR(check_interval, 0644, sysdev_show_int, -		     store_int_with_restart), +static struct dev_ext_attribute dev_attr_check_interval = { +	__ATTR(check_interval, 0644, device_show_int, store_int_with_restart),  	&check_interval  }; -static struct sysdev_ext_attribute attr_ignore_ce = { -	_SYSDEV_ATTR(ignore_ce, 0644, sysdev_show_int, set_ignore_ce), +static struct dev_ext_attribute dev_attr_ignore_ce = { +	__ATTR(ignore_ce, 0644, device_show_int, set_ignore_ce),  	&mce_ignore_ce  }; -static struct sysdev_ext_attribute attr_cmci_disabled = { -	_SYSDEV_ATTR(cmci_disabled, 0644, sysdev_show_int, set_cmci_disabled), +static struct dev_ext_attribute 
dev_attr_cmci_disabled = { +	__ATTR(cmci_disabled, 0644, device_show_int, set_cmci_disabled),  	&mce_cmci_disabled  }; -static struct sysdev_attribute *mce_sysdev_attrs[] = { -	&attr_tolerant.attr, -	&attr_check_interval.attr, -	&attr_trigger, -	&attr_monarch_timeout.attr, -	&attr_dont_log_ce.attr, -	&attr_ignore_ce.attr, -	&attr_cmci_disabled.attr, +static struct device_attribute *mce_device_attrs[] = { +	&dev_attr_tolerant.attr, +	&dev_attr_check_interval.attr, +	&dev_attr_trigger, +	&dev_attr_monarch_timeout.attr, +	&dev_attr_dont_log_ce.attr, +	&dev_attr_ignore_ce.attr, +	&dev_attr_cmci_disabled.attr,  	NULL  }; -static cpumask_var_t mce_sysdev_initialized; +static cpumask_var_t mce_device_initialized; + +static void mce_device_release(struct device *dev) +{ +	kfree(dev); +} -/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */ -static __cpuinit int mce_sysdev_create(unsigned int cpu) +/* Per cpu device init. All of the cpus still share the same ctrl bank: */ +static __cpuinit int mce_device_create(unsigned int cpu)  { -	struct sys_device *sysdev = &per_cpu(mce_sysdev, cpu); +	struct device *dev;  	int err;  	int i, j;  	if (!mce_available(&boot_cpu_data))  		return -EIO; -	memset(&sysdev->kobj, 0, sizeof(struct kobject)); -	sysdev->id  = cpu; -	sysdev->cls = &mce_sysdev_class; +	dev = kzalloc(sizeof *dev, GFP_KERNEL); +	if (!dev) +		return -ENOMEM; +	dev->id  = cpu; +	dev->bus = &mce_subsys; +	dev->release = &mce_device_release; -	err = sysdev_register(sysdev); +	err = device_register(dev);  	if (err)  		return err; -	for (i = 0; mce_sysdev_attrs[i]; i++) { -		err = sysdev_create_file(sysdev, mce_sysdev_attrs[i]); +	for (i = 0; mce_device_attrs[i]; i++) { +		err = device_create_file(dev, mce_device_attrs[i]);  		if (err)  			goto error;  	}  	for (j = 0; j < banks; j++) { -		err = sysdev_create_file(sysdev, &mce_banks[j].attr); +		err = device_create_file(dev, &mce_banks[j].attr);  		if (err)  			goto error2;  	} -	cpumask_set_cpu(cpu, 
mce_sysdev_initialized); +	cpumask_set_cpu(cpu, mce_device_initialized); +	mce_device[cpu] = dev;  	return 0;  error2:  	while (--j >= 0) -		sysdev_remove_file(sysdev, &mce_banks[j].attr); +		device_remove_file(dev, &mce_banks[j].attr);  error:  	while (--i >= 0) -		sysdev_remove_file(sysdev, mce_sysdev_attrs[i]); +		device_remove_file(dev, mce_device_attrs[i]); -	sysdev_unregister(sysdev); +	device_unregister(dev);  	return err;  } -static __cpuinit void mce_sysdev_remove(unsigned int cpu) +static __cpuinit void mce_device_remove(unsigned int cpu)  { -	struct sys_device *sysdev = &per_cpu(mce_sysdev, cpu); +	struct device *dev = mce_device[cpu];  	int i; -	if (!cpumask_test_cpu(cpu, mce_sysdev_initialized)) +	if (!cpumask_test_cpu(cpu, mce_device_initialized))  		return; -	for (i = 0; mce_sysdev_attrs[i]; i++) -		sysdev_remove_file(sysdev, mce_sysdev_attrs[i]); +	for (i = 0; mce_device_attrs[i]; i++) +		device_remove_file(dev, mce_device_attrs[i]);  	for (i = 0; i < banks; i++) -		sysdev_remove_file(sysdev, &mce_banks[i].attr); +		device_remove_file(dev, &mce_banks[i].attr); -	sysdev_unregister(sysdev); -	cpumask_clear_cpu(cpu, mce_sysdev_initialized); +	device_unregister(dev); +	cpumask_clear_cpu(cpu, mce_device_initialized); +	mce_device[cpu] = NULL;  }  /* Make sure there are no machine checks on offlined CPUs. 
*/ @@ -2061,7 +2119,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)  	switch (action) {  	case CPU_ONLINE:  	case CPU_ONLINE_FROZEN: -		mce_sysdev_create(cpu); +		mce_device_create(cpu);  		if (threshold_cpu_callback)  			threshold_cpu_callback(action, cpu);  		break; @@ -2069,7 +2127,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)  	case CPU_DEAD_FROZEN:  		if (threshold_cpu_callback)  			threshold_cpu_callback(action, cpu); -		mce_sysdev_remove(cpu); +		mce_device_remove(cpu);  		break;  	case CPU_DOWN_PREPARE:  	case CPU_DOWN_PREPARE_FROZEN: @@ -2103,7 +2161,7 @@ static __init void mce_init_banks(void)  	for (i = 0; i < banks; i++) {  		struct mce_bank *b = &mce_banks[i]; -		struct sysdev_attribute *a = &b->attr; +		struct device_attribute *a = &b->attr;  		sysfs_attr_init(&a->attr);  		a->attr.name	= b->attrname; @@ -2123,16 +2181,16 @@ static __init int mcheck_init_device(void)  	if (!mce_available(&boot_cpu_data))  		return -EIO; -	zalloc_cpumask_var(&mce_sysdev_initialized, GFP_KERNEL); +	zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL);  	mce_init_banks(); -	err = sysdev_class_register(&mce_sysdev_class); +	err = subsys_system_register(&mce_subsys, NULL);  	if (err)  		return err;  	for_each_online_cpu(i) { -		err = mce_sysdev_create(i); +		err = mce_device_create(i);  		if (err)  			return err;  	} diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index f5474218cff..e4eeaaf58a4 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -17,7 +17,6 @@  #include <linux/notifier.h>  #include <linux/kobject.h>  #include <linux/percpu.h> -#include <linux/sysdev.h>  #include <linux/errno.h>  #include <linux/sched.h>  #include <linux/sysfs.h> @@ -64,11 +63,9 @@ struct threshold_bank {  };  static DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks); -#ifdef CONFIG_SMP  static unsigned char 
shared_bank[NR_BANKS] = {  	0, 0, 0, 0, 1  }; -#endif  static DEFINE_PER_CPU(unsigned char, bank_map);	/* see which banks are on */ @@ -202,10 +199,9 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)  			if (!block)  				per_cpu(bank_map, cpu) |= (1 << bank); -#ifdef CONFIG_SMP  			if (shared_bank[bank] && c->cpu_core_id)  				break; -#endif +  			offset = setup_APIC_mce(offset,  						(high & MASK_LVTOFF_HI) >> 20); @@ -527,6 +523,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)  {  	int i, err = 0;  	struct threshold_bank *b = NULL; +	struct device *dev = mce_device[cpu];  	char name[32];  	sprintf(name, "threshold_bank%i", bank); @@ -548,8 +545,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)  		if (!b)  			goto out; -		err = sysfs_create_link(&per_cpu(mce_sysdev, cpu).kobj, -					b->kobj, name); +		err = sysfs_create_link(&dev->kobj, b->kobj, name);  		if (err)  			goto out; @@ -571,7 +567,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)  		goto out;  	} -	b->kobj = kobject_create_and_add(name, &per_cpu(mce_sysdev, cpu).kobj); +	b->kobj = kobject_create_and_add(name, &dev->kobj);  	if (!b->kobj)  		goto out_free; @@ -591,8 +587,9 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)  		if (i == cpu)  			continue; -		err = sysfs_create_link(&per_cpu(mce_sysdev, i).kobj, -					b->kobj, name); +		dev = mce_device[i]; +		if (dev) +			err = sysfs_create_link(&dev->kobj,b->kobj, name);  		if (err)  			goto out; @@ -655,6 +652,7 @@ static void deallocate_threshold_block(unsigned int cpu,  static void threshold_remove_bank(unsigned int cpu, int bank)  {  	struct threshold_bank *b; +	struct device *dev;  	char name[32];  	int i = 0; @@ -669,7 +667,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)  #ifdef CONFIG_SMP  	/* sibling symlink */  	if (shared_bank[bank] && b->blocks->cpu != cpu) { -		
sysfs_remove_link(&per_cpu(mce_sysdev, cpu).kobj, name); +		sysfs_remove_link(&mce_device[cpu]->kobj, name);  		per_cpu(threshold_banks, cpu)[bank] = NULL;  		return; @@ -681,7 +679,9 @@ static void threshold_remove_bank(unsigned int cpu, int bank)  		if (i == cpu)  			continue; -		sysfs_remove_link(&per_cpu(mce_sysdev, i).kobj, name); +		dev = mce_device[i]; +		if (dev) +			sysfs_remove_link(&dev->kobj, name);  		per_cpu(threshold_banks, i)[bank] = NULL;  	} diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 787e06c84ea..67bb17a37a0 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -19,7 +19,6 @@  #include <linux/kernel.h>  #include <linux/percpu.h>  #include <linux/export.h> -#include <linux/sysdev.h>  #include <linux/types.h>  #include <linux/init.h>  #include <linux/smp.h> @@ -69,16 +68,16 @@ static atomic_t therm_throt_en	= ATOMIC_INIT(0);  static u32 lvtthmr_init __read_mostly;  #ifdef CONFIG_SYSFS -#define define_therm_throt_sysdev_one_ro(_name)				\ -	static SYSDEV_ATTR(_name, 0444,					\ -			   therm_throt_sysdev_show_##_name,		\ +#define define_therm_throt_device_one_ro(_name)				\ +	static DEVICE_ATTR(_name, 0444,					\ +			   therm_throt_device_show_##_name,		\  				   NULL)				\ -#define define_therm_throt_sysdev_show_func(event, name)		\ +#define define_therm_throt_device_show_func(event, name)		\  									\ -static ssize_t therm_throt_sysdev_show_##event##_##name(		\ -			struct sys_device *dev,				\ -			struct sysdev_attribute *attr,			\ +static ssize_t therm_throt_device_show_##event##_##name(		\ +			struct device *dev,				\ +			struct device_attribute *attr,			\  			char *buf)					\  {									\  	unsigned int cpu = dev->id;					\ @@ -95,20 +94,20 @@ static ssize_t therm_throt_sysdev_show_##event##_##name(		\  	return ret;							\  } -define_therm_throt_sysdev_show_func(core_throttle, count); 
-define_therm_throt_sysdev_one_ro(core_throttle_count); +define_therm_throt_device_show_func(core_throttle, count); +define_therm_throt_device_one_ro(core_throttle_count); -define_therm_throt_sysdev_show_func(core_power_limit, count); -define_therm_throt_sysdev_one_ro(core_power_limit_count); +define_therm_throt_device_show_func(core_power_limit, count); +define_therm_throt_device_one_ro(core_power_limit_count); -define_therm_throt_sysdev_show_func(package_throttle, count); -define_therm_throt_sysdev_one_ro(package_throttle_count); +define_therm_throt_device_show_func(package_throttle, count); +define_therm_throt_device_one_ro(package_throttle_count); -define_therm_throt_sysdev_show_func(package_power_limit, count); -define_therm_throt_sysdev_one_ro(package_power_limit_count); +define_therm_throt_device_show_func(package_power_limit, count); +define_therm_throt_device_one_ro(package_power_limit_count);  static struct attribute *thermal_throttle_attrs[] = { -	&attr_core_throttle_count.attr, +	&dev_attr_core_throttle_count.attr,  	NULL  }; @@ -223,36 +222,36 @@ static int thresh_event_valid(int event)  #ifdef CONFIG_SYSFS  /* Add/Remove thermal_throttle interface for CPU device: */ -static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev, +static __cpuinit int thermal_throttle_add_dev(struct device *dev,  				unsigned int cpu)  {  	int err;  	struct cpuinfo_x86 *c = &cpu_data(cpu); -	err = sysfs_create_group(&sys_dev->kobj, &thermal_attr_group); +	err = sysfs_create_group(&dev->kobj, &thermal_attr_group);  	if (err)  		return err;  	if (cpu_has(c, X86_FEATURE_PLN)) -		err = sysfs_add_file_to_group(&sys_dev->kobj, -					      &attr_core_power_limit_count.attr, +		err = sysfs_add_file_to_group(&dev->kobj, +					      &dev_attr_core_power_limit_count.attr,  					      thermal_attr_group.name);  	if (cpu_has(c, X86_FEATURE_PTS)) { -		err = sysfs_add_file_to_group(&sys_dev->kobj, -					      &attr_package_throttle_count.attr, +		err = 
sysfs_add_file_to_group(&dev->kobj, +					      &dev_attr_package_throttle_count.attr,  					      thermal_attr_group.name);  		if (cpu_has(c, X86_FEATURE_PLN)) -			err = sysfs_add_file_to_group(&sys_dev->kobj, -					&attr_package_power_limit_count.attr, +			err = sysfs_add_file_to_group(&dev->kobj, +					&dev_attr_package_power_limit_count.attr,  					thermal_attr_group.name);  	}  	return err;  } -static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) +static __cpuinit void thermal_throttle_remove_dev(struct device *dev)  { -	sysfs_remove_group(&sys_dev->kobj, &thermal_attr_group); +	sysfs_remove_group(&dev->kobj, &thermal_attr_group);  }  /* Mutex protecting device creation against CPU hotplug: */ @@ -265,16 +264,16 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb,  			      void *hcpu)  {  	unsigned int cpu = (unsigned long)hcpu; -	struct sys_device *sys_dev; +	struct device *dev;  	int err = 0; -	sys_dev = get_cpu_sysdev(cpu); +	dev = get_cpu_device(cpu);  	switch (action) {  	case CPU_UP_PREPARE:  	case CPU_UP_PREPARE_FROZEN:  		mutex_lock(&therm_cpu_lock); -		err = thermal_throttle_add_dev(sys_dev, cpu); +		err = thermal_throttle_add_dev(dev, cpu);  		mutex_unlock(&therm_cpu_lock);  		WARN_ON(err);  		break; @@ -283,7 +282,7 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb,  	case CPU_DEAD:  	case CPU_DEAD_FROZEN:  		mutex_lock(&therm_cpu_lock); -		thermal_throttle_remove_dev(sys_dev); +		thermal_throttle_remove_dev(dev);  		mutex_unlock(&therm_cpu_lock);  		break;  	} @@ -310,7 +309,7 @@ static __init int thermal_throttle_init_device(void)  #endif  	/* connect live CPUs to sysfs */  	for_each_online_cpu(cpu) { -		err = thermal_throttle_add_dev(get_cpu_sysdev(cpu), cpu); +		err = thermal_throttle_add_dev(get_cpu_device(cpu), cpu);  		WARN_ON(err);  	}  #ifdef CONFIG_HOTPLUG_CPU @@ -323,17 +322,6 @@ device_initcall(thermal_throttle_init_device);  #endif /* CONFIG_SYSFS */ -/* - * Set up the most two significant bit 
to notify mce log that this thermal - * event type. - * This is a temp solution. May be changed in the future with mce log - * infrasture. - */ -#define CORE_THROTTLED		(0) -#define CORE_POWER_LIMIT	((__u64)1 << 62) -#define PACKAGE_THROTTLED	((__u64)2 << 62) -#define PACKAGE_POWER_LIMIT	((__u64)3 << 62) -  static void notify_thresholds(__u64 msr_val)  {  	/* check whether the interrupt handler is defined; @@ -363,27 +351,23 @@ static void intel_thermal_interrupt(void)  	if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT,  				THERMAL_THROTTLING_EVENT,  				CORE_LEVEL) != 0) -		mce_log_therm_throt_event(CORE_THROTTLED | msr_val); +		mce_log_therm_throt_event(msr_val);  	if (this_cpu_has(X86_FEATURE_PLN)) -		if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT, +		therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT,  					POWER_LIMIT_EVENT, -					CORE_LEVEL) != 0) -			mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val); +					CORE_LEVEL);  	if (this_cpu_has(X86_FEATURE_PTS)) {  		rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val); -		if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT, +		therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT,  					THERMAL_THROTTLING_EVENT, -					PACKAGE_LEVEL) != 0) -			mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val); +					PACKAGE_LEVEL);  		if (this_cpu_has(X86_FEATURE_PLN)) -			if (therm_throt_process(msr_val & +			therm_throt_process(msr_val &  					PACKAGE_THERM_STATUS_POWER_LIMIT,  					POWER_LIMIT_EVENT, -					PACKAGE_LEVEL) != 0) -				mce_log_therm_throt_event(PACKAGE_POWER_LIMIT -							  | msr_val); +					PACKAGE_LEVEL);  	}  } @@ -397,8 +381,8 @@ static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt;  asmlinkage void smp_thermal_interrupt(struct pt_regs *regs)  { -	exit_idle();  	irq_enter(); +	exit_idle();  	inc_irq_stat(irq_thermal_count);  	smp_thermal_vector();  	irq_exit(); diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c 
index d746df2909c..aa578cadb94 100644 --- a/arch/x86/kernel/cpu/mcheck/threshold.c +++ b/arch/x86/kernel/cpu/mcheck/threshold.c @@ -19,8 +19,8 @@ void (*mce_threshold_vector)(void) = default_threshold_interrupt;  asmlinkage void smp_threshold_interrupt(void)  { -	exit_idle();  	irq_enter(); +	exit_idle();  	inc_irq_stat(irq_threshold_count);  	mce_threshold_vector();  	irq_exit(); diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index a71efcdbb09..97b26356e9e 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -547,6 +547,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,  		if (tmp != mask_lo) {  			printk(KERN_WARNING "mtrr: your BIOS has configured an incorrect mask, fixing it.\n"); +			add_taint(TAINT_FIRMWARE_WORKAROUND);  			mask_lo = tmp;  		}  	} @@ -693,6 +694,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock)  	/* Disable MTRRs, and set the default type to uncached */  	mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi); +	wbinvd();  }  static void post_set(void) __releases(set_atomicity_lock) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 640891014b2..5adce1040b1 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -312,12 +312,8 @@ int x86_setup_perfctr(struct perf_event *event)  			return -EOPNOTSUPP;  	} -	/* -	 * Do not allow config1 (extended registers) to propagate, -	 * there's no sane user-space generalization yet: -	 */  	if (attr->type == PERF_TYPE_RAW) -		return 0; +		return x86_pmu_extra_regs(event->attr.config, event);  	if (attr->type == PERF_TYPE_HW_CACHE)  		return set_ext_hw_attr(hwc, event); @@ -488,18 +484,195 @@ static inline int is_x86_event(struct perf_event *event)  	return event->pmu == &pmu;  } +/* + * Event scheduler state: + * + * Assign events iterating over all events and counters, beginning + * with events with least weights 
first. Keep the current iterator + * state in struct sched_state. + */ +struct sched_state { +	int	weight; +	int	event;		/* event index */ +	int	counter;	/* counter index */ +	int	unassigned;	/* number of events to be assigned left */ +	unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; +}; + +/* Total max is X86_PMC_IDX_MAX, but we are O(n!) limited */ +#define	SCHED_STATES_MAX	2 + +struct perf_sched { +	int			max_weight; +	int			max_events; +	struct event_constraint	**constraints; +	struct sched_state	state; +	int			saved_states; +	struct sched_state	saved[SCHED_STATES_MAX]; +}; + +/* + * Initialize interator that runs through all events and counters. + */ +static void perf_sched_init(struct perf_sched *sched, struct event_constraint **c, +			    int num, int wmin, int wmax) +{ +	int idx; + +	memset(sched, 0, sizeof(*sched)); +	sched->max_events	= num; +	sched->max_weight	= wmax; +	sched->constraints	= c; + +	for (idx = 0; idx < num; idx++) { +		if (c[idx]->weight == wmin) +			break; +	} + +	sched->state.event	= idx;		/* start with min weight */ +	sched->state.weight	= wmin; +	sched->state.unassigned	= num; +} + +static void perf_sched_save_state(struct perf_sched *sched) +{ +	if (WARN_ON_ONCE(sched->saved_states >= SCHED_STATES_MAX)) +		return; + +	sched->saved[sched->saved_states] = sched->state; +	sched->saved_states++; +} + +static bool perf_sched_restore_state(struct perf_sched *sched) +{ +	if (!sched->saved_states) +		return false; + +	sched->saved_states--; +	sched->state = sched->saved[sched->saved_states]; + +	/* continue with next counter: */ +	clear_bit(sched->state.counter++, sched->state.used); + +	return true; +} + +/* + * Select a counter for the current event to schedule. Return true on + * success. 
+ */ +static bool __perf_sched_find_counter(struct perf_sched *sched) +{ +	struct event_constraint *c; +	int idx; + +	if (!sched->state.unassigned) +		return false; + +	if (sched->state.event >= sched->max_events) +		return false; + +	c = sched->constraints[sched->state.event]; + +	/* Prefer fixed purpose counters */ +	if (x86_pmu.num_counters_fixed) { +		idx = X86_PMC_IDX_FIXED; +		for_each_set_bit_cont(idx, c->idxmsk, X86_PMC_IDX_MAX) { +			if (!__test_and_set_bit(idx, sched->state.used)) +				goto done; +		} +	} +	/* Grab the first unused counter starting with idx */ +	idx = sched->state.counter; +	for_each_set_bit_cont(idx, c->idxmsk, X86_PMC_IDX_FIXED) { +		if (!__test_and_set_bit(idx, sched->state.used)) +			goto done; +	} + +	return false; + +done: +	sched->state.counter = idx; + +	if (c->overlap) +		perf_sched_save_state(sched); + +	return true; +} + +static bool perf_sched_find_counter(struct perf_sched *sched) +{ +	while (!__perf_sched_find_counter(sched)) { +		if (!perf_sched_restore_state(sched)) +			return false; +	} + +	return true; +} + +/* + * Go through all unassigned events and find the next one to schedule. + * Take events with the least weight first. Return true on success. + */ +static bool perf_sched_next_event(struct perf_sched *sched) +{ +	struct event_constraint *c; + +	if (!sched->state.unassigned || !--sched->state.unassigned) +		return false; + +	do { +		/* next event */ +		sched->state.event++; +		if (sched->state.event >= sched->max_events) { +			/* next weight */ +			sched->state.event = 0; +			sched->state.weight++; +			if (sched->state.weight > sched->max_weight) +				return false; +		} +		c = sched->constraints[sched->state.event]; +	} while (c->weight != sched->state.weight); + +	sched->state.counter = 0;	/* start with first counter */ + +	return true; +} + +/* + * Assign a counter for each event. 
+ */ +static int perf_assign_events(struct event_constraint **constraints, int n, +			      int wmin, int wmax, int *assign) +{ +	struct perf_sched sched; + +	perf_sched_init(&sched, constraints, n, wmin, wmax); + +	do { +		if (!perf_sched_find_counter(&sched)) +			break;	/* failed */ +		if (assign) +			assign[sched.state.event] = sched.state.counter; +	} while (perf_sched_next_event(&sched)); + +	return sched.state.unassigned; +} +  int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)  {  	struct event_constraint *c, *constraints[X86_PMC_IDX_MAX];  	unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; -	int i, j, w, wmax, num = 0; +	int i, wmin, wmax, num = 0;  	struct hw_perf_event *hwc;  	bitmap_zero(used_mask, X86_PMC_IDX_MAX); -	for (i = 0; i < n; i++) { +	for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) {  		c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]);  		constraints[i] = c; +		wmin = min(wmin, c->weight); +		wmax = max(wmax, c->weight);  	}  	/* @@ -525,59 +698,11 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)  		if (assign)  			assign[i] = hwc->idx;  	} -	if (i == n) -		goto done; - -	/* -	 * begin slow path -	 */ - -	bitmap_zero(used_mask, X86_PMC_IDX_MAX); - -	/* -	 * weight = number of possible counters -	 * -	 * 1    = most constrained, only works on one counter -	 * wmax = least constrained, works on any counter -	 * -	 * assign events to counters starting with most -	 * constrained events. 
-	 */ -	wmax = x86_pmu.num_counters; - -	/* -	 * when fixed event counters are present, -	 * wmax is incremented by 1 to account -	 * for one more choice -	 */ -	if (x86_pmu.num_counters_fixed) -		wmax++; - -	for (w = 1, num = n; num && w <= wmax; w++) { -		/* for each event */ -		for (i = 0; num && i < n; i++) { -			c = constraints[i]; -			hwc = &cpuc->event_list[i]->hw; - -			if (c->weight != w) -				continue; -			for_each_set_bit(j, c->idxmsk, X86_PMC_IDX_MAX) { -				if (!test_bit(j, used_mask)) -					break; -			} - -			if (j == X86_PMC_IDX_MAX) -				break; +	/* slow path */ +	if (i != n) +		num = perf_assign_events(constraints, n, wmin, wmax, assign); -			__set_bit(j, used_mask); - -			if (assign) -				assign[i] = j; -			num--; -		} -	} -done:  	/*  	 * scheduling failed or is just a simulation,  	 * free resources if necessary @@ -588,7 +713,7 @@ done:  				x86_pmu.put_event_constraints(cpuc, cpuc->event_list[i]);  		}  	} -	return num ? -ENOSPC : 0; +	return num ? -EINVAL : 0;  }  /* @@ -607,7 +732,7 @@ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader,  	if (is_x86_event(leader)) {  		if (n >= max_count) -			return -ENOSPC; +			return -EINVAL;  		cpuc->event_list[n] = leader;  		n++;  	} @@ -620,7 +745,7 @@ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader,  			continue;  		if (n >= max_count) -			return -ENOSPC; +			return -EINVAL;  		cpuc->event_list[n] = event;  		n++; @@ -1123,6 +1248,7 @@ static void __init pmu_check_apic(void)  static int __init init_hw_perf_events(void)  { +	struct x86_pmu_quirk *quirk;  	struct event_constraint *c;  	int err; @@ -1151,8 +1277,8 @@ static int __init init_hw_perf_events(void)  	pr_cont("%s PMU driver.\n", x86_pmu.name); -	if (x86_pmu.quirks) -		x86_pmu.quirks(); +	for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next) +		quirk->func();  	if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {  		WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", @@ 
-1175,12 +1301,18 @@ static int __init init_hw_perf_events(void)  	unconstrained = (struct event_constraint)  		__EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1, -				   0, x86_pmu.num_counters); +				   0, x86_pmu.num_counters, 0);  	if (x86_pmu.event_constraints) { +		/* +		 * event on fixed counter2 (REF_CYCLES) only works on this +		 * counter, so do not extend mask to generic counters +		 */  		for_each_event_constraint(c, x86_pmu.event_constraints) { -			if (c->cmask != X86_RAW_EVENT_MASK) +			if (c->cmask != X86_RAW_EVENT_MASK +			    || c->idxmsk64 == X86_PMC_MSK_FIXED_REF_CYCLES) {  				continue; +			}  			c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;  			c->weight += x86_pmu.num_counters; @@ -1316,7 +1448,7 @@ static int validate_event(struct perf_event *event)  	c = x86_pmu.get_event_constraints(fake_cpuc, event);  	if (!c || !c->weight) -		ret = -ENOSPC; +		ret = -EINVAL;  	if (x86_pmu.put_event_constraints)  		x86_pmu.put_event_constraints(fake_cpuc, event); @@ -1341,7 +1473,7 @@ static int validate_group(struct perf_event *event)  {  	struct perf_event *leader = event->group_leader;  	struct cpu_hw_events *fake_cpuc; -	int ret = -ENOSPC, n; +	int ret = -EINVAL, n;  	fake_cpuc = allocate_fake_cpuc();  	if (IS_ERR(fake_cpuc)) @@ -1570,3 +1702,15 @@ unsigned long perf_misc_flags(struct pt_regs *regs)  	return misc;  } + +void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) +{ +	cap->version		= x86_pmu.version; +	cap->num_counters_gp	= x86_pmu.num_counters; +	cap->num_counters_fixed	= x86_pmu.num_counters_fixed; +	cap->bit_width_gp	= x86_pmu.cntval_bits; +	cap->bit_width_fixed	= x86_pmu.cntval_bits; +	cap->events_mask	= (unsigned int)x86_pmu.events_maskl; +	cap->events_mask_len	= x86_pmu.events_mask_len; +} +EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability); diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index b9698d40ac4..c30c807ddc7 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ 
b/arch/x86/kernel/cpu/perf_event.h @@ -45,6 +45,7 @@ struct event_constraint {  	u64	code;  	u64	cmask;  	int	weight; +	int	overlap;  };  struct amd_nb { @@ -146,20 +147,47 @@ struct cpu_hw_events {  	/*  	 * AMD specific bits  	 */ -	struct amd_nb		*amd_nb; +	struct amd_nb			*amd_nb; +	/* Inverted mask of bits to clear in the perf_ctr ctrl registers */ +	u64				perf_ctr_virt_mask;  	void				*kfree_on_online;  }; -#define __EVENT_CONSTRAINT(c, n, m, w) {\ +#define __EVENT_CONSTRAINT(c, n, m, w, o) {\  	{ .idxmsk64 = (n) },		\  	.code = (c),			\  	.cmask = (m),			\  	.weight = (w),			\ +	.overlap = (o),			\  }  #define EVENT_CONSTRAINT(c, n, m)	\ -	__EVENT_CONSTRAINT(c, n, m, HWEIGHT(n)) +	__EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 0) + +/* + * The overlap flag marks event constraints with overlapping counter + * masks. This is the case if the counter mask of such an event is not + * a subset of any other counter mask of a constraint with an equal or + * higher weight, e.g.: + * + *  c_overlaps = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0); + *  c_another1 = EVENT_CONSTRAINT(0, 0x07, 0); + *  c_another2 = EVENT_CONSTRAINT(0, 0x38, 0); + * + * The event scheduler may not select the correct counter in the first + * cycle because it needs to know which subsequent events will be + * scheduled. It may fail to schedule the events then. So we set the + * overlap flag for such constraints to give the scheduler a hint which + * events to select for counter rescheduling. + * + * Care must be taken as the rescheduling algorithm is O(n!) which + * will increase scheduling cycles for an over-commited system + * dramatically.  The number of such EVENT_CONSTRAINT_OVERLAP() macros + * and its counter masks must be kept at a minimum. + */ +#define EVENT_CONSTRAINT_OVERLAP(c, n, m)	\ +	__EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 1)  /*   * Constraint on the Event code. 
@@ -235,6 +263,11 @@ union perf_capabilities {  	u64	capabilities;  }; +struct x86_pmu_quirk { +	struct x86_pmu_quirk *next; +	void (*func)(void); +}; +  /*   * struct x86_pmu - generic x86 pmu   */ @@ -259,6 +292,11 @@ struct x86_pmu {  	int		num_counters_fixed;  	int		cntval_bits;  	u64		cntval_mask; +	union { +			unsigned long events_maskl; +			unsigned long events_mask[BITS_TO_LONGS(ARCH_PERFMON_EVENTS_COUNT)]; +	}; +	int		events_mask_len;  	int		apic;  	u64		max_period;  	struct event_constraint * @@ -268,7 +306,7 @@ struct x86_pmu {  	void		(*put_event_constraints)(struct cpu_hw_events *cpuc,  						 struct perf_event *event);  	struct event_constraint *event_constraints; -	void		(*quirks)(void); +	struct x86_pmu_quirk *quirks;  	int		perfctr_second_write;  	int		(*cpu_prepare)(int cpu); @@ -309,6 +347,15 @@ struct x86_pmu {  	struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr);  }; +#define x86_add_quirk(func_)						\ +do {									\ +	static struct x86_pmu_quirk __quirk __initdata = {		\ +		.func = func_,						\ +	};								\ +	__quirk.next = x86_pmu.quirks;					\ +	x86_pmu.quirks = &__quirk;					\ +} while (0) +  #define ERF_NO_HT_SHARING	1  #define ERF_HAS_RSP_1		2 @@ -372,9 +419,11 @@ void x86_pmu_disable_all(void);  static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,  					  u64 enable_mask)  { +	u64 disable_mask = __this_cpu_read(cpu_hw_events.perf_ctr_virt_mask); +  	if (hwc->extra_reg.reg)  		wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config); -	wrmsrl(hwc->config_base, hwc->config | enable_mask); +	wrmsrl(hwc->config_base, (hwc->config | enable_mask) & ~disable_mask);  }  void x86_pmu_enable_all(int added); diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index aeefd45697a..67250a52430 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -1,4 +1,5 @@  #include <linux/perf_event.h> +#include <linux/export.h>  #include <linux/types.h>  #include 
<linux/init.h>  #include <linux/slab.h> @@ -357,7 +358,9 @@ static void amd_pmu_cpu_starting(int cpu)  	struct amd_nb *nb;  	int i, nb_id; -	if (boot_cpu_data.x86_max_cores < 2) +	cpuc->perf_ctr_virt_mask = AMD_PERFMON_EVENTSEL_HOSTONLY; + +	if (boot_cpu_data.x86_max_cores < 2 || boot_cpu_data.x86 == 0x15)  		return;  	nb_id = amd_get_nb_id(cpu); @@ -492,7 +495,7 @@ static __initconst const struct x86_pmu amd_pmu = {  static struct event_constraint amd_f15_PMC0  = EVENT_CONSTRAINT(0, 0x01, 0);  static struct event_constraint amd_f15_PMC20 = EVENT_CONSTRAINT(0, 0x07, 0);  static struct event_constraint amd_f15_PMC3  = EVENT_CONSTRAINT(0, 0x08, 0); -static struct event_constraint amd_f15_PMC30 = EVENT_CONSTRAINT(0, 0x09, 0); +static struct event_constraint amd_f15_PMC30 = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0);  static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0);  static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0); @@ -587,9 +590,9 @@ static __initconst const struct x86_pmu amd_pmu_f15h = {  	.put_event_constraints	= amd_put_event_constraints,  	.cpu_prepare		= amd_pmu_cpu_prepare, -	.cpu_starting		= amd_pmu_cpu_starting,  	.cpu_dead		= amd_pmu_cpu_dead,  #endif +	.cpu_starting		= amd_pmu_cpu_starting,  };  __init int amd_pmu_init(void) @@ -621,3 +624,33 @@ __init int amd_pmu_init(void)  	return 0;  } + +void amd_pmu_enable_virt(void) +{ +	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + +	cpuc->perf_ctr_virt_mask = 0; + +	/* Reload all events */ +	x86_pmu_disable_all(); +	x86_pmu_enable_all(0); +} +EXPORT_SYMBOL_GPL(amd_pmu_enable_virt); + +void amd_pmu_disable_virt(void) +{ +	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + +	/* +	 * We only mask out the Host-only bit so that host-only counting works +	 * when SVM is disabled. If someone sets up a guest-only counter when +	 * SVM is disabled the Guest-only bits still gets set and the counter +	 * will not count anything. 
+	 */ +	cpuc->perf_ctr_virt_mask = AMD_PERFMON_EVENTSEL_HOSTONLY; + +	/* Reload all events */ +	x86_pmu_disable_all(); +	x86_pmu_enable_all(0); +} +EXPORT_SYMBOL_GPL(amd_pmu_disable_virt); diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index ab6343d2182..3b8a2d30d14 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -199,8 +199,7 @@ static int force_ibs_eilvt_setup(void)  		goto out;  	} -	pr_err(FW_BUG "using offset %d for IBS interrupts\n", offset); -	pr_err(FW_BUG "workaround enabled for IBS LVT offset\n"); +	pr_info("IBS: LVT offset %d assigned\n", offset);  	return 0;  out: @@ -265,19 +264,23 @@ perf_ibs_cpu_notifier(struct notifier_block *self, unsigned long action, void *h  static __init int amd_ibs_init(void)  {  	u32 caps; -	int ret; +	int ret = -EINVAL;  	caps = __get_ibs_caps();  	if (!caps)  		return -ENODEV;	/* ibs not supported by the cpu */ -	if (!ibs_eilvt_valid()) { -		ret = force_ibs_eilvt_setup(); -		if (ret) { -			pr_err("Failed to setup IBS, %d\n", ret); -			return ret; -		} -	} +	/* +	 * Force LVT offset assignment for family 10h: The offsets are +	 * not assigned by the BIOS for this family, so the OS is +	 * responsible for doing it. If the OS assignment fails, fall +	 * back to BIOS settings and try to setup this. 
+	 */ +	if (boot_cpu_data.x86 == 0x10) +		force_ibs_eilvt_setup(); + +	if (!ibs_eilvt_valid()) +		goto out;  	get_online_cpus();  	ibs_caps = caps; @@ -287,7 +290,11 @@ static __init int amd_ibs_init(void)  	smp_call_function(setup_APIC_ibs, NULL, 1);  	put_online_cpus(); -	return perf_event_ibs_init(); +	ret = perf_event_ibs_init(); +out: +	if (ret) +		pr_err("Failed to setup IBS, %d\n", ret); +	return ret;  }  /* Since we need the pci subsystem to init ibs we can't do this earlier: */ diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 2be5ebe9987..3bd37bdf1b8 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -28,6 +28,7 @@ static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly =    [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x00c4,    [PERF_COUNT_HW_BRANCH_MISSES]		= 0x00c5,    [PERF_COUNT_HW_BUS_CYCLES]		= 0x013c, +  [PERF_COUNT_HW_REF_CPU_CYCLES]	= 0x0300, /* pseudo-encoding */  };  static struct event_constraint intel_core_event_constraints[] __read_mostly = @@ -45,12 +46,7 @@ static struct event_constraint intel_core2_event_constraints[] __read_mostly =  {  	FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */  	FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ -	/* -	 * Core2 has Fixed Counter 2 listed as CPU_CLK_UNHALTED.REF and event -	 * 0x013c as CPU_CLK_UNHALTED.BUS and specifies there is a fixed -	 * ratio between these counters. 
-	 */ -	/* FIXED_EVENT_CONSTRAINT(0x013c, 2),  CPU_CLK_UNHALTED.REF */ +	FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */  	INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */  	INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */  	INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ @@ -68,7 +64,7 @@ static struct event_constraint intel_nehalem_event_constraints[] __read_mostly =  {  	FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */  	FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ -	/* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ +	FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */  	INTEL_EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */  	INTEL_EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */  	INTEL_EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */ @@ -90,7 +86,7 @@ static struct event_constraint intel_westmere_event_constraints[] __read_mostly  {  	FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */  	FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ -	/* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ +	FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */  	INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */  	INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */  	INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */ @@ -102,7 +98,7 @@ static struct event_constraint intel_snb_event_constraints[] __read_mostly =  {  	FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */  	FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ -	/* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ +	FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */  	INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */  	INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */  	INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */ @@ -125,7 +121,7 @@ static struct event_constraint intel_gen_event_constraints[] 
__read_mostly =  {  	FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */  	FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ -	/* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ +	FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */  	EVENT_CONSTRAINT_END  }; @@ -1169,7 +1165,7 @@ again:  		 */  		c = &unconstrained;  	} else if (intel_try_alt_er(event, orig_idx)) { -		raw_spin_unlock(&era->lock); +		raw_spin_unlock_irqrestore(&era->lock, flags);  		goto again;  	}  	raw_spin_unlock_irqrestore(&era->lock, flags); @@ -1519,7 +1515,7 @@ static __initconst const struct x86_pmu intel_pmu = {  	.guest_get_msrs		= intel_guest_get_msrs,  }; -static void intel_clovertown_quirks(void) +static __init void intel_clovertown_quirk(void)  {  	/*  	 * PEBS is unreliable due to: @@ -1545,12 +1541,60 @@ static void intel_clovertown_quirks(void)  	x86_pmu.pebs_constraints = NULL;  } +static __init void intel_sandybridge_quirk(void) +{ +	printk(KERN_WARNING "PEBS disabled due to CPU errata.\n"); +	x86_pmu.pebs = 0; +	x86_pmu.pebs_constraints = NULL; +} + +static const struct { int id; char *name; } intel_arch_events_map[] __initconst = { +	{ PERF_COUNT_HW_CPU_CYCLES, "cpu cycles" }, +	{ PERF_COUNT_HW_INSTRUCTIONS, "instructions" }, +	{ PERF_COUNT_HW_BUS_CYCLES, "bus cycles" }, +	{ PERF_COUNT_HW_CACHE_REFERENCES, "cache references" }, +	{ PERF_COUNT_HW_CACHE_MISSES, "cache misses" }, +	{ PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "branch instructions" }, +	{ PERF_COUNT_HW_BRANCH_MISSES, "branch misses" }, +}; + +static __init void intel_arch_events_quirk(void) +{ +	int bit; + +	/* disable event that reported as not presend by cpuid */ +	for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(intel_arch_events_map)) { +		intel_perfmon_event_map[intel_arch_events_map[bit].id] = 0; +		printk(KERN_WARNING "CPUID marked event: \'%s\' unavailable\n", +				intel_arch_events_map[bit].name); +	} +} + +static __init void intel_nehalem_quirk(void) +{ +	union 
cpuid10_ebx ebx; + +	ebx.full = x86_pmu.events_maskl; +	if (ebx.split.no_branch_misses_retired) { +		/* +		 * Erratum AAJ80 detected, we work it around by using +		 * the BR_MISP_EXEC.ANY event. This will over-count +		 * branch-misses, but it's still much better than the +		 * architectural event which is often completely bogus: +		 */ +		intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89; +		ebx.split.no_branch_misses_retired = 0; +		x86_pmu.events_maskl = ebx.full; +		printk(KERN_INFO "CPU erratum AAJ80 worked around\n"); +	} +} +  __init int intel_pmu_init(void)  {  	union cpuid10_edx edx;  	union cpuid10_eax eax; +	union cpuid10_ebx ebx;  	unsigned int unused; -	unsigned int ebx;  	int version;  	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { @@ -1567,8 +1611,8 @@ __init int intel_pmu_init(void)  	 * Check whether the Architectural PerfMon supports  	 * Branch Misses Retired hw_event or not.  	 */ -	cpuid(10, &eax.full, &ebx, &unused, &edx.full); -	if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) +	cpuid(10, &eax.full, &ebx.full, &unused, &edx.full); +	if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT)  		return -ENODEV;  	version = eax.split.version_id; @@ -1582,6 +1626,9 @@ __init int intel_pmu_init(void)  	x86_pmu.cntval_bits		= eax.split.bit_width;  	x86_pmu.cntval_mask		= (1ULL << eax.split.bit_width) - 1; +	x86_pmu.events_maskl		= ebx.full; +	x86_pmu.events_mask_len		= eax.split.mask_length; +  	/*  	 * Quirk: v2 perfmon does not report fixed-purpose events, so  	 * assume at least 3 events: @@ -1601,6 +1648,8 @@ __init int intel_pmu_init(void)  	intel_ds_init(); +	x86_add_quirk(intel_arch_events_quirk); /* Install first, so it runs last */ +  	/*  	 * Install the hw-cache-events table:  	 */ @@ -1610,7 +1659,7 @@ __init int intel_pmu_init(void)  		break;  	case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ -		x86_pmu.quirks = intel_clovertown_quirks; +		
x86_add_quirk(intel_clovertown_quirk);  	case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */  	case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */  	case 29: /* six-core 45 nm xeon "Dunnington" */ @@ -1644,17 +1693,8 @@ __init int intel_pmu_init(void)  		/* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */  		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1; -		if (ebx & 0x40) { -			/* -			 * Erratum AAJ80 detected, we work it around by using -			 * the BR_MISP_EXEC.ANY event. This will over-count -			 * branch-misses, but it's still much better than the -			 * architectural event which is often completely bogus: -			 */ -			intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89; +		x86_add_quirk(intel_nehalem_quirk); -			pr_cont("erratum AAJ80 worked around, "); -		}  		pr_cont("Nehalem events, ");  		break; @@ -1694,6 +1734,7 @@ __init int intel_pmu_init(void)  		break;  	case 42: /* SandyBridge */ +		x86_add_quirk(intel_sandybridge_quirk);  	case 45: /* SandyBridge, "Romely-EP" */  		memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,  		       sizeof(hw_cache_event_ids)); @@ -1730,5 +1771,6 @@ __init int intel_pmu_init(void)  			break;  		}  	} +  	return 0;  } diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index c0d238f49db..d6bd49faa40 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -439,7 +439,6 @@ void intel_pmu_pebs_enable(struct perf_event *event)  	hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;  	cpuc->pebs_enabled |= 1ULL << hwc->idx; -	WARN_ON_ONCE(cpuc->enabled);  	if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1)  		intel_pmu_lbr_enable(event); @@ -493,6 +492,7 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)  	unsigned long from = cpuc->lbr_entries[0].from;  	unsigned long old_to, to = cpuc->lbr_entries[0].to;  	unsigned long ip = regs->ip; +	int is_64bit = 0;  	
/*  	 * We don't need to fixup if the PEBS assist is fault like @@ -544,7 +544,10 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)  		} else  			kaddr = (void *)to; -		kernel_insn_init(&insn, kaddr); +#ifdef CONFIG_X86_64 +		is_64bit = kernel_ip(to) || !test_thread_flag(TIF_IA32); +#endif +		insn_init(&insn, kaddr, is_64bit);  		insn_get_length(&insn);  		to += insn.length;  	} while (to < ip); diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index 3fab3de3ce9..47a7e63bfe5 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -72,8 +72,6 @@ void intel_pmu_lbr_enable(struct perf_event *event)  	if (!x86_pmu.lbr_nr)  		return; -	WARN_ON_ONCE(cpuc->enabled); -  	/*  	 * Reset the LBR stack if we changed task context to  	 * avoid data leaks. diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index 492bf1358a7..ef484d9d0a2 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -1268,7 +1268,7 @@ reserve:  	}  done: -	return num ? -ENOSPC : 0; +	return num ? 
-EINVAL : 0;  }  static __initconst const struct x86_pmu p4_pmu = { diff --git a/arch/x86/kernel/cpu/powerflags.c b/arch/x86/kernel/cpu/powerflags.c index 5abbea297e0..7b3fe56b1c2 100644 --- a/arch/x86/kernel/cpu/powerflags.c +++ b/arch/x86/kernel/cpu/powerflags.c @@ -16,5 +16,6 @@ const char *const x86_power_flags[32] = {  	"100mhzsteps",  	"hwpstate",  	"",	/* tsc invariant mapped to constant_tsc */ -		/* nothing */ +	"cpb",  /* core performance boost */ +	"eff_freq_ro", /* Readonly aperf/mperf */  }; diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 14b23140e81..8022c668148 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -64,12 +64,10 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)  static int show_cpuinfo(struct seq_file *m, void *v)  {  	struct cpuinfo_x86 *c = v; -	unsigned int cpu = 0; +	unsigned int cpu;  	int i; -#ifdef CONFIG_SMP  	cpu = c->cpu_index; -#endif  	seq_printf(m, "processor\t: %u\n"  		   "vendor_id\t: %s\n"  		   "cpu family\t: %d\n" diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 212a6a42527..a524353d93f 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -177,7 +177,7 @@ static struct notifier_block __refdata cpuid_class_cpu_notifier =  	.notifier_call = cpuid_class_cpu_callback,  }; -static char *cpuid_devnode(struct device *dev, mode_t *mode) +static char *cpuid_devnode(struct device *dev, umode_t *mode)  {  	return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt));  } diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 1aae78f775f..4025fe4f928 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -252,7 +252,8 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err)  	unsigned short ss;  	unsigned long sp;  #endif -	printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); +	printk(KERN_DEFAULT +	       "%s: %04lx [#%d] ", str, err & 
0xffff, ++die_counter);  #ifdef CONFIG_PREEMPT  	printk("PREEMPT ");  #endif diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 3b97a80ce32..c99f9ed013d 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -116,16 +116,16 @@ void show_registers(struct pt_regs *regs)  		for (i = 0; i < code_len; i++, ip++) {  			if (ip < (u8 *)PAGE_OFFSET ||  					probe_kernel_address(ip, c)) { -				printk(" Bad EIP value."); +				printk(KERN_CONT " Bad EIP value.");  				break;  			}  			if (ip == (u8 *)regs->ip) -				printk("<%02x> ", c); +				printk(KERN_CONT "<%02x> ", c);  			else -				printk("%02x ", c); +				printk(KERN_CONT "%02x ", c);  		}  	} -	printk("\n"); +	printk(KERN_CONT "\n");  }  int is_valid_bugaddr(unsigned long ip) diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 19853ad8afc..17107bd6e1f 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -129,7 +129,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,  	if (!stack) {  		if (regs)  			stack = (unsigned long *)regs->sp; -		else if (task && task != current) +		else if (task != current)  			stack = (unsigned long *)task->thread.sp;  		else  			stack = &dummy; @@ -269,11 +269,11 @@ void show_registers(struct pt_regs *regs)  		unsigned char c;  		u8 *ip; -		printk(KERN_EMERG "Stack:\n"); +		printk(KERN_DEFAULT "Stack:\n");  		show_stack_log_lvl(NULL, regs, (unsigned long *)sp, -				   0, KERN_EMERG); +				   0, KERN_DEFAULT); -		printk(KERN_EMERG "Code: "); +		printk(KERN_DEFAULT "Code: ");  		ip = (u8 *)regs->ip - code_prologue;  		if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { @@ -284,16 +284,16 @@ void show_registers(struct pt_regs *regs)  		for (i = 0; i < code_len; i++, ip++) {  			if (ip < (u8 *)PAGE_OFFSET ||  					probe_kernel_address(ip, c)) { -				printk(" Bad RIP value."); +				printk(KERN_CONT " Bad RIP value.");  				break;  			}  			if (ip == (u8 
*)regs->ip) -				printk("<%02x> ", c); +				printk(KERN_CONT "<%02x> ", c);  			else -				printk("%02x ", c); +				printk(KERN_CONT "%02x ", c);  		}  	} -	printk("\n"); +	printk(KERN_CONT "\n");  }  int is_valid_bugaddr(unsigned long ip) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 303a0e48f07..62d61e9976e 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -19,6 +19,7 @@  #include <linux/acpi.h>  #include <linux/firmware-map.h>  #include <linux/memblock.h> +#include <linux/sort.h>  #include <asm/e820.h>  #include <asm/proto.h> @@ -227,22 +228,38 @@ void __init e820_print_map(char *who)   *	   ____________________33__   *	   ______________________4_   */ +struct change_member { +	struct e820entry *pbios; /* pointer to original bios entry */ +	unsigned long long addr; /* address for this change point */ +}; + +static int __init cpcompare(const void *a, const void *b) +{ +	struct change_member * const *app = a, * const *bpp = b; +	const struct change_member *ap = *app, *bp = *bpp; + +	/* +	 * Inputs are pointers to two elements of change_point[].  If their +	 * addresses are unequal, their difference dominates.  If the addresses +	 * are equal, then consider one that represents the end of its region +	 * to be greater than one that does not. +	 */ +	if (ap->addr != bp->addr) +		return ap->addr > bp->addr ? 
1 : -1; + +	return (ap->addr != ap->pbios->addr) - (bp->addr != bp->pbios->addr); +}  int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,  			     u32 *pnr_map)  { -	struct change_member { -		struct e820entry *pbios; /* pointer to original bios entry */ -		unsigned long long addr; /* address for this change point */ -	};  	static struct change_member change_point_list[2*E820_X_MAX] __initdata;  	static struct change_member *change_point[2*E820_X_MAX] __initdata;  	static struct e820entry *overlap_list[E820_X_MAX] __initdata;  	static struct e820entry new_bios[E820_X_MAX] __initdata; -	struct change_member *change_tmp;  	unsigned long current_type, last_type;  	unsigned long long last_addr; -	int chgidx, still_changing; +	int chgidx;  	int overlap_entries;  	int new_bios_entry;  	int old_nr, new_nr, chg_nr; @@ -279,35 +296,7 @@ int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,  	chg_nr = chgidx;  	/* sort change-point list by memory addresses (low -> high) */ -	still_changing = 1; -	while (still_changing)	{ -		still_changing = 0; -		for (i = 1; i < chg_nr; i++)  { -			unsigned long long curaddr, lastaddr; -			unsigned long long curpbaddr, lastpbaddr; - -			curaddr = change_point[i]->addr; -			lastaddr = change_point[i - 1]->addr; -			curpbaddr = change_point[i]->pbios->addr; -			lastpbaddr = change_point[i - 1]->pbios->addr; - -			/* -			 * swap entries, when: -			 * -			 * curaddr > lastaddr or -			 * curaddr == lastaddr and curaddr == curpbaddr and -			 * lastaddr != lastpbaddr -			 */ -			if (curaddr < lastaddr || -			    (curaddr == lastaddr && curaddr == curpbaddr && -			     lastaddr != lastpbaddr)) { -				change_tmp = change_point[i]; -				change_point[i] = change_point[i-1]; -				change_point[i-1] = change_tmp; -				still_changing = 1; -			} -		} -	} +	sort(change_point, chg_nr, sizeof *change_point, cpcompare, NULL);  	/* create a new bios memory map, removing overlaps */  	overlap_entries = 0;	 /* number of entries 
in the overlap table */ @@ -714,7 +703,7 @@ void __init e820_mark_nosave_regions(unsigned long limit_pfn)  }  #endif -#ifdef CONFIG_HIBERNATION +#ifdef CONFIG_ACPI  /**   * Mark ACPI NVS memory region, so that we can save/restore it during   * hibernation and the subsequent resume. @@ -727,7 +716,7 @@ static int __init e820_mark_nvs_memory(void)  		struct e820entry *ei = &e820.map[i];  		if (ei->type == E820_NVS) -			suspend_nvs_register(ei->addr, ei->size); +			acpi_nvs_register(ei->addr, ei->size);  	}  	return 0; @@ -738,35 +727,17 @@ core_initcall(e820_mark_nvs_memory);  /*   * pre allocated 4k and reserved it in memblock and e820_saved   */ -u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align) +u64 __init early_reserve_e820(u64 size, u64 align)  { -	u64 size = 0;  	u64 addr; -	u64 start; -	for (start = startt; ; start += size) { -		start = memblock_x86_find_in_range_size(start, &size, align); -		if (start == MEMBLOCK_ERROR) -			return 0; -		if (size >= sizet) -			break; +	addr = __memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); +	if (addr) { +		e820_update_range_saved(addr, size, E820_RAM, E820_RESERVED); +		printk(KERN_INFO "update e820_saved for early_reserve_e820\n"); +		update_e820_saved();  	} -#ifdef CONFIG_X86_32 -	if (start >= MAXMEM) -		return 0; -	if (start + size > MAXMEM) -		size = MAXMEM - start; -#endif - -	addr = round_down(start + size - sizet, align); -	if (addr < start) -		return 0; -	memblock_x86_reserve_range(addr, addr + sizet, "new next"); -	e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED); -	printk(KERN_INFO "update e820_saved for early_reserve_e820\n"); -	update_e820_saved(); -  	return addr;  } @@ -1090,7 +1061,7 @@ void __init memblock_x86_fill(void)  	 * We are safe to enable resizing, beause memblock_x86_fill()  	 * is rather later for x86  	 */ -	memblock_can_resize = 1; +	memblock_allow_resize();  	for (i = 0; i < e820.nr_map; i++) {  		struct e820entry *ei = &e820.map[i]; @@ -1105,22 +1076,36 
@@ void __init memblock_x86_fill(void)  		memblock_add(ei->addr, ei->size);  	} -	memblock_analyze();  	memblock_dump_all();  }  void __init memblock_find_dma_reserve(void)  {  #ifdef CONFIG_X86_64 -	u64 free_size_pfn; -	u64 mem_size_pfn; +	u64 nr_pages = 0, nr_free_pages = 0; +	unsigned long start_pfn, end_pfn; +	phys_addr_t start, end; +	int i; +	u64 u; +  	/*  	 * need to find out used area below MAX_DMA_PFN  	 * need to use memblock to get free size in [0, MAX_DMA_PFN]  	 * at first, and assume boot_mem will not take below MAX_DMA_PFN  	 */ -	mem_size_pfn = memblock_x86_memory_in_range(0, MAX_DMA_PFN << PAGE_SHIFT) >> PAGE_SHIFT; -	free_size_pfn = memblock_x86_free_memory_in_range(0, MAX_DMA_PFN << PAGE_SHIFT) >> PAGE_SHIFT; -	set_dma_reserve(mem_size_pfn - free_size_pfn); +	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { +		start_pfn = min_t(unsigned long, start_pfn, MAX_DMA_PFN); +		end_pfn = min_t(unsigned long, end_pfn, MAX_DMA_PFN); +		nr_pages += end_pfn - start_pfn; +	} + +	for_each_free_mem_range(u, MAX_NUMNODES, &start, &end, NULL) { +		start_pfn = min_t(unsigned long, PFN_UP(start), MAX_DMA_PFN); +		end_pfn = min_t(unsigned long, PFN_DOWN(end), MAX_DMA_PFN); +		if (start_pfn < end_pfn) +			nr_free_pages += end_pfn - start_pfn; +	} + +	set_dma_reserve(nr_pages - nr_free_pages);  #endif  } diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index cd28a350f7f..9b9f18b4991 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -240,14 +240,14 @@ static int __init setup_early_printk(char *buf)  		if (!strncmp(buf, "xen", 3))  			early_console_register(&xenboot_console, keep);  #endif -#ifdef CONFIG_EARLY_PRINTK_MRST +#ifdef CONFIG_EARLY_PRINTK_INTEL_MID  		if (!strncmp(buf, "mrst", 4)) {  			mrst_early_console_init();  			early_console_register(&early_mrst_console, keep);  		}  		if (!strncmp(buf, "hsu", 3)) { -			hsu_early_console_init(); +			hsu_early_console_init(buf + 3);  			
early_console_register(&early_hsu_console, keep);  		}  #endif diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index f3f6f534400..79d97e68f04 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -42,6 +42,7 @@   */  #include <linux/linkage.h> +#include <linux/err.h>  #include <asm/thread_info.h>  #include <asm/irqflags.h>  #include <asm/errno.h> @@ -81,8 +82,6 @@   * enough to patch inline, increasing performance.   */ -#define nr_syscalls ((syscall_table_size)/4) -  #ifdef CONFIG_PREEMPT  #define preempt_stop(clobbers)	DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF  #else @@ -423,7 +422,7 @@ sysenter_past_esp:  	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)  	jnz sysenter_audit  sysenter_do_call: -	cmpl $(nr_syscalls), %eax +	cmpl $(NR_syscalls), %eax  	jae syscall_badsys  	call *sys_call_table(,%eax,4)  	movl %eax,PT_EAX(%esp) @@ -455,7 +454,7 @@ sysenter_audit:  	movl %ebx,%ecx			/* 3rd arg: 1st syscall arg */  	movl %eax,%edx			/* 2nd arg: syscall number */  	movl $AUDIT_ARCH_I386,%eax	/* 1st arg: audit arch */ -	call audit_syscall_entry +	call __audit_syscall_entry  	pushl_cfi %ebx  	movl PT_EAX(%esp),%eax		/* reload syscall number */  	jmp sysenter_do_call @@ -466,11 +465,10 @@ sysexit_audit:  	TRACE_IRQS_ON  	ENABLE_INTERRUPTS(CLBR_ANY)  	movl %eax,%edx		/* second arg, syscall return value */ -	cmpl $0,%eax		/* is it < 0? */ -	setl %al		/* 1 if so, 0 if not */ +	cmpl $-MAX_ERRNO,%eax	/* is it an error ? 
*/ +	setbe %al		/* 1 if so, 0 if not */  	movzbl %al,%eax		/* zero-extend that */ -	inc %eax /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ -	call audit_syscall_exit +	call __audit_syscall_exit  	DISABLE_INTERRUPTS(CLBR_ANY)  	TRACE_IRQS_OFF  	movl TI_flags(%ebp), %ecx @@ -504,7 +502,7 @@ ENTRY(system_call)  					# system call tracing in operation / emulation  	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)  	jnz syscall_trace_entry -	cmpl $(nr_syscalls), %eax +	cmpl $(NR_syscalls), %eax  	jae syscall_badsys  syscall_call:  	call *sys_call_table(,%eax,4) @@ -625,6 +623,8 @@ work_notifysig:				# deal with pending signals and  	movl %esp, %eax  	jne work_notifysig_v86		# returning to kernel-space or  					# vm86-space +	TRACE_IRQS_ON +	ENABLE_INTERRUPTS(CLBR_NONE)  	xorl %edx, %edx  	call do_notify_resume  	jmp resume_userspace_sig @@ -638,6 +638,8 @@ work_notifysig_v86:  #else  	movl %esp, %eax  #endif +	TRACE_IRQS_ON +	ENABLE_INTERRUPTS(CLBR_NONE)  	xorl %edx, %edx  	call do_notify_resume  	jmp resume_userspace_sig @@ -650,7 +652,7 @@ syscall_trace_entry:  	movl %esp, %eax  	call syscall_trace_enter  	/* What it returned is what we'll actually use.  */ -	cmpl $(nr_syscalls), %eax +	cmpl $(NR_syscalls), %eax  	jnae syscall_call  	jmp syscall_exit  END(syscall_trace_entry) @@ -690,29 +692,28 @@ END(syscall_badsys)   * System calls that need a pt_regs pointer.   
*/  #define PTREGSCALL0(name) \ -	ALIGN; \ -ptregs_##name: \ +ENTRY(ptregs_##name) ;  \  	leal 4(%esp),%eax; \ -	jmp sys_##name; +	jmp sys_##name; \ +ENDPROC(ptregs_##name)  #define PTREGSCALL1(name) \ -	ALIGN; \ -ptregs_##name: \ +ENTRY(ptregs_##name) ; \  	leal 4(%esp),%edx; \  	movl (PT_EBX+4)(%esp),%eax; \ -	jmp sys_##name; +	jmp sys_##name; \ +ENDPROC(ptregs_##name)  #define PTREGSCALL2(name) \ -	ALIGN; \ -ptregs_##name: \ +ENTRY(ptregs_##name) ; \  	leal 4(%esp),%ecx; \  	movl (PT_ECX+4)(%esp),%edx; \  	movl (PT_EBX+4)(%esp),%eax; \ -	jmp sys_##name; +	jmp sys_##name; \ +ENDPROC(ptregs_##name)  #define PTREGSCALL3(name) \ -	ALIGN; \ -ptregs_##name: \ +ENTRY(ptregs_##name) ; \  	CFI_STARTPROC; \  	leal 4(%esp),%eax; \  	pushl_cfi %eax; \ @@ -737,8 +738,7 @@ PTREGSCALL2(vm86)  PTREGSCALL1(vm86old)  /* Clone is an oddball.  The 4th arg is in %edi */ -	ALIGN; -ptregs_clone: +ENTRY(ptregs_clone)  	CFI_STARTPROC  	leal 4(%esp),%eax  	pushl_cfi %eax @@ -1209,11 +1209,6 @@ return_to_handler:  	jmp *%ecx  #endif -.section .rodata,"a" -#include "syscall_table_32.S" - -syscall_table_size=(.-sys_call_table) -  /*   * Some functions should be protected against kprobes   */ diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index faf8d5e74b0..1333d985177 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -55,6 +55,7 @@  #include <asm/paravirt.h>  #include <asm/ftrace.h>  #include <asm/percpu.h> +#include <linux/err.h>  /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  
*/  #include <linux/elf-em.h> @@ -221,7 +222,7 @@ ENDPROC(native_usergs_sysret64)  	/*CFI_REL_OFFSET	ss,0*/  	pushq_cfi %rax /* rsp */  	CFI_REL_OFFSET	rsp,0 -	pushq_cfi $X86_EFLAGS_IF /* eflags - interrupts on */ +	pushq_cfi $(X86_EFLAGS_IF|X86_EFLAGS_BIT1) /* eflags - interrupts on */  	/*CFI_REL_OFFSET	rflags,0*/  	pushq_cfi $__KERNEL_CS /* cs */  	/*CFI_REL_OFFSET	cs,0*/ @@ -411,7 +412,7 @@ ENTRY(ret_from_fork)  	RESTORE_REST  	testl $3, CS-ARGOFFSET(%rsp)		# from kernel_thread? -	je   int_ret_from_sys_call +	jz   retint_restore_args  	testl $_TIF_IA32, TI_flags(%rcx)	# 32-bit compat task needs IRET  	jnz  int_ret_from_sys_call @@ -465,7 +466,7 @@ ENTRY(system_call)  	 * after the swapgs, so that it can do the swapgs  	 * for the guest and jump here on syscall.  	 */ -ENTRY(system_call_after_swapgs) +GLOBAL(system_call_after_swapgs)  	movq	%rsp,PER_CPU_VAR(old_rsp)  	movq	PER_CPU_VAR(kernel_stack),%rsp @@ -478,8 +479,7 @@ ENTRY(system_call_after_swapgs)  	movq  %rax,ORIG_RAX-ARGOFFSET(%rsp)  	movq  %rcx,RIP-ARGOFFSET(%rsp)  	CFI_REL_OFFSET rip,RIP-ARGOFFSET -	GET_THREAD_INFO(%rcx) -	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx) +	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)  	jnz tracesys  system_call_fastpath:  	cmpq $__NR_syscall_max,%rax @@ -496,10 +496,9 @@ ret_from_sys_call:  	/* edi:	flagmask */  sysret_check:  	LOCKDEP_SYS_EXIT -	GET_THREAD_INFO(%rcx)  	DISABLE_INTERRUPTS(CLBR_NONE)  	TRACE_IRQS_OFF -	movl TI_flags(%rcx),%edx +	movl TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET),%edx  	andl %edi,%edx  	jnz  sysret_careful  	CFI_REMEMBER_STATE @@ -550,7 +549,7 @@ badsys:  #ifdef CONFIG_AUDITSYSCALL  	/*  	 * Fast path for syscall audit without full syscall trace. -	 * We just call audit_syscall_entry() directly, and then +	 * We just call __audit_syscall_entry() directly, and then  	 * jump back to the normal fast path.  	 
*/  auditsys: @@ -560,22 +559,21 @@ auditsys:  	movq %rdi,%rdx			/* 3rd arg: 1st syscall arg */  	movq %rax,%rsi			/* 2nd arg: syscall number */  	movl $AUDIT_ARCH_X86_64,%edi	/* 1st arg: audit arch */ -	call audit_syscall_entry +	call __audit_syscall_entry  	LOAD_ARGS 0		/* reload call-clobbered registers */  	jmp system_call_fastpath  	/* -	 * Return fast path for syscall audit.  Call audit_syscall_exit() +	 * Return fast path for syscall audit.  Call __audit_syscall_exit()  	 * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT  	 * masked off.  	 */  sysret_audit:  	movq RAX-ARGOFFSET(%rsp),%rsi	/* second arg, syscall return value */ -	cmpq $0,%rsi		/* is it < 0? */ -	setl %al		/* 1 if so, 0 if not */ +	cmpq $-MAX_ERRNO,%rsi	/* is it < -MAX_ERRNO? */ +	setbe %al		/* 1 if so, 0 if not */  	movzbl %al,%edi		/* zero-extend that into %edi */ -	inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ -	call audit_syscall_exit +	call __audit_syscall_exit  	movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi  	jmp sysret_check  #endif	/* CONFIG_AUDITSYSCALL */ @@ -583,7 +581,7 @@ sysret_audit:  	/* Do syscall tracing */  tracesys:  #ifdef CONFIG_AUDITSYSCALL -	testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx) +	testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)  	jz auditsys  #endif  	SAVE_REST @@ -612,8 +610,6 @@ tracesys:  GLOBAL(int_ret_from_sys_call)  	DISABLE_INTERRUPTS(CLBR_NONE)  	TRACE_IRQS_OFF -	testl $3,CS-ARGOFFSET(%rsp) -	je retint_restore_args  	movl $_TIF_ALLWORK_MASK,%edi  	/* edi:	mask to check */  GLOBAL(int_with_check) @@ -953,6 +949,7 @@ END(common_interrupt)  ENTRY(\sym)  	INTR_FRAME  	pushq_cfi $~(\num) +.Lcommon_\sym:  	interrupt \do_sym  	jmp ret_from_intr  	CFI_ENDPROC @@ -976,13 +973,21 @@ apicinterrupt X86_PLATFORM_IPI_VECTOR \  	x86_platform_ipi smp_x86_platform_ipi  #ifdef CONFIG_SMP -.irp idx,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \ +	ALIGN +	
INTR_FRAME +.irp idx,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \  	16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31  .if NUM_INVALIDATE_TLB_VECTORS > \idx -apicinterrupt (INVALIDATE_TLB_VECTOR_START)+\idx \ -	invalidate_interrupt\idx smp_invalidate_interrupt +ENTRY(invalidate_interrupt\idx) +	pushq_cfi $~(INVALIDATE_TLB_VECTOR_START+\idx) +	jmp .Lcommon_invalidate_interrupt0 +	CFI_ADJUST_CFA_OFFSET -8 +END(invalidate_interrupt\idx)  .endif  .endr +	CFI_ENDPROC +apicinterrupt INVALIDATE_TLB_VECTOR_START, \ +	invalidate_interrupt0, smp_invalidate_interrupt  #endif  apicinterrupt THRESHOLD_APIC_VECTOR \ @@ -1475,62 +1480,221 @@ ENTRY(error_exit)  	CFI_ENDPROC  END(error_exit) +/* + * Test if a given stack is an NMI stack or not. + */ +	.macro test_in_nmi reg stack nmi_ret normal_ret +	cmpq %\reg, \stack +	ja \normal_ret +	subq $EXCEPTION_STKSZ, %\reg +	cmpq %\reg, \stack +	jb \normal_ret +	jmp \nmi_ret +	.endm  	/* runs on exception stack */  ENTRY(nmi)  	INTR_FRAME  	PARAVIRT_ADJUST_EXCEPTION_FRAME -	pushq_cfi $-1 +	/* +	 * We allow breakpoints in NMIs. If a breakpoint occurs, then +	 * the iretq it performs will take us out of NMI context. +	 * This means that we can have nested NMIs where the next +	 * NMI is using the top of the stack of the previous NMI. We +	 * can't let it execute because the nested NMI will corrupt the +	 * stack of the previous NMI. NMI handlers are not re-entrant +	 * anyway. +	 * +	 * To handle this case we do the following: +	 *  Check the a special location on the stack that contains +	 *  a variable that is set when NMIs are executing. +	 *  The interrupted task's stack is also checked to see if it +	 *  is an NMI stack. 
+	 *  If the variable is not set and the stack is not the NMI +	 *  stack then: +	 *    o Set the special variable on the stack +	 *    o Copy the interrupt frame into a "saved" location on the stack +	 *    o Copy the interrupt frame into a "copy" location on the stack +	 *    o Continue processing the NMI +	 *  If the variable is set or the previous stack is the NMI stack: +	 *    o Modify the "copy" location to jump to the repeate_nmi +	 *    o return back to the first NMI +	 * +	 * Now on exit of the first NMI, we first clear the stack variable +	 * The NMI stack will tell any nested NMIs at that point that it is +	 * nested. Then we pop the stack normally with iret, and if there was +	 * a nested NMI that updated the copy interrupt stack frame, a +	 * jump will be made to the repeat_nmi code that will handle the second +	 * NMI. +	 */ + +	/* Use %rdx as out temp variable throughout */ +	pushq_cfi %rdx + +	/* +	 * If %cs was not the kernel segment, then the NMI triggered in user +	 * space, which means it is definitely not nested. +	 */ +	cmpl $__KERNEL_CS, 16(%rsp) +	jne first_nmi + +	/* +	 * Check the special variable on the stack to see if NMIs are +	 * executing. +	 */ +	cmpl $1, -8(%rsp) +	je nested_nmi + +	/* +	 * Now test if the previous stack was an NMI stack. +	 * We need the double check. We check the NMI stack to satisfy the +	 * race when the first NMI clears the variable before returning. +	 * We check the variable because the first NMI could be in a +	 * breakpoint routine using a breakpoint stack. +	 */ +	lea 6*8(%rsp), %rdx +	test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi + +nested_nmi: +	/* +	 * Do nothing if we interrupted the fixup in repeat_nmi. +	 * It's about to repeat the NMI handler, so we are fine +	 * with ignoring this one. 
+	 */ +	movq $repeat_nmi, %rdx +	cmpq 8(%rsp), %rdx +	ja 1f +	movq $end_repeat_nmi, %rdx +	cmpq 8(%rsp), %rdx +	ja nested_nmi_out + +1: +	/* Set up the interrupted NMIs stack to jump to repeat_nmi */ +	leaq -6*8(%rsp), %rdx +	movq %rdx, %rsp +	CFI_ADJUST_CFA_OFFSET 6*8 +	pushq_cfi $__KERNEL_DS +	pushq_cfi %rdx +	pushfq_cfi +	pushq_cfi $__KERNEL_CS +	pushq_cfi $repeat_nmi + +	/* Put stack back */ +	addq $(11*8), %rsp +	CFI_ADJUST_CFA_OFFSET -11*8 + +nested_nmi_out: +	popq_cfi %rdx + +	/* No need to check faults here */ +	INTERRUPT_RETURN + +first_nmi: +	/* +	 * Because nested NMIs will use the pushed location that we +	 * stored in rdx, we must keep that space available. +	 * Here's what our stack frame will look like: +	 * +-------------------------+ +	 * | original SS             | +	 * | original Return RSP     | +	 * | original RFLAGS         | +	 * | original CS             | +	 * | original RIP            | +	 * +-------------------------+ +	 * | temp storage for rdx    | +	 * +-------------------------+ +	 * | NMI executing variable  | +	 * +-------------------------+ +	 * | Saved SS                | +	 * | Saved Return RSP        | +	 * | Saved RFLAGS            | +	 * | Saved CS                | +	 * | Saved RIP               | +	 * +-------------------------+ +	 * | copied SS               | +	 * | copied Return RSP       | +	 * | copied RFLAGS           | +	 * | copied CS               | +	 * | copied RIP              | +	 * +-------------------------+ +	 * | pt_regs                 | +	 * +-------------------------+ +	 * +	 * The saved RIP is used to fix up the copied RIP that a nested +	 * NMI may zero out. The original stack frame and the temp storage +	 * is also used by nested NMIs and can not be trusted on exit. +	 */ +	/* Set the NMI executing variable on the stack. 
*/ +	pushq_cfi $1 + +	/* Copy the stack frame to the Saved frame */ +	.rept 5 +	pushq_cfi 6*8(%rsp) +	.endr + +	/* Make another copy, this one may be modified by nested NMIs */ +	.rept 5 +	pushq_cfi 4*8(%rsp) +	.endr + +	/* Do not pop rdx, nested NMIs will corrupt it */ +	movq 11*8(%rsp), %rdx + +	/* +	 * Everything below this point can be preempted by a nested +	 * NMI if the first NMI took an exception. Repeated NMIs +	 * caused by an exception and nested NMI will start here, and +	 * can still be preempted by another NMI. +	 */ +restart_nmi: +	pushq_cfi $-1		/* ORIG_RAX: no syscall to restart */  	subq $ORIG_RAX-R15, %rsp  	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 +	/* +	 * Use save_paranoid to handle SWAPGS, but no need to use paranoid_exit +	 * as we should not be calling schedule in NMI context. +	 * Even with normal interrupts enabled. An NMI should not be +	 * setting NEED_RESCHED or anything that normal interrupts and +	 * exceptions might do. +	 */  	call save_paranoid  	DEFAULT_FRAME 0  	/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */  	movq %rsp,%rdi  	movq $-1,%rsi  	call do_nmi -#ifdef CONFIG_TRACE_IRQFLAGS -	/* paranoidexit; without TRACE_IRQS_OFF */ -	/* ebx:	no swapgs flag */ -	DISABLE_INTERRUPTS(CLBR_NONE)  	testl %ebx,%ebx				/* swapgs needed? 
*/  	jnz nmi_restore -	testl $3,CS(%rsp) -	jnz nmi_userspace  nmi_swapgs:  	SWAPGS_UNSAFE_STACK  nmi_restore:  	RESTORE_ALL 8 +	/* Clear the NMI executing stack variable */ +	movq $0, 10*8(%rsp)  	jmp irq_return -nmi_userspace: -	GET_THREAD_INFO(%rcx) -	movl TI_flags(%rcx),%ebx -	andl $_TIF_WORK_MASK,%ebx -	jz nmi_swapgs -	movq %rsp,%rdi			/* &pt_regs */ -	call sync_regs -	movq %rax,%rsp			/* switch stack for scheduling */ -	testl $_TIF_NEED_RESCHED,%ebx -	jnz nmi_schedule -	movl %ebx,%edx			/* arg3: thread flags */ -	ENABLE_INTERRUPTS(CLBR_NONE) -	xorl %esi,%esi 			/* arg2: oldset */ -	movq %rsp,%rdi 			/* arg1: &pt_regs */ -	call do_notify_resume -	DISABLE_INTERRUPTS(CLBR_NONE) -	jmp nmi_userspace -nmi_schedule: -	ENABLE_INTERRUPTS(CLBR_ANY) -	call schedule -	DISABLE_INTERRUPTS(CLBR_ANY) -	jmp nmi_userspace  	CFI_ENDPROC -#else -	jmp paranoid_exit -	CFI_ENDPROC -#endif  END(nmi) +	/* +	 * If an NMI hit an iret because of an exception or breakpoint, +	 * it can lose its NMI context, and a nested NMI may come in. +	 * In that case, the nested NMI will change the preempted NMI's +	 * stack to jump to here when it does the final iret. 
+	 */ +repeat_nmi: +	INTR_FRAME +	/* Update the stack variable to say we are still in NMI */ +	movq $1, 5*8(%rsp) + +	/* copy the saved stack back to copy stack */ +	.rept 5 +	pushq_cfi 4*8(%rsp) +	.endr + +	jmp restart_nmi +	CFI_ENDPROC +end_repeat_nmi: +  ENTRY(ignore_sysret)  	CFI_STARTPROC  	mov $-ENOSYS,%eax diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c index af0699ba48c..48d9d4ea102 100644 --- a/arch/x86/kernel/head.c +++ b/arch/x86/kernel/head.c @@ -52,5 +52,5 @@ void __init reserve_ebda_region(void)  		lowmem = 0x9f000;  	/* reserve all memory between lowmem and the 1MB mark */ -	memblock_x86_reserve_range(lowmem, 0x100000, "* BIOS reserved"); +	memblock_reserve(lowmem, 0x100000 - lowmem);  } diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 3bb08509a7a..51ff18616d5 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -31,9 +31,8 @@ static void __init i386_default_early_setup(void)  void __init i386_start_kernel(void)  { -	memblock_init(); - -	memblock_x86_reserve_range(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); +	memblock_reserve(__pa_symbol(&_text), +			 __pa_symbol(&__bss_stop) - __pa_symbol(&_text));  #ifdef CONFIG_BLK_DEV_INITRD  	/* Reserve INITRD */ @@ -42,7 +41,7 @@ void __init i386_start_kernel(void)  		u64 ramdisk_image = boot_params.hdr.ramdisk_image;  		u64 ramdisk_size  = boot_params.hdr.ramdisk_size;  		u64 ramdisk_end   = PAGE_ALIGN(ramdisk_image + ramdisk_size); -		memblock_x86_reserve_range(ramdisk_image, ramdisk_end, "RAMDISK"); +		memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image);  	}  #endif diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 5655c2272ad..3a3b779f41d 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -98,9 +98,8 @@ void __init x86_64_start_reservations(char *real_mode_data)  {  	copy_bootdata(__va(real_mode_data)); -	memblock_init(); - -	memblock_x86_reserve_range(__pa_symbol(&_text), 
__pa_symbol(&__bss_stop), "TEXT DATA BSS"); +	memblock_reserve(__pa_symbol(&_text), +			 __pa_symbol(&__bss_stop) - __pa_symbol(&_text));  #ifdef CONFIG_BLK_DEV_INITRD  	/* Reserve INITRD */ @@ -109,7 +108,7 @@ void __init x86_64_start_reservations(char *real_mode_data)  		unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;  		unsigned long ramdisk_size  = boot_params.hdr.ramdisk_size;  		unsigned long ramdisk_end   = PAGE_ALIGN(ramdisk_image + ramdisk_size); -		memblock_x86_reserve_range(ramdisk_image, ramdisk_end, "RAMDISK"); +		memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image);  	}  #endif diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index e11e39478a4..40f4eb3766d 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -417,6 +417,10 @@ ENTRY(phys_base)  ENTRY(idt_table)  	.skip IDT_ENTRIES * 16 +	.align L1_CACHE_BYTES +ENTRY(nmi_idt_table) +	.skip IDT_ENTRIES * 16 +  	__PAGE_ALIGNED_BSS  	.align PAGE_SIZE  ENTRY(empty_zero_page) diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index b946a9eac7d..ad0de0c2714 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -2,7 +2,6 @@  #include <linux/clockchips.h>  #include <linux/interrupt.h>  #include <linux/export.h> -#include <linux/sysdev.h>  #include <linux/delay.h>  #include <linux/errno.h>  #include <linux/i8253.h> @@ -32,8 +31,6 @@  #define HPET_MIN_CYCLES			128  #define HPET_MIN_PROG_DELTA		(HPET_MIN_CYCLES + (HPET_MIN_CYCLES >> 1)) -#define EVT_TO_HPET_DEV(evt) container_of(evt, struct hpet_dev, evt) -  /*   * HPET address is set in acpi/boot.c, when an ACPI entry exists   */ @@ -55,6 +52,11 @@ struct hpet_dev {  	char				name[10];  }; +inline struct hpet_dev *EVT_TO_HPET_DEV(struct clock_event_device *evtdev) +{ +	return container_of(evtdev, struct hpet_dev, evt); +} +  inline unsigned int hpet_readl(unsigned int a)  {  	return readl(hpet_virt_address + a); @@ -1049,6 +1051,14 @@ int hpet_rtc_timer_init(void)  }  
EXPORT_SYMBOL_GPL(hpet_rtc_timer_init); +static void hpet_disable_rtc_channel(void) +{ +	unsigned long cfg; +	cfg = hpet_readl(HPET_T1_CFG); +	cfg &= ~HPET_TN_ENABLE; +	hpet_writel(cfg, HPET_T1_CFG); +} +  /*   * The functions below are called from rtc driver.   * Return 0 if HPET is not being used. @@ -1060,6 +1070,9 @@ int hpet_mask_rtc_irq_bit(unsigned long bit_mask)  		return 0;  	hpet_rtc_flags &= ~bit_mask; +	if (unlikely(!hpet_rtc_flags)) +		hpet_disable_rtc_channel(); +  	return 1;  }  EXPORT_SYMBOL_GPL(hpet_mask_rtc_irq_bit); @@ -1125,15 +1138,11 @@ EXPORT_SYMBOL_GPL(hpet_rtc_dropped_irq);  static void hpet_rtc_timer_reinit(void)  { -	unsigned int cfg, delta; +	unsigned int delta;  	int lost_ints = -1; -	if (unlikely(!hpet_rtc_flags)) { -		cfg = hpet_readl(HPET_T1_CFG); -		cfg &= ~HPET_TN_ENABLE; -		hpet_writel(cfg, HPET_T1_CFG); -		return; -	} +	if (unlikely(!hpet_rtc_flags)) +		hpet_disable_rtc_channel();  	if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit)  		delta = hpet_default_delta; diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 429e0c92924..7943e0c21bd 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -74,6 +74,10 @@ int arch_show_interrupts(struct seq_file *p, int prec)  	for_each_online_cpu(j)  		seq_printf(p, "%10u ", irq_stats(j)->apic_irq_work_irqs);  	seq_printf(p, "  IRQ work interrupts\n"); +	seq_printf(p, "%*s: ", prec, "RTR"); +	for_each_online_cpu(j) +		seq_printf(p, "%10u ", irq_stats(j)->icr_read_retry_count); +	seq_printf(p, "  APIC ICR read retries\n");  #endif  	if (x86_platform_ipi_callback) {  		seq_printf(p, "%*s: ", prec, "PLT"); @@ -136,6 +140,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu)  	sum += irq_stats(cpu)->irq_spurious_count;  	sum += irq_stats(cpu)->apic_perf_irqs;  	sum += irq_stats(cpu)->apic_irq_work_irqs; +	sum += irq_stats(cpu)->icr_read_retry_count;  #endif  	if (x86_platform_ipi_callback)  		sum += irq_stats(cpu)->x86_platform_ipis; @@ -181,8 +186,8 @@ unsigned int __irq_entry 
do_IRQ(struct pt_regs *regs)  	unsigned vector = ~regs->orig_ax;  	unsigned irq; -	exit_idle();  	irq_enter(); +	exit_idle();  	irq = __this_cpu_read(vector_irq[vector]); @@ -209,10 +214,10 @@ void smp_x86_platform_ipi(struct pt_regs *regs)  	ack_APIC_irq(); -	exit_idle(); -  	irq_enter(); +	exit_idle(); +  	inc_irq_stat(x86_platform_ipis);  	if (x86_platform_ipi_callback) diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 72090705a65..40fc86161d9 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -28,6 +28,9 @@ DEFINE_PER_CPU(struct pt_regs *, irq_regs);  EXPORT_PER_CPU_SYMBOL(irq_regs);  #ifdef CONFIG_DEBUG_STACKOVERFLOW + +int sysctl_panic_on_stackoverflow __read_mostly; +  /* Debugging check for stack overflow: is there less than 1KB free? */  static int check_stack_overflow(void)  { @@ -43,6 +46,8 @@ static void print_stack_overflow(void)  {  	printk(KERN_WARNING "low stack detected by irq handler\n");  	dump_stack(); +	if (sysctl_panic_on_stackoverflow) +		panic("low stack detected by irq handler - check messages\n");  }  #else diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index acf8fbf8fbd..d04d3ecded6 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -26,6 +26,8 @@ EXPORT_PER_CPU_SYMBOL(irq_stat);  DEFINE_PER_CPU(struct pt_regs *, irq_regs);  EXPORT_PER_CPU_SYMBOL(irq_regs); +int sysctl_panic_on_stackoverflow; +  /*   * Probabilistic stack overflow check:   * @@ -36,15 +38,39 @@ EXPORT_PER_CPU_SYMBOL(irq_regs);  static inline void stack_overflow_check(struct pt_regs *regs)  {  #ifdef CONFIG_DEBUG_STACKOVERFLOW +#define STACK_TOP_MARGIN	128 +	struct orig_ist *oist; +	u64 irq_stack_top, irq_stack_bottom; +	u64 estack_top, estack_bottom;  	u64 curbase = (u64)task_stack_page(current); -	WARN_ONCE(regs->sp >= curbase && -		  regs->sp <= curbase + THREAD_SIZE && -		  regs->sp <  curbase + sizeof(struct thread_info) + -					sizeof(struct pt_regs) + 128, +	if (user_mode_vm(regs)) +		
return; + +	if (regs->sp >= curbase + sizeof(struct thread_info) + +				  sizeof(struct pt_regs) + STACK_TOP_MARGIN && +	    regs->sp <= curbase + THREAD_SIZE) +		return; + +	irq_stack_top = (u64)__get_cpu_var(irq_stack_union.irq_stack) + +			STACK_TOP_MARGIN; +	irq_stack_bottom = (u64)__get_cpu_var(irq_stack_ptr); +	if (regs->sp >= irq_stack_top && regs->sp <= irq_stack_bottom) +		return; + +	oist = &__get_cpu_var(orig_ist); +	estack_top = (u64)oist->ist[0] - EXCEPTION_STKSZ + STACK_TOP_MARGIN; +	estack_bottom = (u64)oist->ist[N_EXCEPTION_STACKS - 1]; +	if (regs->sp >= estack_top && regs->sp <= estack_bottom) +		return; + +	WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx)\n", +		current->comm, curbase, regs->sp, +		irq_stack_top, irq_stack_bottom, +		estack_top, estack_bottom); -		  "do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n", -			current->comm, curbase, regs->sp); +	if (sysctl_panic_on_stackoverflow) +		panic("low stack detected by irq handler - check messages\n");  #endif  } diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index b3300e6bace..313fb5cddbc 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -9,7 +9,7 @@  #include <linux/kprobes.h>  #include <linux/init.h>  #include <linux/kernel_stat.h> -#include <linux/sysdev.h> +#include <linux/device.h>  #include <linux/bitops.h>  #include <linux/acpi.h>  #include <linux/io.h> diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index ea9d5f2f13e..2889b3d4388 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -50,7 +50,7 @@ void arch_jump_label_transform(struct jump_entry *entry,  	put_online_cpus();  } -void arch_jump_label_transform_static(struct jump_entry *entry, +__init_or_module void arch_jump_label_transform_static(struct jump_entry *entry,  				      enum jump_label_type type)  {  	__jump_label_transform(entry, type, 
text_poke_early); diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index a9c2116001d..f0c6fd6f176 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -39,8 +39,6 @@  #include <asm/desc.h>  #include <asm/tlbflush.h> -#define MMU_QUEUE_SIZE 1024 -  static int kvmapf = 1;  static int parse_no_kvmapf(char *arg) @@ -60,21 +58,10 @@ static int parse_no_stealacc(char *arg)  early_param("no-steal-acc", parse_no_stealacc); -struct kvm_para_state { -	u8 mmu_queue[MMU_QUEUE_SIZE]; -	int mmu_queue_len; -}; - -static DEFINE_PER_CPU(struct kvm_para_state, para_state);  static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);  static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64);  static int has_steal_clock = 0; -static struct kvm_para_state *kvm_para_state(void) -{ -	return &per_cpu(para_state, raw_smp_processor_id()); -} -  /*   * No need for any "IO delay" on KVM   */ @@ -271,151 +258,6 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code)  	}  } -static void kvm_mmu_op(void *buffer, unsigned len) -{ -	int r; -	unsigned long a1, a2; - -	do { -		a1 = __pa(buffer); -		a2 = 0;   /* on i386 __pa() always returns <4G */ -		r = kvm_hypercall3(KVM_HC_MMU_OP, len, a1, a2); -		buffer += r; -		len -= r; -	} while (len); -} - -static void mmu_queue_flush(struct kvm_para_state *state) -{ -	if (state->mmu_queue_len) { -		kvm_mmu_op(state->mmu_queue, state->mmu_queue_len); -		state->mmu_queue_len = 0; -	} -} - -static void kvm_deferred_mmu_op(void *buffer, int len) -{ -	struct kvm_para_state *state = kvm_para_state(); - -	if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU) { -		kvm_mmu_op(buffer, len); -		return; -	} -	if (state->mmu_queue_len + len > sizeof state->mmu_queue) -		mmu_queue_flush(state); -	memcpy(state->mmu_queue + state->mmu_queue_len, buffer, len); -	state->mmu_queue_len += len; -} - -static void kvm_mmu_write(void *dest, u64 val) -{ -	__u64 pte_phys; -	struct kvm_mmu_op_write_pte wpte; - -#ifdef 
CONFIG_HIGHPTE -	struct page *page; -	unsigned long dst = (unsigned long) dest; - -	page = kmap_atomic_to_page(dest); -	pte_phys = page_to_pfn(page); -	pte_phys <<= PAGE_SHIFT; -	pte_phys += (dst & ~(PAGE_MASK)); -#else -	pte_phys = (unsigned long)__pa(dest); -#endif -	wpte.header.op = KVM_MMU_OP_WRITE_PTE; -	wpte.pte_val = val; -	wpte.pte_phys = pte_phys; - -	kvm_deferred_mmu_op(&wpte, sizeof wpte); -} - -/* - * We only need to hook operations that are MMU writes.  We hook these so that - * we can use lazy MMU mode to batch these operations.  We could probably - * improve the performance of the host code if we used some of the information - * here to simplify processing of batched writes. - */ -static void kvm_set_pte(pte_t *ptep, pte_t pte) -{ -	kvm_mmu_write(ptep, pte_val(pte)); -} - -static void kvm_set_pte_at(struct mm_struct *mm, unsigned long addr, -			   pte_t *ptep, pte_t pte) -{ -	kvm_mmu_write(ptep, pte_val(pte)); -} - -static void kvm_set_pmd(pmd_t *pmdp, pmd_t pmd) -{ -	kvm_mmu_write(pmdp, pmd_val(pmd)); -} - -#if PAGETABLE_LEVELS >= 3 -#ifdef CONFIG_X86_PAE -static void kvm_set_pte_atomic(pte_t *ptep, pte_t pte) -{ -	kvm_mmu_write(ptep, pte_val(pte)); -} - -static void kvm_pte_clear(struct mm_struct *mm, -			  unsigned long addr, pte_t *ptep) -{ -	kvm_mmu_write(ptep, 0); -} - -static void kvm_pmd_clear(pmd_t *pmdp) -{ -	kvm_mmu_write(pmdp, 0); -} -#endif - -static void kvm_set_pud(pud_t *pudp, pud_t pud) -{ -	kvm_mmu_write(pudp, pud_val(pud)); -} - -#if PAGETABLE_LEVELS == 4 -static void kvm_set_pgd(pgd_t *pgdp, pgd_t pgd) -{ -	kvm_mmu_write(pgdp, pgd_val(pgd)); -} -#endif -#endif /* PAGETABLE_LEVELS >= 3 */ - -static void kvm_flush_tlb(void) -{ -	struct kvm_mmu_op_flush_tlb ftlb = { -		.header.op = KVM_MMU_OP_FLUSH_TLB, -	}; - -	kvm_deferred_mmu_op(&ftlb, sizeof ftlb); -} - -static void kvm_release_pt(unsigned long pfn) -{ -	struct kvm_mmu_op_release_pt rpt = { -		.header.op = KVM_MMU_OP_RELEASE_PT, -		.pt_phys = (u64)pfn << PAGE_SHIFT, -	}; - -	
kvm_mmu_op(&rpt, sizeof rpt); -} - -static void kvm_enter_lazy_mmu(void) -{ -	paravirt_enter_lazy_mmu(); -} - -static void kvm_leave_lazy_mmu(void) -{ -	struct kvm_para_state *state = kvm_para_state(); - -	mmu_queue_flush(state); -	paravirt_leave_lazy_mmu(); -} -  static void __init paravirt_ops_setup(void)  {  	pv_info.name = "KVM"; @@ -424,29 +266,6 @@ static void __init paravirt_ops_setup(void)  	if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY))  		pv_cpu_ops.io_delay = kvm_io_delay; -	if (kvm_para_has_feature(KVM_FEATURE_MMU_OP)) { -		pv_mmu_ops.set_pte = kvm_set_pte; -		pv_mmu_ops.set_pte_at = kvm_set_pte_at; -		pv_mmu_ops.set_pmd = kvm_set_pmd; -#if PAGETABLE_LEVELS >= 3 -#ifdef CONFIG_X86_PAE -		pv_mmu_ops.set_pte_atomic = kvm_set_pte_atomic; -		pv_mmu_ops.pte_clear = kvm_pte_clear; -		pv_mmu_ops.pmd_clear = kvm_pmd_clear; -#endif -		pv_mmu_ops.set_pud = kvm_set_pud; -#if PAGETABLE_LEVELS == 4 -		pv_mmu_ops.set_pgd = kvm_set_pgd; -#endif -#endif -		pv_mmu_ops.flush_tlb_user = kvm_flush_tlb; -		pv_mmu_ops.release_pte = kvm_release_pt; -		pv_mmu_ops.release_pmd = kvm_release_pt; -		pv_mmu_ops.release_pud = kvm_release_pt; - -		pv_mmu_ops.lazy_mode.enter = kvm_enter_lazy_mmu; -		pv_mmu_ops.lazy_mode.leave = kvm_leave_lazy_mmu; -	}  #ifdef CONFIG_X86_IO_APIC  	no_timer_check = 1;  #endif diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index d494799aafc..73465aab28f 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -1,14 +1,18 @@  /*   *  AMD CPU Microcode Update Driver for Linux - *  Copyright (C) 2008 Advanced Micro Devices Inc. + *  Copyright (C) 2008-2011 Advanced Micro Devices Inc.   *   *  Author: Peter Oruba <peter.oruba@amd.com>   *   *  Based on work by:   *  Tigran Aivazian <tigran@aivazian.fsnet.co.uk>   * - *  This driver allows to upgrade microcode on AMD - *  family 0x10 and 0x11 processors. 
+ *  Maintainers: + *  Andreas Herrmann <andreas.herrmann3@amd.com> + *  Borislav Petkov <borislav.petkov@amd.com> + * + *  This driver allows to upgrade microcode on F10h AMD + *  CPUs and later.   *   *  Licensed under the terms of the GNU General Public   *  License version 2. See file COPYING for details. @@ -71,6 +75,9 @@ struct microcode_amd {  static struct equiv_cpu_entry *equiv_cpu_table; +/* page-sized ucode patch buffer */ +void *patch; +  static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)  {  	struct cpuinfo_x86 *c = &cpu_data(cpu); @@ -86,27 +93,76 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)  	return 0;  } -static int get_matching_microcode(int cpu, struct microcode_header_amd *mc_hdr, -				  int rev) +static unsigned int verify_ucode_size(int cpu, u32 patch_size, +				      unsigned int size)  { -	unsigned int current_cpu_id; -	u16 equiv_cpu_id = 0; -	unsigned int i = 0; +	struct cpuinfo_x86 *c = &cpu_data(cpu); +	u32 max_size; + +#define F1XH_MPB_MAX_SIZE 2048 +#define F14H_MPB_MAX_SIZE 1824 +#define F15H_MPB_MAX_SIZE 4096 + +	switch (c->x86) { +	case 0x14: +		max_size = F14H_MPB_MAX_SIZE; +		break; +	case 0x15: +		max_size = F15H_MPB_MAX_SIZE; +		break; +	default: +		max_size = F1XH_MPB_MAX_SIZE; +		break; +	} + +	if (patch_size > min_t(u32, size, max_size)) { +		pr_err("patch size mismatch\n"); +		return 0; +	} + +	return patch_size; +} + +static u16 find_equiv_id(void) +{ +	unsigned int current_cpu_id, i = 0;  	BUG_ON(equiv_cpu_table == NULL); +  	current_cpu_id = cpuid_eax(0x00000001);  	while (equiv_cpu_table[i].installed_cpu != 0) { -		if (current_cpu_id == equiv_cpu_table[i].installed_cpu) { -			equiv_cpu_id = equiv_cpu_table[i].equiv_cpu; -			break; -		} +		if (current_cpu_id == equiv_cpu_table[i].installed_cpu) +			return equiv_cpu_table[i].equiv_cpu; +  		i++;  	} +	return 0; +} + +/* + * we signal a good patch is found by returning its size > 0 + */ +static int get_matching_microcode(int cpu, 
const u8 *ucode_ptr, +				  unsigned int leftover_size, int rev, +				  unsigned int *current_size) +{ +	struct microcode_header_amd *mc_hdr; +	unsigned int actual_size; +	u16 equiv_cpu_id; + +	/* size of the current patch we're staring at */ +	*current_size = *(u32 *)(ucode_ptr + 4) + SECTION_HDR_SIZE; +	equiv_cpu_id = find_equiv_id();  	if (!equiv_cpu_id)  		return 0; +	/* +	 * let's look at the patch header itself now +	 */ +	mc_hdr = (struct microcode_header_amd *)(ucode_ptr + SECTION_HDR_SIZE); +  	if (mc_hdr->processor_rev_id != equiv_cpu_id)  		return 0; @@ -120,7 +176,20 @@ static int get_matching_microcode(int cpu, struct microcode_header_amd *mc_hdr,  	if (mc_hdr->patch_id <= rev)  		return 0; -	return 1; +	/* +	 * now that the header looks sane, verify its size +	 */ +	actual_size = verify_ucode_size(cpu, *current_size, leftover_size); +	if (!actual_size) +		return 0; + +	/* clear the patch buffer */ +	memset(patch, 0, PAGE_SIZE); + +	/* all looks ok, get the binary patch */ +	get_ucode_data(patch, ucode_ptr + SECTION_HDR_SIZE, actual_size); + +	return actual_size;  }  static int apply_microcode_amd(int cpu) @@ -155,63 +224,6 @@ static int apply_microcode_amd(int cpu)  	return 0;  } -static unsigned int verify_ucode_size(int cpu, const u8 *buf, unsigned int size) -{ -	struct cpuinfo_x86 *c = &cpu_data(cpu); -	u32 max_size, actual_size; - -#define F1XH_MPB_MAX_SIZE 2048 -#define F14H_MPB_MAX_SIZE 1824 -#define F15H_MPB_MAX_SIZE 4096 - -	switch (c->x86) { -	case 0x14: -		max_size = F14H_MPB_MAX_SIZE; -		break; -	case 0x15: -		max_size = F15H_MPB_MAX_SIZE; -		break; -	default: -		max_size = F1XH_MPB_MAX_SIZE; -		break; -	} - -	actual_size = *(u32 *)(buf + 4); - -	if (actual_size + SECTION_HDR_SIZE > size || actual_size > max_size) { -		pr_err("section size mismatch\n"); -		return 0; -	} - -	return actual_size; -} - -static struct microcode_header_amd * -get_next_ucode(int cpu, const u8 *buf, unsigned int size, unsigned int *mc_size) -{ -	struct 
microcode_header_amd *mc = NULL; -	unsigned int actual_size = 0; - -	if (*(u32 *)buf != UCODE_UCODE_TYPE) { -		pr_err("invalid type field in container file section header\n"); -		goto out; -	} - -	actual_size = verify_ucode_size(cpu, buf, size); -	if (!actual_size) -		goto out; - -	mc = vzalloc(actual_size); -	if (!mc) -		goto out; - -	get_ucode_data(mc, buf + SECTION_HDR_SIZE, actual_size); -	*mc_size = actual_size + SECTION_HDR_SIZE; - -out: -	return mc; -} -  static int install_equiv_cpu_table(const u8 *buf)  {  	unsigned int *ibuf = (unsigned int *)buf; @@ -247,36 +259,38 @@ generic_load_microcode(int cpu, const u8 *data, size_t size)  {  	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;  	struct microcode_header_amd *mc_hdr = NULL; -	unsigned int mc_size, leftover; +	unsigned int mc_size, leftover, current_size = 0;  	int offset;  	const u8 *ucode_ptr = data;  	void *new_mc = NULL;  	unsigned int new_rev = uci->cpu_sig.rev; -	enum ucode_state state = UCODE_OK; +	enum ucode_state state = UCODE_ERROR;  	offset = install_equiv_cpu_table(ucode_ptr);  	if (offset < 0) {  		pr_err("failed to create equivalent cpu table\n"); -		return UCODE_ERROR; +		goto out;  	} -  	ucode_ptr += offset;  	leftover = size - offset; -	while (leftover) { -		mc_hdr = get_next_ucode(cpu, ucode_ptr, leftover, &mc_size); -		if (!mc_hdr) -			break; +	if (*(u32 *)ucode_ptr != UCODE_UCODE_TYPE) { +		pr_err("invalid type field in container file section header\n"); +		goto free_table; +	} -		if (get_matching_microcode(cpu, mc_hdr, new_rev)) { -			vfree(new_mc); +	while (leftover) { +		mc_size = get_matching_microcode(cpu, ucode_ptr, leftover, +						 new_rev, &current_size); +		if (mc_size) { +			mc_hdr  = patch; +			new_mc  = patch;  			new_rev = mc_hdr->patch_id; -			new_mc  = mc_hdr; -		} else -			vfree(mc_hdr); +			goto out_ok; +		} -		ucode_ptr += mc_size; -		leftover  -= mc_size; +		ucode_ptr += current_size; +		leftover  -= current_size;  	}  	if (!new_mc) { @@ -284,29 +298,46 @@
generic_load_microcode(int cpu, const u8 *data, size_t size)  		goto free_table;  	} -	if (!leftover) { -		vfree(uci->mc); -		uci->mc = new_mc; -		pr_debug("CPU%d update ucode (0x%08x -> 0x%08x)\n", -			 cpu, uci->cpu_sig.rev, new_rev); -	} else { -		vfree(new_mc); -		state = UCODE_ERROR; -	} +out_ok: +	uci->mc = new_mc; +	state = UCODE_OK; +	pr_debug("CPU%d update ucode (0x%08x -> 0x%08x)\n", +		 cpu, uci->cpu_sig.rev, new_rev);  free_table:  	free_equiv_cpu_table(); +out:  	return state;  } +/* + * AMD microcode firmware naming convention, up to family 15h they are in + * the legacy file: + * + *    amd-ucode/microcode_amd.bin + * + * This legacy file is always smaller than 2K in size. + * + * Starting at family 15h they are in family specific firmware files: + * + *    amd-ucode/microcode_amd_fam15h.bin + *    amd-ucode/microcode_amd_fam16h.bin + *    ... + * + * These might be larger than 2K. + */  static enum ucode_state request_microcode_amd(int cpu, struct device *device)  { -	const char *fw_name = "amd-ucode/microcode_amd.bin"; +	char fw_name[36] = "amd-ucode/microcode_amd.bin";  	const struct firmware *fw;  	enum ucode_state ret = UCODE_NFOUND; +	struct cpuinfo_x86 *c = &cpu_data(cpu); + +	if (c->x86 >= 0x15) +		snprintf(fw_name, sizeof(fw_name), "amd-ucode/microcode_amd_fam%.2xh.bin", c->x86); -	if (request_firmware(&fw, fw_name, device)) { +	if (request_firmware(&fw, (const char *)fw_name, device)) {  		pr_err("failed to load file %s\n", fw_name);  		goto out;  	} @@ -329,7 +360,6 @@ out:  static enum ucode_state  request_microcode_user(int cpu, const void __user *buf, size_t size)  { -	pr_info("AMD microcode update via /dev/cpu/microcode not supported\n");  	return UCODE_ERROR;  } @@ -337,7 +367,6 @@ static void microcode_fini_cpu_amd(int cpu)  {  	struct ucode_cpu_info *uci = ucode_cpu_info + cpu; -	vfree(uci->mc);  	uci->mc = NULL;  } @@ -351,5 +380,14 @@ static struct microcode_ops microcode_amd_ops = {  struct microcode_ops * __init 
init_amd_microcode(void)  { +	patch = (void *)get_zeroed_page(GFP_KERNEL); +	if (!patch) +		return NULL; +  	return &microcode_amd_ops;  } + +void __exit exit_amd_microcode(void) +{ +	free_page((unsigned long)patch); +} diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index f2d2a664e79..fda91c30710 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -256,7 +256,7 @@ static int __init microcode_dev_init(void)  	return 0;  } -static void microcode_dev_exit(void) +static void __exit microcode_dev_exit(void)  {  	misc_deregister(&microcode_dev);  } @@ -292,8 +292,8 @@ static int reload_for_cpu(int cpu)  	return err;  } -static ssize_t reload_store(struct sys_device *dev, -			    struct sysdev_attribute *attr, +static ssize_t reload_store(struct device *dev, +			    struct device_attribute *attr,  			    const char *buf, size_t size)  {  	unsigned long val; @@ -318,30 +318,30 @@ static ssize_t reload_store(struct sys_device *dev,  	return ret;  } -static ssize_t version_show(struct sys_device *dev, -			struct sysdev_attribute *attr, char *buf) +static ssize_t version_show(struct device *dev, +			struct device_attribute *attr, char *buf)  {  	struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;  	return sprintf(buf, "0x%x\n", uci->cpu_sig.rev);  } -static ssize_t pf_show(struct sys_device *dev, -			struct sysdev_attribute *attr, char *buf) +static ssize_t pf_show(struct device *dev, +			struct device_attribute *attr, char *buf)  {  	struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;  	return sprintf(buf, "0x%x\n", uci->cpu_sig.pf);  } -static SYSDEV_ATTR(reload, 0200, NULL, reload_store); -static SYSDEV_ATTR(version, 0400, version_show, NULL); -static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL); +static DEVICE_ATTR(reload, 0200, NULL, reload_store); +static DEVICE_ATTR(version, 0400, version_show, NULL); +static DEVICE_ATTR(processor_flags, 0400, pf_show, NULL);  static struct attribute
*mc_default_attrs[] = { -	&attr_reload.attr, -	&attr_version.attr, -	&attr_processor_flags.attr, +	&dev_attr_reload.attr, +	&dev_attr_version.attr, +	&dev_attr_processor_flags.attr,  	NULL  }; @@ -405,43 +405,45 @@ static enum ucode_state microcode_update_cpu(int cpu)  	return ustate;  } -static int mc_sysdev_add(struct sys_device *sys_dev) +static int mc_device_add(struct device *dev, struct subsys_interface *sif)  { -	int err, cpu = sys_dev->id; +	int err, cpu = dev->id;  	if (!cpu_online(cpu))  		return 0;  	pr_debug("CPU%d added\n", cpu); -	err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group); +	err = sysfs_create_group(&dev->kobj, &mc_attr_group);  	if (err)  		return err;  	if (microcode_init_cpu(cpu) == UCODE_ERROR) { -		sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); +		sysfs_remove_group(&dev->kobj, &mc_attr_group);  		return -EINVAL;  	}  	return err;  } -static int mc_sysdev_remove(struct sys_device *sys_dev) +static int mc_device_remove(struct device *dev, struct subsys_interface *sif)  { -	int cpu = sys_dev->id; +	int cpu = dev->id;  	if (!cpu_online(cpu))  		return 0;  	pr_debug("CPU%d removed\n", cpu);  	microcode_fini_cpu(cpu); -	sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); +	sysfs_remove_group(&dev->kobj, &mc_attr_group);  	return 0;  } -static struct sysdev_driver mc_sysdev_driver = { -	.add			= mc_sysdev_add, -	.remove			= mc_sysdev_remove, +static struct subsys_interface mc_cpu_interface = { +	.name			= "microcode", +	.subsys			= &cpu_subsys, +	.add_dev		= mc_device_add, +	.remove_dev		= mc_device_remove,  };  /** @@ -464,9 +466,9 @@ static __cpuinit int  mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)  {  	unsigned int cpu = (unsigned long)hcpu; -	struct sys_device *sys_dev; +	struct device *dev; -	sys_dev = get_cpu_sysdev(cpu); +	dev = get_cpu_device(cpu);  	switch (action) {  	case CPU_ONLINE:  	case CPU_ONLINE_FROZEN: @@ -474,13 +476,13 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long 
action, void *hcpu)  	case CPU_DOWN_FAILED:  	case CPU_DOWN_FAILED_FROZEN:  		pr_debug("CPU%d added\n", cpu); -		if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group)) +		if (sysfs_create_group(&dev->kobj, &mc_attr_group))  			pr_err("Failed to create group for CPU%d\n", cpu);  		break;  	case CPU_DOWN_PREPARE:  	case CPU_DOWN_PREPARE_FROZEN:  		/* Suspend is in progress, only remove the interface */ -		sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); +		sysfs_remove_group(&dev->kobj, &mc_attr_group);  		pr_debug("CPU%d removed\n", cpu);  		break; @@ -519,27 +521,23 @@ static int __init microcode_init(void)  	microcode_pdev = platform_device_register_simple("microcode", -1,  							 NULL, 0); -	if (IS_ERR(microcode_pdev)) { -		microcode_dev_exit(); +	if (IS_ERR(microcode_pdev))  		return PTR_ERR(microcode_pdev); -	}  	get_online_cpus();  	mutex_lock(&microcode_mutex); -	error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver); +	error = subsys_interface_register(&mc_cpu_interface);  	mutex_unlock(&microcode_mutex);  	put_online_cpus(); -	if (error) { -		platform_device_unregister(microcode_pdev); -		return error; -	} +	if (error) +		goto out_pdev;  	error = microcode_dev_init();  	if (error) -		return error; +		goto out_driver;  	register_syscore_ops(&mc_syscore_ops);  	register_hotcpu_notifier(&mc_cpu_notifier); @@ -548,11 +546,27 @@ static int __init microcode_init(void)  		" <tigran@aivazian.fsnet.co.uk>, Peter Oruba\n");  	return 0; + +out_driver: +	get_online_cpus(); +	mutex_lock(&microcode_mutex); + +	subsys_interface_unregister(&mc_cpu_interface); + +	mutex_unlock(&microcode_mutex); +	put_online_cpus(); + +out_pdev: +	platform_device_unregister(microcode_pdev); +	return error; +  }  module_init(microcode_init);  static void __exit microcode_exit(void)  { +	struct cpuinfo_x86 *c = &cpu_data(0); +  	microcode_dev_exit();  	unregister_hotcpu_notifier(&mc_cpu_notifier); @@ -561,7 +575,7 @@ static void __exit microcode_exit(void)  	get_online_cpus();
mutex_lock(&microcode_mutex); -	sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver); +	subsys_interface_unregister(&mc_cpu_interface);  	mutex_unlock(&microcode_mutex);  	put_online_cpus(); @@ -570,6 +584,9 @@ static void __exit microcode_exit(void)  	microcode_ops = NULL; +	if (c->x86_vendor == X86_VENDOR_AMD) +		exit_amd_microcode(); +  	pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n");  }  module_exit(microcode_exit); diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 9103b89c145..ca470e4c92d 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -95,8 +95,8 @@ static void __init MP_bus_info(struct mpc_bus *m)  	}  #endif +	set_bit(m->busid, mp_bus_not_pci);  	if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA) - 1) == 0) { -		set_bit(m->busid, mp_bus_not_pci);  #if defined(CONFIG_EISA) || defined(CONFIG_MCA)  		mp_bus_id_to_type[m->busid] = MP_BUS_ISA;  #endif @@ -564,9 +564,7 @@ void __init default_get_smp_config(unsigned int early)  static void __init smp_reserve_memory(struct mpf_intel *mpf)  { -	unsigned long size = get_mpc_size(mpf->physptr); - -	memblock_x86_reserve_range(mpf->physptr, mpf->physptr+size, "* MP-table mpc"); +	memblock_reserve(mpf->physptr, get_mpc_size(mpf->physptr));  }  static int __init smp_scan_config(unsigned long base, unsigned long length) @@ -595,7 +593,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length)  			       mpf, (u64)virt_to_phys(mpf));  			mem = virt_to_phys(mpf); -			memblock_x86_reserve_range(mem, mem + sizeof(*mpf), "* MP-table mpf"); +			memblock_reserve(mem, sizeof(*mpf));  			if (mpf->physptr)  				smp_reserve_memory(mpf); @@ -836,10 +834,8 @@ early_param("alloc_mptable", parse_alloc_mptable_opt);  void __init early_reserve_e820_mpc_new(void)  { -	if (enable_update_mptable && alloc_mptable) { -		u64 startt = 0; -		mpc_new_phys = early_reserve_e820(startt, mpc_new_length, 4); -	} +	if (enable_update_mptable && alloc_mptable) +
mpc_new_phys = early_reserve_e820(mpc_new_length, 4);  }  static int __init update_mp_table(void) diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 12fcbe2c143..96356762a51 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -236,7 +236,7 @@ static struct notifier_block __refdata msr_class_cpu_notifier = {  	.notifier_call = msr_class_cpu_callback,  }; -static char *msr_devnode(struct device *dev, mode_t *mode) +static char *msr_devnode(struct device *dev, umode_t *mode)  {  	return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt));  } diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index e88f37b58dd..47acaf31916 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -405,9 +405,108 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)  		unknown_nmi_error(reason, regs);  } +/* + * NMIs can hit breakpoints which will cause it to lose its + * NMI context with the CPU when the breakpoint does an iret. + */ +#ifdef CONFIG_X86_32 +/* + * For i386, NMIs use the same stack as the kernel, and we can + * add a workaround to the iret problem in C. Simply have 3 states + * the NMI can be in. + * + *  1) not running + *  2) executing + *  3) latched + * + * When no NMI is in progress, it is in the "not running" state. + * When an NMI comes in, it goes into the "executing" state. + * Normally, if another NMI is triggered, it does not interrupt + * the running NMI and the HW will simply latch it so that when + * the first NMI finishes, it will restart the second NMI. + * (Note, the latch is binary, thus multiple NMIs triggering, + *  when one is running, are ignored. Only one NMI is restarted.) + * + * If an NMI hits a breakpoint that executes an iret, another + * NMI can preempt it. We do not want to allow this new NMI + * to run, but we want to execute it when the first one finishes. 
+ * We set the state to "latched", and the first NMI will perform + * an cmpxchg on the state, and if it doesn't successfully + * reset the state to "not running" it will restart the next + * NMI. + */ +enum nmi_states { +	NMI_NOT_RUNNING, +	NMI_EXECUTING, +	NMI_LATCHED, +}; +static DEFINE_PER_CPU(enum nmi_states, nmi_state); + +#define nmi_nesting_preprocess(regs)					\ +	do {								\ +		if (__get_cpu_var(nmi_state) != NMI_NOT_RUNNING) {	\ +			__get_cpu_var(nmi_state) = NMI_LATCHED;		\ +			return;						\ +		}							\ +	nmi_restart:							\ +		__get_cpu_var(nmi_state) = NMI_EXECUTING;		\ +	} while (0) + +#define nmi_nesting_postprocess()					\ +	do {								\ +		if (cmpxchg(&__get_cpu_var(nmi_state),			\ +		    NMI_EXECUTING, NMI_NOT_RUNNING) != NMI_EXECUTING)	\ +			goto nmi_restart;				\ +	} while (0) +#else /* x86_64 */ +/* + * In x86_64 things are a bit more difficult. This has the same problem + * where an NMI hitting a breakpoint that calls iret will remove the + * NMI context, allowing a nested NMI to enter. What makes this more + * difficult is that both NMIs and breakpoints have their own stack. + * When a new NMI or breakpoint is executed, the stack is set to a fixed + * point. If an NMI is nested, it will have its stack set at that same + * fixed address that the first NMI had, and will start corrupting the + * stack. This is handled in entry_64.S, but the same problem exists with + * the breakpoint stack. + * + * If a breakpoint is being processed, and the debug stack is being used, + * if an NMI comes in and also hits a breakpoint, the stack pointer + * will be set to the same fixed address as the breakpoint that was + * interrupted, causing that stack to be corrupted. To handle this case, + * check if the stack that was interrupted is the debug stack, and if + * so, change the IDT so that new breakpoints will use the current stack + * and not switch to the fixed address. On return of the NMI, switch back + * to the original IDT. 
+ */ +static DEFINE_PER_CPU(int, update_debug_stack); + +static inline void nmi_nesting_preprocess(struct pt_regs *regs) +{ +	/* +	 * If we interrupted a breakpoint, it is possible that +	 * the nmi handler will have breakpoints too. We need to +	 * change the IDT such that breakpoints that happen here +	 * continue to use the NMI stack. +	 */ +	if (unlikely(is_debug_stack(regs->sp))) { +		debug_stack_set_zero(); +		__get_cpu_var(update_debug_stack) = 1; +	} +} + +static inline void nmi_nesting_postprocess(void) +{ +	if (unlikely(__get_cpu_var(update_debug_stack))) +		debug_stack_reset(); +} +#endif +  dotraplinkage notrace __kprobes void  do_nmi(struct pt_regs *regs, long error_code)  { +	nmi_nesting_preprocess(regs); +  	nmi_enter();  	inc_irq_stat(__nmi_count); @@ -416,6 +515,9 @@ do_nmi(struct pt_regs *regs, long error_code)  		default_do_nmi(regs);  	nmi_exit(); + +	/* On i386, may loop back to preprocess */ +	nmi_nesting_postprocess();  }  void stop_nmi(void) diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c new file mode 100644 index 00000000000..0d01a8ea4e1 --- /dev/null +++ b/arch/x86/kernel/nmi_selftest.c @@ -0,0 +1,180 @@ +/* + * arch/x86/kernel/nmi-selftest.c + * + * Testsuite for NMI: IPIs + * + * Started by Don Zickus: + * (using lib/locking-selftest.c as a guide) + * + *   Copyright (C) 2011 Red Hat, Inc., Don Zickus <dzickus@redhat.com> + */ + +#include <linux/smp.h> +#include <linux/cpumask.h> +#include <linux/delay.h> + +#include <asm/apic.h> +#include <asm/nmi.h> + +#define SUCCESS		0 +#define FAILURE		1 +#define TIMEOUT		2 + +static int nmi_fail; + +/* check to see if NMI IPIs work on this machine */ +static DECLARE_BITMAP(nmi_ipi_mask, NR_CPUS) __read_mostly; + +static int testcase_total; +static int testcase_successes; +static int expected_testcase_failures; +static int unexpected_testcase_failures; +static int unexpected_testcase_unknowns; + +static int nmi_unk_cb(unsigned int val, struct pt_regs *regs) +{ +	
unexpected_testcase_unknowns++; +	return NMI_HANDLED; +} + +static void init_nmi_testsuite(void) +{ +	/* trap all the unknown NMIs we may generate */ +	register_nmi_handler(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk"); +} + +static void cleanup_nmi_testsuite(void) +{ +	unregister_nmi_handler(NMI_UNKNOWN, "nmi_selftest_unk"); +} + +static int test_nmi_ipi_callback(unsigned int val, struct pt_regs *regs) +{ +        int cpu = raw_smp_processor_id(); + +        if (cpumask_test_and_clear_cpu(cpu, to_cpumask(nmi_ipi_mask))) +                return NMI_HANDLED; + +        return NMI_DONE; +} + +static void test_nmi_ipi(struct cpumask *mask) +{ +	unsigned long timeout; + +	if (register_nmi_handler(NMI_LOCAL, test_nmi_ipi_callback, +				 NMI_FLAG_FIRST, "nmi_selftest")) { +		nmi_fail = FAILURE; +		return; +	} + +	/* sync above data before sending NMI */ +	wmb(); + +	apic->send_IPI_mask(mask, NMI_VECTOR); + +	/* Don't wait longer than a second */ +	timeout = USEC_PER_SEC; +	while (!cpumask_empty(mask) && timeout--) +	        udelay(1); + +	/* What happens if we timeout, do we still unregister?? 
*/ +	unregister_nmi_handler(NMI_LOCAL, "nmi_selftest"); + +	if (!timeout) +		nmi_fail = TIMEOUT; +	return; +} + +static void remote_ipi(void) +{ +	cpumask_copy(to_cpumask(nmi_ipi_mask), cpu_online_mask); +	cpumask_clear_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask)); +	if (!cpumask_empty(to_cpumask(nmi_ipi_mask))) +		test_nmi_ipi(to_cpumask(nmi_ipi_mask)); +} + +static void local_ipi(void) +{ +	cpumask_clear(to_cpumask(nmi_ipi_mask)); +	cpumask_set_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask)); +	test_nmi_ipi(to_cpumask(nmi_ipi_mask)); +} + +static void reset_nmi(void) +{ +	nmi_fail = 0; +} + +static void dotest(void (*testcase_fn)(void), int expected) +{ +	testcase_fn(); +	/* +	 * Filter out expected failures: +	 */ +	if (nmi_fail != expected) { +		unexpected_testcase_failures++; + +		if (nmi_fail == FAILURE) +			printk("FAILED |"); +		else if (nmi_fail == TIMEOUT) +			printk("TIMEOUT|"); +		else +			printk("ERROR  |"); +		dump_stack(); +	} else { +		testcase_successes++; +		printk("  ok  |"); +	} +	testcase_total++; + +	reset_nmi(); +} + +static inline void print_testname(const char *testname) +{ +	printk("%12s:", testname); +} + +void nmi_selftest(void) +{ +	init_nmi_testsuite(); + +        /* +	 * Run the testsuite: +	 */ +	printk("----------------\n"); +	printk("| NMI testsuite:\n"); +	printk("--------------------\n"); + +	print_testname("remote IPI"); +	dotest(remote_ipi, SUCCESS); +	printk("\n"); +	print_testname("local IPI"); +	dotest(local_ipi, SUCCESS); +	printk("\n"); + +	cleanup_nmi_testsuite(); + +	if (unexpected_testcase_failures) { +		printk("--------------------\n"); +		printk("BUG: %3d unexpected failures (out of %3d) - debugging disabled! |\n", +			unexpected_testcase_failures, testcase_total); +		printk("-----------------------------------------------------------------\n"); +	} else if (expected_testcase_failures && testcase_successes) { +		printk("--------------------\n"); +		printk("%3d out of %3d testcases failed, as expected. 
|\n", +			expected_testcase_failures, testcase_total); +		printk("----------------------------------------------------\n"); +	} else if (expected_testcase_failures && !testcase_successes) { +		printk("--------------------\n"); +		printk("All %3d testcases failed, as expected. |\n", +			expected_testcase_failures); +		printk("----------------------------------------\n"); +	} else { +		printk("--------------------\n"); +		printk("Good, all %3d testcases passed! |\n", +			testcase_successes); +		printk("---------------------------------\n"); +	} +} diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 80dc793b3f6..1c4d769e21e 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -45,6 +45,15 @@ int iommu_detected __read_mostly = 0;   */  int iommu_pass_through __read_mostly; +/* + * Group multi-function PCI devices into a single device-group for the + * iommu_device_group interface.  This tells the iommu driver to pretend + * it cannot distinguish between functions of a device, exposing only one + * group for the device.  Useful for disallowing use of individual PCI + * functions from userspace drivers. + */ +int iommu_group_mf __read_mostly; +  extern struct iommu_table_entry __iommu_table[], __iommu_table_end[];  /* Dummy device used for NULL arguments (normally ISA). 
*/ @@ -169,6 +178,8 @@ static __init int iommu_setup(char *p)  #endif  		if (!strncmp(p, "pt", 2))  			iommu_pass_through = 1; +		if (!strncmp(p, "group_mf", 8)) +			iommu_group_mf = 1;  		gart_parse_options(p); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index b9b3b1a5164..15763af7bfe 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -293,7 +293,7 @@ int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)  	regs.orig_ax = -1;  	regs.ip = (unsigned long) kernel_thread_helper;  	regs.cs = __KERNEL_CS | get_kernel_rpl(); -	regs.flags = X86_EFLAGS_IF | 0x2; +	regs.flags = X86_EFLAGS_IF | X86_EFLAGS_BIT1;  	/* Ok, create the new process.. */  	return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL); @@ -403,6 +403,14 @@ void default_idle(void)  EXPORT_SYMBOL(default_idle);  #endif +bool set_pm_idle_to_default(void) +{ +	bool ret = !!pm_idle; + +	pm_idle = default_idle; + +	return ret; +}  void stop_this_cpu(void *dummy)  {  	local_irq_disable(); diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 795b79f984c..c08d1ff12b7 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -99,7 +99,8 @@ void cpu_idle(void)  	/* endless idle loop with no priority at all */  	while (1) { -		tick_nohz_stop_sched_tick(1); +		tick_nohz_idle_enter(); +		rcu_idle_enter();  		while (!need_resched()) {  			check_pgt_cache(); @@ -116,7 +117,8 @@ void cpu_idle(void)  				pm_idle();  			start_critical_timings();  		} -		tick_nohz_restart_sched_tick(); +		rcu_idle_exit(); +		tick_nohz_idle_exit();  		preempt_enable_no_resched();  		schedule();  		preempt_disable(); @@ -212,6 +214,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,  	task_user_gs(p) = get_user_gs(regs); +	p->fpu_counter = 0;  	p->thread.io_bitmap_ptr = NULL;  	tsk = current;  	err = -ENOMEM; @@ -297,22 +300,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
*next = &next_p->thread;  	int cpu = smp_processor_id();  	struct tss_struct *tss = &per_cpu(init_tss, cpu); -	bool preload_fpu; +	fpu_switch_t fpu;  	/* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ -	/* -	 * If the task has used fpu the last 5 timeslices, just do a full -	 * restore of the math state immediately to avoid the trap; the -	 * chances of needing FPU soon are obviously high now -	 */ -	preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5; - -	__unlazy_fpu(prev_p); - -	/* we're going to use this soon, after a few expensive things */ -	if (preload_fpu) -		prefetch(next->fpu.state); +	fpu = switch_fpu_prepare(prev_p, next_p, cpu);  	/*  	 * Reload esp0. @@ -352,11 +344,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)  		     task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))  		__switch_to_xtra(prev_p, next_p, tss); -	/* If we're going to preload the fpu context, make sure clts -	   is run while we're batching the cpu state updates. */ -	if (preload_fpu) -		clts(); -  	/*  	 * Leave lazy mode, flushing any hypercalls made here.  	 
* This must be done before restoring TLS segments so @@ -366,15 +353,14 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)  	 */  	arch_end_context_switch(next_p); -	if (preload_fpu) -		__math_state_restore(); -  	/*  	 * Restore %gs if needed (which is common)  	 */  	if (prev->gs | next->gs)  		lazy_load_gs(next->gs); +	switch_fpu_finish(next_p, fpu); +  	percpu_write(current_task, next_p);  	return prev_p; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 3bd7e6eebf3..cfa5c90c01d 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -122,7 +122,7 @@ void cpu_idle(void)  	/* endless idle loop with no priority at all */  	while (1) { -		tick_nohz_stop_sched_tick(1); +		tick_nohz_idle_enter();  		while (!need_resched()) {  			rmb(); @@ -139,8 +139,14 @@ void cpu_idle(void)  			enter_idle();  			/* Don't trace irqs off for idle */  			stop_critical_timings(); + +			/* enter_idle() needs rcu for notifiers */ +			rcu_idle_enter(); +  			if (cpuidle_idle_call())  				pm_idle(); + +			rcu_idle_exit();  			start_critical_timings();  			/* In many cases the interrupt that ended idle @@ -149,7 +155,7 @@ void cpu_idle(void)  			__exit_idle();  		} -		tick_nohz_restart_sched_tick(); +		tick_nohz_idle_exit();  		preempt_enable_no_resched();  		schedule();  		preempt_disable(); @@ -280,6 +286,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,  	set_tsk_thread_flag(p, TIF_FORK); +	p->fpu_counter = 0;  	p->thread.io_bitmap_ptr = NULL;  	savesegment(gs, p->thread.gsindex); @@ -293,13 +300,12 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,  	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));  	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { -		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); +		p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr, +						  IO_BITMAP_BYTES, GFP_KERNEL);  		if (!p->thread.io_bitmap_ptr) {  			
p->thread.io_bitmap_max = 0;  			return -ENOMEM;  		} -		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, -				IO_BITMAP_BYTES);  		set_tsk_thread_flag(p, TIF_IO_BITMAP);  	} @@ -381,18 +387,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)  	int cpu = smp_processor_id();  	struct tss_struct *tss = &per_cpu(init_tss, cpu);  	unsigned fsindex, gsindex; -	bool preload_fpu; - -	/* -	 * If the task has used fpu the last 5 timeslices, just do a full -	 * restore of the math state immediately to avoid the trap; the -	 * chances of needing FPU soon are obviously high now -	 */ -	preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5; +	fpu_switch_t fpu; -	/* we're going to use this soon, after a few expensive things */ -	if (preload_fpu) -		prefetch(next->fpu.state); +	fpu = switch_fpu_prepare(prev_p, next_p, cpu);  	/*  	 * Reload esp0, LDT and the page table pointer: @@ -422,13 +419,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)  	load_TLS(next, cpu); -	/* Must be after DS reload */ -	__unlazy_fpu(prev_p); - -	/* Make sure cpu is ready for new context */ -	if (preload_fpu) -		clts(); -  	/*  	 * Leave lazy mode, flushing any hypercalls made here.  	 * This must be done before restoring TLS segments so @@ -469,6 +459,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)  		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);  	prev->gsindex = gsindex; +	switch_fpu_finish(next_p, fpu); +  	/*  	 * Switch the PDA and FPU contexts.  	 */ @@ -487,13 +479,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)  		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))  		__switch_to_xtra(prev_p, next_p, tss); -	/* -	 * Preload the FPU context, now that we've determined that the -	 * task is likely to be using it.  
-	 */ -	if (preload_fpu) -		__math_state_restore(); -  	return prev_p;  } diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 82528799c5d..50267386b76 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -749,7 +749,8 @@ put:  /*   * Handle PTRACE_POKEUSR calls for the debug register area.   */ -int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val) +static int ptrace_set_debugreg(struct task_struct *tsk, int n, +			       unsigned long val)  {  	struct thread_struct *thread = &(tsk->thread);  	int rc = 0; @@ -1391,20 +1392,18 @@ long syscall_trace_enter(struct pt_regs *regs)  	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))  		trace_sys_enter(regs, regs->orig_ax); -	if (unlikely(current->audit_context)) { -		if (IS_IA32) -			audit_syscall_entry(AUDIT_ARCH_I386, -					    regs->orig_ax, -					    regs->bx, regs->cx, -					    regs->dx, regs->si); +	if (IS_IA32) +		audit_syscall_entry(AUDIT_ARCH_I386, +				    regs->orig_ax, +				    regs->bx, regs->cx, +				    regs->dx, regs->si);  #ifdef CONFIG_X86_64 -		else -			audit_syscall_entry(AUDIT_ARCH_X86_64, -					    regs->orig_ax, -					    regs->di, regs->si, -					    regs->dx, regs->r10); +	else +		audit_syscall_entry(AUDIT_ARCH_X86_64, +				    regs->orig_ax, +				    regs->di, regs->si, +				    regs->dx, regs->r10);  #endif -	}  	return ret ?: regs->orig_ax;  } @@ -1413,8 +1412,7 @@ void syscall_trace_leave(struct pt_regs *regs)  {  	bool step; -	if (unlikely(current->audit_context)) -		audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); +	audit_syscall_exit(regs);  	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))  		trace_sys_exit(regs, regs->ax); diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index b78643d0f9a..03920a15a63 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -553,4 +553,17 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC,  			quirk_amd_nb_node);  
DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_LINK,  			quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F0, +			quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F1, +			quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F2, +			quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3, +			quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4, +			quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F5, +			quirk_amd_nb_node); +  #endif diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index e334be1182b..d840e69a853 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -39,6 +39,14 @@ static int reboot_mode;  enum reboot_type reboot_type = BOOT_ACPI;  int reboot_force; +/* This variable is used privately to keep track of whether or not + * reboot_type is still set to its default value (i.e., reboot= hasn't + * been set on the command line).  This is needed so that we can + * suppress DMI scanning for reboot quirks.  Without it, it's + * impossible to override a faulty reboot quirk without recompiling. + */ +static int reboot_default = 1; +  #if defined(CONFIG_X86_32) && defined(CONFIG_SMP)  static int reboot_cpu = -1;  #endif @@ -67,6 +75,12 @@ bool port_cf9_safe = false;  static int __init reboot_setup(char *str)  {  	for (;;) { +		/* Having anything passed on the command line via +		 * reboot= will cause us to disable DMI checking +		 * below. +		 */ +		reboot_default = 0; +  		switch (*str) {  		case 'w':  			reboot_mode = 0x1234; @@ -124,7 +138,7 @@ __setup("reboot=", reboot_setup);   */  /* - * Some machines require the "reboot=b"  commandline option, + * Some machines require the "reboot=b" or "reboot=k"  commandline options,   * this quirk makes that automatic.   
*/  static int __init set_bios_reboot(const struct dmi_system_id *d) @@ -136,6 +150,15 @@ static int __init set_bios_reboot(const struct dmi_system_id *d)  	return 0;  } +static int __init set_kbd_reboot(const struct dmi_system_id *d) +{ +	if (reboot_type != BOOT_KBD) { +		reboot_type = BOOT_KBD; +		printk(KERN_INFO "%s series board detected. Selecting KBD-method for reboot.\n", d->ident); +	} +	return 0; +} +  static struct dmi_system_id __initdata reboot_dmi_table[] = {  	{	/* Handle problems with rebooting on Dell E520's */  		.callback = set_bios_reboot, @@ -286,16 +309,8 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {  			DMI_MATCH(DMI_BOARD_NAME, "P4S800"),  		},  	}, -	{	/* Handle problems with rebooting on VersaLogic Menlow boards */ -		.callback = set_bios_reboot, -		.ident = "VersaLogic Menlow based board", -		.matches = { -			DMI_MATCH(DMI_BOARD_VENDOR, "VersaLogic Corporation"), -			DMI_MATCH(DMI_BOARD_NAME, "VersaLogic Menlow board"), -		}, -	},  	{ /* Handle reboot issue on Acer Aspire one */ -		.callback = set_bios_reboot, +		.callback = set_kbd_reboot,  		.ident = "Acer Aspire One A110",  		.matches = {  			DMI_MATCH(DMI_SYS_VENDOR, "Acer"), @@ -307,7 +322,12 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {  static int __init reboot_init(void)  { -	dmi_check_system(reboot_dmi_table); +	/* Only do the DMI check if reboot_type hasn't been overridden +	 * on the command line +	 */ +	if (reboot_default) { +		dmi_check_system(reboot_dmi_table); +	}  	return 0;  }  core_initcall(reboot_init); @@ -443,12 +463,25 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = {  			DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6420"),  		},  	}, +	{	/* Handle problems with rebooting on the OptiPlex 990. 
*/ +		.callback = set_pci_reboot, +		.ident = "Dell OptiPlex 990", +		.matches = { +			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), +			DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 990"), +		}, +	},  	{ }  };  static int __init pci_reboot_init(void)  { -	dmi_check_system(pci_reboot_dmi_table); +	/* Only do the DMI check if reboot_type hasn't been overridden +	 * on the command line +	 */ +	if (reboot_default) { +		dmi_check_system(pci_reboot_dmi_table); +	}  	return 0;  }  core_initcall(pci_reboot_init); diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 348ce016a83..af6db6ec5b2 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -12,6 +12,7 @@  #include <asm/vsyscall.h>  #include <asm/x86_init.h>  #include <asm/time.h> +#include <asm/mrst.h>  #ifdef CONFIG_X86_32  /* @@ -242,6 +243,10 @@ static __init int add_rtc_cmos(void)  	if (of_have_populated_dt())  		return 0; +	/* Intel MID platforms don't have ioport rtc */ +	if (mrst_identify_cpu()) +		return -ENODEV; +  	platform_device_register(&rtc_device);  	dev_info(&rtc_device.dev,  		 "registered platform RTC device (no PNP device found)\n"); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index cf0ef986cb6..d7d5099fe87 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -306,7 +306,8 @@ static void __init cleanup_highmap(void)  static void __init reserve_brk(void)  {  	if (_brk_end > _brk_start) -		memblock_x86_reserve_range(__pa(_brk_start), __pa(_brk_end), "BRK"); +		memblock_reserve(__pa(_brk_start), +				 __pa(_brk_end) - __pa(_brk_start));  	/* Mark brk area as locked down and no longer taking any  	   new allocations */ @@ -331,13 +332,13 @@ static void __init relocate_initrd(void)  	ramdisk_here = memblock_find_in_range(0, end_of_lowmem, area_size,  					 PAGE_SIZE); -	if (ramdisk_here == MEMBLOCK_ERROR) +	if (!ramdisk_here)  		panic("Cannot find place for new RAMDISK of size %lld\n",  			 ramdisk_size);  	/* Note: this includes all the lowmem currently 
occupied by  	   the initrd, we rely on that fact to keep the data intact. */ -	memblock_x86_reserve_range(ramdisk_here, ramdisk_here + area_size, "NEW RAMDISK"); +	memblock_reserve(ramdisk_here, area_size);  	initrd_start = ramdisk_here + PAGE_OFFSET;  	initrd_end   = initrd_start + ramdisk_size;  	printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n", @@ -393,7 +394,7 @@ static void __init reserve_initrd(void)  	initrd_start = 0;  	if (ramdisk_size >= (end_of_lowmem>>1)) { -		memblock_x86_free_range(ramdisk_image, ramdisk_end); +		memblock_free(ramdisk_image, ramdisk_end - ramdisk_image);  		printk(KERN_ERR "initrd too large to handle, "  		       "disabling initrd\n");  		return; @@ -416,7 +417,7 @@ static void __init reserve_initrd(void)  	relocate_initrd(); -	memblock_x86_free_range(ramdisk_image, ramdisk_end); +	memblock_free(ramdisk_image, ramdisk_end - ramdisk_image);  }  #else  static void __init reserve_initrd(void) @@ -490,15 +491,13 @@ static void __init memblock_x86_reserve_range_setup_data(void)  {  	struct setup_data *data;  	u64 pa_data; -	char buf[32];  	if (boot_params.hdr.version < 0x0209)  		return;  	pa_data = boot_params.hdr.setup_data;  	while (pa_data) {  		data = early_memremap(pa_data, sizeof(*data)); -		sprintf(buf, "setup data %x", data->type); -		memblock_x86_reserve_range(pa_data, pa_data+sizeof(*data)+data->len, buf); +		memblock_reserve(pa_data, sizeof(*data) + data->len);  		pa_data = data->next;  		early_iounmap(data, sizeof(*data));  	} @@ -554,7 +553,7 @@ static void __init reserve_crashkernel(void)  		crash_base = memblock_find_in_range(alignment,  			       CRASH_KERNEL_ADDR_MAX, crash_size, alignment); -		if (crash_base == MEMBLOCK_ERROR) { +		if (!crash_base) {  			pr_info("crashkernel reservation failed - No suitable area found.\n");  			return;  		} @@ -568,7 +567,7 @@ static void __init reserve_crashkernel(void)  			return;  		}  	} -	memblock_x86_reserve_range(crash_base, crash_base + crash_size, "CRASH KERNEL"); +	
memblock_reserve(crash_base, crash_size);  	printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "  			"for crashkernel (System RAM: %ldMB)\n", @@ -626,7 +625,7 @@ static __init void reserve_ibft_region(void)  	addr = find_ibft_region(&size);  	if (size) -		memblock_x86_reserve_range(addr, addr + size, "* ibft"); +		memblock_reserve(addr, size);  }  static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10; @@ -750,12 +749,7 @@ void __init setup_arch(char **cmdline_p)  #endif  #ifdef CONFIG_EFI  	if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, -#ifdef CONFIG_X86_32 -		     "EL32", -#else -		     "EL64", -#endif -	 4)) { +		     EFI_LOADER_SIGNATURE, 4)) {  		efi_enabled = 1;  		efi_memblock_x86_reserve_range();  	} diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 54ddaeb221c..46a01bdc27e 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -682,7 +682,6 @@ static int  handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,  		struct pt_regs *regs)  { -	sigset_t blocked;  	int ret;  	/* Are we from a system call? 
*/ @@ -733,10 +732,7 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,  	 */  	regs->flags &= ~X86_EFLAGS_TF; -	sigorsets(&blocked, ¤t->blocked, &ka->sa.sa_mask); -	if (!(ka->sa.sa_flags & SA_NODEFER)) -		sigaddset(&blocked, sig); -	set_current_blocked(&blocked); +	block_sigmask(ka, sig);  	tracehook_signal_handler(sig, info, ka, regs,  				 test_thread_flag(TIF_SINGLESTEP)); diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 16204dc1548..66c74f481ca 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -29,6 +29,7 @@  #include <asm/mmu_context.h>  #include <asm/proto.h>  #include <asm/apic.h> +#include <asm/nmi.h>  /*   *	Some notes on x86 processor bugs affecting SMP operation:   * @@ -148,6 +149,60 @@ void native_send_call_func_ipi(const struct cpumask *mask)  	free_cpumask_var(allbutself);  } +static atomic_t stopping_cpu = ATOMIC_INIT(-1); + +static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs) +{ +	/* We are registered on stopping cpu too, avoid spurious NMI */ +	if (raw_smp_processor_id() == atomic_read(&stopping_cpu)) +		return NMI_HANDLED; + +	stop_this_cpu(NULL); + +	return NMI_HANDLED; +} + +static void native_nmi_stop_other_cpus(int wait) +{ +	unsigned long flags; +	unsigned long timeout; + +	if (reboot_force) +		return; + +	/* +	 * Use an own vector here because smp_call_function +	 * does lots of things not suitable in a panic situation. +	 */ +	if (num_online_cpus() > 1) { +		/* did someone beat us here? */ +		if (atomic_cmpxchg(&stopping_cpu, -1, safe_smp_processor_id()) != -1) +			return; + +		if (register_nmi_handler(NMI_LOCAL, smp_stop_nmi_callback, +					 NMI_FLAG_FIRST, "smp_stop")) +			/* Note: we ignore failures here */ +			return; + +		/* sync above data before sending NMI */ +		wmb(); + +		apic->send_IPI_allbutself(NMI_VECTOR); + +		/* +		 * Don't wait longer than a second if the caller +		 * didn't ask us to wait. 
+		 */ +		timeout = USEC_PER_SEC; +		while (num_online_cpus() > 1 && (wait || timeout--)) +			udelay(1); +	} + +	local_irq_save(flags); +	disable_local_APIC(); +	local_irq_restore(flags); +} +  /*   * this function calls the 'stop' function on all other CPUs in the system.   */ @@ -160,7 +215,7 @@ asmlinkage void smp_reboot_interrupt(void)  	irq_exit();  } -static void native_stop_other_cpus(int wait) +static void native_irq_stop_other_cpus(int wait)  {  	unsigned long flags;  	unsigned long timeout; @@ -194,6 +249,11 @@ static void native_stop_other_cpus(int wait)  	local_irq_restore(flags);  } +static void native_smp_disable_nmi_ipi(void) +{ +	smp_ops.stop_other_cpus = native_irq_stop_other_cpus; +} +  /*   * Reschedule call back.   */ @@ -225,12 +285,20 @@ void smp_call_function_single_interrupt(struct pt_regs *regs)  	irq_exit();  } +static int __init nonmi_ipi_setup(char *str) +{ +        native_smp_disable_nmi_ipi(); +        return 1; +} + +__setup("nonmi_ipi", nonmi_ipi_setup); +  struct smp_ops smp_ops = {  	.smp_prepare_boot_cpu	= native_smp_prepare_boot_cpu,  	.smp_prepare_cpus	= native_smp_prepare_cpus,  	.smp_cpus_done		= native_smp_cpus_done, -	.stop_other_cpus	= native_stop_other_cpus, +	.stop_other_cpus	= native_nmi_stop_other_cpus,  	.smp_send_reschedule	= native_smp_send_reschedule,  	.cpu_up			= native_cpu_up, diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 9f548cb4a95..66d250c00d1 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -207,23 +207,29 @@ static void __cpuinit smp_callin(void)  	 * Need to setup vector mappings before we enable interrupts.  	 */  	setup_vector_irq(smp_processor_id()); + +	/* +	 * Save our processor parameters. Note: this information +	 * is needed for clock calibration. +	 */ +	smp_store_cpu_info(cpuid); +  	/*  	 * Get our bogomips. +	 * Update loops_per_jiffy in cpu_data. 
Previous call to +	 * smp_store_cpu_info() stored a value that is close but not as +	 * accurate as the value just calculated.  	 *  	 * Need to enable IRQs because it can take longer and then  	 * the NMI watchdog might kill us.  	 */  	local_irq_enable();  	calibrate_delay(); +	cpu_data(cpuid).loops_per_jiffy = loops_per_jiffy;  	local_irq_disable();  	pr_debug("Stack at about %p\n", &cpuid);  	/* -	 * Save our processor parameters -	 */ -	smp_store_cpu_info(cpuid); - -	/*  	 * This must be done before setting cpu_online_mask  	 * or calling notify_cpu_starting.  	 */ @@ -840,7 +846,8 @@ int __cpuinit native_cpu_up(unsigned int cpu)  	pr_debug("++++++++++++++++++++=_---CPU UP  %u\n", cpu);  	if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid || -	    !physid_isset(apicid, phys_cpu_present_map)) { +	    !physid_isset(apicid, phys_cpu_present_map) || +	    (!x2apic_mode && apicid >= 255)) {  		printk(KERN_ERR "%s: bad cpu %d\n", __func__, cpu);  		return -EINVAL;  	} @@ -1142,6 +1149,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus)  {  	pr_debug("Boot done.\n"); +	nmi_selftest();  	impress_friends();  #ifdef CONFIG_X86_IO_APIC  	setup_ioapic_dest(); diff --git a/arch/x86/kernel/syscall_32.c b/arch/x86/kernel/syscall_32.c new file mode 100644 index 00000000000..147fcd4941c --- /dev/null +++ b/arch/x86/kernel/syscall_32.c @@ -0,0 +1,25 @@ +/* System call table for i386. */ + +#include <linux/linkage.h> +#include <linux/sys.h> +#include <linux/cache.h> +#include <asm/asm-offsets.h> + +#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void sym(void) ; +#include <asm/syscalls_32.h> +#undef __SYSCALL_I386 + +#define __SYSCALL_I386(nr, sym, compat) [nr] = sym, + +typedef asmlinkage void (*sys_call_ptr_t)(void); + +extern asmlinkage void sys_ni_syscall(void); + +const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { +	/* +	 * Smells like a compiler bug -- it doesn't work +	 * when the & below is removed. +	 */ +	[0 ... 
__NR_syscall_max] = &sys_ni_syscall, +#include <asm/syscalls_32.h> +}; diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c index de87d600829..7ac7943be02 100644 --- a/arch/x86/kernel/syscall_64.c +++ b/arch/x86/kernel/syscall_64.c @@ -5,15 +5,11 @@  #include <linux/cache.h>  #include <asm/asm-offsets.h> -#define __NO_STUBS +#define __SYSCALL_64(nr, sym, compat) extern asmlinkage void sym(void) ; +#include <asm/syscalls_64.h> +#undef __SYSCALL_64 -#define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ; -#undef _ASM_X86_UNISTD_64_H -#include <asm/unistd_64.h> - -#undef __SYSCALL -#define __SYSCALL(nr, sym) [nr] = sym, -#undef _ASM_X86_UNISTD_64_H +#define __SYSCALL_64(nr, sym, compat) [nr] = sym,  typedef void (*sys_call_ptr_t)(void); @@ -21,9 +17,9 @@ extern void sys_ni_syscall(void);  const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {  	/* -	*Smells like a like a compiler bug -- it doesn't work -	*when the & below is removed. -	*/ +	 * Smells like a compiler bug -- it doesn't work +	 * when the & below is removed. +	 */  	[0 ... 
__NR_syscall_max] = &sys_ni_syscall, -#include <asm/unistd_64.h> +#include <asm/syscalls_64.h>  }; diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S deleted file mode 100644 index 9a0e3129392..00000000000 --- a/arch/x86/kernel/syscall_table_32.S +++ /dev/null @@ -1,350 +0,0 @@ -ENTRY(sys_call_table) -	.long sys_restart_syscall	/* 0 - old "setup()" system call, used for restarting */ -	.long sys_exit -	.long ptregs_fork -	.long sys_read -	.long sys_write -	.long sys_open		/* 5 */ -	.long sys_close -	.long sys_waitpid -	.long sys_creat -	.long sys_link -	.long sys_unlink	/* 10 */ -	.long ptregs_execve -	.long sys_chdir -	.long sys_time -	.long sys_mknod -	.long sys_chmod		/* 15 */ -	.long sys_lchown16 -	.long sys_ni_syscall	/* old break syscall holder */ -	.long sys_stat -	.long sys_lseek -	.long sys_getpid	/* 20 */ -	.long sys_mount -	.long sys_oldumount -	.long sys_setuid16 -	.long sys_getuid16 -	.long sys_stime		/* 25 */ -	.long sys_ptrace -	.long sys_alarm -	.long sys_fstat -	.long sys_pause -	.long sys_utime		/* 30 */ -	.long sys_ni_syscall	/* old stty syscall holder */ -	.long sys_ni_syscall	/* old gtty syscall holder */ -	.long sys_access -	.long sys_nice -	.long sys_ni_syscall	/* 35 - old ftime syscall holder */ -	.long sys_sync -	.long sys_kill -	.long sys_rename -	.long sys_mkdir -	.long sys_rmdir		/* 40 */ -	.long sys_dup -	.long sys_pipe -	.long sys_times -	.long sys_ni_syscall	/* old prof syscall holder */ -	.long sys_brk		/* 45 */ -	.long sys_setgid16 -	.long sys_getgid16 -	.long sys_signal -	.long sys_geteuid16 -	.long sys_getegid16	/* 50 */ -	.long sys_acct -	.long sys_umount	/* recycled never used phys() */ -	.long sys_ni_syscall	/* old lock syscall holder */ -	.long sys_ioctl -	.long sys_fcntl		/* 55 */ -	.long sys_ni_syscall	/* old mpx syscall holder */ -	.long sys_setpgid -	.long sys_ni_syscall	/* old ulimit syscall holder */ -	.long sys_olduname -	.long sys_umask		/* 60 */ -	.long sys_chroot -	.long sys_ustat -	
.long sys_dup2 -	.long sys_getppid -	.long sys_getpgrp	/* 65 */ -	.long sys_setsid -	.long sys_sigaction -	.long sys_sgetmask -	.long sys_ssetmask -	.long sys_setreuid16	/* 70 */ -	.long sys_setregid16 -	.long sys_sigsuspend -	.long sys_sigpending -	.long sys_sethostname -	.long sys_setrlimit	/* 75 */ -	.long sys_old_getrlimit -	.long sys_getrusage -	.long sys_gettimeofday -	.long sys_settimeofday -	.long sys_getgroups16	/* 80 */ -	.long sys_setgroups16 -	.long sys_old_select -	.long sys_symlink -	.long sys_lstat -	.long sys_readlink	/* 85 */ -	.long sys_uselib -	.long sys_swapon -	.long sys_reboot -	.long sys_old_readdir -	.long sys_old_mmap	/* 90 */ -	.long sys_munmap -	.long sys_truncate -	.long sys_ftruncate -	.long sys_fchmod -	.long sys_fchown16	/* 95 */ -	.long sys_getpriority -	.long sys_setpriority -	.long sys_ni_syscall	/* old profil syscall holder */ -	.long sys_statfs -	.long sys_fstatfs	/* 100 */ -	.long sys_ioperm -	.long sys_socketcall -	.long sys_syslog -	.long sys_setitimer -	.long sys_getitimer	/* 105 */ -	.long sys_newstat -	.long sys_newlstat -	.long sys_newfstat -	.long sys_uname -	.long ptregs_iopl	/* 110 */ -	.long sys_vhangup -	.long sys_ni_syscall	/* old "idle" system call */ -	.long ptregs_vm86old -	.long sys_wait4 -	.long sys_swapoff	/* 115 */ -	.long sys_sysinfo -	.long sys_ipc -	.long sys_fsync -	.long ptregs_sigreturn -	.long ptregs_clone	/* 120 */ -	.long sys_setdomainname -	.long sys_newuname -	.long sys_modify_ldt -	.long sys_adjtimex -	.long sys_mprotect	/* 125 */ -	.long sys_sigprocmask -	.long sys_ni_syscall	/* old "create_module" */ -	.long sys_init_module -	.long sys_delete_module -	.long sys_ni_syscall	/* 130:	old "get_kernel_syms" */ -	.long sys_quotactl -	.long sys_getpgid -	.long sys_fchdir -	.long sys_bdflush -	.long sys_sysfs		/* 135 */ -	.long sys_personality -	.long sys_ni_syscall	/* reserved for afs_syscall */ -	.long sys_setfsuid16 -	.long sys_setfsgid16 -	.long sys_llseek	/* 140 */ -	.long sys_getdents -	.long 
sys_select -	.long sys_flock -	.long sys_msync -	.long sys_readv		/* 145 */ -	.long sys_writev -	.long sys_getsid -	.long sys_fdatasync -	.long sys_sysctl -	.long sys_mlock		/* 150 */ -	.long sys_munlock -	.long sys_mlockall -	.long sys_munlockall -	.long sys_sched_setparam -	.long sys_sched_getparam   /* 155 */ -	.long sys_sched_setscheduler -	.long sys_sched_getscheduler -	.long sys_sched_yield -	.long sys_sched_get_priority_max -	.long sys_sched_get_priority_min  /* 160 */ -	.long sys_sched_rr_get_interval -	.long sys_nanosleep -	.long sys_mremap -	.long sys_setresuid16 -	.long sys_getresuid16	/* 165 */ -	.long ptregs_vm86 -	.long sys_ni_syscall	/* Old sys_query_module */ -	.long sys_poll -	.long sys_ni_syscall	/* Old nfsservctl */ -	.long sys_setresgid16	/* 170 */ -	.long sys_getresgid16 -	.long sys_prctl -	.long ptregs_rt_sigreturn -	.long sys_rt_sigaction -	.long sys_rt_sigprocmask	/* 175 */ -	.long sys_rt_sigpending -	.long sys_rt_sigtimedwait -	.long sys_rt_sigqueueinfo -	.long sys_rt_sigsuspend -	.long sys_pread64	/* 180 */ -	.long sys_pwrite64 -	.long sys_chown16 -	.long sys_getcwd -	.long sys_capget -	.long sys_capset	/* 185 */ -	.long ptregs_sigaltstack -	.long sys_sendfile -	.long sys_ni_syscall	/* reserved for streams1 */ -	.long sys_ni_syscall	/* reserved for streams2 */ -	.long ptregs_vfork	/* 190 */ -	.long sys_getrlimit -	.long sys_mmap_pgoff -	.long sys_truncate64 -	.long sys_ftruncate64 -	.long sys_stat64	/* 195 */ -	.long sys_lstat64 -	.long sys_fstat64 -	.long sys_lchown -	.long sys_getuid -	.long sys_getgid	/* 200 */ -	.long sys_geteuid -	.long sys_getegid -	.long sys_setreuid -	.long sys_setregid -	.long sys_getgroups	/* 205 */ -	.long sys_setgroups -	.long sys_fchown -	.long sys_setresuid -	.long sys_getresuid -	.long sys_setresgid	/* 210 */ -	.long sys_getresgid -	.long sys_chown -	.long sys_setuid -	.long sys_setgid -	.long sys_setfsuid	/* 215 */ -	.long sys_setfsgid -	.long sys_pivot_root -	.long sys_mincore -	.long sys_madvise -	.long 
sys_getdents64	/* 220 */ -	.long sys_fcntl64 -	.long sys_ni_syscall	/* reserved for TUX */ -	.long sys_ni_syscall -	.long sys_gettid -	.long sys_readahead	/* 225 */ -	.long sys_setxattr -	.long sys_lsetxattr -	.long sys_fsetxattr -	.long sys_getxattr -	.long sys_lgetxattr	/* 230 */ -	.long sys_fgetxattr -	.long sys_listxattr -	.long sys_llistxattr -	.long sys_flistxattr -	.long sys_removexattr	/* 235 */ -	.long sys_lremovexattr -	.long sys_fremovexattr -	.long sys_tkill -	.long sys_sendfile64 -	.long sys_futex		/* 240 */ -	.long sys_sched_setaffinity -	.long sys_sched_getaffinity -	.long sys_set_thread_area -	.long sys_get_thread_area -	.long sys_io_setup	/* 245 */ -	.long sys_io_destroy -	.long sys_io_getevents -	.long sys_io_submit -	.long sys_io_cancel -	.long sys_fadvise64	/* 250 */ -	.long sys_ni_syscall -	.long sys_exit_group -	.long sys_lookup_dcookie -	.long sys_epoll_create -	.long sys_epoll_ctl	/* 255 */ -	.long sys_epoll_wait - 	.long sys_remap_file_pages - 	.long sys_set_tid_address - 	.long sys_timer_create - 	.long sys_timer_settime		/* 260 */ - 	.long sys_timer_gettime - 	.long sys_timer_getoverrun - 	.long sys_timer_delete - 	.long sys_clock_settime - 	.long sys_clock_gettime		/* 265 */ - 	.long sys_clock_getres - 	.long sys_clock_nanosleep -	.long sys_statfs64 -	.long sys_fstatfs64 -	.long sys_tgkill	/* 270 */ -	.long sys_utimes - 	.long sys_fadvise64_64 -	.long sys_ni_syscall	/* sys_vserver */ -	.long sys_mbind -	.long sys_get_mempolicy -	.long sys_set_mempolicy -	.long sys_mq_open -	.long sys_mq_unlink -	.long sys_mq_timedsend -	.long sys_mq_timedreceive	/* 280 */ -	.long sys_mq_notify -	.long sys_mq_getsetattr -	.long sys_kexec_load -	.long sys_waitid -	.long sys_ni_syscall		/* 285 */ /* available */ -	.long sys_add_key -	.long sys_request_key -	.long sys_keyctl -	.long sys_ioprio_set -	.long sys_ioprio_get		/* 290 */ -	.long sys_inotify_init -	.long sys_inotify_add_watch -	.long sys_inotify_rm_watch -	.long sys_migrate_pages -	.long sys_openat		
/* 295 */ -	.long sys_mkdirat -	.long sys_mknodat -	.long sys_fchownat -	.long sys_futimesat -	.long sys_fstatat64		/* 300 */ -	.long sys_unlinkat -	.long sys_renameat -	.long sys_linkat -	.long sys_symlinkat -	.long sys_readlinkat		/* 305 */ -	.long sys_fchmodat -	.long sys_faccessat -	.long sys_pselect6 -	.long sys_ppoll -	.long sys_unshare		/* 310 */ -	.long sys_set_robust_list -	.long sys_get_robust_list -	.long sys_splice -	.long sys_sync_file_range -	.long sys_tee			/* 315 */ -	.long sys_vmsplice -	.long sys_move_pages -	.long sys_getcpu -	.long sys_epoll_pwait -	.long sys_utimensat		/* 320 */ -	.long sys_signalfd -	.long sys_timerfd_create -	.long sys_eventfd -	.long sys_fallocate -	.long sys_timerfd_settime	/* 325 */ -	.long sys_timerfd_gettime -	.long sys_signalfd4 -	.long sys_eventfd2 -	.long sys_epoll_create1 -	.long sys_dup3			/* 330 */ -	.long sys_pipe2 -	.long sys_inotify_init1 -	.long sys_preadv -	.long sys_pwritev -	.long sys_rt_tgsigqueueinfo	/* 335 */ -	.long sys_perf_event_open -	.long sys_recvmmsg -	.long sys_fanotify_init -	.long sys_fanotify_mark -	.long sys_prlimit64		/* 340 */ -	.long sys_name_to_handle_at -	.long sys_open_by_handle_at -	.long sys_clock_adjtime -	.long sys_syncfs -	.long sys_sendmmsg		/* 345 */ -	.long sys_setns -	.long sys_process_vm_readv -	.long sys_process_vm_writev diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c index a91ae7709b4..a73b61055ad 100644 --- a/arch/x86/kernel/trampoline.c +++ b/arch/x86/kernel/trampoline.c @@ -14,11 +14,11 @@ void __init setup_trampolines(void)  	/* Has to be in very low memory so we can execute real-mode AP code. 
*/  	mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE); -	if (mem == MEMBLOCK_ERROR) +	if (!mem)  		panic("Cannot allocate trampoline\n");  	x86_trampoline_base = __va(mem); -	memblock_x86_reserve_range(mem, mem + size, "TRAMPOLINE"); +	memblock_reserve(mem, size);  	printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n",  	       x86_trampoline_base, (unsigned long long)mem, size); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index a8e3eb83466..4bbe04d9674 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -306,19 +306,20 @@ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)  			== NOTIFY_STOP)  		return;  #endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */ -#ifdef CONFIG_KPROBES +  	if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)  			== NOTIFY_STOP)  		return; -#else -	if (notify_die(DIE_TRAP, "int3", regs, error_code, 3, SIGTRAP) -			== NOTIFY_STOP) -		return; -#endif +	/* +	 * Let others (NMI) know that the debug stack is in use +	 * as we may switch to the interrupt stack. +	 */ +	debug_stack_usage_inc();  	preempt_conditional_sti(regs);  	do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);  	preempt_conditional_cli(regs); +	debug_stack_usage_dec();  }  #ifdef CONFIG_X86_64 @@ -411,6 +412,12 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)  							SIGTRAP) == NOTIFY_STOP)  		return; +	/* +	 * Let others (NMI) know that the debug stack is in use +	 * as we may switch to the interrupt stack. 
+	 */ +	debug_stack_usage_inc(); +  	/* It's safe to allow irq's after DR6 has been saved */  	preempt_conditional_sti(regs); @@ -418,6 +425,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)  		handle_vm86_trap((struct kernel_vm86_regs *) regs,  				error_code, 1);  		preempt_conditional_cli(regs); +		debug_stack_usage_dec();  		return;  	} @@ -437,6 +445,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)  	if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp)  		send_sigtrap(tsk, regs, error_code, si_code);  	preempt_conditional_cli(regs); +	debug_stack_usage_dec();  	return;  } @@ -562,41 +571,18 @@ asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void)  }  /* - * __math_state_restore assumes that cr0.TS is already clear and the - * fpu state is all ready for use.  Used during context switch. - */ -void __math_state_restore(void) -{ -	struct thread_info *thread = current_thread_info(); -	struct task_struct *tsk = thread->task; - -	/* -	 * Paranoid restore. send a SIGSEGV if we fail to restore the state. -	 */ -	if (unlikely(restore_fpu_checking(tsk))) { -		stts(); -		force_sig(SIGSEGV, tsk); -		return; -	} - -	thread->status |= TS_USEDFPU;	/* So we fnsave on switch_to() */ -	tsk->fpu_counter++; -} - -/*   * 'math_state_restore()' saves the current math information in the   * old math state array, and gets the new ones from the current task   *   * Careful.. There are problems with IBM-designed IRQ13 behaviour.   * Don't touch unless you *really* know how it works.   * - * Must be called with kernel preemption disabled (in this case, - * local interrupts are disabled at the call-site in entry.S). + * Must be called with kernel preemption disabled (eg with local + * local interrupts as in the case of do_device_not_available).   
*/ -asmlinkage void math_state_restore(void) +void math_state_restore(void)  { -	struct thread_info *thread = current_thread_info(); -	struct task_struct *tsk = thread->task; +	struct task_struct *tsk = current;  	if (!tsk_used_math(tsk)) {  		local_irq_enable(); @@ -613,9 +599,17 @@ asmlinkage void math_state_restore(void)  		local_irq_disable();  	} -	clts();				/* Allow maths ops (or we recurse) */ +	__thread_fpu_begin(tsk); +	/* +	 * Paranoid restore. send a SIGSEGV if we fail to restore the state. +	 */ +	if (unlikely(restore_fpu_checking(tsk))) { +		__thread_fpu_end(tsk); +		force_sig(SIGSEGV, tsk); +		return; +	} -	__math_state_restore(); +	tsk->fpu_counter++;  }  EXPORT_SYMBOL_GPL(math_state_restore); @@ -723,4 +717,10 @@ void __init trap_init(void)  	cpu_init();  	x86_init.irqs.trap_init(); + +#ifdef CONFIG_X86_64 +	memcpy(&nmi_idt_table, &idt_table, IDT_ENTRIES * 16); +	set_nmi_gate(1, &debug); +	set_nmi_gate(3, &int3); +#endif  } diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index db483369f10..a62c201c97e 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -35,7 +35,7 @@ static int __read_mostly tsc_unstable;     erroneous rdtsc usage on !cpu_has_tsc processors */  static int __read_mostly tsc_disabled = -1; -static int tsc_clocksource_reliable; +int tsc_clocksource_reliable;  /*   * Scheduler clock - returns current time in nanosec units.   
*/ @@ -178,11 +178,11 @@ static unsigned long calc_pmtimer_ref(u64 deltatsc, u64 pm1, u64 pm2)  }  #define CAL_MS		10 -#define CAL_LATCH	(CLOCK_TICK_RATE / (1000 / CAL_MS)) +#define CAL_LATCH	(PIT_TICK_RATE / (1000 / CAL_MS))  #define CAL_PIT_LOOPS	1000  #define CAL2_MS		50 -#define CAL2_LATCH	(CLOCK_TICK_RATE / (1000 / CAL2_MS)) +#define CAL2_LATCH	(PIT_TICK_RATE / (1000 / CAL2_MS))  #define CAL2_PIT_LOOPS	5000 @@ -290,14 +290,15 @@ static inline int pit_verify_msb(unsigned char val)  static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap)  {  	int count; -	u64 tsc = 0; +	u64 tsc = 0, prev_tsc = 0;  	for (count = 0; count < 50000; count++) {  		if (!pit_verify_msb(val))  			break; +		prev_tsc = tsc;  		tsc = get_cycles();  	} -	*deltap = get_cycles() - tsc; +	*deltap = get_cycles() - prev_tsc;  	*tscp = tsc;  	/* @@ -311,9 +312,9 @@ static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *de   * How many MSB values do we want to see? We aim for   * a maximum error rate of 500ppm (in practice the   * real error is much smaller), but refuse to spend - * more than 25ms on it. + * more than 50ms on it.   */ -#define MAX_QUICK_PIT_MS 25 +#define MAX_QUICK_PIT_MS 50  #define MAX_QUICK_PIT_ITERATIONS (MAX_QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256)  static unsigned long quick_pit_calibrate(void) @@ -383,15 +384,12 @@ success:  	 *  	 * As a result, we can depend on there not being  	 * any odd delays anywhere, and the TSC reads are -	 * reliable (within the error). We also adjust the -	 * delta to the middle of the error bars, just -	 * because it looks nicer. +	 * reliable (within the error).  	 
*  	 * kHz = ticks / time-in-seconds / 1000;  	 * kHz = (t2 - t1) / (I * 256 / PIT_TICK_RATE) / 1000  	 * kHz = ((t2 - t1) * PIT_TICK_RATE) / (I * 256 * 1000)  	 */ -	delta += (long)(d2 - d1)/2;  	delta *= PIT_TICK_RATE;  	do_div(delta, i*256*1000);  	printk("Fast TSC calibration using PIT\n"); @@ -995,3 +993,23 @@ void __init tsc_init(void)  	check_system_tsc_reliable();  } +#ifdef CONFIG_SMP +/* + * If we have a constant TSC and are using the TSC for the delay loop, + * we can skip clock calibration if another cpu in the same socket has already + * been calibrated. This assumes that CONSTANT_TSC applies to all + * cpus in the socket - this should be a safe assumption. + */ +unsigned long __cpuinit calibrate_delay_is_known(void) +{ +	int i, cpu = smp_processor_id(); + +	if (!tsc_disabled && !cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC)) +		return 0; + +	for_each_online_cpu(i) +		if (cpu_data(i).phys_proc_id == cpu_data(cpu).phys_proc_id) +			return cpu_data(i).loops_per_jiffy; +	return 0; +} +#endif diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index 0aa5fed8b9e..9eba29b46cb 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -113,7 +113,7 @@ void __cpuinit check_tsc_sync_source(int cpu)  	if (unsynchronized_tsc())  		return; -	if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { +	if (tsc_clocksource_reliable) {  		if (cpu == (nr_cpu_ids-1) || system_state != SYSTEM_BOOTING)  			pr_info(  			"Skipped synchronization checks as TSC is reliable.\n"); @@ -172,7 +172,7 @@ void __cpuinit check_tsc_sync_target(void)  {  	int cpus = 2; -	if (unsynchronized_tsc() || boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) +	if (unsynchronized_tsc() || tsc_clocksource_reliable)  		return;  	/* diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 863f8753ab0..b466cab5ba1 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -335,9 +335,11 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct 
task_struct *tsk  	if (info->flags & VM86_SCREEN_BITMAP)  		mark_screen_rdonly(tsk->mm); -	/*call audit_syscall_exit since we do not exit via the normal paths */ +	/*call __audit_syscall_exit since we do not exit via the normal paths */ +#ifdef CONFIG_AUDITSYSCALL  	if (unlikely(current->audit_context)) -		audit_syscall_exit(AUDITSC_RESULT(0), 0); +		__audit_syscall_exit(1, 0); +#endif  	__asm__ __volatile__(  		"movl %0,%%esp\n\t" diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index e4d4a22e8b9..b07ba939356 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -57,7 +57,7 @@ DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) =  	.lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock),  }; -static enum { EMULATE, NATIVE, NONE } vsyscall_mode = NATIVE; +static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE;  static int __init vsyscall_setup(char *str)  { @@ -140,11 +140,40 @@ static int addr_to_vsyscall_nr(unsigned long addr)  	return nr;  } +static bool write_ok_or_segv(unsigned long ptr, size_t size) +{ +	/* +	 * XXX: if access_ok, get_user, and put_user handled +	 * sig_on_uaccess_error, this could go away. 
+	 */ + +	if (!access_ok(VERIFY_WRITE, (void __user *)ptr, size)) { +		siginfo_t info; +		struct thread_struct *thread = &current->thread; + +		thread->error_code	= 6;  /* user fault, no page, write */ +		thread->cr2		= ptr; +		thread->trap_no		= 14; + +		memset(&info, 0, sizeof(info)); +		info.si_signo		= SIGSEGV; +		info.si_errno		= 0; +		info.si_code		= SEGV_MAPERR; +		info.si_addr		= (void __user *)ptr; + +		force_sig_info(SIGSEGV, &info, current); +		return false; +	} else { +		return true; +	} +} +  bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)  {  	struct task_struct *tsk;  	unsigned long caller;  	int vsyscall_nr; +	int prev_sig_on_uaccess_error;  	long ret;  	/* @@ -180,35 +209,65 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)  	if (seccomp_mode(&tsk->seccomp))  		do_exit(SIGKILL); +	/* +	 * With a real vsyscall, page faults cause SIGSEGV.  We want to +	 * preserve that behavior to make writing exploits harder. +	 */ +	prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error; +	current_thread_info()->sig_on_uaccess_error = 1; + +	/* +	 * 0 is a valid user pointer (in the access_ok sense) on 32-bit and +	 * 64-bit, so we don't need to special-case it here.  For all the +	 * vsyscalls, 0 means "don't write anything" not "write it at +	 * address 0". 
+	 */ +	ret = -EFAULT;  	switch (vsyscall_nr) {  	case 0: +		if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) || +		    !write_ok_or_segv(regs->si, sizeof(struct timezone))) +			break; +  		ret = sys_gettimeofday(  			(struct timeval __user *)regs->di,  			(struct timezone __user *)regs->si);  		break;  	case 1: +		if (!write_ok_or_segv(regs->di, sizeof(time_t))) +			break; +  		ret = sys_time((time_t __user *)regs->di);  		break;  	case 2: +		if (!write_ok_or_segv(regs->di, sizeof(unsigned)) || +		    !write_ok_or_segv(regs->si, sizeof(unsigned))) +			break; +  		ret = sys_getcpu((unsigned __user *)regs->di,  				 (unsigned __user *)regs->si,  				 0);  		break;  	} +	current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error; +  	if (ret == -EFAULT) { -		/* -		 * Bad news -- userspace fed a bad pointer to a vsyscall. -		 * -		 * With a real vsyscall, that would have caused SIGSEGV. -		 * To make writing reliable exploits using the emulated -		 * vsyscalls harder, generate SIGSEGV here as well. -		 */ +		/* Bad news -- userspace fed a bad pointer to a vsyscall. */  		warn_bad_vsyscall(KERN_INFO, regs,  				  "vsyscall fault (exploit attempt?)"); -		goto sigsegv; + +		/* +		 * If we failed to generate a signal for any reason, +		 * generate one here.  (This should be impossible.) +		 */ +		if (WARN_ON_ONCE(!sigismember(&tsk->pending.signal, SIGBUS) && +				 !sigismember(&tsk->pending.signal, SIGSEGV))) +			goto sigsegv; + +		return true;  /* Don't emulate the ret. 
*/  	}  	regs->ax = ret; diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index c1d6cd54939..947a06ccc67 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -92,6 +92,7 @@ struct x86_init_ops x86_init __initdata = {  struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = {  	.setup_percpu_clockev		= setup_secondary_APIC_clock, +	.fixup_cpu_id			= x86_default_fixup_cpu_id,  };  static void default_nmi_init(void) { }; @@ -114,4 +115,5 @@ struct x86_msi_ops x86_msi = {  	.setup_msi_irqs = native_setup_msi_irqs,  	.teardown_msi_irq = native_teardown_msi_irq,  	.teardown_msi_irqs = default_teardown_msi_irqs, +	.restore_msi_irqs = default_restore_msi_irqs,  }; diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index a3911343976..71109111411 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -47,7 +47,7 @@ void __sanitize_i387_state(struct task_struct *tsk)  	if (!fx)  		return; -	BUG_ON(task_thread_info(tsk)->status & TS_USEDFPU); +	BUG_ON(__thread_has_fpu(tsk));  	xstate_bv = tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv; @@ -168,7 +168,7 @@ int save_i387_xstate(void __user *buf)  	if (!used_math())  		return 0; -	if (task_thread_info(tsk)->status & TS_USEDFPU) { +	if (user_has_fpu()) {  		if (use_xsave())  			err = xsave_user(buf);  		else @@ -176,8 +176,7 @@ int save_i387_xstate(void __user *buf)  		if (err)  			return err; -		task_thread_info(tsk)->status &= ~TS_USEDFPU; -		stts(); +		user_fpu_end();  	} else {  		sanitize_i387_state(tsk);  		if (__copy_to_user(buf, &tsk->thread.fpu.state->fxsave, @@ -292,10 +291,7 @@ int restore_i387_xstate(void __user *buf)  			return err;  	} -	if (!(task_thread_info(current)->status & TS_USEDFPU)) { -		clts(); -		task_thread_info(current)->status |= TS_USEDFPU; -	} +	user_fpu_begin();  	if (use_xsave())  		err = restore_user_xstate(buf);  	else | 
