Diffstat (limited to 'arch/x86/kernel')
151 files changed, 6637 insertions(+), 4958 deletions(-)
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index a5408b965c9..047f9ff2e36 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -26,16 +26,21 @@ obj-$(CONFIG_IRQ_WORK)  += irq_work.o  obj-y			+= probe_roms.o  obj-$(CONFIG_X86_32)	+= i386_ksyms_32.o  obj-$(CONFIG_X86_64)	+= sys_x86_64.o x8664_ksyms_64.o -obj-y			+= syscall_$(BITS).o +obj-$(CONFIG_X86_64)	+= mcount_64.o +obj-y			+= syscall_$(BITS).o vsyscall_gtod.o  obj-$(CONFIG_X86_64)	+= vsyscall_64.o  obj-$(CONFIG_X86_64)	+= vsyscall_emu_64.o +obj-$(CONFIG_X86_ESPFIX64)	+= espfix_64.o +obj-$(CONFIG_SYSFS)	+= ksysfs.o  obj-y			+= bootflag.o e820.o  obj-y			+= pci-dma.o quirks.o topology.o kdebugfs.o  obj-y			+= alternative.o i8253.o pci-nommu.o hw_breakpoint.o -obj-y			+= tsc.o io_delay.o rtc.o +obj-y			+= tsc.o tsc_msr.o io_delay.o rtc.o  obj-y			+= pci-iommu_table.o  obj-y			+= resource.o +obj-$(CONFIG_PREEMPT)	+= preempt.o +  obj-y				+= process.o  obj-y				+= i387.o xsave.o  obj-y				+= ptrace.o @@ -89,15 +94,6 @@ obj-$(CONFIG_PARAVIRT_CLOCK)	+= pvclock.o  obj-$(CONFIG_PCSPKR_PLATFORM)	+= pcspeaker.o -obj-$(CONFIG_MICROCODE_EARLY)		+= microcode_core_early.o -obj-$(CONFIG_MICROCODE_INTEL_EARLY)	+= microcode_intel_early.o -obj-$(CONFIG_MICROCODE_INTEL_LIB)	+= microcode_intel_lib.o -microcode-y				:= microcode_core.o -microcode-$(CONFIG_MICROCODE_INTEL)	+= microcode_intel.o -microcode-$(CONFIG_MICROCODE_AMD)	+= microcode_amd.o -obj-$(CONFIG_MICROCODE_AMD_EARLY)	+= microcode_amd_early.o -obj-$(CONFIG_MICROCODE)			+= microcode.o -  obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o  obj-$(CONFIG_SWIOTLB)			+= pci-swiotlb.o @@ -109,6 +105,7 @@ obj-$(CONFIG_EFI)			+= sysfb_efi.o  obj-$(CONFIG_PERF_EVENTS)		+= perf_regs.o  obj-$(CONFIG_TRACING)			+= tracepoint.o +obj-$(CONFIG_IOSF_MBI)			+= iosf_mbi.o  ###  # 64 bit specific files diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 40c76604199..86281ffb96d 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -46,7 +46,6 @@  #include "sleep.h" /* To include x86_acpi_suspend_lowlevel */  static int __initdata acpi_force = 0; -u32 acpi_rsdt_forced;  int acpi_disabled;  EXPORT_SYMBOL(acpi_disabled); @@ -54,10 +53,6 @@ EXPORT_SYMBOL(acpi_disabled);  # include <asm/proto.h>  #endif				/* X86 */ -#define BAD_MADT_ENTRY(entry, end) (					    \ -		(!entry) || (unsigned long)entry + sizeof(*entry) > end ||  \ -		((struct acpi_subtable_header *)entry)->length < sizeof(*entry)) -  #define PREFIX			"ACPI: "  int acpi_noirq;				/* skip ACPI IRQ initialization */ @@ -189,24 +184,31 @@ static int __init acpi_parse_madt(struct acpi_table_header *table)  	return 0;  } -static void acpi_register_lapic(int id, u8 enabled) +/** + * acpi_register_lapic - register a local apic and generates a logic cpu number + * @id: local apic id to register + * @enabled: this cpu is enabled or not + * + * Returns the logic cpu number which maps to the local apic + */ +static int acpi_register_lapic(int id, u8 enabled)  {  	unsigned int ver = 0;  	if (id >= MAX_LOCAL_APIC) {  		printk(KERN_INFO PREFIX "skipped apicid that is too big\n"); -		return; +		return -EINVAL;  	}  	if (!enabled) {  		++disabled_cpus; -		return; +		return -EINVAL;  	}  	if (boot_cpu_physical_apicid != -1U)  		ver = apic_version[boot_cpu_physical_apicid]; -	generic_processor_info(id, ver); +	return generic_processor_info(id, ver);  }  static int __init @@ -607,91 +609,34 @@ static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)  	int nid;  	nid = 
acpi_get_node(handle); -	if (nid == -1 || !node_online(nid)) -		return; -	set_apicid_to_node(physid, nid); -	numa_set_node(cpu, nid); +	if (nid != -1) { +		set_apicid_to_node(physid, nid); +		numa_set_node(cpu, nid); +	}  #endif  } -static int _acpi_map_lsapic(acpi_handle handle, int *pcpu) +static int _acpi_map_lsapic(acpi_handle handle, int physid, int *pcpu)  { -	struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; -	union acpi_object *obj; -	struct acpi_madt_local_apic *lapic; -	cpumask_var_t tmp_map, new_map; -	u8 physid;  	int cpu; -	int retval = -ENOMEM; -	if (ACPI_FAILURE(acpi_evaluate_object(handle, "_MAT", NULL, &buffer))) -		return -EINVAL; - -	if (!buffer.length || !buffer.pointer) -		return -EINVAL; - -	obj = buffer.pointer; -	if (obj->type != ACPI_TYPE_BUFFER || -	    obj->buffer.length < sizeof(*lapic)) { -		kfree(buffer.pointer); -		return -EINVAL; -	} - -	lapic = (struct acpi_madt_local_apic *)obj->buffer.pointer; - -	if (lapic->header.type != ACPI_MADT_TYPE_LOCAL_APIC || -	    !(lapic->lapic_flags & ACPI_MADT_ENABLED)) { -		kfree(buffer.pointer); -		return -EINVAL; -	} - -	physid = lapic->id; - -	kfree(buffer.pointer); -	buffer.length = ACPI_ALLOCATE_BUFFER; -	buffer.pointer = NULL; -	lapic = NULL; - -	if (!alloc_cpumask_var(&tmp_map, GFP_KERNEL)) -		goto out; - -	if (!alloc_cpumask_var(&new_map, GFP_KERNEL)) -		goto free_tmp_map; - -	cpumask_copy(tmp_map, cpu_present_mask); -	acpi_register_lapic(physid, ACPI_MADT_ENABLED); - -	/* -	 * If acpi_register_lapic successfully generates a new logical cpu -	 * number, then the following will get us exactly what was mapped -	 */ -	cpumask_andnot(new_map, cpu_present_mask, tmp_map); -	if (cpumask_empty(new_map)) { -		printk ("Unable to map lapic to logical cpu number\n"); -		retval = -EINVAL; -		goto free_new_map; +	cpu = acpi_register_lapic(physid, ACPI_MADT_ENABLED); +	if (cpu < 0) { +		pr_info(PREFIX "Unable to map lapic to logical cpu number\n"); +		return cpu;  	}  	acpi_processor_set_pdc(handle); - -	cpu = cpumask_first(new_map);  	acpi_map_cpu2node(handle, cpu, physid);  	*pcpu = cpu; -	retval = 0; - -free_new_map: -	free_cpumask_var(new_map); -free_tmp_map: -	free_cpumask_var(tmp_map); -out: -	return retval; +	return 0;  }  /* wrapper to silence section mismatch warning */ -int __ref acpi_map_lsapic(acpi_handle handle, int *pcpu) +int __ref acpi_map_lsapic(acpi_handle handle, int physid, int *pcpu)  { -	return _acpi_map_lsapic(handle, pcpu); +	return _acpi_map_lsapic(handle, physid, pcpu);  }  EXPORT_SYMBOL(acpi_map_lsapic); @@ -745,7 +690,7 @@ static int __init acpi_parse_sbf(struct acpi_table_header *table)  #ifdef CONFIG_HPET_TIMER  #include <asm/hpet.h> -static struct __initdata resource *hpet_res; +static struct resource *hpet_res __initdata;  static int __init acpi_parse_hpet(struct acpi_table_header *table)  { @@ -958,10 +903,6 @@ static int __init acpi_parse_madt_lapic_entries(void)  #ifdef	CONFIG_X86_IO_APIC  #define MP_ISA_BUS		0 -#ifdef CONFIG_X86_ES7000 -extern int es7000_plat; -#endif -  void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)  {  	int ioapic; @@ -1011,14 +952,6 @@ void __init mp_config_acpi_legacy_irqs(void)  	set_bit(MP_ISA_BUS, mp_bus_not_pci);  	pr_debug("Bus #%d is ISA\n", MP_ISA_BUS); -#ifdef CONFIG_X86_ES7000 -	/* -	 * Older generations of ES7000 have no legacy identity mappings -	 */ -	if (es7000_plat == 1) -		return; -#endif -  	/*  	 * Use the default configuration for the IRQs 0-15.  Unless  	 * overridden by (MADT) interrupt source override entries. 
@@ -1084,9 +1017,7 @@ static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger,  	if (!acpi_ioapic)  		return 0; -	if (!dev) -		return 0; -	if (dev->bus != &pci_bus_type) +	if (!dev || !dev_is_pci(dev))  		return 0;  	pdev = to_pci_dev(dev); @@ -1614,7 +1545,7 @@ static int __init parse_acpi(char *arg)  	}  	/* acpi=rsdt use RSDT instead of XSDT */  	else if (strcmp(arg, "rsdt") == 0) { -		acpi_rsdt_forced = 1; +		acpi_gbl_do_not_use_xsdt = TRUE;  	}  	/* "acpi=noirq" disables ACPI interrupt routing */  	else if (strcmp(arg, "noirq") == 0) { diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index d2b7f27781b..4b28159e042 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -87,7 +87,9 @@ static long acpi_processor_ffh_cstate_probe_cpu(void *_cx)  	num_cstate_subtype = edx_part & MWAIT_SUBSTATE_MASK;  	retval = 0; -	if (num_cstate_subtype < (cx->address & MWAIT_SUBSTATE_MASK)) { +	/* If the HW does not support any sub-states in this C-state */ +	if (num_cstate_subtype == 0) { +		pr_warn(FW_BUG "ACPI MWAIT C-state 0x%x not supported by HW (0x%x)\n", cx->address, edx_part);  		retval = -1;  		goto out;  	} @@ -150,29 +152,6 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu,  }  EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe); -/* - * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, - * which can obviate IPI to trigger checking of need_resched. - * We execute MONITOR against need_resched and enter optimized wait state - * through MWAIT. Whenever someone changes need_resched, we would be woken - * up from MWAIT (without an IPI). - * - * New with Core Duo processors, MWAIT can take some hints based on CPU - * capability. - */ -void mwait_idle_with_hints(unsigned long ax, unsigned long cx) -{ -	if (!need_resched()) { -		if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) -			clflush((void *)¤t_thread_info()->flags); - -		__monitor((void *)¤t_thread_info()->flags, 0, 0); -		smp_mb(); -		if (!need_resched()) -			__mwait(ax, cx); -	} -} -  void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)  {  	unsigned int cpu = smp_processor_id(); diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 33120100ff5..31368207837 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -26,6 +26,17 @@ static char temp_stack[4096];  #endif  /** + * x86_acpi_enter_sleep_state - enter sleep state + * @state: Sleep state to enter. + * + * Wrapper around acpi_enter_sleep_state() to be called by assmebly. 
+ */ +acpi_status asmlinkage __visible x86_acpi_enter_sleep_state(u8 state) +{ +	return acpi_enter_sleep_state(state); +} + +/**   * x86_acpi_suspend_lowlevel - save kernel state   *   * Create an identity mapped page table and copy the wakeup routine to diff --git a/arch/x86/kernel/acpi/sleep.h b/arch/x86/kernel/acpi/sleep.h index c9c2c982d5e..65c7b606b60 100644 --- a/arch/x86/kernel/acpi/sleep.h +++ b/arch/x86/kernel/acpi/sleep.h @@ -17,3 +17,5 @@ extern void wakeup_long64(void);  extern void do_suspend_lowlevel(void);  extern int x86_acpi_suspend_lowlevel(void); + +acpi_status asmlinkage x86_acpi_enter_sleep_state(u8 state); diff --git a/arch/x86/kernel/acpi/wakeup_32.S b/arch/x86/kernel/acpi/wakeup_32.S index d1daa66ab16..665c6b7d2ea 100644 --- a/arch/x86/kernel/acpi/wakeup_32.S +++ b/arch/x86/kernel/acpi/wakeup_32.S @@ -73,7 +73,7 @@ ENTRY(do_suspend_lowlevel)  	call	save_processor_state  	call	save_registers  	pushl	$3 -	call	acpi_enter_sleep_state +	call	x86_acpi_enter_sleep_state  	addl	$4, %esp  #	In case of S3 failure, we'll emerge here.  Jump diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S index 8ea5164cbd0..ae693b51ed8 100644 --- a/arch/x86/kernel/acpi/wakeup_64.S +++ b/arch/x86/kernel/acpi/wakeup_64.S @@ -73,7 +73,7 @@ ENTRY(do_suspend_lowlevel)  	addq	$8, %rsp  	movl	$3, %edi  	xorl	%eax, %eax -	call	acpi_enter_sleep_state +	call	x86_acpi_enter_sleep_state  	/* in case something went wrong, restore the machine status and go on */  	jmp	resume_point diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 15e8563e5c2..703130f469e 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -5,7 +5,6 @@  #include <linux/mutex.h>  #include <linux/list.h>  #include <linux/stringify.h> -#include <linux/kprobes.h>  #include <linux/mm.h>  #include <linux/vmalloc.h>  #include <linux/memory.h> @@ -402,17 +401,6 @@ void alternatives_enable_smp(void)  {  	struct smp_alt_module *mod; -#ifdef CONFIG_LOCKDEP -	/* -	 * Older binutils section handling bug prevented -	 * alternatives-replacement from working reliably. -	 * -	 * If this still occurs then you should see a hang -	 * or crash shortly after this line: -	 */ -	pr_info("lockdep: fixing up alternatives\n"); -#endif -  	/* Why bother if there are no other CPUs? */  	BUG_ON(num_possible_cpus() == 1); @@ -562,7 +550,7 @@ void *__init_or_module text_poke_early(void *addr, const void *opcode,   *   * Note: Must be called under text_mutex.   
*/ -void *__kprobes text_poke(void *addr, const void *opcode, size_t len) +void *text_poke(void *addr, const void *opcode, size_t len)  {  	unsigned long flags;  	char *vaddr; diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index b574b295a2f..8e3842fc8be 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -512,7 +512,7 @@ gart_free_coherent(struct device *dev, size_t size, void *vaddr,  		   dma_addr_t dma_addr, struct dma_attrs *attrs)  {  	gart_unmap_page(dev, dma_addr, size, DMA_BIDIRECTIONAL, NULL); -	free_pages((unsigned long)vaddr, get_order(size)); +	dma_generic_free_coherent(dev, size, vaddr, dma_addr, attrs);  }  static int gart_mapping_error(struct device *dev, dma_addr_t dma_addr) diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 59554dca96e..f04dbb3069b 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -22,6 +22,7 @@ const struct pci_device_id amd_nb_misc_ids[] = {  	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M10H_F3) },  	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F3) },  	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) }, +	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F3) },  	{}  };  EXPORT_SYMBOL(amd_nb_misc_ids); @@ -30,6 +31,7 @@ static const struct pci_device_id amd_nb_link_ids[] = {  	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) },  	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M30H_NB_F4) },  	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F4) }, +	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F4) },  	{}  }; @@ -179,7 +181,7 @@ int amd_get_subcaches(int cpu)  	return (mask >> (4 * cuid)) & 0xf;  } -int amd_set_subcaches(int cpu, int mask) +int amd_set_subcaches(int cpu, unsigned long mask)  {  	static unsigned int reset, ban;  	struct amd_northbridge *nb = node_to_amd_nb(amd_get_nb_id(cpu)); diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index c9876efecaf..af5b08ab3b7 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -40,7 +40,7 @@  #include <asm/fixmap.h>  #include <asm/apb_timer.h> -#include <asm/mrst.h> +#include <asm/intel-mid.h>  #include <asm/time.h>  #define APBT_CLOCKEVENT_RATING		110 @@ -157,13 +157,13 @@ static int __init apbt_clockevent_register(void)  	adev->num = smp_processor_id();  	adev->timer = dw_apb_clockevent_init(smp_processor_id(), "apbt0", -		mrst_timer_options == MRST_TIMER_LAPIC_APBT ? +		intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT ?  		APBT_CLOCKEVENT_RATING - 100 : APBT_CLOCKEVENT_RATING,  		adev_virt_addr(adev), 0, apbt_freq);  	/* Firmware does EOI handling for us. 
*/  	adev->timer->eoi = NULL; -	if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) { +	if (intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT) {  		global_clock_event = &adev->timer->ced;  		printk(KERN_DEBUG "%s clockevent registered as global\n",  		       global_clock_event->name); @@ -253,7 +253,7 @@ static int apbt_cpuhp_notify(struct notifier_block *n,  static __init int apbt_late_init(void)  { -	if (mrst_timer_options == MRST_TIMER_LAPIC_APBT || +	if (intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT ||  		!apb_timer_block_enabled)  		return 0;  	/* This notifier should be called after workqueue is ready */ @@ -340,7 +340,7 @@ void __init apbt_time_init(void)  	}  #ifdef CONFIG_SMP  	/* kernel cmdline disable apb timer, so we will use lapic timers */ -	if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) { +	if (intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT) {  		printk(KERN_INFO "apbt: disabled per cpu timer\n");  		return;  	} diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index fd972a3e4cb..76164e173a2 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -10,6 +10,8 @@   *   * Copyright 2002 Andi Kleen, SuSE Labs.   */ +#define pr_fmt(fmt) "AGP: " fmt +  #include <linux/kernel.h>  #include <linux/types.h>  #include <linux/init.h> @@ -18,7 +20,6 @@  #include <linux/pci_ids.h>  #include <linux/pci.h>  #include <linux/bitops.h> -#include <linux/ioport.h>  #include <linux/suspend.h>  #include <asm/e820.h>  #include <asm/io.h> @@ -54,18 +55,6 @@ int fallback_aper_force __initdata;  int fix_aperture __initdata = 1; -static struct resource gart_resource = { -	.name	= "GART", -	.flags	= IORESOURCE_MEM, -}; - -static void __init insert_aperture_resource(u32 aper_base, u32 aper_size) -{ -	gart_resource.start = aper_base; -	gart_resource.end = aper_base + aper_size - 1; -	insert_resource(&iomem_resource, &gart_resource); -} -  /* This code runs before the PCI subsystem is initialized, so just     access the northbridge directly. */ @@ -88,15 +77,13 @@ static u32 __init allocate_aperture(void)  	addr = memblock_find_in_range(GART_MIN_ADDR, GART_MAX_ADDR,  				      aper_size, aper_size);  	if (!addr) { -		printk(KERN_ERR -			"Cannot allocate aperture memory hole (%lx,%uK)\n", -				addr, aper_size>>10); +		pr_err("Cannot allocate aperture memory hole [mem %#010lx-%#010lx] (%uKB)\n", +		       addr, addr + aper_size - 1, aper_size >> 10);  		return 0;  	}  	memblock_reserve(addr, aper_size); -	printk(KERN_INFO "Mapping aperture over %d KB of RAM @ %lx\n", -			aper_size >> 10, addr); -	insert_aperture_resource((u32)addr, aper_size); +	pr_info("Mapping aperture over RAM [mem %#010lx-%#010lx] (%uKB)\n", +		addr, addr + aper_size - 1, aper_size >> 10);  	register_nosave_region(addr >> PAGE_SHIFT,  			       (addr+aper_size) >> PAGE_SHIFT); @@ -140,10 +127,11 @@ static u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order)  	u64 aper;  	u32 old_order; -	printk(KERN_INFO "AGP bridge at %02x:%02x:%02x\n", bus, slot, func); +	pr_info("pci 0000:%02x:%02x:%02x: AGP bridge\n", bus, slot, func);  	apsizereg = read_pci_config_16(bus, slot, func, cap + 0x14);  	if (apsizereg == 0xffffffff) { -		printk(KERN_ERR "APSIZE in AGP bridge unreadable\n"); +		pr_err("pci 0000:%02x:%02x.%d: APSIZE unreadable\n", +		       bus, slot, func);  		return 0;  	} @@ -167,16 +155,18 @@ static u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order)  	 * On some sick chips, APSIZE is 0. 
It means it wants 4G  	 * so let double check that order, and lets trust AMD NB settings:  	 */ -	printk(KERN_INFO "Aperture from AGP @ %Lx old size %u MB\n", -			aper, 32 << old_order); +	pr_info("pci 0000:%02x:%02x.%d: AGP aperture [bus addr %#010Lx-%#010Lx] (old size %uMB)\n", +		bus, slot, func, aper, aper + (32ULL << (old_order + 20)) - 1, +		32 << old_order);  	if (aper + (32ULL<<(20 + *order)) > 0x100000000ULL) { -		printk(KERN_INFO "Aperture size %u MB (APSIZE %x) is not right, using settings from NB\n", -				32 << *order, apsizereg); +		pr_info("pci 0000:%02x:%02x.%d: AGP aperture size %uMB (APSIZE %#x) is not right, using settings from NB\n", +			bus, slot, func, 32 << *order, apsizereg);  		*order = old_order;  	} -	printk(KERN_INFO "Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n", -			aper, 32 << *order, apsizereg); +	pr_info("pci 0000:%02x:%02x.%d: AGP aperture [bus addr %#010Lx-%#010Lx] (%uMB, APSIZE %#x)\n", +		bus, slot, func, aper, aper + (32ULL << (*order + 20)) - 1, +		32 << *order, apsizereg);  	if (!aperture_valid(aper, (32*1024*1024) << *order, 32<<20))  		return 0; @@ -232,7 +222,7 @@ static u32 __init search_agp_bridge(u32 *order, int *valid_agp)  			}  		}  	} -	printk(KERN_INFO "No AGP bridge found\n"); +	pr_info("No AGP bridge found\n");  	return 0;  } @@ -324,7 +314,8 @@ void __init early_gart_iommu_check(void)  		if (e820_any_mapped(aper_base, aper_base + aper_size,  				    E820_RAM)) {  			/* reserve it, so we can reuse it in second kernel */ -			printk(KERN_INFO "update e820 for GART\n"); +			pr_info("e820: reserve [mem %#010Lx-%#010Lx] for GART\n", +				aper_base, aper_base + aper_size - 1);  			e820_add_region(aper_base, aper_size, E820_RESERVED);  			update_e820();  		} @@ -368,7 +359,7 @@ int __init gart_iommu_hole_init(void)  	    !early_pci_allowed())  		return -ENODEV; -	printk(KERN_INFO  "Checking aperture...\n"); +	pr_info("Checking aperture...\n");  	if (!fallback_aper_force)  		agp_aper_base = search_agp_bridge(&agp_aper_order, &valid_agp); @@ -409,8 +400,9 @@ int __init gart_iommu_hole_init(void)  			aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff;  			aper_base <<= 25; -			printk(KERN_INFO "Node %d: aperture @ %Lx size %u MB\n", -					node, aper_base, aper_size >> 20); +			pr_info("Node %d: aperture [bus addr %#010Lx-%#010Lx] (%uMB)\n", +				node, aper_base, aper_base + aper_size - 1, +				aper_size >> 20);  			node++;  			if (!aperture_valid(aper_base, aper_size, 64<<20)) { @@ -421,9 +413,9 @@ int __init gart_iommu_hole_init(void)  					if (!no_iommu &&  					    max_pfn > MAX_DMA32_PFN &&  					    !printed_gart_size_msg) { -						printk(KERN_ERR "you are using iommu with agp, but GART size is less than 64M\n"); -						printk(KERN_ERR "please increase GART size in your BIOS setup\n"); -						printk(KERN_ERR "if BIOS doesn't have that option, contact your HW vendor!\n"); +						pr_err("you are using iommu with agp, but GART size is less than 64MB\n"); +						pr_err("please increase GART size in your BIOS setup\n"); +						pr_err("if BIOS doesn't have that option, contact your HW vendor!\n");  						printed_gart_size_msg = 1;  					}  				} else { @@ -444,12 +436,8 @@ int __init gart_iommu_hole_init(void)  out:  	if (!fix && !fallback_aper_force) { -		if (last_aper_base) { -			unsigned long n = (32 * 1024 * 1024) << last_aper_order; - -			insert_aperture_resource((u32)last_aper_base, n); +		if (last_aper_base)  			return 1; -		}  		return 0;  	} @@ -464,13 +452,10 @@ out:  		   force_iommu ||  		   valid_agp ||  		   
fallback_aper_force) { -		printk(KERN_INFO -			"Your BIOS doesn't leave a aperture memory hole\n"); -		printk(KERN_INFO -			"Please enable the IOMMU option in the BIOS setup\n"); -		printk(KERN_INFO -			"This costs you %d MB of RAM\n", -				32 << fallback_aper_order); +		pr_info("Your BIOS doesn't leave a aperture memory hole\n"); +		pr_info("Please enable the IOMMU option in the BIOS setup\n"); +		pr_info("This costs you %dMB of RAM\n", +			32 << fallback_aper_order);  		aper_order = fallback_aper_order;  		aper_alloc = allocate_aperture(); diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index 0ae0323b1f9..dcb5b15401c 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -18,10 +18,7 @@ obj-y				+= apic_flat_64.o  endif  # APIC probe will depend on the listing order here -obj-$(CONFIG_X86_NUMAQ)		+= numaq_32.o -obj-$(CONFIG_X86_SUMMIT)	+= summit_32.o  obj-$(CONFIG_X86_BIGSMP)	+= bigsmp_32.o -obj-$(CONFIG_X86_ES7000)	+= es7000_32.o  # For 32bit, probe_32 need to be listed last  obj-$(CONFIG_X86_LOCAL_APIC)	+= probe_$(BITS).o diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index a7eb82d9b01..ad28db7e6bd 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -62,6 +62,7 @@ unsigned disabled_cpus;  /* Processor that is doing the boot up */  unsigned int boot_cpu_physical_apicid = -1U; +EXPORT_SYMBOL_GPL(boot_cpu_physical_apicid);  /*   * The highest APIC ID seen during enumeration. @@ -74,6 +75,13 @@ unsigned int max_physical_apicid;  physid_mask_t phys_cpu_present_map;  /* + * Processor to be disabled specified by kernel parameter + * disable_cpu_apicid=<int>, mostly used for the kdump 2nd kernel to + * avoid undefined behaviour caused by sending INIT from AP to BSP. + */ +static unsigned int disabled_cpu_apicid __read_mostly = BAD_APICID; + +/*   * Map cpu index to physical APIC ID   */  DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid, BAD_APICID); @@ -125,6 +133,10 @@ static inline void imcr_apic_to_pic(void)   * +1=force-enable   */  static int force_enable_local_apic __initdata; + +/* Control whether x2APIC mode is enabled or not */ +static bool nox2apic __initdata; +  /*   * APIC command line parameters   */ @@ -154,8 +166,7 @@ int x2apic_mode;  /* x2apic enabled before OS handover */  int x2apic_preenabled;  static int x2apic_disabled; -static int nox2apic; -static __init int setup_nox2apic(char *str) +static int __init setup_nox2apic(char *str)  {  	if (x2apic_enabled()) {  		int apicid = native_apic_msr_read(APIC_ID); @@ -170,7 +181,7 @@ static __init int setup_nox2apic(char *str)  	} else  		setup_clear_cpu_cap(X86_FEATURE_X2APIC); -	nox2apic = 1; +	nox2apic = true;  	return 0;  } @@ -275,8 +286,12 @@ u32 native_safe_apic_wait_icr_idle(void)  void native_apic_icr_write(u32 low, u32 id)  { +	unsigned long flags; + +	local_irq_save(flags);  	apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id));  	apic_write(APIC_ICR, low); +	local_irq_restore(flags);  }  u64 native_apic_icr_read(void) @@ -1967,7 +1982,7 @@ __visible void smp_trace_spurious_interrupt(struct pt_regs *regs)   */  static inline void __smp_error_interrupt(struct pt_regs *regs)  { -	u32 v0, v1; +	u32 v;  	u32 i = 0;  	static const char * const error_interrupt_reason[] = {  		"Send CS error",		/* APIC Error Bit 0 */ @@ -1981,21 +1996,21 @@ static inline void __smp_error_interrupt(struct pt_regs *regs)  	};  	/* First tickle the hardware, only then report what went on. 
-- REW */ -	v0 = apic_read(APIC_ESR); -	apic_write(APIC_ESR, 0); -	v1 = apic_read(APIC_ESR); +	if (lapic_get_maxlvt() > 3)	/* Due to the Pentium erratum 3AP. */ +		apic_write(APIC_ESR, 0); +	v = apic_read(APIC_ESR);  	ack_APIC_irq();  	atomic_inc(&irq_err_count); -	apic_printk(APIC_DEBUG, KERN_DEBUG "APIC error on CPU%d: %02x(%02x)", -		    smp_processor_id(), v0 , v1); +	apic_printk(APIC_DEBUG, KERN_DEBUG "APIC error on CPU%d: %02x", +		    smp_processor_id(), v); -	v1 = v1 & 0xff; -	while (v1) { -		if (v1 & 0x1) +	v &= 0xff; +	while (v) { +		if (v & 0x1)  			apic_printk(APIC_DEBUG, KERN_CONT " : %s", error_interrupt_reason[i]);  		i++; -		v1 >>= 1; +		v >>= 1;  	}  	apic_printk(APIC_DEBUG, KERN_CONT "\n"); @@ -2107,13 +2122,45 @@ void disconnect_bsp_APIC(int virt_wire_setup)  	apic_write(APIC_LVT1, value);  } -void generic_processor_info(int apicid, int version) +int generic_processor_info(int apicid, int version)  {  	int cpu, max = nr_cpu_ids;  	bool boot_cpu_detected = physid_isset(boot_cpu_physical_apicid,  				phys_cpu_present_map);  	/* +	 * boot_cpu_physical_apicid is designed to have the apicid +	 * returned by read_apic_id(), i.e, the apicid of the +	 * currently booting-up processor. However, on some platforms, +	 * it is temporarily modified by the apicid reported as BSP +	 * through MP table. Concretely: +	 * +	 * - arch/x86/kernel/mpparse.c: MP_processor_info() +	 * - arch/x86/mm/amdtopology.c: amd_numa_init() +	 * +	 * This function is executed with the modified +	 * boot_cpu_physical_apicid. So, disabled_cpu_apicid kernel +	 * parameter doesn't work to disable APs on kdump 2nd kernel. +	 * +	 * Since fixing handling of boot_cpu_physical_apicid requires +	 * another discussion and tests on each platform, we leave it +	 * for now and here we use read_apic_id() directly in this +	 * function, generic_processor_info(). +	 */ +	if (disabled_cpu_apicid != BAD_APICID && +	    disabled_cpu_apicid != read_apic_id() && +	    disabled_cpu_apicid == apicid) { +		int thiscpu = num_processors + disabled_cpus; + +		pr_warning("APIC: Disabling requested cpu." 
+			   " Processor %d/0x%x ignored.\n", +			   thiscpu, apicid); + +		disabled_cpus++; +		return -ENODEV; +	} + +	/*  	 * If boot cpu has not been detected yet, then only allow upto  	 * nr_cpu_ids - 1 processors and keep one slot free for boot cpu  	 */ @@ -2127,7 +2174,7 @@ void generic_processor_info(int apicid, int version)  			"  Processor %d/0x%x ignored.\n", max, thiscpu, apicid);  		disabled_cpus++; -		return; +		return -ENODEV;  	}  	if (num_processors >= nr_cpu_ids) { @@ -2138,7 +2185,7 @@ void generic_processor_info(int apicid, int version)  			"  Processor %d/0x%x ignored.\n", max, thiscpu, apicid);  		disabled_cpus++; -		return; +		return -EINVAL;  	}  	num_processors++; @@ -2183,6 +2230,8 @@ void generic_processor_info(int apicid, int version)  #endif  	set_cpu_possible(cpu, true);  	set_cpu_present(cpu, true); + +	return cpu;  }  int hard_smp_processor_id(void) @@ -2589,3 +2638,12 @@ static int __init lapic_insert_resource(void)   * that is using request_resource   */  late_initcall(lapic_insert_resource); + +static int __init apic_set_disabled_cpu_apicid(char *arg) +{ +	if (!arg || !get_option(&arg, &disabled_cpu_apicid)) +		return -EINVAL; + +	return 0; +} +early_param("disable_cpu_apicid", apic_set_disabled_cpu_apicid); diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 00c77cf78e9..7c1b2947951 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -14,16 +14,13 @@  #include <linux/string.h>  #include <linux/kernel.h>  #include <linux/ctype.h> -#include <linux/init.h>  #include <linux/hardirq.h>  #include <linux/module.h>  #include <asm/smp.h>  #include <asm/apic.h>  #include <asm/ipi.h> -#ifdef CONFIG_ACPI -#include <acpi/acpi_bus.h> -#endif +#include <linux/acpi.h>  static struct apic apic_physflat;  static struct apic apic_flat; @@ -201,7 +198,7 @@ static struct apic apic_flat =  {  	.trampoline_phys_low		= DEFAULT_TRAMPOLINE_PHYS_LOW,  	.trampoline_phys_high		= DEFAULT_TRAMPOLINE_PHYS_HIGH, -	.wait_for_init_deassert		= NULL, +	.wait_for_init_deassert		= false,  	.smp_callin_clear_local_apic	= NULL,  	.inquire_remote_apic		= default_inquire_remote_apic, @@ -317,7 +314,7 @@ static struct apic apic_physflat =  {  	.trampoline_phys_low		= DEFAULT_TRAMPOLINE_PHYS_LOW,  	.trampoline_phys_high		= DEFAULT_TRAMPOLINE_PHYS_HIGH, -	.wait_for_init_deassert		= NULL, +	.wait_for_init_deassert		= false,  	.smp_callin_clear_local_apic	= NULL,  	.inquire_remote_apic		= default_inquire_remote_apic, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index e145f28b409..8c7c98249c2 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -15,7 +15,6 @@  #include <linux/string.h>  #include <linux/kernel.h>  #include <linux/ctype.h> -#include <linux/init.h>  #include <linux/errno.h>  #include <asm/fixmap.h>  #include <asm/mpspec.h> @@ -173,8 +172,7 @@ struct apic apic_noop = {  	.trampoline_phys_low		= DEFAULT_TRAMPOLINE_PHYS_LOW,  	.trampoline_phys_high		= DEFAULT_TRAMPOLINE_PHYS_HIGH, -	.wait_for_init_deassert		= NULL, - +	.wait_for_init_deassert		= false,  	.smp_callin_clear_local_apic	= NULL,  	.inquire_remote_apic		= NULL, diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index 3e67f9e3d7e..a5b45df8bc8 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -248,7 +248,7 @@ static const struct apic apic_numachip __refconst = {  	.wakeup_secondary_cpu		= 
numachip_wakeup_secondary,  	.trampoline_phys_low		= DEFAULT_TRAMPOLINE_PHYS_LOW,  	.trampoline_phys_high		= DEFAULT_TRAMPOLINE_PHYS_HIGH, -	.wait_for_init_deassert		= NULL, +	.wait_for_init_deassert		= false,  	.smp_callin_clear_local_apic	= NULL,  	.inquire_remote_apic		= NULL, /* REMRD not supported */ diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index d50e3640d5a..e4840aa7a25 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -199,8 +199,7 @@ static struct apic apic_bigsmp = {  	.trampoline_phys_low		= DEFAULT_TRAMPOLINE_PHYS_LOW,  	.trampoline_phys_high		= DEFAULT_TRAMPOLINE_PHYS_HIGH, -	.wait_for_init_deassert		= default_wait_for_init_deassert, - +	.wait_for_init_deassert		= true,  	.smp_callin_clear_local_apic	= NULL,  	.inquire_remote_apic		= default_inquire_remote_apic, diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c deleted file mode 100644 index c55224731b2..00000000000 --- a/arch/x86/kernel/apic/es7000_32.c +++ /dev/null @@ -1,746 +0,0 @@ -/* - * Written by: Garry Forsgren, Unisys Corporation - *             Natalie Protasevich, Unisys Corporation - * - * This file contains the code to configure and interface - * with Unisys ES7000 series hardware system manager. - * - * Copyright (c) 2003 Unisys Corporation. - * Copyright (C) 2009, Red Hat, Inc., Ingo Molnar - * - *   All Rights Reserved. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write the Free Software Foundation, Inc., 59 - * Temple Place - Suite 330, Boston MA 02111-1307, USA. 
- * - * Contact information: Unisys Corporation, Township Line & Union Meeting - * Roads-A, Unisys Way, Blue Bell, Pennsylvania, 19424, or: - * - * http://www.unisys.com - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/notifier.h> -#include <linux/spinlock.h> -#include <linux/cpumask.h> -#include <linux/threads.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/reboot.h> -#include <linux/string.h> -#include <linux/types.h> -#include <linux/errno.h> -#include <linux/acpi.h> -#include <linux/init.h> -#include <linux/gfp.h> -#include <linux/nmi.h> -#include <linux/smp.h> -#include <linux/io.h> - -#include <asm/apicdef.h> -#include <linux/atomic.h> -#include <asm/fixmap.h> -#include <asm/mpspec.h> -#include <asm/setup.h> -#include <asm/apic.h> -#include <asm/ipi.h> - -/* - * ES7000 chipsets - */ - -#define NON_UNISYS			0 -#define ES7000_CLASSIC			1 -#define ES7000_ZORRO			2 - -#define	MIP_REG				1 -#define	MIP_PSAI_REG			4 - -#define	MIP_BUSY			1 -#define	MIP_SPIN			0xf0000 -#define	MIP_VALID			0x0100000000000000ULL -#define	MIP_SW_APIC			0x1020b - -#define	MIP_PORT(val)			((val >> 32) & 0xffff) - -#define	MIP_RD_LO(val)			(val & 0xffffffff) - -struct mip_reg { -	unsigned long long		off_0x00; -	unsigned long long		off_0x08; -	unsigned long long		off_0x10; -	unsigned long long		off_0x18; -	unsigned long long		off_0x20; -	unsigned long long		off_0x28; -	unsigned long long		off_0x30; -	unsigned long long		off_0x38; -}; - -struct mip_reg_info { -	unsigned long long		mip_info; -	unsigned long long		delivery_info; -	unsigned long long		host_reg; -	unsigned long long		mip_reg; -}; - -struct psai { -	unsigned long long		entry_type; -	unsigned long long		addr; -	unsigned long long		bep_addr; -}; - -#ifdef CONFIG_ACPI - -struct es7000_oem_table { -	struct acpi_table_header	Header; -	u32				OEMTableAddr; -	u32				OEMTableSize; -}; - -static unsigned long			oem_addrX; -static unsigned long			oem_size; - -#endif - -/* - * ES7000 Globals - */ - -static volatile unsigned long		*psai; -static struct mip_reg			*mip_reg; -static struct mip_reg			*host_reg; -static int 				mip_port; -static unsigned long			mip_addr; -static unsigned long			host_addr; - -int					es7000_plat; - -/* - * GSI override for ES7000 platforms. - */ - - -static int wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip) -{ -	unsigned long vect = 0, psaival = 0; - -	if (psai == NULL) -		return -1; - -	vect = ((unsigned long)__pa(eip)/0x1000) << 16; -	psaival = (0x1000000 | vect | cpu); - -	while (*psai & 0x1000000) -		; - -	*psai = psaival; - -	return 0; -} - -static int es7000_apic_is_cluster(void) -{ -	/* MPENTIUMIII */ -	if (boot_cpu_data.x86 == 6 && -	    (boot_cpu_data.x86_model >= 7 && boot_cpu_data.x86_model <= 11)) -		return 1; - -	return 0; -} - -static void setup_unisys(void) -{ -	/* -	 * Determine the generation of the ES7000 currently running. 
-	 * -	 * es7000_plat = 1 if the machine is a 5xx ES7000 box -	 * es7000_plat = 2 if the machine is a x86_64 ES7000 box -	 * -	 */ -	if (!(boot_cpu_data.x86 <= 15 && boot_cpu_data.x86_model <= 2)) -		es7000_plat = ES7000_ZORRO; -	else -		es7000_plat = ES7000_CLASSIC; -} - -/* - * Parse the OEM Table: - */ -static int parse_unisys_oem(char *oemptr) -{ -	int			i; -	int 			success = 0; -	unsigned char		type, size; -	unsigned long		val; -	char			*tp = NULL; -	struct psai		*psaip = NULL; -	struct mip_reg_info 	*mi; -	struct mip_reg		*host, *mip; - -	tp = oemptr; - -	tp += 8; - -	for (i = 0; i <= 6; i++) { -		type = *tp++; -		size = *tp++; -		tp -= 2; -		switch (type) { -		case MIP_REG: -			mi = (struct mip_reg_info *)tp; -			val = MIP_RD_LO(mi->host_reg); -			host_addr = val; -			host = (struct mip_reg *)val; -			host_reg = __va(host); -			val = MIP_RD_LO(mi->mip_reg); -			mip_port = MIP_PORT(mi->mip_info); -			mip_addr = val; -			mip = (struct mip_reg *)val; -			mip_reg = __va(mip); -			pr_debug("host_reg = 0x%lx\n", -				 (unsigned long)host_reg); -			pr_debug("mip_reg = 0x%lx\n", -				 (unsigned long)mip_reg); -			success++; -			break; -		case MIP_PSAI_REG: -			psaip = (struct psai *)tp; -			if (tp != NULL) { -				if (psaip->addr) -					psai = __va(psaip->addr); -				else -					psai = NULL; -				success++; -			} -			break; -		default: -			break; -		} -		tp += size; -	} - -	if (success < 2) -		es7000_plat = NON_UNISYS; -	else -		setup_unisys(); - -	return es7000_plat; -} - -#ifdef CONFIG_ACPI -static int __init find_unisys_acpi_oem_table(unsigned long *oem_addr) -{ -	struct acpi_table_header *header = NULL; -	struct es7000_oem_table *table; -	acpi_size tbl_size; -	acpi_status ret; -	int i = 0; - -	for (;;) { -		ret = acpi_get_table_with_size("OEM1", i++, &header, &tbl_size); -		if (!ACPI_SUCCESS(ret)) -			return -1; - -		if (!memcmp((char *) &header->oem_id, "UNISYS", 6)) -			break; - -		early_acpi_os_unmap_memory(header, tbl_size); -	} - -	table = (void *)header; - -	oem_addrX	= table->OEMTableAddr; -	oem_size	= table->OEMTableSize; - -	early_acpi_os_unmap_memory(header, tbl_size); - -	*oem_addr	= (unsigned long)__acpi_map_table(oem_addrX, oem_size); - -	return 0; -} - -static void __init unmap_unisys_acpi_oem_table(unsigned long oem_addr) -{ -	if (!oem_addr) -		return; - -	__acpi_unmap_table((char *)oem_addr, oem_size); -} - -static int es7000_check_dsdt(void) -{ -	struct acpi_table_header header; - -	if (ACPI_SUCCESS(acpi_get_table_header(ACPI_SIG_DSDT, 0, &header)) && -	    !strncmp(header.oem_id, "UNISYS", 6)) -		return 1; -	return 0; -} - -static int es7000_acpi_ret; - -/* Hook from generic ACPI tables.c */ -static int __init es7000_acpi_madt_oem_check(char *oem_id, char *oem_table_id) -{ -	unsigned long oem_addr = 0; -	int check_dsdt; -	int ret = 0; - -	/* check dsdt at first to avoid clear fix_map for oem_addr */ -	check_dsdt = es7000_check_dsdt(); - -	if (!find_unisys_acpi_oem_table(&oem_addr)) { -		if (check_dsdt) { -			ret = parse_unisys_oem((char *)oem_addr); -		} else { -			setup_unisys(); -			ret = 1; -		} -		/* -		 * we need to unmap it -		 */ -		unmap_unisys_acpi_oem_table(oem_addr); -	} - -	es7000_acpi_ret = ret; - -	return ret && !es7000_apic_is_cluster(); -} - -static int es7000_acpi_madt_oem_check_cluster(char *oem_id, char *oem_table_id) -{ -	int ret = es7000_acpi_ret; - -	return ret && es7000_apic_is_cluster(); -} - -#else /* !CONFIG_ACPI: */ -static int es7000_acpi_madt_oem_check(char *oem_id, char *oem_table_id) -{ -	return 0; -} - -static int 
es7000_acpi_madt_oem_check_cluster(char *oem_id, char *oem_table_id) -{ -	return 0; -} -#endif /* !CONFIG_ACPI */ - -static void es7000_spin(int n) -{ -	int i = 0; - -	while (i++ < n) -		rep_nop(); -} - -static int es7000_mip_write(struct mip_reg *mip_reg) -{ -	int status = 0; -	int spin; - -	spin = MIP_SPIN; -	while ((host_reg->off_0x38 & MIP_VALID) != 0) { -		if (--spin <= 0) { -			WARN(1,	"Timeout waiting for Host Valid Flag\n"); -			return -1; -		} -		es7000_spin(MIP_SPIN); -	} - -	memcpy(host_reg, mip_reg, sizeof(struct mip_reg)); -	outb(1, mip_port); - -	spin = MIP_SPIN; - -	while ((mip_reg->off_0x38 & MIP_VALID) == 0) { -		if (--spin <= 0) { -			WARN(1,	"Timeout waiting for MIP Valid Flag\n"); -			return -1; -		} -		es7000_spin(MIP_SPIN); -	} - -	status = (mip_reg->off_0x00 & 0xffff0000000000ULL) >> 48; -	mip_reg->off_0x38 &= ~MIP_VALID; - -	return status; -} - -static void es7000_enable_apic_mode(void) -{ -	struct mip_reg es7000_mip_reg; -	int mip_status; - -	if (!es7000_plat) -		return; - -	pr_info("Enabling APIC mode.\n"); -	memset(&es7000_mip_reg, 0, sizeof(struct mip_reg)); -	es7000_mip_reg.off_0x00 = MIP_SW_APIC; -	es7000_mip_reg.off_0x38 = MIP_VALID; - -	while ((mip_status = es7000_mip_write(&es7000_mip_reg)) != 0) -		WARN(1, "Command failed, status = %x\n", mip_status); -} - -static void es7000_wait_for_init_deassert(atomic_t *deassert) -{ -	while (!atomic_read(deassert)) -		cpu_relax(); -} - -static unsigned int es7000_get_apic_id(unsigned long x) -{ -	return (x >> 24) & 0xFF; -} - -static void es7000_send_IPI_mask(const struct cpumask *mask, int vector) -{ -	default_send_IPI_mask_sequence_phys(mask, vector); -} - -static void es7000_send_IPI_allbutself(int vector) -{ -	default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector); -} - -static void es7000_send_IPI_all(int vector) -{ -	es7000_send_IPI_mask(cpu_online_mask, vector); -} - -static int es7000_apic_id_registered(void) -{ -	return 1; -} - -static const struct cpumask *target_cpus_cluster(void) -{ -	return cpu_all_mask; -} - -static const struct cpumask *es7000_target_cpus(void) -{ -	return cpumask_of(smp_processor_id()); -} - -static unsigned long es7000_check_apicid_used(physid_mask_t *map, int apicid) -{ -	return 0; -} - -static unsigned long es7000_check_apicid_present(int bit) -{ -	return physid_isset(bit, phys_cpu_present_map); -} - -static int es7000_early_logical_apicid(int cpu) -{ -	/* on es7000, logical apicid is the same as physical */ -	return early_per_cpu(x86_bios_cpu_apicid, cpu); -} - -static unsigned long calculate_ldr(int cpu) -{ -	unsigned long id = per_cpu(x86_bios_cpu_apicid, cpu); - -	return SET_APIC_LOGICAL_ID(id); -} - -/* - * Set up the logical destination ID. - * - * Intel recommends to set DFR, LdR and TPR before enabling - * an APIC.  See e.g. "AP-388 82489DX User's Manual" (Intel - * document number 292116).  So here it goes... - */ -static void es7000_init_apic_ldr_cluster(void) -{ -	unsigned long val; -	int cpu = smp_processor_id(); - -	apic_write(APIC_DFR, APIC_DFR_CLUSTER); -	val = calculate_ldr(cpu); -	apic_write(APIC_LDR, val); -} - -static void es7000_init_apic_ldr(void) -{ -	unsigned long val; -	int cpu = smp_processor_id(); - -	apic_write(APIC_DFR, APIC_DFR_FLAT); -	val = calculate_ldr(cpu); -	apic_write(APIC_LDR, val); -} - -static void es7000_setup_apic_routing(void) -{ -	int apic = per_cpu(x86_bios_cpu_apicid, smp_processor_id()); - -	pr_info("Enabling APIC mode:  %s. Using %d I/O APICs, target cpus %lx\n", -		(apic_version[apic] == 0x14) ? 
-			"Physical Cluster" : "Logical Cluster", -		nr_ioapics, cpumask_bits(es7000_target_cpus())[0]); -} - -static int es7000_cpu_present_to_apicid(int mps_cpu) -{ -	if (!mps_cpu) -		return boot_cpu_physical_apicid; -	else if (mps_cpu < nr_cpu_ids) -		return per_cpu(x86_bios_cpu_apicid, mps_cpu); -	else -		return BAD_APICID; -} - -static int cpu_id; - -static void es7000_apicid_to_cpu_present(int phys_apicid, physid_mask_t *retmap) -{ -	physid_set_mask_of_physid(cpu_id, retmap); -	++cpu_id; -} - -static void es7000_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) -{ -	/* For clustered we don't have a good way to do this yet - hack */ -	physids_promote(0xFFL, retmap); -} - -static int es7000_check_phys_apicid_present(int cpu_physical_apicid) -{ -	boot_cpu_physical_apicid = read_apic_id(); -	return 1; -} - -static inline int -es7000_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *dest_id) -{ -	unsigned int round = 0; -	unsigned int cpu, uninitialized_var(apicid); - -	/* -	 * The cpus in the mask must all be on the apic cluster. -	 */ -	for_each_cpu_and(cpu, cpumask, cpu_online_mask) { -		int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu); - -		if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) { -			WARN(1, "Not a valid mask!"); - -			return -EINVAL; -		} -		apicid |= new_apicid; -		round++; -	} -	if (!round) -		return -EINVAL; -	*dest_id = apicid; -	return 0; -} - -static int -es7000_cpu_mask_to_apicid_and(const struct cpumask *inmask, -			      const struct cpumask *andmask, -			      unsigned int *apicid) -{ -	cpumask_var_t cpumask; -	*apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0); - -	if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC)) -		return 0; - -	cpumask_and(cpumask, inmask, andmask); -	es7000_cpu_mask_to_apicid(cpumask, apicid); - -	free_cpumask_var(cpumask); - -	return 0; -} - -static int es7000_phys_pkg_id(int cpuid_apic, int index_msb) -{ -	return cpuid_apic >> index_msb; -} - -static int probe_es7000(void) -{ -	/* probed later in mptable/ACPI hooks */ -	return 0; -} - -static int es7000_mps_ret; -static int es7000_mps_oem_check(struct mpc_table *mpc, char *oem, -		char *productid) -{ -	int ret = 0; - -	if (mpc->oemptr) { -		struct mpc_oemtable *oem_table = -			(struct mpc_oemtable *)mpc->oemptr; - -		if (!strncmp(oem, "UNISYS", 6)) -			ret = parse_unisys_oem((char *)oem_table); -	} - -	es7000_mps_ret = ret; - -	return ret && !es7000_apic_is_cluster(); -} - -static int es7000_mps_oem_check_cluster(struct mpc_table *mpc, char *oem, -		char *productid) -{ -	int ret = es7000_mps_ret; - -	return ret && es7000_apic_is_cluster(); -} - -/* We've been warned by a false positive warning.Use __refdata to keep calm. 
*/ -static struct apic __refdata apic_es7000_cluster = { - -	.name				= "es7000", -	.probe				= probe_es7000, -	.acpi_madt_oem_check		= es7000_acpi_madt_oem_check_cluster, -	.apic_id_valid			= default_apic_id_valid, -	.apic_id_registered		= es7000_apic_id_registered, - -	.irq_delivery_mode		= dest_LowestPrio, -	/* logical delivery broadcast to all procs: */ -	.irq_dest_mode			= 1, - -	.target_cpus			= target_cpus_cluster, -	.disable_esr			= 1, -	.dest_logical			= 0, -	.check_apicid_used		= es7000_check_apicid_used, -	.check_apicid_present		= es7000_check_apicid_present, - -	.vector_allocation_domain	= flat_vector_allocation_domain, -	.init_apic_ldr			= es7000_init_apic_ldr_cluster, - -	.ioapic_phys_id_map		= es7000_ioapic_phys_id_map, -	.setup_apic_routing		= es7000_setup_apic_routing, -	.multi_timer_check		= NULL, -	.cpu_present_to_apicid		= es7000_cpu_present_to_apicid, -	.apicid_to_cpu_present		= es7000_apicid_to_cpu_present, -	.setup_portio_remap		= NULL, -	.check_phys_apicid_present	= es7000_check_phys_apicid_present, -	.enable_apic_mode		= es7000_enable_apic_mode, -	.phys_pkg_id			= es7000_phys_pkg_id, -	.mps_oem_check			= es7000_mps_oem_check_cluster, - -	.get_apic_id			= es7000_get_apic_id, -	.set_apic_id			= NULL, -	.apic_id_mask			= 0xFF << 24, - -	.cpu_mask_to_apicid_and		= es7000_cpu_mask_to_apicid_and, - -	.send_IPI_mask			= es7000_send_IPI_mask, -	.send_IPI_mask_allbutself	= NULL, -	.send_IPI_allbutself		= es7000_send_IPI_allbutself, -	.send_IPI_all			= es7000_send_IPI_all, -	.send_IPI_self			= default_send_IPI_self, - -	.wakeup_secondary_cpu		= wakeup_secondary_cpu_via_mip, - -	.trampoline_phys_low		= 0x467, -	.trampoline_phys_high		= 0x469, - -	.wait_for_init_deassert		= NULL, - -	/* Nothing to do for most platforms, since cleared by the INIT cycle: */ -	.smp_callin_clear_local_apic	= NULL, -	.inquire_remote_apic		= default_inquire_remote_apic, - -	.read				= native_apic_mem_read, -	.write				= native_apic_mem_write, -	.eoi_write			= native_apic_mem_write, -	.icr_read			= native_apic_icr_read, -	.icr_write			= native_apic_icr_write, -	.wait_icr_idle			= native_apic_wait_icr_idle, -	.safe_wait_icr_idle		= native_safe_apic_wait_icr_idle, - -	.x86_32_early_logical_apicid	= es7000_early_logical_apicid, -}; - -static struct apic __refdata apic_es7000 = { - -	.name				= "es7000", -	.probe				= probe_es7000, -	.acpi_madt_oem_check		= es7000_acpi_madt_oem_check, -	.apic_id_valid			= default_apic_id_valid, -	.apic_id_registered		= es7000_apic_id_registered, - -	.irq_delivery_mode		= dest_Fixed, -	/* phys delivery to target CPUs: */ -	.irq_dest_mode			= 0, - -	.target_cpus			= es7000_target_cpus, -	.disable_esr			= 1, -	.dest_logical			= 0, -	.check_apicid_used		= es7000_check_apicid_used, -	.check_apicid_present		= es7000_check_apicid_present, - -	.vector_allocation_domain	= flat_vector_allocation_domain, -	.init_apic_ldr			= es7000_init_apic_ldr, - -	.ioapic_phys_id_map		= es7000_ioapic_phys_id_map, -	.setup_apic_routing		= es7000_setup_apic_routing, -	.multi_timer_check		= NULL, -	.cpu_present_to_apicid		= es7000_cpu_present_to_apicid, -	.apicid_to_cpu_present		= es7000_apicid_to_cpu_present, -	.setup_portio_remap		= NULL, -	.check_phys_apicid_present	= es7000_check_phys_apicid_present, -	.enable_apic_mode		= es7000_enable_apic_mode, -	.phys_pkg_id			= es7000_phys_pkg_id, -	.mps_oem_check			= es7000_mps_oem_check, - -	.get_apic_id			= es7000_get_apic_id, -	.set_apic_id			= NULL, -	.apic_id_mask			= 0xFF << 24, - -	.cpu_mask_to_apicid_and		= es7000_cpu_mask_to_apicid_and, - -	
.send_IPI_mask			= es7000_send_IPI_mask, -	.send_IPI_mask_allbutself	= NULL, -	.send_IPI_allbutself		= es7000_send_IPI_allbutself, -	.send_IPI_all			= es7000_send_IPI_all, -	.send_IPI_self			= default_send_IPI_self, - -	.trampoline_phys_low		= 0x467, -	.trampoline_phys_high		= 0x469, - -	.wait_for_init_deassert		= es7000_wait_for_init_deassert, - -	/* Nothing to do for most platforms, since cleared by the INIT cycle: */ -	.smp_callin_clear_local_apic	= NULL, -	.inquire_remote_apic		= default_inquire_remote_apic, - -	.read				= native_apic_mem_read, -	.write				= native_apic_mem_write, -	.eoi_write			= native_apic_mem_write, -	.icr_read			= native_apic_icr_read, -	.icr_write			= native_apic_icr_write, -	.wait_icr_idle			= native_apic_wait_icr_idle, -	.safe_wait_icr_idle		= native_safe_apic_wait_icr_idle, - -	.x86_32_early_logical_apicid	= es7000_early_logical_apicid, -}; - -/* - * Need to check for es7000 followed by es7000_cluster, so this order - * in apic_drivers is important. - */ -apic_drivers(apic_es7000, apic_es7000_cluster); diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index a698d7165c9..6a1e71bde32 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -33,34 +33,44 @@ static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;  /* "in progress" flag of arch_trigger_all_cpu_backtrace */  static unsigned long backtrace_flag; -void arch_trigger_all_cpu_backtrace(void) +void arch_trigger_all_cpu_backtrace(bool include_self)  {  	int i; +	int cpu = get_cpu(); -	if (test_and_set_bit(0, &backtrace_flag)) +	if (test_and_set_bit(0, &backtrace_flag)) {  		/*  		 * If there is already a trigger_all_cpu_backtrace() in progress  		 * (backtrace_flag == 1), don't output double cpu dump infos.  		 */ +		put_cpu();  		return; +	}  	cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask); +	if (!include_self) +		cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); -	printk(KERN_INFO "sending NMI to all CPUs:\n"); -	apic->send_IPI_all(NMI_VECTOR); +	if (!cpumask_empty(to_cpumask(backtrace_mask))) { +		pr_info("sending NMI to %s CPUs:\n", +			(include_self ? 
"all" : "other")); +		apic->send_IPI_mask(to_cpumask(backtrace_mask), NMI_VECTOR); +	}  	/* Wait for up to 10 seconds for all CPUs to do the backtrace */  	for (i = 0; i < 10 * 1000; i++) {  		if (cpumask_empty(to_cpumask(backtrace_mask)))  			break;  		mdelay(1); +		touch_softlockup_watchdog();  	}  	clear_bit(0, &backtrace_flag); -	smp_mb__after_clear_bit(); +	smp_mb__after_atomic(); +	put_cpu();  } -static int __kprobes +static int  arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs)  {  	int cpu; @@ -80,6 +90,7 @@ arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs)  	return NMI_DONE;  } +NOKPROBE_SYMBOL(arch_trigger_all_cpu_backtrace_handler);  static int __init register_trigger_all_cpu_backtrace(void)  { diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index e63a5bd2a78..81e08eff05e 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -37,9 +37,6 @@  #include <linux/kthread.h>  #include <linux/jiffies.h>	/* time_after() */  #include <linux/slab.h> -#ifdef CONFIG_ACPI -#include <acpi/acpi_bus.h> -#endif  #include <linux/bootmem.h>  #include <linux/dmar.h>  #include <linux/hpet.h> @@ -209,9 +206,6 @@ int __init arch_early_irq_init(void)  	count = ARRAY_SIZE(irq_cfgx);  	node = cpu_to_node(0); -	/* Make sure the legacy interrupts are marked in the bitmap */ -	irq_reserve_irqs(0, legacy_pic->nr_legacy_irqs); -  	for (i = 0; i < count; i++) {  		irq_set_chip_data(i, &cfg[i]);  		zalloc_cpumask_var_node(&cfg[i].domain, GFP_KERNEL, node); @@ -284,18 +278,6 @@ static struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node)  	return cfg;  } -static int alloc_irqs_from(unsigned int from, unsigned int count, int node) -{ -	return irq_alloc_descs_from(from, count, node); -} - -static void free_irq_at(unsigned int at, struct irq_cfg *cfg) -{ -	free_irq_cfg(at, cfg); -	irq_free_desc(at); -} - -  struct io_apic {  	unsigned int index;  	unsigned int unused[3]; @@ -1142,9 +1124,10 @@ next:  		if (test_bit(vector, used_vectors))  			goto next; -		for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) -			if (per_cpu(vector_irq, new_cpu)[vector] != -1) +		for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) { +			if (per_cpu(vector_irq, new_cpu)[vector] > VECTOR_UNDEFINED)  				goto next; +		}  		/* Found one! 
*/  		current_vector = vector;  		current_offset = offset; @@ -1183,7 +1166,7 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg)  	vector = cfg->vector;  	for_each_cpu_and(cpu, cfg->domain, cpu_online_mask) -		per_cpu(vector_irq, cpu)[vector] = -1; +		per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED;  	cfg->vector = 0;  	cpumask_clear(cfg->domain); @@ -1191,11 +1174,10 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg)  	if (likely(!cfg->move_in_progress))  		return;  	for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) { -		for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; -								vector++) { +		for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {  			if (per_cpu(vector_irq, cpu)[vector] != irq)  				continue; -			per_cpu(vector_irq, cpu)[vector] = -1; +			per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED;  			break;  		}  	} @@ -1228,12 +1210,12 @@ void __setup_vector_irq(int cpu)  	/* Mark the free vectors */  	for (vector = 0; vector < NR_VECTORS; ++vector) {  		irq = per_cpu(vector_irq, cpu)[vector]; -		if (irq < 0) +		if (irq <= VECTOR_UNDEFINED)  			continue;  		cfg = irq_cfg(irq);  		if (!cpumask_test_cpu(cpu, cfg->domain)) -			per_cpu(vector_irq, cpu)[vector] = -1; +			per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED;  	}  	raw_spin_unlock(&vector_lock);  } @@ -2192,7 +2174,7 @@ void send_cleanup_vector(struct irq_cfg *cfg)  	cfg->move_in_progress = 0;  } -asmlinkage void smp_irq_move_cleanup_interrupt(void) +asmlinkage __visible void smp_irq_move_cleanup_interrupt(void)  {  	unsigned vector, me; @@ -2202,13 +2184,13 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)  	me = smp_processor_id();  	for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { -		unsigned int irq; +		int irq;  		unsigned int irr;  		struct irq_desc *desc;  		struct irq_cfg *cfg;  		irq = __this_cpu_read(vector_irq[vector]); -		if (irq == -1) +		if (irq <= VECTOR_UNDEFINED)  			continue;  		desc = irq_to_desc(irq); @@ -2315,7 +2297,7 @@ int __ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,  	int err;  	if (!config_enabled(CONFIG_SMP)) -		return -1; +		return -EPERM;  	if (!cpumask_intersects(mask, cpu_online_mask))  		return -EINVAL; @@ -2346,7 +2328,7 @@ int native_ioapic_set_affinity(struct irq_data *data,  	int ret;  	if (!config_enabled(CONFIG_SMP)) -		return -1; +		return -EPERM;  	raw_spin_lock_irqsave(&ioapic_lock, flags);  	ret = __ioapic_set_affinity(data, mask, &dest); @@ -2919,98 +2901,39 @@ static int __init ioapic_init_ops(void)  device_initcall(ioapic_init_ops);  /* - * Dynamic irq allocate and deallocation + * Dynamic irq allocate and deallocation. Should be replaced by irq domains!   
*/ -unsigned int __create_irqs(unsigned int from, unsigned int count, int node) +int arch_setup_hwirq(unsigned int irq, int node)  { -	struct irq_cfg **cfg; +	struct irq_cfg *cfg;  	unsigned long flags; -	int irq, i; - -	if (from < nr_irqs_gsi) -		from = nr_irqs_gsi; +	int ret; -	cfg = kzalloc_node(count * sizeof(cfg[0]), GFP_KERNEL, node); +	cfg = alloc_irq_cfg(irq, node);  	if (!cfg) -		return 0; - -	irq = alloc_irqs_from(from, count, node); -	if (irq < 0) -		goto out_cfgs; - -	for (i = 0; i < count; i++) { -		cfg[i] = alloc_irq_cfg(irq + i, node); -		if (!cfg[i]) -			goto out_irqs; -	} +		return -ENOMEM;  	raw_spin_lock_irqsave(&vector_lock, flags); -	for (i = 0; i < count; i++) -		if (__assign_irq_vector(irq + i, cfg[i], apic->target_cpus())) -			goto out_vecs; +	ret = __assign_irq_vector(irq, cfg, apic->target_cpus());  	raw_spin_unlock_irqrestore(&vector_lock, flags); -	for (i = 0; i < count; i++) { -		irq_set_chip_data(irq + i, cfg[i]); -		irq_clear_status_flags(irq + i, IRQ_NOREQUEST); -	} - -	kfree(cfg); -	return irq; - -out_vecs: -	for (i--; i >= 0; i--) -		__clear_irq_vector(irq + i, cfg[i]); -	raw_spin_unlock_irqrestore(&vector_lock, flags); -out_irqs: -	for (i = 0; i < count; i++) -		free_irq_at(irq + i, cfg[i]); -out_cfgs: -	kfree(cfg); -	return 0; -} - -unsigned int create_irq_nr(unsigned int from, int node) -{ -	return __create_irqs(from, 1, node); -} - -int create_irq(void) -{ -	int node = cpu_to_node(0); -	unsigned int irq_want; -	int irq; - -	irq_want = nr_irqs_gsi; -	irq = create_irq_nr(irq_want, node); - -	if (irq == 0) -		irq = -1; - -	return irq; +	if (!ret) +		irq_set_chip_data(irq, cfg); +	else +		free_irq_cfg(irq, cfg); +	return ret;  } -void destroy_irq(unsigned int irq) +void arch_teardown_hwirq(unsigned int irq)  {  	struct irq_cfg *cfg = irq_get_chip_data(irq);  	unsigned long flags; -	irq_set_status_flags(irq, IRQ_NOREQUEST|IRQ_NOPROBE); -  	free_remapped_irq(irq); -  	raw_spin_lock_irqsave(&vector_lock, flags);  	__clear_irq_vector(irq, cfg);  	raw_spin_unlock_irqrestore(&vector_lock, flags); -	free_irq_at(irq, cfg); -} - -void destroy_irqs(unsigned int irq, unsigned int count) -{ -	unsigned int i; - -	for (i = 0; i < count; i++) -		destroy_irq(irq + i); +	free_irq_cfg(irq, cfg);  }  /* @@ -3078,9 +3001,11 @@ msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)  	struct irq_cfg *cfg = data->chip_data;  	struct msi_msg msg;  	unsigned int dest; +	int ret; -	if (__ioapic_set_affinity(data, mask, &dest)) -		return -1; +	ret = __ioapic_set_affinity(data, mask, &dest); +	if (ret) +		return ret;  	__get_cached_msi_msg(data->msi_desc, &msg); @@ -3139,8 +3064,8 @@ int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc,  int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)  { -	unsigned int irq, irq_want;  	struct msi_desc *msidesc; +	unsigned int irq;  	int node, ret;  	/* Multiple MSI vectors only supported with interrupt remapping */ @@ -3148,28 +3073,25 @@ int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)  		return 1;  	node = dev_to_node(&dev->dev); -	irq_want = nr_irqs_gsi; +  	list_for_each_entry(msidesc, &dev->msi_list, list) { -		irq = create_irq_nr(irq_want, node); -		if (irq == 0) +		irq = irq_alloc_hwirq(node); +		if (!irq)  			return -ENOSPC; -		irq_want = irq + 1; -  		ret = setup_msi_irq(dev, msidesc, irq, 0); -		if (ret < 0) -			goto error; +		if (ret < 0) { +			irq_free_hwirq(irq); +			return ret; +		} +  	}  	return 0; - -error: -	destroy_irq(irq); -	return ret;  }  void 
native_teardown_msi_irq(unsigned int irq)  { -	destroy_irq(irq); +	irq_free_hwirq(irq);  }  #ifdef CONFIG_DMAR_TABLE @@ -3180,9 +3102,11 @@ dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask,  	struct irq_cfg *cfg = data->chip_data;  	unsigned int dest, irq = data->irq;  	struct msi_msg msg; +	int ret; -	if (__ioapic_set_affinity(data, mask, &dest)) -		return -1; +	ret = __ioapic_set_affinity(data, mask, &dest); +	if (ret) +		return ret;  	dmar_msi_read(irq, &msg); @@ -3229,9 +3153,11 @@ static int hpet_msi_set_affinity(struct irq_data *data,  	struct irq_cfg *cfg = data->chip_data;  	struct msi_msg msg;  	unsigned int dest; +	int ret; -	if (__ioapic_set_affinity(data, mask, &dest)) -		return -1; +	ret = __ioapic_set_affinity(data, mask, &dest); +	if (ret) +		return ret;  	hpet_msi_read(data->handler_data, &msg); @@ -3298,9 +3224,11 @@ ht_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)  {  	struct irq_cfg *cfg = data->chip_data;  	unsigned int dest; +	int ret; -	if (__ioapic_set_affinity(data, mask, &dest)) -		return -1; +	ret = __ioapic_set_affinity(data, mask, &dest); +	if (ret) +		return ret;  	target_ht_irq(data->irq, dest, cfg->vector);  	return IRQ_SET_MASK_OK_NOCOPY; @@ -3423,9 +3351,9 @@ static void __init probe_nr_irqs_gsi(void)  	printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi);  } -int get_nr_irqs_gsi(void) +unsigned int arch_dynirq_lower_bound(unsigned int from)  { -	return nr_irqs_gsi; +	return from < nr_irqs_gsi ? nr_irqs_gsi : from;  }  int __init arch_probe_nr_irqs(void) diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index 7434d8556d0..62071569bd5 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -1,6 +1,5 @@  #include <linux/cpumask.h>  #include <linux/interrupt.h> -#include <linux/init.h>  #include <linux/mm.h>  #include <linux/delay.h> diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c deleted file mode 100644 index 1e42e8f305e..00000000000 --- a/arch/x86/kernel/apic/numaq_32.c +++ /dev/null @@ -1,525 +0,0 @@ -/* - * Written by: Patricia Gaughen, IBM Corporation - * - * Copyright (C) 2002, IBM Corp. - * Copyright (C) 2009, Red Hat, Inc., Ingo Molnar - * - * All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT.  See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- * - * Send feedback to <gone@us.ibm.com> - */ -#include <linux/nodemask.h> -#include <linux/topology.h> -#include <linux/bootmem.h> -#include <linux/memblock.h> -#include <linux/threads.h> -#include <linux/cpumask.h> -#include <linux/kernel.h> -#include <linux/mmzone.h> -#include <linux/module.h> -#include <linux/string.h> -#include <linux/init.h> -#include <linux/numa.h> -#include <linux/smp.h> -#include <linux/io.h> -#include <linux/mm.h> - -#include <asm/processor.h> -#include <asm/fixmap.h> -#include <asm/mpspec.h> -#include <asm/numaq.h> -#include <asm/setup.h> -#include <asm/apic.h> -#include <asm/e820.h> -#include <asm/ipi.h> - -int found_numaq; - -/* - * Have to match translation table entries to main table entries by counter - * hence the mpc_record variable .... can't see a less disgusting way of - * doing this .... - */ -struct mpc_trans { -	unsigned char			mpc_type; -	unsigned char			trans_len; -	unsigned char			trans_type; -	unsigned char			trans_quad; -	unsigned char			trans_global; -	unsigned char			trans_local; -	unsigned short			trans_reserved; -}; - -static int				mpc_record; - -static struct mpc_trans			*translation_table[MAX_MPC_ENTRY]; - -int					mp_bus_id_to_node[MAX_MP_BUSSES]; -int					mp_bus_id_to_local[MAX_MP_BUSSES]; -int					quad_local_to_mp_bus_id[NR_CPUS/4][4]; - - -static inline void numaq_register_node(int node, struct sys_cfg_data *scd) -{ -	struct eachquadmem *eq = scd->eq + node; -	u64 start = (u64)(eq->hi_shrd_mem_start - eq->priv_mem_size) << 20; -	u64 end = (u64)(eq->hi_shrd_mem_start + eq->hi_shrd_mem_size) << 20; -	int ret; - -	node_set(node, numa_nodes_parsed); -	ret = numa_add_memblk(node, start, end); -	BUG_ON(ret < 0); -} - -/* - * Function: smp_dump_qct() - * - * Description: gets memory layout from the quad config table.  This - * function also updates numa_nodes_parsed with the nodes (quads) present. - */ -static void __init smp_dump_qct(void) -{ -	struct sys_cfg_data *scd; -	int node; - -	scd = (void *)__va(SYS_CFG_DATA_PRIV_ADDR); - -	for_each_node(node) { -		if (scd->quads_present31_0 & (1 << node)) -			numaq_register_node(node, scd); -	} -} - -void numaq_tsc_disable(void) -{ -	if (!found_numaq) -		return; - -	if (num_online_nodes() > 1) { -		printk(KERN_DEBUG "NUMAQ: disabling TSC\n"); -		setup_clear_cpu_cap(X86_FEATURE_TSC); -	} -} - -static void __init numaq_tsc_init(void) -{ -	numaq_tsc_disable(); -} - -static inline int generate_logical_apicid(int quad, int phys_apicid) -{ -	return (quad << 4) + (phys_apicid ? 
phys_apicid << 1 : 1); -} - -/* x86_quirks member */ -static int mpc_apic_id(struct mpc_cpu *m) -{ -	int quad = translation_table[mpc_record]->trans_quad; -	int logical_apicid = generate_logical_apicid(quad, m->apicid); - -	printk(KERN_DEBUG -		"Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n", -		 m->apicid, (m->cpufeature & CPU_FAMILY_MASK) >> 8, -		(m->cpufeature & CPU_MODEL_MASK) >> 4, -		 m->apicver, quad, logical_apicid); - -	return logical_apicid; -} - -/* x86_quirks member */ -static void mpc_oem_bus_info(struct mpc_bus *m, char *name) -{ -	int quad = translation_table[mpc_record]->trans_quad; -	int local = translation_table[mpc_record]->trans_local; - -	mp_bus_id_to_node[m->busid] = quad; -	mp_bus_id_to_local[m->busid] = local; - -	printk(KERN_INFO "Bus #%d is %s (node %d)\n", m->busid, name, quad); -} - -/* x86_quirks member */ -static void mpc_oem_pci_bus(struct mpc_bus *m) -{ -	int quad = translation_table[mpc_record]->trans_quad; -	int local = translation_table[mpc_record]->trans_local; - -	quad_local_to_mp_bus_id[quad][local] = m->busid; -} - -/* - * Called from mpparse code. - * mode = 0: prescan - * mode = 1: one mpc entry scanned - */ -static void numaq_mpc_record(unsigned int mode) -{ -	if (!mode) -		mpc_record = 0; -	else -		mpc_record++; -} - -static void __init MP_translation_info(struct mpc_trans *m) -{ -	printk(KERN_INFO -	    "Translation: record %d, type %d, quad %d, global %d, local %d\n", -	       mpc_record, m->trans_type, m->trans_quad, m->trans_global, -	       m->trans_local); - -	if (mpc_record >= MAX_MPC_ENTRY) -		printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n"); -	else -		translation_table[mpc_record] = m; /* stash this for later */ - -	if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad)) -		node_set_online(m->trans_quad); -} - -static int __init mpf_checksum(unsigned char *mp, int len) -{ -	int sum = 0; - -	while (len--) -		sum += *mp++; - -	return sum & 0xFF; -} - -/* - * Read/parse the MPC oem tables - */ -static void __init smp_read_mpc_oem(struct mpc_table *mpc) -{ -	struct mpc_oemtable *oemtable = (void *)(long)mpc->oemptr; -	int count = sizeof(*oemtable);	/* the header size */ -	unsigned char *oemptr = ((unsigned char *)oemtable) + count; - -	mpc_record = 0; -	printk(KERN_INFO -		"Found an OEM MPC table at %8p - parsing it...\n", oemtable); - -	if (memcmp(oemtable->signature, MPC_OEM_SIGNATURE, 4)) { -		printk(KERN_WARNING -		       "SMP mpc oemtable: bad signature [%c%c%c%c]!\n", -		       oemtable->signature[0], oemtable->signature[1], -		       oemtable->signature[2], oemtable->signature[3]); -		return; -	} - -	if (mpf_checksum((unsigned char *)oemtable, oemtable->length)) { -		printk(KERN_WARNING "SMP oem mptable: checksum error!\n"); -		return; -	} - -	while (count < oemtable->length) { -		switch (*oemptr) { -		case MP_TRANSLATION: -			{ -				struct mpc_trans *m = (void *)oemptr; - -				MP_translation_info(m); -				oemptr += sizeof(*m); -				count += sizeof(*m); -				++mpc_record; -				break; -			} -		default: -			printk(KERN_WARNING -			       "Unrecognised OEM table entry type! 
- %d\n", -			       (int)*oemptr); -			return; -		} -	} -} - -static __init void early_check_numaq(void) -{ -	/* -	 * get boot-time SMP configuration: -	 */ -	if (smp_found_config) -		early_get_smp_config(); - -	if (found_numaq) { -		x86_init.mpparse.mpc_record = numaq_mpc_record; -		x86_init.mpparse.setup_ioapic_ids = x86_init_noop; -		x86_init.mpparse.mpc_apic_id = mpc_apic_id; -		x86_init.mpparse.smp_read_mpc_oem = smp_read_mpc_oem; -		x86_init.mpparse.mpc_oem_pci_bus = mpc_oem_pci_bus; -		x86_init.mpparse.mpc_oem_bus_info = mpc_oem_bus_info; -		x86_init.timers.tsc_pre_init = numaq_tsc_init; -		x86_init.pci.init = pci_numaq_init; -	} -} - -int __init numaq_numa_init(void) -{ -	early_check_numaq(); -	if (!found_numaq) -		return -ENOENT; -	smp_dump_qct(); - -	return 0; -} - -#define NUMAQ_APIC_DFR_VALUE	(APIC_DFR_CLUSTER) - -static inline unsigned int numaq_get_apic_id(unsigned long x) -{ -	return (x >> 24) & 0x0F; -} - -static inline void numaq_send_IPI_mask(const struct cpumask *mask, int vector) -{ -	default_send_IPI_mask_sequence_logical(mask, vector); -} - -static inline void numaq_send_IPI_allbutself(int vector) -{ -	default_send_IPI_mask_allbutself_logical(cpu_online_mask, vector); -} - -static inline void numaq_send_IPI_all(int vector) -{ -	numaq_send_IPI_mask(cpu_online_mask, vector); -} - -#define NUMAQ_TRAMPOLINE_PHYS_LOW	(0x8) -#define NUMAQ_TRAMPOLINE_PHYS_HIGH	(0xa) - -/* - * Because we use NMIs rather than the INIT-STARTUP sequence to - * bootstrap the CPUs, the APIC may be in a weird state. Kick it: - */ -static inline void numaq_smp_callin_clear_local_apic(void) -{ -	clear_local_APIC(); -} - -static inline const struct cpumask *numaq_target_cpus(void) -{ -	return cpu_all_mask; -} - -static unsigned long numaq_check_apicid_used(physid_mask_t *map, int apicid) -{ -	return physid_isset(apicid, *map); -} - -static inline unsigned long numaq_check_apicid_present(int bit) -{ -	return physid_isset(bit, phys_cpu_present_map); -} - -static inline int numaq_apic_id_registered(void) -{ -	return 1; -} - -static inline void numaq_init_apic_ldr(void) -{ -	/* Already done in NUMA-Q firmware */ -} - -static inline void numaq_setup_apic_routing(void) -{ -	printk(KERN_INFO -		"Enabling APIC mode:  NUMA-Q.  Using %d I/O APICs\n", -		nr_ioapics); -} - -/* - * Skip adding the timer int on secondary nodes, which causes - * a small but painful rift in the time-space continuum. - */ -static inline int numaq_multi_timer_check(int apic, int irq) -{ -	return apic != 0 && irq == 0; -} - -static inline void numaq_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) -{ -	/* We don't have a good way to do this yet - hack */ -	return physids_promote(0xFUL, retmap); -} - -/* - * Supporting over 60 cpus on NUMA-Q requires a locality-dependent - * cpu to APIC ID relation to properly interact with the intelligent - * mode of the cluster controller. 
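smp_read_mpc_oem() above treats a non-zero mpf_checksum() over the whole OEM table as corruption; as with the other MP tables, the checksum byte is chosen so that every byte of the table sums to zero modulo 256. A tiny standalone sketch of that check, illustrative only and not taken from the patch:

#include <stdio.h>

/* Same idea as mpf_checksum() above: sum all bytes, keep the low 8 bits. */
static int checksum8(const unsigned char *p, int len)
{
	int sum = 0;

	while (len--)
		sum += *p++;

	return sum & 0xFF;
}

int main(void)
{
	/* Hypothetical 4-byte record; the last byte makes the sum 0 mod 256. */
	unsigned char table[4] = { 0x12, 0x34, 0x56, 0x00 };

	table[3] = (unsigned char)(0x100 - ((0x12 + 0x34 + 0x56) & 0xFF));

	printf("checksum ok: %s\n", checksum8(table, 4) == 0 ? "yes" : "no");
	return 0;
}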
- */ -static inline int numaq_cpu_present_to_apicid(int mps_cpu) -{ -	if (mps_cpu < 60) -		return ((mps_cpu >> 2) << 4) | (1 << (mps_cpu & 0x3)); -	else -		return BAD_APICID; -} - -static inline int numaq_apicid_to_node(int logical_apicid) -{ -	return logical_apicid >> 4; -} - -static int numaq_numa_cpu_node(int cpu) -{ -	int logical_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu); - -	if (logical_apicid != BAD_APICID) -		return numaq_apicid_to_node(logical_apicid); -	return NUMA_NO_NODE; -} - -static void numaq_apicid_to_cpu_present(int logical_apicid, physid_mask_t *retmap) -{ -	int node = numaq_apicid_to_node(logical_apicid); -	int cpu = __ffs(logical_apicid & 0xf); - -	physid_set_mask_of_physid(cpu + 4*node, retmap); -} - -/* Where the IO area was mapped on multiquad, always 0 otherwise */ -void *xquad_portio; - -static inline int numaq_check_phys_apicid_present(int phys_apicid) -{ -	return 1; -} - -/* - * We use physical apicids here, not logical, so just return the default - * physical broadcast to stop people from breaking us - */ -static int -numaq_cpu_mask_to_apicid_and(const struct cpumask *cpumask, -			     const struct cpumask *andmask, -			     unsigned int *apicid) -{ -	*apicid = 0x0F; -	return 0; -} - -/* No NUMA-Q box has a HT CPU, but it can't hurt to use the default code. */ -static inline int numaq_phys_pkg_id(int cpuid_apic, int index_msb) -{ -	return cpuid_apic >> index_msb; -} - -static int -numaq_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid) -{ -	if (strncmp(oem, "IBM NUMA", 8)) -		printk(KERN_ERR "Warning! Not a NUMA-Q system!\n"); -	else -		found_numaq = 1; - -	return found_numaq; -} - -static int probe_numaq(void) -{ -	/* already know from get_memcfg_numaq() */ -	return found_numaq; -} - -static void numaq_setup_portio_remap(void) -{ -	int num_quads = num_online_nodes(); - -	if (num_quads <= 1) -		return; - -	printk(KERN_INFO -		"Remapping cross-quad port I/O for %d quads\n", num_quads); - -	xquad_portio = ioremap(XQUAD_PORTIO_BASE, num_quads*XQUAD_PORTIO_QUAD); - -	printk(KERN_INFO -		"xquad_portio vaddr 0x%08lx, len %08lx\n", -		(u_long) xquad_portio, (u_long) num_quads*XQUAD_PORTIO_QUAD); -} - -/* Use __refdata to keep false positive warning calm.  
*/ -static struct apic __refdata apic_numaq = { - -	.name				= "NUMAQ", -	.probe				= probe_numaq, -	.acpi_madt_oem_check		= NULL, -	.apic_id_valid			= default_apic_id_valid, -	.apic_id_registered		= numaq_apic_id_registered, - -	.irq_delivery_mode		= dest_LowestPrio, -	/* physical delivery on LOCAL quad: */ -	.irq_dest_mode			= 0, - -	.target_cpus			= numaq_target_cpus, -	.disable_esr			= 1, -	.dest_logical			= APIC_DEST_LOGICAL, -	.check_apicid_used		= numaq_check_apicid_used, -	.check_apicid_present		= numaq_check_apicid_present, - -	.vector_allocation_domain	= flat_vector_allocation_domain, -	.init_apic_ldr			= numaq_init_apic_ldr, - -	.ioapic_phys_id_map		= numaq_ioapic_phys_id_map, -	.setup_apic_routing		= numaq_setup_apic_routing, -	.multi_timer_check		= numaq_multi_timer_check, -	.cpu_present_to_apicid		= numaq_cpu_present_to_apicid, -	.apicid_to_cpu_present		= numaq_apicid_to_cpu_present, -	.setup_portio_remap		= numaq_setup_portio_remap, -	.check_phys_apicid_present	= numaq_check_phys_apicid_present, -	.enable_apic_mode		= NULL, -	.phys_pkg_id			= numaq_phys_pkg_id, -	.mps_oem_check			= numaq_mps_oem_check, - -	.get_apic_id			= numaq_get_apic_id, -	.set_apic_id			= NULL, -	.apic_id_mask			= 0x0F << 24, - -	.cpu_mask_to_apicid_and		= numaq_cpu_mask_to_apicid_and, - -	.send_IPI_mask			= numaq_send_IPI_mask, -	.send_IPI_mask_allbutself	= NULL, -	.send_IPI_allbutself		= numaq_send_IPI_allbutself, -	.send_IPI_all			= numaq_send_IPI_all, -	.send_IPI_self			= default_send_IPI_self, - -	.wakeup_secondary_cpu		= wakeup_secondary_cpu_via_nmi, -	.trampoline_phys_low		= NUMAQ_TRAMPOLINE_PHYS_LOW, -	.trampoline_phys_high		= NUMAQ_TRAMPOLINE_PHYS_HIGH, - -	/* We don't do anything here because we use NMI's to boot instead */ -	.wait_for_init_deassert		= NULL, - -	.smp_callin_clear_local_apic	= numaq_smp_callin_clear_local_apic, -	.inquire_remote_apic		= NULL, - -	.read				= native_apic_mem_read, -	.write				= native_apic_mem_write, -	.eoi_write			= native_apic_mem_write, -	.icr_read			= native_apic_icr_read, -	.icr_write			= native_apic_icr_write, -	.wait_icr_idle			= native_apic_wait_icr_idle, -	.safe_wait_icr_idle		= native_safe_apic_wait_icr_idle, - -	.x86_32_early_logical_apicid	= noop_x86_32_early_logical_apicid, -	.x86_32_numa_cpu_node		= numaq_numa_cpu_node, -}; - -apic_driver(apic_numaq); diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index eb35ef9ee63..cceb352c968 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -119,8 +119,7 @@ static struct apic apic_default = {  	.trampoline_phys_low		= DEFAULT_TRAMPOLINE_PHYS_LOW,  	.trampoline_phys_high		= DEFAULT_TRAMPOLINE_PHYS_HIGH, -	.wait_for_init_deassert		= default_wait_for_init_deassert, - +	.wait_for_init_deassert		= true,  	.smp_callin_clear_local_apic	= NULL,  	.inquire_remote_apic		= default_inquire_remote_apic, diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c deleted file mode 100644 index 77c95c0e1bf..00000000000 --- a/arch/x86/kernel/apic/summit_32.c +++ /dev/null @@ -1,552 +0,0 @@ -/* - * IBM Summit-Specific Code - * - * Written By: Matthew Dobson, IBM Corporation - * - * Copyright (c) 2003 IBM Corp. - * - * All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or (at - * your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT.  See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - * Send feedback to <colpatch@us.ibm.com> - * - */ - -#define pr_fmt(fmt) "summit: %s: " fmt, __func__ - -#include <linux/mm.h> -#include <linux/init.h> -#include <asm/io.h> -#include <asm/bios_ebda.h> - -/* - * APIC driver for the IBM "Summit" chipset. - */ -#include <linux/threads.h> -#include <linux/cpumask.h> -#include <asm/mpspec.h> -#include <asm/apic.h> -#include <asm/smp.h> -#include <asm/fixmap.h> -#include <asm/apicdef.h> -#include <asm/ipi.h> -#include <linux/kernel.h> -#include <linux/string.h> -#include <linux/gfp.h> -#include <linux/smp.h> - -static unsigned summit_get_apic_id(unsigned long x) -{ -	return (x >> 24) & 0xFF; -} - -static inline void summit_send_IPI_mask(const struct cpumask *mask, int vector) -{ -	default_send_IPI_mask_sequence_logical(mask, vector); -} - -static void summit_send_IPI_allbutself(int vector) -{ -	default_send_IPI_mask_allbutself_logical(cpu_online_mask, vector); -} - -static void summit_send_IPI_all(int vector) -{ -	summit_send_IPI_mask(cpu_online_mask, vector); -} - -#include <asm/tsc.h> - -extern int use_cyclone; - -#ifdef CONFIG_X86_SUMMIT_NUMA -static void setup_summit(void); -#else -static inline void setup_summit(void) {} -#endif - -static int summit_mps_oem_check(struct mpc_table *mpc, char *oem, -		char *productid) -{ -	if (!strncmp(oem, "IBM ENSW", 8) && -			(!strncmp(productid, "VIGIL SMP", 9) -			 || !strncmp(productid, "EXA", 3) -			 || !strncmp(productid, "RUTHLESS SMP", 12))){ -		mark_tsc_unstable("Summit based system"); -		use_cyclone = 1; /*enable cyclone-timer*/ -		setup_summit(); -		return 1; -	} -	return 0; -} - -/* Hook from generic ACPI tables.c */ -static int summit_acpi_madt_oem_check(char *oem_id, char *oem_table_id) -{ -	if (!strncmp(oem_id, "IBM", 3) && -	    (!strncmp(oem_table_id, "SERVIGIL", 8) -	     || !strncmp(oem_table_id, "EXA", 3))){ -		mark_tsc_unstable("Summit based system"); -		use_cyclone = 1; /*enable cyclone-timer*/ -		setup_summit(); -		return 1; -	} -	return 0; -} - -struct rio_table_hdr { -	unsigned char version;      /* Version number of this data structure           */ -	                            /* Version 3 adds chassis_num & WP_index           */ -	unsigned char num_scal_dev; /* # of Scalability devices (Twisters for Vigil)   */ -	unsigned char num_rio_dev;  /* # of RIO I/O devices (Cyclones and Winnipegs)   */ -} __attribute__((packed)); - -struct scal_detail { -	unsigned char node_id;      /* Scalability Node ID                             */ -	unsigned long CBAR;         /* Address of 1MB register space                   */ -	unsigned char port0node;    /* Node ID port connected to: 0xFF=None            */ -	unsigned char port0port;    /* Port num port connected to: 0,1,2, or 0xFF=None */ -	unsigned char port1node;    /* Node ID port connected to: 0xFF = None          */ -	unsigned char port1port;    /* Port num port connected to: 0,1,2, or 0xFF=None */ -	unsigned char port2node;    /* Node ID port connected to: 0xFF = None          */ -	unsigned char port2port;    /* Port num port connected to: 0,1,2, or 
0xFF=None */ -	unsigned char chassis_num;  /* 1 based Chassis number (1 = boot node)          */ -} __attribute__((packed)); - -struct rio_detail { -	unsigned char node_id;      /* RIO Node ID                                     */ -	unsigned long BBAR;         /* Address of 1MB register space                   */ -	unsigned char type;         /* Type of device                                  */ -	unsigned char owner_id;     /* For WPEG: Node ID of Cyclone that owns this WPEG*/ -	                            /* For CYC:  Node ID of Twister that owns this CYC */ -	unsigned char port0node;    /* Node ID port connected to: 0xFF=None            */ -	unsigned char port0port;    /* Port num port connected to: 0,1,2, or 0xFF=None */ -	unsigned char port1node;    /* Node ID port connected to: 0xFF=None            */ -	unsigned char port1port;    /* Port num port connected to: 0,1,2, or 0xFF=None */ -	unsigned char first_slot;   /* For WPEG: Lowest slot number below this WPEG    */ -	                            /* For CYC:  0                                     */ -	unsigned char status;       /* For WPEG: Bit 0 = 1 : the XAPIC is used         */ -	                            /*                 = 0 : the XAPIC is not used, ie:*/ -	                            /*                     ints fwded to another XAPIC */ -	                            /*           Bits1:7 Reserved                      */ -	                            /* For CYC:  Bits0:7 Reserved                      */ -	unsigned char WP_index;     /* For WPEG: WPEG instance index - lower ones have */ -	                            /*           lower slot numbers/PCI bus numbers    */ -	                            /* For CYC:  No meaning                            */ -	unsigned char chassis_num;  /* 1 based Chassis number                          */ -	                            /* For LookOut WPEGs this field indicates the      */ -	                            /* Expansion Chassis #, enumerated from Boot       */ -	                            /* Node WPEG external port, then Boot Node CYC     */ -	                            /* external port, then Next Vigil chassis WPEG     */ -	                            /* external port, etc.                             */ -	                            /* Shared Lookouts have only 1 chassis number (the */ -	                            /* first one assigned)                             */ -} __attribute__((packed)); - - -typedef enum { -	CompatTwister = 0,  /* Compatibility Twister               */ -	AltTwister    = 1,  /* Alternate Twister of internal 8-way */ -	CompatCyclone = 2,  /* Compatibility Cyclone               */ -	AltCyclone    = 3,  /* Alternate Cyclone of internal 8-way */ -	CompatWPEG    = 4,  /* Compatibility WPEG                  */ -	AltWPEG       = 5,  /* Second Planar WPEG                  */ -	LookOutAWPEG  = 6,  /* LookOut WPEG                        */ -	LookOutBWPEG  = 7,  /* LookOut WPEG                        */ -} node_type; - -static inline int is_WPEG(struct rio_detail *rio){ -	return (rio->type == CompatWPEG || rio->type == AltWPEG || -		rio->type == LookOutAWPEG || rio->type == LookOutBWPEG); -} - -#define SUMMIT_APIC_DFR_VALUE	(APIC_DFR_CLUSTER) - -static const struct cpumask *summit_target_cpus(void) -{ -	/* CPU_MASK_ALL (0xff) has undefined behaviour with -	 * dest_LowestPrio mode logical clustered apic interrupt routing -	 * Just start on cpu 0.  
IRQ balancing will spread load -	 */ -	return cpumask_of(0); -} - -static unsigned long summit_check_apicid_used(physid_mask_t *map, int apicid) -{ -	return 0; -} - -/* we don't use the phys_cpu_present_map to indicate apicid presence */ -static unsigned long summit_check_apicid_present(int bit) -{ -	return 1; -} - -static int summit_early_logical_apicid(int cpu) -{ -	int count = 0; -	u8 my_id = early_per_cpu(x86_cpu_to_apicid, cpu); -	u8 my_cluster = APIC_CLUSTER(my_id); -#ifdef CONFIG_SMP -	u8 lid; -	int i; - -	/* Create logical APIC IDs by counting CPUs already in cluster. */ -	for (count = 0, i = nr_cpu_ids; --i >= 0; ) { -		lid = early_per_cpu(x86_cpu_to_logical_apicid, i); -		if (lid != BAD_APICID && APIC_CLUSTER(lid) == my_cluster) -			++count; -	} -#endif -	/* We only have a 4 wide bitmap in cluster mode.  If a deranged -	 * BIOS puts 5 CPUs in one APIC cluster, we're hosed. */ -	BUG_ON(count >= XAPIC_DEST_CPUS_SHIFT); -	return my_cluster | (1UL << count); -} - -static void summit_init_apic_ldr(void) -{ -	int cpu = smp_processor_id(); -	unsigned long id = early_per_cpu(x86_cpu_to_logical_apicid, cpu); -	unsigned long val; - -	apic_write(APIC_DFR, SUMMIT_APIC_DFR_VALUE); -	val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; -	val |= SET_APIC_LOGICAL_ID(id); -	apic_write(APIC_LDR, val); -} - -static int summit_apic_id_registered(void) -{ -	return 1; -} - -static void summit_setup_apic_routing(void) -{ -	pr_info("Enabling APIC mode:  Summit.  Using %d I/O APICs\n", -		nr_ioapics); -} - -static int summit_cpu_present_to_apicid(int mps_cpu) -{ -	if (mps_cpu < nr_cpu_ids) -		return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu); -	else -		return BAD_APICID; -} - -static void summit_ioapic_phys_id_map(physid_mask_t *phys_id_map, physid_mask_t *retmap) -{ -	/* For clustered we don't have a good way to do this yet - hack */ -	physids_promote(0x0FL, retmap); -} - -static void summit_apicid_to_cpu_present(int apicid, physid_mask_t *retmap) -{ -	physid_set_mask_of_physid(0, retmap); -} - -static int summit_check_phys_apicid_present(int physical_apicid) -{ -	return 1; -} - -static inline int -summit_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *dest_id) -{ -	unsigned int round = 0; -	unsigned int cpu, apicid = 0; - -	/* -	 * The cpus in the mask must all be on the apic cluster. -	 */ -	for_each_cpu_and(cpu, cpumask, cpu_online_mask) { -		int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu); - -		if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) { -			pr_err("Not a valid mask!\n"); -			return -EINVAL; -		} -		apicid |= new_apicid; -		round++; -	} -	if (!round) -		return -EINVAL; -	*dest_id = apicid; -	return 0; -} - -static int -summit_cpu_mask_to_apicid_and(const struct cpumask *inmask, -			      const struct cpumask *andmask, -			      unsigned int *apicid) -{ -	cpumask_var_t cpumask; -	*apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0); - -	if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC)) -		return 0; - -	cpumask_and(cpumask, inmask, andmask); -	summit_cpu_mask_to_apicid(cpumask, apicid); - -	free_cpumask_var(cpumask); - -	return 0; -} - -/* - * cpuid returns the value latched in the HW at reset, not the APIC ID - * register's value.  For any box whose BIOS changes APIC IDs, like - * clustered APIC systems, we must use hard_smp_processor_id. - * - * See Intel's IA-32 SW Dev's Manual Vol2 under CPUID. 
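In the deleted Summit driver above, a logical APIC ID packs the cluster number into the upper four bits and a one-hot member bit into the lower four, which is why summit_early_logical_apicid() counts the CPUs already assigned to the cluster (and BUGs past four of them) and why summit_cpu_mask_to_apicid() can simply OR together IDs that share a cluster. A standalone sketch of that encoding, with the 4-bit layout taken as an assumption:

#include <stdio.h>

#define CLUSTER_MASK		0xF0	/* assumed: cluster lives in bits 7:4 */
#define CPUS_PER_CLUSTER	4	/* assumed: one-hot member field is 4 bits */

/* cluster is 0..15, member says how many CPUs were already in it (0..3) */
static unsigned int make_logical_id(unsigned int cluster, unsigned int member)
{
	return ((cluster << 4) & CLUSTER_MASK) | (1u << member);
}

int main(void)
{
	unsigned int a = make_logical_id(2, 0);	/* 0x21 */
	unsigned int b = make_logical_id(2, 3);	/* 0x28 */

	/* IDs in the same cluster can be OR-ed into one destination mask. */
	printf("combined destination: 0x%02x\n", a | b);	/* 0x29 */
	return 0;
}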
- */ -static int summit_phys_pkg_id(int cpuid_apic, int index_msb) -{ -	return hard_smp_processor_id() >> index_msb; -} - -static int probe_summit(void) -{ -	/* probed later in mptable/ACPI hooks */ -	return 0; -} - -#ifdef CONFIG_X86_SUMMIT_NUMA -static struct rio_table_hdr *rio_table_hdr; -static struct scal_detail   *scal_devs[MAX_NUMNODES]; -static struct rio_detail    *rio_devs[MAX_NUMNODES*4]; - -#ifndef CONFIG_X86_NUMAQ -static int mp_bus_id_to_node[MAX_MP_BUSSES]; -#endif - -static int setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus) -{ -	int twister = 0, node = 0; -	int i, bus, num_buses; - -	for (i = 0; i < rio_table_hdr->num_rio_dev; i++) { -		if (rio_devs[i]->node_id == rio_devs[wpeg_num]->owner_id) { -			twister = rio_devs[i]->owner_id; -			break; -		} -	} -	if (i == rio_table_hdr->num_rio_dev) { -		pr_err("Couldn't find owner Cyclone for Winnipeg!\n"); -		return last_bus; -	} - -	for (i = 0; i < rio_table_hdr->num_scal_dev; i++) { -		if (scal_devs[i]->node_id == twister) { -			node = scal_devs[i]->node_id; -			break; -		} -	} -	if (i == rio_table_hdr->num_scal_dev) { -		pr_err("Couldn't find owner Twister for Cyclone!\n"); -		return last_bus; -	} - -	switch (rio_devs[wpeg_num]->type) { -	case CompatWPEG: -		/* -		 * The Compatibility Winnipeg controls the 2 legacy buses, -		 * the 66MHz PCI bus [2 slots] and the 2 "extra" buses in case -		 * a PCI-PCI bridge card is used in either slot: total 5 buses. -		 */ -		num_buses = 5; -		break; -	case AltWPEG: -		/* -		 * The Alternate Winnipeg controls the 2 133MHz buses [1 slot -		 * each], their 2 "extra" buses, the 100MHz bus [2 slots] and -		 * the "extra" buses for each of those slots: total 7 buses. -		 */ -		num_buses = 7; -		break; -	case LookOutAWPEG: -	case LookOutBWPEG: -		/* -		 * A Lookout Winnipeg controls 3 100MHz buses [2 slots each] -		 * & the "extra" buses for each of those slots: total 9 buses. -		 */ -		num_buses = 9; -		break; -	default: -		pr_info("Unsupported Winnipeg type!\n"); -		return last_bus; -	} - -	for (bus = last_bus; bus < last_bus + num_buses; bus++) -		mp_bus_id_to_node[bus] = node; -	return bus; -} - -static int build_detail_arrays(void) -{ -	unsigned long ptr; -	int i, scal_detail_size, rio_detail_size; - -	if (rio_table_hdr->num_scal_dev > MAX_NUMNODES) { -		pr_warn("MAX_NUMNODES too low!  
Defined as %d, but system has %d nodes\n", -			MAX_NUMNODES, rio_table_hdr->num_scal_dev); -		return 0; -	} - -	switch (rio_table_hdr->version) { -	default: -		pr_warn("Invalid Rio Grande Table Version: %d\n", -			rio_table_hdr->version); -		return 0; -	case 2: -		scal_detail_size = 11; -		rio_detail_size = 13; -		break; -	case 3: -		scal_detail_size = 12; -		rio_detail_size = 15; -		break; -	} - -	ptr = (unsigned long)rio_table_hdr + 3; -	for (i = 0; i < rio_table_hdr->num_scal_dev; i++, ptr += scal_detail_size) -		scal_devs[i] = (struct scal_detail *)ptr; - -	for (i = 0; i < rio_table_hdr->num_rio_dev; i++, ptr += rio_detail_size) -		rio_devs[i] = (struct rio_detail *)ptr; - -	return 1; -} - -void setup_summit(void) -{ -	unsigned long		ptr; -	unsigned short		offset; -	int			i, next_wpeg, next_bus = 0; - -	/* The pointer to the EBDA is stored in the word @ phys 0x40E(40:0E) */ -	ptr = get_bios_ebda(); -	ptr = (unsigned long)phys_to_virt(ptr); - -	rio_table_hdr = NULL; -	offset = 0x180; -	while (offset) { -		/* The block id is stored in the 2nd word */ -		if (*((unsigned short *)(ptr + offset + 2)) == 0x4752) { -			/* set the pointer past the offset & block id */ -			rio_table_hdr = (struct rio_table_hdr *)(ptr + offset + 4); -			break; -		} -		/* The next offset is stored in the 1st word.  0 means no more */ -		offset = *((unsigned short *)(ptr + offset)); -	} -	if (!rio_table_hdr) { -		pr_err("Unable to locate Rio Grande Table in EBDA - bailing!\n"); -		return; -	} - -	if (!build_detail_arrays()) -		return; - -	/* The first Winnipeg we're looking for has an index of 0 */ -	next_wpeg = 0; -	do { -		for (i = 0; i < rio_table_hdr->num_rio_dev; i++) { -			if (is_WPEG(rio_devs[i]) && rio_devs[i]->WP_index == next_wpeg) { -				/* It's the Winnipeg we're looking for! */ -				next_bus = setup_pci_node_map_for_wpeg(i, next_bus); -				next_wpeg++; -				break; -			} -		} -		/* -		 * If we go through all Rio devices and don't find one with -		 * the next index, it means we've found all the Winnipegs, -		 * and thus all the PCI buses. 
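build_detail_arrays() above walks the Rio Grande table by raw byte offsets because the record size depends on the table version (11/13 bytes for version 2, 12/15 for version 3), so ordinary array indexing on the C structs would step too far. A small standalone sketch of that walking pattern over a made-up buffer, not taken from the patch:

#include <stdio.h>
#include <stdint.h>

struct record {			/* only the leading field is interpreted */
	uint8_t id;
} __attribute__((packed));

int main(void)
{
	/* Hypothetical table: three records of 4 bytes each. */
	uint8_t buf[] = { 10, 0, 0, 0,  20, 0, 0, 0,  30, 0, 0, 0 };
	size_t record_size = 4;	/* would be chosen from the table version */
	const struct record *rec[3];
	uintptr_t p = (uintptr_t)buf;
	int i;

	for (i = 0; i < 3; i++, p += record_size)
		rec[i] = (const struct record *)p;

	for (i = 0; i < 3; i++)
		printf("record %d: id %d\n", i, rec[i]->id);
	return 0;
}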
-		 */ -		if (i == rio_table_hdr->num_rio_dev) -			next_wpeg = 0; -	} while (next_wpeg != 0); -} -#endif - -static struct apic apic_summit = { - -	.name				= "summit", -	.probe				= probe_summit, -	.acpi_madt_oem_check		= summit_acpi_madt_oem_check, -	.apic_id_valid			= default_apic_id_valid, -	.apic_id_registered		= summit_apic_id_registered, - -	.irq_delivery_mode		= dest_LowestPrio, -	/* logical delivery broadcast to all CPUs: */ -	.irq_dest_mode			= 1, - -	.target_cpus			= summit_target_cpus, -	.disable_esr			= 1, -	.dest_logical			= APIC_DEST_LOGICAL, -	.check_apicid_used		= summit_check_apicid_used, -	.check_apicid_present		= summit_check_apicid_present, - -	.vector_allocation_domain	= flat_vector_allocation_domain, -	.init_apic_ldr			= summit_init_apic_ldr, - -	.ioapic_phys_id_map		= summit_ioapic_phys_id_map, -	.setup_apic_routing		= summit_setup_apic_routing, -	.multi_timer_check		= NULL, -	.cpu_present_to_apicid		= summit_cpu_present_to_apicid, -	.apicid_to_cpu_present		= summit_apicid_to_cpu_present, -	.setup_portio_remap		= NULL, -	.check_phys_apicid_present	= summit_check_phys_apicid_present, -	.enable_apic_mode		= NULL, -	.phys_pkg_id			= summit_phys_pkg_id, -	.mps_oem_check			= summit_mps_oem_check, - -	.get_apic_id			= summit_get_apic_id, -	.set_apic_id			= NULL, -	.apic_id_mask			= 0xFF << 24, - -	.cpu_mask_to_apicid_and		= summit_cpu_mask_to_apicid_and, - -	.send_IPI_mask			= summit_send_IPI_mask, -	.send_IPI_mask_allbutself	= NULL, -	.send_IPI_allbutself		= summit_send_IPI_allbutself, -	.send_IPI_all			= summit_send_IPI_all, -	.send_IPI_self			= default_send_IPI_self, - -	.trampoline_phys_low		= DEFAULT_TRAMPOLINE_PHYS_LOW, -	.trampoline_phys_high		= DEFAULT_TRAMPOLINE_PHYS_HIGH, - -	.wait_for_init_deassert		= default_wait_for_init_deassert, - -	.smp_callin_clear_local_apic	= NULL, -	.inquire_remote_apic		= default_inquire_remote_apic, - -	.read				= native_apic_mem_read, -	.write				= native_apic_mem_write, -	.eoi_write			= native_apic_mem_write, -	.icr_read			= native_apic_icr_read, -	.icr_write			= native_apic_icr_write, -	.wait_icr_idle			= native_apic_wait_icr_idle, -	.safe_wait_icr_idle		= native_safe_apic_wait_icr_idle, - -	.x86_32_early_logical_apicid	= summit_early_logical_apicid, -}; - -apic_driver(apic_summit); diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 140e29db478..e66766bf164 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -3,7 +3,6 @@  #include <linux/string.h>  #include <linux/kernel.h>  #include <linux/ctype.h> -#include <linux/init.h>  #include <linux/dmar.h>  #include <linux/cpu.h> @@ -280,7 +279,7 @@ static struct apic apic_x2apic_cluster = {  	.trampoline_phys_low		= DEFAULT_TRAMPOLINE_PHYS_LOW,  	.trampoline_phys_high		= DEFAULT_TRAMPOLINE_PHYS_HIGH, -	.wait_for_init_deassert		= NULL, +	.wait_for_init_deassert		= false,  	.smp_callin_clear_local_apic	= NULL,  	.inquire_remote_apic		= NULL, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 562a76d433c..6d600ebf6c1 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -3,7 +3,6 @@  #include <linux/string.h>  #include <linux/kernel.h>  #include <linux/ctype.h> -#include <linux/init.h>  #include <linux/dmar.h>  #include <asm/smp.h> @@ -134,7 +133,7 @@ static struct apic apic_x2apic_phys = {  	.trampoline_phys_low		= DEFAULT_TRAMPOLINE_PHYS_LOW,  	.trampoline_phys_high		= DEFAULT_TRAMPOLINE_PHYS_HIGH, -	
.wait_for_init_deassert		= NULL, +	.wait_for_init_deassert		= false,  	.smp_callin_clear_local_apic	= NULL,  	.inquire_remote_apic		= NULL, diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 1191ac1c9d2..293b41df54e 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -5,7 +5,7 @@   *   * SGI UV APIC functions (note: not an Intel compatible APIC)   * - * Copyright (C) 2007-2013 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2007-2014 Silicon Graphics, Inc. All rights reserved.   */  #include <linux/cpumask.h>  #include <linux/hardirq.h> @@ -39,12 +39,6 @@  #include <asm/x86_init.h>  #include <asm/nmi.h> -/* BMC sets a bit this MMR non-zero before sending an NMI */ -#define UVH_NMI_MMR				UVH_SCRATCH5 -#define UVH_NMI_MMR_CLEAR			(UVH_NMI_MMR + 8) -#define UV_NMI_PENDING_MASK			(1UL << 63) -DEFINE_PER_CPU(unsigned long, cpu_last_nmi_count); -  DEFINE_PER_CPU(int, x2apic_extra_bits);  #define PR_DEVEL(fmt, args...)	pr_devel("%s: " fmt, __func__, args) @@ -58,7 +52,6 @@ int uv_min_hub_revision_id;  EXPORT_SYMBOL_GPL(uv_min_hub_revision_id);  unsigned int uv_apicid_hibits;  EXPORT_SYMBOL_GPL(uv_apicid_hibits); -static DEFINE_SPINLOCK(uv_nmi_lock);  static struct apic apic_x2apic_uv_x; @@ -113,7 +106,7 @@ static int __init early_get_pnodeid(void)  		break;  	case UV3_HUB_PART_NUMBER:  	case UV3_HUB_PART_NUMBER_X: -		uv_min_hub_revision_id += UV3_HUB_REVISION_BASE - 1; +		uv_min_hub_revision_id += UV3_HUB_REVISION_BASE;  		break;  	} @@ -403,7 +396,7 @@ static struct apic __refdata apic_x2apic_uv_x = {  	.wakeup_secondary_cpu		= uv_wakeup_secondary,  	.trampoline_phys_low		= DEFAULT_TRAMPOLINE_PHYS_LOW,  	.trampoline_phys_high		= DEFAULT_TRAMPOLINE_PHYS_HIGH, -	.wait_for_init_deassert		= NULL, +	.wait_for_init_deassert		= false,  	.smp_callin_clear_local_apic	= NULL,  	.inquire_remote_apic		= NULL, @@ -447,6 +440,20 @@ static __initdata struct redir_addr redir_addrs[] = {  	{UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR, UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR},  }; +static unsigned char get_n_lshift(int m_val) +{ +	union uv3h_gr0_gam_gr_config_u m_gr_config; + +	if (is_uv1_hub()) +		return m_val; + +	if (is_uv2_hub()) +		return m_val == 40 ? 40 : 39; + +	m_gr_config.v = uv_read_local_mmr(UV3H_GR0_GAM_GR_CONFIG); +	return m_gr_config.s3.m_skt; +} +  static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size)  {  	union uvh_rh_gam_alias210_overlay_config_2_mmr_u alias; @@ -847,68 +854,6 @@ void uv_cpu_init(void)  		set_x2apic_extra_bits(uv_hub_info->pnode);  } -/* - * When NMI is received, print a stack trace. - */ -int uv_handle_nmi(unsigned int reason, struct pt_regs *regs) -{ -	unsigned long real_uv_nmi; -	int bid; - -	/* -	 * Each blade has an MMR that indicates when an NMI has been sent -	 * to cpus on the blade. If an NMI is detected, atomically -	 * clear the MMR and update a per-blade NMI count used to -	 * cause each cpu on the blade to notice a new NMI. 
-	 */ -	bid = uv_numa_blade_id(); -	real_uv_nmi = (uv_read_local_mmr(UVH_NMI_MMR) & UV_NMI_PENDING_MASK); - -	if (unlikely(real_uv_nmi)) { -		spin_lock(&uv_blade_info[bid].nmi_lock); -		real_uv_nmi = (uv_read_local_mmr(UVH_NMI_MMR) & UV_NMI_PENDING_MASK); -		if (real_uv_nmi) { -			uv_blade_info[bid].nmi_count++; -			uv_write_local_mmr(UVH_NMI_MMR_CLEAR, UV_NMI_PENDING_MASK); -		} -		spin_unlock(&uv_blade_info[bid].nmi_lock); -	} - -	if (likely(__get_cpu_var(cpu_last_nmi_count) == uv_blade_info[bid].nmi_count)) -		return NMI_DONE; - -	__get_cpu_var(cpu_last_nmi_count) = uv_blade_info[bid].nmi_count; - -	/* -	 * Use a lock so only one cpu prints at a time. -	 * This prevents intermixed output. -	 */ -	spin_lock(&uv_nmi_lock); -	pr_info("UV NMI stack dump cpu %u:\n", smp_processor_id()); -	dump_stack(); -	spin_unlock(&uv_nmi_lock); - -	return NMI_HANDLED; -} - -void uv_register_nmi_notifier(void) -{ -	if (register_nmi_handler(NMI_UNKNOWN, uv_handle_nmi, 0, "uv")) -		printk(KERN_WARNING "UV NMI handler failed to register\n"); -} - -void uv_nmi_init(void) -{ -	unsigned int value; - -	/* -	 * Unmask NMI on all cpus -	 */ -	value = apic_read(APIC_LVT1) | APIC_DM_NMI; -	value &= ~APIC_LVT_MASKED; -	apic_write(APIC_LVT1, value); -} -  void __init uv_system_init(void)  {  	union uvh_rh_gam_config_mmr_u  m_n_config; @@ -918,6 +863,7 @@ void __init uv_system_init(void)  	int gnode_extra, min_pnode = 999999, max_pnode = -1;  	unsigned long mmr_base, present, paddr;  	unsigned short pnode_mask; +	unsigned char n_lshift;  	char *hub = (is_uv1_hub() ? "UV1" :  		    (is_uv2_hub() ? "UV2" :  				    "UV3")); @@ -929,6 +875,7 @@ void __init uv_system_init(void)  	m_val = m_n_config.s.m_skt;  	n_val = m_n_config.s.n_skt;  	pnode_mask = (1 << n_val) - 1; +	n_lshift = get_n_lshift(m_val);  	mmr_base =  	    uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) &  	    ~UV_MMR_ENABLE; @@ -936,8 +883,9 @@ void __init uv_system_init(void)  	node_id.v = uv_read_local_mmr(UVH_NODE_ID);  	gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1;  	gnode_upper = ((unsigned long)gnode_extra  << m_val); -	pr_info("UV: N:%d M:%d pnode_mask:0x%x gnode_upper/extra:0x%lx/0x%x\n", -			n_val, m_val, pnode_mask, gnode_upper, gnode_extra); +	pr_info("UV: N:%d M:%d pnode_mask:0x%x gnode_upper/extra:0x%lx/0x%x n_lshift 0x%x\n", +			n_val, m_val, pnode_mask, gnode_upper, gnode_extra, +			n_lshift);  	pr_info("UV: global MMR base 0x%lx\n", mmr_base); @@ -1004,8 +952,7 @@ void __init uv_system_init(void)  		uv_cpu_hub_info(cpu)->hub_revision = uv_hub_info->hub_revision;  		uv_cpu_hub_info(cpu)->m_shift = 64 - m_val; -		uv_cpu_hub_info(cpu)->n_lshift = is_uv2_1_hub() ? -				(m_val == 40 ? 
40 : 39) : m_val; +		uv_cpu_hub_info(cpu)->n_lshift = n_lshift;  		pnode = uv_apicid_to_pnode(apicid);  		blade = boot_pnode_to_blade(pnode); @@ -1046,9 +993,9 @@ void __init uv_system_init(void)  	map_mmr_high(max_pnode);  	map_mmioh_high(min_pnode, max_pnode); +	uv_nmi_setup();  	uv_cpu_init();  	uv_scir_register_cpu_notifier(); -	uv_register_nmi_notifier();  	proc_mkdir("sgi_uv", NULL);  	/* register Legacy VGA I/O redirection handler */ diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 3ab03430211..58487445141 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -841,24 +841,12 @@ static int apm_do_idle(void)  	u32 eax;  	u8 ret = 0;  	int idled = 0; -	int polling;  	int err = 0; -	polling = !!(current_thread_info()->status & TS_POLLING); -	if (polling) { -		current_thread_info()->status &= ~TS_POLLING; -		/* -		 * TS_POLLING-cleared state must be visible before we -		 * test NEED_RESCHED: -		 */ -		smp_mb(); -	}  	if (!need_resched()) {  		idled = 1;  		ret = apm_bios_call_simple(APM_FUNC_IDLE, 0, 0, &eax, &err);  	} -	if (polling) -		current_thread_info()->status |= TS_POLLING;  	if (!idled)  		return 0; diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 28610822fb3..9f6b9341950 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -32,7 +32,6 @@ void common(void) {  	OFFSET(TI_flags, thread_info, flags);  	OFFSET(TI_status, thread_info, status);  	OFFSET(TI_addr_limit, thread_info, addr_limit); -	OFFSET(TI_preempt_count, thread_info, preempt_count);  	BLANK();  	OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c index e2dbcb7dabd..83a7995625a 100644 --- a/arch/x86/kernel/check.c +++ b/arch/x86/kernel/check.c @@ -91,7 +91,7 @@ void __init setup_bios_corruption_check(void)  	corruption_check_size = round_up(corruption_check_size, PAGE_SIZE); -	for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) { +	for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL) {  		start = clamp_t(phys_addr_t, round_up(start, PAGE_SIZE),  				PAGE_SIZE, corruption_check_size);  		end = clamp_t(phys_addr_t, round_down(end, PAGE_SIZE), diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 47b56a7e99c..7fd54f09b01 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -36,12 +36,13 @@ obj-$(CONFIG_CPU_SUP_AMD)		+= perf_event_amd_iommu.o  endif  obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_p6.o perf_event_knc.o perf_event_p4.o  obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o -obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_intel_uncore.o +obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_intel_uncore.o perf_event_intel_rapl.o  endif  obj-$(CONFIG_X86_MCE)			+= mcheck/  obj-$(CONFIG_MTRR)			+= mtrr/ +obj-$(CONFIG_MICROCODE)			+= microcode/  obj-$(CONFIG_X86_LOCAL_APIC)		+= perfctr-watchdog.o perf_event_amd_ibs.o diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 903a264af98..ce8b8ff0e0e 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -1,5 +1,4 @@  #include <linux/export.h> -#include <linux/init.h>  #include <linux/bitops.h>  #include <linux/elf.h>  #include <linux/mm.h> @@ -219,7 +218,7 @@ static void amd_k7_smp_check(struct cpuinfo_x86 *c)  	 */  	WARN_ONCE(1, "WARNING: This combination of AMD"  		" processors is not suitable for SMP.\n"); -	add_taint(TAINT_UNSAFE_SMP, LOCKDEP_NOW_UNRELIABLE); +	
add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_NOW_UNRELIABLE);  }  static void init_amd_k7(struct cpuinfo_x86 *c) @@ -234,9 +233,7 @@ static void init_amd_k7(struct cpuinfo_x86 *c)  	if (c->x86_model >= 6 && c->x86_model <= 10) {  		if (!cpu_has(c, X86_FEATURE_XMM)) {  			printk(KERN_INFO "Enabling disabled K7/SSE Support.\n"); -			rdmsr(MSR_K7_HWCR, l, h); -			l &= ~0x00008000; -			wrmsr(MSR_K7_HWCR, l, h); +			msr_clear_bit(MSR_K7_HWCR, 15);  			set_cpu_cap(c, X86_FEATURE_XMM);  		}  	} @@ -339,7 +336,7 @@ static void amd_get_topology(struct cpuinfo_x86 *c)  #endif  /* - * On a AMD dual core setup the lower bits of the APIC id distingush the cores. + * On a AMD dual core setup the lower bits of the APIC id distinguish the cores.   * Assumes number of cores is a power of two.   */  static void amd_detect_cmp(struct cpuinfo_x86 *c) @@ -487,7 +484,7 @@ static void early_init_amd(struct cpuinfo_x86 *c)  		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);  		set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);  		if (!check_tsc_unstable()) -			sched_clock_stable = 1; +			set_sched_clock_stable();  	}  #ifdef CONFIG_X86_64 @@ -508,6 +505,10 @@ static void early_init_amd(struct cpuinfo_x86 *c)  			set_cpu_cap(c, X86_FEATURE_EXTD_APICID);  	}  #endif + +	/* F16h erratum 793, CVE-2013-6885 */ +	if (c->x86 == 0x16 && c->x86_model <= 0xf) +		msr_set_bit(MSR_AMD64_LS_CFG, 15);  }  static const int amd_erratum_383[]; @@ -527,11 +528,8 @@ static void init_amd(struct cpuinfo_x86 *c)  	 * Errata 63 for SH-B3 steppings  	 * Errata 122 for all steppings (F+ have it disabled by default)  	 */ -	if (c->x86 == 0xf) { -		rdmsrl(MSR_K7_HWCR, value); -		value |= 1 << 6; -		wrmsrl(MSR_K7_HWCR, value); -	} +	if (c->x86 == 0xf) +		msr_set_bit(MSR_K7_HWCR, 6);  #endif  	early_init_amd(c); @@ -614,14 +612,11 @@ static void init_amd(struct cpuinfo_x86 *c)  	    (c->x86_model >= 0x10) && (c->x86_model <= 0x1f) &&  	    !cpu_has(c, X86_FEATURE_TOPOEXT)) { -		if (!rdmsrl_safe(0xc0011005, &value)) { -			value |= 1ULL << 54; -			wrmsrl_safe(0xc0011005, value); +		if (msr_set_bit(0xc0011005, 54) > 0) {  			rdmsrl(0xc0011005, value); -			if (value & (1ULL << 54)) { +			if (value & BIT_64(54)) {  				set_cpu_cap(c, X86_FEATURE_TOPOEXT); -				printk(KERN_INFO FW_INFO "CPU: Re-enabling " -				  "disabled Topology Extensions Support\n"); +				pr_info(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n");  			}  		}  	} @@ -700,19 +695,12 @@ static void init_amd(struct cpuinfo_x86 *c)  		 * Disable GART TLB Walk Errors on Fam10h. We do this here  		 * because this is always needed when GART is enabled, even in a  		 * kernel which has no MCE support built in. -		 * BIOS should disable GartTlbWlk Errors themself. If -		 * it doesn't do it here as suggested by the BKDG. +		 * BIOS should disable GartTlbWlk Errors already. If +		 * it doesn't, do it here as suggested by the BKDG.  		 *  		 * Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=33012  		 */ -		u64 mask; -		int err; - -		err = rdmsrl_safe(MSR_AMD64_MCx_MASK(4), &mask); -		if (err == 0) { -			mask |= (1 << 10); -			wrmsrl_safe(MSR_AMD64_MCx_MASK(4), mask); -		} +		msr_set_bit(MSR_AMD64_MCx_MASK(4), 10);  		/*  		 * On family 10h BIOS may not have properly enabled WC+ support, @@ -724,10 +712,7 @@ static void init_amd(struct cpuinfo_x86 *c)  		 * NOTE: we want to use the _safe accessors so as not to #GP kvm  		 * guests on older kvm hosts.  		 
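The amd.c hunks above fold the open-coded rdmsr/or/wrmsr sequences into msr_set_bit()/msr_clear_bit(); judging by the "> 0" check on MSR 0xc0011005, the helper is expected to return a negative value on failure, 0 when the bit already had the requested state (no write issued), and a positive value when the MSR was actually rewritten. A standalone sketch of that read-modify-write shape over a fake register, an illustration rather than the kernel implementation:

#include <stdio.h>
#include <stdint.h>

/* Stand-in for the machine's MSR file: one fake 64-bit register. */
static uint64_t fake_msr;

static int fake_rdmsrl(uint64_t *val) { *val = fake_msr; return 0; }
static int fake_wrmsrl(uint64_t val)  { fake_msr = val;  return 0; }

/*
 * Same shape as the helper used above: negative on access error, 0 if the
 * bit already had the requested value (no write), positive if rewritten.
 */
static int fake_msr_set_bit(unsigned int bit)
{
	uint64_t v;

	if (fake_rdmsrl(&v) < 0)
		return -1;
	if (v & (1ULL << bit))
		return 0;			/* already set, nothing to do */
	return fake_wrmsrl(v | (1ULL << bit)) < 0 ? -1 : 1;
}

int main(void)
{
	printf("first call:  %d\n", fake_msr_set_bit(15));	/* 1: written */
	printf("second call: %d\n", fake_msr_set_bit(15));	/* 0: no change */
	return 0;
}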
*/ - -		rdmsrl_safe(MSR_AMD64_BU_CFG2, &value); -		value &= ~(1ULL << 24); -		wrmsrl_safe(MSR_AMD64_BU_CFG2, value); +		msr_clear_bit(MSR_AMD64_BU_CFG2, 24);  		if (cpu_has_amd_erratum(c, amd_erratum_383))  			set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH); @@ -758,10 +743,7 @@ static unsigned int amd_size_cache(struct cpuinfo_x86 *c, unsigned int size)  static void cpu_set_tlb_flushall_shift(struct cpuinfo_x86 *c)  { -	tlb_flushall_shift = 5; - -	if (c->x86 <= 0x11) -		tlb_flushall_shift = 4; +	tlb_flushall_shift = 6;  }  static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c) @@ -790,14 +772,10 @@ static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c)  	}  	/* Handle DTLB 2M and 4M sizes, fall back to L1 if L2 is disabled */ -	if (!((eax >> 16) & mask)) { -		u32 a, b, c, d; - -		cpuid(0x80000005, &a, &b, &c, &d); -		tlb_lld_2m[ENTRIES] = (a >> 16) & 0xff; -	} else { +	if (!((eax >> 16) & mask)) +		tlb_lld_2m[ENTRIES] = (cpuid_eax(0x80000005) >> 16) & 0xff; +	else  		tlb_lld_2m[ENTRIES] = (eax >> 16) & mask; -	}  	/* a 4M entry uses two 2M entries */  	tlb_lld_4m[ENTRIES] = tlb_lld_2m[ENTRIES] >> 1; @@ -823,8 +801,8 @@ static const struct cpu_dev amd_cpu_dev = {  	.c_vendor	= "AMD",  	.c_ident	= { "AuthenticAMD" },  #ifdef CONFIG_X86_32 -	.c_models = { -		{ .vendor = X86_VENDOR_AMD, .family = 4, .model_names = +	.legacy_models = { +		{ .family = 4, .model_names =  		  {  			  [3] = "486 DX/2",  			  [7] = "486 DX/2-WB", @@ -835,7 +813,7 @@ static const struct cpu_dev amd_cpu_dev = {  		  }  		},  	}, -	.c_size_cache	= amd_size_cache, +	.legacy_cache_size = amd_size_cache,  #endif  	.c_early_init   = early_init_amd,  	.c_detect_tlb	= cpu_detect_tlb_amd, diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index fbf6c3bc240..d8fba5c15fb 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -1,6 +1,5 @@  #include <linux/bitops.h>  #include <linux/kernel.h> -#include <linux/init.h>  #include <asm/processor.h>  #include <asm/e820.h> @@ -9,236 +8,6 @@  #include "cpu.h" -#ifdef CONFIG_X86_OOSTORE - -static u32 power2(u32 x) -{ -	u32 s = 1; - -	while (s <= x) -		s <<= 1; - -	return s >>= 1; -} - - -/* - * Set up an actual MCR - */ -static void centaur_mcr_insert(int reg, u32 base, u32 size, int key) -{ -	u32 lo, hi; - -	hi = base & ~0xFFF; -	lo = ~(size-1);		/* Size is a power of 2 so this makes a mask */ -	lo &= ~0xFFF;		/* Remove the ctrl value bits */ -	lo |= key;		/* Attribute we wish to set */ -	wrmsr(reg+MSR_IDT_MCR0, lo, hi); -	mtrr_centaur_report_mcr(reg, lo, hi);	/* Tell the mtrr driver */ -} - -/* - * Figure what we can cover with MCR's - * - * Shortcut: We know you can't put 4Gig of RAM on a winchip - */ -static u32 ramtop(void) -{ -	u32 clip = 0xFFFFFFFFUL; -	u32 top = 0; -	int i; - -	for (i = 0; i < e820.nr_map; i++) { -		unsigned long start, end; - -		if (e820.map[i].addr > 0xFFFFFFFFUL) -			continue; -		/* -		 * Don't MCR over reserved space. Ignore the ISA hole -		 * we frob around that catastrophe already -		 */ -		if (e820.map[i].type == E820_RESERVED) { -			if (e820.map[i].addr >= 0x100000UL && -			    e820.map[i].addr < clip) -				clip = e820.map[i].addr; -			continue; -		} -		start = e820.map[i].addr; -		end = e820.map[i].addr + e820.map[i].size; -		if (start >= end) -			continue; -		if (end > top) -			top = end; -	} -	/* -	 * Everything below 'top' should be RAM except for the ISA hole. 
-	 * Because of the limited MCR's we want to map NV/ACPI into our -	 * MCR range for gunk in RAM -	 * -	 * Clip might cause us to MCR insufficient RAM but that is an -	 * acceptable failure mode and should only bite obscure boxes with -	 * a VESA hole at 15Mb -	 * -	 * The second case Clip sometimes kicks in is when the EBDA is marked -	 * as reserved. Again we fail safe with reasonable results -	 */ -	if (top > clip) -		top = clip; - -	return top; -} - -/* - * Compute a set of MCR's to give maximum coverage - */ -static int centaur_mcr_compute(int nr, int key) -{ -	u32 mem = ramtop(); -	u32 root = power2(mem); -	u32 base = root; -	u32 top = root; -	u32 floor = 0; -	int ct = 0; - -	while (ct < nr) { -		u32 fspace = 0; -		u32 high; -		u32 low; - -		/* -		 * Find the largest block we will fill going upwards -		 */ -		high = power2(mem-top); - -		/* -		 * Find the largest block we will fill going downwards -		 */ -		low = base/2; - -		/* -		 * Don't fill below 1Mb going downwards as there -		 * is an ISA hole in the way. -		 */ -		if (base <= 1024*1024) -			low = 0; - -		/* -		 * See how much space we could cover by filling below -		 * the ISA hole -		 */ - -		if (floor == 0) -			fspace = 512*1024; -		else if (floor == 512*1024) -			fspace = 128*1024; - -		/* And forget ROM space */ - -		/* -		 * Now install the largest coverage we get -		 */ -		if (fspace > high && fspace > low) { -			centaur_mcr_insert(ct, floor, fspace, key); -			floor += fspace; -		} else if (high > low) { -			centaur_mcr_insert(ct, top, high, key); -			top += high; -		} else if (low > 0) { -			base -= low; -			centaur_mcr_insert(ct, base, low, key); -		} else -			break; -		ct++; -	} -	/* -	 * We loaded ct values. We now need to set the mask. The caller -	 * must do this bit. -	 */ -	return ct; -} - -static void centaur_create_optimal_mcr(void) -{ -	int used; -	int i; - -	/* -	 * Allocate up to 6 mcrs to mark as much of ram as possible -	 * as write combining and weak write ordered. -	 * -	 * To experiment with: Linux never uses stack operations for -	 * mmio spaces so we could globally enable stack operation wc -	 * -	 * Load the registers with type 31 - full write combining, all -	 * writes weakly ordered. -	 */ -	used = centaur_mcr_compute(6, 31); - -	/* -	 * Wipe unused MCRs -	 */ -	for (i = used; i < 8; i++) -		wrmsr(MSR_IDT_MCR0+i, 0, 0); -} - -static void winchip2_create_optimal_mcr(void) -{ -	u32 lo, hi; -	int used; -	int i; - -	/* -	 * Allocate up to 6 mcrs to mark as much of ram as possible -	 * as write combining, weak store ordered. -	 * -	 * Load the registers with type 25 -	 *	8	-	weak write ordering -	 *	16	-	weak read ordering -	 *	1	-	write combining -	 */ -	used = centaur_mcr_compute(6, 25); - -	/* -	 * Mark the registers we are using. -	 */ -	rdmsr(MSR_IDT_MCR_CTRL, lo, hi); -	for (i = 0; i < used; i++) -		lo |= 1<<(9+i); -	wrmsr(MSR_IDT_MCR_CTRL, lo, hi); - -	/* -	 * Wipe unused MCRs -	 */ - -	for (i = used; i < 8; i++) -		wrmsr(MSR_IDT_MCR0+i, 0, 0); -} - -/* - * Handle the MCR key on the Winchip 2. 
- */ -static void winchip2_unprotect_mcr(void) -{ -	u32 lo, hi; -	u32 key; - -	rdmsr(MSR_IDT_MCR_CTRL, lo, hi); -	lo &= ~0x1C0;	/* blank bits 8-6 */ -	key = (lo>>17) & 7; -	lo |= key<<6;	/* replace with unlock key */ -	wrmsr(MSR_IDT_MCR_CTRL, lo, hi); -} - -static void winchip2_protect_mcr(void) -{ -	u32 lo, hi; - -	rdmsr(MSR_IDT_MCR_CTRL, lo, hi); -	lo &= ~0x1C0;	/* blank bits 8-6 */ -	wrmsr(MSR_IDT_MCR_CTRL, lo, hi); -} -#endif /* CONFIG_X86_OOSTORE */ -  #define ACE_PRESENT	(1 << 6)  #define ACE_ENABLED	(1 << 7)  #define ACE_FCR		(1 << 28)	/* MSR_VIA_FCR */ @@ -363,20 +132,6 @@ static void init_centaur(struct cpuinfo_x86 *c)  			fcr_clr = DPDC;  			printk(KERN_NOTICE "Disabling bugged TSC.\n");  			clear_cpu_cap(c, X86_FEATURE_TSC); -#ifdef CONFIG_X86_OOSTORE -			centaur_create_optimal_mcr(); -			/* -			 * Enable: -			 *	write combining on non-stack, non-string -			 *	write combining on string, all types -			 *	weak write ordering -			 * -			 * The C6 original lacks weak read order -			 * -			 * Note 0x120 is write only on Winchip 1 -			 */ -			wrmsr(MSR_IDT_MCR_CTRL, 0x01F0001F, 0); -#endif  			break;  		case 8:  			switch (c->x86_mask) { @@ -393,40 +148,12 @@ static void init_centaur(struct cpuinfo_x86 *c)  			fcr_set = ECX8|DSMC|DTLOCK|EMMX|EBRPRED|ERETSTK|  				  E2MMX|EAMD3D;  			fcr_clr = DPDC; -#ifdef CONFIG_X86_OOSTORE -			winchip2_unprotect_mcr(); -			winchip2_create_optimal_mcr(); -			rdmsr(MSR_IDT_MCR_CTRL, lo, hi); -			/* -			 * Enable: -			 *	write combining on non-stack, non-string -			 *	write combining on string, all types -			 *	weak write ordering -			 */ -			lo |= 31; -			wrmsr(MSR_IDT_MCR_CTRL, lo, hi); -			winchip2_protect_mcr(); -#endif  			break;  		case 9:  			name = "3";  			fcr_set = ECX8|DSMC|DTLOCK|EMMX|EBRPRED|ERETSTK|  				  E2MMX|EAMD3D;  			fcr_clr = DPDC; -#ifdef CONFIG_X86_OOSTORE -			winchip2_unprotect_mcr(); -			winchip2_create_optimal_mcr(); -			rdmsr(MSR_IDT_MCR_CTRL, lo, hi); -			/* -			 * Enable: -			 *	write combining on non-stack, non-string -			 *	write combining on string, all types -			 *	weak write ordering -			 */ -			lo |= 31; -			wrmsr(MSR_IDT_MCR_CTRL, lo, hi); -			winchip2_protect_mcr(); -#endif  			break;  		default:  			name = "??"; @@ -468,10 +195,10 @@ static void init_centaur(struct cpuinfo_x86 *c)  #endif  } +#ifdef CONFIG_X86_32  static unsigned int  centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size)  { -#ifdef CONFIG_X86_32  	/* VIA C3 CPUs (670-68F) need further shifting. 
*/  	if ((c->x86 == 6) && ((c->x86_model == 7) || (c->x86_model == 8)))  		size >>= 8; @@ -484,16 +211,18 @@ centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size)  	if ((c->x86 == 6) && (c->x86_model == 9) &&  				(c->x86_mask == 1) && (size == 65))  		size -= 1; -#endif  	return size;  } +#endif  static const struct cpu_dev centaur_cpu_dev = {  	.c_vendor	= "Centaur",  	.c_ident	= { "CentaurHauls" },  	.c_early_init	= early_init_centaur,  	.c_init		= init_centaur, -	.c_size_cache	= centaur_size_cache, +#ifdef CONFIG_X86_32 +	.legacy_cache_size = centaur_size_cache, +#endif  	.c_x86_vendor	= X86_VENDOR_CENTAUR,  }; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 2793d1f095a..ef1b93f18ed 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -8,6 +8,7 @@  #include <linux/delay.h>  #include <linux/sched.h>  #include <linux/init.h> +#include <linux/kprobes.h>  #include <linux/kgdb.h>  #include <linux/smp.h>  #include <linux/io.h> @@ -20,6 +21,7 @@  #include <asm/processor.h>  #include <asm/debugreg.h>  #include <asm/sections.h> +#include <asm/vsyscall.h>  #include <linux/topology.h>  #include <linux/cpumask.h>  #include <asm/pgtable.h> @@ -284,8 +286,13 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c)  	raw_local_save_flags(eflags);  	BUG_ON(eflags & X86_EFLAGS_AC); -	if (cpu_has(c, X86_FEATURE_SMAP)) +	if (cpu_has(c, X86_FEATURE_SMAP)) { +#ifdef CONFIG_X86_SMAP  		set_in_cr4(X86_CR4_SMAP); +#else +		clear_in_cr4(X86_CR4_SMAP); +#endif +	}  }  /* @@ -346,7 +353,8 @@ static void filter_cpuid_features(struct cpuinfo_x86 *c, bool warn)  /* Look up CPU names by table lookup. */  static const char *table_lookup_model(struct cpuinfo_x86 *c)  { -	const struct cpu_model_info *info; +#ifdef CONFIG_X86_32 +	const struct legacy_cpu_model_info *info;  	if (c->x86_model >= 16)  		return NULL;	/* Range check */ @@ -354,13 +362,14 @@ static const char *table_lookup_model(struct cpuinfo_x86 *c)  	if (!this_cpu)  		return NULL; -	info = this_cpu->c_models; +	info = this_cpu->legacy_models; -	while (info && info->family) { +	while (info->family) {  		if (info->family == c->x86)  			return info->model_names[c->x86_model];  		info++;  	} +#endif  	return NULL;		/* Not found */  } @@ -450,8 +459,8 @@ void cpu_detect_cache_sizes(struct cpuinfo_x86 *c)  	c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);  #else  	/* do processor-specific cache resizing */ -	if (this_cpu->c_size_cache) -		l2size = this_cpu->c_size_cache(c, l2size); +	if (this_cpu->legacy_cache_size) +		l2size = this_cpu->legacy_cache_size(c, l2size);  	/* Allow user to override all this if necessary. 
*/  	if (cachesize_override != -1) @@ -470,6 +479,7 @@ u16 __read_mostly tlb_lli_4m[NR_INFO];  u16 __read_mostly tlb_lld_4k[NR_INFO];  u16 __read_mostly tlb_lld_2m[NR_INFO];  u16 __read_mostly tlb_lld_4m[NR_INFO]; +u16 __read_mostly tlb_lld_1g[NR_INFO];  /*   * tlb_flushall_shift shows the balance point in replacing cr3 write @@ -484,13 +494,13 @@ void cpu_detect_tlb(struct cpuinfo_x86 *c)  	if (this_cpu->c_detect_tlb)  		this_cpu->c_detect_tlb(c); -	printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n" \ -		"Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d\n"	     \ +	printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n" +		"Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n"  		"tlb_flushall_shift: %d\n",  		tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES],  		tlb_lli_4m[ENTRIES], tlb_lld_4k[ENTRIES],  		tlb_lld_2m[ENTRIES], tlb_lld_4m[ENTRIES], -		tlb_flushall_shift); +		tlb_lld_1g[ENTRIES], tlb_flushall_shift);  }  void detect_ht(struct cpuinfo_x86 *c) @@ -945,6 +955,38 @@ static void vgetcpu_set_mode(void)  	else  		vgetcpu_mode = VGETCPU_LSL;  } + +/* May not be __init: called during resume */ +static void syscall32_cpu_init(void) +{ +	/* Load these always in case some future AMD CPU supports +	   SYSENTER from compat mode too. */ +	wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); +	wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); +	wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target); + +	wrmsrl(MSR_CSTAR, ia32_cstar_target); +} +#endif + +#ifdef CONFIG_X86_32 +void enable_sep_cpu(void) +{ +	int cpu = get_cpu(); +	struct tss_struct *tss = &per_cpu(init_tss, cpu); + +	if (!boot_cpu_has(X86_FEATURE_SEP)) { +		put_cpu(); +		return; +	} + +	tss->x86_tss.ss1 = __KERNEL_CS; +	tss->x86_tss.sp1 = sizeof(struct tss_struct) + (unsigned long) tss; +	wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); +	wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.sp1, 0); +	wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) ia32_sysenter_target, 0); +	put_cpu(); +}  #endif  void __init identify_boot_cpu(void) @@ -1017,7 +1059,8 @@ __setup("show_msr=", setup_show_msr);  static __init int setup_noclflush(char *arg)  { -	setup_clear_cpu_cap(X86_FEATURE_CLFLSH); +	setup_clear_cpu_cap(X86_FEATURE_CLFLUSH); +	setup_clear_cpu_cap(X86_FEATURE_CLFLUSHOPT);  	return 1;  }  __setup("noclflush", setup_noclflush); @@ -1070,6 +1113,10 @@ static __init int setup_disablecpuid(char *arg)  }  __setup("clearcpuid=", setup_disablecpuid); +DEFINE_PER_CPU(unsigned long, kernel_stack) = +	(unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE; +EXPORT_PER_CPU_SYMBOL(kernel_stack); +  #ifdef CONFIG_X86_64  struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table };  struct desc_ptr debug_idt_descr = { NR_VECTORS * 16 - 1, @@ -1086,15 +1133,14 @@ DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned =  	&init_task;  EXPORT_PER_CPU_SYMBOL(current_task); -DEFINE_PER_CPU(unsigned long, kernel_stack) = -	(unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE; -EXPORT_PER_CPU_SYMBOL(kernel_stack); -  DEFINE_PER_CPU(char *, irq_stack_ptr) =  	init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64;  DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1; +DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; +EXPORT_PER_CPU_SYMBOL(__preempt_count); +  DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);  /* @@ -1148,6 +1194,7 @@ int is_debug_stack(unsigned long addr)  		(addr <= __get_cpu_var(debug_stack_addr) &&  		 addr > 
(__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ));  } +NOKPROBE_SYMBOL(is_debug_stack);  DEFINE_PER_CPU(u32, debug_idt_ctr); @@ -1156,6 +1203,7 @@ void debug_stack_set_zero(void)  	this_cpu_inc(debug_idt_ctr);  	load_current_idt();  } +NOKPROBE_SYMBOL(debug_stack_set_zero);  void debug_stack_reset(void)  { @@ -1164,11 +1212,14 @@ void debug_stack_reset(void)  	if (this_cpu_dec_return(debug_idt_ctr) == 0)  		load_current_idt();  } +NOKPROBE_SYMBOL(debug_stack_reset);  #else	/* CONFIG_X86_64 */  DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;  EXPORT_PER_CPU_SYMBOL(current_task); +DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; +EXPORT_PER_CPU_SYMBOL(__preempt_count);  DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);  #ifdef CONFIG_CC_STACKPROTECTOR diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 4041c24ae7d..c37dc37e831 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -1,12 +1,6 @@  #ifndef ARCH_X86_CPU_H  #define ARCH_X86_CPU_H -struct cpu_model_info { -	int		vendor; -	int		family; -	const char	*model_names[16]; -}; -  /* attempt to consolidate cpu attributes */  struct cpu_dev {  	const char	*c_vendor; @@ -14,15 +8,23 @@ struct cpu_dev {  	/* some have two possibilities for cpuid string */  	const char	*c_ident[2]; -	struct		cpu_model_info c_models[4]; -  	void            (*c_early_init)(struct cpuinfo_x86 *);  	void		(*c_bsp_init)(struct cpuinfo_x86 *);  	void		(*c_init)(struct cpuinfo_x86 *);  	void		(*c_identify)(struct cpuinfo_x86 *);  	void		(*c_detect_tlb)(struct cpuinfo_x86 *); -	unsigned int	(*c_size_cache)(struct cpuinfo_x86 *, unsigned int);  	int		c_x86_vendor; +#ifdef CONFIG_X86_32 +	/* Optional vendor specific routine to obtain the cache size. */ +	unsigned int	(*legacy_cache_size)(struct cpuinfo_x86 *, +					     unsigned int); + +	/* Family/stepping-based lookup table for model names. 
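/*
 * Editor's sketch, not part of the patch: with c_size_cache/c_models replaced
 * by the CONFIG_X86_32-only legacy_cache_size/legacy_models members, a
 * 32-bit-era vendor driver would populate the structure roughly as below.
 * All "example_*" identifiers and strings are placeholders; the real
 * converted instance (intel_cpu_dev) appears further down in this patch.
 */
#ifdef CONFIG_X86_32
static unsigned int example_legacy_cache_size(struct cpuinfo_x86 *c,
					      unsigned int size)
{
	/* no vendor-specific correction needed in this sketch */
	return size;
}
#endif

static const struct cpu_dev example_cpu_dev = {
	.c_vendor	= "Example",
	.c_ident	= { "ExampleVendor" },
#ifdef CONFIG_X86_32
	.legacy_models	= {
		{ .family = 5, .model_names = { [4] = "Example 5x86" } },
	},
	.legacy_cache_size = example_legacy_cache_size,
#endif
	.c_x86_vendor	= X86_VENDOR_UNKNOWN,
};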
*/ +	struct legacy_cpu_model_info { +		int		family; +		const char	*model_names[16]; +	}		legacy_models[5]; +#endif  };  struct _tlb_table { diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index d0969c75ab5..aaf152e7963 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -1,4 +1,3 @@ -#include <linux/init.h>  #include <linux/bitops.h>  #include <linux/delay.h>  #include <linux/pci.h> diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index ec7299566f7..f9e4fdd3b87 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -1,4 +1,3 @@ -#include <linux/init.h>  #include <linux/kernel.h>  #include <linux/string.h> @@ -32,11 +31,8 @@ static void early_init_intel(struct cpuinfo_x86 *c)  	/* Unmask CPUID levels if masked: */  	if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) { -		rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); - -		if (misc_enable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID) { -			misc_enable &= ~MSR_IA32_MISC_ENABLE_LIMIT_CPUID; -			wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable); +		if (msr_clear_bit(MSR_IA32_MISC_ENABLE, +				  MSR_IA32_MISC_ENABLE_LIMIT_CPUID_BIT) > 0) {  			c->cpuid_level = cpuid_eax(0);  			get_cpu_cap(c);  		} @@ -93,7 +89,7 @@ static void early_init_intel(struct cpuinfo_x86 *c)  		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);  		set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);  		if (!check_tsc_unstable()) -			sched_clock_stable = 1; +			set_sched_clock_stable();  	}  	/* Penwell and Cloverview have the TSC which doesn't sleep on S3 */ @@ -130,16 +126,10 @@ static void early_init_intel(struct cpuinfo_x86 *c)  	 * Ingo Molnar reported a Pentium D (model 6) and a Xeon  	 * (model 2) with the same problem.  	 */ -	if (c->x86 == 15) { -		rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); - -		if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) { -			printk(KERN_INFO "kmemcheck: Disabling fast string operations\n"); - -			misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING; -			wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable); -		} -	} +	if (c->x86 == 15) +		if (msr_clear_bit(MSR_IA32_MISC_ENABLE, +				  MSR_IA32_MISC_ENABLE_FAST_STRING_BIT) > 0) +			pr_info("kmemcheck: Disabling fast string operations\n");  #endif  	/* @@ -196,10 +186,16 @@ static void intel_smp_check(struct cpuinfo_x86 *c)  	}  } -static void intel_workarounds(struct cpuinfo_x86 *c) +static int forcepae; +static int __init forcepae_setup(char *__unused)  { -	unsigned long lo, hi; +	forcepae = 1; +	return 1; +} +__setup("forcepae", forcepae_setup); +static void intel_workarounds(struct cpuinfo_x86 *c) +{  #ifdef CONFIG_X86_F00F_BUG  	/*  	 * All current models of Pentium and Pentium with MMX technology CPUs @@ -226,16 +222,26 @@ static void intel_workarounds(struct cpuinfo_x86 *c)  		clear_cpu_cap(c, X86_FEATURE_SEP);  	/* +	 * PAE CPUID issue: many Pentium M report no PAE but may have a +	 * functionally usable PAE implementation. +	 * Forcefully enable PAE if kernel parameter "forcepae" is present. +	 */ +	if (forcepae) { +		printk(KERN_WARNING "PAE forced!\n"); +		set_cpu_cap(c, X86_FEATURE_PAE); +		add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_NOW_UNRELIABLE); +	} + +	/*  	 * P4 Xeon errata 037 workaround.  	 * Hardware prefetcher may cause stale data to be loaded into the cache.  	 
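/*
 * Editor's sketch, not part of the patch: the msr_set_bit()/msr_clear_bit()
 * conversions in this file depend on the helpers returning a positive value
 * only when the MSR content actually changed (non-positive meaning "already
 * in the requested state" or an error).  The open-coded read-modify-write
 * they replace, visible in the removed lines, boils down to something like
 * this; example_msr_set_bit() is an illustration, not the in-tree helper.
 */
static int example_msr_set_bit(u32 msr, u8 bit)
{
	u64 val, mask = 1ULL << bit;

	rdmsrl(msr, val);
	if (val & mask)
		return 0;	/* bit already set, nothing written */

	wrmsrl(msr, val | mask);
	return 1;		/* bit flipped, caller may want to log it */
}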
*/  	if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) { -		rdmsr(MSR_IA32_MISC_ENABLE, lo, hi); -		if ((lo & MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE) == 0) { -			printk (KERN_INFO "CPU: C0 stepping P4 Xeon detected.\n"); -			printk (KERN_INFO "CPU: Disabling hardware prefetching (Errata 037)\n"); -			lo |= MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE; -			wrmsr(MSR_IA32_MISC_ENABLE, lo, hi); +		if (msr_set_bit(MSR_IA32_MISC_ENABLE, +				MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE_BIT) +		    > 0) { +			pr_info("CPU: C0 stepping P4 Xeon detected.\n"); +			pr_info("CPU: Disabling hardware prefetching (Errata 037)\n");  		}  	} @@ -268,10 +274,6 @@ static void intel_workarounds(struct cpuinfo_x86 *c)  	}  #endif -#ifdef CONFIG_X86_NUMAQ -	numaq_tsc_disable(); -#endif -  	intel_smp_check(c);  }  #else @@ -368,6 +370,17 @@ static void init_intel(struct cpuinfo_x86 *c)  	 */  	detect_extended_topology(c); +	if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) { +		/* +		 * let's use the legacy cpuid vector 0x1 and 0x4 for topology +		 * detection. +		 */ +		c->x86_max_cores = intel_num_cpu_cores(c); +#ifdef CONFIG_X86_32 +		detect_ht(c); +#endif +	} +  	l2 = init_intel_cacheinfo(c);  	if (c->cpuid_level > 9) {  		unsigned eax = cpuid_eax(10); @@ -387,7 +400,8 @@ static void init_intel(struct cpuinfo_x86 *c)  			set_cpu_cap(c, X86_FEATURE_PEBS);  	} -	if (c->x86 == 6 && c->x86_model == 29 && cpu_has_clflush) +	if (c->x86 == 6 && cpu_has_clflush && +	    (c->x86_model == 29 || c->x86_model == 46 || c->x86_model == 47))  		set_cpu_cap(c, X86_FEATURE_CLFLUSH_MONITOR);  #ifdef CONFIG_X86_64 @@ -435,17 +449,6 @@ static void init_intel(struct cpuinfo_x86 *c)  		set_cpu_cap(c, X86_FEATURE_P3);  #endif -	if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) { -		/* -		 * let's use the legacy cpuid vector 0x1 and 0x4 for topology -		 * detection. 
-		 */ -		c->x86_max_cores = intel_num_cpu_cores(c); -#ifdef CONFIG_X86_32 -		detect_ht(c); -#endif -	} -  	/* Work around errata */  	srat_detect_node(c); @@ -505,6 +508,7 @@ static unsigned int intel_size_cache(struct cpuinfo_x86 *c, unsigned int size)  #define TLB_DATA0_2M_4M	0x23  #define STLB_4K		0x41 +#define STLB_4K_2M	0x42  static const struct _tlb_table intel_tlb_table[] = {  	{ 0x01, TLB_INST_4K,		32,	" TLB_INST 4 KByte pages, 4-way set associative" }, @@ -525,13 +529,20 @@ static const struct _tlb_table intel_tlb_table[] = {  	{ 0x5b, TLB_DATA_4K_4M,		64,	" TLB_DATA 4 KByte and 4 MByte pages" },  	{ 0x5c, TLB_DATA_4K_4M,		128,	" TLB_DATA 4 KByte and 4 MByte pages" },  	{ 0x5d, TLB_DATA_4K_4M,		256,	" TLB_DATA 4 KByte and 4 MByte pages" }, +	{ 0x61, TLB_INST_4K,		48,	" TLB_INST 4 KByte pages, full associative" }, +	{ 0x63, TLB_DATA_1G,		4,	" TLB_DATA 1 GByte pages, 4-way set associative" }, +	{ 0x76, TLB_INST_2M_4M,		8,	" TLB_INST 2-MByte or 4-MByte pages, fully associative" },  	{ 0xb0, TLB_INST_4K,		128,	" TLB_INST 4 KByte pages, 4-way set associative" },  	{ 0xb1, TLB_INST_2M_4M,		4,	" TLB_INST 2M pages, 4-way, 8 entries or 4M pages, 4-way entries" },  	{ 0xb2, TLB_INST_4K,		64,	" TLB_INST 4KByte pages, 4-way set associative" },  	{ 0xb3, TLB_DATA_4K,		128,	" TLB_DATA 4 KByte pages, 4-way set associative" },  	{ 0xb4, TLB_DATA_4K,		256,	" TLB_DATA 4 KByte pages, 4-way associative" }, +	{ 0xb5, TLB_INST_4K,		64,	" TLB_INST 4 KByte pages, 8-way set ssociative" }, +	{ 0xb6, TLB_INST_4K,		128,	" TLB_INST 4 KByte pages, 8-way set ssociative" },  	{ 0xba, TLB_DATA_4K,		64,	" TLB_DATA 4 KByte pages, 4-way associative" },  	{ 0xc0, TLB_DATA_4K_4M,		8,	" TLB_DATA 4 KByte and 4 MByte pages, 4-way associative" }, +	{ 0xc1, STLB_4K_2M,		1024,	" STLB 4 KByte and 2 MByte pages, 8-way associative" }, +	{ 0xc2, TLB_DATA_2M_4M,		16,	" DTLB 2 MByte/4MByte pages, 4-way associative" },  	{ 0xca, STLB_4K,		512,	" STLB 4 KByte pages, 4-way associative" },  	{ 0x00, 0, 0 }  }; @@ -557,6 +568,20 @@ static void intel_tlb_lookup(const unsigned char desc)  		if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)  			tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;  		break; +	case STLB_4K_2M: +		if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries) +			tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries; +		if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries) +			tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries; +		if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries) +			tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries; +		if (tlb_lld_2m[ENTRIES] < intel_tlb_table[k].entries) +			tlb_lld_2m[ENTRIES] = intel_tlb_table[k].entries; +		if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries) +			tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries; +		if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries) +			tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries; +		break;  	case TLB_INST_ALL:  		if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)  			tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries; @@ -602,6 +627,10 @@ static void intel_tlb_lookup(const unsigned char desc)  		if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)  			tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;  		break; +	case TLB_DATA_1G: +		if (tlb_lld_1g[ENTRIES] < intel_tlb_table[k].entries) +			tlb_lld_1g[ENTRIES] = intel_tlb_table[k].entries; +		break;  	}  } @@ -614,21 +643,17 @@ static void intel_tlb_flushall_shift_set(struct cpuinfo_x86 *c)  	case 0x61d: /* six-core 45 nm xeon "Dunnington" */  		tlb_flushall_shift = -1;  		break; +	case 
0x63a: /* Ivybridge */ +		tlb_flushall_shift = 2; +		break;  	case 0x61a: /* 45 nm nehalem, "Bloomfield" */  	case 0x61e: /* 45 nm nehalem, "Lynnfield" */  	case 0x625: /* 32 nm nehalem, "Clarkdale" */  	case 0x62c: /* 32 nm nehalem, "Gulftown" */  	case 0x62e: /* 45 nm nehalem-ex, "Beckton" */  	case 0x62f: /* 32 nm Xeon E7 */ -		tlb_flushall_shift = 6; -		break;  	case 0x62a: /* SandyBridge */  	case 0x62d: /* SandyBridge, "Romely-EP" */ -		tlb_flushall_shift = 5; -		break; -	case 0x63a: /* Ivybridge */ -		tlb_flushall_shift = 1; -		break;  	default:  		tlb_flushall_shift = 6;  	} @@ -665,8 +690,8 @@ static const struct cpu_dev intel_cpu_dev = {  	.c_vendor	= "Intel",  	.c_ident	= { "GenuineIntel" },  #ifdef CONFIG_X86_32 -	.c_models = { -		{ .vendor = X86_VENDOR_INTEL, .family = 4, .model_names = +	.legacy_models = { +		{ .family = 4, .model_names =  		  {  			  [0] = "486 DX-25/33",  			  [1] = "486 DX-50", @@ -679,7 +704,7 @@ static const struct cpu_dev intel_cpu_dev = {  			  [9] = "486 DX/4-WB"  		  }  		}, -		{ .vendor = X86_VENDOR_INTEL, .family = 5, .model_names = +		{ .family = 5, .model_names =  		  {  			  [0] = "Pentium 60/66 A-step",  			  [1] = "Pentium 60/66", @@ -690,7 +715,7 @@ static const struct cpu_dev intel_cpu_dev = {  			  [8] = "Mobile Pentium MMX"  		  }  		}, -		{ .vendor = X86_VENDOR_INTEL, .family = 6, .model_names = +		{ .family = 6, .model_names =  		  {  			  [0] = "Pentium Pro A-step",  			  [1] = "Pentium Pro", @@ -704,7 +729,7 @@ static const struct cpu_dev intel_cpu_dev = {  			  [11] = "Pentium III (Tualatin)",  		  }  		}, -		{ .vendor = X86_VENDOR_INTEL, .family = 15, .model_names = +		{ .family = 15, .model_names =  		  {  			  [0] = "Pentium 4 (Unknown)",  			  [1] = "Pentium 4 (Willamette)", @@ -714,7 +739,7 @@ static const struct cpu_dev intel_cpu_dev = {  		  }  		},  	}, -	.c_size_cache	= intel_size_cache, +	.legacy_cache_size = intel_size_cache,  #endif  	.c_detect_tlb	= intel_detect_tlb,  	.c_early_init   = early_init_intel, diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 1414c90feab..9c8f7394c61 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -1,5 +1,5 @@  /* - *	Routines to indentify caches on Intel CPU. + *	Routines to identify caches on Intel CPU.   *   *	Changes:   *	Venkatesh Pallipadi	: Adding cache identification through cpuid(4) @@ -730,6 +730,18 @@ unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c)  #endif  	} +#ifdef CONFIG_X86_HT +	/* +	 * If cpu_llc_id is not yet set, this means cpuid_level < 4 which in +	 * turns means that the only possibility is SMT (as indicated in +	 * cpuid1). Since cpuid2 doesn't specify shared caches, and we know +	 * that SMT shares all caches, we can unconditionally set cpu_llc_id to +	 * c->phys_proc_id. +	 */ +	if (per_cpu(cpu_llc_id, cpu) == BAD_APICID) +		per_cpu(cpu_llc_id, cpu) = c->phys_proc_id; +#endif +  	c->x86_cache_size = l3 ? l3 : (l2 ? 
l2 : (l1i+l1d));  	return l2; @@ -1225,21 +1237,24 @@ static struct notifier_block cacheinfo_cpu_notifier = {  static int __init cache_sysfs_init(void)  { -	int i; +	int i, err = 0;  	if (num_cache_leaves == 0)  		return 0; +	cpu_notifier_register_begin();  	for_each_online_cpu(i) { -		int err;  		struct device *dev = get_cpu_device(i);  		err = cache_add_dev(dev);  		if (err) -			return err; +			goto out;  	} -	register_hotcpu_notifier(&cacheinfo_cpu_notifier); -	return 0; +	__register_hotcpu_notifier(&cacheinfo_cpu_notifier); + +out: +	cpu_notifier_register_done(); +	return err;  }  device_initcall(cache_sysfs_init); diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c index 36565373af8..afa9f0d487e 100644 --- a/arch/x86/kernel/cpu/match.c +++ b/arch/x86/kernel/cpu/match.c @@ -47,45 +47,3 @@ const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match)  	return NULL;  }  EXPORT_SYMBOL(x86_match_cpu); - -ssize_t arch_print_cpu_modalias(struct device *dev, -				struct device_attribute *attr, -				char *bufptr) -{ -	int size = PAGE_SIZE; -	int i, n; -	char *buf = bufptr; - -	n = snprintf(buf, size, "x86cpu:vendor:%04X:family:%04X:" -		     "model:%04X:feature:", -		boot_cpu_data.x86_vendor, -		boot_cpu_data.x86, -		boot_cpu_data.x86_model); -	size -= n; -	buf += n; -	size -= 1; -	for (i = 0; i < NCAPINTS*32; i++) { -		if (boot_cpu_has(i)) { -			n = snprintf(buf, size, ",%04X", i); -			if (n >= size) { -				WARN(1, "x86 features overflow page\n"); -				break; -			} -			size -= n; -			buf += n; -		} -	} -	*buf++ = '\n'; -	return buf - bufptr; -} - -int arch_cpu_uevent(struct device *dev, struct kobj_uevent_env *env) -{ -	char *buf = kzalloc(PAGE_SIZE, GFP_KERNEL); -	if (buf) { -		arch_print_cpu_modalias(NULL, NULL, buf); -		add_uevent_var(env, "MODALIAS=%s", buf); -		kfree(buf); -	} -	return 0; -} diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c index cd8b166a173..a1aef953315 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-apei.c +++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c @@ -33,23 +33,28 @@  #include <linux/acpi.h>  #include <linux/cper.h>  #include <acpi/apei.h> +#include <acpi/ghes.h>  #include <asm/mce.h>  #include "mce-internal.h" -void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err) +void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err)  {  	struct mce m; -	/* Only corrected MC is reported */ -	if (!corrected || !(mem_err->validation_bits & -				CPER_MEM_VALID_PHYSICAL_ADDRESS)) +	if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))  		return;  	mce_setup(&m);  	m.bank = 1; -	/* Fake a memory read corrected error with unknown channel */ +	/* Fake a memory read error with unknown channel */  	m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f; + +	if (severity >= GHES_SEV_RECOVERABLE) +		m.status |= MCI_STATUS_UC; +	if (severity >= GHES_SEV_PANIC) +		m.status |= MCI_STATUS_PCC; +  	m.addr = mem_err->physical_addr;  	mce_log(&m);  	mce_notify_irq(); diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index b3218cdee95..9a79c8dbd8e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -60,8 +60,6 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex);  #define SPINUNIT 100	/* 100ns */ -atomic_t mce_entry; -  DEFINE_PER_CPU(unsigned, mce_exception_count);  struct mce_bank *mce_banks __read_mostly; @@ -89,6 +87,9 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);  static DEFINE_PER_CPU(struct mce, 
mces_seen);  static int			cpu_missing; +/* CMCI storm detection filter */ +static DEFINE_PER_CPU(unsigned long, mce_polled_error); +  /*   * MCA banks polled by the period polling timer for corrected events.   * With Intel CMCI, this only has MCA banks which do not support CMCI (if any). @@ -614,6 +615,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)  		if (!(m.status & MCI_STATUS_VAL))  			continue; +		this_cpu_write(mce_polled_error, 1);  		/*  		 * Uncorrected or signalled events are handled by the exception  		 * handler when it is enabled, so don't process those here. @@ -700,8 +702,7 @@ static int mce_timed_out(u64 *t)  	if (!mca_cfg.monarch_timeout)  		goto out;  	if ((s64)*t < SPINUNIT) { -		/* CHECKME: Make panic default for 1 too? */ -		if (mca_cfg.tolerant < 1) +		if (mca_cfg.tolerant <= 1)  			mce_panic("Timeout synchronizing machine check over CPUs",  				  NULL, NULL);  		cpu_missing = 1; @@ -1037,8 +1038,6 @@ void do_machine_check(struct pt_regs *regs, long error_code)  	DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);  	char *msg = "Unknown"; -	atomic_inc(&mce_entry); -  	this_cpu_inc(mce_exception_count);  	if (!cfg->banks) @@ -1168,7 +1167,6 @@ void do_machine_check(struct pt_regs *regs, long error_code)  		mce_report_event(regs);  	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);  out: -	atomic_dec(&mce_entry);  	sync_core();  }  EXPORT_SYMBOL_GPL(do_machine_check); @@ -1278,10 +1276,18 @@ static unsigned long mce_adjust_timer_default(unsigned long interval)  static unsigned long (*mce_adjust_timer)(unsigned long interval) =  	mce_adjust_timer_default; +static int cmc_error_seen(void) +{ +	unsigned long *v = &__get_cpu_var(mce_polled_error); + +	return test_and_clear_bit(0, v); +} +  static void mce_timer_fn(unsigned long data)  {  	struct timer_list *t = &__get_cpu_var(mce_timer);  	unsigned long iv; +	int notify;  	WARN_ON(smp_processor_id() != data); @@ -1296,7 +1302,9 @@ static void mce_timer_fn(unsigned long data)  	 * polling interval, otherwise increase the polling interval.  	 
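/*
 * Editor's sketch, not part of the patch: together with the new per-CPU
 * mce_polled_error flag (set in machine_check_poll() and consumed via
 * test_and_clear_bit() in cmc_error_seen()), the timer picks its next
 * interval from "did anything happen since the last run?".  The arithmetic
 * reduces to the helper below; example_next_interval() is illustrative only,
 * and check_interval_secs stands in for the existing check_interval tunable.
 */
static unsigned long example_next_interval(unsigned long iv, bool activity,
					   int check_interval_secs)
{
	if (activity)	/* poll faster, but no faster than every 10ms */
		return max(iv / 2, (unsigned long)HZ / 100);

	/* otherwise back off, capped at the configured polling period */
	return min(iv * 2, round_jiffies_relative(check_interval_secs * HZ));
}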
*/  	iv = __this_cpu_read(mce_next_interval); -	if (mce_notify_irq()) { +	notify = mce_notify_irq(); +	notify |= cmc_error_seen(); +	if (notify) {  		iv = max(iv / 2, (unsigned long) HZ/100);  	} else {  		iv = min(iv * 2, round_jiffies_relative(check_interval * HZ)); @@ -1638,15 +1646,15 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)  static void mce_start_timer(unsigned int cpu, struct timer_list *t)  { -	unsigned long iv = mce_adjust_timer(check_interval * HZ); - -	__this_cpu_write(mce_next_interval, iv); +	unsigned long iv = check_interval * HZ;  	if (mca_cfg.ignore_ce || !iv)  		return; +	per_cpu(mce_next_interval, cpu) = iv; +  	t->expires = round_jiffies(jiffies + iv); -	add_timer_on(t, smp_processor_id()); +	add_timer_on(t, cpu);  }  static void __mcheck_cpu_init_timer(void) @@ -2272,8 +2280,10 @@ static int mce_device_create(unsigned int cpu)  	dev->release = &mce_device_release;  	err = device_register(dev); -	if (err) +	if (err) { +		put_device(dev);  		return err; +	}  	for (i = 0; mce_device_attrs[i]; i++) {  		err = device_create_file(dev, mce_device_attrs[i]); @@ -2421,28 +2431,67 @@ static __init int mcheck_init_device(void)  	int err;  	int i = 0; -	if (!mce_available(&boot_cpu_data)) -		return -EIO; +	if (!mce_available(&boot_cpu_data)) { +		err = -EIO; +		goto err_out; +	} -	zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL); +	if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) { +		err = -ENOMEM; +		goto err_out; +	}  	mce_init_banks();  	err = subsys_system_register(&mce_subsys, NULL);  	if (err) -		return err; +		goto err_out_mem; +	cpu_notifier_register_begin();  	for_each_online_cpu(i) {  		err = mce_device_create(i); -		if (err) -			return err; +		if (err) { +			/* +			 * Register notifier anyway (and do not unreg it) so +			 * that we don't leave undeleted timers, see notifier +			 * callback above. +			 */ +			__register_hotcpu_notifier(&mce_cpu_notifier); +			cpu_notifier_register_done(); +			goto err_device_create; +		}  	} +	__register_hotcpu_notifier(&mce_cpu_notifier); +	cpu_notifier_register_done(); +  	register_syscore_ops(&mce_syscore_ops); -	register_hotcpu_notifier(&mce_cpu_notifier);  	/* register character device /dev/mcelog */ -	misc_register(&mce_chrdev_device); +	err = misc_register(&mce_chrdev_device); +	if (err) +		goto err_register; + +	return 0; + +err_register: +	unregister_syscore_ops(&mce_syscore_ops); + +err_device_create: +	/* +	 * We didn't keep track of which devices were created above, but +	 * even if we had, the set of online cpus might have changed. +	 * Play safe and remove for every possible cpu, since +	 * mce_device_remove() will do the right thing. 
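/*
 * Editor's sketch, not part of the patch: mcheck, intel_cacheinfo and
 * therm_throt are all converted in this series to the same bring-up pattern,
 * which closes the window between walking the online CPUs and registering
 * the hotplug notifier.  Stripped to its skeleton; example_add_dev() and
 * example_cpu_notifier are placeholders for the subsystem's own pieces.
 */
static struct notifier_block example_cpu_notifier;	/* placeholder */

static int example_add_dev(unsigned int cpu)
{
	/* create the per-CPU sysfs device, timers, etc. */
	return 0;
}

static int __init example_subsys_init(void)
{
	unsigned int cpu;
	int err = 0;

	cpu_notifier_register_begin();		/* hold off CPU hotplug */

	for_each_online_cpu(cpu) {
		err = example_add_dev(cpu);
		if (err)
			break;
	}

	if (!err)
		__register_hotcpu_notifier(&example_cpu_notifier);

	cpu_notifier_register_done();
	return err;
}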
+	 */ +	for_each_possible_cpu(i) +		mce_device_remove(i); + +err_out_mem: +	free_cpumask_var(mce_device_initialized); + +err_out: +	pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err);  	return err;  } diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 4cfe0458ca6..9a316b21df8 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -6,10 +6,10 @@   */  #include <linux/gfp.h> -#include <linux/init.h>  #include <linux/interrupt.h>  #include <linux/percpu.h>  #include <linux/sched.h> +#include <linux/cpumask.h>  #include <asm/apic.h>  #include <asm/processor.h>  #include <asm/msr.h> @@ -42,7 +42,7 @@ static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);   * cmci_discover_lock protects against parallel discovery attempts   * which could race against each other.   */ -static DEFINE_RAW_SPINLOCK(cmci_discover_lock); +static DEFINE_SPINLOCK(cmci_discover_lock);  #define CMCI_THRESHOLD		1  #define CMCI_POLL_INTERVAL	(30 * HZ) @@ -138,6 +138,22 @@ unsigned long mce_intel_adjust_timer(unsigned long interval)  	}  } +static void cmci_storm_disable_banks(void) +{ +	unsigned long flags, *owned; +	int bank; +	u64 val; + +	spin_lock_irqsave(&cmci_discover_lock, flags); +	owned = __get_cpu_var(mce_banks_owned); +	for_each_set_bit(bank, owned, MAX_NR_BANKS) { +		rdmsrl(MSR_IA32_MCx_CTL2(bank), val); +		val &= ~MCI_CTL2_CMCI_EN; +		wrmsrl(MSR_IA32_MCx_CTL2(bank), val); +	} +	spin_unlock_irqrestore(&cmci_discover_lock, flags); +} +  static bool cmci_storm_detect(void)  {  	unsigned int cnt = __this_cpu_read(cmci_storm_cnt); @@ -159,7 +175,7 @@ static bool cmci_storm_detect(void)  	if (cnt <= CMCI_STORM_THRESHOLD)  		return false; -	cmci_clear(); +	cmci_storm_disable_banks();  	__this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE);  	r = atomic_add_return(1, &cmci_storm_on_cpus);  	mce_timer_kick(CMCI_POLL_INTERVAL); @@ -195,7 +211,7 @@ static void cmci_discover(int banks)  	int i;  	int bios_wrong_thresh = 0; -	raw_spin_lock_irqsave(&cmci_discover_lock, flags); +	spin_lock_irqsave(&cmci_discover_lock, flags);  	for (i = 0; i < banks; i++) {  		u64 val;  		int bios_zero_thresh = 0; @@ -250,7 +266,7 @@ static void cmci_discover(int banks)  			WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks)));  		}  	} -	raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); +	spin_unlock_irqrestore(&cmci_discover_lock, flags);  	if (mca_cfg.bios_cmci_threshold && bios_wrong_thresh) {  		pr_info_once(  			"bios_cmci_threshold: Some banks do not have valid thresholds set\n"); @@ -300,10 +316,10 @@ void cmci_clear(void)  	if (!cmci_supported(&banks))  		return; -	raw_spin_lock_irqsave(&cmci_discover_lock, flags); +	spin_lock_irqsave(&cmci_discover_lock, flags);  	for (i = 0; i < banks; i++)  		__cmci_disable_bank(i); -	raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); +	spin_unlock_irqrestore(&cmci_discover_lock, flags);  }  static void cmci_rediscover_work_func(void *arg) @@ -344,9 +360,9 @@ void cmci_disable_bank(int bank)  	if (!cmci_supported(&banks))  		return; -	raw_spin_lock_irqsave(&cmci_discover_lock, flags); +	spin_lock_irqsave(&cmci_discover_lock, flags);  	__cmci_disable_bank(bank); -	raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); +	spin_unlock_irqrestore(&cmci_discover_lock, flags);  }  static void intel_init_cmci(void) diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c index 1c044b1ccc5..a3042989398 100644 --- a/arch/x86/kernel/cpu/mcheck/p5.c +++ 
b/arch/x86/kernel/cpu/mcheck/p5.c @@ -5,7 +5,6 @@  #include <linux/interrupt.h>  #include <linux/kernel.h>  #include <linux/types.h> -#include <linux/init.h>  #include <linux/smp.h>  #include <asm/processor.h> diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 3eec7de76ef..36a1bb6d1ee 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -271,9 +271,6 @@ static void thermal_throttle_remove_dev(struct device *dev)  	sysfs_remove_group(&dev->kobj, &thermal_attr_group);  } -/* Mutex protecting device creation against CPU hotplug: */ -static DEFINE_MUTEX(therm_cpu_lock); -  /* Get notified when a cpu comes on/off. Be hotplug friendly. */  static int  thermal_throttle_cpu_callback(struct notifier_block *nfb, @@ -289,18 +286,14 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb,  	switch (action) {  	case CPU_UP_PREPARE:  	case CPU_UP_PREPARE_FROZEN: -		mutex_lock(&therm_cpu_lock);  		err = thermal_throttle_add_dev(dev, cpu); -		mutex_unlock(&therm_cpu_lock);  		WARN_ON(err);  		break;  	case CPU_UP_CANCELED:  	case CPU_UP_CANCELED_FROZEN:  	case CPU_DEAD:  	case CPU_DEAD_FROZEN: -		mutex_lock(&therm_cpu_lock);  		thermal_throttle_remove_dev(dev); -		mutex_unlock(&therm_cpu_lock);  		break;  	}  	return notifier_from_errno(err); @@ -319,19 +312,16 @@ static __init int thermal_throttle_init_device(void)  	if (!atomic_read(&therm_throt_en))  		return 0; -	register_hotcpu_notifier(&thermal_throttle_cpu_notifier); +	cpu_notifier_register_begin(); -#ifdef CONFIG_HOTPLUG_CPU -	mutex_lock(&therm_cpu_lock); -#endif  	/* connect live CPUs to sysfs */  	for_each_online_cpu(cpu) {  		err = thermal_throttle_add_dev(get_cpu_device(cpu), cpu);  		WARN_ON(err);  	} -#ifdef CONFIG_HOTPLUG_CPU -	mutex_unlock(&therm_cpu_lock); -#endif + +	__register_hotcpu_notifier(&thermal_throttle_cpu_notifier); +	cpu_notifier_register_done();  	return 0;  } @@ -439,14 +429,14 @@ static inline void __smp_thermal_interrupt(void)  	smp_thermal_vector();  } -asmlinkage void smp_thermal_interrupt(struct pt_regs *regs) +asmlinkage __visible void smp_thermal_interrupt(struct pt_regs *regs)  {  	entering_irq();  	__smp_thermal_interrupt();  	exiting_ack_irq();  } -asmlinkage void smp_trace_thermal_interrupt(struct pt_regs *regs) +asmlinkage __visible void smp_trace_thermal_interrupt(struct pt_regs *regs)  {  	entering_irq();  	trace_thermal_apic_entry(THERMAL_APIC_VECTOR); diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c index fe6b1c86645..7245980186e 100644 --- a/arch/x86/kernel/cpu/mcheck/threshold.c +++ b/arch/x86/kernel/cpu/mcheck/threshold.c @@ -24,14 +24,14 @@ static inline void __smp_threshold_interrupt(void)  	mce_threshold_vector();  } -asmlinkage void smp_threshold_interrupt(void) +asmlinkage __visible void smp_threshold_interrupt(void)  {  	entering_irq();  	__smp_threshold_interrupt();  	exiting_ack_irq();  } -asmlinkage void smp_trace_threshold_interrupt(void) +asmlinkage __visible void smp_trace_threshold_interrupt(void)  {  	entering_irq();  	trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR); diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c index e9a701aecaa..7dc5564d0cd 100644 --- a/arch/x86/kernel/cpu/mcheck/winchip.c +++ b/arch/x86/kernel/cpu/mcheck/winchip.c @@ -5,7 +5,6 @@  #include <linux/interrupt.h>  #include <linux/kernel.h>  #include <linux/types.h> -#include <linux/init.h>  #include <asm/processor.h>  #include 
<asm/mce.h> diff --git a/arch/x86/kernel/cpu/microcode/Makefile b/arch/x86/kernel/cpu/microcode/Makefile new file mode 100644 index 00000000000..285c85427c3 --- /dev/null +++ b/arch/x86/kernel/cpu/microcode/Makefile @@ -0,0 +1,7 @@ +microcode-y				:= core.o +obj-$(CONFIG_MICROCODE)			+= microcode.o +microcode-$(CONFIG_MICROCODE_INTEL)	+= intel.o intel_lib.o +microcode-$(CONFIG_MICROCODE_AMD)	+= amd.o +obj-$(CONFIG_MICROCODE_EARLY)		+= core_early.o +obj-$(CONFIG_MICROCODE_INTEL_EARLY)	+= intel_early.o +obj-$(CONFIG_MICROCODE_AMD_EARLY)	+= amd_early.o diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/cpu/microcode/amd.c index af99f71aeb7..8fffd845e22 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -182,10 +182,10 @@ int __apply_microcode_amd(struct microcode_amd *mc_amd)  {  	u32 rev, dummy; -	wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code); +	native_wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code);  	/* verify patch application was successful */ -	rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); +	native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);  	if (rev != mc_amd->hdr.patch_id)  		return -1; @@ -332,6 +332,9 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover)  	patch->patch_id  = mc_hdr->patch_id;  	patch->equiv_cpu = proc_id; +	pr_debug("%s: Added patch_id: 0x%08x, proc_id: 0x%04x\n", +		 __func__, patch->patch_id, proc_id); +  	/* ... and add to cache. */  	update_cache(patch); @@ -390,9 +393,9 @@ enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size)  	if (cpu_data(smp_processor_id()).cpu_index == boot_cpu_data.cpu_index) {  		struct ucode_patch *p = find_patch(smp_processor_id());  		if (p) { -			memset(amd_bsp_mpb, 0, MPB_MAX_SIZE); -			memcpy(amd_bsp_mpb, p->data, min_t(u32, ksize(p->data), -							   MPB_MAX_SIZE)); +			memset(amd_ucode_patch, 0, PATCH_MAX_SIZE); +			memcpy(amd_ucode_patch, p->data, min_t(u32, ksize(p->data), +							       PATCH_MAX_SIZE));  		}  	}  #endif @@ -430,8 +433,8 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device,  	if (c->x86 >= 0x15)  		snprintf(fw_name, sizeof(fw_name), "amd-ucode/microcode_amd_fam%.2xh.bin", c->x86); -	if (request_firmware(&fw, (const char *)fw_name, device)) { -		pr_err("failed to load file %s\n", fw_name); +	if (request_firmware_direct(&fw, (const char *)fw_name, device)) { +		pr_debug("failed to load file %s\n", fw_name);  		goto out;  	} diff --git a/arch/x86/kernel/microcode_amd_early.c b/arch/x86/kernel/cpu/microcode/amd_early.c index 6073104ccaa..617a9e28424 100644 --- a/arch/x86/kernel/microcode_amd_early.c +++ b/arch/x86/kernel/cpu/microcode/amd_early.c @@ -2,6 +2,7 @@   * Copyright (C) 2013 Advanced Micro Devices, Inc.   *   * Author: Jacob Shin <jacob.shin@amd.com> + * Fixes: Borislav Petkov <bp@suse.de>   *   * This program is free software; you can redistribute it and/or modify   * it under the terms of the GNU General Public License version 2 as @@ -15,10 +16,18 @@  #include <asm/setup.h>  #include <asm/microcode_amd.h> -static bool ucode_loaded; +/* + * This points to the current valid container of microcode patches which we will + * save from the initrd before jettisoning its contents. 
+ */ +static u8 *container; +static size_t container_size; +  static u32 ucode_new_rev; -static unsigned long ucode_offset; -static size_t ucode_size; +u8 amd_ucode_patch[PATCH_MAX_SIZE]; +static u16 this_equiv_id; + +struct cpio_data ucode_cpio;  /*   * Microcode patch container file is prepended to the initrd in cpio format. @@ -32,9 +41,6 @@ static struct cpio_data __init find_ucode_in_initrd(void)  	char *path;  	void *start;  	size_t size; -	unsigned long *uoffset; -	size_t *usize; -	struct cpio_data cd;  #ifdef CONFIG_X86_32  	struct boot_params *p; @@ -47,30 +53,50 @@ static struct cpio_data __init find_ucode_in_initrd(void)  	path    = (char *)__pa_nodebug(ucode_path);  	start   = (void *)p->hdr.ramdisk_image;  	size    = p->hdr.ramdisk_size; -	uoffset = (unsigned long *)__pa_nodebug(&ucode_offset); -	usize   = (size_t *)__pa_nodebug(&ucode_size);  #else  	path    = ucode_path;  	start   = (void *)(boot_params.hdr.ramdisk_image + PAGE_OFFSET);  	size    = boot_params.hdr.ramdisk_size; -	uoffset = &ucode_offset; -	usize   = &ucode_size;  #endif -	cd = find_cpio_data(path, start, size, &offset); -	if (!cd.data) -		return cd; +	return find_cpio_data(path, start, size, &offset); +} -	if (*(u32 *)cd.data != UCODE_MAGIC) { -		cd.data = NULL; -		cd.size = 0; -		return cd; -	} +static size_t compute_container_size(u8 *data, u32 total_size) +{ +	size_t size = 0; +	u32 *header = (u32 *)data; + +	if (header[0] != UCODE_MAGIC || +	    header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */ +	    header[2] == 0)                            /* size */ +		return size; + +	size = header[2] + CONTAINER_HDR_SZ; +	total_size -= size; +	data += size; -	*uoffset = (u8 *)cd.data - (u8 *)start; -	*usize   = cd.size; +	while (total_size) { +		u16 patch_size; + +		header = (u32 *)data; + +		if (header[0] != UCODE_UCODE_TYPE) +			break; + +		/* +		 * Sanity-check patch size. 
+		 */ +		patch_size = header[1]; +		if (patch_size > PATCH_MAX_SIZE) +			break; + +		size	   += patch_size + SECTION_HDR_SIZE; +		data	   += patch_size + SECTION_HDR_SIZE; +		total_size -= patch_size + SECTION_HDR_SIZE; +	} -	return cd; +	return size;  }  /* @@ -85,23 +111,22 @@ static struct cpio_data __init find_ucode_in_initrd(void)  static void apply_ucode_in_initrd(void *ucode, size_t size)  {  	struct equiv_cpu_entry *eq; +	size_t *cont_sz;  	u32 *header; -	u8  *data; +	u8  *data, **cont;  	u16 eq_id = 0;  	int offset, left; -	u32 rev, eax; +	u32 rev, eax, ebx, ecx, edx;  	u32 *new_rev; -	unsigned long *uoffset; -	size_t *usize;  #ifdef CONFIG_X86_32  	new_rev = (u32 *)__pa_nodebug(&ucode_new_rev); -	uoffset = (unsigned long *)__pa_nodebug(&ucode_offset); -	usize   = (size_t *)__pa_nodebug(&ucode_size); +	cont_sz = (size_t *)__pa_nodebug(&container_size); +	cont	= (u8 **)__pa_nodebug(&container);  #else  	new_rev = &ucode_new_rev; -	uoffset = &ucode_offset; -	usize   = &ucode_size; +	cont_sz = &container_size; +	cont	= &container;  #endif  	data   = ucode; @@ -109,23 +134,37 @@ static void apply_ucode_in_initrd(void *ucode, size_t size)  	header = (u32 *)data;  	/* find equiv cpu table */ - -	if (header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */ +	if (header[0] != UCODE_MAGIC || +	    header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */  	    header[2] == 0)                            /* size */  		return; -	eax = cpuid_eax(0x00000001); +	eax = 0x00000001; +	ecx = 0; +	native_cpuid(&eax, &ebx, &ecx, &edx);  	while (left > 0) {  		eq = (struct equiv_cpu_entry *)(data + CONTAINER_HDR_SZ); +		*cont = data; + +		/* Advance past the container header */  		offset = header[2] + CONTAINER_HDR_SZ;  		data  += offset;  		left  -= offset;  		eq_id = find_equiv_id(eq, eax); -		if (eq_id) +		if (eq_id) { +			this_equiv_id = eq_id; +			*cont_sz = compute_container_size(*cont, left + offset); + +			/* +			 * truncate how much we need to iterate over in the +			 * ucode update loop below +			 */ +			left = *cont_sz - offset;  			break; +		}  		/*  		 * support multiple container files appended together. 
if this @@ -145,19 +184,18 @@ static void apply_ucode_in_initrd(void *ucode, size_t size)  		/* mark where the next microcode container file starts */  		offset    = data - (u8 *)ucode; -		*uoffset += offset; -		*usize   -= offset;  		ucode     = data;  	}  	if (!eq_id) { -		*usize = 0; +		*cont = NULL; +		*cont_sz = 0;  		return;  	}  	/* find ucode and update if needed */ -	rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax); +	native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax);  	while (left > 0) {  		struct microcode_amd *mc; @@ -168,134 +206,190 @@ static void apply_ucode_in_initrd(void *ucode, size_t size)  			break;  		mc = (struct microcode_amd *)(data + SECTION_HDR_SIZE); -		if (eq_id == mc->hdr.processor_rev_id && rev < mc->hdr.patch_id) -			if (__apply_microcode_amd(mc) == 0) { + +		if (eq_id == mc->hdr.processor_rev_id && rev < mc->hdr.patch_id) { + +			if (!__apply_microcode_amd(mc)) {  				rev = mc->hdr.patch_id;  				*new_rev = rev; + +				/* save ucode patch */ +				memcpy(amd_ucode_patch, mc, +				       min_t(u32, header[1], PATCH_MAX_SIZE));  			} +		}  		offset  = header[1] + SECTION_HDR_SIZE;  		data   += offset;  		left   -= offset;  	} - -	/* mark where this microcode container file ends */ -	offset  = *usize - (data - (u8 *)ucode); -	*usize -= offset; - -	if (!(*new_rev)) -		*usize = 0;  }  void __init load_ucode_amd_bsp(void)  { -	struct cpio_data cd = find_ucode_in_initrd(); -	if (!cd.data) +	struct cpio_data cp; +	void **data; +	size_t *size; + +#ifdef CONFIG_X86_32 +	data =  (void **)__pa_nodebug(&ucode_cpio.data); +	size = (size_t *)__pa_nodebug(&ucode_cpio.size); +#else +	data = &ucode_cpio.data; +	size = &ucode_cpio.size; +#endif + +	cp = find_ucode_in_initrd(); +	if (!cp.data)  		return; -	apply_ucode_in_initrd(cd.data, cd.size); +	*data = cp.data; +	*size = cp.size; + +	apply_ucode_in_initrd(cp.data, cp.size);  }  #ifdef CONFIG_X86_32 -u8 amd_bsp_mpb[MPB_MAX_SIZE]; -  /*   * On 32-bit, since AP's early load occurs before paging is turned on, we   * cannot traverse cpu_equiv_table and pcache in kernel heap memory. So during   * cold boot, AP will apply_ucode_in_initrd() just like the BSP. During - * save_microcode_in_initrd_amd() BSP's patch is copied to amd_bsp_mpb, which - * is used upon resume from suspend. + * save_microcode_in_initrd_amd() BSP's patch is copied to amd_ucode_patch, + * which is used upon resume from suspend.   
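/*
 * Editor's note, not part of the patch: on 32-bit this code runs before
 * paging is enabled, so file-scope variables such as 'container' and
 * 'ucode_new_rev' cannot be dereferenced through their kernel virtual
 * addresses yet; every access goes through __pa_nodebug() first.  A minimal
 * illustration of the pattern (example_read_new_rev() is not a real symbol):
 */
static u32 example_read_new_rev(void)
{
	/* translate the global's virtual address by hand, then dereference */
	u32 *p = (u32 *)__pa_nodebug(&ucode_new_rev);

	return *p;
}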
*/  void load_ucode_amd_ap(void)  {  	struct microcode_amd *mc; -	unsigned long *initrd; -	unsigned long *uoffset;  	size_t *usize; -	void *ucode; +	void **ucode; -	mc = (struct microcode_amd *)__pa(amd_bsp_mpb); +	mc = (struct microcode_amd *)__pa(amd_ucode_patch);  	if (mc->hdr.patch_id && mc->hdr.processor_rev_id) {  		__apply_microcode_amd(mc);  		return;  	} -	initrd  = (unsigned long *)__pa(&initrd_start); -	uoffset = (unsigned long *)__pa(&ucode_offset); -	usize   = (size_t *)__pa(&ucode_size); +	ucode = (void *)__pa_nodebug(&container); +	usize = (size_t *)__pa_nodebug(&container_size); -	if (!*usize || !*initrd) +	if (!*ucode || !*usize)  		return; -	ucode = (void *)((unsigned long)__pa(*initrd) + *uoffset); -	apply_ucode_in_initrd(ucode, *usize); +	apply_ucode_in_initrd(*ucode, *usize);  }  static void __init collect_cpu_sig_on_bsp(void *arg)  {  	unsigned int cpu = smp_processor_id();  	struct ucode_cpu_info *uci = ucode_cpu_info + cpu; +  	uci->cpu_sig.sig = cpuid_eax(0x00000001);  } + +static void __init get_bsp_sig(void) +{ +	unsigned int bsp = boot_cpu_data.cpu_index; +	struct ucode_cpu_info *uci = ucode_cpu_info + bsp; + +	if (!uci->cpu_sig.sig) +		smp_call_function_single(bsp, collect_cpu_sig_on_bsp, NULL, 1); +}  #else  void load_ucode_amd_ap(void)  {  	unsigned int cpu = smp_processor_id();  	struct ucode_cpu_info *uci = ucode_cpu_info + cpu; +	struct equiv_cpu_entry *eq; +	struct microcode_amd *mc;  	u32 rev, eax; +	u16 eq_id; + +	/* Exit if called on the BSP. */ +	if (!cpu) +		return; + +	if (!container) +		return;  	rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax); -	eax = cpuid_eax(0x00000001);  	uci->cpu_sig.rev = rev;  	uci->cpu_sig.sig = eax; -	if (cpu && !ucode_loaded) { -		void *ucode; +	eax = cpuid_eax(0x00000001); +	eq  = (struct equiv_cpu_entry *)(container + CONTAINER_HDR_SZ); + +	eq_id = find_equiv_id(eq, eax); +	if (!eq_id) +		return; -		if (!ucode_size || !initrd_start) -			return; +	if (eq_id == this_equiv_id) { +		mc = (struct microcode_amd *)amd_ucode_patch; + +		if (mc && rev < mc->hdr.patch_id) { +			if (!__apply_microcode_amd(mc)) +				ucode_new_rev = mc->hdr.patch_id; +		} -		ucode = (void *)(initrd_start + ucode_offset); -		eax   = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); -		if (load_microcode_amd(eax, ucode, ucode_size) != UCODE_OK) +	} else { +		if (!ucode_cpio.data)  			return; -		ucode_loaded = true; +		/* +		 * AP has a different equivalence ID than BSP, looks like +		 * mixed-steppings silicon so go through the ucode blob anew. +		 */ +		apply_ucode_in_initrd(ucode_cpio.data, ucode_cpio.size);  	} - -	apply_microcode_amd(cpu);  }  #endif  int __init save_microcode_in_initrd_amd(void)  { +	unsigned long cont;  	enum ucode_state ret; -	void *ucode;  	u32 eax; -#ifdef CONFIG_X86_32 -	unsigned int bsp = boot_cpu_data.cpu_index; -	struct ucode_cpu_info *uci = ucode_cpu_info + bsp; +	if (!container) +		return -EINVAL; -	if (!uci->cpu_sig.sig) -		smp_call_function_single(bsp, collect_cpu_sig_on_bsp, NULL, 1); +#ifdef CONFIG_X86_32 +	get_bsp_sig(); +	cont = (unsigned long)container; +#else +	/* +	 * We need the physical address of the container for both bitness since +	 * boot_params.hdr.ramdisk_image is a physical address. +	 */ +	cont = __pa(container);  #endif + +	/* +	 * Take into account the fact that the ramdisk might get relocated and +	 * therefore we need to recompute the container's position in virtual +	 * memory space. 
+	 */ +	if (relocated_ramdisk) +		container = (u8 *)(__va(relocated_ramdisk) + +			     (cont - boot_params.hdr.ramdisk_image)); +  	if (ucode_new_rev)  		pr_info("microcode: updated early to new patch_level=0x%08x\n",  			ucode_new_rev); -	if (ucode_loaded || !ucode_size || !initrd_start) -		return 0; - -	ucode = (void *)(initrd_start + ucode_offset);  	eax   = cpuid_eax(0x00000001);  	eax   = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); -	ret = load_microcode_amd(eax, ucode, ucode_size); +	ret = load_microcode_amd(eax, container, container_size);  	if (ret != UCODE_OK)  		return -EINVAL; -	ucode_loaded = true; +	/* +	 * This will be freed any msec now, stash patches for the current +	 * family and switch to patch cache for cpu hotplug, etc later. +	 */ +	container = NULL; +	container_size = 0; +  	return 0;  } diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/cpu/microcode/core.c index 15c987698b0..dd9d6190b08 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -97,6 +97,9 @@ MODULE_LICENSE("GPL");  static struct microcode_ops	*microcode_ops; +bool dis_ucode_ldr; +module_param(dis_ucode_ldr, bool, 0); +  /*   * Synchronization.   * @@ -546,6 +549,9 @@ static int __init microcode_init(void)  	struct cpuinfo_x86 *c = &cpu_data(0);  	int error; +	if (dis_ucode_ldr) +		return 0; +  	if (c->x86_vendor == X86_VENDOR_INTEL)  		microcode_ops = init_intel_microcode();  	else if (c->x86_vendor == X86_VENDOR_AMD) diff --git a/arch/x86/kernel/microcode_core_early.c b/arch/x86/kernel/cpu/microcode/core_early.c index be7f8514f57..5f28a64e71e 100644 --- a/arch/x86/kernel/microcode_core_early.c +++ b/arch/x86/kernel/cpu/microcode/core_early.c @@ -17,9 +17,11 @@   *	2 of the License, or (at your option) any later version.   
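/*
 * Editor's note, not part of the patch: the core_early.c changes below wire
 * up a "dis_ucode_ldr" switch so the early loader can be disabled from the
 * kernel command line.  The BSP-side check has to cope with 32-bit running
 * before paging, hence the __pa_nodebug() dance around boot_command_line;
 * the core of it is simply the following (example_loader_disabled() is an
 * illustration, not the in-tree function):
 */
static bool example_loader_disabled(void)
{
	/* cmdline_find_option_bool() returns non-zero if the word is present */
	return cmdline_find_option_bool(boot_command_line, "dis_ucode_ldr");
}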
*/  #include <linux/module.h> +#include <asm/microcode.h>  #include <asm/microcode_intel.h>  #include <asm/microcode_amd.h>  #include <asm/processor.h> +#include <asm/cmdline.h>  #define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24))  #define CPUID_INTEL1 QCHAR('G', 'e', 'n', 'u') @@ -72,10 +74,33 @@ static int x86_family(void)  	return x86;  } +static bool __init check_loader_disabled_bsp(void) +{ +#ifdef CONFIG_X86_32 +	const char *cmdline = (const char *)__pa_nodebug(boot_command_line); +	const char *opt	    = "dis_ucode_ldr"; +	const char *option  = (const char *)__pa_nodebug(opt); +	bool *res = (bool *)__pa_nodebug(&dis_ucode_ldr); + +#else /* CONFIG_X86_64 */ +	const char *cmdline = boot_command_line; +	const char *option  = "dis_ucode_ldr"; +	bool *res = &dis_ucode_ldr; +#endif + +	if (cmdline_find_option_bool(cmdline, option)) +		*res = true; + +	return *res; +} +  void __init load_ucode_bsp(void)  {  	int vendor, x86; +	if (check_loader_disabled_bsp()) +		return; +  	if (!have_cpuid_p())  		return; @@ -96,10 +121,22 @@ void __init load_ucode_bsp(void)  	}  } +static bool check_loader_disabled_ap(void) +{ +#ifdef CONFIG_X86_32 +	return __pa_nodebug(dis_ucode_ldr); +#else +	return dis_ucode_ldr; +#endif +} +  void load_ucode_ap(void)  {  	int vendor, x86; +	if (check_loader_disabled_ap()) +		return; +  	if (!have_cpuid_p())  		return; diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 5fb2cebf556..a276fa75d9b 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -278,7 +278,7 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device,  	sprintf(name, "intel-ucode/%02x-%02x-%02x",  		c->x86, c->x86_model, c->x86_mask); -	if (request_firmware(&firmware, name, device)) { +	if (request_firmware_direct(&firmware, name, device)) {  		pr_debug("data file %s load failed\n", name);  		return UCODE_NFOUND;  	} diff --git a/arch/x86/kernel/microcode_intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c index 1575deb2e63..18f739129e7 100644 --- a/arch/x86/kernel/microcode_intel_early.c +++ b/arch/x86/kernel/cpu/microcode/intel_early.c @@ -365,16 +365,6 @@ out:  	return state;  } -#define native_rdmsr(msr, val1, val2)		\ -do {						\ -	u64 __val = native_read_msr((msr));	\ -	(void)((val1) = (u32)__val);		\ -	(void)((val2) = (u32)(__val >> 32));	\ -} while (0) - -#define native_wrmsr(msr, low, high)		\ -	native_write_msr(msr, low, high); -  static int collect_cpu_info_early(struct ucode_cpu_info *uci)  {  	unsigned int val[2]; diff --git a/arch/x86/kernel/microcode_intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c index ce69320d017..ce69320d017 100644 --- a/arch/x86/kernel/microcode_intel_lib.c +++ b/arch/x86/kernel/cpu/microcode/intel_lib.c diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 71a39f3621b..a450373e8e9 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -15,7 +15,9 @@  #include <linux/clocksource.h>  #include <linux/module.h>  #include <linux/hardirq.h> +#include <linux/efi.h>  #include <linux/interrupt.h> +#include <linux/irq.h>  #include <asm/processor.h>  #include <asm/hypervisor.h>  #include <asm/hyperv.h> @@ -23,10 +25,52 @@  #include <asm/desc.h>  #include <asm/idle.h>  #include <asm/irq_regs.h> +#include <asm/i8259.h> +#include <asm/apic.h> +#include <asm/timer.h>  struct ms_hyperv_info ms_hyperv;  EXPORT_SYMBOL_GPL(ms_hyperv); +#if IS_ENABLED(CONFIG_HYPERV) +static void 
(*vmbus_handler)(void); + +void hyperv_vector_handler(struct pt_regs *regs) +{ +	struct pt_regs *old_regs = set_irq_regs(regs); + +	irq_enter(); +	exit_idle(); + +	inc_irq_stat(irq_hv_callback_count); +	if (vmbus_handler) +		vmbus_handler(); + +	irq_exit(); +	set_irq_regs(old_regs); +} + +void hv_setup_vmbus_irq(void (*handler)(void)) +{ +	vmbus_handler = handler; +	/* +	 * Setup the IDT for hypervisor callback. Prevent reallocation +	 * at module reload. +	 */ +	if (!test_bit(HYPERVISOR_CALLBACK_VECTOR, used_vectors)) +		alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, +				hyperv_callback_vector); +} + +void hv_remove_vmbus_irq(void) +{ +	/* We have no way to deallocate the interrupt gate */ +	vmbus_handler = NULL; +} +EXPORT_SYMBOL_GPL(hv_setup_vmbus_irq); +EXPORT_SYMBOL_GPL(hv_remove_vmbus_irq); +#endif +  static uint32_t  __init ms_hyperv_platform(void)  {  	u32 eax; @@ -76,8 +120,28 @@ static void __init ms_hyperv_init_platform(void)  	printk(KERN_INFO "HyperV: features 0x%x, hints 0x%x\n",  	       ms_hyperv.features, ms_hyperv.hints); +#ifdef CONFIG_X86_LOCAL_APIC +	if (ms_hyperv.features & HV_X64_MSR_APIC_FREQUENCY_AVAILABLE) { +		/* +		 * Get the APIC frequency. +		 */ +		u64	hv_lapic_frequency; + +		rdmsrl(HV_X64_MSR_APIC_FREQUENCY, hv_lapic_frequency); +		hv_lapic_frequency = div_u64(hv_lapic_frequency, HZ); +		lapic_timer_frequency = hv_lapic_frequency; +		printk(KERN_INFO "HyperV: LAPIC Timer Frequency: %#x\n", +				lapic_timer_frequency); +	} +#endif +  	if (ms_hyperv.features & HV_X64_MSR_TIME_REF_COUNT_AVAILABLE)  		clocksource_register_hz(&hyperv_cs, NSEC_PER_SEC/100); + +#ifdef CONFIG_X86_IO_APIC +	no_timer_check = 1; +#endif +  }  const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = { @@ -86,41 +150,3 @@ const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = {  	.init_platform		= ms_hyperv_init_platform,  };  EXPORT_SYMBOL(x86_hyper_ms_hyperv); - -#if IS_ENABLED(CONFIG_HYPERV) -static int vmbus_irq = -1; -static irq_handler_t vmbus_isr; - -void hv_register_vmbus_handler(int irq, irq_handler_t handler) -{ -	/* -	 * Setup the IDT for hypervisor callback. 
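/*
 * Editor's sketch, not part of the patch: with the irq-number/irq_handler_t
 * pair replaced by a plain callback, the vmbus driver is expected to hook in
 * roughly like this (example_vmbus_isr()/example_vmbus_init() are
 * placeholders, not the real hv_vmbus code):
 */
static void example_vmbus_isr(void)
{
	/* drain the Hyper-V message and event pages, kick channel work */
}

static int __init example_vmbus_init(void)
{
	hv_setup_vmbus_irq(example_vmbus_isr);
	return 0;
}

static void __exit example_vmbus_exit(void)
{
	hv_remove_vmbus_irq();	/* leaves the IDT gate in place by design */
}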
-	 */ -	alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, hyperv_callback_vector); - -	vmbus_irq = irq; -	vmbus_isr = handler; -} - -void hyperv_vector_handler(struct pt_regs *regs) -{ -	struct pt_regs *old_regs = set_irq_regs(regs); -	struct irq_desc *desc; - -	irq_enter(); -	exit_idle(); - -	desc = irq_to_desc(vmbus_irq); - -	if (desc) -		generic_handle_irq_desc(vmbus_irq, desc); - -	irq_exit(); -	set_irq_regs(old_regs); -} -#else -void hv_register_vmbus_handler(int irq, irq_handler_t handler) -{ -} -#endif -EXPORT_SYMBOL_GPL(hv_register_vmbus_handler); diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index ce2d0a2c3e4..0e25a1bc5ab 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -683,7 +683,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock)  	}  	/* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */ -	count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); +	count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);  	__flush_tlb();  	/* Save MTRR state */ @@ -697,7 +697,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock)  static void post_set(void) __releases(set_atomicity_lock)  {  	/* Flush TLBs (no need to flush caches - they are disabled) */ -	count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); +	count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);  	__flush_tlb();  	/* Intel (P6) standard MTRRs */ diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 897783b3302..2879ecdaac4 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -118,6 +118,9 @@ static int x86_pmu_extra_regs(u64 config, struct perf_event *event)  			continue;  		if (event->attr.config1 & ~er->valid_mask)  			return -EINVAL; +		/* Check if the extra msrs can be safely accessed*/ +		if (!er->extra_msr_access) +			return -ENXIO;  		reg->idx = er->idx;  		reg->config = event->attr.config1; @@ -303,15 +306,6 @@ int x86_setup_perfctr(struct perf_event *event)  		hwc->sample_period = x86_pmu.max_period;  		hwc->last_period = hwc->sample_period;  		local64_set(&hwc->period_left, hwc->sample_period); -	} else { -		/* -		 * If we have a PMU initialized but no APIC -		 * interrupts, we cannot sample hardware -		 * events (user-space has to fall back and -		 * sample via a hrtimer based software event): -		 */ -		if (!x86_pmu.apic) -			return -EOPNOTSUPP;  	}  	if (attr->type == PERF_TYPE_RAW) @@ -721,6 +715,7 @@ int perf_assign_events(struct perf_event **events, int n,  	return sched.state.unassigned;  } +EXPORT_SYMBOL_GPL(perf_assign_events);  int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)  { @@ -892,7 +887,6 @@ static void x86_pmu_enable(struct pmu *pmu)  		 * hw_perf_group_sched_in() or x86_pmu_enable()  		 *  		 * step1: save events moving to new counters -		 * step2: reprogram moved events into new counters  		 */  		for (i = 0; i < n_running; i++) {  			event = cpuc->event_list[i]; @@ -918,6 +912,9 @@ static void x86_pmu_enable(struct pmu *pmu)  			x86_pmu_stop(event, PERF_EF_UPDATE);  		} +		/* +		 * step2: reprogram moved events into new counters +		 */  		for (i = 0; i < cpuc->n_events; i++) {  			event = cpuc->event_list[i];  			hwc = &event->hw; @@ -1043,7 +1040,7 @@ static int x86_pmu_add(struct perf_event *event, int flags)  	/*  	 * If group events scheduling transaction was started,  	 * skip the schedulability test here, it will be performed -	 * at commit time (->commit_txn) as a whole +	 * at commit time (->commit_txn) as a whole.  	 
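/*
 * Editor's note, not part of the patch: a short worked example of the
 * n_events/n_added/n_txn bookkeeping the new comments describe.  Say 4
 * events are already programmed (n_events = 4, n_added = 0) and a group
 * transaction collects 2 more: after x86_pmu_add() we have n_events = 6,
 * n_added = 2, n_txn = 2.  If the group turns out unschedulable,
 * ->cancel_txn() subtracts n_txn from both n_events and n_added, restoring
 * n_events = 4; on a successful ->commit_txn() the counters are left alone
 * and the pending n_added events are programmed on the next x86_pmu_enable().
 */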
*/  	if (cpuc->group_flag & PERF_EVENT_TXN)  		goto done_collect; @@ -1058,6 +1055,10 @@ static int x86_pmu_add(struct perf_event *event, int flags)  	memcpy(cpuc->assign, assign, n*sizeof(int));  done_collect: +	/* +	 * Commit the collect_events() state. See x86_pmu_del() and +	 * x86_pmu_*_txn(). +	 */  	cpuc->n_events = n;  	cpuc->n_added += n - n0;  	cpuc->n_txn += n - n0; @@ -1183,25 +1184,38 @@ static void x86_pmu_del(struct perf_event *event, int flags)  	 * If we're called during a txn, we don't need to do anything.  	 * The events never got scheduled and ->cancel_txn will truncate  	 * the event_list. +	 * +	 * XXX assumes any ->del() called during a TXN will only be on +	 * an event added during that same TXN.  	 */  	if (cpuc->group_flag & PERF_EVENT_TXN)  		return; +	/* +	 * Not a TXN, therefore cleanup properly. +	 */  	x86_pmu_stop(event, PERF_EF_UPDATE);  	for (i = 0; i < cpuc->n_events; i++) { -		if (event == cpuc->event_list[i]) { +		if (event == cpuc->event_list[i]) +			break; +	} -			if (x86_pmu.put_event_constraints) -				x86_pmu.put_event_constraints(cpuc, event); +	if (WARN_ON_ONCE(i == cpuc->n_events)) /* called ->del() without ->add() ? */ +		return; -			while (++i < cpuc->n_events) -				cpuc->event_list[i-1] = cpuc->event_list[i]; +	/* If we have a newly added event; make sure to decrease n_added. */ +	if (i >= cpuc->n_events - cpuc->n_added) +		--cpuc->n_added; + +	if (x86_pmu.put_event_constraints) +		x86_pmu.put_event_constraints(cpuc, event); + +	/* Delete the array entry. */ +	while (++i < cpuc->n_events) +		cpuc->event_list[i-1] = cpuc->event_list[i]; +	--cpuc->n_events; -			--cpuc->n_events; -			break; -		} -	}  	perf_event_update_userpage(event);  } @@ -1273,24 +1287,25 @@ void perf_events_lapic_init(void)  	apic_write(APIC_LVTPC, APIC_DM_NMI);  } -static int __kprobes +static int  perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs)  { -	int ret;  	u64 start_clock;  	u64 finish_clock; +	int ret;  	if (!atomic_read(&active_events))  		return NMI_DONE; -	start_clock = local_clock(); +	start_clock = sched_clock();  	ret = x86_pmu.handle_irq(regs); -	finish_clock = local_clock(); +	finish_clock = sched_clock();  	perf_sample_event_took(finish_clock - start_clock);  	return ret;  } +NOKPROBE_SYMBOL(perf_event_nmi_handler);  struct event_constraint emptyconstraint;  struct event_constraint unconstrained; @@ -1346,6 +1361,15 @@ static void __init pmu_check_apic(void)  	x86_pmu.apic = 0;  	pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");  	pr_info("no hardware sampling interrupt available.\n"); + +	/* +	 * If we have a PMU initialized but no APIC +	 * interrupts, we cannot sample hardware +	 * events (user-space has to fall back and +	 * sample via a hrtimer based software event): +	 */ +	pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT; +  }  static struct attribute_group x86_pmu_format_group = { @@ -1521,6 +1545,8 @@ static int __init init_hw_perf_events(void)  	pr_cont("%s PMU driver.\n", x86_pmu.name); +	x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */ +  	for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next)  		quirk->func(); @@ -1534,7 +1560,6 @@ static int __init init_hw_perf_events(void)  		__EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,  				   0, x86_pmu.num_counters, 0, 0); -	x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */  	x86_pmu_format_group.attrs = x86_pmu.format_attrs;  	if (x86_pmu.event_attrs) @@ -1594,7 +1619,8 @@ static void 
x86_pmu_cancel_txn(struct pmu *pmu)  {  	__this_cpu_and(cpu_hw_events.group_flag, ~PERF_EVENT_TXN);  	/* -	 * Truncate the collected events. +	 * Truncate collected array by the number of events added in this +	 * transaction. See x86_pmu_add() and x86_pmu_*_txn().  	 */  	__this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn));  	__this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn)); @@ -1605,6 +1631,8 @@ static void x86_pmu_cancel_txn(struct pmu *pmu)   * Commit group events scheduling transaction   * Perform the group schedulability test as a whole   * Return 0 if success + * + * Does not cancel the transaction on failure; expects the caller to do this.   */  static int x86_pmu_commit_txn(struct pmu *pmu)  { @@ -1820,9 +1848,12 @@ static ssize_t set_attr_rdpmc(struct device *cdev,  	if (ret)  		return ret; +	if (x86_pmu.attr_rdpmc_broken) +		return -ENOTSUPP; +  	if (!!val != !!x86_pmu.attr_rdpmc) {  		x86_pmu.attr_rdpmc = !!val; -		smp_call_function(change_rdpmc, (void *)val, 1); +		on_each_cpu(change_rdpmc, (void *)val, 1);  	}  	return count; @@ -1883,26 +1914,27 @@ static struct pmu pmu = {  void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)  { +	struct cyc2ns_data *data; +  	userpg->cap_user_time = 0;  	userpg->cap_user_time_zero = 0;  	userpg->cap_user_rdpmc = x86_pmu.attr_rdpmc;  	userpg->pmc_width = x86_pmu.cntval_bits; -	if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) +	if (!sched_clock_stable())  		return; -	if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) -		return; +	data = cyc2ns_read_begin();  	userpg->cap_user_time = 1; -	userpg->time_mult = this_cpu_read(cyc2ns); -	userpg->time_shift = CYC2NS_SCALE_FACTOR; -	userpg->time_offset = this_cpu_read(cyc2ns_offset) - now; +	userpg->time_mult = data->cyc2ns_mul; +	userpg->time_shift = data->cyc2ns_shift; +	userpg->time_offset = data->cyc2ns_offset - now; -	if (sched_clock_stable && !check_tsc_disabled()) { -		userpg->cap_user_time_zero = 1; -		userpg->time_zero = this_cpu_read(cyc2ns_offset); -	} +	userpg->cap_user_time_zero = 1; +	userpg->time_zero = data->cyc2ns_offset; + +	cyc2ns_read_end(data);  }  /* @@ -1994,7 +2026,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)  		frame.return_address = 0;  		bytes = copy_from_user_nmi(&frame, fp, sizeof(frame)); -		if (bytes != sizeof(frame)) +		if (bytes != 0)  			break;  		if (!valid_user_frame(fp, sizeof(frame))) @@ -2046,7 +2078,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)  		frame.return_address = 0;  		bytes = copy_from_user_nmi(&frame, fp, sizeof(frame)); -		if (bytes != sizeof(frame)) +		if (bytes != 0)  			break;  		if (!valid_user_frame(fp, sizeof(frame))) diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index cc16faae053..8ade93111e0 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -130,9 +130,11 @@ struct cpu_hw_events {  	unsigned long		running[BITS_TO_LONGS(X86_PMC_IDX_MAX)];  	int			enabled; -	int			n_events; -	int			n_added; -	int			n_txn; +	int			n_events; /* the # of events in the below arrays */ +	int			n_added;  /* the # last events in the below arrays; +					     they've never been enabled yet */ +	int			n_txn;    /* the # last events in the below arrays; +					     added in the current transaction */  	int			assign[X86_PMC_IDX_MAX]; /* event to counter assignment */  	u64			tags[X86_PMC_IDX_MAX];  	struct perf_event	*event_list[X86_PMC_IDX_MAX]; /* in enabled 
order */ @@ -164,6 +166,11 @@ struct cpu_hw_events {  	struct perf_guest_switch_msr	guest_switch_msrs[X86_PMC_IDX_MAX];  	/* +	 * Intel checkpoint mask +	 */ +	u64				intel_cp_status; + +	/*  	 * manage shared (per-core, per-cpu) registers  	 * used on Intel NHM/WSM/SNB  	 */ @@ -257,11 +264,20 @@ struct cpu_hw_events {  	__EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \  			  HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW) -#define EVENT_CONSTRAINT_END		\ -	EVENT_CONSTRAINT(0, 0, 0) +/* + * We define the end marker as having a weight of -1 + * to enable blacklisting of events using a counter bitmask + * of zero and thus a weight of zero. + * The end marker has a weight that cannot possibly be + * obtained from counting the bits in the bitmask. + */ +#define EVENT_CONSTRAINT_END { .weight = -1 } +/* + * Check for end marker with weight == -1 + */  #define for_each_event_constraint(e, c)	\ -	for ((e) = (c); (e)->weight; (e)++) +	for ((e) = (c); (e)->weight != -1; (e)++)  /*   * Extra registers for specific events. @@ -279,14 +295,16 @@ struct extra_reg {  	u64			config_mask;  	u64			valid_mask;  	int			idx;  /* per_xxx->regs[] reg index */ +	bool			extra_msr_access;  };  #define EVENT_EXTRA_REG(e, ms, m, vm, i) {	\ -	.event = (e),		\ -	.msr = (ms),		\ -	.config_mask = (m),	\ -	.valid_mask = (vm),	\ -	.idx = EXTRA_REG_##i,	\ +	.event = (e),			\ +	.msr = (ms),			\ +	.config_mask = (m),		\ +	.valid_mask = (vm),		\ +	.idx = EXTRA_REG_##i,		\ +	.extra_msr_access = true,	\  	}  #define INTEL_EVENT_EXTRA_REG(event, msr, vm, idx)	\ @@ -395,6 +413,7 @@ struct x86_pmu {  	/*  	 * sysfs attrs  	 */ +	int		attr_rdpmc_broken;  	int		attr_rdpmc;  	struct attribute **format_attrs;  	struct attribute **event_attrs; @@ -440,6 +459,7 @@ struct x86_pmu {  	int		lbr_nr;			   /* hardware stack size */  	u64		lbr_sel_mask;		   /* LBR_SELECT valid bits */  	const int	*lbr_sel_map;		   /* lbr_select mappings */ +	bool		lbr_double_abort;	   /* duplicated lbr aborts */  	/*  	 * Extra registers for events diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index e09f0bfb7b8..cbb1be3ed9e 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -10,6 +10,7 @@  #include <linux/module.h>  #include <linux/pci.h>  #include <linux/ptrace.h> +#include <linux/syscore_ops.h>  #include <asm/apic.h> @@ -592,7 +593,7 @@ out:  	return 1;  } -static int __kprobes +static int  perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs)  {  	int handled = 0; @@ -605,6 +606,7 @@ perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs)  	return handled;  } +NOKPROBE_SYMBOL(perf_ibs_nmi_handler);  static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name)  { @@ -816,6 +818,18 @@ out:  	return ret;  } +static void ibs_eilvt_setup(void) +{ +	/* +	 * Force LVT offset assignment for family 10h: The offsets are +	 * not assigned by the BIOS for this family, so the OS is +	 * responsible for doing it. If the OS assignment fails, fall +	 * back to BIOS settings and try to setup this. 
+	 */ +	if (boot_cpu_data.x86 == 0x10) +		force_ibs_eilvt_setup(); +} +  static inline int get_ibs_lvt_offset(void)  {  	u64 val; @@ -851,6 +865,36 @@ static void clear_APIC_ibs(void *dummy)  		setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1);  } +#ifdef CONFIG_PM + +static int perf_ibs_suspend(void) +{ +	clear_APIC_ibs(NULL); +	return 0; +} + +static void perf_ibs_resume(void) +{ +	ibs_eilvt_setup(); +	setup_APIC_ibs(NULL); +} + +static struct syscore_ops perf_ibs_syscore_ops = { +	.resume		= perf_ibs_resume, +	.suspend	= perf_ibs_suspend, +}; + +static void perf_ibs_pm_init(void) +{ +	register_syscore_ops(&perf_ibs_syscore_ops); +} + +#else + +static inline void perf_ibs_pm_init(void) { } + +#endif +  static int  perf_ibs_cpu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)  { @@ -877,25 +921,19 @@ static __init int amd_ibs_init(void)  	if (!caps)  		return -ENODEV;	/* ibs not supported by the cpu */ -	/* -	 * Force LVT offset assignment for family 10h: The offsets are -	 * not assigned by the BIOS for this family, so the OS is -	 * responsible for doing it. If the OS assignment fails, fall -	 * back to BIOS settings and try to setup this. -	 */ -	if (boot_cpu_data.x86 == 0x10) -		force_ibs_eilvt_setup(); +	ibs_eilvt_setup();  	if (!ibs_eilvt_valid())  		goto out; -	get_online_cpus(); +	perf_ibs_pm_init(); +	cpu_notifier_register_begin();  	ibs_caps = caps;  	/* make ibs_caps visible to other cpus: */  	smp_mb(); -	perf_cpu_notifier(perf_ibs_cpu_notifier);  	smp_call_function(setup_APIC_ibs, NULL, 1); -	put_online_cpus(); +	__perf_cpu_notifier(perf_ibs_cpu_notifier); +	cpu_notifier_register_done();  	ret = perf_event_ibs_init();  out: diff --git a/arch/x86/kernel/cpu/perf_event_amd_uncore.c b/arch/x86/kernel/cpu/perf_event_amd_uncore.c index 754291adec3..3bbdf4cd38b 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_amd_uncore.c @@ -531,15 +531,16 @@ static int __init amd_uncore_init(void)  	if (ret)  		return -ENODEV; -	get_online_cpus(); +	cpu_notifier_register_begin(); +  	/* init cpus already online before registering for hotplug notifier */  	for_each_online_cpu(cpu) {  		amd_uncore_cpu_up_prepare(cpu);  		smp_call_function_single(cpu, init_cpu_already_online, NULL, 1);  	} -	register_cpu_notifier(&amd_uncore_cpu_notifier_block); -	put_online_cpus(); +	__register_cpu_notifier(&amd_uncore_cpu_notifier_block); +	cpu_notifier_register_done();  	return 0;  } diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index f31a1655d1f..2502d0d9d24 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -169,7 +169,6 @@ static struct event_constraint intel_slm_event_constraints[] __read_mostly =  {  	FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */  	FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ -	FIXED_EVENT_CONSTRAINT(0x013c, 2), /* CPU_CLK_UNHALTED.REF */  	FIXED_EVENT_CONSTRAINT(0x0300, 2), /* pseudo CPU_CLK_UNHALTED.REF */  	EVENT_CONSTRAINT_END  }; @@ -190,9 +189,9 @@ static struct extra_reg intel_snbep_extra_regs[] __read_mostly = {  	EVENT_EXTRA_END  }; -EVENT_ATTR_STR(mem-loads, mem_ld_nhm, "event=0x0b,umask=0x10,ldlat=3"); -EVENT_ATTR_STR(mem-loads, mem_ld_snb, "event=0xcd,umask=0x1,ldlat=3"); -EVENT_ATTR_STR(mem-stores, mem_st_snb, "event=0xcd,umask=0x2"); +EVENT_ATTR_STR(mem-loads,	mem_ld_nhm,	"event=0x0b,umask=0x10,ldlat=3"); +EVENT_ATTR_STR(mem-loads,	mem_ld_snb,	"event=0xcd,umask=0x1,ldlat=3"); 
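For context, both amd_ibs_init() and amd_uncore_init() in the hunks above switch from get_online_cpus()/put_online_cpus() to the cpu_notifier_register_begin()/cpu_notifier_register_done() pair, so that per-CPU setup and notifier registration happen atomically with respect to CPU hotplug. A minimal sketch of that pattern follows; it is not part of this patch, and the example_* names are hypothetical:

static struct notifier_block example_cpu_nb = {
	.notifier_call = example_cpu_notifier,	/* hypothetical hotplug callback */
};

static int __init example_pmu_init(void)
{
	int cpu;

	cpu_notifier_register_begin();

	/* initialize CPUs that are already online */
	for_each_online_cpu(cpu)
		example_cpu_prepare(cpu);	/* hypothetical per-CPU setup */

	/* register for future hotplug events while hotplug is still excluded */
	__register_cpu_notifier(&example_cpu_nb);

	cpu_notifier_register_done();
	return 0;
}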
+EVENT_ATTR_STR(mem-stores,	mem_st_snb,	"event=0xcd,umask=0x2");  struct attribute *nhm_events_attrs[] = {  	EVENT_PTR(mem_ld_nhm), @@ -1184,6 +1183,11 @@ static void intel_pmu_disable_fixed(struct hw_perf_event *hwc)  	wrmsrl(hwc->config_base, ctrl_val);  } +static inline bool event_is_checkpointed(struct perf_event *event) +{ +	return (event->hw.config & HSW_IN_TX_CHECKPOINTED) != 0; +} +  static void intel_pmu_disable_event(struct perf_event *event)  {  	struct hw_perf_event *hwc = &event->hw; @@ -1197,6 +1201,7 @@ static void intel_pmu_disable_event(struct perf_event *event)  	cpuc->intel_ctrl_guest_mask &= ~(1ull << hwc->idx);  	cpuc->intel_ctrl_host_mask &= ~(1ull << hwc->idx); +	cpuc->intel_cp_status &= ~(1ull << hwc->idx);  	/*  	 * must disable before any actual event @@ -1271,6 +1276,9 @@ static void intel_pmu_enable_event(struct perf_event *event)  	if (event->attr.exclude_guest)  		cpuc->intel_ctrl_host_mask |= (1ull << hwc->idx); +	if (unlikely(event_is_checkpointed(event))) +		cpuc->intel_cp_status |= (1ull << hwc->idx); +  	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {  		intel_pmu_enable_fixed(hwc);  		return; @@ -1289,6 +1297,17 @@ static void intel_pmu_enable_event(struct perf_event *event)  int intel_pmu_save_and_restart(struct perf_event *event)  {  	x86_perf_event_update(event); +	/* +	 * For a checkpointed counter always reset back to 0.  This +	 * avoids a situation where the counter overflows, aborts the +	 * transaction and is then set back to shortly before the +	 * overflow, and overflows and aborts again. +	 */ +	if (unlikely(event_is_checkpointed(event))) { +		/* No race with NMIs because the counter should not be armed */ +		wrmsrl(event->hw.event_base, 0); +		local64_set(&event->hw.prev_count, 0); +	}  	return x86_perf_event_set_period(event);  } @@ -1341,10 +1360,8 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)  	intel_pmu_disable_all();  	handled = intel_pmu_drain_bts_buffer();  	status = intel_pmu_get_status(); -	if (!status) { -		intel_pmu_enable_all(0); -		return handled; -	} +	if (!status) +		goto done;  	loops = 0;  again: @@ -1365,6 +1382,15 @@ again:  	intel_pmu_lbr_read();  	/* +	 * CondChgd bit 63 doesn't mean any overflow status. Ignore +	 * and clear the bit. +	 */ +	if (__test_and_clear_bit(63, (unsigned long *)&status)) { +		if (!status) +			goto done; +	} + +	/*  	 * PEBS overflow sets bit 62 in the global status register  	 */  	if (__test_and_clear_bit(62, (unsigned long *)&status)) { @@ -1372,6 +1398,13 @@ again:  		x86_pmu.drain_pebs(regs);  	} +	/* +	 * Checkpointed counters can lead to 'spurious' PMIs because the +	 * rollback caused by the PMI will have cleared the overflow status +	 * bit. Therefore always force probe these counters. +	 */ +	status |= cpuc->intel_cp_status; +  	for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {  		struct perf_event *event = cpuc->events[bit]; @@ -1837,6 +1870,20 @@ static int hsw_hw_config(struct perf_event *event)  	      event->attr.precise_ip > 0))  		return -EOPNOTSUPP; +	if (event_is_checkpointed(event)) { +		/* +		 * Sampling of checkpointed events can cause situations where +		 * the CPU constantly aborts because of a overflow, which is +		 * then checkpointed back and ignored. Forbid checkpointing +		 * for sampling. +		 * +		 * But still allow a long sampling period, so that perf stat +		 * from KVM works. 
+		 */ +		if (event->attr.sample_period > 0 && +		    event->attr.sample_period < 0x7fffffff) +			return -EOPNOTSUPP; +	}  	return 0;  } @@ -2135,6 +2182,41 @@ static void intel_snb_check_microcode(void)  	}  } +/* + * Under certain circumstances, access certain MSR may cause #GP. + * The function tests if the input MSR can be safely accessed. + */ +static bool check_msr(unsigned long msr, u64 mask) +{ +	u64 val_old, val_new, val_tmp; + +	/* +	 * Read the current value, change it and read it back to see if it +	 * matches, this is needed to detect certain hardware emulators +	 * (qemu/kvm) that don't trap on the MSR access and always return 0s. +	 */ +	if (rdmsrl_safe(msr, &val_old)) +		return false; + +	/* +	 * Only change the bits which can be updated by wrmsrl. +	 */ +	val_tmp = val_old ^ mask; +	if (wrmsrl_safe(msr, val_tmp) || +	    rdmsrl_safe(msr, &val_new)) +		return false; + +	if (val_new != val_tmp) +		return false; + +	/* Here it's sure that the MSR can be safely accessed. +	 * Restore the old value and return. +	 */ +	wrmsrl(msr, val_old); + +	return true; +} +  static __init void intel_sandybridge_quirk(void)  {  	x86_pmu.check_microcode = intel_snb_check_microcode; @@ -2182,10 +2264,36 @@ static __init void intel_nehalem_quirk(void)  	}  } -EVENT_ATTR_STR(mem-loads,      mem_ld_hsw,     "event=0xcd,umask=0x1,ldlat=3"); -EVENT_ATTR_STR(mem-stores,     mem_st_hsw,     "event=0xd0,umask=0x82") +EVENT_ATTR_STR(mem-loads,	mem_ld_hsw,	"event=0xcd,umask=0x1,ldlat=3"); +EVENT_ATTR_STR(mem-stores,	mem_st_hsw,	"event=0xd0,umask=0x82") + +/* Haswell special events */ +EVENT_ATTR_STR(tx-start,	tx_start,	"event=0xc9,umask=0x1"); +EVENT_ATTR_STR(tx-commit,	tx_commit,	"event=0xc9,umask=0x2"); +EVENT_ATTR_STR(tx-abort,	tx_abort,	"event=0xc9,umask=0x4"); +EVENT_ATTR_STR(tx-capacity,	tx_capacity,	"event=0x54,umask=0x2"); +EVENT_ATTR_STR(tx-conflict,	tx_conflict,	"event=0x54,umask=0x1"); +EVENT_ATTR_STR(el-start,	el_start,	"event=0xc8,umask=0x1"); +EVENT_ATTR_STR(el-commit,	el_commit,	"event=0xc8,umask=0x2"); +EVENT_ATTR_STR(el-abort,	el_abort,	"event=0xc8,umask=0x4"); +EVENT_ATTR_STR(el-capacity,	el_capacity,	"event=0x54,umask=0x2"); +EVENT_ATTR_STR(el-conflict,	el_conflict,	"event=0x54,umask=0x1"); +EVENT_ATTR_STR(cycles-t,	cycles_t,	"event=0x3c,in_tx=1"); +EVENT_ATTR_STR(cycles-ct,	cycles_ct,	"event=0x3c,in_tx=1,in_tx_cp=1");  static struct attribute *hsw_events_attrs[] = { +	EVENT_PTR(tx_start), +	EVENT_PTR(tx_commit), +	EVENT_PTR(tx_abort), +	EVENT_PTR(tx_capacity), +	EVENT_PTR(tx_conflict), +	EVENT_PTR(el_start), +	EVENT_PTR(el_commit), +	EVENT_PTR(el_abort), +	EVENT_PTR(el_capacity), +	EVENT_PTR(el_conflict), +	EVENT_PTR(cycles_t), +	EVENT_PTR(cycles_ct),  	EVENT_PTR(mem_ld_hsw),  	EVENT_PTR(mem_st_hsw),  	NULL @@ -2198,7 +2306,8 @@ __init int intel_pmu_init(void)  	union cpuid10_ebx ebx;  	struct event_constraint *c;  	unsigned int unused; -	int version; +	struct extra_reg *er; +	int version, i;  	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {  		switch (boot_cpu_data.x86) { @@ -2243,10 +2352,7 @@ __init int intel_pmu_init(void)  	if (version > 1)  		x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3); -	/* -	 * v2 and above have a perf capabilities MSR -	 */ -	if (version > 1) { +	if (boot_cpu_has(X86_FEATURE_PDCM)) {  		u64 capabilities;  		rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities); @@ -2404,6 +2510,9 @@ __init int intel_pmu_init(void)  	case 62: /* IvyBridge EP */  		memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,  		       
sizeof(hw_cache_event_ids)); +		/* dTLB-load-misses on IVB is different than SNB */ +		hw_cache_event_ids[C(DTLB)][C(OP_READ)][C(RESULT_MISS)] = 0x8108; /* DTLB_LOAD_MISSES.DEMAND_LD_MISS_CAUSES_A_WALK */ +  		memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs,  		       sizeof(hw_cache_extra_regs)); @@ -2452,6 +2561,7 @@ __init int intel_pmu_init(void)  		x86_pmu.hw_config = hsw_hw_config;  		x86_pmu.get_event_constraints = hsw_get_event_constraints;  		x86_pmu.cpu_events = hsw_events_attrs; +		x86_pmu.lbr_double_abort = true;  		pr_cont("Haswell events, ");  		break; @@ -2503,6 +2613,34 @@ __init int intel_pmu_init(void)  		}  	} +	/* +	 * Access LBR MSR may cause #GP under certain circumstances. +	 * E.g. KVM doesn't support LBR MSR +	 * Check all LBT MSR here. +	 * Disable LBR access if any LBR MSRs can not be accessed. +	 */ +	if (x86_pmu.lbr_nr && !check_msr(x86_pmu.lbr_tos, 0x3UL)) +		x86_pmu.lbr_nr = 0; +	for (i = 0; i < x86_pmu.lbr_nr; i++) { +		if (!(check_msr(x86_pmu.lbr_from + i, 0xffffUL) && +		      check_msr(x86_pmu.lbr_to + i, 0xffffUL))) +			x86_pmu.lbr_nr = 0; +	} + +	/* +	 * Access extra MSR may cause #GP under certain circumstances. +	 * E.g. KVM doesn't support offcore event +	 * Check all extra_regs here. +	 */ +	if (x86_pmu.extra_regs) { +		for (er = x86_pmu.extra_regs; er->msr; er++) { +			er->extra_msr_access = check_msr(er->msr, 0x1ffUL); +			/* Disable LBR select mapping */ +			if ((er->idx == EXTRA_REG_LBR) && !er->extra_msr_access) +				x86_pmu.lbr_sel_map = NULL; +		} +	} +  	/* Support full width counters using alternative MSR range */  	if (x86_pmu.intel_cap.full_width_write) {  		x86_pmu.max_period = x86_pmu.cntval_mask; diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index ab3ba1c1b7d..696ade311de 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -12,6 +12,7 @@  #define BTS_BUFFER_SIZE		(PAGE_SIZE << 4)  #define PEBS_BUFFER_SIZE	PAGE_SIZE +#define PEBS_FIXUP_SIZE		PAGE_SIZE  /*   * pebs_record_32 for p4 and core not supported @@ -107,15 +108,31 @@ static u64 precise_store_data(u64 status)  	return val;  } -static u64 precise_store_data_hsw(u64 status) +static u64 precise_store_data_hsw(struct perf_event *event, u64 status)  {  	union perf_mem_data_src dse; +	u64 cfg = event->hw.config & INTEL_ARCH_EVENT_MASK;  	dse.val = 0;  	dse.mem_op = PERF_MEM_OP_STORE;  	dse.mem_lvl = PERF_MEM_LVL_NA; + +	/* +	 * L1 info only valid for following events: +	 * +	 * MEM_UOPS_RETIRED.STLB_MISS_STORES +	 * MEM_UOPS_RETIRED.LOCK_STORES +	 * MEM_UOPS_RETIRED.SPLIT_STORES +	 * MEM_UOPS_RETIRED.ALL_STORES +	 */ +	if (cfg != 0x12d0 && cfg != 0x22d0 && cfg != 0x42d0 && cfg != 0x82d0) +		return dse.mem_lvl; +  	if (status & 1) -		dse.mem_lvl = PERF_MEM_LVL_L1; +		dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT; +	else +		dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_MISS; +  	/* Nothing else supported. Sorry. */  	return dse.val;  } @@ -182,18 +199,32 @@ struct pebs_record_nhm {   * Same as pebs_record_nhm, with two additional fields.   */  struct pebs_record_hsw { -	struct pebs_record_nhm nhm; -	/* -	 * Real IP of the event. In the Intel documentation this -	 * is called eventingrip. -	 */ -	u64 real_ip; -	/* -	 * TSX tuning information field: abort cycles and abort flags. 
-	 */ -	u64 tsx_tuning; +	u64 flags, ip; +	u64 ax, bx, cx, dx; +	u64 si, di, bp, sp; +	u64 r8,  r9,  r10, r11; +	u64 r12, r13, r14, r15; +	u64 status, dla, dse, lat; +	u64 real_ip, tsx_tuning; +}; + +union hsw_tsx_tuning { +	struct { +		u32 cycles_last_block     : 32, +		    hle_abort		  : 1, +		    rtm_abort		  : 1, +		    instruction_abort     : 1, +		    non_instruction_abort : 1, +		    retry		  : 1, +		    data_conflict	  : 1, +		    capacity_writes	  : 1, +		    capacity_reads	  : 1; +	}; +	u64	    value;  }; +#define PEBS_HSW_TSX_FLAGS	0xff00000000ULL +  void init_debug_store_on_cpu(int cpu)  {  	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; @@ -214,12 +245,14 @@ void fini_debug_store_on_cpu(int cpu)  	wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);  } +static DEFINE_PER_CPU(void *, insn_buffer); +  static int alloc_pebs_buffer(int cpu)  {  	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;  	int node = cpu_to_node(cpu);  	int max, thresh = 1; /* always use a single PEBS record */ -	void *buffer; +	void *buffer, *ibuffer;  	if (!x86_pmu.pebs)  		return 0; @@ -228,6 +261,19 @@ static int alloc_pebs_buffer(int cpu)  	if (unlikely(!buffer))  		return -ENOMEM; +	/* +	 * HSW+ already provides us the eventing ip; no need to allocate this +	 * buffer then. +	 */ +	if (x86_pmu.intel_cap.pebs_format < 2) { +		ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node); +		if (!ibuffer) { +			kfree(buffer); +			return -ENOMEM; +		} +		per_cpu(insn_buffer, cpu) = ibuffer; +	} +  	max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size;  	ds->pebs_buffer_base = (u64)(unsigned long)buffer; @@ -248,6 +294,9 @@ static void release_pebs_buffer(int cpu)  	if (!ds || !x86_pmu.pebs)  		return; +	kfree(per_cpu(insn_buffer, cpu)); +	per_cpu(insn_buffer, cpu) = NULL; +  	kfree((void *)(unsigned long)ds->pebs_buffer_base);  	ds->pebs_buffer_base = 0;  } @@ -262,9 +311,11 @@ static int alloc_bts_buffer(int cpu)  	if (!x86_pmu.bts)  		return 0; -	buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL, node); -	if (unlikely(!buffer)) +	buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node); +	if (unlikely(!buffer)) { +		WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__);  		return -ENOMEM; +	}  	max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;  	thresh = max / 16; @@ -715,6 +766,7 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)  	unsigned long old_to, to = cpuc->lbr_entries[0].to;  	unsigned long ip = regs->ip;  	int is_64bit = 0; +	void *kaddr;  	/*  	 * We don't need to fixup if the PEBS assist is fault like @@ -738,7 +790,7 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)  	 * unsigned math, either ip is before the start (impossible) or  	 * the basic block is larger than 1 page (sanity)  	 */ -	if ((ip - to) > PAGE_SIZE) +	if ((ip - to) > PEBS_FIXUP_SIZE)  		return 0;  	/* @@ -749,29 +801,33 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)  		return 1;  	} +	if (!kernel_ip(ip)) { +		int size, bytes; +		u8 *buf = this_cpu_read(insn_buffer); + +		size = ip - to; /* Must fit our buffer, see above */ +		bytes = copy_from_user_nmi(buf, (void __user *)to, size); +		if (bytes != 0) +			return 0; + +		kaddr = buf; +	} else { +		kaddr = (void *)to; +	} +  	do {  		struct insn insn; -		u8 buf[MAX_INSN_SIZE]; -		void *kaddr;  		old_to = to; -		if (!kernel_ip(ip)) { -			int bytes, size = MAX_INSN_SIZE; - -			bytes = copy_from_user_nmi(buf, (void __user *)to, size); -			if (bytes != size) -				return 0; - -			kaddr = buf; -		} else -			kaddr = (void *)to;  #ifdef 
CONFIG_X86_64  		is_64bit = kernel_ip(to) || !test_thread_flag(TIF_IA32);  #endif  		insn_init(&insn, kaddr, is_64bit);  		insn_get_length(&insn); +  		to += insn.length; +		kaddr += insn.length;  	} while (to < ip);  	if (to == ip) { @@ -786,16 +842,34 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)  	return 0;  } +static inline u64 intel_hsw_weight(struct pebs_record_hsw *pebs) +{ +	if (pebs->tsx_tuning) { +		union hsw_tsx_tuning tsx = { .value = pebs->tsx_tuning }; +		return tsx.cycles_last_block; +	} +	return 0; +} + +static inline u64 intel_hsw_transaction(struct pebs_record_hsw *pebs) +{ +	u64 txn = (pebs->tsx_tuning & PEBS_HSW_TSX_FLAGS) >> 32; + +	/* For RTM XABORTs also log the abort code from AX */ +	if ((txn & PERF_TXN_TRANSACTION) && (pebs->ax & 1)) +		txn |= ((pebs->ax >> 24) & 0xff) << PERF_TXN_ABORT_SHIFT; +	return txn; +} +  static void __intel_pmu_pebs_event(struct perf_event *event,  				   struct pt_regs *iregs, void *__pebs)  {  	/* -	 * We cast to pebs_record_nhm to get the load latency data -	 * if extra_reg MSR_PEBS_LD_LAT_THRESHOLD used +	 * We cast to the biggest pebs_record but are careful not to +	 * unconditionally access the 'extra' entries.  	 */  	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); -	struct pebs_record_nhm *pebs = __pebs; -	struct pebs_record_hsw *pebs_hsw = __pebs; +	struct pebs_record_hsw *pebs = __pebs;  	struct perf_sample_data data;  	struct pt_regs regs;  	u64 sample_type; @@ -831,7 +905,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event,  				data.data_src.val = load_latency_data(pebs->dse);  			else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW)  				data.data_src.val = -					precise_store_data_hsw(pebs->dse); +					precise_store_data_hsw(event, pebs->dse);  			else  				data.data_src.val = precise_store_data(pebs->dse);  		} @@ -854,7 +928,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event,  	regs.sp = pebs->sp;  	if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format >= 2) { -		regs.ip = pebs_hsw->real_ip; +		regs.ip = pebs->real_ip;  		regs.flags |= PERF_EFLAGS_EXACT;  	} else if (event->attr.precise_ip > 1 && intel_pmu_pebs_fixup_ip(®s))  		regs.flags |= PERF_EFLAGS_EXACT; @@ -862,9 +936,18 @@ static void __intel_pmu_pebs_event(struct perf_event *event,  		regs.flags &= ~PERF_EFLAGS_EXACT;  	if ((event->attr.sample_type & PERF_SAMPLE_ADDR) && -		x86_pmu.intel_cap.pebs_format >= 1) +	    x86_pmu.intel_cap.pebs_format >= 1)  		data.addr = pebs->dla; +	if (x86_pmu.intel_cap.pebs_format >= 2) { +		/* Only set the TSX weight when no memory weight. 
*/ +		if ((event->attr.sample_type & PERF_SAMPLE_WEIGHT) && !fll) +			data.weight = intel_hsw_weight(pebs); + +		if (event->attr.sample_type & PERF_SAMPLE_TRANSACTION) +			data.txn = intel_hsw_transaction(pebs); +	} +  	if (has_branch_stack(event))  		data.br_stack = &cpuc->lbr_stack; @@ -913,17 +996,34 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)  	__intel_pmu_pebs_event(event, iregs, at);  } -static void __intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, void *at, -					void *top) +static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)  {  	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);  	struct debug_store *ds = cpuc->ds;  	struct perf_event *event = NULL; +	void *at, *top;  	u64 status = 0;  	int bit; +	if (!x86_pmu.pebs_active) +		return; + +	at  = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base; +	top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index; +  	ds->pebs_index = ds->pebs_buffer_base; +	if (unlikely(at > top)) +		return; + +	/* +	 * Should not happen, we program the threshold at 1 and do not +	 * set a reset value. +	 */ +	WARN_ONCE(top - at > x86_pmu.max_pebs_events * x86_pmu.pebs_record_size, +		  "Unexpected number of pebs records %ld\n", +		  (long)(top - at) / x86_pmu.pebs_record_size); +  	for (; at < top; at += x86_pmu.pebs_record_size) {  		struct pebs_record_nhm *p = at; @@ -951,61 +1051,6 @@ static void __intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, void *at,  	}  } -static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) -{ -	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); -	struct debug_store *ds = cpuc->ds; -	struct pebs_record_nhm *at, *top; -	int n; - -	if (!x86_pmu.pebs_active) -		return; - -	at  = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base; -	top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index; - -	ds->pebs_index = ds->pebs_buffer_base; - -	n = top - at; -	if (n <= 0) -		return; - -	/* -	 * Should not happen, we program the threshold at 1 and do not -	 * set a reset value. -	 */ -	WARN_ONCE(n > x86_pmu.max_pebs_events, -		  "Unexpected number of pebs records %d\n", n); - -	return __intel_pmu_drain_pebs_nhm(iregs, at, top); -} - -static void intel_pmu_drain_pebs_hsw(struct pt_regs *iregs) -{ -	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); -	struct debug_store *ds = cpuc->ds; -	struct pebs_record_hsw *at, *top; -	int n; - -	if (!x86_pmu.pebs_active) -		return; - -	at  = (struct pebs_record_hsw *)(unsigned long)ds->pebs_buffer_base; -	top = (struct pebs_record_hsw *)(unsigned long)ds->pebs_index; - -	n = top - at; -	if (n <= 0) -		return; -	/* -	 * Should not happen, we program the threshold at 1 and do not -	 * set a reset value. 
-	 */ -	WARN_ONCE(n > x86_pmu.max_pebs_events, -		  "Unexpected number of pebs records %d\n", n); - -	return __intel_pmu_drain_pebs_nhm(iregs, at, top); -} -  /*   * BTS, PEBS probe and setup   */ @@ -1040,7 +1085,7 @@ void intel_ds_init(void)  		case 2:  			pr_cont("PEBS fmt2%c, ", pebs_type);  			x86_pmu.pebs_record_size = sizeof(struct pebs_record_hsw); -			x86_pmu.drain_pebs = intel_pmu_drain_pebs_hsw; +			x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;  			break;  		default: diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index d5be06a5005..9dd2459a4c7 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -284,6 +284,7 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)  	int lbr_format = x86_pmu.intel_cap.lbr_format;  	u64 tos = intel_pmu_lbr_tos();  	int i; +	int out = 0;  	for (i = 0; i < x86_pmu.lbr_nr; i++) {  		unsigned long lbr_idx = (tos - i) & mask; @@ -306,15 +307,27 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)  		}  		from = (u64)((((s64)from) << skip) >> skip); -		cpuc->lbr_entries[i].from	= from; -		cpuc->lbr_entries[i].to		= to; -		cpuc->lbr_entries[i].mispred	= mis; -		cpuc->lbr_entries[i].predicted	= pred; -		cpuc->lbr_entries[i].in_tx	= in_tx; -		cpuc->lbr_entries[i].abort	= abort; -		cpuc->lbr_entries[i].reserved	= 0; +		/* +		 * Some CPUs report duplicated abort records, +		 * with the second entry not having an abort bit set. +		 * Skip them here. This loop runs backwards, +		 * so we need to undo the previous record. +		 * If the abort just happened outside the window +		 * the extra entry cannot be removed. +		 */ +		if (abort && x86_pmu.lbr_double_abort && out > 0) +			out--; + +		cpuc->lbr_entries[out].from	 = from; +		cpuc->lbr_entries[out].to	 = to; +		cpuc->lbr_entries[out].mispred	 = mis; +		cpuc->lbr_entries[out].predicted = pred; +		cpuc->lbr_entries[out].in_tx	 = in_tx; +		cpuc->lbr_entries[out].abort	 = abort; +		cpuc->lbr_entries[out].reserved	 = 0; +		out++;  	} -	cpuc->lbr_stack.nr = i; +	cpuc->lbr_stack.nr = out;  }  void intel_pmu_lbr_read(void) @@ -371,6 +384,9 @@ static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event)  	if (br_type & PERF_SAMPLE_BRANCH_NO_TX)  		mask |= X86_BR_NO_TX; +	if (br_type & PERF_SAMPLE_BRANCH_COND) +		mask |= X86_BR_JCC; +  	/*  	 * stash actual user request into reg, it may  	 * be used by fixup code for some CPU @@ -478,7 +494,7 @@ static int branch_type(unsigned long from, unsigned long to, int abort)  		/* may fail if text not present */  		bytes = copy_from_user_nmi(buf, (void __user *)from, size); -		if (bytes != size) +		if (bytes != 0)  			return X86_BR_NONE;  		addr = buf; @@ -665,6 +681,7 @@ static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = {  	 * NHM/WSM erratum: must include IND_JMP to capture IND_CALL  	 */  	[PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL | LBR_IND_JMP, +	[PERF_SAMPLE_BRANCH_COND]     = LBR_JCC,  };  static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = { @@ -676,6 +693,7 @@ static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = {  	[PERF_SAMPLE_BRANCH_ANY_CALL]	= LBR_REL_CALL | LBR_IND_CALL  					| LBR_FAR,  	[PERF_SAMPLE_BRANCH_IND_CALL]	= LBR_IND_CALL, +	[PERF_SAMPLE_BRANCH_COND]       = LBR_JCC,  };  /* core */ diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c new file mode 100644 index 00000000000..619f7699487 --- /dev/null +++ 
b/arch/x86/kernel/cpu/perf_event_intel_rapl.c @@ -0,0 +1,714 @@ +/* + * perf_event_intel_rapl.c: support Intel RAPL energy consumption counters + * Copyright (C) 2013 Google, Inc., Stephane Eranian + * + * Intel RAPL interface is specified in the IA-32 Manual Vol3b + * section 14.7.1 (September 2013) + * + * RAPL provides more controls than just reporting energy consumption + * however here we only expose the 3 energy consumption free running + * counters (pp0, pkg, dram). + * + * Each of those counters increments in a power unit defined by the + * RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules + * but it can vary. + * + * Counter to rapl events mappings: + * + *  pp0 counter: consumption of all physical cores (power plane 0) + * 	  event: rapl_energy_cores + *    perf code: 0x1 + * + *  pkg counter: consumption of the whole processor package + *	  event: rapl_energy_pkg + *    perf code: 0x2 + * + * dram counter: consumption of the dram domain (servers only) + *	  event: rapl_energy_dram + *    perf code: 0x3 + * + * dram counter: consumption of the builtin-gpu domain (client only) + *	  event: rapl_energy_gpu + *    perf code: 0x4 + * + * We manage those counters as free running (read-only). They may be + * use simultaneously by other tools, such as turbostat. + * + * The events only support system-wide mode counting. There is no + * sampling support because it does not make sense and is not + * supported by the RAPL hardware. + * + * Because we want to avoid floating-point operations in the kernel, + * the events are all reported in fixed point arithmetic (32.32). + * Tools must adjust the counts to convert them to Watts using + * the duration of the measurement. Tools may use a function such as + * ldexp(raw_count, -32); + */ +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/perf_event.h> +#include <asm/cpu_device_id.h> +#include "perf_event.h" + +/* + * RAPL energy status counters + */ +#define RAPL_IDX_PP0_NRG_STAT	0	/* all cores */ +#define INTEL_RAPL_PP0		0x1	/* pseudo-encoding */ +#define RAPL_IDX_PKG_NRG_STAT	1	/* entire package */ +#define INTEL_RAPL_PKG		0x2	/* pseudo-encoding */ +#define RAPL_IDX_RAM_NRG_STAT	2	/* DRAM */ +#define INTEL_RAPL_RAM		0x3	/* pseudo-encoding */ +#define RAPL_IDX_PP1_NRG_STAT	3	/* gpu */ +#define INTEL_RAPL_PP1		0x4	/* pseudo-encoding */ + +/* Clients have PP0, PKG */ +#define RAPL_IDX_CLN	(1<<RAPL_IDX_PP0_NRG_STAT|\ +			 1<<RAPL_IDX_PKG_NRG_STAT|\ +			 1<<RAPL_IDX_PP1_NRG_STAT) + +/* Servers have PP0, PKG, RAM */ +#define RAPL_IDX_SRV	(1<<RAPL_IDX_PP0_NRG_STAT|\ +			 1<<RAPL_IDX_PKG_NRG_STAT|\ +			 1<<RAPL_IDX_RAM_NRG_STAT) + +/* Servers have PP0, PKG, RAM, PP1 */ +#define RAPL_IDX_HSW	(1<<RAPL_IDX_PP0_NRG_STAT|\ +			 1<<RAPL_IDX_PKG_NRG_STAT|\ +			 1<<RAPL_IDX_RAM_NRG_STAT|\ +			 1<<RAPL_IDX_PP1_NRG_STAT) + +/* + * event code: LSB 8 bits, passed in attr->config + * any other bit is reserved + */ +#define RAPL_EVENT_MASK	0xFFULL + +#define DEFINE_RAPL_FORMAT_ATTR(_var, _name, _format)		\ +static ssize_t __rapl_##_var##_show(struct kobject *kobj,	\ +				struct kobj_attribute *attr,	\ +				char *page)			\ +{								\ +	BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE);		\ +	return sprintf(page, _format "\n");			\ +}								\ +static struct kobj_attribute format_attr_##_var =		\ +	__ATTR(_name, 0444, __rapl_##_var##_show, NULL) + +#define RAPL_EVENT_DESC(_name, _config)				\ +{								\ +	.attr	= __ATTR(_name, 0444, rapl_event_show, NULL),	\ +	.config	= _config,					\ +} + +#define RAPL_CNTR_WIDTH 32 /* 32-bit rapl 
counters */ + +struct rapl_pmu { +	spinlock_t	 lock; +	int		 hw_unit;  /* 1/2^hw_unit Joule */ +	int		 n_active; /* number of active events */ +	struct list_head active_list; +	struct pmu	 *pmu; /* pointer to rapl_pmu_class */ +	ktime_t		 timer_interval; /* in ktime_t unit */ +	struct hrtimer   hrtimer; +}; + +static struct pmu rapl_pmu_class; +static cpumask_t rapl_cpu_mask; +static int rapl_cntr_mask; + +static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu); +static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu_to_free); + +static inline u64 rapl_read_counter(struct perf_event *event) +{ +	u64 raw; +	rdmsrl(event->hw.event_base, raw); +	return raw; +} + +static inline u64 rapl_scale(u64 v) +{ +	/* +	 * scale delta to smallest unit (1/2^32) +	 * users must then scale back: count * 1/(1e9*2^32) to get Joules +	 * or use ldexp(count, -32). +	 * Watts = Joules/Time delta +	 */ +	return v << (32 - __get_cpu_var(rapl_pmu)->hw_unit); +} + +static u64 rapl_event_update(struct perf_event *event) +{ +	struct hw_perf_event *hwc = &event->hw; +	u64 prev_raw_count, new_raw_count; +	s64 delta, sdelta; +	int shift = RAPL_CNTR_WIDTH; + +again: +	prev_raw_count = local64_read(&hwc->prev_count); +	rdmsrl(event->hw.event_base, new_raw_count); + +	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, +			    new_raw_count) != prev_raw_count) { +		cpu_relax(); +		goto again; +	} + +	/* +	 * Now we have the new raw value and have updated the prev +	 * timestamp already. We can now calculate the elapsed delta +	 * (event-)time and add that to the generic event. +	 * +	 * Careful, not all hw sign-extends above the physical width +	 * of the count. +	 */ +	delta = (new_raw_count << shift) - (prev_raw_count << shift); +	delta >>= shift; + +	sdelta = rapl_scale(delta); + +	local64_add(sdelta, &event->count); + +	return new_raw_count; +} + +static void rapl_start_hrtimer(struct rapl_pmu *pmu) +{ +	__hrtimer_start_range_ns(&pmu->hrtimer, +			pmu->timer_interval, 0, +			HRTIMER_MODE_REL_PINNED, 0); +} + +static void rapl_stop_hrtimer(struct rapl_pmu *pmu) +{ +	hrtimer_cancel(&pmu->hrtimer); +} + +static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer) +{ +	struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu); +	struct perf_event *event; +	unsigned long flags; + +	if (!pmu->n_active) +		return HRTIMER_NORESTART; + +	spin_lock_irqsave(&pmu->lock, flags); + +	list_for_each_entry(event, &pmu->active_list, active_entry) { +		rapl_event_update(event); +	} + +	spin_unlock_irqrestore(&pmu->lock, flags); + +	hrtimer_forward_now(hrtimer, pmu->timer_interval); + +	return HRTIMER_RESTART; +} + +static void rapl_hrtimer_init(struct rapl_pmu *pmu) +{ +	struct hrtimer *hr = &pmu->hrtimer; + +	hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL); +	hr->function = rapl_hrtimer_handle; +} + +static void __rapl_pmu_event_start(struct rapl_pmu *pmu, +				   struct perf_event *event) +{ +	if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) +		return; + +	event->hw.state = 0; + +	list_add_tail(&event->active_entry, &pmu->active_list); + +	local64_set(&event->hw.prev_count, rapl_read_counter(event)); + +	pmu->n_active++; +	if (pmu->n_active == 1) +		rapl_start_hrtimer(pmu); +} + +static void rapl_pmu_event_start(struct perf_event *event, int mode) +{ +	struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu); +	unsigned long flags; + +	spin_lock_irqsave(&pmu->lock, flags); +	__rapl_pmu_event_start(pmu, event); +	spin_unlock_irqrestore(&pmu->lock, flags); +} + +static void rapl_pmu_event_stop(struct perf_event *event, int mode) +{ +	struct 
rapl_pmu *pmu = __get_cpu_var(rapl_pmu); +	struct hw_perf_event *hwc = &event->hw; +	unsigned long flags; + +	spin_lock_irqsave(&pmu->lock, flags); + +	/* mark event as deactivated and stopped */ +	if (!(hwc->state & PERF_HES_STOPPED)) { +		WARN_ON_ONCE(pmu->n_active <= 0); +		pmu->n_active--; +		if (pmu->n_active == 0) +			rapl_stop_hrtimer(pmu); + +		list_del(&event->active_entry); + +		WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); +		hwc->state |= PERF_HES_STOPPED; +	} + +	/* check if update of sw counter is necessary */ +	if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { +		/* +		 * Drain the remaining delta count out of a event +		 * that we are disabling: +		 */ +		rapl_event_update(event); +		hwc->state |= PERF_HES_UPTODATE; +	} + +	spin_unlock_irqrestore(&pmu->lock, flags); +} + +static int rapl_pmu_event_add(struct perf_event *event, int mode) +{ +	struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu); +	struct hw_perf_event *hwc = &event->hw; +	unsigned long flags; + +	spin_lock_irqsave(&pmu->lock, flags); + +	hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + +	if (mode & PERF_EF_START) +		__rapl_pmu_event_start(pmu, event); + +	spin_unlock_irqrestore(&pmu->lock, flags); + +	return 0; +} + +static void rapl_pmu_event_del(struct perf_event *event, int flags) +{ +	rapl_pmu_event_stop(event, PERF_EF_UPDATE); +} + +static int rapl_pmu_event_init(struct perf_event *event) +{ +	u64 cfg = event->attr.config & RAPL_EVENT_MASK; +	int bit, msr, ret = 0; + +	/* only look at RAPL events */ +	if (event->attr.type != rapl_pmu_class.type) +		return -ENOENT; + +	/* check only supported bits are set */ +	if (event->attr.config & ~RAPL_EVENT_MASK) +		return -EINVAL; + +	/* +	 * check event is known (determines counter) +	 */ +	switch (cfg) { +	case INTEL_RAPL_PP0: +		bit = RAPL_IDX_PP0_NRG_STAT; +		msr = MSR_PP0_ENERGY_STATUS; +		break; +	case INTEL_RAPL_PKG: +		bit = RAPL_IDX_PKG_NRG_STAT; +		msr = MSR_PKG_ENERGY_STATUS; +		break; +	case INTEL_RAPL_RAM: +		bit = RAPL_IDX_RAM_NRG_STAT; +		msr = MSR_DRAM_ENERGY_STATUS; +		break; +	case INTEL_RAPL_PP1: +		bit = RAPL_IDX_PP1_NRG_STAT; +		msr = MSR_PP1_ENERGY_STATUS; +		break; +	default: +		return -EINVAL; +	} +	/* check event supported */ +	if (!(rapl_cntr_mask & (1 << bit))) +		return -EINVAL; + +	/* unsupported modes and filters */ +	if (event->attr.exclude_user   || +	    event->attr.exclude_kernel || +	    event->attr.exclude_hv     || +	    event->attr.exclude_idle   || +	    event->attr.exclude_host   || +	    event->attr.exclude_guest  || +	    event->attr.sample_period) /* no sampling */ +		return -EINVAL; + +	/* must be done before validate_group */ +	event->hw.event_base = msr; +	event->hw.config = cfg; +	event->hw.idx = bit; + +	return ret; +} + +static void rapl_pmu_event_read(struct perf_event *event) +{ +	rapl_event_update(event); +} + +static ssize_t rapl_get_attr_cpumask(struct device *dev, +				struct device_attribute *attr, char *buf) +{ +	int n = cpulist_scnprintf(buf, PAGE_SIZE - 2, &rapl_cpu_mask); + +	buf[n++] = '\n'; +	buf[n] = '\0'; +	return n; +} + +static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL); + +static struct attribute *rapl_pmu_attrs[] = { +	&dev_attr_cpumask.attr, +	NULL, +}; + +static struct attribute_group rapl_pmu_attr_group = { +	.attrs = rapl_pmu_attrs, +}; + +EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01"); +EVENT_ATTR_STR(energy-pkg  ,   rapl_pkg, "event=0x02"); +EVENT_ATTR_STR(energy-ram  ,   rapl_ram, "event=0x03"); +EVENT_ATTR_STR(energy-gpu  ,   rapl_gpu, "event=0x04"); 
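As the header comment of this new file notes, RAPL counts are exported in 32.32 fixed point, and the energy-*.scale attributes defined just below advertise the matching 2^-32 factor (about 2.33e-10 Joules per increment). A small user-space sketch of the conversion the comment suggests ("ldexp(raw_count, -32)"); the helper names are hypothetical and this is not part of the patch:

#include <math.h>

/* one counter increment corresponds to 2^-32 Joules after kernel scaling */
static double rapl_count_to_joules(unsigned long long raw_count)
{
	return ldexp((double)raw_count, -32);	/* raw_count * 2^-32 */
}

/* average power over a measurement interval, in Watts */
static double rapl_average_watts(unsigned long long raw_count, double seconds)
{
	return rapl_count_to_joules(raw_count) / seconds;
}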
+ +EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules"); +EVENT_ATTR_STR(energy-pkg.unit  ,   rapl_pkg_unit, "Joules"); +EVENT_ATTR_STR(energy-ram.unit  ,   rapl_ram_unit, "Joules"); +EVENT_ATTR_STR(energy-gpu.unit  ,   rapl_gpu_unit, "Joules"); + +/* + * we compute in 0.23 nJ increments regardless of MSR + */ +EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10"); +EVENT_ATTR_STR(energy-pkg.scale,     rapl_pkg_scale, "2.3283064365386962890625e-10"); +EVENT_ATTR_STR(energy-ram.scale,     rapl_ram_scale, "2.3283064365386962890625e-10"); +EVENT_ATTR_STR(energy-gpu.scale,     rapl_gpu_scale, "2.3283064365386962890625e-10"); + +static struct attribute *rapl_events_srv_attr[] = { +	EVENT_PTR(rapl_cores), +	EVENT_PTR(rapl_pkg), +	EVENT_PTR(rapl_ram), + +	EVENT_PTR(rapl_cores_unit), +	EVENT_PTR(rapl_pkg_unit), +	EVENT_PTR(rapl_ram_unit), + +	EVENT_PTR(rapl_cores_scale), +	EVENT_PTR(rapl_pkg_scale), +	EVENT_PTR(rapl_ram_scale), +	NULL, +}; + +static struct attribute *rapl_events_cln_attr[] = { +	EVENT_PTR(rapl_cores), +	EVENT_PTR(rapl_pkg), +	EVENT_PTR(rapl_gpu), + +	EVENT_PTR(rapl_cores_unit), +	EVENT_PTR(rapl_pkg_unit), +	EVENT_PTR(rapl_gpu_unit), + +	EVENT_PTR(rapl_cores_scale), +	EVENT_PTR(rapl_pkg_scale), +	EVENT_PTR(rapl_gpu_scale), +	NULL, +}; + +static struct attribute *rapl_events_hsw_attr[] = { +	EVENT_PTR(rapl_cores), +	EVENT_PTR(rapl_pkg), +	EVENT_PTR(rapl_gpu), +	EVENT_PTR(rapl_ram), + +	EVENT_PTR(rapl_cores_unit), +	EVENT_PTR(rapl_pkg_unit), +	EVENT_PTR(rapl_gpu_unit), +	EVENT_PTR(rapl_ram_unit), + +	EVENT_PTR(rapl_cores_scale), +	EVENT_PTR(rapl_pkg_scale), +	EVENT_PTR(rapl_gpu_scale), +	EVENT_PTR(rapl_ram_scale), +	NULL, +}; + +static struct attribute_group rapl_pmu_events_group = { +	.name = "events", +	.attrs = NULL, /* patched at runtime */ +}; + +DEFINE_RAPL_FORMAT_ATTR(event, event, "config:0-7"); +static struct attribute *rapl_formats_attr[] = { +	&format_attr_event.attr, +	NULL, +}; + +static struct attribute_group rapl_pmu_format_group = { +	.name = "format", +	.attrs = rapl_formats_attr, +}; + +const struct attribute_group *rapl_attr_groups[] = { +	&rapl_pmu_attr_group, +	&rapl_pmu_format_group, +	&rapl_pmu_events_group, +	NULL, +}; + +static struct pmu rapl_pmu_class = { +	.attr_groups	= rapl_attr_groups, +	.task_ctx_nr	= perf_invalid_context, /* system-wide only */ +	.event_init	= rapl_pmu_event_init, +	.add		= rapl_pmu_event_add, /* must have */ +	.del		= rapl_pmu_event_del, /* must have */ +	.start		= rapl_pmu_event_start, +	.stop		= rapl_pmu_event_stop, +	.read		= rapl_pmu_event_read, +}; + +static void rapl_cpu_exit(int cpu) +{ +	struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); +	int i, phys_id = topology_physical_package_id(cpu); +	int target = -1; + +	/* find a new cpu on same package */ +	for_each_online_cpu(i) { +		if (i == cpu) +			continue; +		if (phys_id == topology_physical_package_id(i)) { +			target = i; +			break; +		} +	} +	/* +	 * clear cpu from cpumask +	 * if was set in cpumask and still some cpu on package, +	 * then move to new cpu +	 */ +	if (cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask) && target >= 0) +		cpumask_set_cpu(target, &rapl_cpu_mask); + +	WARN_ON(cpumask_empty(&rapl_cpu_mask)); +	/* +	 * migrate events and context to new cpu +	 */ +	if (target >= 0) +		perf_pmu_migrate_context(pmu->pmu, cpu, target); + +	/* cancel overflow polling timer for CPU */ +	rapl_stop_hrtimer(pmu); +} + +static void rapl_cpu_init(int cpu) +{ +	int i, phys_id = topology_physical_package_id(cpu); + +	/* check if phys_is is 
already covered */ +	for_each_cpu(i, &rapl_cpu_mask) { +		if (phys_id == topology_physical_package_id(i)) +			return; +	} +	/* was not found, so add it */ +	cpumask_set_cpu(cpu, &rapl_cpu_mask); +} + +static int rapl_cpu_prepare(int cpu) +{ +	struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); +	int phys_id = topology_physical_package_id(cpu); +	u64 ms; +	u64 msr_rapl_power_unit_bits; + +	if (pmu) +		return 0; + +	if (phys_id < 0) +		return -1; + +	/* protect rdmsrl() to handle virtualization */ +	if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits)) +		return -1; + +	pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu)); +	if (!pmu) +		return -1; + +	spin_lock_init(&pmu->lock); + +	INIT_LIST_HEAD(&pmu->active_list); + +	/* +	 * grab power unit as: 1/2^unit Joules +	 * +	 * we cache in local PMU instance +	 */ +	pmu->hw_unit = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; +	pmu->pmu = &rapl_pmu_class; + +	/* +	 * use reference of 200W for scaling the timeout +	 * to avoid missing counter overflows. +	 * 200W = 200 Joules/sec +	 * divide interval by 2 to avoid lockstep (2 * 100) +	 * if hw unit is 32, then we use 2 ms 1/200/2 +	 */ +	if (pmu->hw_unit < 32) +		ms = (1000 / (2 * 100)) * (1ULL << (32 - pmu->hw_unit - 1)); +	else +		ms = 2; + +	pmu->timer_interval = ms_to_ktime(ms); + +	rapl_hrtimer_init(pmu); + +	/* set RAPL pmu for this cpu for now */ +	per_cpu(rapl_pmu, cpu) = pmu; +	per_cpu(rapl_pmu_to_free, cpu) = NULL; + +	return 0; +} + +static void rapl_cpu_kfree(int cpu) +{ +	struct rapl_pmu *pmu = per_cpu(rapl_pmu_to_free, cpu); + +	kfree(pmu); + +	per_cpu(rapl_pmu_to_free, cpu) = NULL; +} + +static int rapl_cpu_dying(int cpu) +{ +	struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); + +	if (!pmu) +		return 0; + +	per_cpu(rapl_pmu, cpu) = NULL; + +	per_cpu(rapl_pmu_to_free, cpu) = pmu; + +	return 0; +} + +static int rapl_cpu_notifier(struct notifier_block *self, +			     unsigned long action, void *hcpu) +{ +	unsigned int cpu = (long)hcpu; + +	switch (action & ~CPU_TASKS_FROZEN) { +	case CPU_UP_PREPARE: +		rapl_cpu_prepare(cpu); +		break; +	case CPU_STARTING: +		rapl_cpu_init(cpu); +		break; +	case CPU_UP_CANCELED: +	case CPU_DYING: +		rapl_cpu_dying(cpu); +		break; +	case CPU_ONLINE: +	case CPU_DEAD: +		rapl_cpu_kfree(cpu); +		break; +	case CPU_DOWN_PREPARE: +		rapl_cpu_exit(cpu); +		break; +	default: +		break; +	} + +	return NOTIFY_OK; +} + +static const struct x86_cpu_id rapl_cpu_match[] = { +	[0] = { .vendor = X86_VENDOR_INTEL, .family = 6 }, +	[1] = {}, +}; + +static int __init rapl_pmu_init(void) +{ +	struct rapl_pmu *pmu; +	int cpu, ret; + +	/* +	 * check for Intel processor family 6 +	 */ +	if (!x86_match_cpu(rapl_cpu_match)) +		return 0; + +	/* check supported CPU */ +	switch (boot_cpu_data.x86_model) { +	case 42: /* Sandy Bridge */ +	case 58: /* Ivy Bridge */ +		rapl_cntr_mask = RAPL_IDX_CLN; +		rapl_pmu_events_group.attrs = rapl_events_cln_attr; +		break; +	case 60: /* Haswell */ +	case 69: /* Haswell-Celeron */ +		rapl_cntr_mask = RAPL_IDX_HSW; +		rapl_pmu_events_group.attrs = rapl_events_hsw_attr; +		break; +	case 45: /* Sandy Bridge-EP */ +	case 62: /* IvyTown */ +		rapl_cntr_mask = RAPL_IDX_SRV; +		rapl_pmu_events_group.attrs = rapl_events_srv_attr; +		break; + +	default: +		/* unsupported */ +		return 0; +	} + +	cpu_notifier_register_begin(); + +	for_each_online_cpu(cpu) { +		ret = rapl_cpu_prepare(cpu); +		if (ret) +			goto out; +		rapl_cpu_init(cpu); +	} + +	__perf_cpu_notifier(rapl_cpu_notifier); + +	ret = perf_pmu_register(&rapl_pmu_class, "power", -1); +	if 
(WARN_ON(ret)) { +		pr_info("RAPL PMU detected, registration failed (%d), RAPL PMU disabled\n", ret); +		cpu_notifier_register_done(); +		return -1; +	} + +	pmu = __get_cpu_var(rapl_pmu); + +	pr_info("RAPL PMU detected, hw unit 2^-%d Joules," +		" API unit is 2^-32 Joules," +		" %d fixed counters" +		" %llu ms ovfl timer\n", +		pmu->hw_unit, +		hweight32(rapl_cntr_mask), +		ktime_to_ms(pmu->timer_interval)); + +out: +	cpu_notifier_register_done(); + +	return 0; +} +device_initcall(rapl_pmu_init); diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index 4118f9f6831..ae6552a0701 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -66,6 +66,47 @@ DEFINE_UNCORE_FORMAT_ATTR(mask_vnw, mask_vnw, "config2:3-4");  DEFINE_UNCORE_FORMAT_ATTR(mask0, mask0, "config2:0-31");  DEFINE_UNCORE_FORMAT_ATTR(mask1, mask1, "config2:32-63"); +static void uncore_pmu_start_hrtimer(struct intel_uncore_box *box); +static void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box); +static void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event); +static void uncore_pmu_event_read(struct perf_event *event); + +static struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event) +{ +	return container_of(event->pmu, struct intel_uncore_pmu, pmu); +} + +static struct intel_uncore_box * +uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu) +{ +	struct intel_uncore_box *box; + +	box = *per_cpu_ptr(pmu->box, cpu); +	if (box) +		return box; + +	raw_spin_lock(&uncore_box_lock); +	list_for_each_entry(box, &pmu->box_list, list) { +		if (box->phys_id == topology_physical_package_id(cpu)) { +			atomic_inc(&box->refcnt); +			*per_cpu_ptr(pmu->box, cpu) = box; +			break; +		} +	} +	raw_spin_unlock(&uncore_box_lock); + +	return *per_cpu_ptr(pmu->box, cpu); +} + +static struct intel_uncore_box *uncore_event_to_box(struct perf_event *event) +{ +	/* +	 * perf core schedules event on the basis of cpu, uncore events are +	 * collected by one of the cpus inside a physical package. 
+	 */ +	return uncore_pmu_to_box(uncore_event_to_pmu(event), smp_processor_id()); +} +  static u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event)  {  	u64 count; @@ -501,21 +542,24 @@ static struct extra_reg snbep_uncore_cbox_extra_regs[] = {  	SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,  				  SNBEP_CBO_PMON_CTL_TID_EN, 0x1),  	SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4), +	SNBEP_CBO_EVENT_EXTRA_REG(0x4334, 0xffff, 0x6),  	SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4), +	SNBEP_CBO_EVENT_EXTRA_REG(0x4534, 0xffff, 0x6),  	SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4), +	SNBEP_CBO_EVENT_EXTRA_REG(0x4934, 0xffff, 0x6),  	SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0x6),  	SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x8),  	SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x8), -	SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0xc), -	SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0xc), +	SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0xa), +	SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0xa),  	SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x2),  	SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x2),  	SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x2),  	SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x2),  	SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x8),  	SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x8), -	SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0xc), -	SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0xc), +	SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0xa), +	SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0xa),  	SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x2),  	SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x2),  	SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x2), @@ -997,6 +1041,20 @@ static int snbep_pci2phy_map_init(int devid)  		}  	} +	if (!err) { +		/* +		 * For PCI bus with no UBOX device, find the next bus +		 * that has UBOX device and use its mapping. 
+		 */ +		i = -1; +		for (bus = 255; bus >= 0; bus--) { +			if (pcibus_to_physid[bus] >= 0) +				i = pcibus_to_physid[bus]; +			else +				pcibus_to_physid[bus] = i; +		} +	} +  	if (ubox_dev)  		pci_dev_put(ubox_dev); @@ -1099,6 +1157,24 @@ static struct attribute *ivt_uncore_qpi_formats_attr[] = {  	&format_attr_umask.attr,  	&format_attr_edge.attr,  	&format_attr_thresh8.attr, +	&format_attr_match_rds.attr, +	&format_attr_match_rnid30.attr, +	&format_attr_match_rnid4.attr, +	&format_attr_match_dnid.attr, +	&format_attr_match_mc.attr, +	&format_attr_match_opc.attr, +	&format_attr_match_vnw.attr, +	&format_attr_match0.attr, +	&format_attr_match1.attr, +	&format_attr_mask_rds.attr, +	&format_attr_mask_rnid30.attr, +	&format_attr_mask_rnid4.attr, +	&format_attr_mask_dnid.attr, +	&format_attr_mask_mc.attr, +	&format_attr_mask_opc.attr, +	&format_attr_mask_vnw.attr, +	&format_attr_mask0.attr, +	&format_attr_mask1.attr,  	NULL,  }; @@ -1146,10 +1222,16 @@ static struct extra_reg ivt_uncore_cbox_extra_regs[] = {  	SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,  				  SNBEP_CBO_PMON_CTL_TID_EN, 0x1),  	SNBEP_CBO_EVENT_EXTRA_REG(0x1031, 0x10ff, 0x2), + +	SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4), +	SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0xc), +	SNBEP_CBO_EVENT_EXTRA_REG(0x5134, 0xffff, 0xc),  	SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4), +	SNBEP_CBO_EVENT_EXTRA_REG(0x4334, 0xffff, 0xc),  	SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4), +	SNBEP_CBO_EVENT_EXTRA_REG(0x4534, 0xffff, 0xc),  	SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4), -	SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0xc), +	SNBEP_CBO_EVENT_EXTRA_REG(0x4934, 0xffff, 0xc),  	SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x10),  	SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x10),  	SNBEP_CBO_EVENT_EXTRA_REG(0x2135, 0xffff, 0x10), @@ -1164,7 +1246,7 @@ static struct extra_reg ivt_uncore_cbox_extra_regs[] = {  	SNBEP_CBO_EVENT_EXTRA_REG(0x8335, 0xffff, 0x10),  	SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x10),  	SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x10), -	SNBEP_CBO_EVENT_EXTRA_REG(0x2336, 0xffff, 0x10), +	SNBEP_CBO_EVENT_EXTRA_REG(0x2136, 0xffff, 0x10),  	SNBEP_CBO_EVENT_EXTRA_REG(0x2336, 0xffff, 0x10),  	SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0x18),  	SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0x18), @@ -1312,17 +1394,83 @@ static struct intel_uncore_type ivt_uncore_imc = {  	IVT_UNCORE_PCI_COMMON_INIT(),  }; +/* registers in IRP boxes are not properly aligned */ +static unsigned ivt_uncore_irp_ctls[] = {0xd8, 0xdc, 0xe0, 0xe4}; +static unsigned ivt_uncore_irp_ctrs[] = {0xa0, 0xb0, 0xb8, 0xc0}; + +static void ivt_uncore_irp_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ +	struct pci_dev *pdev = box->pci_dev; +	struct hw_perf_event *hwc = &event->hw; + +	pci_write_config_dword(pdev, ivt_uncore_irp_ctls[hwc->idx], +			       hwc->config | SNBEP_PMON_CTL_EN); +} + +static void ivt_uncore_irp_disable_event(struct intel_uncore_box *box, struct perf_event *event) +{ +	struct pci_dev *pdev = box->pci_dev; +	struct hw_perf_event *hwc = &event->hw; + +	pci_write_config_dword(pdev, ivt_uncore_irp_ctls[hwc->idx], hwc->config); +} + +static u64 ivt_uncore_irp_read_counter(struct intel_uncore_box *box, struct perf_event *event) +{ +	struct pci_dev *pdev = box->pci_dev; +	struct hw_perf_event *hwc = &event->hw; +	u64 count = 0; + +	pci_read_config_dword(pdev, ivt_uncore_irp_ctrs[hwc->idx], (u32 *)&count); +	pci_read_config_dword(pdev, ivt_uncore_irp_ctrs[hwc->idx] + 4, (u32 *)&count + 1); + +	return count; +} + 
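
The IRP read_counter callback above assembles a 64-bit raw value from two 32-bit config-space reads; turning successive raw readings into the ever-growing count that perf reports is a separate delta-accumulation step done against the counter width (48 bits here, as the perf_ctr_bits field just below declares). A minimal, self-contained sketch of that idiom, with hypothetical names, deliberately independent of the kernel's own uncore_perf_event_update() helper:

#include <stdint.h>

/* Hypothetical per-event state; the kernel keeps the equivalent in struct hw_perf_event. */
struct ctr_state {
	uint64_t prev_raw;	/* last raw value read from the hardware */
	uint64_t count;		/* accumulated count handed back to the user */
};

/* Fold a new raw reading of a free-running, 'bits'-wide counter into the state. */
static void accumulate_delta(struct ctr_state *s, uint64_t new_raw, unsigned int bits)
{
	uint64_t mask = bits < 64 ? (1ULL << bits) - 1 : ~0ULL;
	/* Subtraction modulo 2^bits absorbs at most one wrap between reads. */
	uint64_t delta = (new_raw - s->prev_raw) & mask;

	s->prev_raw = new_raw;
	s->count += delta;
}

The scheme only stays correct while the counter is sampled more often than it can wrap, which is what the per-box hrtimer polling further down in this patch guarantees.
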
+static struct intel_uncore_ops ivt_uncore_irp_ops = { +	.init_box	= ivt_uncore_pci_init_box, +	.disable_box	= snbep_uncore_pci_disable_box, +	.enable_box	= snbep_uncore_pci_enable_box, +	.disable_event	= ivt_uncore_irp_disable_event, +	.enable_event	= ivt_uncore_irp_enable_event, +	.read_counter	= ivt_uncore_irp_read_counter, +}; + +static struct intel_uncore_type ivt_uncore_irp = { +	.name			= "irp", +	.num_counters		= 4, +	.num_boxes		= 1, +	.perf_ctr_bits		= 48, +	.event_mask		= IVT_PMON_RAW_EVENT_MASK, +	.box_ctl		= SNBEP_PCI_PMON_BOX_CTL, +	.ops			= &ivt_uncore_irp_ops, +	.format_group		= &ivt_uncore_format_group, +}; + +static struct intel_uncore_ops ivt_uncore_qpi_ops = { +	.init_box	= ivt_uncore_pci_init_box, +	.disable_box	= snbep_uncore_pci_disable_box, +	.enable_box	= snbep_uncore_pci_enable_box, +	.disable_event	= snbep_uncore_pci_disable_event, +	.enable_event	= snbep_qpi_enable_event, +	.read_counter	= snbep_uncore_pci_read_counter, +	.hw_config	= snbep_qpi_hw_config, +	.get_constraint	= uncore_get_constraint, +	.put_constraint	= uncore_put_constraint, +}; +  static struct intel_uncore_type ivt_uncore_qpi = { -	.name		= "qpi", -	.num_counters   = 4, -	.num_boxes	= 3, -	.perf_ctr_bits	= 48, -	.perf_ctr	= SNBEP_PCI_PMON_CTR0, -	.event_ctl	= SNBEP_PCI_PMON_CTL0, -	.event_mask	= IVT_QPI_PCI_PMON_RAW_EVENT_MASK, -	.box_ctl	= SNBEP_PCI_PMON_BOX_CTL, -	.ops		= &ivt_uncore_pci_ops, -	.format_group	= &ivt_uncore_qpi_format_group, +	.name			= "qpi", +	.num_counters		= 4, +	.num_boxes		= 3, +	.perf_ctr_bits		= 48, +	.perf_ctr		= SNBEP_PCI_PMON_CTR0, +	.event_ctl		= SNBEP_PCI_PMON_CTL0, +	.event_mask		= IVT_QPI_PCI_PMON_RAW_EVENT_MASK, +	.box_ctl		= SNBEP_PCI_PMON_BOX_CTL, +	.num_shared_regs	= 1, +	.ops			= &ivt_uncore_qpi_ops, +	.format_group		= &ivt_uncore_qpi_format_group,  };  static struct intel_uncore_type ivt_uncore_r2pcie = { @@ -1346,6 +1494,7 @@ static struct intel_uncore_type ivt_uncore_r3qpi = {  enum {  	IVT_PCI_UNCORE_HA,  	IVT_PCI_UNCORE_IMC, +	IVT_PCI_UNCORE_IRP,  	IVT_PCI_UNCORE_QPI,  	IVT_PCI_UNCORE_R2PCIE,  	IVT_PCI_UNCORE_R3QPI, @@ -1354,6 +1503,7 @@ enum {  static struct intel_uncore_type *ivt_pci_uncores[] = {  	[IVT_PCI_UNCORE_HA]	= &ivt_uncore_ha,  	[IVT_PCI_UNCORE_IMC]	= &ivt_uncore_imc, +	[IVT_PCI_UNCORE_IRP]	= &ivt_uncore_irp,  	[IVT_PCI_UNCORE_QPI]	= &ivt_uncore_qpi,  	[IVT_PCI_UNCORE_R2PCIE]	= &ivt_uncore_r2pcie,  	[IVT_PCI_UNCORE_R3QPI]	= &ivt_uncore_r3qpi, @@ -1401,6 +1551,10 @@ static DEFINE_PCI_DEVICE_TABLE(ivt_uncore_pci_ids) = {  		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef1),  		.driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 7),  	}, +	{ /* IRP */ +		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe39), +		.driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IRP, 0), +	},  	{ /* QPI0 Port 0 */  		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe32),  		.driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_QPI, 0), @@ -1429,6 +1583,16 @@ static DEFINE_PCI_DEVICE_TABLE(ivt_uncore_pci_ids) = {  		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3e),  		.driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_R3QPI, 2),  	}, +	{ /* QPI Port 0 filter  */ +		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe86), +		.driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, +						   SNBEP_PCI_QPI_PORT0_FILTER), +	}, +	{ /* QPI Port 0 filter  */ +		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe96), +		.driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, +						   SNBEP_PCI_QPI_PORT1_FILTER), +	},  	{ /* end: all zeroes */ }  }; @@ -1517,6 +1681,349 @@ static struct intel_uncore_type *snb_msr_uncores[] = {  	&snb_uncore_cbox,  	NULL,  
}; + +enum { +	SNB_PCI_UNCORE_IMC, +}; + +static struct uncore_event_desc snb_uncore_imc_events[] = { +	INTEL_UNCORE_EVENT_DESC(data_reads,  "event=0x01"), +	INTEL_UNCORE_EVENT_DESC(data_reads.scale, "6.103515625e-5"), +	INTEL_UNCORE_EVENT_DESC(data_reads.unit, "MiB"), + +	INTEL_UNCORE_EVENT_DESC(data_writes, "event=0x02"), +	INTEL_UNCORE_EVENT_DESC(data_writes.scale, "6.103515625e-5"), +	INTEL_UNCORE_EVENT_DESC(data_writes.unit, "MiB"), + +	{ /* end: all zeroes */ }, +}; + +#define SNB_UNCORE_PCI_IMC_EVENT_MASK		0xff +#define SNB_UNCORE_PCI_IMC_BAR_OFFSET		0x48 + +/* page size multiple covering all config regs */ +#define SNB_UNCORE_PCI_IMC_MAP_SIZE		0x6000 + +#define SNB_UNCORE_PCI_IMC_DATA_READS		0x1 +#define SNB_UNCORE_PCI_IMC_DATA_READS_BASE	0x5050 +#define SNB_UNCORE_PCI_IMC_DATA_WRITES		0x2 +#define SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE	0x5054 +#define SNB_UNCORE_PCI_IMC_CTR_BASE		SNB_UNCORE_PCI_IMC_DATA_READS_BASE + +static struct attribute *snb_uncore_imc_formats_attr[] = { +	&format_attr_event.attr, +	NULL, +}; + +static struct attribute_group snb_uncore_imc_format_group = { +	.name = "format", +	.attrs = snb_uncore_imc_formats_attr, +}; + +static void snb_uncore_imc_init_box(struct intel_uncore_box *box) +{ +	struct pci_dev *pdev = box->pci_dev; +	int where = SNB_UNCORE_PCI_IMC_BAR_OFFSET; +	resource_size_t addr; +	u32 pci_dword; + +	pci_read_config_dword(pdev, where, &pci_dword); +	addr = pci_dword; + +#ifdef CONFIG_PHYS_ADDR_T_64BIT +	pci_read_config_dword(pdev, where + 4, &pci_dword); +	addr |= ((resource_size_t)pci_dword << 32); +#endif + +	addr &= ~(PAGE_SIZE - 1); + +	box->io_addr = ioremap(addr, SNB_UNCORE_PCI_IMC_MAP_SIZE); +	box->hrtimer_duration = UNCORE_SNB_IMC_HRTIMER_INTERVAL; +} + +static void snb_uncore_imc_enable_box(struct intel_uncore_box *box) +{} + +static void snb_uncore_imc_disable_box(struct intel_uncore_box *box) +{} + +static void snb_uncore_imc_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{} + +static void snb_uncore_imc_disable_event(struct intel_uncore_box *box, struct perf_event *event) +{} + +static u64 snb_uncore_imc_read_counter(struct intel_uncore_box *box, struct perf_event *event) +{ +	struct hw_perf_event *hwc = &event->hw; + +	return (u64)*(unsigned int *)(box->io_addr + hwc->event_base); +} + +/* + * custom event_init() function because we define our own fixed, free + * running counters, so we do not want to conflict with generic uncore + * logic. 
Also simplifies processing + */ +static int snb_uncore_imc_event_init(struct perf_event *event) +{ +	struct intel_uncore_pmu *pmu; +	struct intel_uncore_box *box; +	struct hw_perf_event *hwc = &event->hw; +	u64 cfg = event->attr.config & SNB_UNCORE_PCI_IMC_EVENT_MASK; +	int idx, base; + +	if (event->attr.type != event->pmu->type) +		return -ENOENT; + +	pmu = uncore_event_to_pmu(event); +	/* no device found for this pmu */ +	if (pmu->func_id < 0) +		return -ENOENT; + +	/* Sampling not supported yet */ +	if (hwc->sample_period) +		return -EINVAL; + +	/* unsupported modes and filters */ +	if (event->attr.exclude_user   || +	    event->attr.exclude_kernel || +	    event->attr.exclude_hv     || +	    event->attr.exclude_idle   || +	    event->attr.exclude_host   || +	    event->attr.exclude_guest  || +	    event->attr.sample_period) /* no sampling */ +		return -EINVAL; + +	/* +	 * Place all uncore events for a particular physical package +	 * onto a single cpu +	 */ +	if (event->cpu < 0) +		return -EINVAL; + +	/* check only supported bits are set */ +	if (event->attr.config & ~SNB_UNCORE_PCI_IMC_EVENT_MASK) +		return -EINVAL; + +	box = uncore_pmu_to_box(pmu, event->cpu); +	if (!box || box->cpu < 0) +		return -EINVAL; + +	event->cpu = box->cpu; + +	event->hw.idx = -1; +	event->hw.last_tag = ~0ULL; +	event->hw.extra_reg.idx = EXTRA_REG_NONE; +	event->hw.branch_reg.idx = EXTRA_REG_NONE; +	/* +	 * check event is known (whitelist, determines counter) +	 */ +	switch (cfg) { +	case SNB_UNCORE_PCI_IMC_DATA_READS: +		base = SNB_UNCORE_PCI_IMC_DATA_READS_BASE; +		idx = UNCORE_PMC_IDX_FIXED; +		break; +	case SNB_UNCORE_PCI_IMC_DATA_WRITES: +		base = SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE; +		idx = UNCORE_PMC_IDX_FIXED + 1; +		break; +	default: +		return -EINVAL; +	} + +	/* must be done before validate_group */ +	event->hw.event_base = base; +	event->hw.config = cfg; +	event->hw.idx = idx; + +	/* no group validation needed, we have free running counters */ + +	return 0; +} + +static int snb_uncore_imc_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ +	return 0; +} + +static void snb_uncore_imc_event_start(struct perf_event *event, int flags) +{ +	struct intel_uncore_box *box = uncore_event_to_box(event); +	u64 count; + +	if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) +		return; + +	event->hw.state = 0; +	box->n_active++; + +	list_add_tail(&event->active_entry, &box->active_list); + +	count = snb_uncore_imc_read_counter(box, event); +	local64_set(&event->hw.prev_count, count); + +	if (box->n_active == 1) +		uncore_pmu_start_hrtimer(box); +} + +static void snb_uncore_imc_event_stop(struct perf_event *event, int flags) +{ +	struct intel_uncore_box *box = uncore_event_to_box(event); +	struct hw_perf_event *hwc = &event->hw; + +	if (!(hwc->state & PERF_HES_STOPPED)) { +		box->n_active--; + +		WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); +		hwc->state |= PERF_HES_STOPPED; + +		list_del(&event->active_entry); + +		if (box->n_active == 0) +			uncore_pmu_cancel_hrtimer(box); +	} + +	if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { +		/* +		 * Drain the remaining delta count out of a event +		 * that we are disabling: +		 */ +		uncore_perf_event_update(box, event); +		hwc->state |= PERF_HES_UPTODATE; +	} +} + +static int snb_uncore_imc_event_add(struct perf_event *event, int flags) +{ +	struct intel_uncore_box *box = uncore_event_to_box(event); +	struct hw_perf_event *hwc = &event->hw; + +	if (!box) +		return -ENODEV; + +	hwc->state = PERF_HES_UPTODATE | 
PERF_HES_STOPPED; +	if (!(flags & PERF_EF_START)) +		hwc->state |= PERF_HES_ARCH; + +	snb_uncore_imc_event_start(event, 0); + +	box->n_events++; + +	return 0; +} + +static void snb_uncore_imc_event_del(struct perf_event *event, int flags) +{ +	struct intel_uncore_box *box = uncore_event_to_box(event); +	int i; + +	snb_uncore_imc_event_stop(event, PERF_EF_UPDATE); + +	for (i = 0; i < box->n_events; i++) { +		if (event == box->event_list[i]) { +			--box->n_events; +			break; +		} +	} +} + +static int snb_pci2phy_map_init(int devid) +{ +	struct pci_dev *dev = NULL; +	int bus; + +	dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, dev); +	if (!dev) +		return -ENOTTY; + +	bus = dev->bus->number; + +	pcibus_to_physid[bus] = 0; + +	pci_dev_put(dev); + +	return 0; +} + +static struct pmu snb_uncore_imc_pmu = { +	.task_ctx_nr	= perf_invalid_context, +	.event_init	= snb_uncore_imc_event_init, +	.add		= snb_uncore_imc_event_add, +	.del		= snb_uncore_imc_event_del, +	.start		= snb_uncore_imc_event_start, +	.stop		= snb_uncore_imc_event_stop, +	.read		= uncore_pmu_event_read, +}; + +static struct intel_uncore_ops snb_uncore_imc_ops = { +	.init_box	= snb_uncore_imc_init_box, +	.enable_box	= snb_uncore_imc_enable_box, +	.disable_box	= snb_uncore_imc_disable_box, +	.disable_event	= snb_uncore_imc_disable_event, +	.enable_event	= snb_uncore_imc_enable_event, +	.hw_config	= snb_uncore_imc_hw_config, +	.read_counter	= snb_uncore_imc_read_counter, +}; + +static struct intel_uncore_type snb_uncore_imc = { +	.name		= "imc", +	.num_counters   = 2, +	.num_boxes	= 1, +	.fixed_ctr_bits	= 32, +	.fixed_ctr	= SNB_UNCORE_PCI_IMC_CTR_BASE, +	.event_descs	= snb_uncore_imc_events, +	.format_group	= &snb_uncore_imc_format_group, +	.perf_ctr	= SNB_UNCORE_PCI_IMC_DATA_READS_BASE, +	.event_mask	= SNB_UNCORE_PCI_IMC_EVENT_MASK, +	.ops		= &snb_uncore_imc_ops, +	.pmu		= &snb_uncore_imc_pmu, +}; + +static struct intel_uncore_type *snb_pci_uncores[] = { +	[SNB_PCI_UNCORE_IMC]	= &snb_uncore_imc, +	NULL, +}; + +static DEFINE_PCI_DEVICE_TABLE(snb_uncore_pci_ids) = { +	{ /* IMC */ +		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SNB_IMC), +		.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), +	}, +	{ /* end: all zeroes */ }, +}; + +static DEFINE_PCI_DEVICE_TABLE(ivb_uncore_pci_ids) = { +	{ /* IMC */ +		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_IMC), +		.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), +	}, +	{ /* end: all zeroes */ }, +}; + +static DEFINE_PCI_DEVICE_TABLE(hsw_uncore_pci_ids) = { +	{ /* IMC */ +		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_IMC), +		.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), +	}, +	{ /* end: all zeroes */ }, +}; + +static struct pci_driver snb_uncore_pci_driver = { +	.name		= "snb_uncore", +	.id_table	= snb_uncore_pci_ids, +}; + +static struct pci_driver ivb_uncore_pci_driver = { +	.name		= "ivb_uncore", +	.id_table	= ivb_uncore_pci_ids, +}; + +static struct pci_driver hsw_uncore_pci_driver = { +	.name		= "hsw_uncore", +	.id_table	= hsw_uncore_pci_ids, +}; +  /* end of Sandy Bridge uncore support */  /* Nehalem uncore support */ @@ -2667,6 +3174,7 @@ again:  static enum hrtimer_restart uncore_pmu_hrtimer(struct hrtimer *hrtimer)  {  	struct intel_uncore_box *box; +	struct perf_event *event;  	unsigned long flags;  	int bit; @@ -2679,19 +3187,27 @@ static enum hrtimer_restart uncore_pmu_hrtimer(struct hrtimer *hrtimer)  	 */  	local_irq_save(flags); +	/* +	 * handle boxes with an active event list as opposed to active +	 * counters +	 */ +	
list_for_each_entry(event, &box->active_list, active_entry) { +		uncore_perf_event_update(box, event); +	} +  	for_each_set_bit(bit, box->active_mask, UNCORE_PMC_IDX_MAX)  		uncore_perf_event_update(box, box->events[bit]);  	local_irq_restore(flags); -	hrtimer_forward_now(hrtimer, ns_to_ktime(UNCORE_PMU_HRTIMER_INTERVAL)); +	hrtimer_forward_now(hrtimer, ns_to_ktime(box->hrtimer_duration));  	return HRTIMER_RESTART;  }  static void uncore_pmu_start_hrtimer(struct intel_uncore_box *box)  {  	__hrtimer_start_range_ns(&box->hrtimer, -			ns_to_ktime(UNCORE_PMU_HRTIMER_INTERVAL), 0, +			ns_to_ktime(box->hrtimer_duration), 0,  			HRTIMER_MODE_REL_PINNED, 0);  } @@ -2725,43 +3241,12 @@ static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type,  	box->cpu = -1;  	box->phys_id = -1; -	return box; -} - -static struct intel_uncore_box * -uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu) -{ -	struct intel_uncore_box *box; - -	box = *per_cpu_ptr(pmu->box, cpu); -	if (box) -		return box; - -	raw_spin_lock(&uncore_box_lock); -	list_for_each_entry(box, &pmu->box_list, list) { -		if (box->phys_id == topology_physical_package_id(cpu)) { -			atomic_inc(&box->refcnt); -			*per_cpu_ptr(pmu->box, cpu) = box; -			break; -		} -	} -	raw_spin_unlock(&uncore_box_lock); - -	return *per_cpu_ptr(pmu->box, cpu); -} +	/* set default hrtimer timeout */ +	box->hrtimer_duration = UNCORE_PMU_HRTIMER_INTERVAL; -static struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event) -{ -	return container_of(event->pmu, struct intel_uncore_pmu, pmu); -} +	INIT_LIST_HEAD(&box->active_list); -static struct intel_uncore_box *uncore_event_to_box(struct perf_event *event) -{ -	/* -	 * perf core schedules event on the basis of cpu, uncore events are -	 * collected by one of the cpus inside a physical package. 
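
The per-box hrtimer_duration wired up in this hunk is what keeps that polling scheme safe across very different counter widths: the SNB client IMC counters added above are free-running and only 32 bits wide, and the 6.103515625e-5 MiB scale in their event descriptors is exactly 64/2^20, i.e. one 64-byte access per count. A rough check of the chosen intervals; the bandwidth and event-rate figures below are assumptions for illustration, not values taken from the patch:

#include <stdio.h>

int main(void)
{
	/* Assumed rates, not taken from the patch. */
	const double imc_wrap_bytes = 4294967296.0 * 64;	/* 2^32 counts x 64 B/count = 256 GiB */
	const double assumed_membw  = 25e9;			/* ballpark peak client memory bandwidth */
	const double assumed_rate   = 4e9;			/* events/s for a generic 48-bit counter */

	printf("32-bit IMC counter wraps after ~%.1f s\n", imc_wrap_bytes / assumed_membw);
	printf("48-bit uncore counter wraps after ~%.0f s\n", 281474976710656.0 / assumed_rate);
	return 0;
}

At those assumed rates a 32-bit IMC counter wraps after roughly 11 seconds, so the 5-second UNCORE_SNB_IMC_HRTIMER_INTERVAL keeps about a 2x margin, while the 60-second UNCORE_PMU_HRTIMER_INTERVAL set here as the default remains ample for the 48-bit MSR and PCI boxes.
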
-	 */ -	return uncore_pmu_to_box(uncore_event_to_pmu(event), smp_processor_id()); +	return box;  }  static int @@ -3157,16 +3642,21 @@ static int __init uncore_pmu_register(struct intel_uncore_pmu *pmu)  {  	int ret; -	pmu->pmu = (struct pmu) { -		.attr_groups	= pmu->type->attr_groups, -		.task_ctx_nr	= perf_invalid_context, -		.event_init	= uncore_pmu_event_init, -		.add		= uncore_pmu_event_add, -		.del		= uncore_pmu_event_del, -		.start		= uncore_pmu_event_start, -		.stop		= uncore_pmu_event_stop, -		.read		= uncore_pmu_event_read, -	}; +	if (!pmu->type->pmu) { +		pmu->pmu = (struct pmu) { +			.attr_groups	= pmu->type->attr_groups, +			.task_ctx_nr	= perf_invalid_context, +			.event_init	= uncore_pmu_event_init, +			.add		= uncore_pmu_event_add, +			.del		= uncore_pmu_event_del, +			.start		= uncore_pmu_event_start, +			.stop		= uncore_pmu_event_stop, +			.read		= uncore_pmu_event_read, +		}; +	} else { +		pmu->pmu = *pmu->type->pmu; +		pmu->pmu.attr_groups = pmu->type->attr_groups; +	}  	if (pmu->type->num_boxes == 1) {  		if (strlen(pmu->type->name) > 0) @@ -3212,6 +3702,8 @@ static int __init uncore_type_init(struct intel_uncore_type *type)  	if (!pmus)  		return -ENOMEM; +	type->pmus = pmus; +  	type->unconstrainted = (struct event_constraint)  		__EVENT_CONSTRAINT(0, (1ULL << type->num_counters) - 1,  				0, type->num_counters, 0, 0); @@ -3247,7 +3739,6 @@ static int __init uncore_type_init(struct intel_uncore_type *type)  	}  	type->pmu_group = &uncore_pmu_attr_group; -	type->pmus = pmus;  	return 0;  fail:  	uncore_type_exit(type); @@ -3379,6 +3870,28 @@ static int __init uncore_pci_init(void)  		pci_uncores = ivt_pci_uncores;  		uncore_pci_driver = &ivt_uncore_pci_driver;  		break; +	case 42: /* Sandy Bridge */ +		ret = snb_pci2phy_map_init(PCI_DEVICE_ID_INTEL_SNB_IMC); +		if (ret) +			return ret; +		pci_uncores = snb_pci_uncores; +		uncore_pci_driver = &snb_uncore_pci_driver; +		break; +	case 58: /* Ivy Bridge */ +		ret = snb_pci2phy_map_init(PCI_DEVICE_ID_INTEL_IVB_IMC); +		if (ret) +			return ret; +		pci_uncores = snb_pci_uncores; +		uncore_pci_driver = &ivb_uncore_pci_driver; +		break; +	case 60: /* Haswell */ +	case 69: /* Haswell Celeron */ +		ret = snb_pci2phy_map_init(PCI_DEVICE_ID_INTEL_HSW_IMC); +		if (ret) +			return ret; +		pci_uncores = snb_pci_uncores; +		uncore_pci_driver = &hsw_uncore_pci_driver; +		break;  	default:  		return 0;  	} @@ -3650,7 +4163,7 @@ static void __init uncore_cpu_setup(void *dummy)  static int __init uncore_cpu_init(void)  { -	int ret, cpu, max_cores; +	int ret, max_cores;  	max_cores = boot_cpu_data.x86_max_cores;  	switch (boot_cpu_data.x86_model) { @@ -3694,29 +4207,6 @@ static int __init uncore_cpu_init(void)  	if (ret)  		return ret; -	get_online_cpus(); - -	for_each_online_cpu(cpu) { -		int i, phys_id = topology_physical_package_id(cpu); - -		for_each_cpu(i, &uncore_cpu_mask) { -			if (phys_id == topology_physical_package_id(i)) { -				phys_id = -1; -				break; -			} -		} -		if (phys_id < 0) -			continue; - -		uncore_cpu_prepare(cpu, phys_id); -		uncore_event_init_cpu(cpu); -	} -	on_each_cpu(uncore_cpu_setup, NULL, 1); - -	register_cpu_notifier(&uncore_cpu_nb); - -	put_online_cpus(); -  	return 0;  } @@ -3745,6 +4235,41 @@ static int __init uncore_pmus_register(void)  	return 0;  } +static void __init uncore_cpumask_init(void) +{ +	int cpu; + +	/* +	 * ony invoke once from msr or pci init code +	 */ +	if (!cpumask_empty(&uncore_cpu_mask)) +		return; + +	cpu_notifier_register_begin(); + +	for_each_online_cpu(cpu) { +		int i, phys_id = 
topology_physical_package_id(cpu); + +		for_each_cpu(i, &uncore_cpu_mask) { +			if (phys_id == topology_physical_package_id(i)) { +				phys_id = -1; +				break; +			} +		} +		if (phys_id < 0) +			continue; + +		uncore_cpu_prepare(cpu, phys_id); +		uncore_event_init_cpu(cpu); +	} +	on_each_cpu(uncore_cpu_setup, NULL, 1); + +	__register_cpu_notifier(&uncore_cpu_nb); + +	cpu_notifier_register_done(); +} + +  static int __init intel_uncore_init(void)  {  	int ret; @@ -3763,6 +4288,7 @@ static int __init intel_uncore_init(void)  		uncore_pci_exit();  		goto fail;  	} +	uncore_cpumask_init();  	uncore_pmus_register();  	return 0; diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h index a80ab71a883..90236f0c94a 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h @@ -6,6 +6,7 @@  #define UNCORE_PMU_NAME_LEN		32  #define UNCORE_PMU_HRTIMER_INTERVAL	(60LL * NSEC_PER_SEC) +#define UNCORE_SNB_IMC_HRTIMER_INTERVAL (5ULL * NSEC_PER_SEC)  #define UNCORE_FIXED_EVENT		0xff  #define UNCORE_PMC_IDX_MAX_GENERIC	8 @@ -440,6 +441,7 @@ struct intel_uncore_type {  	struct intel_uncore_ops *ops;  	struct uncore_event_desc *event_descs;  	const struct attribute_group *attr_groups[4]; +	struct pmu *pmu; /* for custom pmu ops */  };  #define pmu_group attr_groups[0] @@ -488,8 +490,11 @@ struct intel_uncore_box {  	u64 tags[UNCORE_PMC_IDX_MAX];  	struct pci_dev *pci_dev;  	struct intel_uncore_pmu *pmu; +	u64 hrtimer_duration; /* hrtimer timeout for this box */  	struct hrtimer hrtimer;  	struct list_head list; +	struct list_head active_list; +	void *io_addr;  	struct intel_uncore_extra_reg shared_regs[0];  }; diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index 3486e666035..5d466b7d860 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -1257,7 +1257,24 @@ again:  			pass++;  			goto again;  		} - +		/* +		 * Perf does test runs to see if a whole group can be assigned +		 * together succesfully.  There can be multiple rounds of this. +		 * Unfortunately, p4_pmu_swap_config_ts touches the hwc->config +		 * bits, such that the next round of group assignments will +		 * cause the above p4_should_swap_ts to pass instead of fail. +		 * This leads to counters exclusive to thread0 being used by +		 * thread1. +		 * +		 * Solve this with a cheap hack, reset the idx back to -1 to +		 * force a new lookup (p4_next_cntr) to get the right counter +		 * for the right thread. +		 * +		 * This probably doesn't comply with the general spirit of how +		 * perf wants to work, but P4 is special. :-( +		 */ +		if (p4_should_swap_ts(hwc->config, cpu)) +			hwc->idx = -1;  		p4_pmu_swap_config_ts(hwc, cpu);  		if (assign)  			assign[i] = cntr_idx; @@ -1322,6 +1339,7 @@ static __initconst const struct x86_pmu p4_pmu = {  __init int p4_pmu_init(void)  {  	unsigned int low, high; +	int i, reg;  	/* If we get stripped -- indexing fails */  	BUILD_BUG_ON(ARCH_P4_MAX_CCCR > INTEL_PMC_MAX_GENERIC); @@ -1340,5 +1358,19 @@ __init int p4_pmu_init(void)  	x86_pmu = p4_pmu; +	/* +	 * Even though the counters are configured to interrupt a particular +	 * logical processor when an overflow happens, testing has shown that +	 * on kdump kernels (which uses a single cpu), thread1's counter +	 * continues to run and will report an NMI on thread0.  Due to the +	 * overflow bug, this leads to a stream of unknown NMIs. 
+	 * +	 * Solve this by zero'ing out the registers to mimic a reset. +	 */ +	for (i = 0; i < x86_pmu.num_counters; i++) { +		reg = x86_pmu_config_addr(i); +		wrmsrl_safe(reg, 0ULL); +	} +  	return 0;  } diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c index b1e2fe11532..7c1a0c07b60 100644 --- a/arch/x86/kernel/cpu/perf_event_p6.c +++ b/arch/x86/kernel/cpu/perf_event_p6.c @@ -231,31 +231,49 @@ static __initconst const struct x86_pmu p6_pmu = {  }; +static __init void p6_pmu_rdpmc_quirk(void) +{ +	if (boot_cpu_data.x86_mask < 9) { +		/* +		 * PPro erratum 26; fixed in stepping 9 and above. +		 */ +		pr_warn("Userspace RDPMC support disabled due to a CPU erratum\n"); +		x86_pmu.attr_rdpmc_broken = 1; +		x86_pmu.attr_rdpmc = 0; +	} +} +  __init int p6_pmu_init(void)  { +	x86_pmu = p6_pmu; +  	switch (boot_cpu_data.x86_model) { -	case 1: -	case 3:  /* Pentium Pro */ -	case 5: -	case 6:  /* Pentium II */ -	case 7: -	case 8: -	case 11: /* Pentium III */ -	case 9: -	case 13: -		/* Pentium M */ +	case  1: /* Pentium Pro */ +		x86_add_quirk(p6_pmu_rdpmc_quirk); +		break; + +	case  3: /* Pentium II - Klamath */ +	case  5: /* Pentium II - Deschutes */ +	case  6: /* Pentium II - Mendocino */  		break; + +	case  7: /* Pentium III - Katmai */ +	case  8: /* Pentium III - Coppermine */ +	case 10: /* Pentium III Xeon */ +	case 11: /* Pentium III - Tualatin */ +		break; + +	case  9: /* Pentium M - Banias */ +	case 13: /* Pentium M - Dothan */ +		break; +  	default: -		pr_cont("unsupported p6 CPU model %d ", -			boot_cpu_data.x86_model); +		pr_cont("unsupported p6 CPU model %d ", boot_cpu_data.x86_model);  		return -ENODEV;  	} -	x86_pmu = p6_pmu; -  	memcpy(hw_cache_event_ids, p6_hw_cache_event_ids,  		sizeof(hw_cache_event_ids)); -  	return 0;  } diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index aee6317b902..06fe3ed8b85 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -11,15 +11,12 @@ static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c,  			      unsigned int cpu)  {  #ifdef CONFIG_SMP -	if (c->x86_max_cores * smp_num_siblings > 1) { -		seq_printf(m, "physical id\t: %d\n", c->phys_proc_id); -		seq_printf(m, "siblings\t: %d\n", -			   cpumask_weight(cpu_core_mask(cpu))); -		seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id); -		seq_printf(m, "cpu cores\t: %d\n", c->booted_cores); -		seq_printf(m, "apicid\t\t: %d\n", c->apicid); -		seq_printf(m, "initial apicid\t: %d\n", c->initial_apicid); -	} +	seq_printf(m, "physical id\t: %d\n", c->phys_proc_id); +	seq_printf(m, "siblings\t: %d\n", cpumask_weight(cpu_core_mask(cpu))); +	seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id); +	seq_printf(m, "cpu cores\t: %d\n", c->booted_cores); +	seq_printf(m, "apicid\t\t: %d\n", c->apicid); +	seq_printf(m, "initial apicid\t: %d\n", c->initial_apicid);  #endif  } diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c index 88db010845c..136ac74dee8 100644 --- a/arch/x86/kernel/cpu/rdrand.c +++ b/arch/x86/kernel/cpu/rdrand.c @@ -27,24 +27,11 @@  static int __init x86_rdrand_setup(char *s)  {  	setup_clear_cpu_cap(X86_FEATURE_RDRAND); +	setup_clear_cpu_cap(X86_FEATURE_RDSEED);  	return 1;  }  __setup("nordrand", x86_rdrand_setup); -/* We can't use arch_get_random_long() here since alternatives haven't run */ -static inline int rdrand_long(unsigned long *v) -{ -	int ok; -	asm volatile("1: " RDRAND_LONG "\n\t" -		     "jc 2f\n\t" -		     "decl %0\n\t" -		     "jnz 1b\n\t" -		     "2:" -		     : "=r" 
(ok), "=a" (*v) -		     : "0" (RDRAND_RETRY_LOOPS)); -	return ok; -} -  /*   * Force a reseed cycle; we are architecturally guaranteed a reseed   * after no more than 512 128-bit chunks of random data.  This also diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index f2cc63e9cf0..b6f794aa169 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -1,5 +1,5 @@  /* - *	Routines to indentify additional cpu features that are scattered in + *	Routines to identify additional cpu features that are scattered in   *	cpuid space.   */  #include <linux/cpu.h> diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c index aa0430d69b9..3fa0e5ad86b 100644 --- a/arch/x86/kernel/cpu/transmeta.c +++ b/arch/x86/kernel/cpu/transmeta.c @@ -1,6 +1,5 @@  #include <linux/kernel.h>  #include <linux/mm.h> -#include <linux/init.h>  #include <asm/processor.h>  #include <asm/msr.h>  #include "cpu.h" diff --git a/arch/x86/kernel/cpu/umc.c b/arch/x86/kernel/cpu/umc.c index 202759a1412..ef9c2a0078b 100644 --- a/arch/x86/kernel/cpu/umc.c +++ b/arch/x86/kernel/cpu/umc.c @@ -1,5 +1,4 @@  #include <linux/kernel.h> -#include <linux/init.h>  #include <asm/processor.h>  #include "cpu.h" @@ -11,8 +10,8 @@  static const struct cpu_dev umc_cpu_dev = {  	.c_vendor	= "UMC",  	.c_ident	= { "UMC UMC UMC" }, -	.c_models = { -		{ .vendor = X86_VENDOR_UMC, .family = 4, .model_names = +	.legacy_models	= { +		{ .family = 4, .model_names =  		  {  			  [1] = "U5D",  			  [2] = "U5S", diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 7d9481c743f..3225ae6c518 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -198,14 +198,15 @@ static int __init cpuid_init(void)  		goto out_chrdev;  	}  	cpuid_class->devnode = cpuid_devnode; -	get_online_cpus(); + +	cpu_notifier_register_begin();  	for_each_online_cpu(i) {  		err = cpuid_device_create(i);  		if (err != 0)  			goto out_class;  	} -	register_hotcpu_notifier(&cpuid_class_cpu_notifier); -	put_online_cpus(); +	__register_hotcpu_notifier(&cpuid_class_cpu_notifier); +	cpu_notifier_register_done();  	err = 0;  	goto out; @@ -215,7 +216,7 @@ out_class:  	for_each_online_cpu(i) {  		cpuid_device_destroy(i);  	} -	put_online_cpus(); +	cpu_notifier_register_done();  	class_destroy(cpuid_class);  out_chrdev:  	__unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid"); @@ -227,13 +228,13 @@ static void __exit cpuid_exit(void)  {  	int cpu = 0; -	get_online_cpus(); +	cpu_notifier_register_begin();  	for_each_online_cpu(cpu)  		cpuid_device_destroy(cpu);  	class_destroy(cpuid_class);  	__unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid"); -	unregister_hotcpu_notifier(&cpuid_class_cpu_notifier); -	put_online_cpus(); +	__unregister_hotcpu_notifier(&cpuid_class_cpu_notifier); +	cpu_notifier_register_done();  }  module_init(cpuid_init); diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index e0e0841eef4..507de806659 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -7,7 +7,6 @@   *   */ -#include <linux/init.h>  #include <linux/types.h>  #include <linux/kernel.h>  #include <linux/smp.h> @@ -58,9 +57,7 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs)  {  #ifdef CONFIG_X86_32  	struct pt_regs fixed_regs; -#endif -#ifdef CONFIG_X86_32  	if (!user_mode_vm(regs)) {  		crash_fixup_ss_esp(&fixed_regs, regs);  		regs = &fixed_regs; @@ -127,12 +124,12 @@ void native_machine_crash_shutdown(struct pt_regs *regs)  	cpu_emergency_vmxoff();  	
cpu_emergency_svm_disable(); -	lapic_shutdown();  #ifdef CONFIG_X86_IO_APIC  	/* Prevent crash_kexec() from deadlocking on ioapic_lock. */  	ioapic_zap_locks();  	disable_IO_APIC();  #endif +	lapic_shutdown();  #ifdef CONFIG_HPET_TIMER  	hpet_disable();  #endif diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 376dc787344..7db54b5d5f8 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -20,22 +20,13 @@  #include <asm/hpet.h>  #include <asm/apic.h>  #include <asm/pci_x86.h> +#include <asm/setup.h>  __initdata u64 initial_dtb;  char __initdata cmd_line[COMMAND_LINE_SIZE];  int __initdata of_ioapic; -unsigned long pci_address_to_pio(phys_addr_t address) -{ -	/* -	 * The ioport address can be directly used by inX / outX -	 */ -	BUG_ON(address >= (1 << 16)); -	return (unsigned long)address; -} -EXPORT_SYMBOL_GPL(pci_address_to_pio); -  void __init early_init_dt_scan_chosen_arch(unsigned long node)  {  	BUG(); @@ -51,15 +42,6 @@ void * __init early_init_dt_alloc_memory_arch(u64 size, u64 align)  	return __alloc_bootmem(size, align, __pa(MAX_DMA_ADDRESS));  } -#ifdef CONFIG_BLK_DEV_INITRD -void __init early_init_dt_setup_initrd_arch(u64 start, u64 end) -{ -	initrd_start = (unsigned long)__va(start); -	initrd_end = (unsigned long)__va(end); -	initrd_below_start_ok = 1; -} -#endif -  void __init add_dtb(u64 data)  {  	initial_dtb = data + offsetof(struct setup_data, data); @@ -105,7 +87,6 @@ struct device_node *pcibios_get_phb_of_node(struct pci_bus *bus)  static int x86_of_pci_irq_enable(struct pci_dev *dev)  { -	struct of_irq oirq;  	u32 virq;  	int ret;  	u8 pin; @@ -116,12 +97,7 @@ static int x86_of_pci_irq_enable(struct pci_dev *dev)  	if (!pin)  		return 0; -	ret = of_irq_map_pci(dev, &oirq); -	if (ret) -		return ret; - -	virq = irq_create_of_mapping(oirq.controller, oirq.specifier, -			oirq.size); +	virq = of_irq_parse_and_map_pci(dev, 0, 0);  	if (virq == 0)  		return -EINVAL;  	dev->irq = virq; @@ -230,32 +206,23 @@ static void __init dtb_apic_setup(void)  static void __init x86_flattree_get_config(void)  {  	u32 size, map_len; -	void *new_dtb; +	void *dt;  	if (!initial_dtb)  		return; -	map_len = max(PAGE_SIZE - (initial_dtb & ~PAGE_MASK), -			(u64)sizeof(struct boot_param_header)); +	map_len = max(PAGE_SIZE - (initial_dtb & ~PAGE_MASK), (u64)128); -	initial_boot_params = early_memremap(initial_dtb, map_len); -	size = be32_to_cpu(initial_boot_params->totalsize); +	initial_boot_params = dt = early_memremap(initial_dtb, map_len); +	size = of_get_flat_dt_size();  	if (map_len < size) { -		early_iounmap(initial_boot_params, map_len); -		initial_boot_params = early_memremap(initial_dtb, size); +		early_iounmap(dt, map_len); +		initial_boot_params = dt = early_memremap(initial_dtb, size);  		map_len = size;  	} -	new_dtb = alloc_bootmem(size); -	memcpy(new_dtb, initial_boot_params, size); -	early_iounmap(initial_boot_params, map_len); - -	initial_boot_params = new_dtb; - -	/* root level address cells */ -	of_scan_flat_dt(early_init_dt_scan_root, NULL); - -	unflatten_device_tree(); +	unflatten_and_copy_device_tree(); +	early_iounmap(dt, map_len);  }  #else  static inline void x86_flattree_get_config(void) { } diff --git a/arch/x86/kernel/doublefault.c b/arch/x86/kernel/doublefault.c index 5d3fe8d36e4..f6dfd9334b6 100644 --- a/arch/x86/kernel/doublefault.c +++ b/arch/x86/kernel/doublefault.c @@ -1,6 +1,5 @@  #include <linux/mm.h>  #include <linux/sched.h> -#include <linux/init.h>  #include <linux/init_task.h>  #include <linux/fs.h> diff 
--git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index deb6421c9e6..b74ebc7c440 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -25,12 +25,17 @@ unsigned int code_bytes = 64;  int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;  static int die_counter; -void printk_address(unsigned long address, int reliable) +static void printk_stack_address(unsigned long address, int reliable)  {  	pr_cont(" [<%p>] %s%pB\n",  		(void *)address, reliable ? "" : "? ", (void *)address);  } +void printk_address(unsigned long address) +{ +	pr_cont(" [<%p>] %pS\n", (void *)address, (void *)address); +} +  #ifdef CONFIG_FUNCTION_GRAPH_TRACER  static void  print_ftrace_graph_addr(unsigned long addr, void *data, @@ -151,7 +156,7 @@ static void print_trace_address(void *data, unsigned long addr, int reliable)  {  	touch_nmi_watchdog();  	printk(data); -	printk_address(addr, reliable); +	printk_stack_address(addr, reliable);  }  static const struct stacktrace_ops print_trace_ops = { @@ -195,7 +200,7 @@ static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED;  static int die_owner = -1;  static unsigned int die_nest_count; -unsigned __kprobes long oops_begin(void) +unsigned long oops_begin(void)  {  	int cpu;  	unsigned long flags; @@ -218,8 +223,9 @@ unsigned __kprobes long oops_begin(void)  	return flags;  }  EXPORT_SYMBOL_GPL(oops_begin); +NOKPROBE_SYMBOL(oops_begin); -void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) +void oops_end(unsigned long flags, struct pt_regs *regs, int signr)  {  	if (regs && kexec_should_crash(current))  		crash_kexec(regs); @@ -242,8 +248,9 @@ void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)  		panic("Fatal exception");  	do_exit(signr);  } +NOKPROBE_SYMBOL(oops_end); -int __kprobes __die(const char *str, struct pt_regs *regs, long err) +int __die(const char *str, struct pt_regs *regs, long err)  {  #ifdef CONFIG_X86_32  	unsigned short ss; @@ -281,11 +288,12 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err)  #else  	/* Executive summary in case the oops scrolled away */  	printk(KERN_ALERT "RIP "); -	printk_address(regs->ip, 1); +	printk_address(regs->ip);  	printk(" RSP <%016lx>\n", regs->sp);  #endif  	return 0;  } +NOKPROBE_SYMBOL(__die);  /*   * This is gone through when something in the kernel has done something bad diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index f2a1770ca17..5abd4cd4230 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -16,12 +16,35 @@  #include <asm/stacktrace.h> +static void *is_irq_stack(void *p, void *irq) +{ +	if (p < irq || p >= (irq + THREAD_SIZE)) +		return NULL; +	return irq + THREAD_SIZE; +} + + +static void *is_hardirq_stack(unsigned long *stack, int cpu) +{ +	void *irq = per_cpu(hardirq_stack, cpu); + +	return is_irq_stack(stack, irq); +} + +static void *is_softirq_stack(unsigned long *stack, int cpu) +{ +	void *irq = per_cpu(softirq_stack, cpu); + +	return is_irq_stack(stack, irq); +}  void dump_trace(struct task_struct *task, struct pt_regs *regs,  		unsigned long *stack, unsigned long bp,  		const struct stacktrace_ops *ops, void *data)  { +	const unsigned cpu = get_cpu();  	int graph = 0; +	u32 *prev_esp;  	if (!task)  		task = current; @@ -30,7 +53,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,  		unsigned long dummy;  		stack = &dummy; -		if (task && task != current) +		if (task != current)  			stack = (unsigned long 
*)task->thread.sp;  	} @@ -39,18 +62,31 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,  	for (;;) {  		struct thread_info *context; +		void *end_stack; + +		end_stack = is_hardirq_stack(stack, cpu); +		if (!end_stack) +			end_stack = is_softirq_stack(stack, cpu); -		context = (struct thread_info *) -			((unsigned long)stack & (~(THREAD_SIZE - 1))); -		bp = ops->walk_stack(context, stack, bp, ops, data, NULL, &graph); +		context = task_thread_info(task); +		bp = ops->walk_stack(context, stack, bp, ops, data, +				     end_stack, &graph); -		stack = (unsigned long *)context->previous_esp; +		/* Stop if not on irq stack */ +		if (!end_stack) +			break; + +		/* The previous esp is saved on the bottom of the stack */ +		prev_esp = (u32 *)(end_stack - THREAD_SIZE); +		stack = (unsigned long *)*prev_esp;  		if (!stack)  			break; +  		if (ops->stack(data, "IRQ") < 0)  			break;  		touch_nmi_watchdog();  	} +	put_cpu();  }  EXPORT_SYMBOL(dump_trace); diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index addb207dab9..1abcb50b48a 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -104,6 +104,44 @@ in_irq_stack(unsigned long *stack, unsigned long *irq_stack,  	return (stack >= irq_stack && stack < irq_stack_end);  } +static const unsigned long irq_stack_size = +	(IRQ_STACK_SIZE - 64) / sizeof(unsigned long); + +enum stack_type { +	STACK_IS_UNKNOWN, +	STACK_IS_NORMAL, +	STACK_IS_EXCEPTION, +	STACK_IS_IRQ, +}; + +static enum stack_type +analyze_stack(int cpu, struct task_struct *task, unsigned long *stack, +	      unsigned long **stack_end, unsigned long *irq_stack, +	      unsigned *used, char **id) +{ +	unsigned long addr; + +	addr = ((unsigned long)stack & (~(THREAD_SIZE - 1))); +	if ((unsigned long)task_stack_page(task) == addr) +		return STACK_IS_NORMAL; + +	*stack_end = in_exception_stack(cpu, (unsigned long)stack, +					used, id); +	if (*stack_end) +		return STACK_IS_EXCEPTION; + +	if (!irq_stack) +		return STACK_IS_NORMAL; + +	*stack_end = irq_stack; +	irq_stack = irq_stack - irq_stack_size; + +	if (in_irq_stack(stack, irq_stack, *stack_end)) +		return STACK_IS_IRQ; + +	return STACK_IS_UNKNOWN; +} +  /*   * x86-64 can have up to three kernel stacks:   * process stack @@ -116,12 +154,12 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,  		const struct stacktrace_ops *ops, void *data)  {  	const unsigned cpu = get_cpu(); -	unsigned long *irq_stack_end = -		(unsigned long *)per_cpu(irq_stack_ptr, cpu); -	unsigned used = 0;  	struct thread_info *tinfo; -	int graph = 0; +	unsigned long *irq_stack = (unsigned long *)per_cpu(irq_stack_ptr, cpu);  	unsigned long dummy; +	unsigned used = 0; +	int graph = 0; +	int done = 0;  	if (!task)  		task = current; @@ -143,49 +181,61 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,  	 * exceptions  	 */  	tinfo = task_thread_info(task); -	for (;;) { +	while (!done) { +		unsigned long *stack_end; +		enum stack_type stype;  		char *id; -		unsigned long *estack_end; -		estack_end = in_exception_stack(cpu, (unsigned long)stack, -						&used, &id); -		if (estack_end) { +		stype = analyze_stack(cpu, task, stack, &stack_end, +				      irq_stack, &used, &id); + +		/* Default finish unless specified to continue */ +		done = 1; + +		switch (stype) { + +		/* Break out early if we are on the thread stack */ +		case STACK_IS_NORMAL: +			break; + +		case STACK_IS_EXCEPTION: +  			if (ops->stack(data, id) < 0)  				break;  			bp = ops->walk_stack(tinfo, stack, 
bp, ops, -					     data, estack_end, &graph); +					     data, stack_end, &graph);  			ops->stack(data, "<EOE>");  			/*  			 * We link to the next stack via the  			 * second-to-last pointer (index -2 to end) in the  			 * exception stack:  			 */ -			stack = (unsigned long *) estack_end[-2]; -			continue; -		} -		if (irq_stack_end) { -			unsigned long *irq_stack; -			irq_stack = irq_stack_end - -				(IRQ_STACK_SIZE - 64) / sizeof(*irq_stack); - -			if (in_irq_stack(stack, irq_stack, irq_stack_end)) { -				if (ops->stack(data, "IRQ") < 0) -					break; -				bp = ops->walk_stack(tinfo, stack, bp, -					ops, data, irq_stack_end, &graph); -				/* -				 * We link to the next stack (which would be -				 * the process stack normally) the last -				 * pointer (index -1 to end) in the IRQ stack: -				 */ -				stack = (unsigned long *) (irq_stack_end[-1]); -				irq_stack_end = NULL; -				ops->stack(data, "EOI"); -				continue; -			} +			stack = (unsigned long *) stack_end[-2]; +			done = 0; +			break; + +		case STACK_IS_IRQ: + +			if (ops->stack(data, "IRQ") < 0) +				break; +			bp = ops->walk_stack(tinfo, stack, bp, +				     ops, data, stack_end, &graph); +			/* +			 * We link to the next stack (which would be +			 * the process stack normally) the last +			 * pointer (index -1 to end) in the IRQ stack: +			 */ +			stack = (unsigned long *) (stack_end[-1]); +			irq_stack = NULL; +			ops->stack(data, "EOI"); +			done = 0; +			break; + +		case STACK_IS_UNKNOWN: +			ops->stack(data, "UNK"); +			break;  		} -		break;  	}  	/* diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 174da5fc5a7..988c00a1f60 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1120,7 +1120,7 @@ void __init memblock_find_dma_reserve(void)  		nr_pages += end_pfn - start_pfn;  	} -	for_each_free_mem_range(u, MAX_NUMNODES, &start, &end, NULL) { +	for_each_free_mem_range(u, NUMA_NO_NODE, &start, &end, NULL) {  		start_pfn = min_t(unsigned long, PFN_UP(start), MAX_DMA_PFN);  		end_pfn = min_t(unsigned long, PFN_DOWN(end), MAX_DMA_PFN);  		if (start_pfn < end_pfn) diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index b3cd3ebae07..2e1a6853e00 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -17,6 +17,7 @@  #include <asm/dma.h>  #include <asm/io_apic.h>  #include <asm/apic.h> +#include <asm/hpet.h>  #include <asm/iommu.h>  #include <asm/gart.h>  #include <asm/irq_remapping.h> @@ -203,18 +204,15 @@ static void __init intel_remapping_check(int num, int slot, int func)  	revision = read_pci_config_byte(num, slot, func, PCI_REVISION_ID);  	/* - 	 * Revision 13 of all triggering devices id in this quirk have -	 * a problem draining interrupts when irq remapping is enabled, -	 * and should be flagged as broken.  Additionally revisions 0x12 -	 * and 0x22 of device id 0x3405 has this problem. +	 * Revision <= 13 of all triggering devices id in this quirk +	 * have a problem draining interrupts when irq remapping is +	 * enabled, and should be flagged as broken. Additionally +	 * revision 0x22 of device id 0x3405 has this problem.  	 
*/ -	if (revision == 0x13) +	if (revision <= 0x13)  		set_irq_remapping_broken(); -	else if ((device == 0x3405) && -	    ((revision == 0x12) || -	     (revision == 0x22))) +	else if (device == 0x3405 && revision == 0x22)  		set_irq_remapping_broken(); -  }  /* @@ -228,7 +226,7 @@ static void __init intel_remapping_check(int num, int slot, int func)   *   * And yes, so far on current devices the base addr is always under 4G.   */ -static u32 __init intel_stolen_base(int num, int slot, int func) +static u32 __init intel_stolen_base(int num, int slot, int func, size_t stolen_size)  {  	u32 base; @@ -243,10 +241,118 @@ static u32 __init intel_stolen_base(int num, int slot, int func)  	return base;  } -#define KB(x)	((x) * 1024) +#define KB(x)	((x) * 1024UL)  #define MB(x)	(KB (KB (x)))  #define GB(x)	(MB (KB (x))) +static size_t __init i830_tseg_size(void) +{ +	u8 tmp = read_pci_config_byte(0, 0, 0, I830_ESMRAMC); + +	if (!(tmp & TSEG_ENABLE)) +		return 0; + +	if (tmp & I830_TSEG_SIZE_1M) +		return MB(1); +	else +		return KB(512); +} + +static size_t __init i845_tseg_size(void) +{ +	u8 tmp = read_pci_config_byte(0, 0, 0, I845_ESMRAMC); + +	if (!(tmp & TSEG_ENABLE)) +		return 0; + +	switch (tmp & I845_TSEG_SIZE_MASK) { +	case I845_TSEG_SIZE_512K: +		return KB(512); +	case I845_TSEG_SIZE_1M: +		return MB(1); +	default: +		WARN_ON(1); +		return 0; +	} +} + +static size_t __init i85x_tseg_size(void) +{ +	u8 tmp = read_pci_config_byte(0, 0, 0, I85X_ESMRAMC); + +	if (!(tmp & TSEG_ENABLE)) +		return 0; + +	return MB(1); +} + +static size_t __init i830_mem_size(void) +{ +	return read_pci_config_byte(0, 0, 0, I830_DRB3) * MB(32); +} + +static size_t __init i85x_mem_size(void) +{ +	return read_pci_config_byte(0, 0, 1, I85X_DRB3) * MB(32); +} + +/* + * On 830/845/85x the stolen memory base isn't available in any + * register. We need to calculate it as TOM-TSEG_SIZE-stolen_size. + */ +static u32 __init i830_stolen_base(int num, int slot, int func, size_t stolen_size) +{ +	return i830_mem_size() - i830_tseg_size() - stolen_size; +} + +static u32 __init i845_stolen_base(int num, int slot, int func, size_t stolen_size) +{ +	return i830_mem_size() - i845_tseg_size() - stolen_size; +} + +static u32 __init i85x_stolen_base(int num, int slot, int func, size_t stolen_size) +{ +	return i85x_mem_size() - i85x_tseg_size() - stolen_size; +} + +static u32 __init i865_stolen_base(int num, int slot, int func, size_t stolen_size) +{ +	/* +	 * FIXME is the graphics stolen memory region +	 * always at TOUD? Ie. is it always the last +	 * one to be allocated by the BIOS? 
+	 */ +	return read_pci_config_16(0, 0, 0, I865_TOUD) << 16; +} + +static size_t __init i830_stolen_size(int num, int slot, int func) +{ +	size_t stolen_size; +	u16 gmch_ctrl; + +	gmch_ctrl = read_pci_config_16(0, 0, 0, I830_GMCH_CTRL); + +	switch (gmch_ctrl & I830_GMCH_GMS_MASK) { +	case I830_GMCH_GMS_STOLEN_512: +		stolen_size = KB(512); +		break; +	case I830_GMCH_GMS_STOLEN_1024: +		stolen_size = MB(1); +		break; +	case I830_GMCH_GMS_STOLEN_8192: +		stolen_size = MB(8); +		break; +	case I830_GMCH_GMS_LOCAL: +		/* local memory isn't part of the normal address space */ +		stolen_size = 0; +		break; +	default: +		return 0; +	} + +	return stolen_size; +} +  static size_t __init gen3_stolen_size(int num, int slot, int func)  {  	size_t stolen_size; @@ -313,29 +419,110 @@ static size_t __init gen6_stolen_size(int num, int slot, int func)  	return gmch_ctrl << 25; /* 32 MB units */  } -typedef size_t (*stolen_size_fn)(int num, int slot, int func); - -static struct pci_device_id intel_stolen_ids[] __initdata = { -	INTEL_I915G_IDS(gen3_stolen_size), -	INTEL_I915GM_IDS(gen3_stolen_size), -	INTEL_I945G_IDS(gen3_stolen_size), -	INTEL_I945GM_IDS(gen3_stolen_size), -	INTEL_VLV_M_IDS(gen3_stolen_size), -	INTEL_VLV_D_IDS(gen3_stolen_size), -	INTEL_PINEVIEW_IDS(gen3_stolen_size), -	INTEL_I965G_IDS(gen3_stolen_size), -	INTEL_G33_IDS(gen3_stolen_size), -	INTEL_I965GM_IDS(gen3_stolen_size), -	INTEL_GM45_IDS(gen3_stolen_size), -	INTEL_G45_IDS(gen3_stolen_size), -	INTEL_IRONLAKE_D_IDS(gen3_stolen_size), -	INTEL_IRONLAKE_M_IDS(gen3_stolen_size), -	INTEL_SNB_D_IDS(gen6_stolen_size), -	INTEL_SNB_M_IDS(gen6_stolen_size), -	INTEL_IVB_M_IDS(gen6_stolen_size), -	INTEL_IVB_D_IDS(gen6_stolen_size), -	INTEL_HSW_D_IDS(gen6_stolen_size), -	INTEL_HSW_M_IDS(gen6_stolen_size), +static size_t __init gen8_stolen_size(int num, int slot, int func) +{ +	u16 gmch_ctrl; + +	gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL); +	gmch_ctrl >>= BDW_GMCH_GMS_SHIFT; +	gmch_ctrl &= BDW_GMCH_GMS_MASK; +	return gmch_ctrl << 25; /* 32 MB units */ +} + +static size_t __init chv_stolen_size(int num, int slot, int func) +{ +	u16 gmch_ctrl; + +	gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL); +	gmch_ctrl >>= SNB_GMCH_GMS_SHIFT; +	gmch_ctrl &= SNB_GMCH_GMS_MASK; + +	/* +	 * 0x0  to 0x10: 32MB increments starting at 0MB +	 * 0x11 to 0x16: 4MB increments starting at 8MB +	 * 0x17 to 0x1d: 4MB increments start at 36MB +	 */ +	if (gmch_ctrl < 0x11) +		return gmch_ctrl << 25; +	else if (gmch_ctrl < 0x17) +		return (gmch_ctrl - 0x11 + 2) << 22; +	else +		return (gmch_ctrl - 0x17 + 9) << 22; +} + +struct intel_stolen_funcs { +	size_t (*size)(int num, int slot, int func); +	u32 (*base)(int num, int slot, int func, size_t size); +}; + +static const struct intel_stolen_funcs i830_stolen_funcs __initconst = { +	.base = i830_stolen_base, +	.size = i830_stolen_size, +}; + +static const struct intel_stolen_funcs i845_stolen_funcs __initconst = { +	.base = i845_stolen_base, +	.size = i830_stolen_size, +}; + +static const struct intel_stolen_funcs i85x_stolen_funcs __initconst = { +	.base = i85x_stolen_base, +	.size = gen3_stolen_size, +}; + +static const struct intel_stolen_funcs i865_stolen_funcs __initconst = { +	.base = i865_stolen_base, +	.size = gen3_stolen_size, +}; + +static const struct intel_stolen_funcs gen3_stolen_funcs __initconst = { +	.base = intel_stolen_base, +	.size = gen3_stolen_size, +}; + +static const struct intel_stolen_funcs gen6_stolen_funcs __initconst = { +	.base = intel_stolen_base, +	.size = 
gen6_stolen_size, +}; + +static const struct intel_stolen_funcs gen8_stolen_funcs __initconst = { +	.base = intel_stolen_base, +	.size = gen8_stolen_size, +}; + +static const struct intel_stolen_funcs chv_stolen_funcs __initconst = { +	.base = intel_stolen_base, +	.size = chv_stolen_size, +}; + +static const struct pci_device_id intel_stolen_ids[] __initconst = { +	INTEL_I830_IDS(&i830_stolen_funcs), +	INTEL_I845G_IDS(&i845_stolen_funcs), +	INTEL_I85X_IDS(&i85x_stolen_funcs), +	INTEL_I865G_IDS(&i865_stolen_funcs), +	INTEL_I915G_IDS(&gen3_stolen_funcs), +	INTEL_I915GM_IDS(&gen3_stolen_funcs), +	INTEL_I945G_IDS(&gen3_stolen_funcs), +	INTEL_I945GM_IDS(&gen3_stolen_funcs), +	INTEL_VLV_M_IDS(&gen6_stolen_funcs), +	INTEL_VLV_D_IDS(&gen6_stolen_funcs), +	INTEL_PINEVIEW_IDS(&gen3_stolen_funcs), +	INTEL_I965G_IDS(&gen3_stolen_funcs), +	INTEL_G33_IDS(&gen3_stolen_funcs), +	INTEL_I965GM_IDS(&gen3_stolen_funcs), +	INTEL_GM45_IDS(&gen3_stolen_funcs), +	INTEL_G45_IDS(&gen3_stolen_funcs), +	INTEL_IRONLAKE_D_IDS(&gen3_stolen_funcs), +	INTEL_IRONLAKE_M_IDS(&gen3_stolen_funcs), +	INTEL_SNB_D_IDS(&gen6_stolen_funcs), +	INTEL_SNB_M_IDS(&gen6_stolen_funcs), +	INTEL_IVB_M_IDS(&gen6_stolen_funcs), +	INTEL_IVB_D_IDS(&gen6_stolen_funcs), +	INTEL_HSW_D_IDS(&gen6_stolen_funcs), +	INTEL_HSW_M_IDS(&gen6_stolen_funcs), +	INTEL_BDW_M_IDS(&gen8_stolen_funcs), +	INTEL_BDW_D_IDS(&gen8_stolen_funcs), +	INTEL_CHV_IDS(&chv_stolen_funcs),  };  static void __init intel_graphics_stolen(int num, int slot, int func) @@ -352,11 +539,13 @@ static void __init intel_graphics_stolen(int num, int slot, int func)  	for (i = 0; i < ARRAY_SIZE(intel_stolen_ids); i++) {  		if (intel_stolen_ids[i].device == device) { -			stolen_size_fn stolen_size = -				(stolen_size_fn)intel_stolen_ids[i].driver_data; -			size = stolen_size(num, slot, func); -			start = intel_stolen_base(num, slot, func); +			const struct intel_stolen_funcs *stolen_funcs = +				(const struct intel_stolen_funcs *)intel_stolen_ids[i].driver_data; +			size = stolen_funcs->size(num, slot, func); +			start = stolen_funcs->base(num, slot, func, size);  			if (size && start) { +				printk(KERN_INFO "Reserving Intel graphics stolen memory at 0x%x-0x%x\n", +				       start, start + (u32)size - 1);  				/* Mark this space as reserved */  				e820_add_region(start, size, E820_RESERVED);  				sanitize_e820_map(e820.map, @@ -368,6 +557,15 @@ static void __init intel_graphics_stolen(int num, int slot, int func)  	}  } +static void __init force_disable_hpet(int num, int slot, int func) +{ +#ifdef CONFIG_HPET_TIMER +	boot_hpet_disable = 1; +	pr_info("x86/hpet: Will disable the HPET for this platform because it's not reliable\n"); +#endif +} + +  #define QFLAG_APPLY_ONCE 	0x1  #define QFLAG_APPLIED		0x2  #define QFLAG_DONE		(QFLAG_APPLY_ONCE|QFLAG_APPLIED) @@ -405,6 +603,12 @@ static struct chipset early_qrk[] __initdata = {  	  PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check },  	{ PCI_VENDOR_ID_INTEL, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA, PCI_ANY_ID,  	  QFLAG_APPLY_ONCE, intel_graphics_stolen }, +	/* +	 * HPET on current version of Baytrail platform has accuracy +	 * problems, disable it for now: +	 */ +	{ PCI_VENDOR_ID_INTEL, 0x0f00, +		PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet},  	{}  }; diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index d15f575a861..01d1c187c9f 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -14,9 +14,11 @@  #include <xen/hvc-console.h>  #include <asm/pci-direct.h>  #include 
<asm/fixmap.h> -#include <asm/mrst.h> +#include <asm/intel-mid.h>  #include <asm/pgtable.h>  #include <linux/usb/ehci_def.h> +#include <linux/efi.h> +#include <asm/efi.h>  /* Simple VGA output */  #define VGABASE		(__ISA_IO_base + 0xb8000) @@ -234,6 +236,11 @@ static int __init setup_early_printk(char *buf)  			early_console_register(&early_hsu_console, keep);  		}  #endif +#ifdef CONFIG_EARLY_PRINTK_EFI +		if (!strncmp(buf, "efi", 3)) +			early_console_register(&early_efi_console, keep); +#endif +  		buf++;  	}  	return 0; diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index f0dcb0ceb6a..0d0c9d4ab6d 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -315,10 +315,6 @@ ENTRY(ret_from_kernel_thread)  ENDPROC(ret_from_kernel_thread)  /* - * Interrupt exit functions should be protected against kprobes - */ -	.pushsection .kprobes.text, "ax" -/*   * Return to user mode is not as complex as all this looks,   * but we want the default path for a system call return to   * go as quickly as possible which is why some of this is @@ -362,12 +358,9 @@ END(ret_from_exception)  #ifdef CONFIG_PREEMPT  ENTRY(resume_kernel)  	DISABLE_INTERRUPTS(CLBR_ANY) -	cmpl $0,TI_preempt_count(%ebp)	# non-zero preempt_count ? -	jnz restore_all  need_resched: -	movl TI_flags(%ebp), %ecx	# need_resched set ? -	testb $_TIF_NEED_RESCHED, %cl -	jz restore_all +	cmpl $0,PER_CPU_VAR(__preempt_count) +	jnz restore_all  	testl $X86_EFLAGS_IF,PT_EFLAGS(%esp)	# interrupts off (exception path) ?  	jz restore_all  	call preempt_schedule_irq @@ -375,10 +368,6 @@ need_resched:  END(resume_kernel)  #endif  	CFI_ENDPROC -/* - * End of kprobes section - */ -	.popsection  /* SYSENTER_RETURN points to after the "sysenter" instruction in     the vsyscall page.  See vsyscall-sysentry.S, which defines the symbol.  */ @@ -434,8 +423,9 @@ sysenter_past_esp:  	jnz sysenter_audit  sysenter_do_call:  	cmpl $(NR_syscalls), %eax -	jae syscall_badsys +	jae sysenter_badsys  	call *sys_call_table(,%eax,4) +sysenter_after_call:  	movl %eax,PT_EAX(%esp)  	LOCKDEP_SYS_EXIT  	DISABLE_INTERRUPTS(CLBR_ANY) @@ -498,10 +488,6 @@ sysexit_audit:  	PTGS_TO_GS_EX  ENDPROC(ia32_sysenter_target) -/* - * syscall stub including irq exit should be protected against kprobes - */ -	.pushsection .kprobes.text, "ax"  	# system call handler stub  ENTRY(system_call)  	RING0_INT_FRAME			# can't unwind into user space anyway @@ -516,6 +502,7 @@ ENTRY(system_call)  	jae syscall_badsys  syscall_call:  	call *sys_call_table(,%eax,4) +syscall_after_call:  	movl %eax,PT_EAX(%esp)		# store the return value  syscall_exit:  	LOCKDEP_SYS_EXIT @@ -530,6 +517,7 @@ syscall_exit:  restore_all:  	TRACE_IRQS_IRET  restore_all_notrace: +#ifdef CONFIG_X86_ESPFIX32  	movl PT_EFLAGS(%esp), %eax	# mix EFLAGS, SS and CS  	# Warning: PT_OLDSS(%esp) contains the wrong/random values if we  	# are returning to the kernel. @@ -540,6 +528,7 @@ restore_all_notrace:  	cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax  	CFI_REMEMBER_STATE  	je ldt_ss			# returning to user-space with LDT SS +#endif  restore_nocheck:  	RESTORE_REGS 4			# skip orig_eax/error_code  irq_return: @@ -552,13 +541,9 @@ ENTRY(iret_exc)  .previous  	_ASM_EXTABLE(irq_return,iret_exc) +#ifdef CONFIG_X86_ESPFIX32  	CFI_RESTORE_STATE  ldt_ss: -	larl PT_OLDSS(%esp), %eax -	jnz restore_nocheck -	testl $0x00400000, %eax		# returning to 32bit stack? 
-	jnz restore_nocheck		# allright, normal return -  #ifdef CONFIG_PARAVIRT  	/*  	 * The kernel can't run on a non-flat stack if paravirt mode @@ -600,6 +585,7 @@ ldt_ss:  	lss (%esp), %esp		/* switch to espfix segment */  	CFI_ADJUST_CFA_OFFSET -8  	jmp restore_nocheck +#endif  	CFI_ENDPROC  ENDPROC(system_call) @@ -690,14 +676,15 @@ syscall_fault:  END(syscall_fault)  syscall_badsys: -	movl $-ENOSYS,PT_EAX(%esp) -	jmp resume_userspace +	movl $-ENOSYS,%eax +	jmp syscall_after_call +END(syscall_badsys) + +sysenter_badsys: +	movl $-ENOSYS,%eax +	jmp sysenter_after_call  END(syscall_badsys)  	CFI_ENDPROC -/* - * End of kprobes section - */ -	.popsection  .macro FIXUP_ESPFIX_STACK  /* @@ -707,6 +694,7 @@ END(syscall_badsys)   * the high word of the segment base from the GDT and swiches to the   * normal stack and adjusts ESP with the matching offset.   */ +#ifdef CONFIG_X86_ESPFIX32  	/* fixup the stack */  	mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */  	mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */ @@ -716,8 +704,10 @@ END(syscall_badsys)  	pushl_cfi %eax  	lss (%esp), %esp		/* switch to the normal stack segment */  	CFI_ADJUST_CFA_OFFSET -8 +#endif  .endm  .macro UNWIND_ESPFIX_STACK +#ifdef CONFIG_X86_ESPFIX32  	movl %ss, %eax  	/* see if on espfix stack */  	cmpw $__ESPFIX_SS, %ax @@ -728,6 +718,7 @@ END(syscall_badsys)  	/* switch to normal stack */  	FIXUP_ESPFIX_STACK  27: +#endif  .endm  /* @@ -784,10 +775,6 @@ common_interrupt:  ENDPROC(common_interrupt)  	CFI_ENDPROC -/* - *  Irq entries should be protected against kprobes - */ -	.pushsection .kprobes.text, "ax"  #define BUILD_INTERRUPT3(name, nr, fn)	\  ENTRY(name)				\  	RING0_INT_FRAME;		\ @@ -964,10 +951,6 @@ ENTRY(spurious_interrupt_bug)  	jmp error_code  	CFI_ENDPROC  END(spurious_interrupt_bug) -/* - * End of kprobes section - */ -	.popsection  #ifdef CONFIG_XEN  /* Xen doesn't set %esp to be precisely what the normal sysenter @@ -1085,7 +1068,7 @@ ENTRY(ftrace_caller)  	pushl $0	/* Pass NULL as regs pointer */  	movl 4*4(%esp), %eax  	movl 0x4(%ebp), %edx -	leal function_trace_op, %ecx +	movl function_trace_op, %ecx  	subl $MCOUNT_INSN_SIZE, %eax  .globl ftrace_call @@ -1143,7 +1126,7 @@ ENTRY(ftrace_regs_caller)  	movl 12*4(%esp), %eax	/* Load ip (1st parameter) */  	subl $MCOUNT_INSN_SIZE, %eax	/* Adjust ip */  	movl 0x4(%ebp), %edx	/* Load parent ip (2nd parameter) */ -	leal function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */ +	movl function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */  	pushl %esp		/* Save pt_regs as 4th parameter */  GLOBAL(ftrace_regs_call) @@ -1242,10 +1225,15 @@ return_to_handler:  	jmp *%ecx  #endif -/* - * Some functions should be protected against kprobes - */ -	.pushsection .kprobes.text, "ax" +#ifdef CONFIG_TRACING +ENTRY(trace_page_fault) +	RING0_EC_FRAME +	ASM_CLAC +	pushl_cfi $trace_do_page_fault +	jmp error_code +	CFI_ENDPROC +END(trace_page_fault) +#endif  ENTRY(page_fault)  	RING0_EC_FRAME @@ -1348,11 +1336,13 @@ END(debug)  ENTRY(nmi)  	RING0_INT_FRAME  	ASM_CLAC +#ifdef CONFIG_X86_ESPFIX32  	pushl_cfi %eax  	movl %ss, %eax  	cmpw $__ESPFIX_SS, %ax  	popl_cfi %eax  	je nmi_espfix_stack +#endif  	cmpl $ia32_sysenter_target,(%esp)  	je nmi_stack_fixup  	pushl_cfi %eax @@ -1392,6 +1382,7 @@ nmi_debug_stack_check:  	FIX_STACK 24, nmi_stack_correct, 1  	jmp nmi_stack_correct +#ifdef CONFIG_X86_ESPFIX32  nmi_espfix_stack:  	/* We have a RING0_INT_FRAME here.  	 
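The leal to movl changes above alter what ends up in %ecx: leal produced the address of the function_trace_op variable itself, while movl loads its current value, which is the ftrace_ops pointer the callback expects as its third argument. A C analogue of the difference, with the types simplified for the example:

#include <stdio.h>

struct ftrace_ops { const char *name; };

static struct ftrace_ops list_ops = { "list_ops" };
/* the variable the assembly references; it holds the currently active ops */
static struct ftrace_ops *function_trace_op = &list_ops;

int main(void)
{
	/* leal function_trace_op, %ecx  ~  &function_trace_op (address of the slot) */
	struct ftrace_ops **slot = &function_trace_op;
	/* movl function_trace_op, %ecx  ~   function_trace_op (the ops pointer itself) */
	struct ftrace_ops *op = function_trace_op;

	printf("slot=%p  op=%p  op->name=%s\n", (void *)slot, (void *)op, op->name);
	return 0;
}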
* @@ -1413,6 +1404,7 @@ nmi_espfix_stack:  	lss 12+4(%esp), %esp		# back to espfix stack  	CFI_ADJUST_CFA_OFFSET -24  	jmp irq_return +#endif  	CFI_ENDPROC  END(nmi) @@ -1446,7 +1438,3 @@ ENTRY(async_page_fault)  END(async_page_fault)  #endif -/* - * End of kprobes section - */ -	.popsection diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index b077f4cc225..c844f0816ab 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -36,7 +36,7 @@   * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack   * frame that is otherwise undefined after a SYSCALL   * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging. - * - errorentry/paranoidentry/zeroentry - Define exception entry points. + * - idtentry - Define exception entry points.   */  #include <linux/linkage.h> @@ -53,11 +53,11 @@  #include <asm/page_types.h>  #include <asm/irqflags.h>  #include <asm/paravirt.h> -#include <asm/ftrace.h>  #include <asm/percpu.h>  #include <asm/asm.h>  #include <asm/context_tracking.h>  #include <asm/smap.h> +#include <asm/pgtable_types.h>  #include <linux/err.h>  /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */ @@ -69,209 +69,6 @@  	.code64  	.section .entry.text, "ax" -#ifdef CONFIG_FUNCTION_TRACER - -#ifdef CC_USING_FENTRY -# define function_hook	__fentry__ -#else -# define function_hook	mcount -#endif - -#ifdef CONFIG_DYNAMIC_FTRACE - -ENTRY(function_hook) -	retq -END(function_hook) - -/* skip is set if stack has been adjusted */ -.macro ftrace_caller_setup skip=0 -	MCOUNT_SAVE_FRAME \skip - -	/* Load the ftrace_ops into the 3rd parameter */ -	leaq function_trace_op, %rdx - -	/* Load ip into the first parameter */ -	movq RIP(%rsp), %rdi -	subq $MCOUNT_INSN_SIZE, %rdi -	/* Load the parent_ip into the second parameter */ -#ifdef CC_USING_FENTRY -	movq SS+16(%rsp), %rsi -#else -	movq 8(%rbp), %rsi -#endif -.endm - -ENTRY(ftrace_caller) -	/* Check if tracing was disabled (quick check) */ -	cmpl $0, function_trace_stop -	jne  ftrace_stub - -	ftrace_caller_setup -	/* regs go into 4th parameter (but make it NULL) */ -	movq $0, %rcx - -GLOBAL(ftrace_call) -	call ftrace_stub - -	MCOUNT_RESTORE_FRAME -ftrace_return: - -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -GLOBAL(ftrace_graph_call) -	jmp ftrace_stub -#endif - -GLOBAL(ftrace_stub) -	retq -END(ftrace_caller) - -ENTRY(ftrace_regs_caller) -	/* Save the current flags before compare (in SS location)*/ -	pushfq - -	/* Check if tracing was disabled (quick check) */ -	cmpl $0, function_trace_stop -	jne  ftrace_restore_flags - -	/* skip=8 to skip flags saved in SS */ -	ftrace_caller_setup 8 - -	/* Save the rest of pt_regs */ -	movq %r15, R15(%rsp) -	movq %r14, R14(%rsp) -	movq %r13, R13(%rsp) -	movq %r12, R12(%rsp) -	movq %r11, R11(%rsp) -	movq %r10, R10(%rsp) -	movq %rbp, RBP(%rsp) -	movq %rbx, RBX(%rsp) -	/* Copy saved flags */ -	movq SS(%rsp), %rcx -	movq %rcx, EFLAGS(%rsp) -	/* Kernel segments */ -	movq $__KERNEL_DS, %rcx -	movq %rcx, SS(%rsp) -	movq $__KERNEL_CS, %rcx -	movq %rcx, CS(%rsp) -	/* Stack - skipping return address */ -	leaq SS+16(%rsp), %rcx -	movq %rcx, RSP(%rsp) - -	/* regs go into 4th parameter */ -	leaq (%rsp), %rcx - -GLOBAL(ftrace_regs_call) -	call ftrace_stub - -	/* Copy flags back to SS, to restore them */ -	movq EFLAGS(%rsp), %rax -	movq %rax, SS(%rsp) - -	/* Handlers can change the RIP */ -	movq RIP(%rsp), %rax -	movq %rax, SS+8(%rsp) - -	/* restore the rest of pt_regs */ -	movq R15(%rsp), %r15 -	movq R14(%rsp), %r14 -	movq R13(%rsp), %r13 -	movq R12(%rsp), %r12 -	movq 
R10(%rsp), %r10 -	movq RBP(%rsp), %rbp -	movq RBX(%rsp), %rbx - -	/* skip=8 to skip flags saved in SS */ -	MCOUNT_RESTORE_FRAME 8 - -	/* Restore flags */ -	popfq - -	jmp ftrace_return -ftrace_restore_flags: -	popfq -	jmp  ftrace_stub - -END(ftrace_regs_caller) - - -#else /* ! CONFIG_DYNAMIC_FTRACE */ - -ENTRY(function_hook) -	cmpl $0, function_trace_stop -	jne  ftrace_stub - -	cmpq $ftrace_stub, ftrace_trace_function -	jnz trace - -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -	cmpq $ftrace_stub, ftrace_graph_return -	jnz ftrace_graph_caller - -	cmpq $ftrace_graph_entry_stub, ftrace_graph_entry -	jnz ftrace_graph_caller -#endif - -GLOBAL(ftrace_stub) -	retq - -trace: -	MCOUNT_SAVE_FRAME - -	movq RIP(%rsp), %rdi -#ifdef CC_USING_FENTRY -	movq SS+16(%rsp), %rsi -#else -	movq 8(%rbp), %rsi -#endif -	subq $MCOUNT_INSN_SIZE, %rdi - -	call   *ftrace_trace_function - -	MCOUNT_RESTORE_FRAME - -	jmp ftrace_stub -END(function_hook) -#endif /* CONFIG_DYNAMIC_FTRACE */ -#endif /* CONFIG_FUNCTION_TRACER */ - -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -ENTRY(ftrace_graph_caller) -	MCOUNT_SAVE_FRAME - -#ifdef CC_USING_FENTRY -	leaq SS+16(%rsp), %rdi -	movq $0, %rdx	/* No framepointers needed */ -#else -	leaq 8(%rbp), %rdi -	movq (%rbp), %rdx -#endif -	movq RIP(%rsp), %rsi -	subq $MCOUNT_INSN_SIZE, %rsi - -	call	prepare_ftrace_return - -	MCOUNT_RESTORE_FRAME - -	retq -END(ftrace_graph_caller) - -GLOBAL(return_to_handler) -	subq  $24, %rsp - -	/* Save the return values */ -	movq %rax, (%rsp) -	movq %rdx, 8(%rsp) -	movq %rbp, %rdi - -	call ftrace_return_to_handler - -	movq %rax, %rdi -	movq 8(%rsp), %rdx -	movq (%rsp), %rax -	addq $24, %rsp -	jmp *%rdi -#endif -  #ifndef CONFIG_PREEMPT  #define retint_kernel retint_restore_args @@ -487,8 +284,6 @@ ENDPROC(native_usergs_sysret64)  	TRACE_IRQS_OFF  	.endm -/* save complete stack frame */ -	.pushsection .kprobes.text, "ax"  ENTRY(save_paranoid)  	XCPT_FRAME 1 RDI+8  	cld @@ -517,7 +312,6 @@ ENTRY(save_paranoid)  1:	ret  	CFI_ENDPROC  END(save_paranoid) -	.popsection  /*   * A newly forked process directly context switches into this address. @@ -975,10 +769,6 @@ END(interrupt)  	call \func  	.endm -/* - * Interrupt entry/exit should be protected against kprobes - */ -	.pushsection .kprobes.text, "ax"  	/*  	 * The interrupt stubs push (~vector+0x80) onto the stack and  	 * then jump to common_interrupt. @@ -1041,12 +831,45 @@ restore_args:  irq_return:  	INTERRUPT_RETURN -	_ASM_EXTABLE(irq_return, bad_iret) -#ifdef CONFIG_PARAVIRT  ENTRY(native_iret) +	/* +	 * Are we returning to a stack segment from the LDT?  Note: in +	 * 64-bit mode SS:RSP on the exception stack is always valid. 
+	 */ +#ifdef CONFIG_X86_ESPFIX64 +	testb $4,(SS-RIP)(%rsp) +	jnz native_irq_return_ldt +#endif + +native_irq_return_iret:  	iretq -	_ASM_EXTABLE(native_iret, bad_iret) +	_ASM_EXTABLE(native_irq_return_iret, bad_iret) + +#ifdef CONFIG_X86_ESPFIX64 +native_irq_return_ldt: +	pushq_cfi %rax +	pushq_cfi %rdi +	SWAPGS +	movq PER_CPU_VAR(espfix_waddr),%rdi +	movq %rax,(0*8)(%rdi)	/* RAX */ +	movq (2*8)(%rsp),%rax	/* RIP */ +	movq %rax,(1*8)(%rdi) +	movq (3*8)(%rsp),%rax	/* CS */ +	movq %rax,(2*8)(%rdi) +	movq (4*8)(%rsp),%rax	/* RFLAGS */ +	movq %rax,(3*8)(%rdi) +	movq (6*8)(%rsp),%rax	/* SS */ +	movq %rax,(5*8)(%rdi) +	movq (5*8)(%rsp),%rax	/* RSP */ +	movq %rax,(4*8)(%rdi) +	andl $0xffff0000,%eax +	popq_cfi %rdi +	orq PER_CPU_VAR(espfix_stack),%rax +	SWAPGS +	movq %rax,%rsp +	popq_cfi %rax +	jmp native_irq_return_iret  #endif  	.section .fixup,"ax" @@ -1103,22 +926,46 @@ retint_signal:  	/* Returning to kernel space. Check if we need preemption */  	/* rcx:	 threadinfo. interrupts off. */  ENTRY(retint_kernel) -	cmpl $0,TI_preempt_count(%rcx) +	cmpl $0,PER_CPU_VAR(__preempt_count)  	jnz  retint_restore_args -	bt  $TIF_NEED_RESCHED,TI_flags(%rcx) -	jnc  retint_restore_args  	bt   $9,EFLAGS-ARGOFFSET(%rsp)	/* interrupts off? */  	jnc  retint_restore_args  	call preempt_schedule_irq  	jmp exit_intr  #endif -  	CFI_ENDPROC  END(common_interrupt) -/* - * End of kprobes section - */ -       .popsection + +	/* +	 * If IRET takes a fault on the espfix stack, then we +	 * end up promoting it to a doublefault.  In that case, +	 * modify the stack to make it look like we just entered +	 * the #GP handler from user space, similar to bad_iret. +	 */ +#ifdef CONFIG_X86_ESPFIX64 +	ALIGN +__do_double_fault: +	XCPT_FRAME 1 RDI+8 +	movq RSP(%rdi),%rax		/* Trap on the espfix stack? */ +	sarq $PGDIR_SHIFT,%rax +	cmpl $ESPFIX_PGD_ENTRY,%eax +	jne do_double_fault		/* No, just deliver the fault */ +	cmpl $__KERNEL_CS,CS(%rdi) +	jne do_double_fault +	movq RIP(%rdi),%rax +	cmpq $native_irq_return_iret,%rax +	jne do_double_fault		/* This shouldn't happen... */ +	movq PER_CPU_VAR(kernel_stack),%rax +	subq $(6*8-KERNEL_STACK_OFFSET),%rax	/* Reset to original stack */ +	movq %rax,RSP(%rdi) +	movq $0,(%rax)			/* Missing (lost) #GP error code */ +	movq $general_protection,RIP(%rdi) +	retq +	CFI_ENDPROC +END(__do_double_fault) +#else +# define __do_double_fault do_double_fault +#endif  /*   * APIC interrupts. @@ -1205,114 +1052,100 @@ apicinterrupt IRQ_WORK_VECTOR \  /*   * Exception entry points.   
*/ -.macro zeroentry sym do_sym -ENTRY(\sym) -	INTR_FRAME -	ASM_CLAC -	PARAVIRT_ADJUST_EXCEPTION_FRAME -	pushq_cfi $-1		/* ORIG_RAX: no syscall to restart */ -	subq $ORIG_RAX-R15, %rsp -	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 -	call error_entry -	DEFAULT_FRAME 0 -	movq %rsp,%rdi		/* pt_regs pointer */ -	xorl %esi,%esi		/* no error code */ -	call \do_sym -	jmp error_exit		/* %ebx: no swapgs flag */ -	CFI_ENDPROC -END(\sym) -.endm +#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8) -.macro paranoidzeroentry sym do_sym +.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1  ENTRY(\sym) -	INTR_FRAME -	ASM_CLAC -	PARAVIRT_ADJUST_EXCEPTION_FRAME -	pushq_cfi $-1		/* ORIG_RAX: no syscall to restart */ -	subq $ORIG_RAX-R15, %rsp -	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 -	call save_paranoid -	TRACE_IRQS_OFF -	movq %rsp,%rdi		/* pt_regs pointer */ -	xorl %esi,%esi		/* no error code */ -	call \do_sym -	jmp paranoid_exit	/* %ebx: no swapgs flag */ -	CFI_ENDPROC -END(\sym) -.endm +	/* Sanity check */ +	.if \shift_ist != -1 && \paranoid == 0 +	.error "using shift_ist requires paranoid=1" +	.endif -#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8) -.macro paranoidzeroentry_ist sym do_sym ist -ENTRY(\sym) +	.if \has_error_code +	XCPT_FRAME +	.else  	INTR_FRAME -	ASM_CLAC -	PARAVIRT_ADJUST_EXCEPTION_FRAME -	pushq_cfi $-1		/* ORIG_RAX: no syscall to restart */ -	subq $ORIG_RAX-R15, %rsp -	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 -	call save_paranoid -	TRACE_IRQS_OFF_DEBUG -	movq %rsp,%rdi		/* pt_regs pointer */ -	xorl %esi,%esi		/* no error code */ -	subq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist) -	call \do_sym -	addq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist) -	jmp paranoid_exit	/* %ebx: no swapgs flag */ -	CFI_ENDPROC -END(\sym) -.endm +	.endif -.macro errorentry sym do_sym -ENTRY(\sym) -	XCPT_FRAME  	ASM_CLAC  	PARAVIRT_ADJUST_EXCEPTION_FRAME -	subq $ORIG_RAX-R15, %rsp -	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 -	call error_entry -	DEFAULT_FRAME 0 -	movq %rsp,%rdi			/* pt_regs pointer */ -	movq ORIG_RAX(%rsp),%rsi	/* get error code */ -	movq $-1,ORIG_RAX(%rsp)		/* no syscall to restart */ -	call \do_sym -	jmp error_exit			/* %ebx: no swapgs flag */ -	CFI_ENDPROC -END(\sym) -.endm -	/* error code is on the stack already */ -.macro paranoiderrorentry sym do_sym -ENTRY(\sym) -	XCPT_FRAME -	ASM_CLAC -	PARAVIRT_ADJUST_EXCEPTION_FRAME +	.ifeq \has_error_code +	pushq_cfi $-1			/* ORIG_RAX: no syscall to restart */ +	.endif +  	subq $ORIG_RAX-R15, %rsp  	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 + +	.if \paranoid  	call save_paranoid +	.else +	call error_entry +	.endif +  	DEFAULT_FRAME 0 + +	.if \paranoid +	.if \shift_ist != -1 +	TRACE_IRQS_OFF_DEBUG		/* reload IDT in case of recursion */ +	.else  	TRACE_IRQS_OFF +	.endif +	.endif +  	movq %rsp,%rdi			/* pt_regs pointer */ + +	.if \has_error_code  	movq ORIG_RAX(%rsp),%rsi	/* get error code */  	movq $-1,ORIG_RAX(%rsp)		/* no syscall to restart */ +	.else +	xorl %esi,%esi			/* no error code */ +	.endif + +	.if \shift_ist != -1 +	subq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist) +	.endif +  	call \do_sym + +	.if \shift_ist != -1 +	addq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist) +	.endif + +	.if \paranoid  	jmp paranoid_exit		/* %ebx: no swapgs flag */ +	.else +	jmp error_exit			/* %ebx: no swapgs flag */ +	.endif +  	CFI_ENDPROC  END(\sym)  .endm -zeroentry divide_error do_divide_error -zeroentry overflow do_overflow -zeroentry bounds do_bounds -zeroentry invalid_op do_invalid_op -zeroentry device_not_available do_device_not_available 
-paranoiderrorentry double_fault do_double_fault -zeroentry coprocessor_segment_overrun do_coprocessor_segment_overrun -errorentry invalid_TSS do_invalid_TSS -errorentry segment_not_present do_segment_not_present -zeroentry spurious_interrupt_bug do_spurious_interrupt_bug -zeroentry coprocessor_error do_coprocessor_error -errorentry alignment_check do_alignment_check -zeroentry simd_coprocessor_error do_simd_coprocessor_error +#ifdef CONFIG_TRACING +.macro trace_idtentry sym do_sym has_error_code:req +idtentry trace(\sym) trace(\do_sym) has_error_code=\has_error_code +idtentry \sym \do_sym has_error_code=\has_error_code +.endm +#else +.macro trace_idtentry sym do_sym has_error_code:req +idtentry \sym \do_sym has_error_code=\has_error_code +.endm +#endif + +idtentry divide_error do_divide_error has_error_code=0 +idtentry overflow do_overflow has_error_code=0 +idtentry bounds do_bounds has_error_code=0 +idtentry invalid_op do_invalid_op has_error_code=0 +idtentry device_not_available do_device_not_available has_error_code=0 +idtentry double_fault __do_double_fault has_error_code=1 paranoid=1 +idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0 +idtentry invalid_TSS do_invalid_TSS has_error_code=1 +idtentry segment_not_present do_segment_not_present has_error_code=1 +idtentry spurious_interrupt_bug do_spurious_interrupt_bug has_error_code=0 +idtentry coprocessor_error do_coprocessor_error has_error_code=0 +idtentry alignment_check do_alignment_check has_error_code=1 +idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0  	/* Reload gs selector with exception handling */ @@ -1342,7 +1175,7 @@ bad_gs:  	.previous  /* Call softirq on interrupt stack. Interrupts are off. */ -ENTRY(call_softirq) +ENTRY(do_softirq_own_stack)  	CFI_STARTPROC  	pushq_cfi %rbp  	CFI_REL_OFFSET rbp,0 @@ -1359,10 +1192,10 @@ ENTRY(call_softirq)  	decl PER_CPU_VAR(irq_count)  	ret  	CFI_ENDPROC -END(call_softirq) +END(do_softirq_own_stack)  #ifdef CONFIG_XEN -zeroentry xen_hypervisor_callback xen_do_hypervisor_callback +idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0  /*   * A note on the "critical region" in our callback handler. 
@@ -1468,26 +1301,21 @@ apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \  	hyperv_callback_vector hyperv_vector_handler  #endif /* CONFIG_HYPERV */ -/* - * Some functions should be protected against kprobes - */ -	.pushsection .kprobes.text, "ax" - -paranoidzeroentry_ist debug do_debug DEBUG_STACK -paranoidzeroentry_ist int3 do_int3 DEBUG_STACK -paranoiderrorentry stack_segment do_stack_segment +idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK +idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK +idtentry stack_segment do_stack_segment has_error_code=1 paranoid=1  #ifdef CONFIG_XEN -zeroentry xen_debug do_debug -zeroentry xen_int3 do_int3 -errorentry xen_stack_segment do_stack_segment +idtentry xen_debug do_debug has_error_code=0 +idtentry xen_int3 do_int3 has_error_code=0 +idtentry xen_stack_segment do_stack_segment has_error_code=1  #endif -errorentry general_protection do_general_protection -errorentry page_fault do_page_fault +idtentry general_protection do_general_protection has_error_code=1 +trace_idtentry page_fault do_page_fault has_error_code=1  #ifdef CONFIG_KVM_GUEST -errorentry async_page_fault do_async_page_fault +idtentry async_page_fault do_async_page_fault has_error_code=1  #endif  #ifdef CONFIG_X86_MCE -paranoidzeroentry machine_check *machine_check_vector(%rip) +idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip)  #endif  	/* @@ -1592,7 +1420,7 @@ error_sti:   */  error_kernelspace:  	incl %ebx -	leaq irq_return(%rip),%rcx +	leaq native_irq_return_iret(%rip),%rcx  	cmpq %rcx,RIP+8(%rsp)  	je error_swapgs  	movl %ecx,%eax	/* zero extend */ @@ -1889,7 +1717,3 @@ ENTRY(ignore_sysret)  	CFI_ENDPROC  END(ignore_sysret) -/* - * End of kprobes section - */ -	.popsection diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c new file mode 100644 index 00000000000..94d857fb103 --- /dev/null +++ b/arch/x86/kernel/espfix_64.c @@ -0,0 +1,208 @@ +/* ----------------------------------------------------------------------- * + * + *   Copyright 2014 Intel Corporation; author: H. Peter Anvin + * + *   This program is free software; you can redistribute it and/or modify it + *   under the terms and conditions of the GNU General Public License, + *   version 2, as published by the Free Software Foundation. + * + *   This program is distributed in the hope it will be useful, but WITHOUT + *   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + *   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for + *   more details. + * + * ----------------------------------------------------------------------- */ + +/* + * The IRET instruction, when returning to a 16-bit segment, only + * restores the bottom 16 bits of the user space stack pointer.  This + * causes some 16-bit software to break, but it also leaks kernel state + * to user space. + * + * This works around this by creating percpu "ministacks", each of which + * is mapped 2^16 times 64K apart.  When we detect that the return SS is + * on the LDT, we copy the IRET frame to the ministack and use the + * relevant alias to return to userspace.  The ministacks are mapped + * readonly, so if the IRET fault we promote #GP to #DF which is an IST + * vector and thus has its own stack; we then do the fixup in the #DF + * handler. + * + * This file sets up the ministacks and the related page tables.  The + * actual ministack invocation is in entry_64.S. 
+ */ + +#include <linux/init.h> +#include <linux/init_task.h> +#include <linux/kernel.h> +#include <linux/percpu.h> +#include <linux/gfp.h> +#include <linux/random.h> +#include <asm/pgtable.h> +#include <asm/pgalloc.h> +#include <asm/setup.h> +#include <asm/espfix.h> + +/* + * Note: we only need 6*8 = 48 bytes for the espfix stack, but round + * it up to a cache line to avoid unnecessary sharing. + */ +#define ESPFIX_STACK_SIZE	(8*8UL) +#define ESPFIX_STACKS_PER_PAGE	(PAGE_SIZE/ESPFIX_STACK_SIZE) + +/* There is address space for how many espfix pages? */ +#define ESPFIX_PAGE_SPACE	(1UL << (PGDIR_SHIFT-PAGE_SHIFT-16)) + +#define ESPFIX_MAX_CPUS		(ESPFIX_STACKS_PER_PAGE * ESPFIX_PAGE_SPACE) +#if CONFIG_NR_CPUS > ESPFIX_MAX_CPUS +# error "Need more than one PGD for the ESPFIX hack" +#endif + +#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO) + +/* This contains the *bottom* address of the espfix stack */ +DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack); +DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr); + +/* Initialization mutex - should this be a spinlock? */ +static DEFINE_MUTEX(espfix_init_mutex); + +/* Page allocation bitmap - each page serves ESPFIX_STACKS_PER_PAGE CPUs */ +#define ESPFIX_MAX_PAGES  DIV_ROUND_UP(CONFIG_NR_CPUS, ESPFIX_STACKS_PER_PAGE) +static void *espfix_pages[ESPFIX_MAX_PAGES]; + +static __page_aligned_bss pud_t espfix_pud_page[PTRS_PER_PUD] +	__aligned(PAGE_SIZE); + +static unsigned int page_random, slot_random; + +/* + * This returns the bottom address of the espfix stack for a specific CPU. + * The math allows for a non-power-of-two ESPFIX_STACK_SIZE, in which case + * we have to account for some amount of padding at the end of each page. + */ +static inline unsigned long espfix_base_addr(unsigned int cpu) +{ +	unsigned long page, slot; +	unsigned long addr; + +	page = (cpu / ESPFIX_STACKS_PER_PAGE) ^ page_random; +	slot = (cpu + slot_random) % ESPFIX_STACKS_PER_PAGE; +	addr = (page << PAGE_SHIFT) + (slot * ESPFIX_STACK_SIZE); +	addr = (addr & 0xffffUL) | ((addr & ~0xffffUL) << 16); +	addr += ESPFIX_BASE_ADDR; +	return addr; +} + +#define PTE_STRIDE        (65536/PAGE_SIZE) +#define ESPFIX_PTE_CLONES (PTRS_PER_PTE/PTE_STRIDE) +#define ESPFIX_PMD_CLONES PTRS_PER_PMD +#define ESPFIX_PUD_CLONES (65536/(ESPFIX_PTE_CLONES*ESPFIX_PMD_CLONES)) + +#define PGTABLE_PROT	  ((_KERNPG_TABLE & ~_PAGE_RW) | _PAGE_NX) + +static void init_espfix_random(void) +{ +	unsigned long rand; + +	/* +	 * This is run before the entropy pools are initialized, +	 * but this is hopefully better than nothing. +	 */ +	if (!arch_get_random_long(&rand)) { +		/* The constant is an arbitrary large prime */ +		rdtscll(rand); +		rand *= 0xc345c6b72fd16123UL; +	} + +	slot_random = rand % ESPFIX_STACKS_PER_PAGE; +	page_random = (rand / ESPFIX_STACKS_PER_PAGE) +		& (ESPFIX_PAGE_SPACE - 1); +} + +void __init init_espfix_bsp(void) +{ +	pgd_t *pgd_p; +	pteval_t ptemask; + +	ptemask = __supported_pte_mask; + +	/* Install the espfix pud into the kernel page directory */ +	pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)]; +	pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page); + +	/* Randomize the locations */ +	init_espfix_random(); + +	/* The rest is the same as for any other processor */ +	init_espfix_ap(); +} + +void init_espfix_ap(void) +{ +	unsigned int cpu, page; +	unsigned long addr; +	pud_t pud, *pud_p; +	pmd_t pmd, *pmd_p; +	pte_t pte, *pte_p; +	int n; +	void *stack_page; +	pteval_t ptemask; + +	/* We only have to do this once... 
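The arithmetic in espfix_base_addr() above is the core of the aliasing trick: bits 15:0 of the per-CPU ministack offset stay in place and everything above them is shifted up by 16, so the whole offset for the first 16 pages of ministacks fits in the low 16 bits and the upper bits are free to select one of the 2^16 aliases. Below is a user-space rerun of that computation with the randomization zeroed; the constants are taken from the file except ESPFIX_BASE_ADDR, which is not shown in this patch and is a stand-in here.

#include <stdio.h>

#define PAGE_SHIFT		12
#define PAGE_SIZE		(1ULL << PAGE_SHIFT)
#define ESPFIX_STACK_SIZE	(8*8ULL)			/* as in the patch: 64 bytes */
#define ESPFIX_STACKS_PER_PAGE	(PAGE_SIZE/ESPFIX_STACK_SIZE)	/* 64 ministacks per page */
#define ESPFIX_BASE_ADDR	0xffffff0000000000ULL		/* stand-in; the real base is defined elsewhere */

/* same arithmetic as espfix_base_addr(), with the randomization zeroed */
static unsigned long long espfix_addr(unsigned int cpu)
{
	unsigned long long page = cpu / ESPFIX_STACKS_PER_PAGE;
	unsigned long long slot = cpu % ESPFIX_STACKS_PER_PAGE;
	unsigned long long addr = (page << PAGE_SHIFT) + (slot * ESPFIX_STACK_SIZE);

	/* keep bits 15:0 in place, shift everything above them up by 16 */
	addr = (addr & 0xffffULL) | ((addr & ~0xffffULL) << 16);
	return addr + ESPFIX_BASE_ADDR;
}

int main(void)
{
	unsigned int cpus[] = { 0, 1, 63, 64, 1023, 1024 };

	for (unsigned int i = 0; i < sizeof(cpus)/sizeof(cpus[0]); i++)
		printf("cpu %4u -> %#llx (offset above base: %#llx)\n",
		       cpus[i], espfix_addr(cpus[i]),
		       espfix_addr(cpus[i]) - ESPFIX_BASE_ADDR);
	/* cpus 0..1023 fit entirely in the low 16 bits; cpu 1024 starts the
	 * next 4 GB chunk because its page offset was shifted up by 16 bits */
	return 0;
}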
*/ +	if (likely(this_cpu_read(espfix_stack))) +		return;		/* Already initialized */ + +	cpu = smp_processor_id(); +	addr = espfix_base_addr(cpu); +	page = cpu/ESPFIX_STACKS_PER_PAGE; + +	/* Did another CPU already set this up? */ +	stack_page = ACCESS_ONCE(espfix_pages[page]); +	if (likely(stack_page)) +		goto done; + +	mutex_lock(&espfix_init_mutex); + +	/* Did we race on the lock? */ +	stack_page = ACCESS_ONCE(espfix_pages[page]); +	if (stack_page) +		goto unlock_done; + +	ptemask = __supported_pte_mask; + +	pud_p = &espfix_pud_page[pud_index(addr)]; +	pud = *pud_p; +	if (!pud_present(pud)) { +		pmd_p = (pmd_t *)__get_free_page(PGALLOC_GFP); +		pud = __pud(__pa(pmd_p) | (PGTABLE_PROT & ptemask)); +		paravirt_alloc_pmd(&init_mm, __pa(pmd_p) >> PAGE_SHIFT); +		for (n = 0; n < ESPFIX_PUD_CLONES; n++) +			set_pud(&pud_p[n], pud); +	} + +	pmd_p = pmd_offset(&pud, addr); +	pmd = *pmd_p; +	if (!pmd_present(pmd)) { +		pte_p = (pte_t *)__get_free_page(PGALLOC_GFP); +		pmd = __pmd(__pa(pte_p) | (PGTABLE_PROT & ptemask)); +		paravirt_alloc_pte(&init_mm, __pa(pte_p) >> PAGE_SHIFT); +		for (n = 0; n < ESPFIX_PMD_CLONES; n++) +			set_pmd(&pmd_p[n], pmd); +	} + +	pte_p = pte_offset_kernel(&pmd, addr); +	stack_page = (void *)__get_free_page(GFP_KERNEL); +	pte = __pte(__pa(stack_page) | (__PAGE_KERNEL_RO & ptemask)); +	for (n = 0; n < ESPFIX_PTE_CLONES; n++) +		set_pte(&pte_p[n*PTE_STRIDE], pte); + +	/* Job is done for this CPU and any CPU which shares this page */ +	ACCESS_ONCE(espfix_pages[page]) = stack_page; + +unlock_done: +	mutex_unlock(&espfix_init_mutex); +done: +	this_cpu_write(espfix_stack, addr); +	this_cpu_write(espfix_waddr, (unsigned long)stack_page +		       + (addr & ~PAGE_MASK)); +} diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 42a392a9fd0..cbc4a91b131 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -77,8 +77,7 @@ within(unsigned long addr, unsigned long start, unsigned long end)  	return addr >= start && addr < end;  } -static int -do_ftrace_mod_code(unsigned long ip, const void *new_code) +static unsigned long text_ip_addr(unsigned long ip)  {  	/*  	 * On x86_64, kernel text mappings are mapped read-only with @@ -91,7 +90,7 @@ do_ftrace_mod_code(unsigned long ip, const void *new_code)  	if (within(ip, (unsigned long)_text, (unsigned long)_etext))  		ip = (unsigned long)__va(__pa_symbol(ip)); -	return probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE); +	return ip;  }  static const unsigned char *ftrace_nop_replace(void) @@ -123,8 +122,10 @@ ftrace_modify_code_direct(unsigned long ip, unsigned const char *old_code,  	if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0)  		return -EINVAL; +	ip = text_ip_addr(ip); +  	/* replace the text with the new text */ -	if (do_ftrace_mod_code(ip, new_code)) +	if (probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE))  		return -EPERM;  	sync_core(); @@ -221,33 +222,56 @@ int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr,  	return -EINVAL;  } -int ftrace_update_ftrace_func(ftrace_func_t func) +static unsigned long ftrace_update_func; + +static int update_ftrace_func(unsigned long ip, void *new)  { -	unsigned long ip = (unsigned long)(&ftrace_call); -	unsigned char old[MCOUNT_INSN_SIZE], *new; +	unsigned char old[MCOUNT_INSN_SIZE];  	int ret; -	memcpy(old, &ftrace_call, MCOUNT_INSN_SIZE); -	new = ftrace_call_replace(ip, (unsigned long)func); +	memcpy(old, (void *)ip, MCOUNT_INSN_SIZE); + +	ftrace_update_func = ip; +	/* Make sure the breakpoints see the 
ftrace_update_func update */ +	smp_wmb();  	/* See comment above by declaration of modifying_ftrace_code */  	atomic_inc(&modifying_ftrace_code);  	ret = ftrace_modify_code(ip, old, new); +	atomic_dec(&modifying_ftrace_code); + +	return ret; +} + +int ftrace_update_ftrace_func(ftrace_func_t func) +{ +	unsigned long ip = (unsigned long)(&ftrace_call); +	unsigned char *new; +	int ret; + +	new = ftrace_call_replace(ip, (unsigned long)func); +	ret = update_ftrace_func(ip, new); +  	/* Also update the regs callback function */  	if (!ret) {  		ip = (unsigned long)(&ftrace_regs_call); -		memcpy(old, &ftrace_regs_call, MCOUNT_INSN_SIZE);  		new = ftrace_call_replace(ip, (unsigned long)func); -		ret = ftrace_modify_code(ip, old, new); +		ret = update_ftrace_func(ip, new);  	} -	atomic_dec(&modifying_ftrace_code); -  	return ret;  } +static int is_ftrace_caller(unsigned long ip) +{ +	if (ip == ftrace_update_func) +		return 1; + +	return 0; +} +  /*   * A breakpoint was added to the code address we are about to   * modify, and this is the handle that will just skip over it. @@ -257,10 +281,13 @@ int ftrace_update_ftrace_func(ftrace_func_t func)   */  int ftrace_int3_handler(struct pt_regs *regs)  { +	unsigned long ip; +  	if (WARN_ON_ONCE(!regs))  		return 0; -	if (!ftrace_location(regs->ip - 1)) +	ip = regs->ip - 1; +	if (!ftrace_location(ip) && !is_ftrace_caller(ip))  		return 0;  	regs->ip += MCOUNT_INSN_SIZE - 1; @@ -270,18 +297,12 @@ int ftrace_int3_handler(struct pt_regs *regs)  static int ftrace_write(unsigned long ip, const char *val, int size)  { -	/* -	 * On x86_64, kernel text mappings are mapped read-only with -	 * CONFIG_DEBUG_RODATA. So we use the kernel identity mapping instead -	 * of the kernel text mapping to modify the kernel text. -	 * -	 * For 32bit kernels, these mappings are same and we can use -	 * kernel identity mapping to modify code. -	 */ -	if (within(ip, (unsigned long)_text, (unsigned long)_etext)) -		ip = (unsigned long)__va(__pa_symbol(ip)); +	ip = text_ip_addr(ip); + +	if (probe_kernel_write((void *)ip, val, size)) +		return -EPERM; -	return probe_kernel_write((void *)ip, val, size); +	return 0;  }  static int add_break(unsigned long ip, const char *old) @@ -296,10 +317,7 @@ static int add_break(unsigned long ip, const char *old)  	if (memcmp(replaced, old, MCOUNT_INSN_SIZE) != 0)  		return -EINVAL; -	if (ftrace_write(ip, &brk, 1)) -		return -EPERM; - -	return 0; +	return ftrace_write(ip, &brk, 1);  }  static int add_brk_on_call(struct dyn_ftrace *rec, unsigned long addr) @@ -322,40 +340,14 @@ static int add_brk_on_nop(struct dyn_ftrace *rec)  	return add_break(rec->ip, old);  } -/* - * If the record has the FTRACE_FL_REGS set, that means that it - * wants to convert to a callback that saves all regs. If FTRACE_FL_REGS - * is not not set, then it wants to convert to the normal callback. - */ -static unsigned long get_ftrace_addr(struct dyn_ftrace *rec) -{ -	if (rec->flags & FTRACE_FL_REGS) -		return (unsigned long)FTRACE_REGS_ADDR; -	else -		return (unsigned long)FTRACE_ADDR; -} - -/* - * The FTRACE_FL_REGS_EN is set when the record already points to - * a function that saves all the regs. Basically the '_EN' version - * represents the current state of the function. 
- */ -static unsigned long get_ftrace_old_addr(struct dyn_ftrace *rec) -{ -	if (rec->flags & FTRACE_FL_REGS_EN) -		return (unsigned long)FTRACE_REGS_ADDR; -	else -		return (unsigned long)FTRACE_ADDR; -} -  static int add_breakpoints(struct dyn_ftrace *rec, int enable)  {  	unsigned long ftrace_addr;  	int ret; -	ret = ftrace_test_record(rec, enable); +	ftrace_addr = ftrace_get_addr_curr(rec); -	ftrace_addr = get_ftrace_addr(rec); +	ret = ftrace_test_record(rec, enable);  	switch (ret) {  	case FTRACE_UPDATE_IGNORE: @@ -365,10 +357,7 @@ static int add_breakpoints(struct dyn_ftrace *rec, int enable)  		/* converting nop to call */  		return add_brk_on_nop(rec); -	case FTRACE_UPDATE_MODIFY_CALL_REGS:  	case FTRACE_UPDATE_MODIFY_CALL: -		ftrace_addr = get_ftrace_old_addr(rec); -		/* fall through */  	case FTRACE_UPDATE_MAKE_NOP:  		/* converting a call to a nop */  		return add_brk_on_call(rec, ftrace_addr); @@ -398,7 +387,7 @@ static int remove_breakpoint(struct dyn_ftrace *rec)  	/* If this does not have a breakpoint, we are done */  	if (ins[0] != brk) -		return -1; +		return 0;  	nop = ftrace_nop_replace(); @@ -413,14 +402,14 @@ static int remove_breakpoint(struct dyn_ftrace *rec)  		 * If not, don't touch the breakpoint, we make just create  		 * a disaster.  		 */ -		ftrace_addr = get_ftrace_addr(rec); +		ftrace_addr = ftrace_get_addr_new(rec);  		nop = ftrace_call_replace(ip, ftrace_addr);  		if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) == 0)  			goto update;  		/* Check both ftrace_addr and ftrace_old_addr */ -		ftrace_addr = get_ftrace_old_addr(rec); +		ftrace_addr = ftrace_get_addr_curr(rec);  		nop = ftrace_call_replace(ip, ftrace_addr);  		if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0) @@ -428,7 +417,7 @@ static int remove_breakpoint(struct dyn_ftrace *rec)  	}   update: -	return probe_kernel_write((void *)ip, &nop[0], 1); +	return ftrace_write(ip, nop, 1);  }  static int add_update_code(unsigned long ip, unsigned const char *new) @@ -436,9 +425,7 @@ static int add_update_code(unsigned long ip, unsigned const char *new)  	/* skip breakpoint */  	ip++;  	new++; -	if (ftrace_write(ip, new, MCOUNT_INSN_SIZE - 1)) -		return -EPERM; -	return 0; +	return ftrace_write(ip, new, MCOUNT_INSN_SIZE - 1);  }  static int add_update_call(struct dyn_ftrace *rec, unsigned long addr) @@ -466,13 +453,12 @@ static int add_update(struct dyn_ftrace *rec, int enable)  	ret = ftrace_test_record(rec, enable); -	ftrace_addr  = get_ftrace_addr(rec); +	ftrace_addr  = ftrace_get_addr_new(rec);  	switch (ret) {  	case FTRACE_UPDATE_IGNORE:  		return 0; -	case FTRACE_UPDATE_MODIFY_CALL_REGS:  	case FTRACE_UPDATE_MODIFY_CALL:  	case FTRACE_UPDATE_MAKE_CALL:  		/* converting nop to call */ @@ -493,10 +479,7 @@ static int finish_update_call(struct dyn_ftrace *rec, unsigned long addr)  	new = ftrace_call_replace(ip, addr); -	if (ftrace_write(ip, new, 1)) -		return -EPERM; - -	return 0; +	return ftrace_write(ip, new, 1);  }  static int finish_update_nop(struct dyn_ftrace *rec) @@ -506,9 +489,7 @@ static int finish_update_nop(struct dyn_ftrace *rec)  	new = ftrace_nop_replace(); -	if (ftrace_write(ip, new, 1)) -		return -EPERM; -	return 0; +	return ftrace_write(ip, new, 1);  }  static int finish_update(struct dyn_ftrace *rec, int enable) @@ -518,13 +499,12 @@ static int finish_update(struct dyn_ftrace *rec, int enable)  	ret = ftrace_update_record(rec, enable); -	ftrace_addr = get_ftrace_addr(rec); +	ftrace_addr = ftrace_get_addr_new(rec);  	switch (ret) {  	case FTRACE_UPDATE_IGNORE:  		return 0; -	
case FTRACE_UPDATE_MODIFY_CALL_REGS:  	case FTRACE_UPDATE_MODIFY_CALL:  	case FTRACE_UPDATE_MAKE_CALL:  		/* converting nop to call */ @@ -601,12 +581,18 @@ void ftrace_replace_code(int enable)  	return;   remove_breakpoints: +	pr_warn("Failed on %s (%d):\n", report, count);  	ftrace_bug(ret, rec ? rec->ip : 0); -	printk(KERN_WARNING "Failed on %s (%d):\n", report, count);  	for_ftrace_rec_iter(iter) {  		rec = ftrace_rec_iter_record(iter); -		remove_breakpoint(rec); +		/* +		 * Breakpoints are handled only when this function is in +		 * progress. The system could not work with them. +		 */ +		if (remove_breakpoint(rec)) +			BUG();  	} +	run_sync();  }  static int @@ -628,16 +614,19 @@ ftrace_modify_code(unsigned long ip, unsigned const char *old_code,  	run_sync();  	ret = ftrace_write(ip, new_code, 1); -	if (ret) { -		ret = -EPERM; -		goto out; -	} -	run_sync(); +	/* +	 * The breakpoint is handled only when this function is in progress. +	 * The system could not work if we could not remove it. +	 */ +	BUG_ON(ret);   out: +	run_sync();  	return ret;   fail_update: -	probe_kernel_write((void *)ip, &old_code[0], 1); +	/* Also here the system could not work with the breakpoint */ +	if (ftrace_write(ip, old_code, 1)) +		BUG();  	goto out;  } @@ -651,11 +640,8 @@ void arch_ftrace_update_code(int command)  	atomic_dec(&modifying_ftrace_code);  } -int __init ftrace_dyn_arch_init(void *data) +int __init ftrace_dyn_arch_init(void)  { -	/* The return code is retured via data */ -	*(unsigned long *)data = 0; -  	return 0;  }  #endif @@ -665,45 +651,41 @@ int __init ftrace_dyn_arch_init(void *data)  #ifdef CONFIG_DYNAMIC_FTRACE  extern void ftrace_graph_call(void); -static int ftrace_mod_jmp(unsigned long ip, -			  int old_offset, int new_offset) +static unsigned char *ftrace_jmp_replace(unsigned long ip, unsigned long addr)  { -	unsigned char code[MCOUNT_INSN_SIZE]; +	static union ftrace_code_union calc; -	if (probe_kernel_read(code, (void *)ip, MCOUNT_INSN_SIZE)) -		return -EFAULT; +	/* Jmp not a call (ignore the .e8) */ +	calc.e8		= 0xe9; +	calc.offset	= ftrace_calc_offset(ip + MCOUNT_INSN_SIZE, addr); -	if (code[0] != 0xe9 || old_offset != *(int *)(&code[1])) -		return -EINVAL; +	/* +	 * ftrace external locks synchronize the access to the static variable. 
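For context on the breakpoint handling that ftrace_replace_code() and ftrace_modify_code() above depend on, here is a stand-alone simulation of the three-phase text update: plant an int3 in the first byte, rewrite the tail, then rewrite the first byte. Plain memcpy stands in for ftrace_write(), and the cross-CPU run_sync() steps are reduced to comments.

#include <stdio.h>
#include <string.h>

#define INSN_SIZE 5			/* like MCOUNT_INSN_SIZE: 1 opcode byte + 4 offset bytes */
static const unsigned char BRK = 0xcc;	/* int3 */

static unsigned char text[INSN_SIZE];	/* the "kernel text" being live-patched */

static void show(const char *phase)
{
	printf("%-16s:", phase);
	for (int i = 0; i < INSN_SIZE; i++)
		printf(" %02x", text[i]);
	printf("\n");
}

int main(void)
{
	const unsigned char old_insn[INSN_SIZE] = { 0x0f, 0x1f, 0x44, 0x00, 0x00 }; /* 5-byte nop */
	const unsigned char new_insn[INSN_SIZE] = { 0xe8, 0x78, 0x56, 0x34, 0x12 }; /* call rel32 */

	memcpy(text, old_insn, INSN_SIZE);
	show("before");

	/* phase 1: plant int3 so a CPU hitting the site traps and gets skipped
	 * by ftrace_int3_handler() instead of executing a half-written insn */
	text[0] = BRK;
	show("add breakpoint");
	/* run_sync(): make sure every CPU sees the breakpoint */

	/* phase 2: rewrite the tail behind the breakpoint */
	memcpy(text + 1, new_insn + 1, INSN_SIZE - 1);
	show("update tail");
	/* run_sync() */

	/* phase 3: replace the breakpoint with the new first byte */
	text[0] = new_insn[0];
	show("finish");
	/* run_sync() */

	return memcmp(text, new_insn, INSN_SIZE) != 0;
}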
+	 */ +	return calc.code; +} -	*(int *)(&code[1]) = new_offset; +static int ftrace_mod_jmp(unsigned long ip, void *func) +{ +	unsigned char *new; -	if (do_ftrace_mod_code(ip, &code)) -		return -EPERM; +	new = ftrace_jmp_replace(ip, (unsigned long)func); -	return 0; +	return update_ftrace_func(ip, new);  }  int ftrace_enable_ftrace_graph_caller(void)  {  	unsigned long ip = (unsigned long)(&ftrace_graph_call); -	int old_offset, new_offset; -	old_offset = (unsigned long)(&ftrace_stub) - (ip + MCOUNT_INSN_SIZE); -	new_offset = (unsigned long)(&ftrace_graph_caller) - (ip + MCOUNT_INSN_SIZE); - -	return ftrace_mod_jmp(ip, old_offset, new_offset); +	return ftrace_mod_jmp(ip, &ftrace_graph_caller);  }  int ftrace_disable_ftrace_graph_caller(void)  {  	unsigned long ip = (unsigned long)(&ftrace_graph_call); -	int old_offset, new_offset; - -	old_offset = (unsigned long)(&ftrace_graph_caller) - (ip + MCOUNT_INSN_SIZE); -	new_offset = (unsigned long)(&ftrace_stub) - (ip + MCOUNT_INSN_SIZE); -	return ftrace_mod_jmp(ip, old_offset, new_offset); +	return ftrace_mod_jmp(ip, &ftrace_stub);  }  #endif /* !CONFIG_DYNAMIC_FTRACE */ diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 06f87bece92..d6c1b983699 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -29,14 +29,14 @@ static void __init i386_default_early_setup(void)  	reserve_ebda_region();  } -asmlinkage void __init i386_start_kernel(void) +asmlinkage __visible void __init i386_start_kernel(void)  {  	sanitize_boot_params(&boot_params);  	/* Call the subarch specific early setup function */  	switch (boot_params.hdr.hardware_subarch) { -	case X86_SUBARCH_MRST: -		x86_mrst_early_setup(); +	case X86_SUBARCH_INTEL_MID: +		x86_intel_mid_early_setup();  		break;  	case X86_SUBARCH_CE4100:  		x86_ce4100_early_setup(); diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 1be8e43b669..eda1a865641 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -137,7 +137,7 @@ static void __init copy_bootdata(char *real_mode_data)  	}  } -asmlinkage void __init x86_64_start_kernel(char * real_mode_data) +asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)  {  	int i; @@ -162,7 +162,7 @@ asmlinkage void __init x86_64_start_kernel(char * real_mode_data)  	clear_bss();  	for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) -		set_intr_gate(i, &early_idt_handlers[i]); +		set_intr_gate(i, early_idt_handlers[i]);  	load_idt((const struct desc_ptr *)&idt_descr);  	copy_bootdata(__va(real_mode_data)); @@ -172,7 +172,7 @@ asmlinkage void __init x86_64_start_kernel(char * real_mode_data)  	 */  	load_ucode_bsp(); -	if (console_loglevel == 10) +	if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG)  		early_printk("Kernel alive\n");  	clear_page(init_level4_pgt); diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 81ba27679f1..f36bd42d6f0 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -544,6 +544,10 @@ ENDPROC(early_idt_handlers)  	/* This is global to keep gas from relaxing the jumps */  ENTRY(early_idt_handler)  	cld + +	cmpl $2,(%esp)		# X86_TRAP_NMI +	je is_nmi		# Ignore NMI +  	cmpl $2,%ss:early_recursion_flag  	je hlt_loop  	incl %ss:early_recursion_flag @@ -594,8 +598,9 @@ ex_entry:  	pop %edx  	pop %ecx  	pop %eax -	addl $8,%esp		/* drop vector number and error code */  	decl %ss:early_recursion_flag +is_nmi: +	addl $8,%esp		/* drop vector number and error code */  	iret  ENDPROC(early_idt_handler) diff --git a/arch/x86/kernel/head_64.S 
b/arch/x86/kernel/head_64.S index e1aabdb314c..a468c0a65c4 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -343,6 +343,9 @@ early_idt_handlers:  ENTRY(early_idt_handler)  	cld +	cmpl $2,(%rsp)		# X86_TRAP_NMI +	je is_nmi		# Ignore NMI +  	cmpl $2,early_recursion_flag(%rip)  	jz  1f  	incl early_recursion_flag(%rip) @@ -405,8 +408,9 @@ ENTRY(early_idt_handler)  	popq %rdx  	popq %rcx  	popq %rax -	addq $16,%rsp		# drop vector number and error code  	decl early_recursion_flag(%rip) +is_nmi: +	addq $16,%rsp		# drop vector number and error code  	INTERRUPT_RETURN  ENDPROC(early_idt_handler) diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index da85a8e830a..319bcb9372f 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -74,9 +74,6 @@ static inline void hpet_writel(unsigned int d, unsigned int a)  static inline void hpet_set_mapping(void)  {  	hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE); -#ifdef CONFIG_X86_64 -	__set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VVAR_NOCACHE); -#endif  }  static inline void hpet_clear_mapping(void) @@ -88,7 +85,7 @@ static inline void hpet_clear_mapping(void)  /*   * HPET command line enable / disable   */ -static int boot_hpet_disable; +int boot_hpet_disable;  int hpet_force_user;  static int hpet_verbose; @@ -479,7 +476,7 @@ static int hpet_msi_next_event(unsigned long delta,  static int hpet_setup_msi_irq(unsigned int irq)  {  	if (x86_msi.setup_hpet_msi(irq, hpet_blockid)) { -		destroy_irq(irq); +		irq_free_hwirq(irq);  		return -EINVAL;  	}  	return 0; @@ -487,9 +484,8 @@ static int hpet_setup_msi_irq(unsigned int irq)  static int hpet_assign_irq(struct hpet_dev *dev)  { -	unsigned int irq; +	unsigned int irq = irq_alloc_hwirq(-1); -	irq = create_irq_nr(0, -1);  	if (!irq)  		return -EINVAL; @@ -521,7 +517,7 @@ static int hpet_setup_irq(struct hpet_dev *dev)  {  	if (request_irq(dev->irq, hpet_interrupt_handler, -			IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING, +			IRQF_TIMER | IRQF_NOBALANCING,  			dev->name, dev))  		return -1; @@ -699,7 +695,7 @@ static int hpet_cpuhp_notify(struct notifier_block *n,  		/* FIXME: add schedule_work_on() */  		schedule_delayed_work_on(cpu, &work.work, 0);  		wait_for_completion(&work.complete); -		destroy_timer_on_stack(&work.work.timer); +		destroy_delayed_work_on_stack(&work.work);  		break;  	case CPU_DEAD:  		if (hdev) { @@ -752,9 +748,7 @@ static struct clocksource clocksource_hpet = {  	.mask		= HPET_MASK,  	.flags		= CLOCK_SOURCE_IS_CONTINUOUS,  	.resume		= hpet_resume_counter, -#ifdef CONFIG_X86_64  	.archdata	= { .vclock_mode = VCLOCK_HPET }, -#endif  };  static int hpet_clocksource_register(void) @@ -943,12 +937,14 @@ static __init int hpet_late_init(void)  	if (boot_cpu_has(X86_FEATURE_ARAT))  		return 0; +	cpu_notifier_register_begin();  	for_each_online_cpu(cpu) {  		hpet_cpuhp_notify(NULL, CPU_ONLINE, (void *)(long)cpu);  	}  	/* This notifier should be called after workqueue is ready */ -	hotcpu_notifier(hpet_cpuhp_notify, -20); +	__hotcpu_notifier(hpet_cpuhp_notify, -20); +	cpu_notifier_register_done();  	return 0;  } diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index f66ff162dce..5f9cf20cdb6 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -32,13 +32,11 @@  #include <linux/irqflags.h>  #include <linux/notifier.h>  #include <linux/kallsyms.h> -#include <linux/kprobes.h>  #include <linux/percpu.h>  #include <linux/kdebug.h>  #include <linux/kernel.h>  
#include <linux/module.h>  #include <linux/sched.h> -#include <linux/init.h>  #include <linux/smp.h>  #include <asm/hw_breakpoint.h> @@ -425,7 +423,7 @@ EXPORT_SYMBOL_GPL(hw_breakpoint_restore);   * NOTIFY_STOP returned for all other cases   *   */ -static int __kprobes hw_breakpoint_handler(struct die_args *args) +static int hw_breakpoint_handler(struct die_args *args)  {  	int i, cpu, rc = NOTIFY_STOP;  	struct perf_event *bp; @@ -512,7 +510,7 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args)  /*   * Handle debug exception notifications.   */ -int __kprobes hw_breakpoint_exceptions_notify( +int hw_breakpoint_exceptions_notify(  		struct notifier_block *unused, unsigned long val, void *data)  {  	if (val != DIE_DEBUG) diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c index 0fa69127209..05fd74f537d 100644 --- a/arch/x86/kernel/i386_ksyms_32.c +++ b/arch/x86/kernel/i386_ksyms_32.c @@ -37,3 +37,10 @@ EXPORT_SYMBOL(strstr);  EXPORT_SYMBOL(csum_partial);  EXPORT_SYMBOL(empty_zero_page); + +#ifdef CONFIG_PREEMPT +EXPORT_SYMBOL(___preempt_schedule); +#ifdef CONFIG_CONTEXT_TRACKING +EXPORT_SYMBOL(___preempt_schedule_context); +#endif +#endif diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 5d576ab3440..d5dd8081441 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -86,10 +86,19 @@ EXPORT_SYMBOL(__kernel_fpu_begin);  void __kernel_fpu_end(void)  { -	if (use_eager_fpu()) -		math_state_restore(); -	else +	if (use_eager_fpu()) { +		/* +		 * For eager fpu, most the time, tsk_used_math() is true. +		 * Restore the user math as we are done with the kernel usage. +		 * At few instances during thread exit, signal handling etc, +		 * tsk_used_math() is false. Those few places will take proper +		 * actions, so we don't need to restore the math here. +		 */ +		if (likely(tsk_used_math(current))) +			math_state_restore(); +	} else {  		stts(); +	}  }  EXPORT_SYMBOL(__kernel_fpu_end); @@ -100,7 +109,7 @@ void unlazy_fpu(struct task_struct *tsk)  		__save_init_fpu(tsk);  		__thread_fpu_end(tsk);  	} else -		tsk->fpu_counter = 0; +		tsk->thread.fpu_counter = 0;  	preempt_enable();  }  EXPORT_SYMBOL(unlazy_fpu); diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 9a5c460404d..8af817105e2 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -299,21 +299,38 @@ static void unmask_8259A(void)  static void init_8259A(int auto_eoi)  {  	unsigned long flags; +	unsigned char probe_val = ~(1 << PIC_CASCADE_IR); +	unsigned char new_val;  	i8259A_auto_eoi = auto_eoi;  	raw_spin_lock_irqsave(&i8259A_lock, flags); -	outb(0xff, PIC_MASTER_IMR);	/* mask all of 8259A-1 */ +	/* +	 * Check to see if we have a PIC. +	 * Mask all except the cascade and read +	 * back the value we just wrote. If we don't +	 * have a PIC, we will read 0xff as opposed to the +	 * value we wrote. +	 */  	outb(0xff, PIC_SLAVE_IMR);	/* mask all of 8259A-2 */ +	outb(probe_val, PIC_MASTER_IMR); +	new_val = inb(PIC_MASTER_IMR); +	if (new_val != probe_val) { +		printk(KERN_INFO "Using NULL legacy PIC\n"); +		legacy_pic = &null_legacy_pic; +		raw_spin_unlock_irqrestore(&i8259A_lock, flags); +		return; +	} + +	outb(0xff, PIC_MASTER_IMR);	/* mask all of 8259A-1 */  	/*  	 * outb_pic - this has to work on a wide range of PC hardware.  	 
*/  	outb_pic(0x11, PIC_MASTER_CMD);	/* ICW1: select 8259A-1 init */ -	/* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 on x86-64, -	   to 0x20-0x27 on i386 */ +	/* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */  	outb_pic(IRQ0_VECTOR, PIC_MASTER_IMR);  	/* 8259A-1 (the master) has a slave on IR2 */ diff --git a/arch/x86/kernel/iosf_mbi.c b/arch/x86/kernel/iosf_mbi.c new file mode 100644 index 00000000000..d30acdc1229 --- /dev/null +++ b/arch/x86/kernel/iosf_mbi.c @@ -0,0 +1,237 @@ +/* + * IOSF-SB MailBox Interface Driver + * Copyright (c) 2013, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for + * more details. + * + * + * The IOSF-SB is a fabric bus available on Atom based SOC's that uses a + * mailbox interface (MBI) to communicate with mutiple devices. This + * driver implements access to this interface for those platforms that can + * enumerate the device using PCI. + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/spinlock.h> +#include <linux/pci.h> + +#include <asm/iosf_mbi.h> + +#define PCI_DEVICE_ID_BAYTRAIL		0x0F00 +#define PCI_DEVICE_ID_QUARK_X1000	0x0958 + +static DEFINE_SPINLOCK(iosf_mbi_lock); + +static inline u32 iosf_mbi_form_mcr(u8 op, u8 port, u8 offset) +{ +	return (op << 24) | (port << 16) | (offset << 8) | MBI_ENABLE; +} + +static struct pci_dev *mbi_pdev;	/* one mbi device */ + +static int iosf_mbi_pci_read_mdr(u32 mcrx, u32 mcr, u32 *mdr) +{ +	int result; + +	if (!mbi_pdev) +		return -ENODEV; + +	if (mcrx) { +		result = pci_write_config_dword(mbi_pdev, MBI_MCRX_OFFSET, +						mcrx); +		if (result < 0) +			goto fail_read; +	} + +	result = pci_write_config_dword(mbi_pdev, MBI_MCR_OFFSET, mcr); +	if (result < 0) +		goto fail_read; + +	result = pci_read_config_dword(mbi_pdev, MBI_MDR_OFFSET, mdr); +	if (result < 0) +		goto fail_read; + +	return 0; + +fail_read: +	dev_err(&mbi_pdev->dev, "PCI config access failed with %d\n", result); +	return result; +} + +static int iosf_mbi_pci_write_mdr(u32 mcrx, u32 mcr, u32 mdr) +{ +	int result; + +	if (!mbi_pdev) +		return -ENODEV; + +	result = pci_write_config_dword(mbi_pdev, MBI_MDR_OFFSET, mdr); +	if (result < 0) +		goto fail_write; + +	if (mcrx) { +		result = pci_write_config_dword(mbi_pdev, MBI_MCRX_OFFSET, +						mcrx); +		if (result < 0) +			goto fail_write; +	} + +	result = pci_write_config_dword(mbi_pdev, MBI_MCR_OFFSET, mcr); +	if (result < 0) +		goto fail_write; + +	return 0; + +fail_write: +	dev_err(&mbi_pdev->dev, "PCI config access failed with %d\n", result); +	return result; +} + +int iosf_mbi_read(u8 port, u8 opcode, u32 offset, u32 *mdr) +{ +	u32 mcr, mcrx; +	unsigned long flags; +	int ret; + +	/*Access to the GFX unit is handled by GPU code */ +	if (port == BT_MBI_UNIT_GFX) { +		WARN_ON(1); +		return -EPERM; +	} + +	mcr = iosf_mbi_form_mcr(opcode, port, offset & MBI_MASK_LO); +	mcrx = offset & MBI_MASK_HI; + +	spin_lock_irqsave(&iosf_mbi_lock, flags); +	ret = iosf_mbi_pci_read_mdr(mcrx, mcr, mdr); +	spin_unlock_irqrestore(&iosf_mbi_lock, flags); + +	return ret; +} +EXPORT_SYMBOL(iosf_mbi_read); + +int iosf_mbi_write(u8 port, u8 opcode, u32 offset, u32 mdr) +{ +	u32 mcr, mcrx; +	unsigned long 
flags; +	int ret; + +	/*Access to the GFX unit is handled by GPU code */ +	if (port == BT_MBI_UNIT_GFX) { +		WARN_ON(1); +		return -EPERM; +	} + +	mcr = iosf_mbi_form_mcr(opcode, port, offset & MBI_MASK_LO); +	mcrx = offset & MBI_MASK_HI; + +	spin_lock_irqsave(&iosf_mbi_lock, flags); +	ret = iosf_mbi_pci_write_mdr(mcrx, mcr, mdr); +	spin_unlock_irqrestore(&iosf_mbi_lock, flags); + +	return ret; +} +EXPORT_SYMBOL(iosf_mbi_write); + +int iosf_mbi_modify(u8 port, u8 opcode, u32 offset, u32 mdr, u32 mask) +{ +	u32 mcr, mcrx; +	u32 value; +	unsigned long flags; +	int ret; + +	/*Access to the GFX unit is handled by GPU code */ +	if (port == BT_MBI_UNIT_GFX) { +		WARN_ON(1); +		return -EPERM; +	} + +	mcr = iosf_mbi_form_mcr(opcode, port, offset & MBI_MASK_LO); +	mcrx = offset & MBI_MASK_HI; + +	spin_lock_irqsave(&iosf_mbi_lock, flags); + +	/* Read current mdr value */ +	ret = iosf_mbi_pci_read_mdr(mcrx, mcr & MBI_RD_MASK, &value); +	if (ret < 0) { +		spin_unlock_irqrestore(&iosf_mbi_lock, flags); +		return ret; +	} + +	/* Apply mask */ +	value &= ~mask; +	mdr &= mask; +	value |= mdr; + +	/* Write back */ +	ret = iosf_mbi_pci_write_mdr(mcrx, mcr | MBI_WR_MASK, value); + +	spin_unlock_irqrestore(&iosf_mbi_lock, flags); + +	return ret; +} +EXPORT_SYMBOL(iosf_mbi_modify); + +bool iosf_mbi_available(void) +{ +	/* Mbi isn't hot-pluggable. No remove routine is provided */ +	return mbi_pdev; +} +EXPORT_SYMBOL(iosf_mbi_available); + +static int iosf_mbi_probe(struct pci_dev *pdev, +			  const struct pci_device_id *unused) +{ +	int ret; + +	ret = pci_enable_device(pdev); +	if (ret < 0) { +		dev_err(&pdev->dev, "error: could not enable device\n"); +		return ret; +	} + +	mbi_pdev = pci_dev_get(pdev); +	return 0; +} + +static DEFINE_PCI_DEVICE_TABLE(iosf_mbi_pci_ids) = { +	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_BAYTRAIL) }, +	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_QUARK_X1000) }, +	{ 0, }, +}; +MODULE_DEVICE_TABLE(pci, iosf_mbi_pci_ids); + +static struct pci_driver iosf_mbi_pci_driver = { +	.name		= "iosf_mbi_pci", +	.probe		= iosf_mbi_probe, +	.id_table	= iosf_mbi_pci_ids, +}; + +static int __init iosf_mbi_init(void) +{ +	return pci_register_driver(&iosf_mbi_pci_driver); +} + +static void __exit iosf_mbi_exit(void) +{ +	pci_unregister_driver(&iosf_mbi_pci_driver); +	if (mbi_pdev) { +		pci_dev_put(mbi_pdev); +		mbi_pdev = NULL; +	} +} + +module_init(iosf_mbi_init); +module_exit(iosf_mbi_exit); + +MODULE_AUTHOR("David E. 
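To make the register encoding used by the driver above concrete, here is a small user-space rendering of iosf_mbi_form_mcr(): opcode, port and the low byte of the offset are packed into one 32-bit MCR, and the rest of the offset goes into the separate MCRX write. MBI_ENABLE and the offset masks are not spelled out in this patch, so stand-in values are used below.

#include <stdio.h>
#include <stdint.h>

#define MBI_ENABLE	0xf0		/* stand-in; the real value lives in <asm/iosf_mbi.h> */
#define MBI_MASK_LO	0x000000ffu	/* stand-ins mirroring how the driver splits the offset */
#define MBI_MASK_HI	0xffffff00u

/* same packing as iosf_mbi_form_mcr() in the patch */
static uint32_t form_mcr(uint8_t op, uint8_t port, uint8_t offset)
{
	return ((uint32_t)op << 24) | ((uint32_t)port << 16) |
	       ((uint32_t)offset << 8) | MBI_ENABLE;
}

int main(void)
{
	/* purely illustrative opcode/port/offset values, not real Baytrail ones */
	uint8_t opcode = 0x06, port = 0x04;
	uint32_t offset = 0x1234;

	uint32_t mcr  = form_mcr(opcode, port, offset & MBI_MASK_LO);
	uint32_t mcrx = offset & MBI_MASK_HI;

	printf("offset %#x -> MCR %#010x, MCRX %#010x\n", offset, mcr, mcrx);
	/* for reads the driver writes MCRX first (when non-zero), then MCR, then
	 * reads MDR back; for writes MDR is pushed before MCRX and MCR, all via
	 * the PCI config space of the MBI device */
	return 0;
}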
Box <david.e.box@linux.intel.com>"); +MODULE_DESCRIPTION("IOSF Mailbox Interface accessor"); +MODULE_LICENSE("GPL v2"); diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 22d0687e7fd..922d2858102 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -17,6 +17,7 @@  #include <asm/idle.h>  #include <asm/mce.h>  #include <asm/hw_irq.h> +#include <asm/desc.h>  #define CREATE_TRACE_POINTS  #include <asm/trace/irq_vectors.h> @@ -125,6 +126,12 @@ int arch_show_interrupts(struct seq_file *p, int prec)  		seq_printf(p, "%10u ", per_cpu(mce_poll_count, j));  	seq_printf(p, "  Machine check polls\n");  #endif +#if IS_ENABLED(CONFIG_HYPERV) || defined(CONFIG_XEN) +	seq_printf(p, "%*s: ", prec, "THR"); +	for_each_online_cpu(j) +		seq_printf(p, "%10u ", irq_stats(j)->irq_hv_callback_count); +	seq_printf(p, "  Hypervisor callback interrupts\n"); +#endif  	seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count));  #if defined(CONFIG_X86_IO_APIC)  	seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count)); @@ -193,9 +200,13 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)  	if (!handle_irq(irq, regs)) {  		ack_APIC_irq(); -		if (printk_ratelimit()) -			pr_emerg("%s: %d.%d No irq handler for vector (irq %d)\n", -				__func__, smp_processor_id(), vector, irq); +		if (irq != VECTOR_RETRIGGERED) { +			pr_emerg_ratelimited("%s: %d.%d No irq handler for vector (irq %d)\n", +					     __func__, smp_processor_id(), +					     vector, irq); +		} else { +			__this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED); +		}  	}  	irq_exit(); @@ -262,6 +273,90 @@ __visible void smp_trace_x86_platform_ipi(struct pt_regs *regs)  EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);  #ifdef CONFIG_HOTPLUG_CPU + +/* These two declarations are only used in check_irq_vectors_for_cpu_disable() + * below, which is protected by stop_machine().  Putting them on the stack + * results in a stack frame overflow.  Dynamically allocating could result in a + * failure so declare these two cpumasks as global. + */ +static struct cpumask affinity_new, online_new; + +/* + * This cpu is going to be removed and its vectors migrated to the remaining + * online cpus.  Check to see if there are enough vectors in the remaining cpus. + * This function is protected by stop_machine(). + */ +int check_irq_vectors_for_cpu_disable(void) +{ +	int irq, cpu; +	unsigned int this_cpu, vector, this_count, count; +	struct irq_desc *desc; +	struct irq_data *data; + +	this_cpu = smp_processor_id(); +	cpumask_copy(&online_new, cpu_online_mask); +	cpu_clear(this_cpu, online_new); + +	this_count = 0; +	for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { +		irq = __this_cpu_read(vector_irq[vector]); +		if (irq >= 0) { +			desc = irq_to_desc(irq); +			data = irq_desc_get_irq_data(desc); +			cpumask_copy(&affinity_new, data->affinity); +			cpu_clear(this_cpu, affinity_new); + +			/* Do not count inactive or per-cpu irqs. */ +			if (!irq_has_action(irq) || irqd_is_per_cpu(data)) +				continue; + +			/* +			 * A single irq may be mapped to multiple +			 * cpu's vector_irq[] (for example IOAPIC cluster +			 * mode).  
In this case we have two +			 * possibilities: +			 * +			 * 1) the resulting affinity mask is empty; that is +			 * this the down'd cpu is the last cpu in the irq's +			 * affinity mask, or +			 * +			 * 2) the resulting affinity mask is no longer +			 * a subset of the online cpus but the affinity +			 * mask is not zero; that is the down'd cpu is the +			 * last online cpu in a user set affinity mask. +			 */ +			if (cpumask_empty(&affinity_new) || +			    !cpumask_subset(&affinity_new, &online_new)) +				this_count++; +		} +	} + +	count = 0; +	for_each_online_cpu(cpu) { +		if (cpu == this_cpu) +			continue; +		/* +		 * We scan from FIRST_EXTERNAL_VECTOR to first system +		 * vector. If the vector is marked in the used vectors +		 * bitmap or an irq is assigned to it, we don't count +		 * it as available. +		 */ +		for (vector = FIRST_EXTERNAL_VECTOR; +		     vector < first_system_vector; vector++) { +			if (!test_bit(vector, used_vectors) && +			    per_cpu(vector_irq, cpu)[vector] < 0) +					count++; +		} +	} + +	if (count < this_count) { +		pr_warn("CPU %d disable failed: CPU has %u vectors assigned and there are only %u available.\n", +			this_cpu, this_count, count); +		return -ERANGE; +	} +	return 0; +} +  /* A cpu has been removed from cpu_online_mask.  Reset irq affinities. */  void fixup_irqs(void)  { @@ -270,6 +365,7 @@ void fixup_irqs(void)  	struct irq_desc *desc;  	struct irq_data *data;  	struct irq_chip *chip; +	int ret;  	for_each_irq_desc(irq, desc) {  		int break_affinity = 0; @@ -308,10 +404,14 @@ void fixup_irqs(void)  		if (!irqd_can_move_in_process_context(data) && chip->irq_mask)  			chip->irq_mask(data); -		if (chip->irq_set_affinity) -			chip->irq_set_affinity(data, affinity, true); -		else if (!(warned++)) -			set_affinity = 0; +		if (chip->irq_set_affinity) { +			ret = chip->irq_set_affinity(data, affinity, true); +			if (ret == -ENOSPC) +				pr_crit("IRQ %d set affinity failed because there are no available vectors.  
The device assigned to this IRQ is unstable.\n", irq); +		} else { +			if (!(warned++)) +				set_affinity = 0; +		}  		/*  		 * We unmask if the irq was not marked masked by the @@ -344,7 +444,7 @@ void fixup_irqs(void)  	for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {  		unsigned int irr; -		if (__this_cpu_read(vector_irq[vector]) < 0) +		if (__this_cpu_read(vector_irq[vector]) <= VECTOR_UNDEFINED)  			continue;  		irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); @@ -355,11 +455,14 @@ void fixup_irqs(void)  			data = irq_desc_get_irq_data(desc);  			chip = irq_data_get_irq_chip(data);  			raw_spin_lock(&desc->lock); -			if (chip->irq_retrigger) +			if (chip->irq_retrigger) {  				chip->irq_retrigger(data); +				__this_cpu_write(vector_irq[vector], VECTOR_RETRIGGERED); +			}  			raw_spin_unlock(&desc->lock);  		} -		__this_cpu_write(vector_irq[vector], -1); +		if (__this_cpu_read(vector_irq[vector]) != VECTOR_RETRIGGERED) +			__this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED);  	}  }  #endif diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 4186755f1d7..63ce838e5a5 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -55,16 +55,8 @@ static inline int check_stack_overflow(void) { return 0; }  static inline void print_stack_overflow(void) { }  #endif -/* - * per-CPU IRQ handling contexts (thread information and stack) - */ -union irq_ctx { -	struct thread_info      tinfo; -	u32                     stack[THREAD_SIZE/sizeof(u32)]; -} __attribute__((aligned(THREAD_SIZE))); - -static DEFINE_PER_CPU(union irq_ctx *, hardirq_ctx); -static DEFINE_PER_CPU(union irq_ctx *, softirq_ctx); +DEFINE_PER_CPU(struct irq_stack *, hardirq_stack); +DEFINE_PER_CPU(struct irq_stack *, softirq_stack);  static void call_on_stack(void *func, void *stack)  { @@ -77,14 +69,26 @@ static void call_on_stack(void *func, void *stack)  		     : "memory", "cc", "edx", "ecx", "eax");  } +/* how to get the current stack pointer from C */ +#define current_stack_pointer ({		\ +	unsigned long sp;			\ +	asm("mov %%esp,%0" : "=g" (sp));	\ +	sp;					\ +}) + +static inline void *current_stack(void) +{ +	return (void *)(current_stack_pointer & ~(THREAD_SIZE - 1)); +} +  static inline int  execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)  { -	union irq_ctx *curctx, *irqctx; -	u32 *isp, arg1, arg2; +	struct irq_stack *curstk, *irqstk; +	u32 *isp, *prev_esp, arg1, arg2; -	curctx = (union irq_ctx *) current_thread_info(); -	irqctx = __this_cpu_read(hardirq_ctx); +	curstk = (struct irq_stack *) current_stack(); +	irqstk = __this_cpu_read(hardirq_stack);  	/*  	 * this is where we switch to the IRQ stack. However, if we are @@ -92,16 +96,14 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)  	 * handler) we can't do that and just have to keep using the  	 * current stack (which is the irq stack already after all)  	 */ -	if (unlikely(curctx == irqctx)) +	if (unlikely(curstk == irqstk))  		return 0; -	/* build the stack frame on the IRQ stack */ -	isp = (u32 *) ((char *)irqctx + sizeof(*irqctx)); -	irqctx->tinfo.task = curctx->tinfo.task; -	irqctx->tinfo.previous_esp = current_stack_pointer; +	isp = (u32 *) ((char *)irqstk + sizeof(*irqstk)); -	/* Copy the preempt_count so that the [soft]irq checks work. 
*/ -	irqctx->tinfo.preempt_count = curctx->tinfo.preempt_count; +	/* Save the next esp at the bottom of the stack */ +	prev_esp = (u32 *)irqstk; +	*prev_esp = current_stack_pointer;  	if (unlikely(overflow))  		call_on_stack(print_stack_overflow, isp); @@ -121,63 +123,42 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)   */  void irq_ctx_init(int cpu)  { -	union irq_ctx *irqctx; +	struct irq_stack *irqstk; -	if (per_cpu(hardirq_ctx, cpu)) +	if (per_cpu(hardirq_stack, cpu))  		return; -	irqctx = page_address(alloc_pages_node(cpu_to_node(cpu), +	irqstk = page_address(alloc_pages_node(cpu_to_node(cpu),  					       THREADINFO_GFP,  					       THREAD_SIZE_ORDER)); -	memset(&irqctx->tinfo, 0, sizeof(struct thread_info)); -	irqctx->tinfo.cpu		= cpu; -	irqctx->tinfo.preempt_count	= HARDIRQ_OFFSET; -	irqctx->tinfo.addr_limit	= MAKE_MM_SEG(0); +	per_cpu(hardirq_stack, cpu) = irqstk; -	per_cpu(hardirq_ctx, cpu) = irqctx; - -	irqctx = page_address(alloc_pages_node(cpu_to_node(cpu), +	irqstk = page_address(alloc_pages_node(cpu_to_node(cpu),  					       THREADINFO_GFP,  					       THREAD_SIZE_ORDER)); -	memset(&irqctx->tinfo, 0, sizeof(struct thread_info)); -	irqctx->tinfo.cpu		= cpu; -	irqctx->tinfo.addr_limit	= MAKE_MM_SEG(0); - -	per_cpu(softirq_ctx, cpu) = irqctx; +	per_cpu(softirq_stack, cpu) = irqstk;  	printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n", -	       cpu, per_cpu(hardirq_ctx, cpu),  per_cpu(softirq_ctx, cpu)); +	       cpu, per_cpu(hardirq_stack, cpu),  per_cpu(softirq_stack, cpu));  } -asmlinkage void do_softirq(void) +void do_softirq_own_stack(void)  { -	unsigned long flags; -	struct thread_info *curctx; -	union irq_ctx *irqctx; -	u32 *isp; - -	if (in_interrupt()) -		return; - -	local_irq_save(flags); +	struct thread_info *curstk; +	struct irq_stack *irqstk; +	u32 *isp, *prev_esp; -	if (local_softirq_pending()) { -		curctx = current_thread_info(); -		irqctx = __this_cpu_read(softirq_ctx); -		irqctx->tinfo.task = curctx->task; -		irqctx->tinfo.previous_esp = current_stack_pointer; +	curstk = current_stack(); +	irqstk = __this_cpu_read(softirq_stack); -		/* build the stack frame on the softirq stack */ -		isp = (u32 *) ((char *)irqctx + sizeof(*irqctx)); +	/* build the stack frame on the softirq stack */ +	isp = (u32 *) ((char *)irqstk + sizeof(*irqstk)); -		call_on_stack(__do_softirq, isp); -		/* -		 * Shouldn't happen, we returned above if in_interrupt(): -		 */ -		WARN_ON_ONCE(softirq_count()); -	} +	/* Push the previous esp onto the stack */ +	prev_esp = (u32 *)irqstk; +	*prev_esp = current_stack_pointer; -	local_irq_restore(flags); +	call_on_stack(__do_softirq, isp);  }  bool handle_irq(unsigned irq, struct pt_regs *regs) diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index d04d3ecded6..4d1c746892e 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -87,24 +87,3 @@ bool handle_irq(unsigned irq, struct pt_regs *regs)  	generic_handle_irq_desc(irq, desc);  	return true;  } - - -extern void call_softirq(void); - -asmlinkage void do_softirq(void) -{ -	__u32 pending; -	unsigned long flags; - -	if (in_interrupt()) -		return; - -	local_irq_save(flags); -	pending = local_softirq_pending(); -	/* Switch to interrupt stack */ -	if (pending) { -		call_softirq(); -		WARN_ON_ONCE(softirq_count()); -	} -	local_irq_restore(flags); -} diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index a2a1fbc594f..7f50156542f 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -52,7 +52,7 @@ static 
struct irqaction irq2 = {  };  DEFINE_PER_CPU(vector_irq_t, vector_irq) = { -	[0 ... NR_VECTORS - 1] = -1, +	[0 ... NR_VECTORS - 1] = VECTOR_UNDEFINED,  };  int vector_used_by_percpu_irq(unsigned int vector) @@ -60,7 +60,7 @@ int vector_used_by_percpu_irq(unsigned int vector)  	int cpu;  	for_each_online_cpu(cpu) { -		if (per_cpu(vector_irq, cpu)[vector] != -1) +		if (per_cpu(vector_irq, cpu)[vector] > VECTOR_UNDEFINED)  			return 1;  	} diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index ee11b7dfbfb..26d5a55a273 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -42,15 +42,27 @@ static void __jump_label_transform(struct jump_entry *entry,  				   int init)  {  	union jump_code_union code; +	const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP };  	const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5];  	if (type == JUMP_LABEL_ENABLE) { -		/* -		 * We are enabling this jump label. If it is not a nop -		 * then something must have gone wrong. -		 */ -		if (unlikely(memcmp((void *)entry->code, ideal_nop, 5) != 0)) -			bug_at((void *)entry->code, __LINE__); +		if (init) { +			/* +			 * Jump label is enabled for the first time. +			 * So we expect a default_nop... +			 */ +			if (unlikely(memcmp((void *)entry->code, default_nop, 5) +				     != 0)) +				bug_at((void *)entry->code, __LINE__); +		} else { +			/* +			 * ...otherwise expect an ideal_nop. Otherwise +			 * something went horribly wrong. +			 */ +			if (unlikely(memcmp((void *)entry->code, ideal_nop, 5) +				     != 0)) +				bug_at((void *)entry->code, __LINE__); +		}  		code.jump = 0xe9;  		code.offset = entry->target - @@ -63,7 +75,6 @@ static void __jump_label_transform(struct jump_entry *entry,  		 * are converting the default nop to the ideal nop.  		 
*/  		if (init) { -			const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP };  			if (unlikely(memcmp((void *)entry->code, default_nop, 5) != 0))  				bug_at((void *)entry->code, __LINE__);  		} else { diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 836f8322960..7ec1d5f8d28 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -39,7 +39,6 @@  #include <linux/sched.h>  #include <linux/delay.h>  #include <linux/kgdb.h> -#include <linux/init.h>  #include <linux/smp.h>  #include <linux/nmi.h>  #include <linux/hw_breakpoint.h> diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 79a3f968287..67e6d19ef1b 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -112,7 +112,8 @@ struct kretprobe_blackpoint kretprobe_blacklist[] = {  const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist); -static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op) +static nokprobe_inline void +__synthesize_relative_insn(void *from, void *to, u8 op)  {  	struct __arch_relative_insn {  		u8 op; @@ -125,21 +126,23 @@ static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op)  }  /* Insert a jump instruction at address 'from', which jumps to address 'to'.*/ -void __kprobes synthesize_reljump(void *from, void *to) +void synthesize_reljump(void *from, void *to)  {  	__synthesize_relative_insn(from, to, RELATIVEJUMP_OPCODE);  } +NOKPROBE_SYMBOL(synthesize_reljump);  /* Insert a call instruction at address 'from', which calls address 'to'.*/ -void __kprobes synthesize_relcall(void *from, void *to) +void synthesize_relcall(void *from, void *to)  {  	__synthesize_relative_insn(from, to, RELATIVECALL_OPCODE);  } +NOKPROBE_SYMBOL(synthesize_relcall);  /*   * Skip the prefixes of the instruction.   */ -static kprobe_opcode_t *__kprobes skip_prefixes(kprobe_opcode_t *insn) +static kprobe_opcode_t *skip_prefixes(kprobe_opcode_t *insn)  {  	insn_attr_t attr; @@ -154,12 +157,13 @@ static kprobe_opcode_t *__kprobes skip_prefixes(kprobe_opcode_t *insn)  #endif  	return insn;  } +NOKPROBE_SYMBOL(skip_prefixes);  /*   * Returns non-zero if opcode is boostable.   * RIP relative instructions are adjusted at copying time in 64 bits mode   */ -int __kprobes can_boost(kprobe_opcode_t *opcodes) +int can_boost(kprobe_opcode_t *opcodes)  {  	kprobe_opcode_t opcode;  	kprobe_opcode_t *orig_opcodes = opcodes; @@ -260,7 +264,7 @@ unsigned long recover_probed_instruction(kprobe_opcode_t *buf, unsigned long add  }  /* Check if paddr is at an instruction boundary */ -static int __kprobes can_probe(unsigned long paddr) +static int can_probe(unsigned long paddr)  {  	unsigned long addr, __addr, offset = 0;  	struct insn insn; @@ -299,7 +303,7 @@ static int __kprobes can_probe(unsigned long paddr)  /*   * Returns non-zero if opcode modifies the interrupt flag.   */ -static int __kprobes is_IF_modifier(kprobe_opcode_t *insn) +static int is_IF_modifier(kprobe_opcode_t *insn)  {  	/* Skip prefixes */  	insn = skip_prefixes(insn); @@ -322,7 +326,7 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)   * If not, return null.   * Only applicable to 64-bit x86.   
*/ -int __kprobes __copy_instruction(u8 *dest, u8 *src) +int __copy_instruction(u8 *dest, u8 *src)  {  	struct insn insn;  	kprobe_opcode_t buf[MAX_INSN_SIZE]; @@ -365,7 +369,7 @@ int __kprobes __copy_instruction(u8 *dest, u8 *src)  	return insn.length;  } -static int __kprobes arch_copy_kprobe(struct kprobe *p) +static int arch_copy_kprobe(struct kprobe *p)  {  	int ret; @@ -392,7 +396,7 @@ static int __kprobes arch_copy_kprobe(struct kprobe *p)  	return 0;  } -int __kprobes arch_prepare_kprobe(struct kprobe *p) +int arch_prepare_kprobe(struct kprobe *p)  {  	if (alternatives_text_reserved(p->addr, p->addr))  		return -EINVAL; @@ -407,17 +411,17 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p)  	return arch_copy_kprobe(p);  } -void __kprobes arch_arm_kprobe(struct kprobe *p) +void arch_arm_kprobe(struct kprobe *p)  {  	text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1);  } -void __kprobes arch_disarm_kprobe(struct kprobe *p) +void arch_disarm_kprobe(struct kprobe *p)  {  	text_poke(p->addr, &p->opcode, 1);  } -void __kprobes arch_remove_kprobe(struct kprobe *p) +void arch_remove_kprobe(struct kprobe *p)  {  	if (p->ainsn.insn) {  		free_insn_slot(p->ainsn.insn, (p->ainsn.boostable == 1)); @@ -425,7 +429,8 @@ void __kprobes arch_remove_kprobe(struct kprobe *p)  	}  } -static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb) +static nokprobe_inline void +save_previous_kprobe(struct kprobe_ctlblk *kcb)  {  	kcb->prev_kprobe.kp = kprobe_running();  	kcb->prev_kprobe.status = kcb->kprobe_status; @@ -433,7 +438,8 @@ static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)  	kcb->prev_kprobe.saved_flags = kcb->kprobe_saved_flags;  } -static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb) +static nokprobe_inline void +restore_previous_kprobe(struct kprobe_ctlblk *kcb)  {  	__this_cpu_write(current_kprobe, kcb->prev_kprobe.kp);  	kcb->kprobe_status = kcb->prev_kprobe.status; @@ -441,8 +447,9 @@ static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)  	kcb->kprobe_saved_flags = kcb->prev_kprobe.saved_flags;  } -static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs, -				struct kprobe_ctlblk *kcb) +static nokprobe_inline void +set_current_kprobe(struct kprobe *p, struct pt_regs *regs, +		   struct kprobe_ctlblk *kcb)  {  	__this_cpu_write(current_kprobe, p);  	kcb->kprobe_saved_flags = kcb->kprobe_old_flags @@ -451,7 +458,7 @@ static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,  		kcb->kprobe_saved_flags &= ~X86_EFLAGS_IF;  } -static void __kprobes clear_btf(void) +static nokprobe_inline void clear_btf(void)  {  	if (test_thread_flag(TIF_BLOCKSTEP)) {  		unsigned long debugctl = get_debugctlmsr(); @@ -461,7 +468,7 @@ static void __kprobes clear_btf(void)  	}  } -static void __kprobes restore_btf(void) +static nokprobe_inline void restore_btf(void)  {  	if (test_thread_flag(TIF_BLOCKSTEP)) {  		unsigned long debugctl = get_debugctlmsr(); @@ -471,8 +478,7 @@ static void __kprobes restore_btf(void)  	}  } -void __kprobes -arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) +void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs)  {  	unsigned long *sara = stack_addr(regs); @@ -481,9 +487,10 @@ arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs)  	/* Replace the return addr with trampoline addr */  	*sara = (unsigned long) &kretprobe_trampoline;  } +NOKPROBE_SYMBOL(arch_prepare_kretprobe); 
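The kprobes hunks above and below all follow one pattern: the old __kprobes section attribute is dropped, small helpers become nokprobe_inline, and externally callable handlers get a NOKPROBE_SYMBOL() line after their definition, which the series uses to blacklist them for kprobes without relocating them into .kprobes.text. A minimal sketch of that annotation pattern is shown here; the names my_mark_active and my_trap_notifier are hypothetical and do not appear in the patch.

	#include <linux/kprobes.h>

	/* Small helper: forced inline so it can never be probed on its own. */
	static nokprobe_inline void my_mark_active(struct kprobe_ctlblk *kcb)
	{
		kcb->kprobe_status = KPROBE_HIT_ACTIVE;
	}

	/* Handler that must stay probe-free but keeps its normal text placement. */
	int my_trap_notifier(struct pt_regs *regs)
	{
		my_mark_active(get_kprobe_ctlblk());
		return 0;
	}
	/* Recorded in the kprobe blacklist instead of tagging the definition. */
	NOKPROBE_SYMBOL(my_trap_notifier);

The point of the switch, as far as these hunks show, is that blacklisting by symbol works for functions that cannot carry a section attribute (assembly entry points, symbols referenced from other files) and leaves the compiler free to place the code normally.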
-static void __kprobes -setup_singlestep(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb, int reenter) +static void setup_singlestep(struct kprobe *p, struct pt_regs *regs, +			     struct kprobe_ctlblk *kcb, int reenter)  {  	if (setup_detour_execution(p, regs, reenter))  		return; @@ -519,22 +526,24 @@ setup_singlestep(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *k  	else  		regs->ip = (unsigned long)p->ainsn.insn;  } +NOKPROBE_SYMBOL(setup_singlestep);  /*   * We have reentered the kprobe_handler(), since another probe was hit while   * within the handler. We save the original kprobes variables and just single   * step on the instruction of the new probe without calling any user handlers.   */ -static int __kprobes -reenter_kprobe(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb) +static int reenter_kprobe(struct kprobe *p, struct pt_regs *regs, +			  struct kprobe_ctlblk *kcb)  {  	switch (kcb->kprobe_status) {  	case KPROBE_HIT_SSDONE:  	case KPROBE_HIT_ACTIVE: +	case KPROBE_HIT_SS:  		kprobes_inc_nmissed_count(p);  		setup_singlestep(p, regs, kcb, 1);  		break; -	case KPROBE_HIT_SS: +	case KPROBE_REENTER:  		/* A probe has been hit in the codepath leading up to, or just  		 * after, single-stepping of a probed instruction. This entire  		 * codepath should strictly reside in .kprobes.text section. @@ -553,17 +562,21 @@ reenter_kprobe(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb  	return 1;  } +NOKPROBE_SYMBOL(reenter_kprobe);  /*   * Interrupts are disabled on entry as trap3 is an interrupt gate and they   * remain disabled throughout this function.   */ -static int __kprobes kprobe_handler(struct pt_regs *regs) +int kprobe_int3_handler(struct pt_regs *regs)  {  	kprobe_opcode_t *addr;  	struct kprobe *p;  	struct kprobe_ctlblk *kcb; +	if (user_mode_vm(regs)) +		return 0; +  	addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t));  	/*  	 * We don't want to be preempted for the entire @@ -621,12 +634,13 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)  	preempt_enable_no_resched();  	return 0;  } +NOKPROBE_SYMBOL(kprobe_int3_handler);  /*   * When a retprobed function returns, this code saves registers and   * calls trampoline_handler() runs, which calls the kretprobe's handler.   */ -static void __used __kprobes kretprobe_trampoline_holder(void) +static void __used kretprobe_trampoline_holder(void)  {  	asm volatile (  			".global kretprobe_trampoline\n" @@ -657,11 +671,13 @@ static void __used __kprobes kretprobe_trampoline_holder(void)  #endif  			"	ret\n");  } +NOKPROBE_SYMBOL(kretprobe_trampoline_holder); +NOKPROBE_SYMBOL(kretprobe_trampoline);  /*   * Called from kretprobe_trampoline   */ -__visible __used __kprobes void *trampoline_handler(struct pt_regs *regs) +__visible __used void *trampoline_handler(struct pt_regs *regs)  {  	struct kretprobe_instance *ri = NULL;  	struct hlist_head *head, empty_rp; @@ -747,6 +763,7 @@ __visible __used __kprobes void *trampoline_handler(struct pt_regs *regs)  	}  	return (void *)orig_ret_address;  } +NOKPROBE_SYMBOL(trampoline_handler);  /*   * Called after single-stepping.  p->addr is the address of the @@ -775,8 +792,8 @@ __visible __used __kprobes void *trampoline_handler(struct pt_regs *regs)   * jump instruction after the copied instruction, that jumps to the next   * instruction after the probepoint.   
*/ -static void __kprobes -resume_execution(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb) +static void resume_execution(struct kprobe *p, struct pt_regs *regs, +			     struct kprobe_ctlblk *kcb)  {  	unsigned long *tos = stack_addr(regs);  	unsigned long copy_ip = (unsigned long)p->ainsn.insn; @@ -851,12 +868,13 @@ resume_execution(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *k  no_change:  	restore_btf();  } +NOKPROBE_SYMBOL(resume_execution);  /*   * Interrupts are disabled on entry as trap1 is an interrupt gate and they   * remain disabled throughout this function.   */ -static int __kprobes post_kprobe_handler(struct pt_regs *regs) +int kprobe_debug_handler(struct pt_regs *regs)  {  	struct kprobe *cur = kprobe_running();  	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); @@ -891,15 +909,17 @@ out:  	return 1;  } +NOKPROBE_SYMBOL(kprobe_debug_handler); -int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr) +int kprobe_fault_handler(struct pt_regs *regs, int trapnr)  {  	struct kprobe *cur = kprobe_running();  	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); -	switch (kcb->kprobe_status) { -	case KPROBE_HIT_SS: -	case KPROBE_REENTER: +	if (unlikely(regs->ip == (unsigned long)cur->ainsn.insn)) { +		/* This must happen on single-stepping */ +		WARN_ON(kcb->kprobe_status != KPROBE_HIT_SS && +			kcb->kprobe_status != KPROBE_REENTER);  		/*  		 * We are here because the instruction being single  		 * stepped caused a page fault. We reset the current @@ -914,9 +934,8 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)  		else  			reset_current_kprobe();  		preempt_enable_no_resched(); -		break; -	case KPROBE_HIT_ACTIVE: -	case KPROBE_HIT_SSDONE: +	} else if (kcb->kprobe_status == KPROBE_HIT_ACTIVE || +		   kcb->kprobe_status == KPROBE_HIT_SSDONE) {  		/*  		 * We increment the nmissed count for accounting,  		 * we can also use npre/npostfault count for accounting @@ -945,18 +964,17 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)  		 * fixup routine could not handle it,  		 * Let do_page_fault() fix it.  		 */ -		break; -	default: -		break;  	} +  	return 0;  } +NOKPROBE_SYMBOL(kprobe_fault_handler);  /*   * Wrapper routine for handling exceptions.   
*/ -int __kprobes -kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, void *data) +int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, +			     void *data)  {  	struct die_args *args = data;  	int ret = NOTIFY_DONE; @@ -964,22 +982,7 @@ kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, void *d  	if (args->regs && user_mode_vm(args->regs))  		return ret; -	switch (val) { -	case DIE_INT3: -		if (kprobe_handler(args->regs)) -			ret = NOTIFY_STOP; -		break; -	case DIE_DEBUG: -		if (post_kprobe_handler(args->regs)) { -			/* -			 * Reset the BS bit in dr6 (pointed by args->err) to -			 * denote completion of processing -			 */ -			(*(unsigned long *)ERR_PTR(args->err)) &= ~DR_STEP; -			ret = NOTIFY_STOP; -		} -		break; -	case DIE_GPF: +	if (val == DIE_GPF) {  		/*  		 * To be potentially processing a kprobe fault and to  		 * trust the result from kprobe_running(), we have @@ -988,14 +991,12 @@ kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, void *d  		if (!preemptible() && kprobe_running() &&  		    kprobe_fault_handler(args->regs, args->trapnr))  			ret = NOTIFY_STOP; -		break; -	default: -		break;  	}  	return ret;  } +NOKPROBE_SYMBOL(kprobe_exceptions_notify); -int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) +int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)  {  	struct jprobe *jp = container_of(p, struct jprobe, kp);  	unsigned long addr; @@ -1019,8 +1020,9 @@ int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)  	regs->ip = (unsigned long)(jp->entry);  	return 1;  } +NOKPROBE_SYMBOL(setjmp_pre_handler); -void __kprobes jprobe_return(void) +void jprobe_return(void)  {  	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); @@ -1036,8 +1038,10 @@ void __kprobes jprobe_return(void)  			"       nop			\n"::"b"  			(kcb->jprobe_saved_sp):"memory");  } +NOKPROBE_SYMBOL(jprobe_return); +NOKPROBE_SYMBOL(jprobe_return_end); -int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) +int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)  {  	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();  	u8 *addr = (u8 *) (regs->ip - 1); @@ -1065,13 +1069,22 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)  	}  	return 0;  } +NOKPROBE_SYMBOL(longjmp_break_handler); + +bool arch_within_kprobe_blacklist(unsigned long addr) +{ +	return  (addr >= (unsigned long)__kprobes_text_start && +		 addr < (unsigned long)__kprobes_text_end) || +		(addr >= (unsigned long)__entry_text_start && +		 addr < (unsigned long)__entry_text_end); +}  int __init arch_init_kprobes(void)  {  	return 0;  } -int __kprobes arch_trampoline_kprobe(struct kprobe *p) +int arch_trampoline_kprobe(struct kprobe *p)  {  	return 0;  } diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c index 23ef5c556f0..717b02a22e6 100644 --- a/arch/x86/kernel/kprobes/ftrace.c +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -25,8 +25,9 @@  #include "common.h" -static int __skip_singlestep(struct kprobe *p, struct pt_regs *regs, -			     struct kprobe_ctlblk *kcb) +static nokprobe_inline +int __skip_singlestep(struct kprobe *p, struct pt_regs *regs, +		      struct kprobe_ctlblk *kcb)  {  	/*  	 * Emulate singlestep (and also recover regs->ip) @@ -41,18 +42,19 @@ static int __skip_singlestep(struct kprobe *p, struct pt_regs *regs,  	return 1;  } -int __kprobes skip_singlestep(struct kprobe *p, struct pt_regs *regs, -			      struct 
kprobe_ctlblk *kcb) +int skip_singlestep(struct kprobe *p, struct pt_regs *regs, +		    struct kprobe_ctlblk *kcb)  {  	if (kprobe_ftrace(p))  		return __skip_singlestep(p, regs, kcb);  	else  		return 0;  } +NOKPROBE_SYMBOL(skip_singlestep);  /* Ftrace callback handler for kprobes */ -void __kprobes kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, -				     struct ftrace_ops *ops, struct pt_regs *regs) +void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, +			   struct ftrace_ops *ops, struct pt_regs *regs)  {  	struct kprobe *p;  	struct kprobe_ctlblk *kcb; @@ -84,8 +86,9 @@ void __kprobes kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,  end:  	local_irq_restore(flags);  } +NOKPROBE_SYMBOL(kprobe_ftrace_handler); -int __kprobes arch_prepare_kprobe_ftrace(struct kprobe *p) +int arch_prepare_kprobe_ftrace(struct kprobe *p)  {  	p->ainsn.insn = NULL;  	p->ainsn.boostable = -1; diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 898160b42e4..f304773285a 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -77,7 +77,7 @@ found:  }  /* Insert a move instruction which sets a pointer to eax/rdi (1st arg). */ -static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr, unsigned long val) +static void synthesize_set_arg1(kprobe_opcode_t *addr, unsigned long val)  {  #ifdef CONFIG_X86_64  	*addr++ = 0x48; @@ -138,7 +138,8 @@ asm (  #define INT3_SIZE sizeof(kprobe_opcode_t)  /* Optimized kprobe call back function: called from optinsn */ -static void __kprobes optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs) +static void +optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs)  {  	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();  	unsigned long flags; @@ -168,8 +169,9 @@ static void __kprobes optimized_callback(struct optimized_kprobe *op, struct pt_  	}  	local_irq_restore(flags);  } +NOKPROBE_SYMBOL(optimized_callback); -static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src) +static int copy_optimized_instructions(u8 *dest, u8 *src)  {  	int len = 0, ret; @@ -189,7 +191,7 @@ static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src)  }  /* Check whether insn is indirect jump */ -static int __kprobes insn_is_indirect_jump(struct insn *insn) +static int insn_is_indirect_jump(struct insn *insn)  {  	return ((insn->opcode.bytes[0] == 0xff &&  		(X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */ @@ -224,7 +226,7 @@ static int insn_jump_into_range(struct insn *insn, unsigned long start, int len)  }  /* Decode whole function to ensure any instructions don't jump into target */ -static int __kprobes can_optimize(unsigned long paddr) +static int can_optimize(unsigned long paddr)  {  	unsigned long addr, size = 0, offset = 0;  	struct insn insn; @@ -275,7 +277,7 @@ static int __kprobes can_optimize(unsigned long paddr)  }  /* Check optimized_kprobe can actually be optimized. */ -int __kprobes arch_check_optimized_kprobe(struct optimized_kprobe *op) +int arch_check_optimized_kprobe(struct optimized_kprobe *op)  {  	int i;  	struct kprobe *p; @@ -290,15 +292,15 @@ int __kprobes arch_check_optimized_kprobe(struct optimized_kprobe *op)  }  /* Check the addr is within the optimized instructions. 
*/ -int __kprobes -arch_within_optimized_kprobe(struct optimized_kprobe *op, unsigned long addr) +int arch_within_optimized_kprobe(struct optimized_kprobe *op, +				 unsigned long addr)  {  	return ((unsigned long)op->kp.addr <= addr &&  		(unsigned long)op->kp.addr + op->optinsn.size > addr);  }  /* Free optimized instruction slot */ -static __kprobes +static  void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty)  {  	if (op->optinsn.insn) { @@ -308,7 +310,7 @@ void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty)  	}  } -void __kprobes arch_remove_optimized_kprobe(struct optimized_kprobe *op) +void arch_remove_optimized_kprobe(struct optimized_kprobe *op)  {  	__arch_remove_optimized_kprobe(op, 1);  } @@ -318,7 +320,7 @@ void __kprobes arch_remove_optimized_kprobe(struct optimized_kprobe *op)   * Target instructions MUST be relocatable (checked inside)   * This is called when new aggr(opt)probe is allocated or reused.   */ -int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op) +int arch_prepare_optimized_kprobe(struct optimized_kprobe *op)  {  	u8 *buf;  	int ret; @@ -372,7 +374,7 @@ int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op)   * Replace breakpoints (int3) with relative jumps.   * Caller must call with locking kprobe_mutex and text_mutex.   */ -void __kprobes arch_optimize_kprobes(struct list_head *oplist) +void arch_optimize_kprobes(struct list_head *oplist)  {  	struct optimized_kprobe *op, *tmp;  	u8 insn_buf[RELATIVEJUMP_SIZE]; @@ -398,7 +400,7 @@ void __kprobes arch_optimize_kprobes(struct list_head *oplist)  }  /* Replace a relative jump with a breakpoint (int3).  */ -void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op) +void arch_unoptimize_kprobe(struct optimized_kprobe *op)  {  	u8 insn_buf[RELATIVEJUMP_SIZE]; @@ -424,8 +426,7 @@ extern void arch_unoptimize_kprobes(struct list_head *oplist,  	}  } -int  __kprobes -setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter) +int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter)  {  	struct optimized_kprobe *op; @@ -441,3 +442,4 @@ setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter)  	}  	return 0;  } +NOKPROBE_SYMBOL(setup_detour_execution); diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c new file mode 100644 index 00000000000..c2bedaea11f --- /dev/null +++ b/arch/x86/kernel/ksysfs.c @@ -0,0 +1,340 @@ +/* + * Architecture specific sysfs attributes in /sys/kernel + * + * Copyright (C) 2007, Intel Corp. + *      Huang Ying <ying.huang@intel.com> + * Copyright (C) 2013, 2013 Red Hat, Inc. 
+ *      Dave Young <dyoung@redhat.com> + * + * This file is released under the GPLv2 + */ + +#include <linux/kobject.h> +#include <linux/string.h> +#include <linux/sysfs.h> +#include <linux/init.h> +#include <linux/stat.h> +#include <linux/slab.h> +#include <linux/mm.h> + +#include <asm/io.h> +#include <asm/setup.h> + +static ssize_t version_show(struct kobject *kobj, +			    struct kobj_attribute *attr, char *buf) +{ +	return sprintf(buf, "0x%04x\n", boot_params.hdr.version); +} + +static struct kobj_attribute boot_params_version_attr = __ATTR_RO(version); + +static ssize_t boot_params_data_read(struct file *fp, struct kobject *kobj, +				     struct bin_attribute *bin_attr, +				     char *buf, loff_t off, size_t count) +{ +	memcpy(buf, (void *)&boot_params + off, count); +	return count; +} + +static struct bin_attribute boot_params_data_attr = { +	.attr = { +		.name = "data", +		.mode = S_IRUGO, +	}, +	.read = boot_params_data_read, +	.size = sizeof(boot_params), +}; + +static struct attribute *boot_params_version_attrs[] = { +	&boot_params_version_attr.attr, +	NULL, +}; + +static struct bin_attribute *boot_params_data_attrs[] = { +	&boot_params_data_attr, +	NULL, +}; + +static struct attribute_group boot_params_attr_group = { +	.attrs = boot_params_version_attrs, +	.bin_attrs = boot_params_data_attrs, +}; + +static int kobj_to_setup_data_nr(struct kobject *kobj, int *nr) +{ +	const char *name; + +	name = kobject_name(kobj); +	return kstrtoint(name, 10, nr); +} + +static int get_setup_data_paddr(int nr, u64 *paddr) +{ +	int i = 0; +	struct setup_data *data; +	u64 pa_data = boot_params.hdr.setup_data; + +	while (pa_data) { +		if (nr == i) { +			*paddr = pa_data; +			return 0; +		} +		data = ioremap_cache(pa_data, sizeof(*data)); +		if (!data) +			return -ENOMEM; + +		pa_data = data->next; +		iounmap(data); +		i++; +	} +	return -EINVAL; +} + +static int __init get_setup_data_size(int nr, size_t *size) +{ +	int i = 0; +	struct setup_data *data; +	u64 pa_data = boot_params.hdr.setup_data; + +	while (pa_data) { +		data = ioremap_cache(pa_data, sizeof(*data)); +		if (!data) +			return -ENOMEM; +		if (nr == i) { +			*size = data->len; +			iounmap(data); +			return 0; +		} + +		pa_data = data->next; +		iounmap(data); +		i++; +	} +	return -EINVAL; +} + +static ssize_t type_show(struct kobject *kobj, +			 struct kobj_attribute *attr, char *buf) +{ +	int nr, ret; +	u64 paddr; +	struct setup_data *data; + +	ret = kobj_to_setup_data_nr(kobj, &nr); +	if (ret) +		return ret; + +	ret = get_setup_data_paddr(nr, &paddr); +	if (ret) +		return ret; +	data = ioremap_cache(paddr, sizeof(*data)); +	if (!data) +		return -ENOMEM; + +	ret = sprintf(buf, "0x%x\n", data->type); +	iounmap(data); +	return ret; +} + +static ssize_t setup_data_data_read(struct file *fp, +				    struct kobject *kobj, +				    struct bin_attribute *bin_attr, +				    char *buf, +				    loff_t off, size_t count) +{ +	int nr, ret = 0; +	u64 paddr; +	struct setup_data *data; +	void *p; + +	ret = kobj_to_setup_data_nr(kobj, &nr); +	if (ret) +		return ret; + +	ret = get_setup_data_paddr(nr, &paddr); +	if (ret) +		return ret; +	data = ioremap_cache(paddr, sizeof(*data)); +	if (!data) +		return -ENOMEM; + +	if (off > data->len) { +		ret = -EINVAL; +		goto out; +	} + +	if (count > data->len - off) +		count = data->len - off; + +	if (!count) +		goto out; + +	ret = count; +	p = ioremap_cache(paddr + sizeof(*data), data->len); +	if (!p) { +		ret = -ENOMEM; +		goto out; +	} +	memcpy(buf, p + off, count); +	iounmap(p); +out: +	iounmap(data); +	
return ret; +} + +static struct kobj_attribute type_attr = __ATTR_RO(type); + +static struct bin_attribute data_attr = { +	.attr = { +		.name = "data", +		.mode = S_IRUGO, +	}, +	.read = setup_data_data_read, +}; + +static struct attribute *setup_data_type_attrs[] = { +	&type_attr.attr, +	NULL, +}; + +static struct bin_attribute *setup_data_data_attrs[] = { +	&data_attr, +	NULL, +}; + +static struct attribute_group setup_data_attr_group = { +	.attrs = setup_data_type_attrs, +	.bin_attrs = setup_data_data_attrs, +}; + +static int __init create_setup_data_node(struct kobject *parent, +					 struct kobject **kobjp, int nr) +{ +	int ret = 0; +	size_t size; +	struct kobject *kobj; +	char name[16]; /* should be enough for setup_data nodes numbers */ +	snprintf(name, 16, "%d", nr); + +	kobj = kobject_create_and_add(name, parent); +	if (!kobj) +		return -ENOMEM; + +	ret = get_setup_data_size(nr, &size); +	if (ret) +		goto out_kobj; + +	data_attr.size = size; +	ret = sysfs_create_group(kobj, &setup_data_attr_group); +	if (ret) +		goto out_kobj; +	*kobjp = kobj; + +	return 0; +out_kobj: +	kobject_put(kobj); +	return ret; +} + +static void __init cleanup_setup_data_node(struct kobject *kobj) +{ +	sysfs_remove_group(kobj, &setup_data_attr_group); +	kobject_put(kobj); +} + +static int __init get_setup_data_total_num(u64 pa_data, int *nr) +{ +	int ret = 0; +	struct setup_data *data; + +	*nr = 0; +	while (pa_data) { +		*nr += 1; +		data = ioremap_cache(pa_data, sizeof(*data)); +		if (!data) { +			ret = -ENOMEM; +			goto out; +		} +		pa_data = data->next; +		iounmap(data); +	} + +out: +	return ret; +} + +static int __init create_setup_data_nodes(struct kobject *parent) +{ +	struct kobject *setup_data_kobj, **kobjp; +	u64 pa_data; +	int i, j, nr, ret = 0; + +	pa_data = boot_params.hdr.setup_data; +	if (!pa_data) +		return 0; + +	setup_data_kobj = kobject_create_and_add("setup_data", parent); +	if (!setup_data_kobj) { +		ret = -ENOMEM; +		goto out; +	} + +	ret = get_setup_data_total_num(pa_data, &nr); +	if (ret) +		goto out_setup_data_kobj; + +	kobjp = kmalloc(sizeof(*kobjp) * nr, GFP_KERNEL); +	if (!kobjp) { +		ret = -ENOMEM; +		goto out_setup_data_kobj; +	} + +	for (i = 0; i < nr; i++) { +		ret = create_setup_data_node(setup_data_kobj, kobjp + i, i); +		if (ret) +			goto out_clean_nodes; +	} + +	kfree(kobjp); +	return 0; + +out_clean_nodes: +	for (j = i - 1; j > 0; j--) +		cleanup_setup_data_node(*(kobjp + j)); +	kfree(kobjp); +out_setup_data_kobj: +	kobject_put(setup_data_kobj); +out: +	return ret; +} + +static int __init boot_params_ksysfs_init(void) +{ +	int ret; +	struct kobject *boot_params_kobj; + +	boot_params_kobj = kobject_create_and_add("boot_params", +						  kernel_kobj); +	if (!boot_params_kobj) { +		ret = -ENOMEM; +		goto out; +	} + +	ret = sysfs_create_group(boot_params_kobj, &boot_params_attr_group); +	if (ret) +		goto out_boot_params_kobj; + +	ret = create_setup_data_nodes(boot_params_kobj); +	if (ret) +		goto out_create_group; + +	return 0; +out_create_group: +	sysfs_remove_group(boot_params_kobj, &boot_params_attr_group); +out_boot_params_kobj: +	kobject_put(boot_params_kobj); +out: +	return ret; +} + +arch_initcall(boot_params_ksysfs_init); diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 697b93af02d..3dd8e2c4d74 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -251,15 +251,16 @@ u32 kvm_read_and_reset_pf_reason(void)  	return reason;  }  EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason); +NOKPROBE_SYMBOL(kvm_read_and_reset_pf_reason); -dotraplinkage void 
__kprobes +dotraplinkage void  do_async_page_fault(struct pt_regs *regs, unsigned long error_code)  {  	enum ctx_state prev_state;  	switch (kvm_read_and_reset_pf_reason()) {  	default: -		do_page_fault(regs, error_code); +		trace_do_page_fault(regs, error_code);  		break;  	case KVM_PV_REASON_PAGE_NOT_PRESENT:  		/* page is swapped out by the host. */ @@ -276,6 +277,7 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code)  		break;  	}  } +NOKPROBE_SYMBOL(do_async_page_fault);  static void __init paravirt_ops_setup(void)  { @@ -417,7 +419,6 @@ void kvm_disable_steal_time(void)  #ifdef CONFIG_SMP  static void __init kvm_smp_prepare_boot_cpu(void)  { -	WARN_ON(kvm_register_clock("primary cpu clock"));  	kvm_guest_cpu_init();  	native_smp_prepare_boot_cpu();  	kvm_spinlock_init(); @@ -464,7 +465,7 @@ static struct notifier_block kvm_cpu_notifier = {  static void __init kvm_apf_trap_init(void)  { -	set_intr_gate(14, &async_page_fault); +	set_intr_gate(14, async_page_fault);  }  void __init kvm_guest_init(void) @@ -500,6 +501,38 @@ void __init kvm_guest_init(void)  #endif  } +static noinline uint32_t __kvm_cpuid_base(void) +{ +	if (boot_cpu_data.cpuid_level < 0) +		return 0;	/* So we don't blow up on old processors */ + +	if (cpu_has_hypervisor) +		return hypervisor_cpuid_base("KVMKVMKVM\0\0\0", 0); + +	return 0; +} + +static inline uint32_t kvm_cpuid_base(void) +{ +	static int kvm_cpuid_base = -1; + +	if (kvm_cpuid_base == -1) +		kvm_cpuid_base = __kvm_cpuid_base(); + +	return kvm_cpuid_base; +} + +bool kvm_para_available(void) +{ +	return kvm_cpuid_base() != 0; +} +EXPORT_SYMBOL_GPL(kvm_para_available); + +unsigned int kvm_arch_para_features(void) +{ +	return cpuid_eax(kvm_cpuid_base() | KVM_CPUID_FEATURES); +} +  static uint32_t __init kvm_detect(void)  {  	return kvm_cpuid_base(); @@ -609,7 +642,7 @@ static struct dentry *d_kvm_debug;  struct dentry *kvm_init_debugfs(void)  { -	d_kvm_debug = debugfs_create_dir("kvm", NULL); +	d_kvm_debug = debugfs_create_dir("kvm-guest", NULL);  	if (!d_kvm_debug)  		printk(KERN_WARNING "Could not create 'kvm' debugfs directory\n"); @@ -673,7 +706,7 @@ static cpumask_t waiting_cpus;  /* Track spinlock on which a cpu is waiting */  static DEFINE_PER_CPU(struct kvm_lock_waiting, klock_waiting); -static void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want) +__visible void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want)  {  	struct kvm_lock_waiting *w;  	int cpu; @@ -775,11 +808,22 @@ void __init kvm_spinlock_init(void)  	if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT))  		return; -	printk(KERN_INFO "KVM setup paravirtual spinlock\n"); +	pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(kvm_lock_spinning); +	pv_lock_ops.unlock_kick = kvm_unlock_kick; +} + +static __init int kvm_spinlock_init_jump(void) +{ +	if (!kvm_para_available()) +		return 0; +	if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) +		return 0;  	static_key_slow_inc(¶virt_ticketlocks_enabled); +	printk(KERN_INFO "KVM setup paravirtual spinlock\n"); -	pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(kvm_lock_spinning); -	pv_lock_ops.unlock_kick = kvm_unlock_kick; +	return 0;  } +early_initcall(kvm_spinlock_init_jump); +  #endif	/* CONFIG_PARAVIRT_SPINLOCKS */ diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 1570e074134..d9156ceecdf 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -139,6 +139,7 @@ bool kvm_check_and_clear_guest_paused(void)  	src = &hv_clock[cpu].pvti;  	if ((src->flags & PVCLOCK_GUEST_STOPPED) 
!= 0) {  		src->flags &= ~PVCLOCK_GUEST_STOPPED; +		pvclock_touch_watchdogs();  		ret = true;  	} @@ -241,7 +242,7 @@ void __init kvmclock_init(void)  	hv_clock = __va(mem);  	memset(hv_clock, 0, size); -	if (kvm_register_clock("boot clock")) { +	if (kvm_register_clock("primary cpu clock")) {  		hv_clock = NULL;  		memblock_free(mem, size);  		return; diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index ebc98739892..c37886d759c 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -229,6 +229,11 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)  		}  	} +	if (!IS_ENABLED(CONFIG_X86_16BIT) && !ldt_info.seg_32bit) { +		error = -EINVAL; +		goto out_unlock; +	} +  	fill_ldt(&ldt, &ldt_info);  	if (oldmode)  		ldt.avl = 0; diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 5b19e4d78b0..1667b1de8d5 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -9,7 +9,6 @@  #include <linux/mm.h>  #include <linux/kexec.h>  #include <linux/delay.h> -#include <linux/init.h>  #include <linux/numa.h>  #include <linux/ftrace.h>  #include <linux/suspend.h> diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 4eabc160696..679cef0791c 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -279,5 +279,7 @@ void arch_crash_save_vmcoreinfo(void)  	VMCOREINFO_SYMBOL(node_data);  	VMCOREINFO_LENGTH(node_data, MAX_NUMNODES);  #endif +	vmcoreinfo_append_str("KERNELOFFSET=%lx\n", +			      (unsigned long)&_text - __START_KERNEL);  } diff --git a/arch/x86/kernel/mcount_64.S b/arch/x86/kernel/mcount_64.S new file mode 100644 index 00000000000..c050a015316 --- /dev/null +++ b/arch/x86/kernel/mcount_64.S @@ -0,0 +1,217 @@ +/* + *  linux/arch/x86_64/mcount_64.S + * + *  Copyright (C) 2014  Steven Rostedt, Red Hat Inc + */ + +#include <linux/linkage.h> +#include <asm/ptrace.h> +#include <asm/ftrace.h> + + +	.code64 +	.section .entry.text, "ax" + + +#ifdef CONFIG_FUNCTION_TRACER + +#ifdef CC_USING_FENTRY +# define function_hook	__fentry__ +#else +# define function_hook	mcount +#endif + +#ifdef CONFIG_DYNAMIC_FTRACE + +ENTRY(function_hook) +	retq +END(function_hook) + +/* skip is set if stack has been adjusted */ +.macro ftrace_caller_setup skip=0 +	MCOUNT_SAVE_FRAME \skip + +	/* Load the ftrace_ops into the 3rd parameter */ +	movq function_trace_op(%rip), %rdx + +	/* Load ip into the first parameter */ +	movq RIP(%rsp), %rdi +	subq $MCOUNT_INSN_SIZE, %rdi +	/* Load the parent_ip into the second parameter */ +#ifdef CC_USING_FENTRY +	movq SS+16(%rsp), %rsi +#else +	movq 8(%rbp), %rsi +#endif +.endm + +ENTRY(ftrace_caller) +	/* Check if tracing was disabled (quick check) */ +	cmpl $0, function_trace_stop +	jne  ftrace_stub + +	ftrace_caller_setup +	/* regs go into 4th parameter (but make it NULL) */ +	movq $0, %rcx + +GLOBAL(ftrace_call) +	call ftrace_stub + +	MCOUNT_RESTORE_FRAME +ftrace_return: + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +GLOBAL(ftrace_graph_call) +	jmp ftrace_stub +#endif + +GLOBAL(ftrace_stub) +	retq +END(ftrace_caller) + +ENTRY(ftrace_regs_caller) +	/* Save the current flags before compare (in SS location)*/ +	pushfq + +	/* Check if tracing was disabled (quick check) */ +	cmpl $0, function_trace_stop +	jne  ftrace_restore_flags + +	/* skip=8 to skip flags saved in SS */ +	ftrace_caller_setup 8 + +	/* Save the rest of pt_regs */ +	movq %r15, R15(%rsp) +	movq %r14, R14(%rsp) +	movq %r13, R13(%rsp) +	movq %r12, 
R12(%rsp) +	movq %r11, R11(%rsp) +	movq %r10, R10(%rsp) +	movq %rbp, RBP(%rsp) +	movq %rbx, RBX(%rsp) +	/* Copy saved flags */ +	movq SS(%rsp), %rcx +	movq %rcx, EFLAGS(%rsp) +	/* Kernel segments */ +	movq $__KERNEL_DS, %rcx +	movq %rcx, SS(%rsp) +	movq $__KERNEL_CS, %rcx +	movq %rcx, CS(%rsp) +	/* Stack - skipping return address */ +	leaq SS+16(%rsp), %rcx +	movq %rcx, RSP(%rsp) + +	/* regs go into 4th parameter */ +	leaq (%rsp), %rcx + +GLOBAL(ftrace_regs_call) +	call ftrace_stub + +	/* Copy flags back to SS, to restore them */ +	movq EFLAGS(%rsp), %rax +	movq %rax, SS(%rsp) + +	/* Handlers can change the RIP */ +	movq RIP(%rsp), %rax +	movq %rax, SS+8(%rsp) + +	/* restore the rest of pt_regs */ +	movq R15(%rsp), %r15 +	movq R14(%rsp), %r14 +	movq R13(%rsp), %r13 +	movq R12(%rsp), %r12 +	movq R10(%rsp), %r10 +	movq RBP(%rsp), %rbp +	movq RBX(%rsp), %rbx + +	/* skip=8 to skip flags saved in SS */ +	MCOUNT_RESTORE_FRAME 8 + +	/* Restore flags */ +	popfq + +	jmp ftrace_return +ftrace_restore_flags: +	popfq +	jmp  ftrace_stub + +END(ftrace_regs_caller) + + +#else /* ! CONFIG_DYNAMIC_FTRACE */ + +ENTRY(function_hook) +	cmpl $0, function_trace_stop +	jne  ftrace_stub + +	cmpq $ftrace_stub, ftrace_trace_function +	jnz trace + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +	cmpq $ftrace_stub, ftrace_graph_return +	jnz ftrace_graph_caller + +	cmpq $ftrace_graph_entry_stub, ftrace_graph_entry +	jnz ftrace_graph_caller +#endif + +GLOBAL(ftrace_stub) +	retq + +trace: +	MCOUNT_SAVE_FRAME + +	movq RIP(%rsp), %rdi +#ifdef CC_USING_FENTRY +	movq SS+16(%rsp), %rsi +#else +	movq 8(%rbp), %rsi +#endif +	subq $MCOUNT_INSN_SIZE, %rdi + +	call   *ftrace_trace_function + +	MCOUNT_RESTORE_FRAME + +	jmp ftrace_stub +END(function_hook) +#endif /* CONFIG_DYNAMIC_FTRACE */ +#endif /* CONFIG_FUNCTION_TRACER */ + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +ENTRY(ftrace_graph_caller) +	MCOUNT_SAVE_FRAME + +#ifdef CC_USING_FENTRY +	leaq SS+16(%rsp), %rdi +	movq $0, %rdx	/* No framepointers needed */ +#else +	leaq 8(%rbp), %rdi +	movq (%rbp), %rdx +#endif +	movq RIP(%rsp), %rsi +	subq $MCOUNT_INSN_SIZE, %rsi + +	call	prepare_ftrace_return + +	MCOUNT_RESTORE_FRAME + +	retq +END(ftrace_graph_caller) + +GLOBAL(return_to_handler) +	subq  $24, %rsp + +	/* Save the return values */ +	movq %rax, (%rsp) +	movq %rdx, 8(%rsp) +	movq %rbp, %rdi + +	call ftrace_return_to_handler + +	movq %rax, %rdi +	movq 8(%rsp), %rdx +	movq (%rsp), %rax +	addq $24, %rsp +	jmp *%rdi +#endif diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 216a4d754b0..e69f9882bf9 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -28,6 +28,7 @@  #include <linux/mm.h>  #include <linux/gfp.h>  #include <linux/jump_label.h> +#include <linux/random.h>  #include <asm/page.h>  #include <asm/pgtable.h> @@ -43,13 +44,52 @@ do {							\  } while (0)  #endif +#ifdef CONFIG_RANDOMIZE_BASE +static unsigned long module_load_offset; +static int randomize_modules = 1; + +/* Mutex protects the module_load_offset. */ +static DEFINE_MUTEX(module_kaslr_mutex); + +static int __init parse_nokaslr(char *p) +{ +	randomize_modules = 0; +	return 0; +} +early_param("nokaslr", parse_nokaslr); + +static unsigned long int get_module_load_offset(void) +{ +	if (randomize_modules) { +		mutex_lock(&module_kaslr_mutex); +		/* +		 * Calculate the module_load_offset the first time this +		 * code is called. Once calculated it stays the same until +		 * reboot. 
+		 */ +		if (module_load_offset == 0) +			module_load_offset = +				(get_random_int() % 1024 + 1) * PAGE_SIZE; +		mutex_unlock(&module_kaslr_mutex); +	} +	return module_load_offset; +} +#else +static unsigned long int get_module_load_offset(void) +{ +	return 0; +} +#endif +  void *module_alloc(unsigned long size)  {  	if (PAGE_ALIGN(size) > MODULES_LEN)  		return NULL; -	return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, -				GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC, -				-1, __builtin_return_address(0)); +	return __vmalloc_node_range(size, 1, +				    MODULES_VADDR + get_module_load_offset(), +				    MODULES_END, GFP_KERNEL | __GFP_HIGHMEM, +				    PAGE_KERNEL_EXEC, NUMA_NO_NODE, +				    __builtin_return_address(0));  }  #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 88458faea2f..c9603ac80de 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -46,7 +46,7 @@ static struct class *msr_class;  static loff_t msr_seek(struct file *file, loff_t offset, int orig)  {  	loff_t ret; -	struct inode *inode = file->f_mapping->host; +	struct inode *inode = file_inode(file);  	mutex_lock(&inode->i_mutex);  	switch (orig) { @@ -259,14 +259,15 @@ static int __init msr_init(void)  		goto out_chrdev;  	}  	msr_class->devnode = msr_devnode; -	get_online_cpus(); + +	cpu_notifier_register_begin();  	for_each_online_cpu(i) {  		err = msr_device_create(i);  		if (err != 0)  			goto out_class;  	} -	register_hotcpu_notifier(&msr_class_cpu_notifier); -	put_online_cpus(); +	__register_hotcpu_notifier(&msr_class_cpu_notifier); +	cpu_notifier_register_done();  	err = 0;  	goto out; @@ -275,7 +276,7 @@ out_class:  	i = 0;  	for_each_online_cpu(i)  		msr_device_destroy(i); -	put_online_cpus(); +	cpu_notifier_register_done();  	class_destroy(msr_class);  out_chrdev:  	__unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr"); @@ -286,13 +287,14 @@ out:  static void __exit msr_exit(void)  {  	int cpu = 0; -	get_online_cpus(); + +	cpu_notifier_register_begin();  	for_each_online_cpu(cpu)  		msr_device_destroy(cpu);  	class_destroy(msr_class);  	__unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr"); -	unregister_hotcpu_notifier(&msr_class_cpu_notifier); -	put_online_cpus(); +	__unregister_hotcpu_notifier(&msr_class_cpu_notifier); +	cpu_notifier_register_done();  }  module_init(msr_init); diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index ba77ebc2c35..c3e985d1751 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -87,6 +87,7 @@ __setup("unknown_nmi_panic", setup_unknown_nmi_panic);  #define nmi_to_desc(type) (&nmi_desc[type])  static u64 nmi_longest_ns = 1 * NSEC_PER_MSEC; +  static int __init nmi_warning_debugfs(void)  {  	debugfs_create_u64("nmi_longest_ns", 0644, @@ -95,7 +96,21 @@ static int __init nmi_warning_debugfs(void)  }  fs_initcall(nmi_warning_debugfs); -static int __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b) +static void nmi_max_handler(struct irq_work *w) +{ +	struct nmiaction *a = container_of(w, struct nmiaction, irq_work); +	int remainder_ns, decimal_msecs; +	u64 whole_msecs = ACCESS_ONCE(a->max_duration); + +	remainder_ns = do_div(whole_msecs, (1000 * 1000)); +	decimal_msecs = remainder_ns / 1000; + +	printk_ratelimited(KERN_INFO +		"INFO: NMI handler (%ps) took too long to run: %lld.%03d msecs\n", +		a->handler, whole_msecs, decimal_msecs); +} + +static int nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b)  {  	struct nmi_desc *desc = nmi_to_desc(type);  	struct 
nmiaction *a; @@ -110,26 +125,20 @@ static int __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2  	 * to handle those situations.  	 */  	list_for_each_entry_rcu(a, &desc->head, list) { -		u64 before, delta, whole_msecs; -		int remainder_ns, decimal_msecs, thishandled; +		int thishandled; +		u64 delta; -		before = local_clock(); +		delta = sched_clock();  		thishandled = a->handler(type, regs);  		handled += thishandled; -		delta = local_clock() - before; +		delta = sched_clock() - delta;  		trace_nmi_handler(a->handler, (int)delta, thishandled); -		if (delta < nmi_longest_ns) +		if (delta < nmi_longest_ns || delta < a->max_duration)  			continue; -		nmi_longest_ns = delta; -		whole_msecs = delta; -		remainder_ns = do_div(whole_msecs, (1000 * 1000)); -		decimal_msecs = remainder_ns / 1000; -		printk_ratelimited(KERN_INFO -			"INFO: NMI handler (%ps) took too long to run: " -			"%lld.%03d msecs\n", a->handler, whole_msecs, -			decimal_msecs); +		a->max_duration = delta; +		irq_work_queue(&a->irq_work);  	}  	rcu_read_unlock(); @@ -137,6 +146,7 @@ static int __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2  	/* return total number of NMI events handled */  	return handled;  } +NOKPROBE_SYMBOL(nmi_handle);  int __register_nmi_handler(unsigned int type, struct nmiaction *action)  { @@ -146,6 +156,8 @@ int __register_nmi_handler(unsigned int type, struct nmiaction *action)  	if (!action->handler)  		return -EINVAL; +	init_irq_work(&action->irq_work, nmi_max_handler); +  	spin_lock_irqsave(&desc->lock, flags);  	/* @@ -197,7 +209,7 @@ void unregister_nmi_handler(unsigned int type, const char *name)  }  EXPORT_SYMBOL_GPL(unregister_nmi_handler); -static __kprobes void +static void  pci_serr_error(unsigned char reason, struct pt_regs *regs)  {  	/* check to see if anyone registered against these types of errors */ @@ -227,8 +239,9 @@ pci_serr_error(unsigned char reason, struct pt_regs *regs)  	reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_SERR;  	outb(reason, NMI_REASON_PORT);  } +NOKPROBE_SYMBOL(pci_serr_error); -static __kprobes void +static void  io_check_error(unsigned char reason, struct pt_regs *regs)  {  	unsigned long i; @@ -258,8 +271,9 @@ io_check_error(unsigned char reason, struct pt_regs *regs)  	reason &= ~NMI_REASON_CLEAR_IOCHK;  	outb(reason, NMI_REASON_PORT);  } +NOKPROBE_SYMBOL(io_check_error); -static __kprobes void +static void  unknown_nmi_error(unsigned char reason, struct pt_regs *regs)  {  	int handled; @@ -287,11 +301,12 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)  	pr_emerg("Dazed and confused, but trying to continue\n");  } +NOKPROBE_SYMBOL(unknown_nmi_error);  static DEFINE_PER_CPU(bool, swallow_nmi);  static DEFINE_PER_CPU(unsigned long, last_nmi_rip); -static __kprobes void default_do_nmi(struct pt_regs *regs) +static void default_do_nmi(struct pt_regs *regs)  {  	unsigned char reason = 0;  	int handled; @@ -390,6 +405,7 @@ static __kprobes void default_do_nmi(struct pt_regs *regs)  	else  		unknown_nmi_error(reason, regs);  } +NOKPROBE_SYMBOL(default_do_nmi);  /*   * NMIs can hit breakpoints which will cause it to lose its @@ -509,7 +525,7 @@ static inline void nmi_nesting_postprocess(void)  }  #endif -dotraplinkage notrace __kprobes void +dotraplinkage notrace void  do_nmi(struct pt_regs *regs, long error_code)  {  	nmi_nesting_preprocess(regs); @@ -526,6 +542,7 @@ do_nmi(struct pt_regs *regs, long error_code)  	/* On i386, may loop back to preprocess */  	nmi_nesting_postprocess();  } 
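The nmi.c changes above move the "NMI handler took too long" report out of NMI context: the NMI path now only records a new per-action maximum duration and queues an irq_work item, and the ratelimited printk runs later from the irq_work callback. Below is a self-contained sketch of that deferral pattern; my_action, my_report and my_nmi_path are illustrative names, not symbols from the patch.

	#include <linux/irq_work.h>
	#include <linux/kernel.h>

	struct my_action {
		u64		max_duration;	/* updated in the hot path */
		struct irq_work	irq_work;	/* runs the report later */
	};

	static void my_report(struct irq_work *w)
	{
		struct my_action *a = container_of(w, struct my_action, irq_work);

		pr_info("slowest run so far: %llu ns\n", a->max_duration);
	}

	/* Called from the hot (NMI-like) path: no printk here, just bookkeeping. */
	static void my_nmi_path(struct my_action *a, u64 delta)
	{
		if (delta <= a->max_duration)
			return;
		a->max_duration = delta;
		irq_work_queue(&a->irq_work);	/* defer the reporting */
	}

A one-time init_irq_work(&a->irq_work, my_report) at registration time completes the pattern, mirroring the init added to __register_nmi_handler() in the hunk above.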
+NOKPROBE_SYMBOL(do_nmi);  void stop_nmi(void)  { diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 1b10af835c3..548d25f00c9 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -23,6 +23,7 @@  #include <linux/efi.h>  #include <linux/bcd.h>  #include <linux/highmem.h> +#include <linux/kprobes.h>  #include <asm/bug.h>  #include <asm/paravirt.h> @@ -389,6 +390,11 @@ __visible struct pv_cpu_ops pv_cpu_ops = {  	.end_context_switch = paravirt_nop,  }; +/* At this point, native_get/set_debugreg has real function entries */ +NOKPROBE_SYMBOL(native_get_debugreg); +NOKPROBE_SYMBOL(native_set_debugreg); +NOKPROBE_SYMBOL(native_load_idt); +  struct pv_apic_ops pv_apic_ops = {  #ifdef CONFIG_X86_LOCAL_APIC  	.startup_ipi_hook = paravirt_nop, diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c index 3f08f34f93e..a1da6737ba5 100644 --- a/arch/x86/kernel/paravirt_patch_64.c +++ b/arch/x86/kernel/paravirt_patch_64.c @@ -6,7 +6,6 @@ DEF_NATIVE(pv_irq_ops, irq_disable, "cli");  DEF_NATIVE(pv_irq_ops, irq_enable, "sti");  DEF_NATIVE(pv_irq_ops, restore_fl, "pushq %rdi; popfq");  DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax"); -DEF_NATIVE(pv_cpu_ops, iret, "iretq");  DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax");  DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax");  DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3"); @@ -50,7 +49,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,  		PATCH_SITE(pv_irq_ops, save_fl);  		PATCH_SITE(pv_irq_ops, irq_enable);  		PATCH_SITE(pv_irq_ops, irq_disable); -		PATCH_SITE(pv_cpu_ops, iret);  		PATCH_SITE(pv_cpu_ops, irq_enable_sysexit);  		PATCH_SITE(pv_cpu_ops, usergs_sysret32);  		PATCH_SITE(pv_cpu_ops, usergs_sysret64); diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 299d49302e7..0497f719977 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -1207,23 +1207,31 @@ error:  	return ret;  } -static inline int __init determine_tce_table_size(u64 ram) +static inline int __init determine_tce_table_size(void)  {  	int ret;  	if (specified_table_size != TCE_TABLE_SIZE_UNSPECIFIED)  		return specified_table_size; -	/* -	 * Table sizes are from 0 to 7 (TCE_TABLE_SIZE_64K to -	 * TCE_TABLE_SIZE_8M). Table size 0 has 8K entries and each -	 * larger table size has twice as many entries, so shift the -	 * max ram address by 13 to divide by 8K and then look at the -	 * order of the result to choose between 0-7. -	 */ -	ret = get_order(ram >> 13); -	if (ret > TCE_TABLE_SIZE_8M) +	if (is_kdump_kernel() && saved_max_pfn) { +		/* +		 * Table sizes are from 0 to 7 (TCE_TABLE_SIZE_64K to +		 * TCE_TABLE_SIZE_8M). Table size 0 has 8K entries and each +		 * larger table size has twice as many entries, so shift the +		 * max ram address by 13 to divide by 8K and then look at the +		 * order of the result to choose between 0-7. +		 */ +		ret = get_order((saved_max_pfn * PAGE_SIZE) >> 13); +		if (ret > TCE_TABLE_SIZE_8M) +			ret = TCE_TABLE_SIZE_8M; +	} else { +		/* +		 * Use 8M by default (suggested by Muli) if it's not +		 * kdump kernel and saved_max_pfn isn't set. +		 */  		ret = TCE_TABLE_SIZE_8M; +	}  	return ret;  } @@ -1418,8 +1426,7 @@ int __init detect_calgary(void)  		return -ENOMEM;  	} -	specified_table_size = determine_tce_table_size((is_kdump_kernel() ? 
-					saved_max_pfn : max_pfn) * PAGE_SIZE); +	specified_table_size = determine_tce_table_size();  	for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) {  		struct calgary_bus_info *info = &bus_info[bus]; diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 872079a67e4..a25e202bb31 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -97,11 +97,18 @@ void *dma_generic_alloc_coherent(struct device *dev, size_t size,  	dma_mask = dma_alloc_coherent_mask(dev, flag); -	flag |= __GFP_ZERO; +	flag &= ~__GFP_ZERO;  again:  	page = NULL; -	if (!(flag & GFP_ATOMIC)) +	/* CMA can be used only in the context which permits sleeping */ +	if (flag & __GFP_WAIT) {  		page = dma_alloc_from_contiguous(dev, count, get_order(size)); +		if (page && page_to_phys(page) + size > dma_mask) { +			dma_release_from_contiguous(dev, page, count); +			page = NULL; +		} +	} +	/* fallback */  	if (!page)  		page = alloc_pages_node(dev_to_node(dev), flag, get_order(size));  	if (!page) @@ -118,7 +125,7 @@ again:  		return NULL;  	} - +	memset(page_address(page), 0, size);  	*dma_addr = addr;  	return page_address(page);  } diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index 871be4a84c7..da15918d1c8 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -3,7 +3,6 @@  #include <linux/dma-mapping.h>  #include <linux/scatterlist.h>  #include <linux/string.h> -#include <linux/init.h>  #include <linux/gfp.h>  #include <linux/pci.h>  #include <linux/mm.h> diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index 6c483ba98b9..77dd0ad58be 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -14,7 +14,7 @@  #include <asm/iommu_table.h>  int swiotlb __read_mostly; -static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, +void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size,  					dma_addr_t *dma_handle, gfp_t flags,  					struct dma_attrs *attrs)  { @@ -28,11 +28,14 @@ static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size,  	return swiotlb_alloc_coherent(hwdev, size, dma_handle, flags);  } -static void x86_swiotlb_free_coherent(struct device *dev, size_t size, +void x86_swiotlb_free_coherent(struct device *dev, size_t size,  				      void *vaddr, dma_addr_t dma_addr,  				      struct dma_attrs *attrs)  { -	swiotlb_free_coherent(dev, size, vaddr, dma_addr); +	if (is_swiotlb_buffer(dma_to_phys(dev, dma_addr))) +		swiotlb_free_coherent(dev, size, vaddr, dma_addr); +	else +		dma_generic_free_coherent(dev, size, vaddr, dma_addr, attrs);  }  static struct dma_map_ops swiotlb_dma_ops = { diff --git a/arch/x86/kernel/preempt.S b/arch/x86/kernel/preempt.S new file mode 100644 index 00000000000..ca7f0d58a87 --- /dev/null +++ b/arch/x86/kernel/preempt.S @@ -0,0 +1,25 @@ + +#include <linux/linkage.h> +#include <asm/dwarf2.h> +#include <asm/asm.h> +#include <asm/calling.h> + +ENTRY(___preempt_schedule) +	CFI_STARTPROC +	SAVE_ALL +	call preempt_schedule +	RESTORE_ALL +	ret +	CFI_ENDPROC + +#ifdef CONFIG_CONTEXT_TRACKING + +ENTRY(___preempt_schedule_context) +	CFI_STARTPROC +	SAVE_ALL +	call preempt_schedule_context +	RESTORE_ALL +	ret +	CFI_ENDPROC + +#endif diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index c83516be105..4505e2a950d 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -298,10 +298,7 @@ void arch_cpu_idle_dead(void)   */  void arch_cpu_idle(void)  { -	if (cpuidle_idle_call()) -		x86_idle(); -	else -		
local_irq_enable(); +	x86_idle();  }  /* @@ -391,9 +388,9 @@ static void amd_e400_idle(void)  		 * The switch back from broadcast mode needs to be  		 * called with interrupts disabled.  		 */ -		 local_irq_disable(); -		 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu); -		 local_irq_enable(); +		local_irq_disable(); +		clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu); +		local_irq_enable();  	} else  		default_idle();  } diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 884f98f6935..7bc86bbe748 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -24,7 +24,6 @@  #include <linux/interrupt.h>  #include <linux/delay.h>  #include <linux/reboot.h> -#include <linux/init.h>  #include <linux/mc146818rtc.h>  #include <linux/module.h>  #include <linux/kallsyms.h> @@ -153,7 +152,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,  		childregs->orig_ax = -1;  		childregs->cs = __KERNEL_CS | get_kernel_rpl();  		childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED; -		p->fpu_counter = 0; +		p->thread.fpu_counter = 0;  		p->thread.io_bitmap_ptr = NULL;  		memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));  		return 0; @@ -166,7 +165,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,  	p->thread.ip = (unsigned long) ret_from_fork;  	task_user_gs(p) = get_user_gs(current_pt_regs()); -	p->fpu_counter = 0; +	p->thread.fpu_counter = 0;  	p->thread.io_bitmap_ptr = NULL;  	tsk = current;  	err = -ENOMEM; @@ -292,6 +291,14 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)  		set_iopl_mask(next->iopl);  	/* +	 * If it were not for PREEMPT_ACTIVE we could guarantee that the +	 * preempt_count of all tasks was equal here and this would not be +	 * needed. 
+	 */ +	task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count); +	this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count); + +	/*  	 * Now maybe handle debug registers and/or IO bitmaps  	 */  	if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV || @@ -307,6 +314,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)  	 */  	arch_end_context_switch(next_p); +	this_cpu_write(kernel_stack, +		  (unsigned long)task_stack_page(next_p) + +		  THREAD_SIZE - KERNEL_STACK_OFFSET); +  	/*  	 * Restore %gs if needed (which is common)  	 */ diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index bb1dc51bab0..ca5b02d405c 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -52,7 +52,7 @@  asmlinkage extern void ret_from_fork(void); -asmlinkage DEFINE_PER_CPU(unsigned long, old_rsp); +__visible DEFINE_PER_CPU(unsigned long, old_rsp);  /* Prints also some state that isn't saved in the pt_regs */  void __show_regs(struct pt_regs *regs, int all) @@ -63,7 +63,7 @@ void __show_regs(struct pt_regs *regs, int all)  	unsigned int ds, cs, es;  	printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); -	printk_address(regs->ip, 1); +	printk_address(regs->ip);  	printk(KERN_DEFAULT "RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss,  			regs->sp, regs->flags);  	printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n", @@ -163,7 +163,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,  	p->thread.sp = (unsigned long) childregs;  	p->thread.usersp = me->thread.usersp;  	set_tsk_thread_flag(p, TIF_FORK); -	p->fpu_counter = 0; +	p->thread.fpu_counter = 0;  	p->thread.io_bitmap_ptr = NULL;  	savesegment(gs, p->thread.gsindex); @@ -363,6 +363,14 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)  	this_cpu_write(old_rsp, next->usersp);  	this_cpu_write(current_task, next_p); +	/* +	 * If it were not for PREEMPT_ACTIVE we could guarantee that the +	 * preempt_count of all tasks was equal here and this would not be +	 * needed. +	 */ +	task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count); +	this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count); +  	this_cpu_write(kernel_stack,  		  (unsigned long)task_stack_page(next_p) +  		  THREAD_SIZE - KERNEL_STACK_OFFSET); @@ -405,12 +413,11 @@ void set_personality_ia32(bool x32)  	set_thread_flag(TIF_ADDR32);  	/* Mark the associated mm as containing 32-bit tasks. 
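/*
 * Minimal sketch of the preempt-count handover that both __switch_to()
 * hunks above add (process_32.c and process_64.c): the per-CPU
 * __preempt_count is parked in the outgoing task's thread_info and
 * reloaded from the incoming task's.  example_switch_preempt() is a
 * hypothetical helper, not a function in this diff; it assumes the
 * saved_preempt_count field added to struct thread_info.
 */
#include <linux/sched.h>

static inline void example_switch_preempt(struct task_struct *prev,
					  struct task_struct *next)
{
	/* Park the outgoing task's count (it may carry PREEMPT_ACTIVE)... */
	task_thread_info(prev)->saved_preempt_count =
		this_cpu_read(__preempt_count);
	/* ...and resume with whatever the incoming task had saved. */
	this_cpu_write(__preempt_count,
		       task_thread_info(next)->saved_preempt_count);
}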
*/ -	if (current->mm) -		current->mm->context.ia32_compat = 1; -  	if (x32) {  		clear_thread_flag(TIF_IA32);  		set_thread_flag(TIF_X32); +		if (current->mm) +			current->mm->context.ia32_compat = TIF_X32;  		current->personality &= ~READ_IMPLIES_EXEC;  		/* is_compat_task() uses the presence of the x32  		   syscall bit flag to determine compat status */ @@ -418,6 +425,8 @@ void set_personality_ia32(bool x32)  	} else {  		set_thread_flag(TIF_IA32);  		clear_thread_flag(TIF_X32); +		if (current->mm) +			current->mm->context.ia32_compat = TIF_IA32;  		current->personality |= force_personality32;  		/* Prepare the first "return" to user space */  		current_thread_info()->status |= TS_COMPAT; diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 7461f50d5bb..678c0ada3b3 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -184,14 +184,14 @@ unsigned long kernel_stack_pointer(struct pt_regs *regs)  {  	unsigned long context = (unsigned long)regs & ~(THREAD_SIZE - 1);  	unsigned long sp = (unsigned long)®s->sp; -	struct thread_info *tinfo; +	u32 *prev_esp;  	if (context == (sp & ~(THREAD_SIZE - 1)))  		return sp; -	tinfo = (struct thread_info *)context; -	if (tinfo->previous_esp) -		return tinfo->previous_esp; +	prev_esp = (u32 *)(context); +	if (prev_esp) +		return (unsigned long)prev_esp;  	return (unsigned long)regs;  } diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index a16bae3f83b..2f355d229a5 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -43,6 +43,14 @@ unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)  	return pv_tsc_khz;  } +void pvclock_touch_watchdogs(void) +{ +	touch_softlockup_watchdog_sync(); +	clocksource_touch_watchdog(); +	rcu_cpu_stall_reset(); +	reset_hung_task_detector(); +} +  static atomic64_t last_value = ATOMIC64_INIT(0);  void pvclock_resume(void) @@ -74,6 +82,11 @@ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)  		version = __pvclock_read_cycles(src, &ret, &flags);  	} while ((src->version & 1) || version != src->version); +	if (unlikely((flags & PVCLOCK_GUEST_STOPPED) != 0)) { +		src->flags &= ~PVCLOCK_GUEST_STOPPED; +		pvclock_touch_watchdogs(); +	} +  	if ((valid_flags & PVCLOCK_TSC_STABLE_BIT) &&  		(flags & PVCLOCK_TSC_STABLE_BIT))  		return ret; diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 04ee1e2e4c0..ff898bbf579 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -529,7 +529,7 @@ static void quirk_amd_nb_node(struct pci_dev *dev)  		return;  	pci_read_config_dword(nb_ht, 0x60, &val); -	node = val & 7; +	node = pcibus_to_node(dev->bus) | (val & 7);  	/*  	 * Some hardware may return an invalid node ID,  	 * so check it first: @@ -571,3 +571,40 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F5,  			quirk_amd_nb_node);  #endif + +#ifdef CONFIG_PCI +/* + * Processor does not ensure DRAM scrub read/write sequence + * is atomic wrt accesses to CC6 save state area. Therefore + * if a concurrent scrub read/write access is to same address + * the entry may appear as if it is not written. This quirk + * applies to Fam16h models 00h-0Fh + * + * See "Revision Guide" for AMD F16h models 00h-0fh, + * document 51810 rev. 
3.04, Nov 2013 + */ +static void amd_disable_seq_and_redirect_scrub(struct pci_dev *dev) +{ +	u32 val; + +	/* +	 * Suggested workaround: +	 * set D18F3x58[4:0] = 00h and set D18F3x5C[0] = 0b +	 */ +	pci_read_config_dword(dev, 0x58, &val); +	if (val & 0x1F) { +		val &= ~(0x1F); +		pci_write_config_dword(dev, 0x58, val); +	} + +	pci_read_config_dword(dev, 0x5C, &val); +	if (val & BIT(0)) { +		val &= ~BIT(0); +		pci_write_config_dword(dev, 0x5c, val); +	} +} + +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3, +			amd_disable_seq_and_redirect_scrub); + +#endif diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index e643e744e4d..52b1157c53e 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -61,7 +61,7 @@ static int __init set_bios_reboot(const struct dmi_system_id *d)  	if (reboot_type != BOOT_BIOS) {  		reboot_type = BOOT_BIOS;  		pr_info("%s series board detected. Selecting %s-method for reboots.\n", -			"BIOS", d->ident); +			d->ident, "BIOS");  	}  	return 0;  } @@ -114,10 +114,10 @@ EXPORT_SYMBOL(machine_real_restart);   */  static int __init set_pci_reboot(const struct dmi_system_id *d)  { -	if (reboot_type != BOOT_CF9) { -		reboot_type = BOOT_CF9; +	if (reboot_type != BOOT_CF9_FORCE) { +		reboot_type = BOOT_CF9_FORCE;  		pr_info("%s series board detected. Selecting %s-method for reboots.\n", -			"PCI", d->ident); +			d->ident, "PCI");  	}  	return 0;  } @@ -127,7 +127,7 @@ static int __init set_kbd_reboot(const struct dmi_system_id *d)  	if (reboot_type != BOOT_KBD) {  		reboot_type = BOOT_KBD;  		pr_info("%s series board detected. Selecting %s-method for reboot.\n", -			"KBD", d->ident); +			d->ident, "KBD");  	}  	return 0;  } @@ -136,244 +136,266 @@ static int __init set_kbd_reboot(const struct dmi_system_id *d)   * This is a single dmi_table handling all reboot quirks.   
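/*
 * Shape of one entry in the reboot quirk table that follows: DMI
 * vendor/product matches plus a callback that flips reboot_type.  The
 * whole table is handed to dmi_check_system(), which runs the callback of
 * every matching entry.  "Example Vendor"/"Example Board" below are
 * placeholders, not a real quirk from this diff; reboot_type and
 * BOOT_CF9_FORCE come from the surrounding reboot.c context.
 */
#include <linux/dmi.h>

static int __init example_set_pci_reboot(const struct dmi_system_id *d)
{
	if (reboot_type != BOOT_CF9_FORCE) {
		reboot_type = BOOT_CF9_FORCE;
		pr_info("%s series board detected. Selecting %s-method for reboots.\n",
			d->ident, "PCI");
	}
	return 0;
}

static const struct dmi_system_id example_reboot_quirks[] __initconst = {
	{
		.callback = example_set_pci_reboot,
		.ident    = "Example Board",
		.matches  = {
			DMI_MATCH(DMI_SYS_VENDOR, "Example Vendor"),
			DMI_MATCH(DMI_PRODUCT_NAME, "Example Board"),
		},
	},
	{ }	/* terminator, as at the end of the table below */
};
/* Typical use: dmi_check_system(example_reboot_quirks); */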
*/  static struct dmi_system_id __initdata reboot_dmi_table[] = { -	{	/* Handle problems with rebooting on Dell E520's */ -		.callback = set_bios_reboot, -		.ident = "Dell E520", + +	/* Acer */ +	{	/* Handle reboot issue on Acer Aspire one */ +		.callback = set_kbd_reboot, +		.ident = "Acer Aspire One A110",  		.matches = { -			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), -			DMI_MATCH(DMI_PRODUCT_NAME, "Dell DM061"), +			DMI_MATCH(DMI_SYS_VENDOR, "Acer"), +			DMI_MATCH(DMI_PRODUCT_NAME, "AOA110"),  		},  	}, -	{	/* Handle problems with rebooting on Dell 1300's */ -		.callback = set_bios_reboot, -		.ident = "Dell PowerEdge 1300", + +	/* Apple */ +	{	/* Handle problems with rebooting on Apple MacBook5 */ +		.callback = set_pci_reboot, +		.ident = "Apple MacBook5",  		.matches = { -			DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), -			DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 1300/"), +			DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), +			DMI_MATCH(DMI_PRODUCT_NAME, "MacBook5"),  		},  	}, -	{	/* Handle problems with rebooting on Dell 300's */ -		.callback = set_bios_reboot, -		.ident = "Dell PowerEdge 300", +	{	/* Handle problems with rebooting on Apple MacBookPro5 */ +		.callback = set_pci_reboot, +		.ident = "Apple MacBookPro5",  		.matches = { -			DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), -			DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 300/"), +			DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), +			DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5"),  		},  	}, -	{	/* Handle problems with rebooting on Dell Optiplex 745's SFF */ -		.callback = set_bios_reboot, -		.ident = "Dell OptiPlex 745", +	{	/* Handle problems with rebooting on Apple Macmini3,1 */ +		.callback = set_pci_reboot, +		.ident = "Apple Macmini3,1",  		.matches = { -			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), -			DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"), +			DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), +			DMI_MATCH(DMI_PRODUCT_NAME, "Macmini3,1"),  		},  	}, -	{	/* Handle problems with rebooting on Dell Optiplex 745's DFF */ -		.callback = set_bios_reboot, -		.ident = "Dell OptiPlex 745", +	{	/* Handle problems with rebooting on the iMac9,1. 
*/ +		.callback = set_pci_reboot, +		.ident = "Apple iMac9,1",  		.matches = { -			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), -			DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"), -			DMI_MATCH(DMI_BOARD_NAME, "0MM599"), +			DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), +			DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"),  		},  	}, -	{	/* Handle problems with rebooting on Dell Optiplex 745 with 0KW626 */ + +	/* ASUS */ +	{	/* Handle problems with rebooting on ASUS P4S800 */  		.callback = set_bios_reboot, -		.ident = "Dell OptiPlex 745", +		.ident = "ASUS P4S800",  		.matches = { -			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), -			DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"), -			DMI_MATCH(DMI_BOARD_NAME, "0KW626"), +			DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), +			DMI_MATCH(DMI_BOARD_NAME, "P4S800"),  		},  	}, -	{	/* Handle problems with rebooting on Dell Optiplex 330 with 0KP561 */ + +	/* Certec */ +	{       /* Handle problems with rebooting on Certec BPC600 */ +		.callback = set_pci_reboot, +		.ident = "Certec BPC600", +		.matches = { +			DMI_MATCH(DMI_SYS_VENDOR, "Certec"), +			DMI_MATCH(DMI_PRODUCT_NAME, "BPC600"), +		}, +	}, + +	/* Dell */ +	{	/* Handle problems with rebooting on Dell DXP061 */  		.callback = set_bios_reboot, -		.ident = "Dell OptiPlex 330", +		.ident = "Dell DXP061",  		.matches = {  			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), -			DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 330"), -			DMI_MATCH(DMI_BOARD_NAME, "0KP561"), +			DMI_MATCH(DMI_PRODUCT_NAME, "Dell DXP061"),  		},  	}, -	{	/* Handle problems with rebooting on Dell Optiplex 360 with 0T656F */ +	{	/* Handle problems with rebooting on Dell E520's */  		.callback = set_bios_reboot, -		.ident = "Dell OptiPlex 360", +		.ident = "Dell E520",  		.matches = {  			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), -			DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 360"), -			DMI_MATCH(DMI_BOARD_NAME, "0T656F"), +			DMI_MATCH(DMI_PRODUCT_NAME, "Dell DM061"),  		},  	}, -	{	/* Handle problems with rebooting on Dell OptiPlex 760 with 0G919G */ -		.callback = set_bios_reboot, -		.ident = "Dell OptiPlex 760", +	{	/* Handle problems with rebooting on the Latitude E5410. */ +		.callback = set_pci_reboot, +		.ident = "Dell Latitude E5410",  		.matches = {  			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), -			DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 760"), -			DMI_MATCH(DMI_BOARD_NAME, "0G919G"), +			DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E5410"),  		},  	}, -	{	/* Handle problems with rebooting on Dell 2400's */ -		.callback = set_bios_reboot, -		.ident = "Dell PowerEdge 2400", +	{	/* Handle problems with rebooting on the Latitude E5420. */ +		.callback = set_pci_reboot, +		.ident = "Dell Latitude E5420",  		.matches = { -			DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), -			DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2400"), +			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), +			DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E5420"),  		},  	}, -	{	/* Handle problems with rebooting on Dell T5400's */ -		.callback = set_bios_reboot, -		.ident = "Dell Precision T5400", +	{	/* Handle problems with rebooting on the Latitude E6320. */ +		.callback = set_pci_reboot, +		.ident = "Dell Latitude E6320",  		.matches = {  			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), -			DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation T5400"), +			DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6320"),  		},  	}, -	{	/* Handle problems with rebooting on Dell T7400's */ -		.callback = set_bios_reboot, -		.ident = "Dell Precision T7400", +	{	/* Handle problems with rebooting on the Latitude E6420. 
*/ +		.callback = set_pci_reboot, +		.ident = "Dell Latitude E6420",  		.matches = {  			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), -			DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation T7400"), +			DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6420"),  		},  	}, -	{	/* Handle problems with rebooting on HP laptops */ +	{	/* Handle problems with rebooting on Dell Optiplex 330 with 0KP561 */  		.callback = set_bios_reboot, -		.ident = "HP Compaq Laptop", +		.ident = "Dell OptiPlex 330",  		.matches = { -			DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), -			DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq"), +			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), +			DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 330"), +			DMI_MATCH(DMI_BOARD_NAME, "0KP561"),  		},  	}, -	{	/* Handle problems with rebooting on Dell XPS710 */ +	{	/* Handle problems with rebooting on Dell Optiplex 360 with 0T656F */  		.callback = set_bios_reboot, -		.ident = "Dell XPS710", +		.ident = "Dell OptiPlex 360",  		.matches = {  			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), -			DMI_MATCH(DMI_PRODUCT_NAME, "Dell XPS710"), +			DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 360"), +			DMI_MATCH(DMI_BOARD_NAME, "0T656F"),  		},  	}, -	{	/* Handle problems with rebooting on Dell DXP061 */ +	{	/* Handle problems with rebooting on Dell Optiplex 745's SFF */  		.callback = set_bios_reboot, -		.ident = "Dell DXP061", +		.ident = "Dell OptiPlex 745",  		.matches = {  			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), -			DMI_MATCH(DMI_PRODUCT_NAME, "Dell DXP061"), +			DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"),  		},  	}, -	{	/* Handle problems with rebooting on Sony VGN-Z540N */ +	{	/* Handle problems with rebooting on Dell Optiplex 745's DFF */  		.callback = set_bios_reboot, -		.ident = "Sony VGN-Z540N", +		.ident = "Dell OptiPlex 745",  		.matches = { -			DMI_MATCH(DMI_SYS_VENDOR, "Sony Corporation"), -			DMI_MATCH(DMI_PRODUCT_NAME, "VGN-Z540N"), +			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), +			DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"), +			DMI_MATCH(DMI_BOARD_NAME, "0MM599"),  		},  	}, -	{	/* Handle problems with rebooting on ASUS P4S800 */ +	{	/* Handle problems with rebooting on Dell Optiplex 745 with 0KW626 */  		.callback = set_bios_reboot, -		.ident = "ASUS P4S800", +		.ident = "Dell OptiPlex 745",  		.matches = { -			DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), -			DMI_MATCH(DMI_BOARD_NAME, "P4S800"), +			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), +			DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"), +			DMI_MATCH(DMI_BOARD_NAME, "0KW626"),  		},  	}, - -	{	/* Handle reboot issue on Acer Aspire one */ -		.callback = set_kbd_reboot, -		.ident = "Acer Aspire One A110", +	{	/* Handle problems with rebooting on Dell OptiPlex 760 with 0G919G */ +		.callback = set_bios_reboot, +		.ident = "Dell OptiPlex 760",  		.matches = { -			DMI_MATCH(DMI_SYS_VENDOR, "Acer"), -			DMI_MATCH(DMI_PRODUCT_NAME, "AOA110"), +			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), +			DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 760"), +			DMI_MATCH(DMI_BOARD_NAME, "0G919G"),  		},  	}, -	{	/* Handle problems with rebooting on Apple MacBook5 */ +	{	/* Handle problems with rebooting on the OptiPlex 990. 
*/  		.callback = set_pci_reboot, -		.ident = "Apple MacBook5", +		.ident = "Dell OptiPlex 990",  		.matches = { -			DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), -			DMI_MATCH(DMI_PRODUCT_NAME, "MacBook5"), +			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), +			DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 990"),  		},  	}, -	{	/* Handle problems with rebooting on Apple MacBookPro5 */ -		.callback = set_pci_reboot, -		.ident = "Apple MacBookPro5", +	{	/* Handle problems with rebooting on Dell 300's */ +		.callback = set_bios_reboot, +		.ident = "Dell PowerEdge 300",  		.matches = { -			DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), -			DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5"), +			DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), +			DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 300/"),  		},  	}, -	{	/* Handle problems with rebooting on Apple Macmini3,1 */ -		.callback = set_pci_reboot, -		.ident = "Apple Macmini3,1", +	{	/* Handle problems with rebooting on Dell 1300's */ +		.callback = set_bios_reboot, +		.ident = "Dell PowerEdge 1300",  		.matches = { -			DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), -			DMI_MATCH(DMI_PRODUCT_NAME, "Macmini3,1"), +			DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), +			DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 1300/"),  		},  	}, -	{	/* Handle problems with rebooting on the iMac9,1. */ -		.callback = set_pci_reboot, -		.ident = "Apple iMac9,1", +	{	/* Handle problems with rebooting on Dell 2400's */ +		.callback = set_bios_reboot, +		.ident = "Dell PowerEdge 2400",  		.matches = { -			DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), -			DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"), +			DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), +			DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2400"),  		},  	}, -	{	/* Handle problems with rebooting on the Latitude E6320. */ +	{	/* Handle problems with rebooting on the Dell PowerEdge C6100. */  		.callback = set_pci_reboot, -		.ident = "Dell Latitude E6320", +		.ident = "Dell PowerEdge C6100",  		.matches = { -			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), -			DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6320"), +			DMI_MATCH(DMI_SYS_VENDOR, "Dell"), +			DMI_MATCH(DMI_PRODUCT_NAME, "C6100"),  		},  	}, -	{	/* Handle problems with rebooting on the Latitude E5420. */ +	{	/* Handle problems with rebooting on the Precision M6600. */  		.callback = set_pci_reboot, -		.ident = "Dell Latitude E5420", +		.ident = "Dell Precision M6600",  		.matches = {  			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), -			DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E5420"), +			DMI_MATCH(DMI_PRODUCT_NAME, "Precision M6600"),  		},  	}, -	{	/* Handle problems with rebooting on the Latitude E6420. */ -		.callback = set_pci_reboot, -		.ident = "Dell Latitude E6420", +	{	/* Handle problems with rebooting on Dell T5400's */ +		.callback = set_bios_reboot, +		.ident = "Dell Precision T5400",  		.matches = {  			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), -			DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6420"), +			DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation T5400"),  		},  	}, -	{	/* Handle problems with rebooting on the OptiPlex 990. */ -		.callback = set_pci_reboot, -		.ident = "Dell OptiPlex 990", +	{	/* Handle problems with rebooting on Dell T7400's */ +		.callback = set_bios_reboot, +		.ident = "Dell Precision T7400",  		.matches = {  			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), -			DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 990"), +			DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation T7400"),  		},  	}, -	{	/* Handle problems with rebooting on the Precision M6600. 
*/ -		.callback = set_pci_reboot, -		.ident = "Dell Precision M6600", +	{	/* Handle problems with rebooting on Dell XPS710 */ +		.callback = set_bios_reboot, +		.ident = "Dell XPS710",  		.matches = {  			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), -			DMI_MATCH(DMI_PRODUCT_NAME, "Precision M6600"), +			DMI_MATCH(DMI_PRODUCT_NAME, "Dell XPS710"),  		},  	}, -	{	/* Handle problems with rebooting on the Dell PowerEdge C6100. */ -		.callback = set_pci_reboot, -		.ident = "Dell PowerEdge C6100", + +	/* Hewlett-Packard */ +	{	/* Handle problems with rebooting on HP laptops */ +		.callback = set_bios_reboot, +		.ident = "HP Compaq Laptop",  		.matches = { -			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), -			DMI_MATCH(DMI_PRODUCT_NAME, "C6100"), +			DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), +			DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq"),  		},  	}, -	{	/* Some C6100 machines were shipped with vendor being 'Dell'. */ -		.callback = set_pci_reboot, -		.ident = "Dell PowerEdge C6100", + +	/* Sony */ +	{	/* Handle problems with rebooting on Sony VGN-Z540N */ +		.callback = set_bios_reboot, +		.ident = "Sony VGN-Z540N",  		.matches = { -			DMI_MATCH(DMI_SYS_VENDOR, "Dell"), -			DMI_MATCH(DMI_PRODUCT_NAME, "C6100"), +			DMI_MATCH(DMI_SYS_VENDOR, "Sony Corporation"), +			DMI_MATCH(DMI_PRODUCT_NAME, "VGN-Z540N"),  		},  	}, +  	{ }  }; @@ -446,17 +468,23 @@ void __attribute__((weak)) mach_reboot_fixups(void)  }  /* - * Windows compatible x86 hardware expects the following on reboot: + * To the best of our knowledge Windows compatible x86 hardware expects + * the following on reboot:   *   * 1) If the FADT has the ACPI reboot register flag set, try it   * 2) If still alive, write to the keyboard controller   * 3) If still alive, write to the ACPI reboot register again   * 4) If still alive, write to the keyboard controller again + * 5) If still alive, call the EFI runtime service to reboot + * 6) If no EFI runtime service, call the BIOS to do a reboot + * + * We default to following the same pattern. We also have + * two other reboot methods: 'triple fault' and 'PCI', which + * can be triggered via the reboot= kernel boot option or + * via quirks.   * - * If the machine is still alive at this stage, it gives up. We default to - * following the same pattern, except that if we're still alive after (4) we'll - * try to force a triple fault and then cycle between hitting the keyboard - * controller and doing that + * This means that this function can never return, it can misbehave + * by not rebooting properly and hanging.   */  static void native_machine_emergency_restart(void)  { @@ -477,6 +505,11 @@ static void native_machine_emergency_restart(void)  	for (;;) {  		/* Could also try the reset bit in the Hammer NB */  		switch (reboot_type) { +		case BOOT_ACPI: +			acpi_reboot(); +			reboot_type = BOOT_KBD; +			break; +  		case BOOT_KBD:  			mach_reboot_fixups(); /* For board specific fixups */ @@ -490,49 +523,48 @@ static void native_machine_emergency_restart(void)  				attempt = 1;  				reboot_type = BOOT_ACPI;  			} else { -				reboot_type = BOOT_TRIPLE; +				reboot_type = BOOT_EFI;  			}  			break; -		case BOOT_TRIPLE: -			load_idt(&no_idt); -			__asm__ __volatile__("int3"); - -			reboot_type = BOOT_KBD; -			break; - -		case BOOT_BIOS: -			machine_real_restart(MRR_BIOS); - -			reboot_type = BOOT_KBD; -			break; - -		case BOOT_ACPI: -			acpi_reboot(); -			reboot_type = BOOT_KBD; -			break; -  		case BOOT_EFI:  			if (efi_enabled(EFI_RUNTIME_SERVICES))  				efi.reset_system(reboot_mode == REBOOT_WARM ?  						 
EFI_RESET_WARM :  						 EFI_RESET_COLD,  						 EFI_SUCCESS, 0, NULL); -			reboot_type = BOOT_KBD; +			reboot_type = BOOT_BIOS;  			break; -		case BOOT_CF9: +		case BOOT_BIOS: +			machine_real_restart(MRR_BIOS); + +			/* We're probably dead after this, but... */ +			reboot_type = BOOT_CF9_SAFE; +			break; + +		case BOOT_CF9_FORCE:  			port_cf9_safe = true;  			/* Fall through */ -		case BOOT_CF9_COND: +		case BOOT_CF9_SAFE:  			if (port_cf9_safe) { -				u8 cf9 = inb(0xcf9) & ~6; +				u8 reboot_code = reboot_mode == REBOOT_WARM ?  0x06 : 0x0E; +				u8 cf9 = inb(0xcf9) & ~reboot_code;  				outb(cf9|2, 0xcf9); /* Request hard reset */  				udelay(50); -				outb(cf9|6, 0xcf9); /* Actually do the reset */ +				/* Actually do the reset */ +				outb(cf9|reboot_code, 0xcf9);  				udelay(50);  			} +			reboot_type = BOOT_TRIPLE; +			break; + +		case BOOT_TRIPLE: +			load_idt(&no_idt); +			__asm__ __volatile__("int3"); + +			/* We're probably dead after this, but... */  			reboot_type = BOOT_KBD;  			break;  		} @@ -542,6 +574,21 @@ static void native_machine_emergency_restart(void)  void native_machine_shutdown(void)  {  	/* Stop the cpus and apics */ +#ifdef CONFIG_X86_IO_APIC +	/* +	 * Disabling IO APIC before local APIC is a workaround for +	 * erratum AVR31 in "Intel Atom Processor C2000 Product Family +	 * Specification Update". In this situation, interrupts that target +	 * a Logical Processor whose Local APIC is either in the process of +	 * being hardware disabled or software disabled are neither delivered +	 * nor discarded. When this erratum occurs, the processor may hang. +	 * +	 * Even without the erratum, it still makes sense to quiet IO APIC +	 * before disabling Local APIC. +	 */ +	disable_IO_APIC(); +#endif +  #ifdef CONFIG_SMP  	/*  	 * Stop all of the others. Also disable the local irq to @@ -554,10 +601,6 @@ void native_machine_shutdown(void)  	lapic_shutdown(); -#ifdef CONFIG_X86_IO_APIC -	disable_IO_APIC(); -#endif -  #ifdef CONFIG_HPET_TIMER  	hpet_disable();  #endif diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 0aa29394ed6..ca9622a25e9 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -12,7 +12,7 @@  #include <asm/vsyscall.h>  #include <asm/x86_init.h>  #include <asm/time.h> -#include <asm/mrst.h> +#include <asm/intel-mid.h>  #include <asm/rtc.h>  #ifdef CONFIG_X86_32 @@ -189,9 +189,17 @@ static __init int add_rtc_cmos(void)  		return 0;  	/* Intel MID platforms don't have ioport rtc */ -	if (mrst_identify_cpu()) +	if (intel_mid_identify_cpu())  		return -ENODEV; +#ifdef CONFIG_ACPI +	if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_CMOS_RTC) { +		/* This warning can likely go away again in a year or two. 
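/*
 * Sketch of the 0xCF9 ("reset control") path from the BOOT_CF9_* cases
 * above: read the port, mask out the reset-type bits, arm a hard reset,
 * then write the type bits back to trigger it.  The 0x06 (warm) and 0x0E
 * (cold) values are exactly what the hunk derives from reboot_mode;
 * example_cf9_reset() is a hypothetical standalone helper.
 */
#include <linux/delay.h>
#include <asm/io.h>

static void example_cf9_reset(bool warm)
{
	u8 reboot_code = warm ? 0x06 : 0x0E;
	u8 cf9 = inb(0xcf9) & ~reboot_code;

	outb(cf9 | 2, 0xcf9);		/* request a hard reset */
	udelay(50);
	outb(cf9 | reboot_code, 0xcf9);	/* actually perform it */
	udelay(50);
}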
*/ +		pr_info("ACPI: not registering RTC platform device\n"); +		return -ENODEV; +	} +#endif +  	platform_device_register(&rtc_device);  	dev_info(&rtc_device.dev,  		 "registered platform RTC device (no PNP device found)\n"); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index f0de6294b95..78a0e629892 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -295,6 +295,8 @@ static void __init reserve_brk(void)  	_brk_start = 0;  } +u64 relocated_ramdisk; +  #ifdef CONFIG_BLK_DEV_INITRD  static u64 __init get_ramdisk_image(void) @@ -321,25 +323,24 @@ static void __init relocate_initrd(void)  	u64 ramdisk_image = get_ramdisk_image();  	u64 ramdisk_size  = get_ramdisk_size();  	u64 area_size     = PAGE_ALIGN(ramdisk_size); -	u64 ramdisk_here;  	unsigned long slop, clen, mapaddr;  	char *p, *q;  	/* We need to move the initrd down into directly mapped mem */ -	ramdisk_here = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), -						 area_size, PAGE_SIZE); +	relocated_ramdisk = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), +						   area_size, PAGE_SIZE); -	if (!ramdisk_here) +	if (!relocated_ramdisk)  		panic("Cannot find place for new RAMDISK of size %lld\n", -			 ramdisk_size); +		      ramdisk_size);  	/* Note: this includes all the mem currently occupied by  	   the initrd, we rely on that fact to keep the data intact. */ -	memblock_reserve(ramdisk_here, area_size); -	initrd_start = ramdisk_here + PAGE_OFFSET; +	memblock_reserve(relocated_ramdisk, area_size); +	initrd_start = relocated_ramdisk + PAGE_OFFSET;  	initrd_end   = initrd_start + ramdisk_size;  	printk(KERN_INFO "Allocated new RAMDISK: [mem %#010llx-%#010llx]\n", -			 ramdisk_here, ramdisk_here + ramdisk_size - 1); +	       relocated_ramdisk, relocated_ramdisk + ramdisk_size - 1);  	q = (char *)initrd_start; @@ -363,7 +364,7 @@ static void __init relocate_initrd(void)  	printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to"  		" [mem %#010llx-%#010llx]\n",  		ramdisk_image, ramdisk_image + ramdisk_size - 1, -		ramdisk_here, ramdisk_here + ramdisk_size - 1); +		relocated_ramdisk, relocated_ramdisk + ramdisk_size - 1);  }  static void __init early_reserve_initrd(void) @@ -447,6 +448,9 @@ static void __init parse_setup_data(void)  		case SETUP_DTB:  			add_dtb(pa_data);  			break; +		case SETUP_EFI: +			parse_efi_setup(pa_data, data_len); +			break;  		default:  			break;  		} @@ -824,6 +828,20 @@ static void __init trim_low_memory_range(void)  }  /* + * Dump out kernel offset information on panic. + */ +static int +dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p) +{ +	pr_emerg("Kernel Offset: 0x%lx from 0x%lx " +		 "(relocation range: 0x%lx-0x%lx)\n", +		 (unsigned long)&_text - __START_KERNEL, __START_KERNEL, +		 __START_KERNEL_map, MODULES_VADDR-1); + +	return 0; +} + +/*   * Determine if we were loaded by an EFI loader.  If so, then we have also been   * passed the efi memmap, systab, etc., so we should use these data structures   * for initialization.  
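/*
 * The relocate_initrd() hunk above boils down to "find a free physical
 * range below a limit, reserve it, and panic if there is none".  A
 * stripped-down sketch of that memblock pattern; example_reserve_below()
 * is a hypothetical helper, early-boot kernel context assumed.
 */
#include <linux/memblock.h>

static u64 __init example_reserve_below(u64 limit, u64 size)
{
	u64 addr = memblock_find_in_range(0, limit, size, PAGE_SIZE);

	if (!addr)
		panic("Cannot find place for %llu bytes below %#llx\n",
		      (unsigned long long)size, (unsigned long long)limit);
	memblock_reserve(addr, size);
	return addr;
}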
Note, the efi init code path is determined by the @@ -851,7 +869,6 @@ void __init setup_arch(char **cmdline_p)  #ifdef CONFIG_X86_32  	memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); -	visws_early_detect();  	/*  	 * copy kernel address range established so far and switch @@ -908,11 +925,11 @@ void __init setup_arch(char **cmdline_p)  #ifdef CONFIG_EFI  	if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,  		     "EL32", 4)) { -		set_bit(EFI_BOOT, &x86_efi_facility); +		set_bit(EFI_BOOT, &efi.flags);  	} else if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,  		     "EL64", 4)) { -		set_bit(EFI_BOOT, &x86_efi_facility); -		set_bit(EFI_64BIT, &x86_efi_facility); +		set_bit(EFI_BOOT, &efi.flags); +		set_bit(EFI_64BIT, &efi.flags);  	}  	if (efi_enabled(EFI_BOOT)) @@ -924,8 +941,6 @@ void __init setup_arch(char **cmdline_p)  	iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1;  	setup_memory_map();  	parse_setup_data(); -	/* update the e820_saved too */ -	e820_reserve_setup_data();  	copy_edd(); @@ -987,12 +1002,15 @@ void __init setup_arch(char **cmdline_p)  		early_dump_pci_devices();  #endif +	/* update the e820_saved too */ +	e820_reserve_setup_data();  	finish_e820_parsing();  	if (efi_enabled(EFI_BOOT))  		efi_init();  	dmi_scan_machine(); +	dmi_memdev_walk();  	dmi_set_dump_stack_arch_desc();  	/* @@ -1101,7 +1119,7 @@ void __init setup_arch(char **cmdline_p)  	setup_real_mode();  	memblock_set_current_limit(get_max_mapped()); -	dma_contiguous_reserve(0); +	dma_contiguous_reserve(max_pfn_mapped << PAGE_SHIFT);  	/*  	 * NOTE: On x86-32, only from this point on, fixmaps are ready for use. @@ -1120,8 +1138,6 @@ void __init setup_arch(char **cmdline_p)  	acpi_initrd_override((void *)initrd_start, initrd_end - initrd_start);  #endif -	reserve_crashkernel(); -  	vsmp_init();  	io_delay_init(); @@ -1134,6 +1150,13 @@ void __init setup_arch(char **cmdline_p)  	early_acpi_boot_init();  	initmem_init(); + +	/* +	 * Reserve memory for crash kernel after SRAT is parsed so that it +	 * won't consume hotpluggable memory. +	 */ +	reserve_crashkernel(); +  	memblock_find_dma_reserve();  #ifdef CONFIG_KVM_GUEST @@ -1215,14 +1238,8 @@ void __init setup_arch(char **cmdline_p)  	register_refined_jiffies(CLOCK_TICK_RATE);  #ifdef CONFIG_EFI -	/* Once setup is done above, unmap the EFI memory map on -	 * mismatched firmware/kernel archtectures since there is no -	 * support for runtime services. 
-	 */ -	if (efi_enabled(EFI_BOOT) && !efi_is_native()) { -		pr_info("efi: Setup done, disabling due to 32/64-bit mismatch\n"); -		efi_unmap_memmap(); -	} +	if (efi_enabled(EFI_BOOT)) +		efi_apply_memmap_quirks();  #endif  } @@ -1242,3 +1259,15 @@ void __init i386_reserve_resources(void)  }  #endif /* CONFIG_X86_32 */ + +static struct notifier_block kernel_offset_notifier = { +	.notifier_call = dump_kernel_offset +}; + +static int __init register_kernel_offset_dumper(void) +{ +	atomic_notifier_chain_register(&panic_notifier_list, +					&kernel_offset_notifier); +	return 0; +} +__initcall(register_kernel_offset_dumper); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 9e5de6813e1..2851d63c120 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -298,7 +298,8 @@ __setup_frame(int sig, struct ksignal *ksig, sigset_t *set,  	}  	if (current->mm->context.vdso) -		restorer = VDSO32_SYMBOL(current->mm->context.vdso, sigreturn); +		restorer = current->mm->context.vdso + +			selected_vdso32->sym___kernel_sigreturn;  	else  		restorer = &frame->retcode;  	if (ksig->ka.sa.sa_flags & SA_RESTORER) @@ -361,7 +362,8 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig,  		save_altstack_ex(&frame->uc.uc_stack, regs->sp);  		/* Set up to return from userspace.  */ -		restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn); +		restorer = current->mm->context.vdso + +			selected_vdso32->sym___kernel_rt_sigreturn;  		if (ksig->ka.sa.sa_flags & SA_RESTORER)  			restorer = ksig->ka.sa.sa_restorer;  		put_user_ex(restorer, &frame->pretcode); diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 7c3a5a61f2e..be8e1bde07a 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -168,7 +168,7 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)   * this function calls the 'stop' function on all other CPUs in the system.   */ -asmlinkage void smp_reboot_interrupt(void) +asmlinkage __visible void smp_reboot_interrupt(void)  {  	ack_APIC_irq();  	irq_enter(); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 6cacab671f9..5492798930e 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -73,36 +73,14 @@  #include <asm/setup.h>  #include <asm/uv/uv.h>  #include <linux/mc146818rtc.h> -  #include <asm/smpboot_hooks.h>  #include <asm/i8259.h> -  #include <asm/realmode.h> +#include <asm/misc.h>  /* State of each CPU */  DEFINE_PER_CPU(int, cpu_state) = { 0 }; -#ifdef CONFIG_HOTPLUG_CPU -/* - * We need this for trampoline_base protection from concurrent accesses when - * off- and onlining cores wildly. - */ -static DEFINE_MUTEX(x86_cpu_hotplug_driver_mutex); - -void cpu_hotplug_driver_lock(void) -{ -	mutex_lock(&x86_cpu_hotplug_driver_mutex); -} - -void cpu_hotplug_driver_unlock(void) -{ -	mutex_unlock(&x86_cpu_hotplug_driver_mutex); -} - -ssize_t arch_cpu_probe(const char *buf, size_t count) { return -1; } -ssize_t arch_cpu_release(const char *buf, size_t count) { return -1; } -#endif -  /* Number of siblings per CPU package */  int smp_num_siblings = 1;  EXPORT_SYMBOL(smp_num_siblings); @@ -144,8 +122,9 @@ static void smp_callin(void)  	 * Since CPU0 is not wakened up by INIT, it doesn't wait for the IPI.  	 
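/*
 * Sketch of the panic-notifier hookup used by register_kernel_offset_dumper()
 * above: a notifier_block whose callback prints state at panic time, chained
 * onto panic_notifier_list from an initcall.  The message here is a
 * placeholder; the real callback in this diff is dump_kernel_offset().
 */
#include <linux/kernel.h>
#include <linux/notifier.h>

static int example_panic_dump(struct notifier_block *self,
			      unsigned long event, void *data)
{
	pr_emerg("example: dumping debug state at panic time\n");
	return NOTIFY_DONE;
}

static struct notifier_block example_panic_nb = {
	.notifier_call = example_panic_dump,
};

static int __init example_register_panic_dump(void)
{
	atomic_notifier_chain_register(&panic_notifier_list, &example_panic_nb);
	return 0;
}
__initcall(example_register_panic_dump);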
*/  	cpuid = smp_processor_id(); -	if (apic->wait_for_init_deassert && cpuid != 0) -		apic->wait_for_init_deassert(&init_deasserted); +	if (apic->wait_for_init_deassert && cpuid) +		while (!atomic_read(&init_deasserted)) +			cpu_relax();  	/*  	 * (This works even if the APIC is not enabled.) @@ -265,6 +244,13 @@ static void notrace start_secondary(void *unused)  	check_tsc_sync_target();  	/* +	 * Enable the espfix hack for this CPU +	 */ +#ifdef CONFIG_X86_ESPFIX64 +	init_espfix_ap(); +#endif + +	/*  	 * We need to hold vector_lock so there the set of online cpus  	 * does not change while we are assigning vectors to cpus.  Holding  	 * this lock ensures we don't half assign or remove an irq from a cpu. @@ -648,22 +634,46 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)  	return (send_status | accept_status);  } +void smp_announce(void) +{ +	int num_nodes = num_online_nodes(); + +	printk(KERN_INFO "x86: Booted up %d node%s, %d CPUs\n", +	       num_nodes, (num_nodes > 1 ? "s" : ""), num_online_cpus()); +} +  /* reduce the number of lines printed when booting a large cpu count system */  static void announce_cpu(int cpu, int apicid)  {  	static int current_node = -1;  	int node = early_cpu_to_node(cpu); -	int max_cpu_present = find_last_bit(cpumask_bits(cpu_present_mask), NR_CPUS); +	static int width, node_width; + +	if (!width) +		width = num_digits(num_possible_cpus()) + 1; /* + '#' sign */ + +	if (!node_width) +		node_width = num_digits(num_possible_nodes()) + 1; /* + '#' */ + +	if (cpu == 1) +		printk(KERN_INFO "x86: Booting SMP configuration:\n");  	if (system_state == SYSTEM_BOOTING) {  		if (node != current_node) {  			if (current_node > (-1)) -				pr_cont(" OK\n"); +				pr_cont("\n");  			current_node = node; -			pr_info("Booting Node %3d, Processors ", node); + +			printk(KERN_INFO ".... node %*s#%d, CPUs:  ", +			       node_width - num_digits(node), " ", node);  		} -		pr_cont(" #%4d%s", cpu, cpu == max_cpu_present ? " OK\n" : ""); -		return; + +		/* Add padding for the BSP */ +		if (cpu == 1) +			pr_cont("%*s", width + 1, " "); + +		pr_cont("%*s#%d", width - num_digits(cpu), " ", cpu); +  	} else  		pr_info("Booting Node %d Processor %d APIC 0x%x\n",  			node, cpu, apicid); @@ -699,11 +709,15 @@ wakeup_cpu_via_init_nmi(int cpu, unsigned long start_ip, int apicid,  	int id;  	int boot_error; +	preempt_disable(); +  	/*  	 * Wake up AP by INIT, INIT, STARTUP sequence.  	 */ -	if (cpu) -		return wakeup_secondary_cpu_via_init(apicid, start_ip); +	if (cpu) { +		boot_error = wakeup_secondary_cpu_via_init(apicid, start_ip); +		goto out; +	}  	/*  	 * Wake up BSP by nmi. 
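/*
 * Sketch of the column alignment announce_cpu() now performs above: the
 * field width is derived from the digit count of the largest possible CPU
 * number, so "#3" and "#127" line up.  num_digits() is the helper from
 * <asm/misc.h> that the hunk starts including; the local copy here only
 * keeps the sketch self-contained, and example_* names are hypothetical.
 */
#include <linux/printk.h>

static int example_num_digits(int val)
{
	int digits = 1;

	while (val >= 10) {
		val /= 10;
		digits++;
	}
	return digits;
}

static void example_announce(int cpu, int max_cpus)
{
	int width = example_num_digits(max_cpus) + 1;	/* + '#' sign */

	/* "%*s" pads with spaces so every "#<cpu>" ends in the same column. */
	pr_cont("%*s#%d", width - example_num_digits(cpu), " ", cpu);
}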
@@ -723,6 +737,9 @@ wakeup_cpu_via_init_nmi(int cpu, unsigned long start_ip, int apicid,  		boot_error = wakeup_secondary_cpu_via_nmi(id, start_ip);  	} +out: +	preempt_enable(); +  	return boot_error;  } @@ -756,10 +773,10 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)  #else  	clear_tsk_thread_flag(idle, TIF_FORK);  	initial_gs = per_cpu_offset(cpu); +#endif  	per_cpu(kernel_stack, cpu) =  		(unsigned long)task_stack_page(idle) -  		KERNEL_STACK_OFFSET + THREAD_SIZE; -#endif  	early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);  	initial_code = (unsigned long)start_secondary;  	stack_start  = idle->thread.sp; @@ -849,9 +866,6 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)  		/* was set by cpu_init() */  		cpumask_clear_cpu(cpu, cpu_initialized_mask); - -		set_cpu_present(cpu, false); -		per_cpu(x86_cpu_to_apicid, cpu) = BAD_APICID;  	}  	/* mark "stuck" area as not stuck */ @@ -911,7 +925,7 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)  	err = do_boot_cpu(apicid, cpu, tidle);  	if (err) { -		pr_debug("do_boot_cpu failed %d\n", err); +		pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu);  		return -EIO;  	} @@ -1310,6 +1324,12 @@ void cpu_disable_common(void)  int native_cpu_disable(void)  { +	int ret; + +	ret = check_irq_vectors_for_cpu_disable(); +	if (ret) +		return ret; +  	clear_local_APIC();  	cpu_disable_common(); @@ -1371,7 +1391,7 @@ static inline void mwait_play_dead(void)  	if (!this_cpu_has(X86_FEATURE_MWAIT))  		return; -	if (!this_cpu_has(X86_FEATURE_CLFLSH)) +	if (!this_cpu_has(X86_FEATURE_CLFLUSH))  		return;  	if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF)  		return; @@ -1415,7 +1435,9 @@ static inline void mwait_play_dead(void)  		 * The WBINVD is insufficient due to the spurious-wakeup  		 * case where we return around the loop.  		 
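/*
 * The mwait_play_dead() hunk just below brackets the cache-line flush with
 * full barriers before re-arming MONITOR.  As a sketch, one pass of that
 * wait loop looks roughly like this (example_monitor_mwait() is a
 * hypothetical helper; the mb() calls are the ones the hunk adds so the
 * CLFLUSH cannot be reordered around the MONITOR setup):
 */
#include <asm/mwait.h>
#include <asm/special_insns.h>

static inline void example_monitor_mwait(void *monitor_ptr, unsigned long eax)
{
	mb();
	clflush(monitor_ptr);		/* evict the monitored line */
	mb();
	__monitor(monitor_ptr, 0, 0);	/* arm address monitoring */
	mb();
	__mwait(eax, 0);		/* sleep until the line is written */
}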
*/ +		mb();  		clflush(mwait_ptr); +		mb();  		__monitor(mwait_ptr, 0, 0);  		mb();  		__mwait(eax, 0); diff --git a/arch/x86/kernel/sysfb_simplefb.c b/arch/x86/kernel/sysfb_simplefb.c index 22513e96b01..86179d40989 100644 --- a/arch/x86/kernel/sysfb_simplefb.c +++ b/arch/x86/kernel/sysfb_simplefb.c @@ -72,14 +72,14 @@ __init int create_simplefb(const struct screen_info *si,  	 * the part that is occupied by the framebuffer */  	len = mode->height * mode->stride;  	len = PAGE_ALIGN(len); -	if (len > si->lfb_size << 16) { +	if (len > (u64)si->lfb_size << 16) {  		printk(KERN_WARNING "sysfb: VRAM smaller than advertised\n");  		return -EINVAL;  	}  	/* setup IORESOURCE_MEM as framebuffer memory */  	memset(&res, 0, sizeof(res)); -	res.flags = IORESOURCE_MEM; +	res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;  	res.name = simplefb_resname;  	res.start = si->lfb_base;  	res.end = si->lfb_base + len - 1; diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index 24d3c91e981..bf7ef5ce29d 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -23,7 +23,7 @@  #include <asm/time.h>  #ifdef CONFIG_X86_64 -DEFINE_VVAR(volatile unsigned long, jiffies) = INITIAL_JIFFIES; +__visible DEFINE_VVAR(volatile unsigned long, jiffies) = INITIAL_JIFFIES;  #endif  unsigned long profile_pc(struct pt_regs *regs) @@ -62,7 +62,7 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id)  static struct irqaction irq0  = {  	.handler = timer_interrupt, -	.flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER, +	.flags = IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER,  	.name = "timer"  }; diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c index 6e60b5fe224..649b010da00 100644 --- a/arch/x86/kernel/topology.c +++ b/arch/x86/kernel/topology.c @@ -65,29 +65,32 @@ int __ref _debug_hotplug_cpu(int cpu, int action)  	if (!cpu_is_hotpluggable(cpu))  		return -EINVAL; -	cpu_hotplug_driver_lock(); +	lock_device_hotplug();  	switch (action) {  	case 0:  		ret = cpu_down(cpu);  		if (!ret) {  			pr_info("CPU %u is now offline\n", cpu); +			dev->offline = true;  			kobject_uevent(&dev->kobj, KOBJ_OFFLINE);  		} else  			pr_debug("Can't offline CPU%d.\n", cpu);  		break;  	case 1:  		ret = cpu_up(cpu); -		if (!ret) +		if (!ret) { +			dev->offline = false;  			kobject_uevent(&dev->kobj, KOBJ_ONLINE); -		else +		} else {  			pr_debug("Can't online CPU%d.\n", cpu); +		}  		break;  	default:  		ret = -EINVAL;  	} -	cpu_hotplug_driver_unlock(); +	unlock_device_hotplug();  	return ret;  } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 8c8093b146c..0d0e922fafc 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -23,6 +23,7 @@  #include <linux/kernel.h>  #include <linux/module.h>  #include <linux/ptrace.h> +#include <linux/uprobes.h>  #include <linux/string.h>  #include <linux/delay.h>  #include <linux/errno.h> @@ -88,7 +89,7 @@ static inline void conditional_sti(struct pt_regs *regs)  static inline void preempt_conditional_sti(struct pt_regs *regs)  { -	inc_preempt_count(); +	preempt_count_inc();  	if (regs->flags & X86_EFLAGS_IF)  		local_irq_enable();  } @@ -103,10 +104,10 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)  {  	if (regs->flags & X86_EFLAGS_IF)  		local_irq_disable(); -	dec_preempt_count(); +	preempt_count_dec();  } -static int __kprobes +static nokprobe_inline int  do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,  		  struct pt_regs *regs,	long error_code)  { @@ -136,7 +137,38 @@ 
do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,  	return -1;  } -static void __kprobes +static siginfo_t *fill_trap_info(struct pt_regs *regs, int signr, int trapnr, +				siginfo_t *info) +{ +	unsigned long siaddr; +	int sicode; + +	switch (trapnr) { +	default: +		return SEND_SIG_PRIV; + +	case X86_TRAP_DE: +		sicode = FPE_INTDIV; +		siaddr = uprobe_get_trap_addr(regs); +		break; +	case X86_TRAP_UD: +		sicode = ILL_ILLOPN; +		siaddr = uprobe_get_trap_addr(regs); +		break; +	case X86_TRAP_AC: +		sicode = BUS_ADRALN; +		siaddr = 0; +		break; +	} + +	info->si_signo = signr; +	info->si_errno = 0; +	info->si_code = sicode; +	info->si_addr = (void __user *)siaddr; +	return info; +} + +static void  do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,  	long error_code, siginfo_t *info)  { @@ -168,64 +200,43 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,  	}  #endif -	if (info) -		force_sig_info(signr, info, tsk); -	else -		force_sig(signr, tsk); +	force_sig_info(signr, info ?: SEND_SIG_PRIV, tsk);  } +NOKPROBE_SYMBOL(do_trap); -#define DO_ERROR(trapnr, signr, str, name)				\ -dotraplinkage void do_##name(struct pt_regs *regs, long error_code)	\ -{									\ -	enum ctx_state prev_state;					\ -									\ -	prev_state = exception_enter();					\ -	if (notify_die(DIE_TRAP, str, regs, error_code,			\ -			trapnr, signr) == NOTIFY_STOP) {		\ -		exception_exit(prev_state);				\ -		return;							\ -	}								\ -	conditional_sti(regs);						\ -	do_trap(trapnr, signr, str, regs, error_code, NULL);		\ -	exception_exit(prev_state);					\ +static void do_error_trap(struct pt_regs *regs, long error_code, char *str, +			  unsigned long trapnr, int signr) +{ +	enum ctx_state prev_state = exception_enter(); +	siginfo_t info; + +	if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) != +			NOTIFY_STOP) { +		conditional_sti(regs); +		do_trap(trapnr, signr, str, regs, error_code, +			fill_trap_info(regs, signr, trapnr, &info)); +	} + +	exception_exit(prev_state);  } -#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr)		\ +#define DO_ERROR(trapnr, signr, str, name)				\  dotraplinkage void do_##name(struct pt_regs *regs, long error_code)	\  {									\ -	siginfo_t info;							\ -	enum ctx_state prev_state;					\ -									\ -	info.si_signo = signr;						\ -	info.si_errno = 0;						\ -	info.si_code = sicode;						\ -	info.si_addr = (void __user *)siaddr;				\ -	prev_state = exception_enter();					\ -	if (notify_die(DIE_TRAP, str, regs, error_code,			\ -			trapnr, signr) == NOTIFY_STOP) {		\ -		exception_exit(prev_state);				\ -		return;							\ -	}								\ -	conditional_sti(regs);						\ -	do_trap(trapnr, signr, str, regs, error_code, &info);		\ -	exception_exit(prev_state);					\ +	do_error_trap(regs, error_code, str, trapnr, signr);		\  } -DO_ERROR_INFO(X86_TRAP_DE, SIGFPE, "divide error", divide_error, FPE_INTDIV, -		regs->ip) -DO_ERROR(X86_TRAP_OF, SIGSEGV, "overflow", overflow) -DO_ERROR(X86_TRAP_BR, SIGSEGV, "bounds", bounds) -DO_ERROR_INFO(X86_TRAP_UD, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, -		regs->ip) -DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun", -		coprocessor_segment_overrun) -DO_ERROR(X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS) -DO_ERROR(X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present) +DO_ERROR(X86_TRAP_DE,     SIGFPE,  "divide error",		divide_error) +DO_ERROR(X86_TRAP_OF,     SIGSEGV, "overflow",			overflow) +DO_ERROR(X86_TRAP_BR,     SIGSEGV, "bounds",			bounds) 
+DO_ERROR(X86_TRAP_UD,     SIGILL,  "invalid opcode",		invalid_op) +DO_ERROR(X86_TRAP_OLD_MF, SIGFPE,  "coprocessor segment overrun",coprocessor_segment_overrun) +DO_ERROR(X86_TRAP_TS,     SIGSEGV, "invalid TSS",		invalid_TSS) +DO_ERROR(X86_TRAP_NP,     SIGBUS,  "segment not present",	segment_not_present)  #ifdef CONFIG_X86_32 -DO_ERROR(X86_TRAP_SS, SIGBUS, "stack segment", stack_segment) +DO_ERROR(X86_TRAP_SS,     SIGBUS,  "stack segment",		stack_segment)  #endif -DO_ERROR_INFO(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check, -		BUS_ADRALN, 0) +DO_ERROR(X86_TRAP_AC,     SIGBUS,  "alignment check",		alignment_check)  #ifdef CONFIG_X86_64  /* Runs on IST stack */ @@ -267,7 +278,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)  }  #endif -dotraplinkage void __kprobes +dotraplinkage void  do_general_protection(struct pt_regs *regs, long error_code)  {  	struct task_struct *tsk; @@ -309,13 +320,14 @@ do_general_protection(struct pt_regs *regs, long error_code)  		pr_cont("\n");  	} -	force_sig(SIGSEGV, tsk); +	force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk);  exit:  	exception_exit(prev_state);  } +NOKPROBE_SYMBOL(do_general_protection);  /* May run on IST stack. */ -dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_code) +dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)  {  	enum ctx_state prev_state; @@ -338,6 +350,11 @@ dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_co  		goto exit;  #endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */ +#ifdef CONFIG_KPROBES +	if (kprobe_int3_handler(regs)) +		goto exit; +#endif +  	if (notify_die(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,  			SIGTRAP) == NOTIFY_STOP)  		goto exit; @@ -354,6 +371,7 @@ dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_co  exit:  	exception_exit(prev_state);  } +NOKPROBE_SYMBOL(do_int3);  #ifdef CONFIG_X86_64  /* @@ -361,7 +379,7 @@ exit:   * for scheduling or signal handling. The actual stack switch is done in   * entry.S   */ -asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) +asmlinkage __visible struct pt_regs *sync_regs(struct pt_regs *eregs)  {  	struct pt_regs *regs = eregs;  	/* Did already sync */ @@ -380,6 +398,7 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)  		*regs = *eregs;  	return regs;  } +NOKPROBE_SYMBOL(sync_regs);  #endif  /* @@ -406,7 +425,7 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)   *   * May run on IST stack.   
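/*
 * After the DO_ERROR consolidation above, every remaining DO_ERROR() line
 * expands to a thin wrapper around do_error_trap() in traps.c.  For
 * example, DO_ERROR(X86_TRAP_DE, SIGFPE, "divide error", divide_error)
 * becomes roughly:
 */
dotraplinkage void do_divide_error(struct pt_regs *regs, long error_code)
{
	do_error_trap(regs, error_code, "divide error", X86_TRAP_DE, SIGFPE);
}
/*
 * The trap-specific siginfo details (FPE_INTDIV, the faulting address from
 * uprobe_get_trap_addr()) are now filled in at signal-delivery time by
 * fill_trap_info() instead of a second DO_ERROR_INFO macro.
 */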
*/ -dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) +dotraplinkage void do_debug(struct pt_regs *regs, long error_code)  {  	struct task_struct *tsk = current;  	enum ctx_state prev_state; @@ -444,6 +463,11 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)  	/* Store the virtualized DR6 value */  	tsk->thread.debugreg6 = dr6; +#ifdef CONFIG_KPROBES +	if (kprobe_debug_handler(regs)) +		goto exit; +#endif +  	if (notify_die(DIE_DEBUG, "debug", regs, (long)&dr6, error_code,  							SIGTRAP) == NOTIFY_STOP)  		goto exit; @@ -486,13 +510,14 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)  exit:  	exception_exit(prev_state);  } +NOKPROBE_SYMBOL(do_debug);  /*   * Note that we play around with the 'TS' bit in an attempt to get   * the correct behaviour even in the presence of the asynchronous   * IRQ13 behaviour   */ -void math_error(struct pt_regs *regs, int error_code, int trapnr) +static void math_error(struct pt_regs *regs, int error_code, int trapnr)  {  	struct task_struct *task = current;  	siginfo_t info; @@ -522,7 +547,7 @@ void math_error(struct pt_regs *regs, int error_code, int trapnr)  	task->thread.error_code = error_code;  	info.si_signo = SIGFPE;  	info.si_errno = 0; -	info.si_addr = (void __user *)regs->ip; +	info.si_addr = (void __user *)uprobe_get_trap_addr(regs);  	if (trapnr == X86_TRAP_MF) {  		unsigned short cwd, swd;  		/* @@ -605,11 +630,11 @@ do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)  #endif  } -asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) +asmlinkage __visible void __attribute__((weak)) smp_thermal_interrupt(void)  {  } -asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void) +asmlinkage __visible void __attribute__((weak)) smp_threshold_interrupt(void)  {  } @@ -649,15 +674,15 @@ void math_state_restore(void)  	 */  	if (unlikely(restore_fpu_checking(tsk))) {  		drop_init_fpu(tsk); -		force_sig(SIGSEGV, tsk); +		force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk);  		return;  	} -	tsk->fpu_counter++; +	tsk->thread.fpu_counter++;  }  EXPORT_SYMBOL_GPL(math_state_restore); -dotraplinkage void __kprobes +dotraplinkage void  do_device_not_available(struct pt_regs *regs, long error_code)  {  	enum ctx_state prev_state; @@ -683,6 +708,7 @@ do_device_not_available(struct pt_regs *regs, long error_code)  #endif  	exception_exit(prev_state);  } +NOKPROBE_SYMBOL(do_device_not_available);  #ifdef CONFIG_X86_32  dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) @@ -713,7 +739,7 @@ void __init early_trap_init(void)  	/* int3 can be called from all */  	set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK);  #ifdef CONFIG_X86_32 -	set_intr_gate(X86_TRAP_PF, &page_fault); +	set_intr_gate(X86_TRAP_PF, page_fault);  #endif  	load_idt(&idt_descr);  } @@ -721,7 +747,7 @@ void __init early_trap_init(void)  void __init early_trap_pf_init(void)  {  #ifdef CONFIG_X86_64 -	set_intr_gate(X86_TRAP_PF, &page_fault); +	set_intr_gate(X86_TRAP_PF, page_fault);  #endif  } @@ -737,30 +763,30 @@ void __init trap_init(void)  	early_iounmap(p, 4);  #endif -	set_intr_gate(X86_TRAP_DE, ÷_error); +	set_intr_gate(X86_TRAP_DE, divide_error);  	set_intr_gate_ist(X86_TRAP_NMI, &nmi, NMI_STACK);  	/* int4 can be called from all */  	set_system_intr_gate(X86_TRAP_OF, &overflow); -	set_intr_gate(X86_TRAP_BR, &bounds); -	set_intr_gate(X86_TRAP_UD, &invalid_op); -	set_intr_gate(X86_TRAP_NM, &device_not_available); +	set_intr_gate(X86_TRAP_BR, bounds); +	
set_intr_gate(X86_TRAP_UD, invalid_op); +	set_intr_gate(X86_TRAP_NM, device_not_available);  #ifdef CONFIG_X86_32  	set_task_gate(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS);  #else  	set_intr_gate_ist(X86_TRAP_DF, &double_fault, DOUBLEFAULT_STACK);  #endif -	set_intr_gate(X86_TRAP_OLD_MF, &coprocessor_segment_overrun); -	set_intr_gate(X86_TRAP_TS, &invalid_TSS); -	set_intr_gate(X86_TRAP_NP, &segment_not_present); +	set_intr_gate(X86_TRAP_OLD_MF, coprocessor_segment_overrun); +	set_intr_gate(X86_TRAP_TS, invalid_TSS); +	set_intr_gate(X86_TRAP_NP, segment_not_present);  	set_intr_gate_ist(X86_TRAP_SS, &stack_segment, STACKFAULT_STACK); -	set_intr_gate(X86_TRAP_GP, &general_protection); -	set_intr_gate(X86_TRAP_SPURIOUS, &spurious_interrupt_bug); -	set_intr_gate(X86_TRAP_MF, &coprocessor_error); -	set_intr_gate(X86_TRAP_AC, &alignment_check); +	set_intr_gate(X86_TRAP_GP, general_protection); +	set_intr_gate(X86_TRAP_SPURIOUS, spurious_interrupt_bug); +	set_intr_gate(X86_TRAP_MF, coprocessor_error); +	set_intr_gate(X86_TRAP_AC, alignment_check);  #ifdef CONFIG_X86_MCE  	set_intr_gate_ist(X86_TRAP_MC, &machine_check, MCE_STACK);  #endif -	set_intr_gate(X86_TRAP_XF, &simd_coprocessor_error); +	set_intr_gate(X86_TRAP_XF, simd_coprocessor_error);  	/* Reserve all the builtin and the syscall vector: */  	for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 930e5d48f56..ea030319b32 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -11,6 +11,7 @@  #include <linux/clocksource.h>  #include <linux/percpu.h>  #include <linux/timex.h> +#include <linux/static_key.h>  #include <asm/hpet.h>  #include <asm/timer.h> @@ -37,13 +38,244 @@ static int __read_mostly tsc_unstable;     erroneous rdtsc usage on !cpu_has_tsc processors */  static int __read_mostly tsc_disabled = -1; +static struct static_key __use_tsc = STATIC_KEY_INIT; +  int tsc_clocksource_reliable; + +/* + * Use a ring-buffer like data structure, where a writer advances the head by + * writing a new data entry and a reader advances the tail when it observes a + * new entry. + * + * Writers are made to wait on readers until there's space to write a new + * entry. + * + * This means that we can always use an {offset, mul} pair to compute a ns + * value that is 'roughly' in the right direction, even if we're writing a new + * {offset, mul} pair during the clock read. + * + * The down-side is that we can no longer guarantee strict monotonicity anymore + * (assuming the TSC was that to begin with), because while we compute the + * intersection point of the two clock slopes and make sure the time is + * continuous at the point of switching; we can no longer guarantee a reader is + * strictly before or after the switch point. + * + * It does mean a reader no longer needs to disable IRQs in order to avoid + * CPU-Freq updates messing with his times, and similarly an NMI reader will + * no longer run the risk of hitting half-written state. + */ + +struct cyc2ns { +	struct cyc2ns_data data[2];	/*  0 + 2*24 = 48 */ +	struct cyc2ns_data *head;	/* 48 + 8    = 56 */ +	struct cyc2ns_data *tail;	/* 56 + 8    = 64 */ +}; /* exactly fits one cacheline */ + +static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns); + +struct cyc2ns_data *cyc2ns_read_begin(void) +{ +	struct cyc2ns_data *head; + +	preempt_disable(); + +	head = this_cpu_read(cyc2ns.head); +	/* +	 * Ensure we observe the entry when we observe the pointer to it. +	 * matches the wmb from cyc2ns_write_end(). 
+	 */ +	smp_read_barrier_depends(); +	head->__count++; +	barrier(); + +	return head; +} + +void cyc2ns_read_end(struct cyc2ns_data *head) +{ +	barrier(); +	/* +	 * If we're the outer most nested read; update the tail pointer +	 * when we're done. This notifies possible pending writers +	 * that we've observed the head pointer and that the other +	 * entry is now free. +	 */ +	if (!--head->__count) { +		/* +		 * x86-TSO does not reorder writes with older reads; +		 * therefore once this write becomes visible to another +		 * cpu, we must be finished reading the cyc2ns_data. +		 * +		 * matches with cyc2ns_write_begin(). +		 */ +		this_cpu_write(cyc2ns.tail, head); +	} +	preempt_enable(); +} + +/* + * Begin writing a new @data entry for @cpu. + * + * Assumes some sort of write side lock; currently 'provided' by the assumption + * that cpufreq will call its notifiers sequentially. + */ +static struct cyc2ns_data *cyc2ns_write_begin(int cpu) +{ +	struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu); +	struct cyc2ns_data *data = c2n->data; + +	if (data == c2n->head) +		data++; + +	/* XXX send an IPI to @cpu in order to guarantee a read? */ + +	/* +	 * When we observe the tail write from cyc2ns_read_end(), +	 * the cpu must be done with that entry and its safe +	 * to start writing to it. +	 */ +	while (c2n->tail == data) +		cpu_relax(); + +	return data; +} + +static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data) +{ +	struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu); + +	/* +	 * Ensure the @data writes are visible before we publish the +	 * entry. Matches the data-depencency in cyc2ns_read_begin(). +	 */ +	smp_wmb(); + +	ACCESS_ONCE(c2n->head) = data; +} + +/* + * Accelerators for sched_clock() + * convert from cycles(64bits) => nanoseconds (64bits) + *  basic equation: + *              ns = cycles / (freq / ns_per_sec) + *              ns = cycles * (ns_per_sec / freq) + *              ns = cycles * (10^9 / (cpu_khz * 10^3)) + *              ns = cycles * (10^6 / cpu_khz) + * + *      Then we use scaling math (suggested by george@mvista.com) to get: + *              ns = cycles * (10^6 * SC / cpu_khz) / SC + *              ns = cycles * cyc2ns_scale / SC + * + *      And since SC is a constant power of two, we can convert the div + *  into a shift. + * + *  We can use khz divisor instead of mhz to keep a better precision, since + *  cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. + *  (mathieu.desnoyers@polymtl.ca) + * + *                      -johnstul@us.ibm.com "math is hard, lets go shopping!" + */ + +#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ + +static void cyc2ns_data_init(struct cyc2ns_data *data) +{ +	data->cyc2ns_mul = 0; +	data->cyc2ns_shift = CYC2NS_SCALE_FACTOR; +	data->cyc2ns_offset = 0; +	data->__count = 0; +} + +static void cyc2ns_init(int cpu) +{ +	struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu); + +	cyc2ns_data_init(&c2n->data[0]); +	cyc2ns_data_init(&c2n->data[1]); + +	c2n->head = c2n->data; +	c2n->tail = c2n->data; +} + +static inline unsigned long long cycles_2_ns(unsigned long long cyc) +{ +	struct cyc2ns_data *data, *tail; +	unsigned long long ns; + +	/* +	 * See cyc2ns_read_*() for details; replicated in order to avoid +	 * an extra few instructions that came with the abstraction. +	 * Notable, it allows us to only do the __count and tail update +	 * dance when its actually needed. 
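	 * As a worked example (illustrative numbers, not part of the patch):
	 * with cpu_khz = 2000000 (a 2 GHz TSC), set_cyc2ns_scale() computes
	 * cyc2ns_mul = DIV_ROUND(1000000 << 10, 2000000) = 512, so the
	 * conversion below yields ns = offset + (cyc * 512 >> 10), i.e.
	 * offset + cyc / 2, or 0.5 ns per cycle as expected.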
+	 */ + +	preempt_disable_notrace(); +	data = this_cpu_read(cyc2ns.head); +	tail = this_cpu_read(cyc2ns.tail); + +	if (likely(data == tail)) { +		ns = data->cyc2ns_offset; +		ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR); +	} else { +		data->__count++; + +		barrier(); + +		ns = data->cyc2ns_offset; +		ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR); + +		barrier(); + +		if (!--data->__count) +			this_cpu_write(cyc2ns.tail, data); +	} +	preempt_enable_notrace(); + +	return ns; +} + +/* XXX surely we already have this someplace in the kernel?! */ +#define DIV_ROUND(n, d) (((n) + ((d) / 2)) / (d)) + +static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) +{ +	unsigned long long tsc_now, ns_now; +	struct cyc2ns_data *data; +	unsigned long flags; + +	local_irq_save(flags); +	sched_clock_idle_sleep_event(); + +	if (!cpu_khz) +		goto done; + +	data = cyc2ns_write_begin(cpu); + +	rdtscll(tsc_now); +	ns_now = cycles_2_ns(tsc_now); + +	/* +	 * Compute a new multiplier as per the above comment and ensure our +	 * time function is continuous; see the comment near struct +	 * cyc2ns_data. +	 */ +	data->cyc2ns_mul = DIV_ROUND(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR, cpu_khz); +	data->cyc2ns_shift = CYC2NS_SCALE_FACTOR; +	data->cyc2ns_offset = ns_now - +		mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR); + +	cyc2ns_write_end(cpu, data); + +done: +	sched_clock_idle_wakeup_event(0); +	local_irq_restore(flags); +}  /*   * Scheduler clock - returns current time in nanosec units.   */  u64 native_sched_clock(void)  { -	u64 this_offset; +	u64 tsc_now;  	/*  	 * Fall back to jiffies if there's no TSC available: @@ -53,16 +285,16 @@ u64 native_sched_clock(void)  	 *   very important for it to be as fast as the platform  	 *   can achieve it. )  	 */ -	if (unlikely(tsc_disabled)) { +	if (!static_key_false(&__use_tsc)) {  		/* No locking but a rare wrong value is not a big deal: */  		return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);  	}  	/* read the Time Stamp Counter: */ -	rdtscll(this_offset); +	rdtscll(tsc_now);  	/* return the value in ns */ -	return __cycles_2_ns(this_offset); +	return cycles_2_ns(tsc_now);  }  /* We need to define a real function for sched_clock, to override the @@ -419,6 +651,13 @@ unsigned long native_calibrate_tsc(void)  	unsigned long flags, latch, ms, fast_calibrate;  	int hpet = is_hpet_enabled(), i, loopmin; +	/* Calibrate TSC using MSR for Intel Atom SoCs */ +	local_irq_save(flags); +	fast_calibrate = try_msr_calibrate_tsc(); +	local_irq_restore(flags); +	if (fast_calibrate) +		return fast_calibrate; +  	local_irq_save(flags);  	fast_calibrate = quick_pit_calibrate();  	local_irq_restore(flags); @@ -589,61 +828,11 @@ int recalibrate_cpu_khz(void)  EXPORT_SYMBOL(recalibrate_cpu_khz); -/* Accelerators for sched_clock() - * convert from cycles(64bits) => nanoseconds (64bits) - *  basic equation: - *              ns = cycles / (freq / ns_per_sec) - *              ns = cycles * (ns_per_sec / freq) - *              ns = cycles * (10^9 / (cpu_khz * 10^3)) - *              ns = cycles * (10^6 / cpu_khz) - * - *      Then we use scaling math (suggested by george@mvista.com) to get: - *              ns = cycles * (10^6 * SC / cpu_khz) / SC - *              ns = cycles * cyc2ns_scale / SC - * - *      And since SC is a constant power of two, we can convert the div - *  into a shift. - * - *  We can use khz divisor instead of mhz to keep a better precision, since - *  cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. 
- *  (mathieu.desnoyers@polymtl.ca) - * - *                      -johnstul@us.ibm.com "math is hard, lets go shopping!" - */ - -DEFINE_PER_CPU(unsigned long, cyc2ns); -DEFINE_PER_CPU(unsigned long long, cyc2ns_offset); - -static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) -{ -	unsigned long long tsc_now, ns_now, *offset; -	unsigned long flags, *scale; - -	local_irq_save(flags); -	sched_clock_idle_sleep_event(); - -	scale = &per_cpu(cyc2ns, cpu); -	offset = &per_cpu(cyc2ns_offset, cpu); - -	rdtscll(tsc_now); -	ns_now = __cycles_2_ns(tsc_now); - -	if (cpu_khz) { -		*scale = ((NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR) + -				cpu_khz / 2) / cpu_khz; -		*offset = ns_now - mult_frac(tsc_now, *scale, -					     (1UL << CYC2NS_SCALE_FACTOR)); -	} - -	sched_clock_idle_wakeup_event(0); -	local_irq_restore(flags); -} -  static unsigned long long cyc2ns_suspend;  void tsc_save_sched_clock_state(void)  { -	if (!sched_clock_stable) +	if (!sched_clock_stable())  		return;  	cyc2ns_suspend = sched_clock(); @@ -663,16 +852,26 @@ void tsc_restore_sched_clock_state(void)  	unsigned long flags;  	int cpu; -	if (!sched_clock_stable) +	if (!sched_clock_stable())  		return;  	local_irq_save(flags); -	__this_cpu_write(cyc2ns_offset, 0); +	/* +	 * We're comming out of suspend, there's no concurrency yet; don't +	 * bother being nice about the RCU stuff, just write to both +	 * data fields. +	 */ + +	this_cpu_write(cyc2ns.data[0].cyc2ns_offset, 0); +	this_cpu_write(cyc2ns.data[1].cyc2ns_offset, 0); +  	offset = cyc2ns_suspend - sched_clock(); -	for_each_possible_cpu(cpu) -		per_cpu(cyc2ns_offset, cpu) = offset; +	for_each_possible_cpu(cpu) { +		per_cpu(cyc2ns.data[0].cyc2ns_offset, cpu) = offset; +		per_cpu(cyc2ns.data[1].cyc2ns_offset, cpu) = offset; +	}  	local_irq_restore(flags);  } @@ -715,16 +914,15 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,  		tsc_khz_ref = tsc_khz;  	}  	if ((val == CPUFREQ_PRECHANGE  && freq->old < freq->new) || -			(val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || -			(val == CPUFREQ_RESUMECHANGE)) { +			(val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) {  		*lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);  		tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);  		if (!(freq->flags & CPUFREQ_CONST_LOOPS))  			mark_tsc_unstable("cpufreq changes"); -	} -	set_cyc2ns_scale(tsc_khz, freq->cpu); +		set_cyc2ns_scale(tsc_khz, freq->cpu); +	}  	return 0;  } @@ -786,16 +984,14 @@ static struct clocksource clocksource_tsc = {  	.mask                   = CLOCKSOURCE_MASK(64),  	.flags                  = CLOCK_SOURCE_IS_CONTINUOUS |  				  CLOCK_SOURCE_MUST_VERIFY, -#ifdef CONFIG_X86_64  	.archdata               = { .vclock_mode = VCLOCK_TSC }, -#endif  };  void mark_tsc_unstable(char *reason)  {  	if (!tsc_unstable) {  		tsc_unstable = 1; -		sched_clock_stable = 0; +		clear_sched_clock_stable();  		disable_sched_clock_irqtime();  		pr_info("Marking TSC unstable due to %s\n", reason);  		/* Change only the rating, when not registered */ @@ -995,14 +1191,18 @@ void __init tsc_init(void)  	 * speed as the bootup CPU. 
(cpufreq notifiers will fix this  	 * up if their speed diverges)  	 */ -	for_each_possible_cpu(cpu) +	for_each_possible_cpu(cpu) { +		cyc2ns_init(cpu);  		set_cyc2ns_scale(cpu_khz, cpu); +	}  	if (tsc_disabled > 0)  		return;  	/* now allow native_sched_clock() to use rdtsc */ +  	tsc_disabled = 0; +	static_key_slow_inc(&__use_tsc);  	if (!no_sched_irq_time)  		enable_sched_clock_irqtime(); diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c new file mode 100644 index 00000000000..92ae6acac8a --- /dev/null +++ b/arch/x86/kernel/tsc_msr.c @@ -0,0 +1,127 @@ +/* + * tsc_msr.c - MSR based TSC calibration on Intel Atom SoC platforms. + * + * TSC in Intel Atom SoC runs at a constant rate which can be figured + * by this formula: + * <maximum core-clock to bus-clock ratio> * <maximum resolved frequency> + * See Intel 64 and IA-32 System Programming Guid section 16.12 and 30.11.5 + * for details. + * Especially some Intel Atom SoCs don't have PIT(i8254) or HPET, so MSR + * based calibration is the only option. + * + * + * Copyright (C) 2013 Intel Corporation + * Author: Bin Gao <bin.gao@intel.com> + * + * This file is released under the GPLv2. + */ + +#include <linux/kernel.h> +#include <asm/processor.h> +#include <asm/setup.h> +#include <asm/apic.h> +#include <asm/param.h> + +/* CPU reference clock frequency: in KHz */ +#define FREQ_83		83200 +#define FREQ_100	99840 +#define FREQ_133	133200 +#define FREQ_166	166400 + +#define MAX_NUM_FREQS	8 + +/* + * According to Intel 64 and IA-32 System Programming Guide, + * if MSR_PERF_STAT[31] is set, the maximum resolved bus ratio can be + * read in MSR_PLATFORM_ID[12:8], otherwise in MSR_PERF_STAT[44:40]. + * Unfortunately some Intel Atom SoCs aren't quite compliant to this, + * so we need manually differentiate SoC families. This is what the + * field msr_plat does. + */ +struct freq_desc { +	u8 x86_family;	/* CPU family */ +	u8 x86_model;	/* model */ +	u8 msr_plat;	/* 1: use MSR_PLATFORM_INFO, 0: MSR_IA32_PERF_STATUS */ +	u32 freqs[MAX_NUM_FREQS]; +}; + +static struct freq_desc freq_desc_tables[] = { +	/* PNW */ +	{ 6, 0x27, 0, { 0, 0, 0, 0, 0, FREQ_100, 0, FREQ_83 } }, +	/* CLV+ */ +	{ 6, 0x35, 0, { 0, FREQ_133, 0, 0, 0, FREQ_100, 0, FREQ_83 } }, +	/* TNG */ +	{ 6, 0x4a, 1, { 0, FREQ_100, FREQ_133, 0, 0, 0, 0, 0 } }, +	/* VLV2 */ +	{ 6, 0x37, 1, { FREQ_83, FREQ_100, FREQ_133, FREQ_166, 0, 0, 0, 0 } }, +	/* ANN */ +	{ 6, 0x5a, 1, { FREQ_83, FREQ_100, FREQ_133, FREQ_100, 0, 0, 0, 0 } }, +}; + +static int match_cpu(u8 family, u8 model) +{ +	int i; + +	for (i = 0; i < ARRAY_SIZE(freq_desc_tables); i++) { +		if ((family == freq_desc_tables[i].x86_family) && +			(model == freq_desc_tables[i].x86_model)) +			return i; +	} + +	return -1; +} + +/* Map CPU reference clock freq ID(0-7) to CPU reference clock freq(KHz) */ +#define id_to_freq(cpu_index, freq_id) \ +	(freq_desc_tables[cpu_index].freqs[freq_id]) + +/* + * Do MSR calibration only for known/supported CPUs. + * + * Returns the calibration value or 0 if MSR calibration failed. 
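 * As a worked example (illustrative numbers): a bus ratio of 16 read from
 * MSR_PLATFORM_INFO combined with a resolved FSB frequency of FREQ_100
 * (99840 kHz) gives 16 * 99840 = 1597440 kHz, i.e. a TSC running at
 * roughly 1.6 GHz.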
+ */ +unsigned long try_msr_calibrate_tsc(void) +{ +	u32 lo, hi, ratio, freq_id, freq; +	unsigned long res; +	int cpu_index; + +	cpu_index = match_cpu(boot_cpu_data.x86, boot_cpu_data.x86_model); +	if (cpu_index < 0) +		return 0; + +	if (freq_desc_tables[cpu_index].msr_plat) { +		rdmsr(MSR_PLATFORM_INFO, lo, hi); +		ratio = (lo >> 8) & 0x1f; +	} else { +		rdmsr(MSR_IA32_PERF_STATUS, lo, hi); +		ratio = (hi >> 8) & 0x1f; +	} +	pr_info("Maximum core-clock to bus-clock ratio: 0x%x\n", ratio); + +	if (!ratio) +		goto fail; + +	/* Get FSB FREQ ID */ +	rdmsr(MSR_FSB_FREQ, lo, hi); +	freq_id = lo & 0x7; +	freq = id_to_freq(cpu_index, freq_id); +	pr_info("Resolved frequency ID: %u, frequency: %u KHz\n", +				freq_id, freq); +	if (!freq) +		goto fail; + +	/* TSC frequency = maximum resolved freq * maximum resolved bus ratio */ +	res = freq * ratio; +	pr_info("TSC runs at %lu KHz\n", res); + +#ifdef CONFIG_X86_LOCAL_APIC +	lapic_timer_frequency = (freq * 1000) / HZ; +	pr_info("lapic_timer_frequency = %d\n", lapic_timer_frequency); +#endif +	return res; + +fail: +	pr_warn("Fast TSC calibration using MSR failed\n"); +	return 0; +} diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index adfdf56a371..26488487bc6 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -16,7 +16,6 @@   */  #include <linux/spinlock.h>  #include <linux/kernel.h> -#include <linux/init.h>  #include <linux/smp.h>  #include <linux/nmi.h>  #include <asm/tsc.h> diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 2ed845928b5..5d1cbfe4ae5 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -32,20 +32,20 @@  /* Post-execution fixups. */ -/* No fixup needed */ -#define UPROBE_FIX_NONE		0x0 -  /* Adjust IP back to vicinity of actual insn */ -#define UPROBE_FIX_IP		0x1 +#define UPROBE_FIX_IP		0x01  /* Adjust the return address of a call insn */ -#define UPROBE_FIX_CALL	0x2 +#define UPROBE_FIX_CALL		0x02  /* Instruction will modify TF, don't change it */ -#define UPROBE_FIX_SETF	0x4 +#define UPROBE_FIX_SETF		0x04 -#define UPROBE_FIX_RIP_AX	0x8000 -#define UPROBE_FIX_RIP_CX	0x4000 +#define UPROBE_FIX_RIP_SI	0x08 +#define UPROBE_FIX_RIP_DI	0x10 +#define UPROBE_FIX_RIP_BX	0x20 +#define UPROBE_FIX_RIP_MASK	\ +	(UPROBE_FIX_RIP_SI | UPROBE_FIX_RIP_DI | UPROBE_FIX_RIP_BX)  #define	UPROBE_TRAP_NR		UINT_MAX @@ -53,7 +53,7 @@  #define OPCODE1(insn)		((insn)->opcode.bytes[0])  #define OPCODE2(insn)		((insn)->opcode.bytes[1])  #define OPCODE3(insn)		((insn)->opcode.bytes[2]) -#define MODRM_REG(insn)		X86_MODRM_REG(insn->modrm.value) +#define MODRM_REG(insn)		X86_MODRM_REG((insn)->modrm.value)  #define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\  	(((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) |   \ @@ -67,6 +67,7 @@   * to keep gcc from statically optimizing it out, as variable_test_bit makes   * some versions of gcc to think only *(unsigned long*) is used.   
*/ +#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)  static volatile u32 good_insns_32[256 / 32] = {  	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */  	/*      ----------------------------------------------         */ @@ -89,33 +90,12 @@ static volatile u32 good_insns_32[256 / 32] = {  	/*      ----------------------------------------------         */  	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */  }; +#else +#define good_insns_32	NULL +#endif -/* Using this for both 64-bit and 32-bit apps */ -static volatile u32 good_2byte_insns[256 / 32] = { -	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */ -	/*      ----------------------------------------------         */ -	W(0x00, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1) | /* 00 */ -	W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* 10 */ -	W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */ -	W(0x30, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */ -	W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */ -	W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */ -	W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 60 */ -	W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) , /* 70 */ -	W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */ -	W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */ -	W(0xa0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1) | /* a0 */ -	W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* b0 */ -	W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */ -	W(0xd0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ -	W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* e0 */ -	W(0xf0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0)   /* f0 */ -	/*      ----------------------------------------------         */ -	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */ -}; - -#ifdef CONFIG_X86_64  /* Good-instruction tables for 64-bit apps */ +#if defined(CONFIG_X86_64)  static volatile u32 good_insns_64[256 / 32] = {  	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */  	/*      ----------------------------------------------         */ @@ -138,7 +118,33 @@ static volatile u32 good_insns_64[256 / 32] = {  	/*      ----------------------------------------------         */  	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */  }; +#else +#define good_insns_64	NULL  #endif + +/* Using this for both 64-bit and 32-bit apps */ +static volatile u32 good_2byte_insns[256 / 32] = { +	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */ +	/*      ----------------------------------------------         */ +	W(0x00, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1) | /* 00 */ +	W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* 10 */ +	W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */ +	W(0x30, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */ +	W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */ +	W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */ +	W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 60 */ +	W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) , /* 70 */ +	W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */ +	W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */ +	W(0xa0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1) | /* a0 */ +	W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 
1, 1, 1, 1, 1, 1, 1) , /* b0 */ +	W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */ +	W(0xd0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ +	W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* e0 */ +	W(0xf0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0)   /* f0 */ +	/*      ----------------------------------------------         */ +	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */ +};  #undef W  /* @@ -209,16 +215,25 @@ static bool is_prefix_bad(struct insn *insn)  	return false;  } -static int validate_insn_32bits(struct arch_uprobe *auprobe, struct insn *insn) +static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool x86_64)  { -	insn_init(insn, auprobe->insn, false); +	u32 volatile *good_insns; + +	insn_init(insn, auprobe->insn, x86_64); +	/* has the side-effect of processing the entire instruction */ +	insn_get_length(insn); +	if (WARN_ON_ONCE(!insn_complete(insn))) +		return -ENOEXEC; -	/* Skip good instruction prefixes; reject "bad" ones. */ -	insn_get_opcode(insn);  	if (is_prefix_bad(insn))  		return -ENOTSUPP; -	if (test_bit(OPCODE1(insn), (unsigned long *)good_insns_32)) +	if (x86_64) +		good_insns = good_insns_64; +	else +		good_insns = good_insns_32; + +	if (test_bit(OPCODE1(insn), (unsigned long *)good_insns))  		return 0;  	if (insn->opcode.nbytes == 2) { @@ -229,72 +244,19 @@ static int validate_insn_32bits(struct arch_uprobe *auprobe, struct insn *insn)  	return -ENOTSUPP;  } -/* - * Figure out which fixups arch_uprobe_post_xol() will need to perform, and - * annotate arch_uprobe->fixups accordingly.  To start with, - * arch_uprobe->fixups is either zero or it reflects rip-related fixups. - */ -static void prepare_fixups(struct arch_uprobe *auprobe, struct insn *insn) +#ifdef CONFIG_X86_64 +static inline bool is_64bit_mm(struct mm_struct *mm)  { -	bool fix_ip = true, fix_call = false;	/* defaults */ -	int reg; - -	insn_get_opcode(insn);	/* should be a nop */ - -	switch (OPCODE1(insn)) { -	case 0x9d: -		/* popf */ -		auprobe->fixups |= UPROBE_FIX_SETF; -		break; -	case 0xc3:		/* ret/lret */ -	case 0xcb: -	case 0xc2: -	case 0xca: -		/* ip is correct */ -		fix_ip = false; -		break; -	case 0xe8:		/* call relative - Fix return addr */ -		fix_call = true; -		break; -	case 0x9a:		/* call absolute - Fix return addr, not ip */ -		fix_call = true; -		fix_ip = false; -		break; -	case 0xff: -		insn_get_modrm(insn); -		reg = MODRM_REG(insn); -		if (reg == 2 || reg == 3) { -			/* call or lcall, indirect */ -			/* Fix return addr; ip is correct. */ -			fix_call = true; -			fix_ip = false; -		} else if (reg == 4 || reg == 5) { -			/* jmp or ljmp, indirect */ -			/* ip is correct. */ -			fix_ip = false; -		} -		break; -	case 0xea:		/* jmp absolute -- ip is correct */ -		fix_ip = false; -		break; -	default: -		break; -	} -	if (fix_ip) -		auprobe->fixups |= UPROBE_FIX_IP; -	if (fix_call) -		auprobe->fixups |= UPROBE_FIX_CALL; +	return	!config_enabled(CONFIG_IA32_EMULATION) || +		!(mm->context.ia32_compat == TIF_IA32);  } - -#ifdef CONFIG_X86_64  /*   * If arch_uprobe->insn doesn't use rip-relative addressing, return   * immediately.  Otherwise, rewrite the instruction so that it accesses   * its memory operand indirectly through a scratch register.  Set - * arch_uprobe->fixups and arch_uprobe->rip_rela_target_address - * accordingly.  (The contents of the scratch register will be saved - * before we single-step the modified instruction, and restored - * afterward.) + * defparam->fixups accordingly. 
(The contents of the scratch register + * will be saved before we single-step the modified instruction, + * and restored afterward).   *   * We do this because a rip-relative instruction can access only a   * relatively small area (+/- 2 GB from the instruction), and the XOL @@ -305,248 +267,513 @@ static void prepare_fixups(struct arch_uprobe *auprobe, struct insn *insn)   *   * Some useful facts about rip-relative instructions:   * - *  - There's always a modrm byte. + *  - There's always a modrm byte with bit layout "00 reg 101".   *  - There's never a SIB byte.   *  - The displacement is always 4 bytes. + *  - REX.B=1 bit in REX prefix, which normally extends r/m field, + *    has no effect on rip-relative mode. It doesn't make modrm byte + *    with r/m=101 refer to register 1101 = R13.   */ -static void -handle_riprel_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn) +static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn)  {  	u8 *cursor;  	u8 reg; +	u8 reg2; -	if (mm->context.ia32_compat) -		return; - -	auprobe->rip_rela_target_address = 0x0;  	if (!insn_rip_relative(insn))  		return;  	/* -	 * insn_rip_relative() would have decoded rex_prefix, modrm. +	 * insn_rip_relative() would have decoded rex_prefix, vex_prefix, modrm.  	 * Clear REX.b bit (extension of MODRM.rm field): -	 * we want to encode rax/rcx, not r8/r9. +	 * we want to encode low numbered reg, not r8+.  	 */  	if (insn->rex_prefix.nbytes) {  		cursor = auprobe->insn + insn_offset_rex_prefix(insn); -		*cursor &= 0xfe;	/* Clearing REX.B bit */ +		/* REX byte has 0100wrxb layout, clearing REX.b bit */ +		*cursor &= 0xfe; +	} +	/* +	 * Similar treatment for VEX3 prefix. +	 * TODO: add XOP/EVEX treatment when insn decoder supports them +	 */ +	if (insn->vex_prefix.nbytes == 3) { +		/* +		 * vex2:     c5    rvvvvLpp   (has no b bit) +		 * vex3/xop: c4/8f rxbmmmmm wvvvvLpp +		 * evex:     62    rxbR00mm wvvvv1pp zllBVaaa +		 *   (evex will need setting of both b and x since +		 *   in non-sib encoding evex.x is 4th bit of MODRM.rm) +		 * Setting VEX3.b (setting because it has inverted meaning): +		 */ +		cursor = auprobe->insn + insn_offset_vex_prefix(insn) + 1; +		*cursor |= 0x20;  	}  	/* +	 * Convert from rip-relative addressing to register-relative addressing +	 * via a scratch register. +	 * +	 * This is tricky since there are insns with modrm byte +	 * which also use registers not encoded in modrm byte: +	 * [i]div/[i]mul: implicitly use dx:ax +	 * shift ops: implicitly use cx +	 * cmpxchg: implicitly uses ax +	 * cmpxchg8/16b: implicitly uses dx:ax and bx:cx +	 *   Encoding: 0f c7/1 modrm +	 *   The code below thinks that reg=1 (cx), chooses si as scratch. +	 * mulx: implicitly uses dx: mulx r/m,r1,r2 does r1:r2 = dx * r/m. +	 *   First appeared in Haswell (BMI2 insn). It is vex-encoded. +	 *   Example where none of bx,cx,dx can be used as scratch reg: +	 *   c4 e2 63 f6 0d disp32   mulx disp32(%rip),%ebx,%ecx +	 * [v]pcmpistri: implicitly uses cx, xmm0 +	 * [v]pcmpistrm: implicitly uses xmm0 +	 * [v]pcmpestri: implicitly uses ax, dx, cx, xmm0 +	 * [v]pcmpestrm: implicitly uses ax, dx, xmm0 +	 *   Evil SSE4.2 string comparison ops from hell. +	 * maskmovq/[v]maskmovdqu: implicitly uses (ds:rdi) as destination. +	 *   Encoding: 0f f7 modrm, 66 0f f7 modrm, vex-encoded: c5 f9 f7 modrm. +	 *   Store op1, byte-masked by op2 msb's in each byte, to (ds:rdi). 
+	 *   AMD says it has no 3-operand form (vex.vvvv must be 1111) +	 *   and that it can have only register operands, not mem +	 *   (its modrm byte must have mode=11). +	 *   If these restrictions will ever be lifted, +	 *   we'll need code to prevent selection of di as scratch reg! +	 * +	 * Summary: I don't know any insns with modrm byte which +	 * use SI register implicitly. DI register is used only +	 * by one insn (maskmovq) and BX register is used +	 * only by one too (cmpxchg8b). +	 * BP is stack-segment based (may be a problem?). +	 * AX, DX, CX are off-limits (many implicit users). +	 * SP is unusable (it's stack pointer - think about "pop mem"; +	 * also, rsp+disp32 needs sib encoding -> insn length change). +	 */ + +	reg = MODRM_REG(insn);	/* Fetch modrm.reg */ +	reg2 = 0xff;		/* Fetch vex.vvvv */ +	if (insn->vex_prefix.nbytes == 2) +		reg2 = insn->vex_prefix.bytes[1]; +	else if (insn->vex_prefix.nbytes == 3) +		reg2 = insn->vex_prefix.bytes[2]; +	/* +	 * TODO: add XOP, EXEV vvvv reading. +	 * +	 * vex.vvvv field is in bits 6-3, bits are inverted. +	 * But in 32-bit mode, high-order bit may be ignored. +	 * Therefore, let's consider only 3 low-order bits. +	 */ +	reg2 = ((reg2 >> 3) & 0x7) ^ 0x7; +	/* +	 * Register numbering is ax,cx,dx,bx, sp,bp,si,di, r8..r15. +	 * +	 * Choose scratch reg. Order is important: must not select bx +	 * if we can use si (cmpxchg8b case!) +	 */ +	if (reg != 6 && reg2 != 6) { +		reg2 = 6; +		auprobe->defparam.fixups |= UPROBE_FIX_RIP_SI; +	} else if (reg != 7 && reg2 != 7) { +		reg2 = 7; +		auprobe->defparam.fixups |= UPROBE_FIX_RIP_DI; +		/* TODO (paranoia): force maskmovq to not use di */ +	} else { +		reg2 = 3; +		auprobe->defparam.fixups |= UPROBE_FIX_RIP_BX; +	} +	/*  	 * Point cursor at the modrm byte.  The next 4 bytes are the  	 * displacement.  Beyond the displacement, for some instructions,  	 * is the immediate operand.  	 */  	cursor = auprobe->insn + insn_offset_modrm(insn); -	insn_get_length(insn); -  	/* -	 * Convert from rip-relative addressing to indirect addressing -	 * via a scratch register.  Change the r/m field from 0x5 (%rip) -	 * to 0x0 (%rax) or 0x1 (%rcx), and squeeze out the offset field. +	 * Change modrm from "00 reg 101" to "10 reg reg2". Example: +	 * 89 05 disp32  mov %eax,disp32(%rip) becomes +	 * 89 86 disp32  mov %eax,disp32(%rsi)  	 */ -	reg = MODRM_REG(insn); -	if (reg == 0) { -		/* -		 * The register operand (if any) is either the A register -		 * (%rax, %eax, etc.) or (if the 0x4 bit is set in the -		 * REX prefix) %r8.  In any case, we know the C register -		 * is NOT the register operand, so we use %rcx (register -		 * #1) for the scratch register. -		 */ -		auprobe->fixups = UPROBE_FIX_RIP_CX; -		/* Change modrm from 00 000 101 to 00 000 001. */ -		*cursor = 0x1; -	} else { -		/* Use %rax (register #0) for the scratch register. */ -		auprobe->fixups = UPROBE_FIX_RIP_AX; -		/* Change modrm from 00 xxx 101 to 00 xxx 000 */ -		*cursor = (reg << 3); -	} - -	/* Target address = address of next instruction + (signed) offset */ -	auprobe->rip_rela_target_address = (long)insn->length + insn->displacement.value; - -	/* Displacement field is gone; slide immediate field (if any) over. 
*/ -	if (insn->immediate.nbytes) { -		cursor++; -		memmove(cursor, cursor + insn->displacement.nbytes, insn->immediate.nbytes); -	} -	return; +	*cursor = 0x80 | (reg << 3) | reg2;  } -static int validate_insn_64bits(struct arch_uprobe *auprobe, struct insn *insn) +static inline unsigned long * +scratch_reg(struct arch_uprobe *auprobe, struct pt_regs *regs)  { -	insn_init(insn, auprobe->insn, true); - -	/* Skip good instruction prefixes; reject "bad" ones. */ -	insn_get_opcode(insn); -	if (is_prefix_bad(insn)) -		return -ENOTSUPP; +	if (auprobe->defparam.fixups & UPROBE_FIX_RIP_SI) +		return ®s->si; +	if (auprobe->defparam.fixups & UPROBE_FIX_RIP_DI) +		return ®s->di; +	return ®s->bx; +} -	if (test_bit(OPCODE1(insn), (unsigned long *)good_insns_64)) -		return 0; +/* + * If we're emulating a rip-relative instruction, save the contents + * of the scratch register and store the target address in that register. + */ +static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ +	if (auprobe->defparam.fixups & UPROBE_FIX_RIP_MASK) { +		struct uprobe_task *utask = current->utask; +		unsigned long *sr = scratch_reg(auprobe, regs); -	if (insn->opcode.nbytes == 2) { -		if (test_bit(OPCODE2(insn), (unsigned long *)good_2byte_insns)) -			return 0; +		utask->autask.saved_scratch_register = *sr; +		*sr = utask->vaddr + auprobe->defparam.ilen;  	} -	return -ENOTSUPP;  } -static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn) +static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)  { -	if (mm->context.ia32_compat) -		return validate_insn_32bits(auprobe, insn); -	return validate_insn_64bits(auprobe, insn); +	if (auprobe->defparam.fixups & UPROBE_FIX_RIP_MASK) { +		struct uprobe_task *utask = current->utask; +		unsigned long *sr = scratch_reg(auprobe, regs); + +		*sr = utask->autask.saved_scratch_register; +	}  }  #else /* 32-bit: */ -static void handle_riprel_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn) +static inline bool is_64bit_mm(struct mm_struct *mm)  { -	/* No RIP-relative addressing on 32-bit */ +	return false;  } - -static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm,  struct insn *insn) +/* + * No RIP-relative addressing on 32-bit + */ +static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn) +{ +} +static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ +} +static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)  { -	return validate_insn_32bits(auprobe, insn);  }  #endif /* CONFIG_X86_64 */ -/** - * arch_uprobe_analyze_insn - instruction analysis including validity and fixups. - * @mm: the probed address space. - * @arch_uprobe: the probepoint information. - * @addr: virtual address at which to install the probepoint - * Return 0 on success or a -ve number on error. - */ -int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr) +struct uprobe_xol_ops { +	bool	(*emulate)(struct arch_uprobe *, struct pt_regs *); +	int	(*pre_xol)(struct arch_uprobe *, struct pt_regs *); +	int	(*post_xol)(struct arch_uprobe *, struct pt_regs *); +	void	(*abort)(struct arch_uprobe *, struct pt_regs *); +}; + +static inline int sizeof_long(void)  { -	int ret; -	struct insn insn; +	return is_ia32_task() ? 
4 : 8; +} -	auprobe->fixups = 0; -	ret = validate_insn_bits(auprobe, mm, &insn); -	if (ret != 0) -		return ret; +static int default_pre_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ +	riprel_pre_xol(auprobe, regs); +	return 0; +} -	handle_riprel_insn(auprobe, mm, &insn); -	prepare_fixups(auprobe, &insn); +static int push_ret_address(struct pt_regs *regs, unsigned long ip) +{ +	unsigned long new_sp = regs->sp - sizeof_long(); +	if (copy_to_user((void __user *)new_sp, &ip, sizeof_long())) +		return -EFAULT; + +	regs->sp = new_sp;  	return 0;  } -#ifdef CONFIG_X86_64  /* - * If we're emulating a rip-relative instruction, save the contents - * of the scratch register and store the target address in that register. + * We have to fix things up as follows: + * + * Typically, the new ip is relative to the copied instruction.  We need + * to make it relative to the original instruction (FIX_IP).  Exceptions + * are return instructions and absolute or indirect jump or call instructions. + * + * If the single-stepped instruction was a call, the return address that + * is atop the stack is the address following the copied instruction.  We + * need to make it the address following the original instruction (FIX_CALL). + * + * If the original instruction was a rip-relative instruction such as + * "movl %edx,0xnnnn(%rip)", we have instead executed an equivalent + * instruction using a scratch register -- e.g., "movl %edx,0xnnnn(%rsi)". + * We need to restore the contents of the scratch register + * (FIX_RIP_reg).   */ -static void -pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs, -				struct arch_uprobe_task *autask) -{ -	if (auprobe->fixups & UPROBE_FIX_RIP_AX) { -		autask->saved_scratch_register = regs->ax; -		regs->ax = current->utask->vaddr; -		regs->ax += auprobe->rip_rela_target_address; -	} else if (auprobe->fixups & UPROBE_FIX_RIP_CX) { -		autask->saved_scratch_register = regs->cx; -		regs->cx = current->utask->vaddr; -		regs->cx += auprobe->rip_rela_target_address; +static int default_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ +	struct uprobe_task *utask = current->utask; + +	riprel_post_xol(auprobe, regs); +	if (auprobe->defparam.fixups & UPROBE_FIX_IP) { +		long correction = utask->vaddr - utask->xol_vaddr; +		regs->ip += correction; +	} else if (auprobe->defparam.fixups & UPROBE_FIX_CALL) { +		regs->sp += sizeof_long(); /* Pop incorrect return address */ +		if (push_ret_address(regs, utask->vaddr + auprobe->defparam.ilen)) +			return -ERESTART;  	} +	/* popf; tell the caller to not touch TF */ +	if (auprobe->defparam.fixups & UPROBE_FIX_SETF) +		utask->autask.saved_tf = true; + +	return 0;  } -#else -static void -pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs, -				struct arch_uprobe_task *autask) + +static void default_abort_op(struct arch_uprobe *auprobe, struct pt_regs *regs)  { -	/* No RIP-relative addressing on 32-bit */ +	riprel_post_xol(auprobe, regs);  } -#endif -/* - * arch_uprobe_pre_xol - prepare to execute out of line. - * @auprobe: the probepoint information. - * @regs: reflects the saved user state of current task. 
- */ -int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) +static struct uprobe_xol_ops default_xol_ops = { +	.pre_xol  = default_pre_xol_op, +	.post_xol = default_post_xol_op, +	.abort	  = default_abort_op, +}; + +static bool branch_is_call(struct arch_uprobe *auprobe)  { -	struct arch_uprobe_task *autask; +	return auprobe->branch.opc1 == 0xe8; +} -	autask = ¤t->utask->autask; -	autask->saved_trap_nr = current->thread.trap_nr; -	current->thread.trap_nr = UPROBE_TRAP_NR; -	regs->ip = current->utask->xol_vaddr; -	pre_xol_rip_insn(auprobe, regs, autask); +#define CASE_COND					\ +	COND(70, 71, XF(OF))				\ +	COND(72, 73, XF(CF))				\ +	COND(74, 75, XF(ZF))				\ +	COND(78, 79, XF(SF))				\ +	COND(7a, 7b, XF(PF))				\ +	COND(76, 77, XF(CF) || XF(ZF))			\ +	COND(7c, 7d, XF(SF) != XF(OF))			\ +	COND(7e, 7f, XF(ZF) || XF(SF) != XF(OF)) -	autask->saved_tf = !!(regs->flags & X86_EFLAGS_TF); -	regs->flags |= X86_EFLAGS_TF; -	if (test_tsk_thread_flag(current, TIF_BLOCKSTEP)) -		set_task_blockstep(current, false); +#define COND(op_y, op_n, expr)				\ +	case 0x ## op_y: DO((expr) != 0)		\ +	case 0x ## op_n: DO((expr) == 0) -	return 0; +#define XF(xf)	(!!(flags & X86_EFLAGS_ ## xf)) + +static bool is_cond_jmp_opcode(u8 opcode) +{ +	switch (opcode) { +	#define DO(expr)	\ +		return true; +	CASE_COND +	#undef	DO + +	default: +		return false; +	}  } -/* - * This function is called by arch_uprobe_post_xol() to adjust the return - * address pushed by a call instruction executed out of line. - */ -static int adjust_ret_addr(unsigned long sp, long correction) +static bool check_jmp_cond(struct arch_uprobe *auprobe, struct pt_regs *regs)  { -	int rasize, ncopied; -	long ra = 0; +	unsigned long flags = regs->flags; -	if (is_ia32_task()) -		rasize = 4; -	else -		rasize = 8; +	switch (auprobe->branch.opc1) { +	#define DO(expr)	\ +		return expr; +	CASE_COND +	#undef	DO -	ncopied = copy_from_user(&ra, (void __user *)sp, rasize); -	if (unlikely(ncopied)) -		return -EFAULT; +	default:	/* not a conditional jmp */ +		return true; +	} +} -	ra += correction; -	ncopied = copy_to_user((void __user *)sp, &ra, rasize); -	if (unlikely(ncopied)) -		return -EFAULT; +#undef	XF +#undef	COND +#undef	CASE_COND -	return 0; +static bool branch_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ +	unsigned long new_ip = regs->ip += auprobe->branch.ilen; +	unsigned long offs = (long)auprobe->branch.offs; + +	if (branch_is_call(auprobe)) { +		/* +		 * If it fails we execute this (mangled, see the comment in +		 * branch_clear_offset) insn out-of-line. In the likely case +		 * this should trigger the trap, and the probed application +		 * should die or restart the same insn after it handles the +		 * signal, arch_uprobe_post_xol() won't be even called. +		 * +		 * But there is corner case, see the comment in ->post_xol(). +		 */ +		if (push_ret_address(regs, new_ip)) +			return false; +	} else if (!check_jmp_cond(auprobe, regs)) { +		offs = 0; +	} + +	regs->ip = new_ip + offs; +	return true;  } -#ifdef CONFIG_X86_64 -static bool is_riprel_insn(struct arch_uprobe *auprobe) +static int branch_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)  { -	return ((auprobe->fixups & (UPROBE_FIX_RIP_AX | UPROBE_FIX_RIP_CX)) != 0); +	BUG_ON(!branch_is_call(auprobe)); +	/* +	 * We can only get here if branch_emulate_op() failed to push the ret +	 * address _and_ another thread expanded our stack before the (mangled) +	 * "call" insn was executed out-of-line. Just restore ->sp and restart. 
+	 * We could also restore ->ip and try to call branch_emulate_op() again. +	 */ +	regs->sp += sizeof_long(); +	return -ERESTART; +} + +static void branch_clear_offset(struct arch_uprobe *auprobe, struct insn *insn) +{ +	/* +	 * Turn this insn into "call 1f; 1:", this is what we will execute +	 * out-of-line if ->emulate() fails. We only need this to generate +	 * a trap, so that the probed task receives the correct signal with +	 * the properly filled siginfo. +	 * +	 * But see the comment in ->post_xol(), in the unlikely case it can +	 * succeed. So we need to ensure that the new ->ip can not fall into +	 * the non-canonical area and trigger #GP. +	 * +	 * We could turn it into (say) "pushf", but then we would need to +	 * divorce ->insn[] and ->ixol[]. We need to preserve the 1st byte +	 * of ->insn[] for set_orig_insn(). +	 */ +	memset(auprobe->insn + insn_offset_immediate(insn), +		0, insn->immediate.nbytes);  } -static void -handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, long *correction) +static struct uprobe_xol_ops branch_xol_ops = { +	.emulate  = branch_emulate_op, +	.post_xol = branch_post_xol_op, +}; + +/* Returns -ENOSYS if branch_xol_ops doesn't handle this insn */ +static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)  { -	if (is_riprel_insn(auprobe)) { -		struct arch_uprobe_task *autask; +	u8 opc1 = OPCODE1(insn); +	int i; -		autask = ¤t->utask->autask; -		if (auprobe->fixups & UPROBE_FIX_RIP_AX) -			regs->ax = autask->saved_scratch_register; -		else -			regs->cx = autask->saved_scratch_register; +	switch (opc1) { +	case 0xeb:	/* jmp 8 */ +	case 0xe9:	/* jmp 32 */ +	case 0x90:	/* prefix* + nop; same as jmp with .offs = 0 */ +		break; + +	case 0xe8:	/* call relative */ +		branch_clear_offset(auprobe, insn); +		break; +	case 0x0f: +		if (insn->opcode.nbytes != 2) +			return -ENOSYS;  		/* -		 * The original instruction includes a displacement, and so -		 * is 4 bytes longer than what we've just single-stepped. -		 * Fall through to handle stuff like "jmpq *...(%rip)" and -		 * "callq *...(%rip)". +		 * If it is a "near" conditional jmp, OPCODE2() - 0x10 matches +		 * OPCODE1() of the "short" jmp which checks the same condition.  		 */ -		if (correction) -			*correction += 4; +		opc1 = OPCODE2(insn) - 0x10; +	default: +		if (!is_cond_jmp_opcode(opc1)) +			return -ENOSYS; +	} + +	/* +	 * 16-bit overrides such as CALLW (66 e8 nn nn) are not supported. +	 * Intel and AMD behavior differ in 64-bit mode: Intel ignores 66 prefix. +	 * No one uses these insns, reject any branch insns with such prefix. +	 */ +	for (i = 0; i < insn->prefixes.nbytes; i++) { +		if (insn->prefixes.bytes[i] == 0x66) +			return -ENOTSUPP;  	} + +	auprobe->branch.opc1 = opc1; +	auprobe->branch.ilen = insn->length; +	auprobe->branch.offs = insn->immediate.value; + +	auprobe->ops = &branch_xol_ops; +	return 0;  } -#else -static void -handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, long *correction) + +/** + * arch_uprobe_analyze_insn - instruction analysis including validity and fixups. + * @mm: the probed address space. + * @arch_uprobe: the probepoint information. + * @addr: virtual address at which to install the probepoint + * Return 0 on success or a -ve number on error. 
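 * For illustration (not part of the patch): a probed short "jne" (opcode
 * 0x75) is claimed by branch_setup_xol_ops() and emulated in place via
 * branch_xol_ops, while a rip-relative "mov disp32(%rip),%eax" falls
 * through to default_xol_ops with UPROBE_FIX_RIP_SI set by riprel_analyze().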
+ */
+int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr)
 {
-	/* No RIP-relative addressing on 32-bit */
+	struct insn insn;
+	u8 fix_ip_or_call = UPROBE_FIX_IP;
+	int ret;
+
+	ret = uprobe_init_insn(auprobe, &insn, is_64bit_mm(mm));
+	if (ret)
+		return ret;
+
+	ret = branch_setup_xol_ops(auprobe, &insn);
+	if (ret != -ENOSYS)
+		return ret;
+
+	/*
+	 * Figure out which fixups default_post_xol_op() will need to perform,
+	 * and annotate defparam->fixups accordingly.
+	 */
+	switch (OPCODE1(&insn)) {
+	case 0x9d:		/* popf */
+		auprobe->defparam.fixups |= UPROBE_FIX_SETF;
+		break;
+	case 0xc3:		/* ret or lret -- ip is correct */
+	case 0xcb:
+	case 0xc2:
+	case 0xca:
+	case 0xea:		/* jmp absolute -- ip is correct */
+		fix_ip_or_call = 0;
+		break;
+	case 0x9a:		/* call absolute - Fix return addr, not ip */
+		fix_ip_or_call = UPROBE_FIX_CALL;
+		break;
+	case 0xff:
+		switch (MODRM_REG(&insn)) {
+		case 2: case 3:			/* call or lcall, indirect */
+			fix_ip_or_call = UPROBE_FIX_CALL;
+			break;
+		case 4: case 5:			/* jmp or ljmp, indirect */
+			fix_ip_or_call = 0;
+			break;
+		}
+		/* fall through */
+	default:
+		riprel_analyze(auprobe, &insn);
+	}
+
+	auprobe->defparam.ilen = insn.length;
+	auprobe->defparam.fixups |= fix_ip_or_call;
+
+	auprobe->ops = &default_xol_ops;
+	return 0;
+}
+
+/*
+ * arch_uprobe_pre_xol - prepare to execute out of line.
+ * @auprobe: the probepoint information.
+ * @regs: reflects the saved user state of current task.
+ */
+int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+	struct uprobe_task *utask = current->utask;
+
+	if (auprobe->ops->pre_xol) {
+		int err = auprobe->ops->pre_xol(auprobe, regs);
+		if (err)
+			return err;
+	}
+
+	regs->ip = utask->xol_vaddr;
+	utask->autask.saved_trap_nr = current->thread.trap_nr;
+	current->thread.trap_nr = UPROBE_TRAP_NR;
+
+	utask->autask.saved_tf = !!(regs->flags & X86_EFLAGS_TF);
+	regs->flags |= X86_EFLAGS_TF;
+	if (test_tsk_thread_flag(current, TIF_BLOCKSTEP))
+		set_task_blockstep(current, false);
+
+	return 0;
 }
-#endif
 
 /*
  * If xol insn itself traps and generates a signal(Say,
@@ -572,53 +799,42 @@ bool arch_uprobe_xol_was_trapped(struct task_struct *t)
  * single-step, we single-stepped a copy of the instruction.
  *
  * This function prepares to resume execution after the single-step.
- * We have to fix things up as follows:
- *
- * Typically, the new ip is relative to the copied instruction.  We need
- * to make it relative to the original instruction (FIX_IP).  Exceptions
- * are return instructions and absolute or indirect jump or call instructions.
- *
- * If the single-stepped instruction was a call, the return address that
- * is atop the stack is the address following the copied instruction.  We
- * need to make it the address following the original instruction (FIX_CALL).
- *
- * If the original instruction was a rip-relative instruction such as
- * "movl %edx,0xnnnn(%rip)", we have instead executed an equivalent
- * instruction using a scratch register -- e.g., "movl %edx,(%rax)".
- * We need to restore the contents of the scratch register and adjust
- * the ip, keeping in mind that the instruction we executed is 4 bytes
- * shorter than the original instruction (since we squeezed out the offset
- * field).  (FIX_RIP_AX or FIX_RIP_CX)
  */
 int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
 {
-	struct uprobe_task *utask;
-	long correction;
-	int result = 0;
+	struct uprobe_task *utask = current->utask;
+	bool send_sigtrap = utask->autask.saved_tf;
+	int err = 0;
 
 	WARN_ON_ONCE(current->thread.trap_nr != UPROBE_TRAP_NR);
-
-	utask = current->utask;
 	current->thread.trap_nr = utask->autask.saved_trap_nr;
 
-	correction = (long)(utask->vaddr - utask->xol_vaddr);
-	handle_riprel_post_xol(auprobe, regs, &correction);
-	if (auprobe->fixups & UPROBE_FIX_IP)
-		regs->ip += correction;
-
-	if (auprobe->fixups & UPROBE_FIX_CALL)
-		result = adjust_ret_addr(regs->sp, correction);
+	if (auprobe->ops->post_xol) {
+		err = auprobe->ops->post_xol(auprobe, regs);
+		if (err) {
+			/*
+			 * Restore ->ip for restart or post mortem analysis.
+			 * ->post_xol() must not return -ERESTART unless this
+			 * is really possible.
+			 */
+			regs->ip = utask->vaddr;
+			if (err == -ERESTART)
+				err = 0;
+			send_sigtrap = false;
+		}
+	}
 	/*
 	 * arch_uprobe_pre_xol() doesn't save the state of TIF_BLOCKSTEP
 	 * so we can get an extra SIGTRAP if we do not clear TF. We need
 	 * to examine the opcode to make it right.
 	 */
-	if (utask->autask.saved_tf)
+	if (send_sigtrap)
 		send_sig(SIGTRAP, current, 0);
-	else if (!(auprobe->fixups & UPROBE_FIX_SETF))
+
+	if (!utask->autask.saved_tf)
 		regs->flags &= ~X86_EFLAGS_TF;
 
-	return result;
+	return err;
 }
 
 /* callback routine for handling exceptions. */
@@ -652,41 +868,27 @@ int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val,
 
 /*
  * This function gets called when XOL instruction either gets trapped or
- * the thread has a fatal signal, so reset the instruction pointer to its
- * probed address.
+ * the thread has a fatal signal. Reset the instruction pointer to its
+ * probed address for the potential restart or for post mortem analysis.
  */
 void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
 {
 	struct uprobe_task *utask = current->utask;
 
-	current->thread.trap_nr = utask->autask.saved_trap_nr;
-	handle_riprel_post_xol(auprobe, regs, NULL);
-	instruction_pointer_set(regs, utask->vaddr);
+	if (auprobe->ops->abort)
+		auprobe->ops->abort(auprobe, regs);
+
+	current->thread.trap_nr = utask->autask.saved_trap_nr;
+	regs->ip = utask->vaddr;
 	/* clear TF if it was set by us in arch_uprobe_pre_xol() */
 	if (!utask->autask.saved_tf)
 		regs->flags &= ~X86_EFLAGS_TF;
 }
 
-/*
- * Skip these instructions as per the currently known x86 ISA.
- * rep=0x66*; nop=0x90
- */
 static bool __skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
 {
-	int i;
-
-	for (i = 0; i < MAX_UINSN_BYTES; i++) {
-		if (auprobe->insn[i] == 0x66)
-			continue;
-
-		if (auprobe->insn[i] == 0x90) {
-			regs->ip += i + 1;
-			return true;
-		}
-
-		break;
-	}
+	if (auprobe->ops->emulate)
+		return auprobe->ops->emulate(auprobe, regs);
 	return false;
 }
 
@@ -701,23 +903,21 @@ bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
 unsigned long
 arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs)
 {
-	int rasize, ncopied;
+	int rasize = sizeof_long(), nleft;
 	unsigned long orig_ret_vaddr = 0; /* clear high bits for 32-bit apps */
 
-	rasize = is_ia32_task() ? 4 : 8;
-	ncopied = copy_from_user(&orig_ret_vaddr, (void __user *)regs->sp, rasize);
-	if (unlikely(ncopied))
+	if (copy_from_user(&orig_ret_vaddr, (void __user *)regs->sp, rasize))
 		return -1;
 
 	/* check whether address has been already hijacked */
 	if (orig_ret_vaddr == trampoline_vaddr)
 		return orig_ret_vaddr;
 
-	ncopied = copy_to_user((void __user *)regs->sp, &trampoline_vaddr, rasize);
-	if (likely(!ncopied))
+	nleft = copy_to_user((void __user *)regs->sp, &trampoline_vaddr, rasize);
+	if (likely(!nleft))
		return orig_ret_vaddr;
 
-	if (ncopied != rasize) {
+	if (nleft != rasize) {
 		pr_err("uprobe: return address clobbered: pid=%d, %%sp=%#lx, "
 			"%%ip=%#lx\n", current->pid, regs->sp, regs->ip);
 
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 10c4f3006af..49edf2dd361 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -147,7 +147,6 @@ SECTIONS
 		_edata = .;
 	} :data
 
-#ifdef CONFIG_X86_64
 	. = ALIGN(PAGE_SIZE);
 	__vvar_page = .;
 
@@ -165,12 +164,15 @@ SECTIONS
 #undef __VVAR_KERNEL_LDS
 #undef EMIT_VVAR
 
+		/*
+		 * Pad the rest of the page with zeros.  Otherwise the loader
+		 * can leave garbage here.
+		 */
+		. = __vvar_beginning_hack + PAGE_SIZE;
 	} :data
 
        . = ALIGN(__vvar_page + PAGE_SIZE, PAGE_SIZE);
 
-#endif /* CONFIG_X86_64 */
-
 	/* Init code and data - will be freed after init */
 	. = ALIGN(PAGE_SIZE);
 	.init.begin : AT(ADDR(.init.begin) - LOAD_OFFSET) {
@@ -199,6 +201,15 @@ SECTIONS
 		__x86_cpu_dev_end = .;
 	}
 
+#ifdef CONFIG_X86_INTEL_MID
+	.x86_intel_mid_dev.init : AT(ADDR(.x86_intel_mid_dev.init) - \
+								LOAD_OFFSET) {
+		__x86_intel_mid_dev_start = .;
+		*(.x86_intel_mid_dev.init)
+		__x86_intel_mid_dev_end = .;
+	}
+#endif
+
 	/*
 	 * start address and size of operations which during runtime
 	 * can be patched with virtualization friendly instructions or
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index 992f890283e..b99b9ad8540 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -26,6 +26,9 @@
 
 #define TOPOLOGY_REGISTER_OFFSET 0x10
 
+/* Flag below is initialized once during vSMP PCI initialization. */
+static int irq_routing_comply = 1;
+
 #if defined CONFIG_PCI && defined CONFIG_PARAVIRT
 /*
  * Interrupt control on vSMPowered systems:
@@ -33,7 +36,7 @@
  * and vice versa.
  */
-static unsigned long vsmp_save_fl(void)
+asmlinkage __visible unsigned long vsmp_save_fl(void)
 {
 	unsigned long flags = native_save_fl();
 
@@ -43,7 +46,7 @@ static unsigned long vsmp_save_fl(void)
 }
 PV_CALLEE_SAVE_REGS_THUNK(vsmp_save_fl);
 
-static void vsmp_restore_fl(unsigned long flags)
+__visible void vsmp_restore_fl(unsigned long flags)
 {
 	if (flags & X86_EFLAGS_IF)
 		flags &= ~X86_EFLAGS_AC;
@@ -53,7 +56,7 @@ static void vsmp_restore_fl(unsigned long flags)
 }
 PV_CALLEE_SAVE_REGS_THUNK(vsmp_restore_fl);
 
-static void vsmp_irq_disable(void)
+asmlinkage __visible void vsmp_irq_disable(void)
 {
 	unsigned long flags = native_save_fl();
 
@@ -61,7 +64,7 @@ static void vsmp_irq_disable(void)
 }
 PV_CALLEE_SAVE_REGS_THUNK(vsmp_irq_disable);
 
-static void vsmp_irq_enable(void)
+asmlinkage __visible void vsmp_irq_enable(void)
 {
 	unsigned long flags = native_save_fl();
 
@@ -101,6 +104,10 @@ static void __init set_vsmp_pv_ops(void)
 #ifdef CONFIG_SMP
 	if (cap & ctl & BIT(8)) {
 		ctl &= ~BIT(8);
+
+		/* Interrupt routing set to ignore */
+		irq_routing_comply = 0;
+
 #ifdef CONFIG_PROC_FS
 		/* Don't let users change irq affinity via procfs */
 		no_irq_affinity = 1;
@@ -218,7 +225,9 @@ static void vsmp_apic_post_init(void)
 {
 	/* need to update phys_pkg_id */
 	apic->phys_pkg_id = apicid_phys_pkg_id;
-	apic->vector_allocation_domain = fill_vector_allocation_domain;
+
+	if (!irq_routing_comply)
+		apic->vector_allocation_domain = fill_vector_allocation_domain;
 }
 
 void __init vsmp_init(void)
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 1f96f9347ed..ea5b5709aa7 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -47,14 +47,12 @@
 #include <asm/segment.h>
 #include <asm/desc.h>
 #include <asm/topology.h>
-#include <asm/vgtod.h>
 #include <asm/traps.h>
 
 #define CREATE_TRACE_POINTS
 #include "vsyscall_trace.h"
 
 DEFINE_VVAR(int, vgetcpu_mode);
-DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data);
 
 static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE;
 
@@ -77,48 +75,6 @@ static int __init vsyscall_setup(char *str)
 }
 early_param("vsyscall", vsyscall_setup);
 
-void update_vsyscall_tz(void)
-{
-	vsyscall_gtod_data.sys_tz = sys_tz;
-}
-
-void update_vsyscall(struct timekeeper *tk)
-{
-	struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data;
-
-	write_seqcount_begin(&vdata->seq);
-
-	/* copy vsyscall data */
-	vdata->clock.vclock_mode	= tk->clock->archdata.vclock_mode;
-	vdata->clock.cycle_last		= tk->clock->cycle_last;
-	vdata->clock.mask		= tk->clock->mask;
-	vdata->clock.mult		= tk->mult;
-	vdata->clock.shift		= tk->shift;
-
-	vdata->wall_time_sec		= tk->xtime_sec;
-	vdata->wall_time_snsec		= tk->xtime_nsec;
-
-	vdata->monotonic_time_sec	= tk->xtime_sec
-					+ tk->wall_to_monotonic.tv_sec;
-	vdata->monotonic_time_snsec	= tk->xtime_nsec
-					+ (tk->wall_to_monotonic.tv_nsec
-						<< tk->shift);
-	while (vdata->monotonic_time_snsec >=
-					(((u64)NSEC_PER_SEC) << tk->shift)) {
-		vdata->monotonic_time_snsec -=
-					((u64)NSEC_PER_SEC) << tk->shift;
-		vdata->monotonic_time_sec++;
-	}
-
-	vdata->wall_time_coarse.tv_sec	= tk->xtime_sec;
-	vdata->wall_time_coarse.tv_nsec	= (long)(tk->xtime_nsec >> tk->shift);
-
-	vdata->monotonic_time_coarse	= timespec_add(vdata->wall_time_coarse,
-							tk->wall_to_monotonic);
-
-	write_seqcount_end(&vdata->seq);
-}
-
 static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
 			      const char *message)
 {
@@ -135,7 +91,7 @@ static int addr_to_vsyscall_nr(unsigned long addr)
 {
 	int nr;
 
-	if ((addr & ~0xC00UL) != VSYSCALL_START)
+	if ((addr & ~0xC00UL) != VSYSCALL_ADDR)
 		return -EINVAL;
 
 	nr = (addr & 0xC00UL) >> 10;
@@ -374,28 +330,24 @@ void __init map_vsyscall(void)
 {
 	extern char __vsyscall_page;
 	unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page);
-	extern char __vvar_page;
-	unsigned long physaddr_vvar_page = __pa_symbol(&__vvar_page);
 
-	__set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_vsyscall,
+	__set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall,
 		     vsyscall_mode == NATIVE
 		     ? PAGE_KERNEL_VSYSCALL
 		     : PAGE_KERNEL_VVAR);
-	BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_FIRST_PAGE) !=
-		     (unsigned long)VSYSCALL_START);
-
-	__set_fixmap(VVAR_PAGE, physaddr_vvar_page, PAGE_KERNEL_VVAR);
-	BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) !=
-		     (unsigned long)VVAR_ADDRESS);
+	BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=
+		     (unsigned long)VSYSCALL_ADDR);
 }
 
 static int __init vsyscall_init(void)
 {
-	BUG_ON(VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE));
+	cpu_notifier_register_begin();
 
 	on_each_cpu(cpu_vsyscall_init, NULL, 1);
 	/* notifier priority > KVM */
-	hotcpu_notifier(cpu_vsyscall_notifier, 30);
+	__hotcpu_notifier(cpu_vsyscall_notifier, 30);
+
+	cpu_notifier_register_done();
 	return 0;
 }
 
diff --git a/arch/x86/kernel/vsyscall_gtod.c b/arch/x86/kernel/vsyscall_gtod.c
new file mode 100644
index 00000000000..9531fbb123b
--- /dev/null
+++ b/arch/x86/kernel/vsyscall_gtod.c
@@ -0,0 +1,69 @@
+/*
+ *  Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
+ *  Copyright 2003 Andi Kleen, SuSE Labs.
+ *
+ *  Modified for x86 32 bit architecture by
+ *  Stefani Seibold <stefani@seibold.net>
+ *  sponsored by Rohde & Schwarz GmbH & Co. KG Munich/Germany
+ *
+ *  Thanks to hpa@transmeta.com for some useful hint.
+ *  Special thanks to Ingo Molnar for his early experience with
+ *  a different vsyscall implementation for Linux/IA32 and for the name.
+ *
+ */
+
+#include <linux/timekeeper_internal.h>
+#include <asm/vgtod.h>
+#include <asm/vvar.h>
+
+DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data);
+
+void update_vsyscall_tz(void)
+{
+	vsyscall_gtod_data.tz_minuteswest = sys_tz.tz_minuteswest;
+	vsyscall_gtod_data.tz_dsttime = sys_tz.tz_dsttime;
+}
+
+void update_vsyscall(struct timekeeper *tk)
+{
+	struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data;
+
+	gtod_write_begin(vdata);
+
+	/* copy vsyscall data */
+	vdata->vclock_mode	= tk->clock->archdata.vclock_mode;
+	vdata->cycle_last	= tk->clock->cycle_last;
+	vdata->mask		= tk->clock->mask;
+	vdata->mult		= tk->mult;
+	vdata->shift		= tk->shift;
+
+	vdata->wall_time_sec		= tk->xtime_sec;
+	vdata->wall_time_snsec		= tk->xtime_nsec;
+
+	vdata->monotonic_time_sec	= tk->xtime_sec
+					+ tk->wall_to_monotonic.tv_sec;
+	vdata->monotonic_time_snsec	= tk->xtime_nsec
+					+ ((u64)tk->wall_to_monotonic.tv_nsec
+						<< tk->shift);
+	while (vdata->monotonic_time_snsec >=
+					(((u64)NSEC_PER_SEC) << tk->shift)) {
+		vdata->monotonic_time_snsec -=
+					((u64)NSEC_PER_SEC) << tk->shift;
+		vdata->monotonic_time_sec++;
+	}
+
+	vdata->wall_time_coarse_sec	= tk->xtime_sec;
+	vdata->wall_time_coarse_nsec	= (long)(tk->xtime_nsec >> tk->shift);
+
+	vdata->monotonic_time_coarse_sec =
+		vdata->wall_time_coarse_sec + tk->wall_to_monotonic.tv_sec;
+	vdata->monotonic_time_coarse_nsec =
+		vdata->wall_time_coarse_nsec + tk->wall_to_monotonic.tv_nsec;
+
+	while (vdata->monotonic_time_coarse_nsec >= NSEC_PER_SEC) {
+		vdata->monotonic_time_coarse_nsec -= NSEC_PER_SEC;
+		vdata->monotonic_time_coarse_sec++;
+	}
+
+	gtod_write_end(vdata);
+}
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index b014d9414d0..040681928e9 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -66,3 +66,10 @@ EXPORT_SYMBOL(empty_zero_page);
 #ifndef CONFIG_PARAVIRT
 EXPORT_SYMBOL(native_load_gs_index);
 #endif
+
+#ifdef CONFIG_PREEMPT
+EXPORT_SYMBOL(___preempt_schedule);
+#ifdef CONFIG_CONTEXT_TRACKING
+EXPORT_SYMBOL(___preempt_schedule_context);
+#endif
+#endif
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 8ce0072cd70..e48b674639c 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -116,6 +116,8 @@ struct x86_msi_ops x86_msi = {
 	.teardown_msi_irqs	= default_teardown_msi_irqs,
 	.restore_msi_irqs	= default_restore_msi_irqs,
 	.setup_hpet_msi		= default_setup_hpet_msi,
+	.msi_mask_irq		= default_msi_mask_irq,
+	.msix_mask_irq		= default_msix_mask_irq,
 };
 
 /* MSI arch specific hooks */
@@ -134,9 +136,17 @@ void arch_teardown_msi_irq(unsigned int irq)
 	x86_msi.teardown_msi_irq(irq);
 }
 
-void arch_restore_msi_irqs(struct pci_dev *dev, int irq)
+void arch_restore_msi_irqs(struct pci_dev *dev)
 {
-	x86_msi.restore_msi_irqs(dev, irq);
+	x86_msi.restore_msi_irqs(dev);
+}
+u32 arch_msi_mask_irq(struct msi_desc *desc, u32 mask, u32 flag)
+{
+	return x86_msi.msi_mask_irq(desc, mask, flag);
+}
+u32 arch_msix_mask_irq(struct msi_desc *desc, u32 flag)
+{
+	return x86_msi.msix_mask_irq(desc, flag);
 }
 
 #endif
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 422fd822347..a4b451c6add 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -562,6 +562,16 @@ static void __init xstate_enable_boot_cpu(void)
 	if (cpu_has_xsaveopt && eagerfpu != DISABLE)
 		eagerfpu = ENABLE;
 
+	if (pcntxt_mask & XSTATE_EAGER) {
+		if (eagerfpu == DISABLE) {
+			pr_err("eagerfpu not present, disabling some xstate features: 0x%llx\n",
+					pcntxt_mask & XSTATE_EAGER);
+			pcntxt_mask &= ~XSTATE_EAGER;
+		} else {
+			eagerfpu = ENABLE;
+		}
+	}
+
 	pr_info("enabled xstate_bv 0x%llx, cntxt size 0x%x\n",
 		pcntxt_mask, xstate_size);
 }
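Editor's note: the uprobes.c hunks above move the old open-coded FIX_IP/FIX_CALL handling behind auprobe->ops->post_xol(), but the underlying idea is unchanged: after single-stepping the copied instruction in the XOL slot, any address that still points into the slot is shifted back onto the original instruction. The standalone user-space C sketch below only illustrates that arithmetic; the struct, flag values, helper name and addresses are invented for the example and are not the kernel's interfaces.

#include <stdio.h>
#include <stdint.h>

#define UPROBE_FIX_IP	0x01	/* illustrative flag values, not the kernel's */
#define UPROBE_FIX_CALL	0x02

struct fake_regs {
	uint64_t ip;		/* stand-in for regs->ip */
	uint64_t ret_addr;	/* stand-in for the word at regs->sp after a call */
};

/* Shift an address that still points into the XOL slot back onto the
 * original instruction at vaddr, the way the post-XOL fixups above do. */
static void post_xol_fixup(struct fake_regs *regs, uint64_t vaddr,
			   uint64_t xol_vaddr, unsigned int fixups)
{
	if (fixups & UPROBE_FIX_IP)
		regs->ip = regs->ip - xol_vaddr + vaddr;
	if (fixups & UPROBE_FIX_CALL)
		regs->ret_addr = regs->ret_addr - xol_vaddr + vaddr;
}

int main(void)
{
	uint64_t vaddr = 0x400100, xol = 0x7f0000;

	/* ordinary 4-byte insn: ip was left just past the copy in the slot */
	struct fake_regs r1 = { .ip = xol + 4, .ret_addr = 0 };
	post_xol_fixup(&r1, vaddr, xol, UPROBE_FIX_IP);
	printf("FIX_IP:   ip  = %#llx\n", (unsigned long long)r1.ip);	/* 0x400104 */

	/* 5-byte indirect call: ip already points at the callee, but the
	 * pushed return address still points into the XOL slot */
	struct fake_regs r2 = { .ip = 0x401000, .ret_addr = xol + 5 };
	post_xol_fixup(&r2, vaddr, xol, UPROBE_FIX_CALL);
	printf("FIX_CALL: ret = %#llx\n", (unsigned long long)r2.ret_addr);	/* 0x400105 */
	return 0;
}

In the patched kernel the same effect is reached through the per-instruction ->post_xol handlers selected in arch_uprobe_analyze_insn() above, rather than by this open-coded arithmetic.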

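Editor's note: in the new vsyscall_gtod.c above, update_vsyscall() stores each clock as a seconds field plus a nanosecond remainder (shifted by tk->shift for the fine-grained fields, plain nanoseconds for the coarse ones) and folds any overflow back into the seconds with a while loop. A minimal user-space C sketch of that carry step for the coarse fields, with invented names and example values, looks like this:

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC 1000000000ULL

/* Invented stand-in for the (sec, nsec) pairs kept in vsyscall_gtod_data. */
struct coarse_time {
	uint64_t sec;
	uint64_t nsec;	/* may temporarily hold more than one second */
};

/* Fold whole seconds out of the nanosecond remainder, like the while
 * loop over monotonic_time_coarse_nsec above. */
static void normalize_coarse(struct coarse_time *t)
{
	while (t->nsec >= NSEC_PER_SEC) {
		t->nsec -= NSEC_PER_SEC;
		t->sec++;
	}
}

int main(void)
{
	/* wall time 100.9s plus a 0.3s wall_to_monotonic offset */
	struct coarse_time mono = {
		.sec  = 100,
		.nsec = 900000000ULL + 300000000ULL,
	};

	normalize_coarse(&mono);
	printf("%llu.%09llu\n", (unsigned long long)mono.sec,
	       (unsigned long long)mono.nsec);	/* prints 101.200000000 */
	return 0;
}

For the non-coarse fields the kernel keeps the remainder left-shifted by tk->shift, so the same carry runs against ((u64)NSEC_PER_SEC << tk->shift) instead of NSEC_PER_SEC, as the first while loop in update_vsyscall() shows.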