diff options
Diffstat (limited to 'arch/x86/kernel')
177 files changed, 5791 insertions, 21381 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 2c833d8c414..250806472a7 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -36,18 +36,18 @@ obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time.o ioport.o ldt.o dumpstack.o obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o obj-$(CONFIG_IRQ_WORK) += irq_work.o -obj-$(CONFIG_X86_VISWS) += visws_quirks.o -obj-$(CONFIG_X86_32) += probe_roms_32.o +obj-y += probe_roms.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o obj-y += bootflag.o e820.o -obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o +obj-y += pci-dma.o quirks.o topology.o kdebugfs.o obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o obj-y += tsc.o io_delay.o rtc.o obj-y += pci-iommu_table.o +obj-y += resource.o -obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o +obj-y += trampoline.o trampoline_$(BITS).o obj-y += process.o obj-y += i387.o xsave.o obj-y += ptrace.o @@ -55,11 +55,12 @@ obj-$(CONFIG_X86_32) += tls.o obj-$(CONFIG_IA32_EMULATION) += tls.o obj-y += step.o obj-$(CONFIG_INTEL_TXT) += tboot.o +obj-$(CONFIG_ISA_DMA_API) += i8237.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += cpu/ obj-y += acpi/ -obj-$(CONFIG_SFI) += sfi.o obj-y += reboot.o +obj-$(CONFIG_X86_32) += reboot_32.o obj-$(CONFIG_MCA) += mca_32.o obj-$(CONFIG_X86_MSR) += msr.o obj-$(CONFIG_X86_CPUID) += cpuid.o @@ -67,10 +68,9 @@ obj-$(CONFIG_PCI) += early-quirks.o apm-y := apm_32.o obj-$(CONFIG_APM) += apm.o obj-$(CONFIG_SMP) += smp.o -obj-$(CONFIG_SMP) += smpboot.o tsc_sync.o +obj-$(CONFIG_SMP) += smpboot.o +obj-$(CONFIG_SMP) += tsc_sync.o obj-$(CONFIG_SMP) += setup_percpu.o -obj-$(CONFIG_X86_64_SMP) += tsc_sync.o -obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-y += apic/ obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o @@ -82,12 +82,10 @@ obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_MODULES) += module.o -obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_VM86) += vm86_32.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o -obj-$(CONFIG_EARLY_PRINTK_MRST) += early_printk_mrst.o obj-$(CONFIG_HPET_TIMER) += hpet.o obj-$(CONFIG_APB_TIMER) += apb_timer.o @@ -104,14 +102,6 @@ obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o -obj-$(CONFIG_SCx200) += scx200.o -scx200-y += scx200_32.o - -obj-$(CONFIG_OLPC) += olpc.o -obj-$(CONFIG_OLPC_XO1) += olpc-xo1.o -obj-$(CONFIG_OLPC_OPENFIRMWARE) += olpc_ofw.o -obj-$(CONFIG_X86_MRST) += mrst.o - microcode-y := microcode_core.o microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o microcode-$(CONFIG_MICROCODE_AMD) += microcode_amd.o @@ -120,14 +110,14 @@ obj-$(CONFIG_MICROCODE) += microcode.o obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o +obj-$(CONFIG_OF) += devicetree.o ### # 64 bit specific files ifeq ($(CONFIG_X86_64),y) - obj-$(CONFIG_X86_UV) += tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o uv_time.o obj-$(CONFIG_AUDIT) += audit_64.o - obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o + obj-$(CONFIG_GART_IOMMU) += amd_gart_64.o aperture_64.o obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o amd_iommu.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index c05872aa3ce..9a966c579af 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -72,6 +72,7 @@ u8 acpi_sci_flags __initdata; int acpi_sci_override_gsi __initdata; int acpi_skip_timer_override __initdata; int acpi_use_timer_override __initdata; +int acpi_fix_pin2_polarity __initdata; #ifdef CONFIG_X86_LOCAL_APIC static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; @@ -198,6 +199,11 @@ static void __cpuinit acpi_register_lapic(int id, u8 enabled) { unsigned int ver = 0; + if (id >= (MAX_LOCAL_APIC-1)) { + printk(KERN_INFO PREFIX "skipped apicid that is too big\n"); + return; + } + if (!enabled) { ++disabled_cpus; return; @@ -410,10 +416,15 @@ acpi_parse_int_src_ovr(struct acpi_subtable_header * header, return 0; } - if (acpi_skip_timer_override && - intsrc->source_irq == 0 && intsrc->global_irq == 2) { - printk(PREFIX "BIOS IRQ0 pin2 override ignored.\n"); - return 0; + if (intsrc->source_irq == 0 && intsrc->global_irq == 2) { + if (acpi_skip_timer_override) { + printk(PREFIX "BIOS IRQ0 pin2 override ignored.\n"); + return 0; + } + if (acpi_fix_pin2_polarity && (intsrc->inti_flags & ACPI_MADT_POLARITY_MASK)) { + intsrc->inti_flags &= ~ACPI_MADT_POLARITY_MASK; + printk(PREFIX "BIOS IRQ0 pin2 override: forcing polarity to high active.\n"); + } } mp_override_legacy_irq(intsrc->source_irq, @@ -504,6 +515,7 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq) return 0; } +EXPORT_SYMBOL_GPL(acpi_gsi_to_irq); int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi) { @@ -513,35 +525,62 @@ int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi) return 0; } -/* - * success: return IRQ number (>=0) - * failure: return < 0 - */ -int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) +static int acpi_register_gsi_pic(struct device *dev, u32 gsi, + int trigger, int polarity) { - unsigned int irq; - unsigned int plat_gsi = gsi; - #ifdef CONFIG_PCI /* * Make sure all (legacy) PCI IRQs are set as level-triggered. */ - if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) { - if (trigger == ACPI_LEVEL_SENSITIVE) - eisa_set_level_irq(gsi); - } + if (trigger == ACPI_LEVEL_SENSITIVE) + eisa_set_level_irq(gsi); #endif + return gsi; +} + +static int acpi_register_gsi_ioapic(struct device *dev, u32 gsi, + int trigger, int polarity) +{ #ifdef CONFIG_X86_IO_APIC - if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) { - plat_gsi = mp_register_gsi(dev, gsi, trigger, polarity); - } + gsi = mp_register_gsi(dev, gsi, trigger, polarity); #endif + + return gsi; +} + +int (*__acpi_register_gsi)(struct device *dev, u32 gsi, + int trigger, int polarity) = acpi_register_gsi_pic; + +/* + * success: return IRQ number (>=0) + * failure: return < 0 + */ +int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) +{ + unsigned int irq; + unsigned int plat_gsi = gsi; + + plat_gsi = (*__acpi_register_gsi)(dev, gsi, trigger, polarity); irq = gsi_to_irq(plat_gsi); return irq; } +void __init acpi_set_irq_model_pic(void) +{ + acpi_irq_model = ACPI_IRQ_MODEL_PIC; + __acpi_register_gsi = acpi_register_gsi_pic; + acpi_ioapic = 0; +} + +void __init acpi_set_irq_model_ioapic(void) +{ + acpi_irq_model = ACPI_IRQ_MODEL_IOAPIC; + __acpi_register_gsi = acpi_register_gsi_ioapic; + acpi_ioapic = 1; +} + /* * ACPI based hotplug support for CPU */ @@ -556,14 +595,8 @@ static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) nid = acpi_get_node(handle); if (nid == -1 || !node_online(nid)) return; -#ifdef CONFIG_X86_64 - apicid_to_node[physid] = nid; + set_apicid_to_node(physid, nid); numa_set_node(cpu, nid); -#else /* CONFIG_X86_32 */ - apicid_2_node[physid] = nid; - cpu_to_node_map[cpu] = nid; -#endif - #endif } @@ -820,18 +853,6 @@ static int __init acpi_parse_fadt(struct acpi_table_header *table) * returns 0 on success, < 0 on error */ -static void __init acpi_register_lapic_address(unsigned long address) -{ - mp_lapic_addr = address; - - set_fixmap_nocache(FIX_APIC_BASE, address); - if (boot_cpu_physical_apicid == -1U) { - boot_cpu_physical_apicid = read_apic_id(); - apic_version[boot_cpu_physical_apicid] = - GET_APIC_VERSION(apic_read(APIC_LVR)); - } -} - static int __init early_acpi_parse_madt_lapic_addr_ovr(void) { int count; @@ -853,7 +874,7 @@ static int __init early_acpi_parse_madt_lapic_addr_ovr(void) return count; } - acpi_register_lapic_address(acpi_lapic_addr); + register_lapic_address(acpi_lapic_addr); return count; } @@ -880,16 +901,16 @@ static int __init acpi_parse_madt_lapic_entries(void) return count; } - acpi_register_lapic_address(acpi_lapic_addr); + register_lapic_address(acpi_lapic_addr); count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_SAPIC, - acpi_parse_sapic, MAX_APICS); + acpi_parse_sapic, MAX_LOCAL_APIC); if (!count) { x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC, - acpi_parse_x2apic, MAX_APICS); + acpi_parse_x2apic, MAX_LOCAL_APIC); count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC, - acpi_parse_lapic, MAX_APICS); + acpi_parse_lapic, MAX_LOCAL_APIC); } if (!count && !x2count) { printk(KERN_ERR PREFIX "No LAPIC entries present\n"); @@ -922,32 +943,6 @@ static int __init acpi_parse_madt_lapic_entries(void) extern int es7000_plat; #endif -static void assign_to_mp_irq(struct mpc_intsrc *m, - struct mpc_intsrc *mp_irq) -{ - memcpy(mp_irq, m, sizeof(struct mpc_intsrc)); -} - -static int mp_irq_cmp(struct mpc_intsrc *mp_irq, - struct mpc_intsrc *m) -{ - return memcmp(mp_irq, m, sizeof(struct mpc_intsrc)); -} - -static void save_mp_irq(struct mpc_intsrc *m) -{ - int i; - - for (i = 0; i < mp_irq_entries; i++) { - if (!mp_irq_cmp(&mp_irqs[i], m)) - return; - } - - assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]); - if (++mp_irq_entries == MAX_IRQ_SOURCES) - panic("Max # of irq sources exceeded!!\n"); -} - void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi) { int ioapic; @@ -978,7 +973,7 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi) mp_irq.dstapic = mp_ioapics[ioapic].apicid; /* APIC ID */ mp_irq.dstirq = pin; /* INTIN# */ - save_mp_irq(&mp_irq); + mp_save_irq(&mp_irq); isa_irq_to_gsi[bus_irq] = gsi; } @@ -1053,7 +1048,7 @@ void __init mp_config_acpi_legacy_irqs(void) mp_irq.srcbusirq = i; /* Identity mapped */ mp_irq.dstirq = pin; - save_mp_irq(&mp_irq); + mp_save_irq(&mp_irq); } } @@ -1090,7 +1085,7 @@ static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger, mp_irq.dstapic = mp_ioapics[ioapic].apicid; mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi); - save_mp_irq(&mp_irq); + mp_save_irq(&mp_irq); #endif return 0; } @@ -1259,8 +1254,7 @@ static void __init acpi_process_madt(void) */ error = acpi_parse_madt_ioapic_entries(); if (!error) { - acpi_irq_model = ACPI_IRQ_MODEL_IOAPIC; - acpi_ioapic = 1; + acpi_set_irq_model_ioapic(); smp_found_config = 1; } diff --git a/arch/x86/kernel/acpi/realmode/wakeup.S b/arch/x86/kernel/acpi/realmode/wakeup.S index 28595d6df47..ead21b66311 100644 --- a/arch/x86/kernel/acpi/realmode/wakeup.S +++ b/arch/x86/kernel/acpi/realmode/wakeup.S @@ -6,11 +6,17 @@ #include <asm/page_types.h> #include <asm/pgtable_types.h> #include <asm/processor-flags.h> +#include "wakeup.h" .code16 - .section ".header", "a" + .section ".jump", "ax" + .globl _start +_start: + cli + jmp wakeup_code /* This should match the structure in wakeup.h */ + .section ".header", "a" .globl wakeup_header wakeup_header: video_mode: .short 0 /* Video mode number */ @@ -30,14 +36,11 @@ wakeup_jmp: .byte 0xea /* ljmpw */ wakeup_jmp_off: .word 3f wakeup_jmp_seg: .word 0 wakeup_gdt: .quad 0, 0, 0 -signature: .long 0x51ee1111 +signature: .long WAKEUP_HEADER_SIGNATURE .text - .globl _start .code16 wakeup_code: -_start: - cli cld /* Apparently some dimwit BIOS programmers don't know how to @@ -77,12 +80,12 @@ _start: /* Check header signature... */ movl signature, %eax - cmpl $0x51ee1111, %eax + cmpl $WAKEUP_HEADER_SIGNATURE, %eax jne bogus_real_magic /* Check we really have everything... */ movl end_signature, %eax - cmpl $0x65a22c82, %eax + cmpl $WAKEUP_END_SIGNATURE, %eax jne bogus_real_magic /* Call the C code */ @@ -147,3 +150,7 @@ wakeup_heap: wakeup_stack: .space 2048 wakeup_stack_end: + + .section ".signature","a" +end_signature: + .long WAKEUP_END_SIGNATURE diff --git a/arch/x86/kernel/acpi/realmode/wakeup.h b/arch/x86/kernel/acpi/realmode/wakeup.h index 69d38d0b2b6..e1828c07e79 100644 --- a/arch/x86/kernel/acpi/realmode/wakeup.h +++ b/arch/x86/kernel/acpi/realmode/wakeup.h @@ -35,7 +35,8 @@ struct wakeup_header { extern struct wakeup_header wakeup_header; #endif -#define HEADER_OFFSET 0x3f00 -#define WAKEUP_SIZE 0x4000 +#define WAKEUP_HEADER_OFFSET 8 +#define WAKEUP_HEADER_SIGNATURE 0x51ee1111 +#define WAKEUP_END_SIGNATURE 0x65a22c82 #endif /* ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H */ diff --git a/arch/x86/kernel/acpi/realmode/wakeup.lds.S b/arch/x86/kernel/acpi/realmode/wakeup.lds.S index 060fff8f5c5..d4f8010a5b1 100644 --- a/arch/x86/kernel/acpi/realmode/wakeup.lds.S +++ b/arch/x86/kernel/acpi/realmode/wakeup.lds.S @@ -13,9 +13,19 @@ ENTRY(_start) SECTIONS { . = 0; + .jump : { + *(.jump) + } = 0x90909090 + + . = WAKEUP_HEADER_OFFSET; + .header : { + *(.header) + } + + . = ALIGN(16); .text : { *(.text*) - } + } = 0x90909090 . = ALIGN(16); .rodata : { @@ -33,11 +43,6 @@ SECTIONS *(.data*) } - .signature : { - end_signature = .; - LONG(0x65a22c82) - } - . = ALIGN(16); .bss : { __bss_start = .; @@ -45,20 +50,13 @@ SECTIONS __bss_end = .; } - . = HEADER_OFFSET; - .header : { - *(.header) + .signature : { + *(.signature) } - . = ALIGN(16); _end = .; /DISCARD/ : { *(.note*) } - - /* - * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility: - */ - . = ASSERT(_end <= WAKEUP_SIZE, "Wakeup too big!"); } diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 69fd72aa559..18a857ba7a2 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -12,45 +12,34 @@ #include <linux/cpumask.h> #include <asm/segment.h> #include <asm/desc.h> - -#ifdef CONFIG_X86_32 #include <asm/pgtable.h> -#endif +#include <asm/cacheflush.h> #include "realmode/wakeup.h" #include "sleep.h" -unsigned long acpi_wakeup_address; unsigned long acpi_realmode_flags; -/* address in low memory of the wakeup routine. */ -static unsigned long acpi_realmode; - #if defined(CONFIG_SMP) && defined(CONFIG_64BIT) static char temp_stack[4096]; #endif /** - * acpi_save_state_mem - save kernel state + * acpi_suspend_lowlevel - save kernel state * * Create an identity mapped page table and copy the wakeup routine to * low memory. - * - * Note that this is too late to change acpi_wakeup_address. */ -int acpi_save_state_mem(void) +int acpi_suspend_lowlevel(void) { struct wakeup_header *header; + /* address in low memory of the wakeup routine. */ + char *acpi_realmode; - if (!acpi_realmode) { - printk(KERN_ERR "Could not allocate memory during boot, " - "S3 disabled\n"); - return -ENOMEM; - } - memcpy((void *)acpi_realmode, &wakeup_code_start, WAKEUP_SIZE); + acpi_realmode = TRAMPOLINE_SYM(acpi_wakeup_code); - header = (struct wakeup_header *)(acpi_realmode + HEADER_OFFSET); - if (header->signature != 0x51ee1111) { + header = (struct wakeup_header *)(acpi_realmode + WAKEUP_HEADER_OFFSET); + if (header->signature != WAKEUP_HEADER_SIGNATURE) { printk(KERN_ERR "wakeup header does not match\n"); return -EINVAL; } @@ -70,9 +59,7 @@ int acpi_save_state_mem(void) /* GDT[0]: GDT self-pointer */ header->wakeup_gdt[0] = (u64)(sizeof(header->wakeup_gdt) - 1) + - ((u64)(acpi_wakeup_address + - ((char *)&header->wakeup_gdt - (char *)acpi_realmode)) - << 16); + ((u64)__pa(&header->wakeup_gdt) << 16); /* GDT[1]: big real mode-like code segment */ header->wakeup_gdt[1] = GDT_ENTRY(0x809b, acpi_wakeup_address, 0xfffff); @@ -98,9 +85,9 @@ int acpi_save_state_mem(void) header->pmode_cr3 = (u32)__pa(&initial_page_table); saved_magic = 0x12345678; #else /* CONFIG_64BIT */ - header->trampoline_segment = setup_trampoline() >> 4; + header->trampoline_segment = trampoline_address() >> 4; #ifdef CONFIG_SMP - stack_start.sp = temp_stack + sizeof(temp_stack); + stack_start = (unsigned long)temp_stack + sizeof(temp_stack); early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(smp_processor_id()); initial_gs = per_cpu_offset(smp_processor_id()); @@ -109,47 +96,10 @@ int acpi_save_state_mem(void) saved_magic = 0x123456789abcdef0L; #endif /* CONFIG_64BIT */ + do_suspend_lowlevel(); return 0; } -/* - * acpi_restore_state - undo effects of acpi_save_state_mem - */ -void acpi_restore_state_mem(void) -{ -} - - -/** - * acpi_reserve_wakeup_memory - do _very_ early ACPI initialisation - * - * We allocate a page from the first 1MB of memory for the wakeup - * routine for when we come back from a sleep state. The - * runtime allocator allows specification of <16MB pages, but not - * <1MB pages. - */ -void __init acpi_reserve_wakeup_memory(void) -{ - phys_addr_t mem; - - if ((&wakeup_code_end - &wakeup_code_start) > WAKEUP_SIZE) { - printk(KERN_ERR - "ACPI: Wakeup code way too big, S3 disabled.\n"); - return; - } - - mem = memblock_find_in_range(0, 1<<20, WAKEUP_SIZE, PAGE_SIZE); - - if (mem == MEMBLOCK_ERROR) { - printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n"); - return; - } - acpi_realmode = (unsigned long) phys_to_virt(mem); - acpi_wakeup_address = mem; - memblock_x86_reserve_range(mem, mem + WAKEUP_SIZE, "ACPI WAKEUP"); -} - - static int __init acpi_sleep_setup(char *str) { while ((str != NULL) && (*str != '\0')) { @@ -162,11 +112,6 @@ static int __init acpi_sleep_setup(char *str) #ifdef CONFIG_HIBERNATION if (strncmp(str, "s4_nohwsig", 10) == 0) acpi_no_s4_hw_signature(); - if (strncmp(str, "s4_nonvs", 8) == 0) { - pr_warning("ACPI: acpi_sleep=s4_nonvs is deprecated, " - "please use acpi_sleep=nonvs instead"); - acpi_nvs_nosave(); - } #endif if (strncmp(str, "nonvs", 5) == 0) acpi_nvs_nosave(); diff --git a/arch/x86/kernel/acpi/sleep.h b/arch/x86/kernel/acpi/sleep.h index adbcbaa6f1d..416d4be13fe 100644 --- a/arch/x86/kernel/acpi/sleep.h +++ b/arch/x86/kernel/acpi/sleep.h @@ -4,13 +4,12 @@ #include <asm/trampoline.h> -extern char wakeup_code_start, wakeup_code_end; - extern unsigned long saved_video_mode; extern long saved_magic; extern int wakeup_pmode_return; -extern char swsusp_pg_dir[PAGE_SIZE]; extern unsigned long acpi_copy_wakeup_routine(unsigned long); extern void wakeup_long64(void); + +extern void do_suspend_lowlevel(void); diff --git a/arch/x86/kernel/acpi/wakeup_rm.S b/arch/x86/kernel/acpi/wakeup_rm.S index 6ff3b573057..63b8ab524f2 100644 --- a/arch/x86/kernel/acpi/wakeup_rm.S +++ b/arch/x86/kernel/acpi/wakeup_rm.S @@ -2,9 +2,11 @@ * Wrapper script for the realmode binary as a transport object * before copying to low memory. */ - .section ".rodata","a" - .globl wakeup_code_start, wakeup_code_end -wakeup_code_start: +#include <asm/page_types.h> + + .section ".x86_trampoline","a" + .balign PAGE_SIZE + .globl acpi_wakeup_code +acpi_wakeup_code: .incbin "arch/x86/kernel/acpi/realmode/wakeup.bin" -wakeup_code_end: - .size wakeup_code_start, .-wakeup_code_start + .size acpi_wakeup_code, .-acpi_wakeup_code diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index a36bb90aef5..a81f2d52f86 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -67,17 +67,30 @@ __setup("noreplace-paravirt", setup_noreplace_paravirt); #define DPRINTK(fmt, args...) if (debug_alternative) \ printk(KERN_DEBUG fmt, args) +/* + * Each GENERIC_NOPX is of X bytes, and defined as an array of bytes + * that correspond to that nop. Getting from one nop to the next, we + * add to the array the offset that is equal to the sum of all sizes of + * nops preceding the one we are after. + * + * Note: The GENERIC_NOP5_ATOMIC is at the end, as it breaks the + * nice symmetry of sizes of the previous nops. + */ #if defined(GENERIC_NOP1) && !defined(CONFIG_X86_64) -/* Use inline assembly to define this because the nops are defined - as inline assembly strings in the include files and we cannot - get them easily into strings. */ -asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nintelnops: " - GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4 GENERIC_NOP5 GENERIC_NOP6 - GENERIC_NOP7 GENERIC_NOP8 - "\t.previous"); -extern const unsigned char intelnops[]; -static const unsigned char *const __initconst_or_module -intel_nops[ASM_NOP_MAX+1] = { +static const unsigned char intelnops[] = +{ + GENERIC_NOP1, + GENERIC_NOP2, + GENERIC_NOP3, + GENERIC_NOP4, + GENERIC_NOP5, + GENERIC_NOP6, + GENERIC_NOP7, + GENERIC_NOP8, + GENERIC_NOP5_ATOMIC +}; +static const unsigned char * const intel_nops[ASM_NOP_MAX+2] = +{ NULL, intelnops, intelnops + 1, @@ -87,17 +100,25 @@ intel_nops[ASM_NOP_MAX+1] = { intelnops + 1 + 2 + 3 + 4 + 5, intelnops + 1 + 2 + 3 + 4 + 5 + 6, intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7, + intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8, }; #endif #ifdef K8_NOP1 -asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nk8nops: " - K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6 - K8_NOP7 K8_NOP8 - "\t.previous"); -extern const unsigned char k8nops[]; -static const unsigned char *const __initconst_or_module -k8_nops[ASM_NOP_MAX+1] = { +static const unsigned char k8nops[] = +{ + K8_NOP1, + K8_NOP2, + K8_NOP3, + K8_NOP4, + K8_NOP5, + K8_NOP6, + K8_NOP7, + K8_NOP8, + K8_NOP5_ATOMIC +}; +static const unsigned char * const k8_nops[ASM_NOP_MAX+2] = +{ NULL, k8nops, k8nops + 1, @@ -107,17 +128,25 @@ k8_nops[ASM_NOP_MAX+1] = { k8nops + 1 + 2 + 3 + 4 + 5, k8nops + 1 + 2 + 3 + 4 + 5 + 6, k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, + k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8, }; #endif #if defined(K7_NOP1) && !defined(CONFIG_X86_64) -asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nk7nops: " - K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6 - K7_NOP7 K7_NOP8 - "\t.previous"); -extern const unsigned char k7nops[]; -static const unsigned char *const __initconst_or_module -k7_nops[ASM_NOP_MAX+1] = { +static const unsigned char k7nops[] = +{ + K7_NOP1, + K7_NOP2, + K7_NOP3, + K7_NOP4, + K7_NOP5, + K7_NOP6, + K7_NOP7, + K7_NOP8, + K7_NOP5_ATOMIC +}; +static const unsigned char * const k7_nops[ASM_NOP_MAX+2] = +{ NULL, k7nops, k7nops + 1, @@ -127,17 +156,25 @@ k7_nops[ASM_NOP_MAX+1] = { k7nops + 1 + 2 + 3 + 4 + 5, k7nops + 1 + 2 + 3 + 4 + 5 + 6, k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, + k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8, }; #endif #ifdef P6_NOP1 -asm("\t" __stringify(__INITRODATA_OR_MODULE) "\np6nops: " - P6_NOP1 P6_NOP2 P6_NOP3 P6_NOP4 P6_NOP5 P6_NOP6 - P6_NOP7 P6_NOP8 - "\t.previous"); -extern const unsigned char p6nops[]; -static const unsigned char *const __initconst_or_module -p6_nops[ASM_NOP_MAX+1] = { +static const unsigned char __initconst_or_module p6nops[] = +{ + P6_NOP1, + P6_NOP2, + P6_NOP3, + P6_NOP4, + P6_NOP5, + P6_NOP6, + P6_NOP7, + P6_NOP8, + P6_NOP5_ATOMIC +}; +static const unsigned char * const p6_nops[ASM_NOP_MAX+2] = +{ NULL, p6nops, p6nops + 1, @@ -147,47 +184,65 @@ p6_nops[ASM_NOP_MAX+1] = { p6nops + 1 + 2 + 3 + 4 + 5, p6nops + 1 + 2 + 3 + 4 + 5 + 6, p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, + p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8, }; #endif +/* Initialize these to a safe default */ #ifdef CONFIG_X86_64 +const unsigned char * const *ideal_nops = p6_nops; +#else +const unsigned char * const *ideal_nops = intel_nops; +#endif -extern char __vsyscall_0; -static const unsigned char *const *__init_or_module find_nop_table(void) +void __init arch_init_ideal_nops(void) { - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && - boot_cpu_has(X86_FEATURE_NOPL)) - return p6_nops; - else - return k8_nops; -} - -#else /* CONFIG_X86_64 */ + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_INTEL: + /* + * Due to a decoder implementation quirk, some + * specific Intel CPUs actually perform better with + * the "k8_nops" than with the SDM-recommended NOPs. + */ + if (boot_cpu_data.x86 == 6 && + boot_cpu_data.x86_model >= 0x0f && + boot_cpu_data.x86_model != 0x1c && + boot_cpu_data.x86_model != 0x26 && + boot_cpu_data.x86_model != 0x27 && + boot_cpu_data.x86_model < 0x30) { + ideal_nops = k8_nops; + } else if (boot_cpu_has(X86_FEATURE_NOPL)) { + ideal_nops = p6_nops; + } else { +#ifdef CONFIG_X86_64 + ideal_nops = k8_nops; +#else + ideal_nops = intel_nops; +#endif + } -static const unsigned char *const *__init_or_module find_nop_table(void) -{ - if (boot_cpu_has(X86_FEATURE_K8)) - return k8_nops; - else if (boot_cpu_has(X86_FEATURE_K7)) - return k7_nops; - else if (boot_cpu_has(X86_FEATURE_NOPL)) - return p6_nops; - else - return intel_nops; + default: +#ifdef CONFIG_X86_64 + ideal_nops = k8_nops; +#else + if (boot_cpu_has(X86_FEATURE_K8)) + ideal_nops = k8_nops; + else if (boot_cpu_has(X86_FEATURE_K7)) + ideal_nops = k7_nops; + else + ideal_nops = intel_nops; +#endif + } } -#endif /* CONFIG_X86_64 */ - /* Use this to add nops to a buffer, then text_poke the whole buffer. */ static void __init_or_module add_nops(void *insns, unsigned int len) { - const unsigned char *const *noptable = find_nop_table(); - while (len > 0) { unsigned int noplen = len; if (noplen > ASM_NOP_MAX) noplen = ASM_NOP_MAX; - memcpy(insns, noptable[noplen], noplen); + memcpy(insns, ideal_nops[noplen], noplen); insns += noplen; len -= noplen; } @@ -195,11 +250,12 @@ static void __init_or_module add_nops(void *insns, unsigned int len) extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; extern s32 __smp_locks[], __smp_locks_end[]; +extern char __vsyscall_0; void *text_poke_early(void *addr, const void *opcode, size_t len); /* Replace instructions with better alternatives for this CPU type. This runs before SMP is initialized to avoid SMP problems with - self modifying code. This implies that assymetric systems where + self modifying code. This implies that asymmetric systems where APs have less capabilities than the boot processor are not handled. Tough. Make sure you disable such features by hand. */ @@ -210,6 +266,15 @@ void __init_or_module apply_alternatives(struct alt_instr *start, u8 insnbuf[MAX_PATCH_LEN]; DPRINTK("%s: alt table %p -> %p\n", __func__, start, end); + /* + * The scan order should be from start to end. A later scanned + * alternative code can overwrite a previous scanned alternative code. + * Some kernel functions (e.g. memcpy, memset, etc) use this order to + * patch code. + * + * So be careful if you want to change the scan order to any other + * order. + */ for (a = start; a < end; a++) { u8 *instr = a->instr; BUG_ON(a->replacementlen > a->instrlen); @@ -353,6 +418,7 @@ void __init_or_module alternatives_smp_module_del(struct module *mod) mutex_unlock(&smp_alt); } +bool skip_smp_alternatives; void alternatives_smp_switch(int smp) { struct smp_alt_module *mod; @@ -368,7 +434,7 @@ void alternatives_smp_switch(int smp) printk("lockdep: fixing up alternatives.\n"); #endif - if (noreplace_smp || smp_alt_once) + if (noreplace_smp || smp_alt_once || skip_smp_alternatives) return; BUG_ON(!smp && (num_online_cpus() > 1)); @@ -591,17 +657,21 @@ static atomic_t stop_machine_first; static int wrote_text; struct text_poke_params { - void *addr; - const void *opcode; - size_t len; + struct text_poke_param *params; + int nparams; }; static int __kprobes stop_machine_text_poke(void *data) { struct text_poke_params *tpp = data; + struct text_poke_param *p; + int i; if (atomic_dec_and_test(&stop_machine_first)) { - text_poke(tpp->addr, tpp->opcode, tpp->len); + for (i = 0; i < tpp->nparams; i++) { + p = &tpp->params[i]; + text_poke(p->addr, p->opcode, p->len); + } smp_wmb(); /* Make sure other cpus see that this has run */ wrote_text = 1; } else { @@ -610,8 +680,17 @@ static int __kprobes stop_machine_text_poke(void *data) smp_mb(); /* Load wrote_text before following execution */ } - flush_icache_range((unsigned long)tpp->addr, - (unsigned long)tpp->addr + tpp->len); + for (i = 0; i < tpp->nparams; i++) { + p = &tpp->params[i]; + flush_icache_range((unsigned long)p->addr, + (unsigned long)p->addr + p->len); + } + /* + * Intel Archiecture Software Developer's Manual section 7.1.3 specifies + * that a core serializing instruction such as "cpuid" should be + * executed on _each_ core before the new instruction is made visible. + */ + sync_core(); return 0; } @@ -631,78 +710,36 @@ static int __kprobes stop_machine_text_poke(void *data) void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len) { struct text_poke_params tpp; + struct text_poke_param p; - tpp.addr = addr; - tpp.opcode = opcode; - tpp.len = len; + p.addr = addr; + p.opcode = opcode; + p.len = len; + tpp.params = &p; + tpp.nparams = 1; atomic_set(&stop_machine_first, 1); wrote_text = 0; /* Use __stop_machine() because the caller already got online_cpus. */ - __stop_machine(stop_machine_text_poke, (void *)&tpp, NULL); + __stop_machine(stop_machine_text_poke, (void *)&tpp, cpu_online_mask); return addr; } -#if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL) - -unsigned char ideal_nop5[IDEAL_NOP_SIZE_5]; - -void __init arch_init_ideal_nop5(void) +/** + * text_poke_smp_batch - Update instructions on a live kernel on SMP + * @params: an array of text_poke parameters + * @n: the number of elements in params. + * + * Modify multi-byte instruction by using stop_machine() on SMP. Since the + * stop_machine() is heavy task, it is better to aggregate text_poke requests + * and do it once if possible. + * + * Note: Must be called under get_online_cpus() and text_mutex. + */ +void __kprobes text_poke_smp_batch(struct text_poke_param *params, int n) { - extern const unsigned char ftrace_test_p6nop[]; - extern const unsigned char ftrace_test_nop5[]; - extern const unsigned char ftrace_test_jmp[]; - int faulted = 0; - - /* - * There is no good nop for all x86 archs. - * We will default to using the P6_NOP5, but first we - * will test to make sure that the nop will actually - * work on this CPU. If it faults, we will then - * go to a lesser efficient 5 byte nop. If that fails - * we then just use a jmp as our nop. This isn't the most - * efficient nop, but we can not use a multi part nop - * since we would then risk being preempted in the middle - * of that nop, and if we enabled tracing then, it might - * cause a system crash. - * - * TODO: check the cpuid to determine the best nop. - */ - asm volatile ( - "ftrace_test_jmp:" - "jmp ftrace_test_p6nop\n" - "nop\n" - "nop\n" - "nop\n" /* 2 byte jmp + 3 bytes */ - "ftrace_test_p6nop:" - P6_NOP5 - "jmp 1f\n" - "ftrace_test_nop5:" - ".byte 0x66,0x66,0x66,0x66,0x90\n" - "1:" - ".section .fixup, \"ax\"\n" - "2: movl $1, %0\n" - " jmp ftrace_test_nop5\n" - "3: movl $2, %0\n" - " jmp 1b\n" - ".previous\n" - _ASM_EXTABLE(ftrace_test_p6nop, 2b) - _ASM_EXTABLE(ftrace_test_nop5, 3b) - : "=r"(faulted) : "0" (faulted)); - - switch (faulted) { - case 0: - pr_info("converting mcount calls to 0f 1f 44 00 00\n"); - memcpy(ideal_nop5, ftrace_test_p6nop, IDEAL_NOP_SIZE_5); - break; - case 1: - pr_info("converting mcount calls to 66 66 66 66 90\n"); - memcpy(ideal_nop5, ftrace_test_nop5, IDEAL_NOP_SIZE_5); - break; - case 2: - pr_info("converting mcount calls to jmp . + 5\n"); - memcpy(ideal_nop5, ftrace_test_jmp, IDEAL_NOP_SIZE_5); - break; - } + struct text_poke_params tpp = {.params = params, .nparams = n}; + atomic_set(&stop_machine_first, 1); + wrote_text = 0; + __stop_machine(stop_machine_text_poke, (void *)&tpp, NULL); } -#endif diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/amd_gart_64.c index ba0f0ca9f28..b117efd24f7 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -27,7 +27,7 @@ #include <linux/kdebug.h> #include <linux/scatterlist.h> #include <linux/iommu-helper.h> -#include <linux/sysdev.h> +#include <linux/syscore_ops.h> #include <linux/io.h> #include <linux/gfp.h> #include <asm/atomic.h> @@ -81,6 +81,9 @@ static u32 gart_unmapped_entry; #define AGPEXTERN #endif +/* GART can only remap to physical addresses < 1TB */ +#define GART_MAX_PHYS_ADDR (1ULL << 40) + /* backdoor interface to AGP driver */ AGPEXTERN int agp_memory_reserved; AGPEXTERN __u32 *agp_gatt_table; @@ -143,7 +146,7 @@ static void flush_gart(void) spin_lock_irqsave(&iommu_bitmap_lock, flags); if (need_flush) { - k8_flush_garts(); + amd_flush_garts(); need_flush = false; } spin_unlock_irqrestore(&iommu_bitmap_lock, flags); @@ -212,9 +215,13 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, size_t size, int dir, unsigned long align_mask) { unsigned long npages = iommu_num_pages(phys_mem, size, PAGE_SIZE); - unsigned long iommu_page = alloc_iommu(dev, npages, align_mask); + unsigned long iommu_page; int i; + if (unlikely(phys_mem + size > GART_MAX_PHYS_ADDR)) + return bad_dma_addr; + + iommu_page = alloc_iommu(dev, npages, align_mask); if (iommu_page == -1) { if (!nonforced_iommu(dev, phys_mem, size)) return phys_mem; @@ -561,17 +568,17 @@ static void enable_gart_translations(void) { int i; - if (!k8_northbridges.gart_supported) + if (!amd_nb_has_feature(AMD_NB_GART)) return; - for (i = 0; i < k8_northbridges.num; i++) { - struct pci_dev *dev = k8_northbridges.nb_misc[i]; + for (i = 0; i < amd_nb_num(); i++) { + struct pci_dev *dev = node_to_amd_nb(i)->misc; enable_gart_translation(dev, __pa(agp_gatt_table)); } /* Flush the GART-TLB to remove stale entries */ - k8_flush_garts(); + amd_flush_garts(); } /* @@ -589,20 +596,20 @@ void set_up_gart_resume(u32 aper_order, u32 aper_alloc) aperture_alloc = aper_alloc; } -static void gart_fixup_northbridges(struct sys_device *dev) +static void gart_fixup_northbridges(void) { int i; if (!fix_up_north_bridges) return; - if (!k8_northbridges.gart_supported) + if (!amd_nb_has_feature(AMD_NB_GART)) return; pr_info("PCI-DMA: Restoring GART aperture settings\n"); - for (i = 0; i < k8_northbridges.num; i++) { - struct pci_dev *dev = k8_northbridges.nb_misc[i]; + for (i = 0; i < amd_nb_num(); i++) { + struct pci_dev *dev = node_to_amd_nb(i)->misc; /* * Don't enable translations just yet. That is the next @@ -613,51 +620,38 @@ static void gart_fixup_northbridges(struct sys_device *dev) } } -static int gart_resume(struct sys_device *dev) +static void gart_resume(void) { pr_info("PCI-DMA: Resuming GART IOMMU\n"); - gart_fixup_northbridges(dev); + gart_fixup_northbridges(); enable_gart_translations(); - - return 0; } -static int gart_suspend(struct sys_device *dev, pm_message_t state) -{ - return 0; -} - -static struct sysdev_class gart_sysdev_class = { - .name = "gart", - .suspend = gart_suspend, +static struct syscore_ops gart_syscore_ops = { .resume = gart_resume, }; -static struct sys_device device_gart = { - .cls = &gart_sysdev_class, -}; - /* * Private Northbridge GATT initialization in case we cannot use the * AGP driver for some reason. */ -static __init int init_k8_gatt(struct agp_kern_info *info) +static __init int init_amd_gatt(struct agp_kern_info *info) { unsigned aper_size, gatt_size, new_aper_size; unsigned aper_base, new_aper_base; struct pci_dev *dev; void *gatt; - int i, error; + int i; pr_info("PCI-DMA: Disabling AGP.\n"); aper_size = aper_base = info->aper_size = 0; dev = NULL; - for (i = 0; i < k8_northbridges.num; i++) { - dev = k8_northbridges.nb_misc[i]; + for (i = 0; i < amd_nb_num(); i++) { + dev = node_to_amd_nb(i)->misc; new_aper_base = read_aperture(dev, &new_aper_size); if (!new_aper_base) goto nommu; @@ -685,12 +679,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info) agp_gatt_table = gatt; - error = sysdev_class_register(&gart_sysdev_class); - if (!error) - error = sysdev_register(&device_gart); - if (error) - panic("Could not register gart_sysdev -- " - "would corrupt data on next suspend"); + register_syscore_ops(&gart_syscore_ops); flush_gart(); @@ -725,13 +714,13 @@ static void gart_iommu_shutdown(void) if (!no_agp) return; - if (!k8_northbridges.gart_supported) + if (!amd_nb_has_feature(AMD_NB_GART)) return; - for (i = 0; i < k8_northbridges.num; i++) { + for (i = 0; i < amd_nb_num(); i++) { u32 ctl; - dev = k8_northbridges.nb_misc[i]; + dev = node_to_amd_nb(i)->misc; pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl); ctl &= ~GARTEN; @@ -749,14 +738,14 @@ int __init gart_iommu_init(void) unsigned long scratch; long i; - if (!k8_northbridges.gart_supported) + if (!amd_nb_has_feature(AMD_NB_GART)) return 0; #ifndef CONFIG_AGP_AMD64 no_agp = 1; #else /* Makefile puts PCI initialization via subsys_initcall first. */ - /* Add other K8 AGP bridge drivers here */ + /* Add other AMD AGP bridge drivers here */ no_agp = no_agp || (agp_amd64_init() < 0) || (agp_copy_info(agp_bridge, &info) < 0); @@ -765,7 +754,7 @@ int __init gart_iommu_init(void) if (no_iommu || (!force_iommu && max_pfn <= MAX_DMA32_PFN) || !gart_iommu_aperture || - (no_agp && init_k8_gatt(&info) < 0)) { + (no_agp && init_amd_gatt(&info) < 0)) { if (max_pfn > MAX_DMA32_PFN) { pr_warning("More than 4GB of memory but GART IOMMU not available.\n"); pr_warning("falling back to iommu=soft.\n"); diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index d2fdb0826df..873e7e1ead7 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -18,6 +18,7 @@ */ #include <linux/pci.h> +#include <linux/pci-ats.h> #include <linux/bitmap.h> #include <linux/slab.h> #include <linux/debugfs.h> @@ -25,6 +26,7 @@ #include <linux/dma-mapping.h> #include <linux/iommu-helper.h> #include <linux/iommu.h> +#include <linux/delay.h> #include <asm/proto.h> #include <asm/iommu.h> #include <asm/gart.h> @@ -34,7 +36,7 @@ #define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28)) -#define EXIT_LOOP_COUNT 10000000 +#define LOOP_TIMEOUT 100000 static DEFINE_RWLOCK(amd_iommu_devtable_lock); @@ -57,7 +59,6 @@ struct iommu_cmd { u32 data[4]; }; -static void reset_iommu_command_buffer(struct amd_iommu *iommu); static void update_domain(struct protection_domain *domain); /**************************************************************************** @@ -322,8 +323,6 @@ static void iommu_print_event(struct amd_iommu *iommu, void *__evt) break; case EVENT_TYPE_ILL_CMD: printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address); - iommu->reset_in_progress = true; - reset_iommu_command_buffer(iommu); dump_command(address); break; case EVENT_TYPE_CMD_HARD_ERR: @@ -367,7 +366,7 @@ static void iommu_poll_events(struct amd_iommu *iommu) spin_unlock_irqrestore(&iommu->lock, flags); } -irqreturn_t amd_iommu_int_handler(int irq, void *data) +irqreturn_t amd_iommu_int_thread(int irq, void *data) { struct amd_iommu *iommu; @@ -377,192 +376,300 @@ irqreturn_t amd_iommu_int_handler(int irq, void *data) return IRQ_HANDLED; } +irqreturn_t amd_iommu_int_handler(int irq, void *data) +{ + return IRQ_WAKE_THREAD; +} + /**************************************************************************** * * IOMMU command queuing functions * ****************************************************************************/ -/* - * Writes the command to the IOMMUs command buffer and informs the - * hardware about the new command. Must be called with iommu->lock held. - */ -static int __iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) +static int wait_on_sem(volatile u64 *sem) +{ + int i = 0; + + while (*sem == 0 && i < LOOP_TIMEOUT) { + udelay(1); + i += 1; + } + + if (i == LOOP_TIMEOUT) { + pr_alert("AMD-Vi: Completion-Wait loop timed out\n"); + return -EIO; + } + + return 0; +} + +static void copy_cmd_to_buffer(struct amd_iommu *iommu, + struct iommu_cmd *cmd, + u32 tail) { - u32 tail, head; u8 *target; - WARN_ON(iommu->cmd_buf_size & CMD_BUFFER_UNINITIALIZED); - tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); target = iommu->cmd_buf + tail; - memcpy_toio(target, cmd, sizeof(*cmd)); - tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size; - head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); - if (tail == head) - return -ENOMEM; + tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size; + + /* Copy command to buffer */ + memcpy(target, cmd, sizeof(*cmd)); + + /* Tell the IOMMU about it */ writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); +} - return 0; +static void build_completion_wait(struct iommu_cmd *cmd, u64 address) +{ + WARN_ON(address & 0x7ULL); + + memset(cmd, 0, sizeof(*cmd)); + cmd->data[0] = lower_32_bits(__pa(address)) | CMD_COMPL_WAIT_STORE_MASK; + cmd->data[1] = upper_32_bits(__pa(address)); + cmd->data[2] = 1; + CMD_SET_TYPE(cmd, CMD_COMPL_WAIT); +} + +static void build_inv_dte(struct iommu_cmd *cmd, u16 devid) +{ + memset(cmd, 0, sizeof(*cmd)); + cmd->data[0] = devid; + CMD_SET_TYPE(cmd, CMD_INV_DEV_ENTRY); +} + +static void build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address, + size_t size, u16 domid, int pde) +{ + u64 pages; + int s; + + pages = iommu_num_pages(address, size, PAGE_SIZE); + s = 0; + + if (pages > 1) { + /* + * If we have to flush more than one page, flush all + * TLB entries for this domain + */ + address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; + s = 1; + } + + address &= PAGE_MASK; + + memset(cmd, 0, sizeof(*cmd)); + cmd->data[1] |= domid; + cmd->data[2] = lower_32_bits(address); + cmd->data[3] = upper_32_bits(address); + CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES); + if (s) /* size bit - we flush more than one 4kb page */ + cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK; + if (pde) /* PDE bit - we wan't flush everything not only the PTEs */ + cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK; +} + +static void build_inv_iotlb_pages(struct iommu_cmd *cmd, u16 devid, int qdep, + u64 address, size_t size) +{ + u64 pages; + int s; + + pages = iommu_num_pages(address, size, PAGE_SIZE); + s = 0; + + if (pages > 1) { + /* + * If we have to flush more than one page, flush all + * TLB entries for this domain + */ + address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; + s = 1; + } + + address &= PAGE_MASK; + + memset(cmd, 0, sizeof(*cmd)); + cmd->data[0] = devid; + cmd->data[0] |= (qdep & 0xff) << 24; + cmd->data[1] = devid; + cmd->data[2] = lower_32_bits(address); + cmd->data[3] = upper_32_bits(address); + CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES); + if (s) + cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK; +} + +static void build_inv_all(struct iommu_cmd *cmd) +{ + memset(cmd, 0, sizeof(*cmd)); + CMD_SET_TYPE(cmd, CMD_INV_ALL); } /* - * General queuing function for commands. Takes iommu->lock and calls - * __iommu_queue_command(). + * Writes the command to the IOMMUs command buffer and informs the + * hardware about the new command. */ static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) { + u32 left, tail, head, next_tail; unsigned long flags; - int ret; + WARN_ON(iommu->cmd_buf_size & CMD_BUFFER_UNINITIALIZED); + +again: spin_lock_irqsave(&iommu->lock, flags); - ret = __iommu_queue_command(iommu, cmd); - if (!ret) - iommu->need_sync = true; - spin_unlock_irqrestore(&iommu->lock, flags); - return ret; -} + head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); + tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); + next_tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size; + left = (head - next_tail) % iommu->cmd_buf_size; -/* - * This function waits until an IOMMU has completed a completion - * wait command - */ -static void __iommu_wait_for_completion(struct amd_iommu *iommu) -{ - int ready = 0; - unsigned status = 0; - unsigned long i = 0; + if (left <= 2) { + struct iommu_cmd sync_cmd; + volatile u64 sem = 0; + int ret; - INC_STATS_COUNTER(compl_wait); + build_completion_wait(&sync_cmd, (u64)&sem); + copy_cmd_to_buffer(iommu, &sync_cmd, tail); - while (!ready && (i < EXIT_LOOP_COUNT)) { - ++i; - /* wait for the bit to become one */ - status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET); - ready = status & MMIO_STATUS_COM_WAIT_INT_MASK; + spin_unlock_irqrestore(&iommu->lock, flags); + + if ((ret = wait_on_sem(&sem)) != 0) + return ret; + + goto again; } - /* set bit back to zero */ - status &= ~MMIO_STATUS_COM_WAIT_INT_MASK; - writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET); + copy_cmd_to_buffer(iommu, cmd, tail); + + /* We need to sync now to make sure all commands are processed */ + iommu->need_sync = true; + + spin_unlock_irqrestore(&iommu->lock, flags); - if (unlikely(i == EXIT_LOOP_COUNT)) - iommu->reset_in_progress = true; + return 0; } /* * This function queues a completion wait command into the command * buffer of an IOMMU */ -static int __iommu_completion_wait(struct amd_iommu *iommu) +static int iommu_completion_wait(struct amd_iommu *iommu) { struct iommu_cmd cmd; + volatile u64 sem = 0; + int ret; - memset(&cmd, 0, sizeof(cmd)); - cmd.data[0] = CMD_COMPL_WAIT_INT_MASK; - CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT); + if (!iommu->need_sync) + return 0; - return __iommu_queue_command(iommu, &cmd); + build_completion_wait(&cmd, (u64)&sem); + + ret = iommu_queue_command(iommu, &cmd); + if (ret) + return ret; + + return wait_on_sem(&sem); } -/* - * This function is called whenever we need to ensure that the IOMMU has - * completed execution of all commands we sent. It sends a - * COMPLETION_WAIT command and waits for it to finish. The IOMMU informs - * us about that by writing a value to a physical address we pass with - * the command. - */ -static int iommu_completion_wait(struct amd_iommu *iommu) +static int iommu_flush_dte(struct amd_iommu *iommu, u16 devid) { - int ret = 0; - unsigned long flags; + struct iommu_cmd cmd; - spin_lock_irqsave(&iommu->lock, flags); + build_inv_dte(&cmd, devid); - if (!iommu->need_sync) - goto out; + return iommu_queue_command(iommu, &cmd); +} - ret = __iommu_completion_wait(iommu); +static void iommu_flush_dte_all(struct amd_iommu *iommu) +{ + u32 devid; - iommu->need_sync = false; + for (devid = 0; devid <= 0xffff; ++devid) + iommu_flush_dte(iommu, devid); - if (ret) - goto out; - - __iommu_wait_for_completion(iommu); + iommu_completion_wait(iommu); +} -out: - spin_unlock_irqrestore(&iommu->lock, flags); +/* + * This function uses heavy locking and may disable irqs for some time. But + * this is no issue because it is only called during resume. + */ +static void iommu_flush_tlb_all(struct amd_iommu *iommu) +{ + u32 dom_id; - if (iommu->reset_in_progress) - reset_iommu_command_buffer(iommu); + for (dom_id = 0; dom_id <= 0xffff; ++dom_id) { + struct iommu_cmd cmd; + build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, + dom_id, 1); + iommu_queue_command(iommu, &cmd); + } - return 0; + iommu_completion_wait(iommu); } -static void iommu_flush_complete(struct protection_domain *domain) +static void iommu_flush_all(struct amd_iommu *iommu) { - int i; + struct iommu_cmd cmd; - for (i = 0; i < amd_iommus_present; ++i) { - if (!domain->dev_iommu[i]) - continue; + build_inv_all(&cmd); - /* - * Devices of this domain are behind this IOMMU - * We need to wait for completion of all commands. - */ - iommu_completion_wait(amd_iommus[i]); + iommu_queue_command(iommu, &cmd); + iommu_completion_wait(iommu); +} + +void iommu_flush_all_caches(struct amd_iommu *iommu) +{ + if (iommu_feature(iommu, FEATURE_IA)) { + iommu_flush_all(iommu); + } else { + iommu_flush_dte_all(iommu); + iommu_flush_tlb_all(iommu); } } /* - * Command send function for invalidating a device table entry + * Command send function for flushing on-device TLB */ -static int iommu_flush_device(struct device *dev) +static int device_flush_iotlb(struct device *dev, u64 address, size_t size) { + struct pci_dev *pdev = to_pci_dev(dev); struct amd_iommu *iommu; struct iommu_cmd cmd; u16 devid; + int qdep; + qdep = pci_ats_queue_depth(pdev); devid = get_device_id(dev); iommu = amd_iommu_rlookup_table[devid]; - /* Build command */ - memset(&cmd, 0, sizeof(cmd)); - CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY); - cmd.data[0] = devid; + build_inv_iotlb_pages(&cmd, devid, qdep, address, size); return iommu_queue_command(iommu, &cmd); } -static void __iommu_build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address, - u16 domid, int pde, int s) -{ - memset(cmd, 0, sizeof(*cmd)); - address &= PAGE_MASK; - CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES); - cmd->data[1] |= domid; - cmd->data[2] = lower_32_bits(address); - cmd->data[3] = upper_32_bits(address); - if (s) /* size bit - we flush more than one 4kb page */ - cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK; - if (pde) /* PDE bit - we wan't flush everything not only the PTEs */ - cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK; -} - /* - * Generic command send function for invalidaing TLB entries + * Command send function for invalidating a device table entry */ -static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu, - u64 address, u16 domid, int pde, int s) +static int device_flush_dte(struct device *dev) { - struct iommu_cmd cmd; + struct amd_iommu *iommu; + struct pci_dev *pdev; + u16 devid; int ret; - __iommu_build_inv_iommu_pages(&cmd, address, domid, pde, s); + pdev = to_pci_dev(dev); + devid = get_device_id(dev); + iommu = amd_iommu_rlookup_table[devid]; - ret = iommu_queue_command(iommu, &cmd); + ret = iommu_flush_dte(iommu, devid); + if (ret) + return ret; + + if (pci_ats_enabled(pdev)) + ret = device_flush_iotlb(dev, 0, ~0UL); return ret; } @@ -572,23 +679,14 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu, * It invalidates a single PTE if the range to flush is within a single * page. Otherwise it flushes the whole TLB of the IOMMU. */ -static void __iommu_flush_pages(struct protection_domain *domain, - u64 address, size_t size, int pde) +static void __domain_flush_pages(struct protection_domain *domain, + u64 address, size_t size, int pde) { - int s = 0, i; - unsigned long pages = iommu_num_pages(address, size, PAGE_SIZE); - - address &= PAGE_MASK; - - if (pages > 1) { - /* - * If we have to flush more than one page, flush all - * TLB entries for this domain - */ - address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; - s = 1; - } + struct iommu_dev_data *dev_data; + struct iommu_cmd cmd; + int ret = 0, i; + build_inv_iommu_pages(&cmd, address, size, domain->id, pde); for (i = 0; i < amd_iommus_present; ++i) { if (!domain->dev_iommu[i]) @@ -598,101 +696,70 @@ static void __iommu_flush_pages(struct protection_domain *domain, * Devices of this domain are behind this IOMMU * We need a TLB flush */ - iommu_queue_inv_iommu_pages(amd_iommus[i], address, - domain->id, pde, s); + ret |= iommu_queue_command(amd_iommus[i], &cmd); + } + + list_for_each_entry(dev_data, &domain->dev_list, list) { + struct pci_dev *pdev = to_pci_dev(dev_data->dev); + + if (!pci_ats_enabled(pdev)) + continue; + + ret |= device_flush_iotlb(dev_data->dev, address, size); } - return; + WARN_ON(ret); } -static void iommu_flush_pages(struct protection_domain *domain, - u64 address, size_t size) +static void domain_flush_pages(struct protection_domain *domain, + u64 address, size_t size) { - __iommu_flush_pages(domain, address, size, 0); + __domain_flush_pages(domain, address, size, 0); } /* Flush the whole IO/TLB for a given protection domain */ -static void iommu_flush_tlb(struct protection_domain *domain) +static void domain_flush_tlb(struct protection_domain *domain) { - __iommu_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 0); + __domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 0); } /* Flush the whole IO/TLB for a given protection domain - including PDE */ -static void iommu_flush_tlb_pde(struct protection_domain *domain) +static void domain_flush_tlb_pde(struct protection_domain *domain) { - __iommu_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1); -} - - -/* - * This function flushes the DTEs for all devices in domain - */ -static void iommu_flush_domain_devices(struct protection_domain *domain) -{ - struct iommu_dev_data *dev_data; - unsigned long flags; - - spin_lock_irqsave(&domain->lock, flags); - - list_for_each_entry(dev_data, &domain->dev_list, list) - iommu_flush_device(dev_data->dev); - - spin_unlock_irqrestore(&domain->lock, flags); + __domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1); } -static void iommu_flush_all_domain_devices(void) +static void domain_flush_complete(struct protection_domain *domain) { - struct protection_domain *domain; - unsigned long flags; + int i; - spin_lock_irqsave(&amd_iommu_pd_lock, flags); + for (i = 0; i < amd_iommus_present; ++i) { + if (!domain->dev_iommu[i]) + continue; - list_for_each_entry(domain, &amd_iommu_pd_list, list) { - iommu_flush_domain_devices(domain); - iommu_flush_complete(domain); + /* + * Devices of this domain are behind this IOMMU + * We need to wait for completion of all commands. + */ + iommu_completion_wait(amd_iommus[i]); } - - spin_unlock_irqrestore(&amd_iommu_pd_lock, flags); } -void amd_iommu_flush_all_devices(void) -{ - iommu_flush_all_domain_devices(); -} /* - * This function uses heavy locking and may disable irqs for some time. But - * this is no issue because it is only called during resume. + * This function flushes the DTEs for all devices in domain */ -void amd_iommu_flush_all_domains(void) +static void domain_flush_devices(struct protection_domain *domain) { - struct protection_domain *domain; + struct iommu_dev_data *dev_data; unsigned long flags; - spin_lock_irqsave(&amd_iommu_pd_lock, flags); - - list_for_each_entry(domain, &amd_iommu_pd_list, list) { - spin_lock(&domain->lock); - iommu_flush_tlb_pde(domain); - iommu_flush_complete(domain); - spin_unlock(&domain->lock); - } - - spin_unlock_irqrestore(&amd_iommu_pd_lock, flags); -} - -static void reset_iommu_command_buffer(struct amd_iommu *iommu) -{ - pr_err("AMD-Vi: Resetting IOMMU command buffer\n"); - - if (iommu->reset_in_progress) - panic("AMD-Vi: ILLEGAL_COMMAND_ERROR while resetting command buffer\n"); + spin_lock_irqsave(&domain->lock, flags); - amd_iommu_reset_cmd_buffer(iommu); - amd_iommu_flush_all_devices(); - amd_iommu_flush_all_domains(); + list_for_each_entry(dev_data, &domain->dev_list, list) + device_flush_dte(dev_data->dev); - iommu->reset_in_progress = false; + spin_unlock_irqrestore(&domain->lock, flags); } /**************************************************************************** @@ -1086,7 +1153,7 @@ static int alloc_new_range(struct dma_ops_domain *dma_dom, dma_dom->aperture_size += APERTURE_RANGE_SIZE; - /* Intialize the exclusion range if necessary */ + /* Initialize the exclusion range if necessary */ for_each_iommu(iommu) { if (iommu->exclusion_start && iommu->exclusion_start >= dma_dom->aperture[index]->offset @@ -1353,7 +1420,7 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom) /* * Allocates a new protection domain usable for the dma_ops functions. - * It also intializes the page table and the address allocator data + * It also initializes the page table and the address allocator data * structures required for the dma_ops interface */ static struct dma_ops_domain *dma_ops_domain_alloc(void) @@ -1410,17 +1477,22 @@ static bool dma_ops_domain(struct protection_domain *domain) return domain->flags & PD_DMA_OPS_MASK; } -static void set_dte_entry(u16 devid, struct protection_domain *domain) +static void set_dte_entry(u16 devid, struct protection_domain *domain, bool ats) { u64 pte_root = virt_to_phys(domain->pt_root); + u32 flags = 0; pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK) << DEV_ENTRY_MODE_SHIFT; pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV; - amd_iommu_dev_table[devid].data[2] = domain->id; - amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root); - amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root); + if (ats) + flags |= DTE_FLAG_IOTLB; + + amd_iommu_dev_table[devid].data[3] |= flags; + amd_iommu_dev_table[devid].data[2] = domain->id; + amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root); + amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root); } static void clear_dte_entry(u16 devid) @@ -1437,34 +1509,42 @@ static void do_attach(struct device *dev, struct protection_domain *domain) { struct iommu_dev_data *dev_data; struct amd_iommu *iommu; + struct pci_dev *pdev; + bool ats = false; u16 devid; devid = get_device_id(dev); iommu = amd_iommu_rlookup_table[devid]; dev_data = get_dev_data(dev); + pdev = to_pci_dev(dev); + + if (amd_iommu_iotlb_sup) + ats = pci_ats_enabled(pdev); /* Update data structures */ dev_data->domain = domain; list_add(&dev_data->list, &domain->dev_list); - set_dte_entry(devid, domain); + set_dte_entry(devid, domain, ats); /* Do reference counting */ domain->dev_iommu[iommu->index] += 1; domain->dev_cnt += 1; /* Flush the DTE entry */ - iommu_flush_device(dev); + device_flush_dte(dev); } static void do_detach(struct device *dev) { struct iommu_dev_data *dev_data; struct amd_iommu *iommu; + struct pci_dev *pdev; u16 devid; devid = get_device_id(dev); iommu = amd_iommu_rlookup_table[devid]; dev_data = get_dev_data(dev); + pdev = to_pci_dev(dev); /* decrease reference counters */ dev_data->domain->dev_iommu[iommu->index] -= 1; @@ -1476,7 +1556,7 @@ static void do_detach(struct device *dev) clear_dte_entry(devid); /* Flush the DTE entry */ - iommu_flush_device(dev); + device_flush_dte(dev); } /* @@ -1539,9 +1619,13 @@ out_unlock: static int attach_device(struct device *dev, struct protection_domain *domain) { + struct pci_dev *pdev = to_pci_dev(dev); unsigned long flags; int ret; + if (amd_iommu_iotlb_sup) + pci_enable_ats(pdev, PAGE_SHIFT); + write_lock_irqsave(&amd_iommu_devtable_lock, flags); ret = __attach_device(dev, domain); write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); @@ -1551,7 +1635,7 @@ static int attach_device(struct device *dev, * left the caches in the IOMMU dirty. So we have to flush * here to evict all dirty stuff. */ - iommu_flush_tlb_pde(domain); + domain_flush_tlb_pde(domain); return ret; } @@ -1598,12 +1682,16 @@ static void __detach_device(struct device *dev) */ static void detach_device(struct device *dev) { + struct pci_dev *pdev = to_pci_dev(dev); unsigned long flags; /* lock device table */ write_lock_irqsave(&amd_iommu_devtable_lock, flags); __detach_device(dev); write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); + + if (amd_iommu_iotlb_sup && pci_ats_enabled(pdev)) + pci_disable_ats(pdev); } /* @@ -1692,7 +1780,7 @@ static int device_change_notifier(struct notifier_block *nb, goto out; } - iommu_flush_device(dev); + device_flush_dte(dev); iommu_completion_wait(iommu); out: @@ -1753,8 +1841,9 @@ static void update_device_table(struct protection_domain *domain) struct iommu_dev_data *dev_data; list_for_each_entry(dev_data, &domain->dev_list, list) { + struct pci_dev *pdev = to_pci_dev(dev_data->dev); u16 devid = get_device_id(dev_data->dev); - set_dte_entry(devid, domain); + set_dte_entry(devid, domain, pci_ats_enabled(pdev)); } } @@ -1764,8 +1853,9 @@ static void update_domain(struct protection_domain *domain) return; update_device_table(domain); - iommu_flush_domain_devices(domain); - iommu_flush_tlb_pde(domain); + + domain_flush_devices(domain); + domain_flush_tlb_pde(domain); domain->updated = false; } @@ -1924,10 +2014,10 @@ retry: ADD_STATS_COUNTER(alloced_io_mem, size); if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) { - iommu_flush_tlb(&dma_dom->domain); + domain_flush_tlb(&dma_dom->domain); dma_dom->need_flush = false; } else if (unlikely(amd_iommu_np_cache)) - iommu_flush_pages(&dma_dom->domain, address, size); + domain_flush_pages(&dma_dom->domain, address, size); out: return address; @@ -1976,7 +2066,7 @@ static void __unmap_single(struct dma_ops_domain *dma_dom, dma_ops_free_addresses(dma_dom, dma_addr, pages); if (amd_iommu_unmap_flush || dma_dom->need_flush) { - iommu_flush_pages(&dma_dom->domain, flush_addr, size); + domain_flush_pages(&dma_dom->domain, flush_addr, size); dma_dom->need_flush = false; } } @@ -2012,7 +2102,7 @@ static dma_addr_t map_page(struct device *dev, struct page *page, if (addr == DMA_ERROR_CODE) goto out; - iommu_flush_complete(domain); + domain_flush_complete(domain); out: spin_unlock_irqrestore(&domain->lock, flags); @@ -2039,7 +2129,7 @@ static void unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size, __unmap_single(domain->priv, dma_addr, size, dir); - iommu_flush_complete(domain); + domain_flush_complete(domain); spin_unlock_irqrestore(&domain->lock, flags); } @@ -2104,7 +2194,7 @@ static int map_sg(struct device *dev, struct scatterlist *sglist, goto unmap; } - iommu_flush_complete(domain); + domain_flush_complete(domain); out: spin_unlock_irqrestore(&domain->lock, flags); @@ -2150,7 +2240,7 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist, s->dma_address = s->dma_length = 0; } - iommu_flush_complete(domain); + domain_flush_complete(domain); spin_unlock_irqrestore(&domain->lock, flags); } @@ -2200,7 +2290,7 @@ static void *alloc_coherent(struct device *dev, size_t size, goto out_free; } - iommu_flush_complete(domain); + domain_flush_complete(domain); spin_unlock_irqrestore(&domain->lock, flags); @@ -2232,7 +2322,7 @@ static void free_coherent(struct device *dev, size_t size, __unmap_single(domain->priv, dma_addr, size, DMA_BIDIRECTIONAL); - iommu_flush_complete(domain); + domain_flush_complete(domain); spin_unlock_irqrestore(&domain->lock, flags); @@ -2476,7 +2566,7 @@ static void amd_iommu_detach_device(struct iommu_domain *dom, if (!iommu) return; - iommu_flush_device(dev); + device_flush_dte(dev); iommu_completion_wait(iommu); } @@ -2542,7 +2632,7 @@ static int amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova, unmap_size = iommu_unmap_page(domain, iova, page_size); mutex_unlock(&domain->api_lock); - iommu_flush_tlb_pde(domain); + domain_flush_tlb_pde(domain); return get_order(unmap_size); } diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 6e11c813415..9179c21120a 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -21,7 +21,7 @@ #include <linux/acpi.h> #include <linux/list.h> #include <linux/slab.h> -#include <linux/sysdev.h> +#include <linux/syscore_ops.h> #include <linux/interrupt.h> #include <linux/msi.h> #include <asm/pci-direct.h> @@ -137,6 +137,7 @@ int amd_iommus_present; /* IOMMUs have a non-present cache? */ bool amd_iommu_np_cache __read_mostly; +bool amd_iommu_iotlb_sup __read_mostly = true; /* * The ACPI table parsing functions set this variable on an error @@ -180,6 +181,12 @@ static u32 dev_table_size; /* size of the device table */ static u32 alias_table_size; /* size of the alias table */ static u32 rlookup_table_size; /* size if the rlookup table */ +/* + * This function flushes all internal caches of + * the IOMMU used by this driver. + */ +extern void iommu_flush_all_caches(struct amd_iommu *iommu); + static inline void update_last_devid(u16 devid) { if (devid > amd_iommu_last_bdf) @@ -293,9 +300,23 @@ static void iommu_feature_disable(struct amd_iommu *iommu, u8 bit) /* Function to enable the hardware */ static void iommu_enable(struct amd_iommu *iommu) { - printk(KERN_INFO "AMD-Vi: Enabling IOMMU at %s cap 0x%hx\n", + static const char * const feat_str[] = { + "PreF", "PPR", "X2APIC", "NX", "GT", "[5]", + "IA", "GA", "HE", "PC", NULL + }; + int i; + + printk(KERN_INFO "AMD-Vi: Enabling IOMMU at %s cap 0x%hx", dev_name(&iommu->dev->dev), iommu->cap_ptr); + if (iommu->cap & (1 << IOMMU_CAP_EFR)) { + printk(KERN_CONT " extended features: "); + for (i = 0; feat_str[i]; ++i) + if (iommu_feature(iommu, (1ULL << i))) + printk(KERN_CONT " %s", feat_str[i]); + } + printk(KERN_CONT "\n"); + iommu_feature_enable(iommu, CONTROL_IOMMU_EN); } @@ -651,7 +672,7 @@ static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m) static void __init init_iommu_from_pci(struct amd_iommu *iommu) { int cap_ptr = iommu->cap_ptr; - u32 range, misc; + u32 range, misc, low, high; int i, j; pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET, @@ -667,6 +688,15 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu) MMIO_GET_LD(range)); iommu->evt_msi_num = MMIO_MSI_NUM(misc); + if (!(iommu->cap & (1 << IOMMU_CAP_IOTLB))) + amd_iommu_iotlb_sup = false; + + /* read extended feature bits */ + low = readl(iommu->mmio_base + MMIO_EXT_FEATURES); + high = readl(iommu->mmio_base + MMIO_EXT_FEATURES + 4); + + iommu->features = ((u64)high << 32) | low; + if (!is_rd890_iommu(iommu->dev)) return; @@ -1004,10 +1034,11 @@ static int iommu_setup_msi(struct amd_iommu *iommu) if (pci_enable_msi(iommu->dev)) return 1; - r = request_irq(iommu->dev->irq, amd_iommu_int_handler, - IRQF_SAMPLE_RANDOM, - "AMD-Vi", - NULL); + r = request_threaded_irq(iommu->dev->irq, + amd_iommu_int_handler, + amd_iommu_int_thread, + 0, "AMD-Vi", + iommu->dev); if (r) { pci_disable_msi(iommu->dev); @@ -1244,6 +1275,7 @@ static void enable_iommus(void) iommu_set_exclusion_range(iommu); iommu_init_msi(iommu); iommu_enable(iommu); + iommu_flush_all_caches(iommu); } } @@ -1260,7 +1292,7 @@ static void disable_iommus(void) * disable suspend until real resume implemented */ -static int amd_iommu_resume(struct sys_device *dev) +static void amd_iommu_resume(void) { struct amd_iommu *iommu; @@ -1274,13 +1306,11 @@ static int amd_iommu_resume(struct sys_device *dev) * we have to flush after the IOMMUs are enabled because a * disabled IOMMU will never execute the commands we send */ - amd_iommu_flush_all_devices(); - amd_iommu_flush_all_domains(); - - return 0; + for_each_iommu(iommu) + iommu_flush_all_caches(iommu); } -static int amd_iommu_suspend(struct sys_device *dev, pm_message_t state) +static int amd_iommu_suspend(void) { /* disable IOMMUs to go out of the way for BIOS */ disable_iommus(); @@ -1288,17 +1318,11 @@ static int amd_iommu_suspend(struct sys_device *dev, pm_message_t state) return 0; } -static struct sysdev_class amd_iommu_sysdev_class = { - .name = "amd_iommu", +static struct syscore_ops amd_iommu_syscore_ops = { .suspend = amd_iommu_suspend, .resume = amd_iommu_resume, }; -static struct sys_device device_amd_iommu = { - .id = 0, - .cls = &amd_iommu_sysdev_class, -}; - /* * This is the core init function for AMD IOMMU hardware in the system. * This function is called from the generic x86 DMA layer initialization @@ -1415,14 +1439,6 @@ static int __init amd_iommu_init(void) goto free; } - ret = sysdev_class_register(&amd_iommu_sysdev_class); - if (ret) - goto free; - - ret = sysdev_register(&device_amd_iommu); - if (ret) - goto free; - ret = amd_iommu_init_devices(); if (ret) goto free; @@ -1441,6 +1457,8 @@ static int __init amd_iommu_init(void) amd_iommu_init_notifier(); + register_syscore_ops(&amd_iommu_syscore_ops); + if (iommu_pass_through) goto out; diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 8f6463d8ed0..4c39baa8fac 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -12,95 +12,199 @@ static u32 *flush_words; -struct pci_device_id k8_nb_ids[] = { +const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_MISC) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3) }, {} }; -EXPORT_SYMBOL(k8_nb_ids); +EXPORT_SYMBOL(amd_nb_misc_ids); -struct k8_northbridge_info k8_northbridges; -EXPORT_SYMBOL(k8_northbridges); +static struct pci_device_id amd_nb_link_ids[] = { + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) }, + {} +}; + +const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[] __initconst = { + { 0x00, 0x18, 0x20 }, + { 0xff, 0x00, 0x20 }, + { 0xfe, 0x00, 0x20 }, + { } +}; + +struct amd_northbridge_info amd_northbridges; +EXPORT_SYMBOL(amd_northbridges); -static struct pci_dev *next_k8_northbridge(struct pci_dev *dev) +static struct pci_dev *next_northbridge(struct pci_dev *dev, + const struct pci_device_id *ids) { do { dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev); if (!dev) break; - } while (!pci_match_id(&k8_nb_ids[0], dev)); + } while (!pci_match_id(ids, dev)); return dev; } -int cache_k8_northbridges(void) +int amd_cache_northbridges(void) { - int i; - struct pci_dev *dev; + u16 i = 0; + struct amd_northbridge *nb; + struct pci_dev *misc, *link; + + if (amd_nb_num()) + return 0; + + misc = NULL; + while ((misc = next_northbridge(misc, amd_nb_misc_ids)) != NULL) + i++; - if (k8_northbridges.num) + if (i == 0) return 0; - dev = NULL; - while ((dev = next_k8_northbridge(dev)) != NULL) - k8_northbridges.num++; + nb = kzalloc(i * sizeof(struct amd_northbridge), GFP_KERNEL); + if (!nb) + return -ENOMEM; + + amd_northbridges.nb = nb; + amd_northbridges.num = i; + + link = misc = NULL; + for (i = 0; i != amd_nb_num(); i++) { + node_to_amd_nb(i)->misc = misc = + next_northbridge(misc, amd_nb_misc_ids); + node_to_amd_nb(i)->link = link = + next_northbridge(link, amd_nb_link_ids); + } /* some CPU families (e.g. family 0x11) do not support GART */ if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 || boot_cpu_data.x86 == 0x15) - k8_northbridges.gart_supported = 1; + amd_northbridges.flags |= AMD_NB_GART; - k8_northbridges.nb_misc = kmalloc((k8_northbridges.num + 1) * - sizeof(void *), GFP_KERNEL); - if (!k8_northbridges.nb_misc) - return -ENOMEM; + /* + * Some CPU families support L3 Cache Index Disable. There are some + * limitations because of E382 and E388 on family 0x10. + */ + if (boot_cpu_data.x86 == 0x10 && + boot_cpu_data.x86_model >= 0x8 && + (boot_cpu_data.x86_model > 0x9 || + boot_cpu_data.x86_mask >= 0x1)) + amd_northbridges.flags |= AMD_NB_L3_INDEX_DISABLE; - if (!k8_northbridges.num) { - k8_northbridges.nb_misc[0] = NULL; - return 0; - } + if (boot_cpu_data.x86 == 0x15) + amd_northbridges.flags |= AMD_NB_L3_INDEX_DISABLE; - if (k8_northbridges.gart_supported) { - flush_words = kmalloc(k8_northbridges.num * sizeof(u32), - GFP_KERNEL); - if (!flush_words) { - kfree(k8_northbridges.nb_misc); - return -ENOMEM; - } - } + /* L3 cache partitioning is supported on family 0x15 */ + if (boot_cpu_data.x86 == 0x15) + amd_northbridges.flags |= AMD_NB_L3_PARTITIONING; - dev = NULL; - i = 0; - while ((dev = next_k8_northbridge(dev)) != NULL) { - k8_northbridges.nb_misc[i] = dev; - if (k8_northbridges.gart_supported) - pci_read_config_dword(dev, 0x9c, &flush_words[i++]); - } - k8_northbridges.nb_misc[i] = NULL; return 0; } -EXPORT_SYMBOL_GPL(cache_k8_northbridges); +EXPORT_SYMBOL_GPL(amd_cache_northbridges); -/* Ignores subdevice/subvendor but as far as I can figure out - they're useless anyways */ -int __init early_is_k8_nb(u32 device) +/* + * Ignores subdevice/subvendor but as far as I can figure out + * they're useless anyways + */ +bool __init early_is_amd_nb(u32 device) { - struct pci_device_id *id; + const struct pci_device_id *id; u32 vendor = device & 0xffff; + device >>= 16; - for (id = k8_nb_ids; id->vendor; id++) + for (id = amd_nb_misc_ids; id->vendor; id++) if (vendor == id->vendor && device == id->device) - return 1; + return true; + return false; +} + +int amd_get_subcaches(int cpu) +{ + struct pci_dev *link = node_to_amd_nb(amd_get_nb_id(cpu))->link; + unsigned int mask; + int cuid = 0; + + if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + return 0; + + pci_read_config_dword(link, 0x1d4, &mask); + +#ifdef CONFIG_SMP + cuid = cpu_data(cpu).compute_unit_id; +#endif + return (mask >> (4 * cuid)) & 0xf; +} + +int amd_set_subcaches(int cpu, int mask) +{ + static unsigned int reset, ban; + struct amd_northbridge *nb = node_to_amd_nb(amd_get_nb_id(cpu)); + unsigned int reg; + int cuid = 0; + + if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING) || mask > 0xf) + return -EINVAL; + + /* if necessary, collect reset state of L3 partitioning and BAN mode */ + if (reset == 0) { + pci_read_config_dword(nb->link, 0x1d4, &reset); + pci_read_config_dword(nb->misc, 0x1b8, &ban); + ban &= 0x180000; + } + + /* deactivate BAN mode if any subcaches are to be disabled */ + if (mask != 0xf) { + pci_read_config_dword(nb->misc, 0x1b8, ®); + pci_write_config_dword(nb->misc, 0x1b8, reg & ~0x180000); + } + +#ifdef CONFIG_SMP + cuid = cpu_data(cpu).compute_unit_id; +#endif + mask <<= 4 * cuid; + mask |= (0xf ^ (1 << cuid)) << 26; + + pci_write_config_dword(nb->link, 0x1d4, mask); + + /* reset BAN mode if L3 partitioning returned to reset state */ + pci_read_config_dword(nb->link, 0x1d4, ®); + if (reg == reset) { + pci_read_config_dword(nb->misc, 0x1b8, ®); + reg &= ~0x180000; + pci_write_config_dword(nb->misc, 0x1b8, reg | ban); + } + return 0; } -void k8_flush_garts(void) +static int amd_cache_gart(void) +{ + u16 i; + + if (!amd_nb_has_feature(AMD_NB_GART)) + return 0; + + flush_words = kmalloc(amd_nb_num() * sizeof(u32), GFP_KERNEL); + if (!flush_words) { + amd_northbridges.flags &= ~AMD_NB_GART; + return -ENOMEM; + } + + for (i = 0; i != amd_nb_num(); i++) + pci_read_config_dword(node_to_amd_nb(i)->misc, 0x9c, + &flush_words[i]); + + return 0; +} + +void amd_flush_garts(void) { int flushed, i; unsigned long flags; static DEFINE_SPINLOCK(gart_lock); - if (!k8_northbridges.gart_supported) + if (!amd_nb_has_feature(AMD_NB_GART)) return; /* Avoid races between AGP and IOMMU. In theory it's not needed @@ -109,16 +213,16 @@ void k8_flush_garts(void) that it doesn't matter to serialize more. -AK */ spin_lock_irqsave(&gart_lock, flags); flushed = 0; - for (i = 0; i < k8_northbridges.num; i++) { - pci_write_config_dword(k8_northbridges.nb_misc[i], 0x9c, - flush_words[i]|1); + for (i = 0; i < amd_nb_num(); i++) { + pci_write_config_dword(node_to_amd_nb(i)->misc, 0x9c, + flush_words[i] | 1); flushed++; } - for (i = 0; i < k8_northbridges.num; i++) { + for (i = 0; i < amd_nb_num(); i++) { u32 w; /* Make sure the hardware actually executed the flush*/ for (;;) { - pci_read_config_dword(k8_northbridges.nb_misc[i], + pci_read_config_dword(node_to_amd_nb(i)->misc, 0x9c, &w); if (!(w & 1)) break; @@ -129,19 +233,23 @@ void k8_flush_garts(void) if (!flushed) printk("nothing to flush?\n"); } -EXPORT_SYMBOL_GPL(k8_flush_garts); +EXPORT_SYMBOL_GPL(amd_flush_garts); -static __init int init_k8_nbs(void) +static __init int init_amd_nbs(void) { int err = 0; - err = cache_k8_northbridges(); + err = amd_cache_northbridges(); if (err < 0) - printk(KERN_NOTICE "K8 NB: Cannot enumerate AMD northbridges.\n"); + printk(KERN_NOTICE "AMD NB: Cannot enumerate AMD northbridges.\n"); + + if (amd_cache_gart() < 0) + printk(KERN_NOTICE "AMD NB: Cannot initialize GART flush words, " + "GART support disabled.\n"); return err; } /* This has to go after the PCI subsystem */ -fs_initcall(init_k8_nbs); +fs_initcall(init_amd_nbs); diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index 92543c73cf8..289e92862fd 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -177,7 +177,6 @@ static struct clocksource clocksource_apbt = { .rating = APBT_CLOCKSOURCE_RATING, .read = apbt_read_clocksource, .mask = APBT_MASK, - .shift = APBT_SHIFT, .flags = CLOCK_SOURCE_IS_CONTINUOUS, .resume = apbt_restart_clocksource, }; @@ -284,7 +283,7 @@ static int __init apbt_clockevent_register(void) memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device)); if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) { - apbt_clockevent.rating = APBT_CLOCKEVENT_RATING - 100; + adev->evt.rating = APBT_CLOCKEVENT_RATING - 100; global_clock_event = &adev->evt; printk(KERN_DEBUG "%s clockevent registered as global\n", global_clock_event->name); @@ -313,13 +312,16 @@ static void apbt_setup_irq(struct apbt_dev *adev) if (adev->irq == 0) return; + irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT); + irq_set_affinity(adev->irq, cpumask_of(adev->cpu)); + /* APB timer irqs are set up as mp_irqs, timer is edge type */ + __irq_set_handler(adev->irq, handle_edge_irq, 0, "edge"); + if (system_state == SYSTEM_BOOTING) { - irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT); - /* APB timer irqs are set up as mp_irqs, timer is edge type */ - __set_irq_handler(adev->irq, handle_edge_irq, 0, "edge"); if (request_irq(adev->irq, apbt_interrupt_handler, - IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING, - adev->name, adev)) { + IRQF_TIMER | IRQF_DISABLED | + IRQF_NOBALANCING, + adev->name, adev)) { printk(KERN_ERR "Failed request IRQ for APBT%d\n", adev->num); } @@ -505,64 +507,12 @@ static int apbt_next_event(unsigned long delta, return 0; } -/* - * APB timer clock is not in sync with pclk on Langwell, which translates to - * unreliable read value caused by sampling error. the error does not add up - * overtime and only happens when sampling a 0 as a 1 by mistake. so the time - * would go backwards. the following code is trying to prevent time traveling - * backwards. little bit paranoid. - */ static cycle_t apbt_read_clocksource(struct clocksource *cs) { - unsigned long t0, t1, t2; - static unsigned long last_read; - -bad_count: - t1 = apbt_readl(phy_cs_timer_id, - APBTMR_N_CURRENT_VALUE); - t2 = apbt_readl(phy_cs_timer_id, - APBTMR_N_CURRENT_VALUE); - if (unlikely(t1 < t2)) { - pr_debug("APBT: read current count error %lx:%lx:%lx\n", - t1, t2, t2 - t1); - goto bad_count; - } - /* - * check against cached last read, makes sure time does not go back. - * it could be a normal rollover but we will do tripple check anyway - */ - if (unlikely(t2 > last_read)) { - /* check if we have a normal rollover */ - unsigned long raw_intr_status = - apbt_readl_reg(APBTMRS_RAW_INT_STATUS); - /* - * cs timer interrupt is masked but raw intr bit is set if - * rollover occurs. then we read EOI reg to clear it. - */ - if (raw_intr_status & (1 << phy_cs_timer_id)) { - apbt_readl(phy_cs_timer_id, APBTMR_N_EOI); - goto out; - } - pr_debug("APB CS going back %lx:%lx:%lx ", - t2, last_read, t2 - last_read); -bad_count_x3: - pr_debug("triple check enforced\n"); - t0 = apbt_readl(phy_cs_timer_id, - APBTMR_N_CURRENT_VALUE); - udelay(1); - t1 = apbt_readl(phy_cs_timer_id, - APBTMR_N_CURRENT_VALUE); - udelay(1); - t2 = apbt_readl(phy_cs_timer_id, - APBTMR_N_CURRENT_VALUE); - if ((t2 > t1) || (t1 > t0)) { - printk(KERN_ERR "Error: APB CS tripple check failed\n"); - goto bad_count_x3; - } - } -out: - last_read = t2; - return (cycle_t)~t2; + unsigned long current_count; + + current_count = apbt_readl(phy_cs_timer_id, APBTMR_N_CURRENT_VALUE); + return (cycle_t)~current_count; } static int apbt_clocksource_register(void) @@ -592,14 +542,7 @@ static int apbt_clocksource_register(void) if (t1 == apbt_read_clocksource(&clocksource_apbt)) panic("APBT counter not counting. APBT disabled\n"); - /* - * initialize and register APBT clocksource - * convert that to ns/clock cycle - * mult = (ns/c) * 2^APBT_SHIFT - */ - clocksource_apbt.mult = div_sc(MSEC_PER_SEC, - (unsigned long) apbt_freq, APBT_SHIFT); - clocksource_register(&clocksource_apbt); + clocksource_register_khz(&clocksource_apbt, (u32)apbt_freq*1000); return 0; } diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index b3a16e8f070..3d2661ca654 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -13,7 +13,7 @@ #include <linux/kernel.h> #include <linux/types.h> #include <linux/init.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/mmzone.h> #include <linux/pci_ids.h> #include <linux/pci.h> @@ -30,6 +30,22 @@ #include <asm/amd_nb.h> #include <asm/x86_init.h> +/* + * Using 512M as goal, in case kexec will load kernel_big + * that will do the on-position decompress, and could overlap with + * with the gart aperture that is used. + * Sequence: + * kernel_small + * ==> kexec (with kdump trigger path or gart still enabled) + * ==> kernel_small (gart area become e820_reserved) + * ==> kexec (with kdump trigger path or gart still enabled) + * ==> kerne_big (uncompressed size will be big than 64M or 128M) + * So don't use 512M below as gart iommu, leave the space for kernel + * code for safe. + */ +#define GART_MIN_ADDR (512ULL << 20) +#define GART_MAX_ADDR (1ULL << 32) + int gart_iommu_aperture; int gart_iommu_aperture_disabled __initdata; int gart_iommu_aperture_allowed __initdata; @@ -39,18 +55,6 @@ int fallback_aper_force __initdata; int fix_aperture __initdata = 1; -struct bus_dev_range { - int bus; - int dev_base; - int dev_limit; -}; - -static struct bus_dev_range bus_dev_ranges[] __initdata = { - { 0x00, 0x18, 0x20}, - { 0xff, 0x00, 0x20}, - { 0xfe, 0x00, 0x20} -}; - static struct resource gart_resource = { .name = "GART", .flags = IORESOURCE_MEM, @@ -69,7 +73,7 @@ static void __init insert_aperture_resource(u32 aper_base, u32 aper_size) static u32 __init allocate_aperture(void) { u32 aper_size; - void *p; + unsigned long addr; /* aper_size should <= 1G */ if (fallback_aper_order > 5) @@ -82,40 +86,27 @@ static u32 __init allocate_aperture(void) * memory. Unfortunately we cannot move it up because that would * make the IOMMU useless. */ - /* - * using 512M as goal, in case kexec will load kernel_big - * that will do the on position decompress, and could overlap with - * that positon with gart that is used. - * sequende: - * kernel_small - * ==> kexec (with kdump trigger path or previous doesn't shutdown gart) - * ==> kernel_small(gart area become e820_reserved) - * ==> kexec (with kdump trigger path or previous doesn't shutdown gart) - * ==> kerne_big (uncompressed size will be big than 64M or 128M) - * so don't use 512M below as gart iommu, leave the space for kernel - * code for safe - */ - p = __alloc_bootmem_nopanic(aper_size, aper_size, 512ULL<<20); + addr = memblock_find_in_range(GART_MIN_ADDR, GART_MAX_ADDR, + aper_size, aper_size); + if (addr == MEMBLOCK_ERROR || addr + aper_size > GART_MAX_ADDR) { + printk(KERN_ERR + "Cannot allocate aperture memory hole (%lx,%uK)\n", + addr, aper_size>>10); + return 0; + } + memblock_x86_reserve_range(addr, addr + aper_size, "aperture64"); /* * Kmemleak should not scan this block as it may not be mapped via the * kernel direct mapping. */ - kmemleak_ignore(p); - if (!p || __pa(p)+aper_size > 0xffffffff) { - printk(KERN_ERR - "Cannot allocate aperture memory hole (%p,%uK)\n", - p, aper_size>>10); - if (p) - free_bootmem(__pa(p), aper_size); - return 0; - } + kmemleak_ignore(phys_to_virt(addr)); printk(KERN_INFO "Mapping aperture over %d KB of RAM @ %lx\n", - aper_size >> 10, __pa(p)); - insert_aperture_resource((u32)__pa(p), aper_size); - register_nosave_region((u32)__pa(p) >> PAGE_SHIFT, - (u32)__pa(p+aper_size) >> PAGE_SHIFT); + aper_size >> 10, addr); + insert_aperture_resource((u32)addr, aper_size); + register_nosave_region(addr >> PAGE_SHIFT, + (addr+aper_size) >> PAGE_SHIFT); - return (u32)__pa(p); + return (u32)addr; } @@ -206,7 +197,7 @@ static u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order) * Do an PCI bus scan by hand because we're running before the PCI * subsystem. * - * All K8 AGP bridges are AGPv3 compliant, so we can do this scan + * All AMD AGP bridges are AGPv3 compliant, so we can do this scan * generically. It's probably overkill to always scan all slots because * the AGP bridges should be always an own bus on the HT hierarchy, * but do it here for future safety. @@ -294,16 +285,16 @@ void __init early_gart_iommu_check(void) search_agp_bridge(&agp_aper_order, &valid_agp); fix = 0; - for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { + for (i = 0; amd_nb_bus_dev_ranges[i].dev_limit; i++) { int bus; int dev_base, dev_limit; - bus = bus_dev_ranges[i].bus; - dev_base = bus_dev_ranges[i].dev_base; - dev_limit = bus_dev_ranges[i].dev_limit; + bus = amd_nb_bus_dev_ranges[i].bus; + dev_base = amd_nb_bus_dev_ranges[i].dev_base; + dev_limit = amd_nb_bus_dev_ranges[i].dev_limit; for (slot = dev_base; slot < dev_limit; slot++) { - if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) + if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00))) continue; ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL); @@ -349,16 +340,16 @@ void __init early_gart_iommu_check(void) return; /* disable them all at first */ - for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { + for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) { int bus; int dev_base, dev_limit; - bus = bus_dev_ranges[i].bus; - dev_base = bus_dev_ranges[i].dev_base; - dev_limit = bus_dev_ranges[i].dev_limit; + bus = amd_nb_bus_dev_ranges[i].bus; + dev_base = amd_nb_bus_dev_ranges[i].dev_base; + dev_limit = amd_nb_bus_dev_ranges[i].dev_limit; for (slot = dev_base; slot < dev_limit; slot++) { - if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) + if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00))) continue; ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL); @@ -390,17 +381,17 @@ int __init gart_iommu_hole_init(void) fix = 0; node = 0; - for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { + for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) { int bus; int dev_base, dev_limit; u32 ctl; - bus = bus_dev_ranges[i].bus; - dev_base = bus_dev_ranges[i].dev_base; - dev_limit = bus_dev_ranges[i].dev_limit; + bus = amd_nb_bus_dev_ranges[i].bus; + dev_base = amd_nb_bus_dev_ranges[i].dev_base; + dev_limit = amd_nb_bus_dev_ranges[i].dev_limit; for (slot = dev_base; slot < dev_limit; slot++) { - if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) + if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00))) continue; iommu_detected = 1; @@ -505,20 +496,20 @@ out: } /* Fix up the north bridges */ - for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { + for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) { int bus, dev_base, dev_limit; /* * Don't enable translation yet but enable GART IO and CPU * accesses and set DISTLBWALKPRB since GART table memory is UC. */ - u32 ctl = DISTLBWALKPRB | aper_order << 1; + u32 ctl = aper_order << 1; - bus = bus_dev_ranges[i].bus; - dev_base = bus_dev_ranges[i].dev_base; - dev_limit = bus_dev_ranges[i].dev_limit; + bus = amd_nb_bus_dev_ranges[i].bus; + dev_base = amd_nb_bus_dev_ranges[i].dev_base; + dev_limit = amd_nb_bus_dev_ranges[i].dev_limit; for (slot = dev_base; slot < dev_limit; slot++) { - if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) + if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00))) continue; write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl); diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index 910f20b457c..3966b564ea4 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -3,10 +3,7 @@ # obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o -ifneq ($(CONFIG_HARDLOCKUP_DETECTOR),y) -obj-$(CONFIG_X86_LOCAL_APIC) += nmi.o -endif -obj-$(CONFIG_HARDLOCKUP_DETECTOR) += hw_nmi.o +obj-y += hw_nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o obj-$(CONFIG_SMP) += ipi.o diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 850657d1b0e..f92a8e5d1e2 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -24,14 +24,13 @@ #include <linux/ftrace.h> #include <linux/ioport.h> #include <linux/module.h> -#include <linux/sysdev.h> +#include <linux/syscore_ops.h> #include <linux/delay.h> #include <linux/timex.h> #include <linux/dmar.h> #include <linux/init.h> #include <linux/cpu.h> #include <linux/dmi.h> -#include <linux/nmi.h> #include <linux/smp.h> #include <linux/mm.h> @@ -44,15 +43,15 @@ #include <asm/i8259.h> #include <asm/proto.h> #include <asm/apic.h> +#include <asm/io_apic.h> #include <asm/desc.h> #include <asm/hpet.h> #include <asm/idle.h> #include <asm/mtrr.h> #include <asm/smp.h> #include <asm/mce.h> -#include <asm/kvm_para.h> #include <asm/tsc.h> -#include <asm/atomic.h> +#include <asm/hypervisor.h> unsigned int num_processors; @@ -80,12 +79,21 @@ EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid); EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid); #ifdef CONFIG_X86_32 + +/* + * On x86_32, the mapping between cpu and logical apicid may vary + * depending on apic in use. The following early percpu variable is + * used for the mapping. This is where the behaviors of x86_64 and 32 + * actually diverge. Let's keep it ugly for now. + */ +DEFINE_EARLY_PER_CPU(int, x86_cpu_to_logical_apicid, BAD_APICID); + /* * Knob to control our willingness to enable the local APIC. * * +1=force-enable */ -static int force_enable_local_apic; +static int force_enable_local_apic __initdata; /* * APIC command line parameters */ @@ -155,7 +163,7 @@ early_param("nox2apic", setup_nox2apic); unsigned long mp_lapic_addr; int disable_apic; /* Disable local APIC timer from the kernel commandline or via dmi quirk */ -static int disable_apic_timer __cpuinitdata; +static int disable_apic_timer __initdata; /* Local APIC timer works in C2 */ int local_apic_timer_c2_ok; EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); @@ -179,29 +187,8 @@ static struct resource lapic_resource = { static unsigned int calibration_result; -static int lapic_next_event(unsigned long delta, - struct clock_event_device *evt); -static void lapic_timer_setup(enum clock_event_mode mode, - struct clock_event_device *evt); -static void lapic_timer_broadcast(const struct cpumask *mask); static void apic_pm_activate(void); -/* - * The local apic timer can be used for any function which is CPU local. - */ -static struct clock_event_device lapic_clockevent = { - .name = "lapic", - .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT - | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY, - .shift = 32, - .set_mode = lapic_timer_setup, - .set_next_event = lapic_next_event, - .broadcast = lapic_timer_broadcast, - .rating = 100, - .irq = -1, -}; -static DEFINE_PER_CPU(struct clock_event_device, lapic_events); - static unsigned long apic_phys; /* @@ -240,7 +227,7 @@ static int modern_apic(void) * right after this call apic become NOOP driven * so apic->write/read doesn't do anything */ -void apic_disable(void) +static void __init apic_disable(void) { pr_info("APIC: switched to apic NOOP\n"); apic = &apic_noop; @@ -284,23 +271,6 @@ u64 native_apic_icr_read(void) return icr1 | ((u64)icr2 << 32); } -/** - * enable_NMI_through_LVT0 - enable NMI through local vector table 0 - */ -void __cpuinit enable_NMI_through_LVT0(void) -{ - unsigned int v; - - /* unmask and set to NMI */ - v = APIC_DM_NMI; - - /* Level triggered for 82489DX (32bit mode) */ - if (!lapic_is_integrated()) - v |= APIC_LVT_LEVEL_TRIGGER; - - apic_write(APIC_LVT0, v); -} - #ifdef CONFIG_X86_32 /** * get_physical_broadcast - Get number of physical broadcast IDs @@ -433,17 +403,18 @@ int setup_APIC_eilvt(u8 offset, u8 vector, u8 msg_type, u8 mask) reserved = reserve_eilvt_offset(offset, new); if (reserved != new) { - pr_err(FW_BUG "cpu %d, try to setup vector 0x%x, but " - "vector 0x%x was already reserved by another core, " - "APIC%lX=0x%x\n", - smp_processor_id(), new, reserved, reg, old); + pr_err(FW_BUG "cpu %d, try to use APIC%lX (LVT offset %d) for " + "vector 0x%x, but the register is already in use for " + "vector 0x%x on another cpu\n", + smp_processor_id(), reg, offset, new, reserved); return -EINVAL; } if (!eilvt_entry_is_changeable(old, new)) { - pr_err(FW_BUG "cpu %d, try to setup vector 0x%x but " - "register already in use, APIC%lX=0x%x\n", - smp_processor_id(), new, reg, old); + pr_err(FW_BUG "cpu %d, try to use APIC%lX (LVT offset %d) for " + "vector 0x%x, but the register is already in use for " + "vector 0x%x on this cpu\n", + smp_processor_id(), reg, offset, new, old); return -EBUSY; } @@ -509,6 +480,23 @@ static void lapic_timer_broadcast(const struct cpumask *mask) #endif } + +/* + * The local apic timer can be used for any function which is CPU local. + */ +static struct clock_event_device lapic_clockevent = { + .name = "lapic", + .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT + | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY, + .shift = 32, + .set_mode = lapic_timer_setup, + .set_next_event = lapic_next_event, + .broadcast = lapic_timer_broadcast, + .rating = 100, + .irq = -1, +}; +static DEFINE_PER_CPU(struct clock_event_device, lapic_events); + /* * Setup the local APIC timer for this CPU. Copy the initialized values * of the boot CPU and register the clock event in the framework. @@ -517,7 +505,7 @@ static void __cpuinit setup_APIC_timer(void) { struct clock_event_device *levt = &__get_cpu_var(lapic_events); - if (cpu_has(¤t_cpu_data, X86_FEATURE_ARAT)) { + if (this_cpu_has(X86_FEATURE_ARAT)) { lapic_clockevent.features &= ~CLOCK_EVT_FEAT_C3STOP; /* Make LAPIC timer preferrable over percpu HPET */ lapic_clockevent.rating = 150; @@ -685,7 +673,7 @@ static int __init calibrate_APIC_clock(void) lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS, lapic_clockevent.shift); lapic_clockevent.max_delta_ns = - clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); + clockevent_delta2ns(0x7FFFFFFF, &lapic_clockevent); lapic_clockevent.min_delta_ns = clockevent_delta2ns(0xF, &lapic_clockevent); @@ -800,11 +788,7 @@ void __init setup_boot_APIC_clock(void) * PIT/HPET going. Otherwise register lapic as a dummy * device. */ - if (nmi_watchdog != NMI_IO_APIC) - lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; - else - pr_warning("APIC timer registered as dummy," - " due to nmi_watchdog=%d!\n", nmi_watchdog); + lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; /* Setup the lapic or request the broadcast */ setup_APIC_timer(); @@ -1196,12 +1180,15 @@ static void __cpuinit lapic_setup_esr(void) oldvalue, value); } - /** * setup_local_APIC - setup the local APIC + * + * Used to setup local APIC while initializing BSP or bringin up APs. + * Always called with preemption disabled. */ void __cpuinit setup_local_APIC(void) { + int cpu = smp_processor_id(); unsigned int value, queued; int i, j, acked = 0; unsigned long long tsc = 0, ntsc; @@ -1211,7 +1198,7 @@ void __cpuinit setup_local_APIC(void) rdtscll(tsc); if (disable_apic) { - arch_disable_smp_support(); + disable_ioapic_support(); return; } @@ -1226,8 +1213,6 @@ void __cpuinit setup_local_APIC(void) #endif perf_events_lapic_init(); - preempt_disable(); - /* * Double-check whether this APIC is really registered. * This is meaningless in clustered apic mode, so we skip it. @@ -1241,6 +1226,30 @@ void __cpuinit setup_local_APIC(void) */ apic->init_apic_ldr(); +#ifdef CONFIG_X86_32 + /* + * APIC LDR is initialized. If logical_apicid mapping was + * initialized during get_smp_config(), make sure it matches the + * actual value. + */ + i = early_per_cpu(x86_cpu_to_logical_apicid, cpu); + WARN_ON(i != BAD_APICID && i != logical_smp_processor_id()); + /* always use the value from LDR */ + early_per_cpu(x86_cpu_to_logical_apicid, cpu) = + logical_smp_processor_id(); + + /* + * Some NUMA implementations (NUMAQ) don't initialize apicid to + * node mapping during NUMA init. Now that logical apicid is + * guaranteed to be known, give it another chance. This is already + * a bit too late - percpu allocation has already happened without + * proper NUMA affinity. + */ + if (apic->x86_32_numa_cpu_node) + set_apicid_to_node(early_per_cpu(x86_cpu_to_apicid, cpu), + apic->x86_32_numa_cpu_node(cpu)); +#endif + /* * Set Task Priority to 'accept all'. We never change this * later on. @@ -1343,21 +1352,19 @@ void __cpuinit setup_local_APIC(void) * TODO: set up through-local-APIC from through-I/O-APIC? --macro */ value = apic_read(APIC_LVT0) & APIC_LVT_MASKED; - if (!smp_processor_id() && (pic_mode || !value)) { + if (!cpu && (pic_mode || !value)) { value = APIC_DM_EXTINT; - apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", - smp_processor_id()); + apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", cpu); } else { value = APIC_DM_EXTINT | APIC_LVT_MASKED; - apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", - smp_processor_id()); + apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", cpu); } apic_write(APIC_LVT0, value); /* * only the BP should see the LINT1 NMI signal, obviously. */ - if (!smp_processor_id()) + if (!cpu) value = APIC_DM_NMI; else value = APIC_DM_NMI | APIC_LVT_MASKED; @@ -1365,11 +1372,9 @@ void __cpuinit setup_local_APIC(void) value |= APIC_LVT_LEVEL_TRIGGER; apic_write(APIC_LVT1, value); - preempt_enable(); - #ifdef CONFIG_X86_MCE_INTEL /* Recheck CMCI information after local APIC is up on CPU #0 */ - if (smp_processor_id() == 0) + if (!cpu) cmci_recheck(); #endif } @@ -1388,10 +1393,22 @@ void __cpuinit end_local_APIC_setup(void) } #endif - setup_apic_nmi_watchdog(NULL); apic_pm_activate(); } +void __init bsp_end_local_APIC_setup(void) +{ + end_local_APIC_setup(); + + /* + * Now that local APIC setup is completed for BP, configure the fault + * handling for interrupt remapping. + */ + if (intr_remapping_enabled) + enable_drhd_fault_handling(); + +} + #ifdef CONFIG_X86_X2APIC void check_x2apic(void) { @@ -1444,7 +1461,7 @@ int __init enable_IR(void) void __init enable_IR_x2apic(void) { unsigned long flags; - struct IO_APIC_route_entry **ioapic_entries = NULL; + struct IO_APIC_route_entry **ioapic_entries; int ret, x2apic_enabled = 0; int dmar_table_init_ret; @@ -1477,7 +1494,8 @@ void __init enable_IR_x2apic(void) /* IR is required if there is APIC ID > 255 even when running * under KVM */ - if (max_physical_apicid > 255 || !kvm_para_available()) + if (max_physical_apicid > 255 || + !hypervisor_x2apic_available()) goto nox2apic; /* * without IR all CPUs can be addressed by IOAPIC/MSI @@ -1531,13 +1549,60 @@ static int __init detect_init_APIC(void) return 0; } #else + +static int __init apic_verify(void) +{ + u32 features, h, l; + + /* + * The APIC feature bit should now be enabled + * in `cpuid' + */ + features = cpuid_edx(1); + if (!(features & (1 << X86_FEATURE_APIC))) { + pr_warning("Could not enable APIC!\n"); + return -1; + } + set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; + + /* The BIOS may have set up the APIC at some other address */ + rdmsr(MSR_IA32_APICBASE, l, h); + if (l & MSR_IA32_APICBASE_ENABLE) + mp_lapic_addr = l & MSR_IA32_APICBASE_BASE; + + pr_info("Found and enabled local APIC!\n"); + return 0; +} + +int __init apic_force_enable(unsigned long addr) +{ + u32 h, l; + + if (disable_apic) + return -1; + + /* + * Some BIOSes disable the local APIC in the APIC_BASE + * MSR. This can only be done in software for Intel P6 or later + * and AMD K7 (Model > 1) or later. + */ + rdmsr(MSR_IA32_APICBASE, l, h); + if (!(l & MSR_IA32_APICBASE_ENABLE)) { + pr_info("Local APIC disabled by BIOS -- reenabling.\n"); + l &= ~MSR_IA32_APICBASE_BASE; + l |= MSR_IA32_APICBASE_ENABLE | addr; + wrmsr(MSR_IA32_APICBASE, l, h); + enabled_via_apicbase = 1; + } + return apic_verify(); +} + /* * Detect and initialize APIC */ static int __init detect_init_APIC(void) { - u32 h, l, features; - /* Disabled by kernel option? */ if (disable_apic) return -1; @@ -1567,38 +1632,12 @@ static int __init detect_init_APIC(void) "you can enable it with \"lapic\"\n"); return -1; } - /* - * Some BIOSes disable the local APIC in the APIC_BASE - * MSR. This can only be done in software for Intel P6 or later - * and AMD K7 (Model > 1) or later. - */ - rdmsr(MSR_IA32_APICBASE, l, h); - if (!(l & MSR_IA32_APICBASE_ENABLE)) { - pr_info("Local APIC disabled by BIOS -- reenabling.\n"); - l &= ~MSR_IA32_APICBASE_BASE; - l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE; - wrmsr(MSR_IA32_APICBASE, l, h); - enabled_via_apicbase = 1; - } - } - /* - * The APIC feature bit should now be enabled - * in `cpuid' - */ - features = cpuid_edx(1); - if (!(features & (1 << X86_FEATURE_APIC))) { - pr_warning("Could not enable APIC!\n"); - return -1; + if (apic_force_enable(APIC_DEFAULT_PHYS_BASE)) + return -1; + } else { + if (apic_verify()) + return -1; } - set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); - mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; - - /* The BIOS may have set up the APIC at some other address */ - rdmsr(MSR_IA32_APICBASE, l, h); - if (l & MSR_IA32_APICBASE_ENABLE) - mp_lapic_addr = l & MSR_IA32_APICBASE_BASE; - - pr_info("Found and enabled local APIC!\n"); apic_pm_activate(); @@ -1610,28 +1649,6 @@ no_apic: } #endif -#ifdef CONFIG_X86_64 -void __init early_init_lapic_mapping(void) -{ - /* - * If no local APIC can be found then go out - * : it means there is no mpatable and MADT - */ - if (!smp_found_config) - return; - - set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); - apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", - APIC_BASE, mp_lapic_addr); - - /* - * Fetch the APIC ID of the BSP in case we have a - * default configuration (or the MP table is broken). - */ - boot_cpu_physical_apicid = read_apic_id(); -} -#endif - /** * init_apic_mappings - initialize APIC mappings */ @@ -1657,10 +1674,7 @@ void __init init_apic_mappings(void) * acpi_register_lapic_address() */ if (!acpi_lapic && !smp_found_config) - set_fixmap_nocache(FIX_APIC_BASE, apic_phys); - - apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n", - APIC_BASE, apic_phys); + register_lapic_address(apic_phys); } /* @@ -1682,11 +1696,27 @@ void __init init_apic_mappings(void) } } +void __init register_lapic_address(unsigned long address) +{ + mp_lapic_addr = address; + + if (!x2apic_mode) { + set_fixmap_nocache(FIX_APIC_BASE, address); + apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", + APIC_BASE, mp_lapic_addr); + } + if (boot_cpu_physical_apicid == -1U) { + boot_cpu_physical_apicid = read_apic_id(); + apic_version[boot_cpu_physical_apicid] = + GET_APIC_VERSION(apic_read(APIC_LVR)); + } +} + /* * This initializes the IO-APIC and APIC hardware if this is * a UP kernel. */ -int apic_version[MAX_APICS]; +int apic_version[MAX_LOCAL_APIC]; int __init APIC_init_uniprocessor(void) { @@ -1744,24 +1774,17 @@ int __init APIC_init_uniprocessor(void) enable_IO_APIC(); #endif - end_local_APIC_setup(); + bsp_end_local_APIC_setup(); #ifdef CONFIG_X86_IO_APIC if (smp_found_config && !skip_ioapic_setup && nr_ioapics) setup_IO_APIC(); else { nr_ioapics = 0; - localise_nmi_watchdog(); } -#else - localise_nmi_watchdog(); #endif x86_init.timers.setup_percpu_clockev(); -#ifdef CONFIG_X86_64 - check_nmi_watchdog(); -#endif - return 0; } @@ -1800,30 +1823,41 @@ void smp_spurious_interrupt(struct pt_regs *regs) */ void smp_error_interrupt(struct pt_regs *regs) { - u32 v, v1; + u32 v0, v1; + u32 i = 0; + static const char * const error_interrupt_reason[] = { + "Send CS error", /* APIC Error Bit 0 */ + "Receive CS error", /* APIC Error Bit 1 */ + "Send accept error", /* APIC Error Bit 2 */ + "Receive accept error", /* APIC Error Bit 3 */ + "Redirectable IPI", /* APIC Error Bit 4 */ + "Send illegal vector", /* APIC Error Bit 5 */ + "Received illegal vector", /* APIC Error Bit 6 */ + "Illegal register address", /* APIC Error Bit 7 */ + }; exit_idle(); irq_enter(); /* First tickle the hardware, only then report what went on. -- REW */ - v = apic_read(APIC_ESR); + v0 = apic_read(APIC_ESR); apic_write(APIC_ESR, 0); v1 = apic_read(APIC_ESR); ack_APIC_irq(); atomic_inc(&irq_err_count); - /* - * Here is what the APIC error bits mean: - * 0: Send CS error - * 1: Receive CS error - * 2: Send accept error - * 3: Receive accept error - * 4: Reserved - * 5: Send illegal vector - * 6: Received illegal vector - * 7: Illegal register address - */ - pr_debug("APIC error on CPU%d: %02x(%02x)\n", - smp_processor_id(), v , v1); + apic_printk(APIC_DEBUG, KERN_DEBUG "APIC error on CPU%d: %02x(%02x)", + smp_processor_id(), v0 , v1); + + v1 = v1 & 0xff; + while (v1) { + if (v1 & 0x1) + apic_printk(APIC_DEBUG, KERN_CONT " : %s", error_interrupt_reason[i]); + i++; + v1 >>= 1; + }; + + apic_printk(APIC_DEBUG, KERN_CONT "\n"); + irq_exit(); } @@ -1920,17 +1954,6 @@ void __cpuinit generic_processor_info(int apicid, int version) { int cpu; - /* - * Validate version - */ - if (version == 0x0) { - pr_warning("BIOS bug, APIC version is 0 for CPU#%d! " - "fixing up to 0x10. (tell your hw vendor)\n", - version); - version = 0x10; - } - apic_version[apicid] = version; - if (num_processors >= nr_cpu_ids) { int max = nr_cpu_ids; int thiscpu = max + disabled_cpus; @@ -1944,22 +1967,34 @@ void __cpuinit generic_processor_info(int apicid, int version) } num_processors++; - cpu = cpumask_next_zero(-1, cpu_present_mask); - - if (version != apic_version[boot_cpu_physical_apicid]) - WARN_ONCE(1, - "ACPI: apic version mismatch, bootcpu: %x cpu %d: %x\n", - apic_version[boot_cpu_physical_apicid], cpu, version); - - physid_set(apicid, phys_cpu_present_map); if (apicid == boot_cpu_physical_apicid) { /* * x86_bios_cpu_apicid is required to have processors listed * in same order as logical cpu numbers. Hence the first * entry is BSP, and so on. + * boot_cpu_init() already hold bit 0 in cpu_present_mask + * for BSP. */ cpu = 0; + } else + cpu = cpumask_next_zero(-1, cpu_present_mask); + + /* + * Validate version + */ + if (version == 0x0) { + pr_warning("BIOS bug: APIC version is 0 for CPU %d/0x%x, fixing up to 0x10\n", + cpu, apicid); + version = 0x10; + } + apic_version[apicid] = version; + + if (version != apic_version[boot_cpu_physical_apicid]) { + pr_warning("BIOS bug: APIC version mismatch, boot CPU: %x, CPU %d: version %x\n", + apic_version[boot_cpu_physical_apicid], cpu, version); } + + physid_set(apicid, phys_cpu_present_map); if (apicid > max_physical_apicid) max_physical_apicid = apicid; @@ -1967,7 +2002,10 @@ void __cpuinit generic_processor_info(int apicid, int version) early_per_cpu(x86_cpu_to_apicid, cpu) = apicid; early_per_cpu(x86_bios_cpu_apicid, cpu) = apicid; #endif - +#ifdef CONFIG_X86_32 + early_per_cpu(x86_cpu_to_logical_apicid, cpu) = + apic->x86_32_early_logical_apicid(cpu); +#endif set_cpu_possible(cpu, true); set_cpu_present(cpu, true); } @@ -1987,17 +2025,6 @@ void default_init_apic_ldr(void) apic_write(APIC_LDR, val); } -#ifdef CONFIG_X86_32 -int default_apicid_to_node(int logical_apicid) -{ -#ifdef CONFIG_SMP - return apicid_2_node[hard_smp_processor_id()]; -#else - return 0; -#endif -} -#endif - /* * Power management */ @@ -2026,7 +2053,7 @@ static struct { unsigned int apic_thmr; } apic_pm_state; -static int lapic_suspend(struct sys_device *dev, pm_message_t state) +static int lapic_suspend(void) { unsigned long flags; int maxlvt; @@ -2064,23 +2091,21 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state) return 0; } -static int lapic_resume(struct sys_device *dev) +static void lapic_resume(void) { unsigned int l, h; unsigned long flags; - int maxlvt; - int ret = 0; + int maxlvt, ret; struct IO_APIC_route_entry **ioapic_entries = NULL; if (!apic_pm_state.active) - return 0; + return; local_irq_save(flags); if (intr_remapping_enabled) { ioapic_entries = alloc_ioapic_entries(); if (!ioapic_entries) { WARN(1, "Alloc ioapic_entries in lapic resume failed."); - ret = -ENOMEM; goto restore; } @@ -2142,8 +2167,6 @@ static int lapic_resume(struct sys_device *dev) } restore: local_irq_restore(flags); - - return ret; } /* @@ -2151,17 +2174,11 @@ restore: * are needed on every CPU up until machine_halt/restart/poweroff. */ -static struct sysdev_class lapic_sysclass = { - .name = "lapic", +static struct syscore_ops lapic_syscore_ops = { .resume = lapic_resume, .suspend = lapic_suspend, }; -static struct sys_device device_lapic = { - .id = 0, - .cls = &lapic_sysclass, -}; - static void __cpuinit apic_pm_activate(void) { apic_pm_state.active = 1; @@ -2169,16 +2186,11 @@ static void __cpuinit apic_pm_activate(void) static int __init init_lapic_sysfs(void) { - int error; - - if (!cpu_has_apic) - return 0; /* XXX: remove suspend/resume procs if !apic_pm_state.active? */ + if (cpu_has_apic) + register_syscore_ops(&lapic_syscore_ops); - error = sysdev_class_register(&lapic_sysclass); - if (!error) - error = sysdev_register(&device_lapic); - return error; + return 0; } /* local apic needs to resume before other devices access its registers. */ diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 09d3b17ce0c..5652d31fe10 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -185,8 +185,6 @@ struct apic apic_flat = { .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, .multi_timer_check = NULL, - .apicid_to_node = NULL, - .cpu_to_logical_apicid = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, .apicid_to_cpu_present = NULL, .setup_portio_remap = NULL, @@ -337,8 +335,6 @@ struct apic apic_physflat = { .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, .multi_timer_check = NULL, - .apicid_to_node = NULL, - .cpu_to_logical_apicid = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, .apicid_to_cpu_present = NULL, .setup_portio_remap = NULL, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index e31b9ffe25f..775b82bc655 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -54,11 +54,6 @@ static u64 noop_apic_icr_read(void) return 0; } -static int noop_cpu_to_logical_apicid(int cpu) -{ - return 0; -} - static int noop_phys_pkg_id(int cpuid_apic, int index_msb) { return 0; @@ -113,12 +108,6 @@ static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask) cpumask_set_cpu(cpu, retmask); } -int noop_apicid_to_node(int logical_apicid) -{ - /* we're always on node 0 */ - return 0; -} - static u32 noop_apic_read(u32 reg) { WARN_ON_ONCE((cpu_has_apic && !disable_apic)); @@ -153,9 +142,7 @@ struct apic apic_noop = { .ioapic_phys_id_map = default_ioapic_phys_id_map, .setup_apic_routing = NULL, .multi_timer_check = NULL, - .apicid_to_node = noop_apicid_to_node, - .cpu_to_logical_apicid = noop_cpu_to_logical_apicid, .cpu_present_to_apicid = default_cpu_present_to_apicid, .apicid_to_cpu_present = physid_set_mask_of_physid, @@ -197,4 +184,8 @@ struct apic apic_noop = { .icr_write = noop_apic_icr_write, .wait_icr_idle = noop_apic_wait_icr_idle, .safe_wait_icr_idle = noop_safe_apic_wait_icr_idle, + +#ifdef CONFIG_X86_32 + .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid, +#endif }; diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index cb804c5091b..d84ac5a584b 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -45,6 +45,12 @@ static unsigned long bigsmp_check_apicid_present(int bit) return 1; } +static int bigsmp_early_logical_apicid(int cpu) +{ + /* on bigsmp, logical apicid is the same as physical */ + return early_per_cpu(x86_cpu_to_apicid, cpu); +} + static inline unsigned long calculate_ldr(int cpu) { unsigned long val, id; @@ -80,11 +86,6 @@ static void bigsmp_setup_apic_routing(void) nr_ioapics); } -static int bigsmp_apicid_to_node(int logical_apicid) -{ - return apicid_2_node[hard_smp_processor_id()]; -} - static int bigsmp_cpu_present_to_apicid(int mps_cpu) { if (mps_cpu < nr_cpu_ids) @@ -93,14 +94,6 @@ static int bigsmp_cpu_present_to_apicid(int mps_cpu) return BAD_APICID; } -/* Mapping from cpu number to logical apicid */ -static inline int bigsmp_cpu_to_logical_apicid(int cpu) -{ - if (cpu >= nr_cpu_ids) - return BAD_APICID; - return cpu_physical_id(cpu); -} - static void bigsmp_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) { /* For clustered we don't have a good way to do this yet - hack */ @@ -115,7 +108,11 @@ static int bigsmp_check_phys_apicid_present(int phys_apicid) /* As we are using single CPU as destination, pick only one CPU here */ static unsigned int bigsmp_cpu_mask_to_apicid(const struct cpumask *cpumask) { - return bigsmp_cpu_to_logical_apicid(cpumask_first(cpumask)); + int cpu = cpumask_first(cpumask); + + if (cpu < nr_cpu_ids) + return cpu_physical_id(cpu); + return BAD_APICID; } static unsigned int bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask, @@ -129,9 +126,9 @@ static unsigned int bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask, */ for_each_cpu_and(cpu, cpumask, andmask) { if (cpumask_test_cpu(cpu, cpu_online_mask)) - break; + return cpu_physical_id(cpu); } - return bigsmp_cpu_to_logical_apicid(cpu); + return BAD_APICID; } static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb) @@ -219,8 +216,6 @@ struct apic apic_bigsmp = { .ioapic_phys_id_map = bigsmp_ioapic_phys_id_map, .setup_apic_routing = bigsmp_setup_apic_routing, .multi_timer_check = NULL, - .apicid_to_node = bigsmp_apicid_to_node, - .cpu_to_logical_apicid = bigsmp_cpu_to_logical_apicid, .cpu_present_to_apicid = bigsmp_cpu_present_to_apicid, .apicid_to_cpu_present = physid_set_mask_of_physid, .setup_portio_remap = NULL, @@ -256,4 +251,6 @@ struct apic apic_bigsmp = { .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, + + .x86_32_early_logical_apicid = bigsmp_early_logical_apicid, }; diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 8593582d802..70533de5bd2 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -460,6 +460,12 @@ static unsigned long es7000_check_apicid_present(int bit) return physid_isset(bit, phys_cpu_present_map); } +static int es7000_early_logical_apicid(int cpu) +{ + /* on es7000, logical apicid is the same as physical */ + return early_per_cpu(x86_bios_cpu_apicid, cpu); +} + static unsigned long calculate_ldr(int cpu) { unsigned long id = per_cpu(x86_bios_cpu_apicid, cpu); @@ -504,12 +510,6 @@ static void es7000_setup_apic_routing(void) nr_ioapics, cpumask_bits(es7000_target_cpus())[0]); } -static int es7000_apicid_to_node(int logical_apicid) -{ - return 0; -} - - static int es7000_cpu_present_to_apicid(int mps_cpu) { if (!mps_cpu) @@ -528,18 +528,6 @@ static void es7000_apicid_to_cpu_present(int phys_apicid, physid_mask_t *retmap) ++cpu_id; } -/* Mapping from cpu number to logical apicid */ -static int es7000_cpu_to_logical_apicid(int cpu) -{ -#ifdef CONFIG_SMP - if (cpu >= nr_cpu_ids) - return BAD_APICID; - return cpu_2_logical_apicid[cpu]; -#else - return logical_smp_processor_id(); -#endif -} - static void es7000_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) { /* For clustered we don't have a good way to do this yet - hack */ @@ -561,7 +549,7 @@ static unsigned int es7000_cpu_mask_to_apicid(const struct cpumask *cpumask) * The cpus in the mask must all be on the apic cluster. */ for_each_cpu(cpu, cpumask) { - int new_apicid = es7000_cpu_to_logical_apicid(cpu); + int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu); if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) { WARN(1, "Not a valid mask!"); @@ -578,7 +566,7 @@ static unsigned int es7000_cpu_mask_to_apicid_and(const struct cpumask *inmask, const struct cpumask *andmask) { - int apicid = es7000_cpu_to_logical_apicid(0); + int apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0); cpumask_var_t cpumask; if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC)) @@ -655,8 +643,6 @@ struct apic __refdata apic_es7000_cluster = { .ioapic_phys_id_map = es7000_ioapic_phys_id_map, .setup_apic_routing = es7000_setup_apic_routing, .multi_timer_check = NULL, - .apicid_to_node = es7000_apicid_to_node, - .cpu_to_logical_apicid = es7000_cpu_to_logical_apicid, .cpu_present_to_apicid = es7000_cpu_present_to_apicid, .apicid_to_cpu_present = es7000_apicid_to_cpu_present, .setup_portio_remap = NULL, @@ -695,6 +681,8 @@ struct apic __refdata apic_es7000_cluster = { .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, + + .x86_32_early_logical_apicid = es7000_early_logical_apicid, }; struct apic __refdata apic_es7000 = { @@ -720,8 +708,6 @@ struct apic __refdata apic_es7000 = { .ioapic_phys_id_map = es7000_ioapic_phys_id_map, .setup_apic_routing = es7000_setup_apic_routing, .multi_timer_check = NULL, - .apicid_to_node = es7000_apicid_to_node, - .cpu_to_logical_apicid = es7000_cpu_to_logical_apicid, .cpu_present_to_apicid = es7000_cpu_present_to_apicid, .apicid_to_cpu_present = es7000_apicid_to_cpu_present, .setup_portio_remap = NULL, @@ -758,4 +744,6 @@ struct apic __refdata apic_es7000 = { .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, + + .x86_32_early_logical_apicid = es7000_early_logical_apicid, }; diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index cefd6942f0e..5260fe91bcb 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -16,20 +16,33 @@ #include <linux/kprobes.h> #include <linux/nmi.h> #include <linux/module.h> +#include <linux/delay.h> -/* For reliability, we're prepared to waste bits here. */ -static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; - +#ifdef CONFIG_HARDLOCKUP_DETECTOR u64 hw_nmi_get_sample_period(void) { return (u64)(cpu_khz) * 1000 * 60; } +#endif + +#ifdef arch_trigger_all_cpu_backtrace +/* For reliability, we're prepared to waste bits here. */ +static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; + +/* "in progress" flag of arch_trigger_all_cpu_backtrace */ +static unsigned long backtrace_flag; -#ifdef ARCH_HAS_NMI_WATCHDOG void arch_trigger_all_cpu_backtrace(void) { int i; + if (test_and_set_bit(0, &backtrace_flag)) + /* + * If there is already a trigger_all_cpu_backtrace() in progress + * (backtrace_flag == 1), don't output double cpu dump infos. + */ + return; + cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask); printk(KERN_INFO "sending NMI to all CPUs:\n"); @@ -41,6 +54,9 @@ void arch_trigger_all_cpu_backtrace(void) break; mdelay(1); } + + clear_bit(0, &backtrace_flag); + smp_mb__after_clear_bit(); } static int __kprobes @@ -49,11 +65,10 @@ arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self, { struct die_args *args = __args; struct pt_regs *regs; - int cpu = smp_processor_id(); + int cpu; switch (cmd) { case DIE_NMI: - case DIE_NMI_IPI: break; default: @@ -61,6 +76,7 @@ arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self, } regs = args->regs; + cpu = smp_processor_id(); if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED; @@ -68,7 +84,6 @@ arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self, arch_spin_lock(&lock); printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); show_regs(regs); - dump_stack(); arch_spin_unlock(&lock); cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); return NOTIFY_STOP; @@ -80,7 +95,7 @@ arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self, static __read_mostly struct notifier_block backtrace_notifier = { .notifier_call = arch_trigger_all_cpu_backtrace_handler, .next = NULL, - .priority = 1 + .priority = NMI_LOCAL_LOW_PRIOR, }; static int __init register_trigger_all_cpu_backtrace(void) @@ -90,18 +105,3 @@ static int __init register_trigger_all_cpu_backtrace(void) } early_initcall(register_trigger_all_cpu_backtrace); #endif - -/* STUB calls to mimic old nmi_watchdog behaviour */ -#if defined(CONFIG_X86_LOCAL_APIC) -unsigned int nmi_watchdog = NMI_NONE; -EXPORT_SYMBOL(nmi_watchdog); -void acpi_nmi_enable(void) { return; } -void acpi_nmi_disable(void) { return; } -#endif -atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ -EXPORT_SYMBOL(nmi_active); -int unknown_nmi_panic; -void cpu_nmi_set_wd_enabled(void) { return; } -void stop_apic_nmi_watchdog(void *unused) { return; } -void setup_apic_nmi_watchdog(void *unused) { return; } -int __init check_nmi_watchdog(void) { return 0; } diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 8ae808d110f..45fd33d1fd3 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -30,7 +30,7 @@ #include <linux/compiler.h> #include <linux/acpi.h> #include <linux/module.h> -#include <linux/sysdev.h> +#include <linux/syscore_ops.h> #include <linux/msi.h> #include <linux/htirq.h> #include <linux/freezer.h> @@ -54,7 +54,6 @@ #include <asm/dma.h> #include <asm/timer.h> #include <asm/i8259.h> -#include <asm/nmi.h> #include <asm/msidef.h> #include <asm/hypertransport.h> #include <asm/setup.h> @@ -109,7 +108,10 @@ DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); int skip_ioapic_setup; -void arch_disable_smp_support(void) +/** + * disable_ioapic_support() - disables ioapic support at runtime + */ +void disable_ioapic_support(void) { #ifdef CONFIG_PCI noioapicquirk = 1; @@ -121,11 +123,34 @@ void arch_disable_smp_support(void) static int __init parse_noapic(char *str) { /* disable IO-APIC */ - arch_disable_smp_support(); + disable_ioapic_support(); return 0; } early_param("noapic", parse_noapic); +static int io_apic_setup_irq_pin(unsigned int irq, int node, + struct io_apic_irq_attr *attr); + +/* Will be called in mpparse/acpi/sfi codes for saving IRQ info */ +void mp_save_irq(struct mpc_intsrc *m) +{ + int i; + + apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x," + " IRQ %02x, APIC ID %x, APIC INT %02x\n", + m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbus, + m->srcbusirq, m->dstapic, m->dstirq); + + for (i = 0; i < mp_irq_entries; i++) { + if (!memcmp(&mp_irqs[i], m, sizeof(*m))) + return; + } + + memcpy(&mp_irqs[mp_irq_entries], m, sizeof(*m)); + if (++mp_irq_entries == MAX_IRQ_SOURCES) + panic("Max # of irq sources exceeded!!\n"); +} + struct irq_pin_list { int apic, pin; struct irq_pin_list *next; @@ -136,6 +161,7 @@ static struct irq_pin_list *alloc_irq_pin_list(int node) return kzalloc_node(sizeof(struct irq_pin_list), GFP_KERNEL, node); } + /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ #ifdef CONFIG_SPARSE_IRQ static struct irq_cfg irq_cfgx[NR_IRQS_LEGACY]; @@ -161,7 +187,7 @@ int __init arch_early_irq_init(void) irq_reserve_irqs(0, legacy_pic->nr_legacy_irqs); for (i = 0; i < count; i++) { - set_irq_chip_data(i, &cfg[i]); + irq_set_chip_data(i, &cfg[i]); zalloc_cpumask_var_node(&cfg[i].domain, GFP_KERNEL, node); zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_KERNEL, node); /* @@ -180,7 +206,7 @@ int __init arch_early_irq_init(void) #ifdef CONFIG_SPARSE_IRQ static struct irq_cfg *irq_cfg(unsigned int irq) { - return get_irq_chip_data(irq); + return irq_get_chip_data(irq); } static struct irq_cfg *alloc_irq_cfg(unsigned int irq, int node) @@ -206,7 +232,7 @@ static void free_irq_cfg(unsigned int at, struct irq_cfg *cfg) { if (!cfg) return; - set_irq_chip_data(at, NULL); + irq_set_chip_data(at, NULL); free_cpumask_var(cfg->domain); free_cpumask_var(cfg->old_domain); kfree(cfg); @@ -236,14 +262,14 @@ static struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node) if (res < 0) { if (res != -EEXIST) return NULL; - cfg = get_irq_chip_data(at); + cfg = irq_get_chip_data(at); if (cfg) return cfg; } cfg = alloc_irq_cfg(at, node); if (cfg) - set_irq_chip_data(at, cfg); + irq_set_chip_data(at, cfg); else irq_free_desc(at); return cfg; @@ -798,7 +824,7 @@ static int EISA_ELCR(unsigned int irq) #define default_MCA_trigger(idx) (1) #define default_MCA_polarity(idx) default_ISA_polarity(idx) -static int MPBIOS_polarity(int idx) +static int irq_polarity(int idx) { int bus = mp_irqs[idx].srcbus; int polarity; @@ -840,7 +866,7 @@ static int MPBIOS_polarity(int idx) return polarity; } -static int MPBIOS_trigger(int idx) +static int irq_trigger(int idx) { int bus = mp_irqs[idx].srcbus; int trigger; @@ -912,16 +938,6 @@ static int MPBIOS_trigger(int idx) return trigger; } -static inline int irq_polarity(int idx) -{ - return MPBIOS_polarity(idx); -} - -static inline int irq_trigger(int idx) -{ - return MPBIOS_trigger(idx); -} - static int pin_2_irq(int idx, int apic, int pin) { int irq; @@ -1169,7 +1185,7 @@ void __setup_vector_irq(int cpu) raw_spin_lock(&vector_lock); /* Mark the inuse vectors */ for_each_active_irq(irq) { - cfg = get_irq_chip_data(irq); + cfg = irq_get_chip_data(irq); if (!cfg) continue; /* @@ -1200,10 +1216,6 @@ void __setup_vector_irq(int cpu) static struct irq_chip ioapic_chip; static struct irq_chip ir_ioapic_chip; -#define IOAPIC_AUTO -1 -#define IOAPIC_EDGE 0 -#define IOAPIC_LEVEL 1 - #ifdef CONFIG_X86_32 static inline int IO_APIC_irq_trigger(int irq) { @@ -1228,35 +1240,31 @@ static inline int IO_APIC_irq_trigger(int irq) } #endif -static void ioapic_register_intr(unsigned int irq, unsigned long trigger) +static void ioapic_register_intr(unsigned int irq, struct irq_cfg *cfg, + unsigned long trigger) { + struct irq_chip *chip = &ioapic_chip; + irq_flow_handler_t hdl; + bool fasteoi; if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || - trigger == IOAPIC_LEVEL) + trigger == IOAPIC_LEVEL) { irq_set_status_flags(irq, IRQ_LEVEL); - else + fasteoi = true; + } else { irq_clear_status_flags(irq, IRQ_LEVEL); + fasteoi = false; + } - if (irq_remapped(get_irq_chip_data(irq))) { + if (irq_remapped(cfg)) { irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); - if (trigger) - set_irq_chip_and_handler_name(irq, &ir_ioapic_chip, - handle_fasteoi_irq, - "fasteoi"); - else - set_irq_chip_and_handler_name(irq, &ir_ioapic_chip, - handle_edge_irq, "edge"); - return; + chip = &ir_ioapic_chip; + fasteoi = trigger != 0; } - if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || - trigger == IOAPIC_LEVEL) - set_irq_chip_and_handler_name(irq, &ioapic_chip, - handle_fasteoi_irq, - "fasteoi"); - else - set_irq_chip_and_handler_name(irq, &ioapic_chip, - handle_edge_irq, "edge"); + hdl = fasteoi ? handle_fasteoi_irq : handle_edge_irq; + irq_set_chip_and_handler_name(irq, chip, hdl, + fasteoi ? "fasteoi" : "edge"); } static int setup_ioapic_entry(int apic_id, int irq, @@ -1354,7 +1362,7 @@ static void setup_ioapic_irq(int apic_id, int pin, unsigned int irq, return; } - ioapic_register_intr(irq, trigger); + ioapic_register_intr(irq, cfg, trigger); if (irq < legacy_pic->nr_legacy_irqs) legacy_pic->mask(irq); @@ -1365,33 +1373,26 @@ static struct { DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1); } mp_ioapic_routing[MAX_IO_APICS]; -static void __init setup_IO_APIC_irqs(void) +static bool __init io_apic_pin_not_connected(int idx, int apic_id, int pin) { - int apic_id, pin, idx, irq, notcon = 0; - int node = cpu_to_node(0); - struct irq_cfg *cfg; + if (idx != -1) + return false; - apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); + apic_printk(APIC_VERBOSE, KERN_DEBUG " apic %d pin %d not connected\n", + mp_ioapics[apic_id].apicid, pin); + return true; +} + +static void __init __io_apic_setup_irqs(unsigned int apic_id) +{ + int idx, node = cpu_to_node(0); + struct io_apic_irq_attr attr; + unsigned int pin, irq; - for (apic_id = 0; apic_id < nr_ioapics; apic_id++) for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) { idx = find_irq_entry(apic_id, pin, mp_INT); - if (idx == -1) { - if (!notcon) { - notcon = 1; - apic_printk(APIC_VERBOSE, - KERN_DEBUG " %d-%d", - mp_ioapics[apic_id].apicid, pin); - } else - apic_printk(APIC_VERBOSE, " %d-%d", - mp_ioapics[apic_id].apicid, pin); + if (io_apic_pin_not_connected(idx, apic_id, pin)) continue; - } - if (notcon) { - apic_printk(APIC_VERBOSE, - " (apicid-pin) not connected\n"); - notcon = 0; - } irq = pin_2_irq(idx, apic_id, pin); @@ -1403,25 +1404,24 @@ static void __init setup_IO_APIC_irqs(void) * installed and if it returns 1: */ if (apic->multi_timer_check && - apic->multi_timer_check(apic_id, irq)) + apic->multi_timer_check(apic_id, irq)) continue; - cfg = alloc_irq_and_cfg_at(irq, node); - if (!cfg) - continue; + set_io_apic_irq_attr(&attr, apic_id, pin, irq_trigger(idx), + irq_polarity(idx)); - add_pin_to_irq_node(cfg, node, apic_id, pin); - /* - * don't mark it in pin_programmed, so later acpi could - * set it correctly when irq < 16 - */ - setup_ioapic_irq(apic_id, pin, irq, cfg, irq_trigger(idx), - irq_polarity(idx)); + io_apic_setup_irq_pin(irq, node, &attr); } +} - if (notcon) - apic_printk(APIC_VERBOSE, - " (apicid-pin) not connected\n"); +static void __init setup_IO_APIC_irqs(void) +{ + unsigned int apic_id; + + apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); + + for (apic_id = 0; apic_id < nr_ioapics; apic_id++) + __io_apic_setup_irqs(apic_id); } /* @@ -1432,7 +1432,7 @@ static void __init setup_IO_APIC_irqs(void) void setup_IO_APIC_irq_extra(u32 gsi) { int apic_id = 0, pin, idx, irq, node = cpu_to_node(0); - struct irq_cfg *cfg; + struct io_apic_irq_attr attr; /* * Convert 'gsi' to 'ioapic.pin'. @@ -1452,21 +1452,10 @@ void setup_IO_APIC_irq_extra(u32 gsi) if (apic_id == 0 || irq < NR_IRQS_LEGACY) return; - cfg = alloc_irq_and_cfg_at(irq, node); - if (!cfg) - return; - - add_pin_to_irq_node(cfg, node, apic_id, pin); - - if (test_bit(pin, mp_ioapic_routing[apic_id].pin_programmed)) { - pr_debug("Pin %d-%d already programmed\n", - mp_ioapics[apic_id].apicid, pin); - return; - } - set_bit(pin, mp_ioapic_routing[apic_id].pin_programmed); + set_io_apic_irq_attr(&attr, apic_id, pin, irq_trigger(idx), + irq_polarity(idx)); - setup_ioapic_irq(apic_id, pin, irq, cfg, - irq_trigger(idx), irq_polarity(idx)); + io_apic_setup_irq_pin_once(irq, node, &attr); } /* @@ -1498,7 +1487,8 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic_id, unsigned int pin, * The timer IRQ doesn't have to know that behind the * scene we may have a 8259A-master in AEOI mode ... */ - set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge"); + irq_set_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, + "edge"); /* * Add it to the IO-APIC irq-routing table: @@ -1605,7 +1595,7 @@ __apicdebuginit(void) print_IO_APIC(void) for_each_active_irq(irq) { struct irq_pin_list *entry; - cfg = get_irq_chip_data(irq); + cfg = irq_get_chip_data(irq); if (!cfg) continue; entry = cfg->irq_2_pin; @@ -1896,7 +1886,7 @@ void disable_IO_APIC(void) * * With interrupt-remapping, for now we will use virtual wire A mode, * as virtual wire B is little complex (need to configure both - * IOAPIC RTE aswell as interrupt-remapping table entry). + * IOAPIC RTE as well as interrupt-remapping table entry). * As this gets called during crash dump, keep this simple for now. */ if (ioapic_i8259.pin != -1 && !intr_remapping_enabled) { @@ -1934,8 +1924,7 @@ void disable_IO_APIC(void) * * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999 */ - -void __init setup_ioapic_ids_from_mpc(void) +void __init setup_ioapic_ids_from_mpc_nocheck(void) { union IO_APIC_reg_00 reg_00; physid_mask_t phys_id_present_map; @@ -1944,15 +1933,6 @@ void __init setup_ioapic_ids_from_mpc(void) unsigned char old_id; unsigned long flags; - if (acpi_ioapic) - return; - /* - * Don't check I/O APIC IDs for xAPIC systems. They have - * no meaning without the serial APIC bus. - */ - if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) - || APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) - return; /* * This is broken; anything with a real cpu count has to * circumvent this idiocy regardless. @@ -2006,7 +1986,6 @@ void __init setup_ioapic_ids_from_mpc(void) physids_or(phys_id_present_map, phys_id_present_map, tmp); } - /* * We need to adjust the IRQ routing table * if the ID changed. @@ -2018,9 +1997,12 @@ void __init setup_ioapic_ids_from_mpc(void) = mp_ioapics[apic_id].apicid; /* - * Read the right value from the MPC table and - * write it into the ID register. + * Update the ID register according to the right value + * from the MPC table if they are different. */ + if (mp_ioapics[apic_id].apicid == reg_00.bits.ID) + continue; + apic_printk(APIC_VERBOSE, KERN_INFO "...changing IO-APIC physical APIC ID to %d ...", mp_ioapics[apic_id].apicid); @@ -2042,6 +2024,21 @@ void __init setup_ioapic_ids_from_mpc(void) apic_printk(APIC_VERBOSE, " ok.\n"); } } + +void __init setup_ioapic_ids_from_mpc(void) +{ + + if (acpi_ioapic) + return; + /* + * Don't check I/O APIC IDs for xAPIC systems. They have + * no meaning without the serial APIC bus. + */ + if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + || APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) + return; + setup_ioapic_ids_from_mpc_nocheck(); +} #endif int no_timer_check __initdata; @@ -2302,7 +2299,7 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) unsigned int irr; struct irq_desc *desc; struct irq_cfg *cfg; - irq = __get_cpu_var(vector_irq)[vector]; + irq = __this_cpu_read(vector_irq[vector]); if (irq == -1) continue; @@ -2336,7 +2333,7 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR); goto unlock; } - __get_cpu_var(vector_irq)[vector] = -1; + __this_cpu_write(vector_irq[vector], -1); unlock: raw_spin_unlock(&desc->lock); } @@ -2364,7 +2361,7 @@ static void irq_complete_move(struct irq_cfg *cfg) void irq_force_complete_move(int irq) { - struct irq_cfg *cfg = get_irq_chip_data(irq); + struct irq_cfg *cfg = irq_get_chip_data(irq); if (!cfg) return; @@ -2378,7 +2375,7 @@ static inline void irq_complete_move(struct irq_cfg *cfg) { } static void ack_apic_edge(struct irq_data *data) { irq_complete_move(data->chip_data); - move_native_irq(data->irq); + irq_move_irq(data); ack_APIC_irq(); } @@ -2430,13 +2427,12 @@ static void ack_apic_level(struct irq_data *data) { struct irq_cfg *cfg = data->chip_data; int i, do_unmask_irq = 0, irq = data->irq; - struct irq_desc *desc = irq_to_desc(irq); unsigned long v; irq_complete_move(cfg); #ifdef CONFIG_GENERIC_PENDING_IRQ /* If we are moving the irq we need to mask it */ - if (unlikely(desc->status & IRQ_MOVE_PENDING)) { + if (unlikely(irqd_is_setaffinity_pending(data))) { do_unmask_irq = 1; mask_ioapic(cfg); } @@ -2525,7 +2521,7 @@ static void ack_apic_level(struct irq_data *data) * and you can go talk to the chipset vendor about it. */ if (!io_apic_level_ack_pending(cfg)) - move_masked_irq(irq); + irq_move_masked_irq(data); unmask_ioapic(cfg); } } @@ -2588,7 +2584,7 @@ static inline void init_IO_APIC_traps(void) * 0x80, because int 0x80 is hm, kind of importantish. ;) */ for_each_active_irq(irq) { - cfg = get_irq_chip_data(irq); + cfg = irq_get_chip_data(irq); if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) { /* * Hmm.. We don't have an entry for this, @@ -2599,7 +2595,7 @@ static inline void init_IO_APIC_traps(void) legacy_pic->make_irq(irq); else /* Strange. Oh, well.. */ - set_irq_chip(irq, &no_irq_chip); + irq_set_chip(irq, &no_irq_chip); } } } @@ -2639,28 +2635,10 @@ static struct irq_chip lapic_chip __read_mostly = { static void lapic_register_intr(int irq) { irq_clear_status_flags(irq, IRQ_LEVEL); - set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, + irq_set_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, "edge"); } -static void __init setup_nmi(void) -{ - /* - * Dirty trick to enable the NMI watchdog ... - * We put the 8259A master into AEOI mode and - * unmask on all local APICs LVT0 as NMI. - * - * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire') - * is from Maciej W. Rozycki - so we do not have to EOI from - * the NMI handler or the timer interrupt. - */ - apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ..."); - - enable_NMI_through_LVT0(); - - apic_printk(APIC_VERBOSE, " done.\n"); -} - /* * This looks a bit hackish but it's about the only one way of sending * a few INTA cycles to 8259As and any associated glue logic. ICR does @@ -2741,7 +2719,7 @@ int timer_through_8259 __initdata; */ static inline void __init check_timer(void) { - struct irq_cfg *cfg = get_irq_chip_data(0); + struct irq_cfg *cfg = irq_get_chip_data(0); int node = cpu_to_node(0); int apic1, pin1, apic2, pin2; unsigned long flags; @@ -2766,15 +2744,6 @@ static inline void __init check_timer(void) */ apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); legacy_pic->init(1); -#ifdef CONFIG_X86_32 - { - unsigned int ver; - - ver = apic_read(APIC_LVR); - ver = GET_APIC_VERSION(ver); - timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver)); - } -#endif pin1 = find_isa_irq_pin(0, mp_INT); apic1 = find_isa_irq_apic(0, mp_INT); @@ -2822,10 +2791,6 @@ static inline void __init check_timer(void) unmask_ioapic(cfg); } if (timer_irq_works()) { - if (nmi_watchdog == NMI_IO_APIC) { - setup_nmi(); - legacy_pic->unmask(0); - } if (disable_timer_pin_1 > 0) clear_IO_APIC_pin(0, pin1); goto out; @@ -2851,11 +2816,6 @@ static inline void __init check_timer(void) if (timer_irq_works()) { apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); timer_through_8259 = 1; - if (nmi_watchdog == NMI_IO_APIC) { - legacy_pic->mask(0); - setup_nmi(); - legacy_pic->unmask(0); - } goto out; } /* @@ -2867,15 +2827,6 @@ static inline void __init check_timer(void) apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); } - if (nmi_watchdog == NMI_IO_APIC) { - apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work " - "through the IO-APIC - disabling NMI Watchdog!\n"); - nmi_watchdog = NMI_NONE; - } -#ifdef CONFIG_X86_32 - timer_ack = 0; -#endif - apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...\n"); @@ -2954,7 +2905,7 @@ void __init setup_IO_APIC(void) } /* - * Called after all the initialization is done. If we didnt find any + * Called after all the initialization is done. If we didn't find any * APIC bugs then we can allow the modify fast path */ @@ -2967,89 +2918,84 @@ static int __init io_apic_bug_finalize(void) late_initcall(io_apic_bug_finalize); -struct sysfs_ioapic_data { - struct sys_device dev; - struct IO_APIC_route_entry entry[0]; -}; -static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS]; +static struct IO_APIC_route_entry *ioapic_saved_data[MAX_IO_APICS]; -static int ioapic_suspend(struct sys_device *dev, pm_message_t state) +static void suspend_ioapic(int ioapic_id) { - struct IO_APIC_route_entry *entry; - struct sysfs_ioapic_data *data; + struct IO_APIC_route_entry *saved_data = ioapic_saved_data[ioapic_id]; int i; - data = container_of(dev, struct sysfs_ioapic_data, dev); - entry = data->entry; - for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) - *entry = ioapic_read_entry(dev->id, i); + if (!saved_data) + return; + + for (i = 0; i < nr_ioapic_registers[ioapic_id]; i++) + saved_data[i] = ioapic_read_entry(ioapic_id, i); +} + +static int ioapic_suspend(void) +{ + int ioapic_id; + + for (ioapic_id = 0; ioapic_id < nr_ioapics; ioapic_id++) + suspend_ioapic(ioapic_id); return 0; } -static int ioapic_resume(struct sys_device *dev) +static void resume_ioapic(int ioapic_id) { - struct IO_APIC_route_entry *entry; - struct sysfs_ioapic_data *data; + struct IO_APIC_route_entry *saved_data = ioapic_saved_data[ioapic_id]; unsigned long flags; union IO_APIC_reg_00 reg_00; int i; - data = container_of(dev, struct sysfs_ioapic_data, dev); - entry = data->entry; + if (!saved_data) + return; raw_spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(dev->id, 0); - if (reg_00.bits.ID != mp_ioapics[dev->id].apicid) { - reg_00.bits.ID = mp_ioapics[dev->id].apicid; - io_apic_write(dev->id, 0, reg_00.raw); + reg_00.raw = io_apic_read(ioapic_id, 0); + if (reg_00.bits.ID != mp_ioapics[ioapic_id].apicid) { + reg_00.bits.ID = mp_ioapics[ioapic_id].apicid; + io_apic_write(ioapic_id, 0, reg_00.raw); } raw_spin_unlock_irqrestore(&ioapic_lock, flags); - for (i = 0; i < nr_ioapic_registers[dev->id]; i++) - ioapic_write_entry(dev->id, i, entry[i]); + for (i = 0; i < nr_ioapic_registers[ioapic_id]; i++) + ioapic_write_entry(ioapic_id, i, saved_data[i]); +} - return 0; +static void ioapic_resume(void) +{ + int ioapic_id; + + for (ioapic_id = nr_ioapics - 1; ioapic_id >= 0; ioapic_id--) + resume_ioapic(ioapic_id); } -static struct sysdev_class ioapic_sysdev_class = { - .name = "ioapic", +static struct syscore_ops ioapic_syscore_ops = { .suspend = ioapic_suspend, .resume = ioapic_resume, }; -static int __init ioapic_init_sysfs(void) +static int __init ioapic_init_ops(void) { - struct sys_device * dev; - int i, size, error; + int i; - error = sysdev_class_register(&ioapic_sysdev_class); - if (error) - return error; + for (i = 0; i < nr_ioapics; i++) { + unsigned int size; - for (i = 0; i < nr_ioapics; i++ ) { - size = sizeof(struct sys_device) + nr_ioapic_registers[i] + size = nr_ioapic_registers[i] * sizeof(struct IO_APIC_route_entry); - mp_ioapic_data[i] = kzalloc(size, GFP_KERNEL); - if (!mp_ioapic_data[i]) { - printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); - continue; - } - dev = &mp_ioapic_data[i]->dev; - dev->id = i; - dev->cls = &ioapic_sysdev_class; - error = sysdev_register(dev); - if (error) { - kfree(mp_ioapic_data[i]); - mp_ioapic_data[i] = NULL; - printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); - continue; - } + ioapic_saved_data[i] = kzalloc(size, GFP_KERNEL); + if (!ioapic_saved_data[i]) + pr_err("IOAPIC %d: suspend/resume impossible!\n", i); } + register_syscore_ops(&ioapic_syscore_ops); + return 0; } -device_initcall(ioapic_init_sysfs); +device_initcall(ioapic_init_ops); /* * Dynamic irq allocate and deallocation @@ -3079,7 +3025,7 @@ unsigned int create_irq_nr(unsigned int from, int node) raw_spin_unlock_irqrestore(&vector_lock, flags); if (ret) { - set_irq_chip_data(irq, cfg); + irq_set_chip_data(irq, cfg); irq_clear_status_flags(irq, IRQ_NOREQUEST); } else { free_irq_at(irq, cfg); @@ -3104,12 +3050,12 @@ int create_irq(void) void destroy_irq(unsigned int irq) { - struct irq_cfg *cfg = get_irq_chip_data(irq); + struct irq_cfg *cfg = irq_get_chip_data(irq); unsigned long flags; irq_set_status_flags(irq, IRQ_NOREQUEST|IRQ_NOPROBE); - if (intr_remapping_enabled) + if (irq_remapped(cfg)) free_irte(irq); raw_spin_lock_irqsave(&vector_lock, flags); __clear_irq_vector(irq, cfg); @@ -3138,7 +3084,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus()); - if (irq_remapped(get_irq_chip_data(irq))) { + if (irq_remapped(cfg)) { struct irte irte; int ir_index; u16 sub_handle; @@ -3310,6 +3256,7 @@ static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec) static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) { + struct irq_chip *chip = &msi_chip; struct msi_msg msg; int ret; @@ -3317,21 +3264,22 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) if (ret < 0) return ret; - set_irq_msi(irq, msidesc); + irq_set_msi_desc(irq, msidesc); write_msi_msg(irq, &msg); - if (irq_remapped(get_irq_chip_data(irq))) { + if (irq_remapped(irq_get_chip_data(irq))) { irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); - set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge"); - } else - set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); + chip = &msi_ir_chip; + } + + irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge"); dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq); return 0; } -int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) +int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) { int node, ret, sub_handle, index = 0; unsigned int irq, irq_want; @@ -3389,7 +3337,7 @@ error: return ret; } -void arch_teardown_msi_irq(unsigned int irq) +void native_teardown_msi_irq(unsigned int irq) { destroy_irq(irq); } @@ -3413,6 +3361,7 @@ dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask, msg.data |= MSI_DATA_VECTOR(cfg->vector); msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; msg.address_lo |= MSI_ADDR_DEST_ID(dest); + msg.address_hi = MSI_ADDR_BASE_HI | MSI_ADDR_EXT_DEST_ID(dest); dmar_msi_write(irq, &msg); @@ -3441,8 +3390,8 @@ int arch_setup_dmar_msi(unsigned int irq) if (ret < 0) return ret; dmar_msi_write(irq, &msg); - set_irq_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq, - "edge"); + irq_set_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq, + "edge"); return 0; } #endif @@ -3500,6 +3449,7 @@ static struct irq_chip hpet_msi_type = { int arch_setup_hpet_msi(unsigned int irq, unsigned int id) { + struct irq_chip *chip = &hpet_msi_type; struct msi_msg msg; int ret; @@ -3519,15 +3469,12 @@ int arch_setup_hpet_msi(unsigned int irq, unsigned int id) if (ret < 0) return ret; - hpet_msi_write(get_irq_data(irq), &msg); + hpet_msi_write(irq_get_handler_data(irq), &msg); irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); - if (irq_remapped(get_irq_chip_data(irq))) - set_irq_chip_and_handler_name(irq, &ir_hpet_msi_type, - handle_edge_irq, "edge"); - else - set_irq_chip_and_handler_name(irq, &hpet_msi_type, - handle_edge_irq, "edge"); + if (irq_remapped(irq_get_chip_data(irq))) + chip = &ir_hpet_msi_type; + irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge"); return 0; } #endif @@ -3614,7 +3561,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) write_ht_irq_msg(irq, &msg); - set_irq_chip_and_handler_name(irq, &ht_irq_chip, + irq_set_chip_and_handler_name(irq, &ht_irq_chip, handle_edge_irq, "edge"); dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq); @@ -3623,7 +3570,40 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) } #endif /* CONFIG_HT_IRQ */ -int __init io_apic_get_redir_entries (int ioapic) +static int +io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr) +{ + struct irq_cfg *cfg = alloc_irq_and_cfg_at(irq, node); + int ret; + + if (!cfg) + return -EINVAL; + ret = __add_pin_to_irq_node(cfg, node, attr->ioapic, attr->ioapic_pin); + if (!ret) + setup_ioapic_irq(attr->ioapic, attr->ioapic_pin, irq, cfg, + attr->trigger, attr->polarity); + return ret; +} + +int io_apic_setup_irq_pin_once(unsigned int irq, int node, + struct io_apic_irq_attr *attr) +{ + unsigned int id = attr->ioapic, pin = attr->ioapic_pin; + int ret; + + /* Avoid redundant programming */ + if (test_bit(pin, mp_ioapic_routing[id].pin_programmed)) { + pr_debug("Pin %d-%d already programmed\n", + mp_ioapics[id].apicid, pin); + return 0; + } + ret = io_apic_setup_irq_pin(irq, node, attr); + if (!ret) + set_bit(pin, mp_ioapic_routing[id].pin_programmed); + return ret; +} + +static int __init io_apic_get_redir_entries(int ioapic) { union IO_APIC_reg_01 reg_01; unsigned long flags; @@ -3639,7 +3619,7 @@ int __init io_apic_get_redir_entries (int ioapic) return reg_01.bits.entries + 1; } -void __init probe_nr_irqs_gsi(void) +static void __init probe_nr_irqs_gsi(void) { int nr; @@ -3650,6 +3630,11 @@ void __init probe_nr_irqs_gsi(void) printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi); } +int get_nr_irqs_gsi(void) +{ + return nr_irqs_gsi; +} + #ifdef CONFIG_SPARSE_IRQ int __init arch_probe_nr_irqs(void) { @@ -3672,96 +3657,24 @@ int __init arch_probe_nr_irqs(void) } #endif -static int __io_apic_set_pci_routing(struct device *dev, int irq, - struct io_apic_irq_attr *irq_attr) +int io_apic_set_pci_routing(struct device *dev, int irq, + struct io_apic_irq_attr *irq_attr) { - struct irq_cfg *cfg; int node; - int ioapic, pin; - int trigger, polarity; - ioapic = irq_attr->ioapic; if (!IO_APIC_IRQ(irq)) { apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", - ioapic); + irq_attr->ioapic); return -EINVAL; } - if (dev) - node = dev_to_node(dev); - else - node = cpu_to_node(0); - - cfg = alloc_irq_and_cfg_at(irq, node); - if (!cfg) - return 0; - - pin = irq_attr->ioapic_pin; - trigger = irq_attr->trigger; - polarity = irq_attr->polarity; - - /* - * IRQs < 16 are already in the irq_2_pin[] map - */ - if (irq >= legacy_pic->nr_legacy_irqs) { - if (__add_pin_to_irq_node(cfg, node, ioapic, pin)) { - printk(KERN_INFO "can not add pin %d for irq %d\n", - pin, irq); - return 0; - } - } - - setup_ioapic_irq(ioapic, pin, irq, cfg, trigger, polarity); - - return 0; -} - -int io_apic_set_pci_routing(struct device *dev, int irq, - struct io_apic_irq_attr *irq_attr) -{ - int ioapic, pin; - /* - * Avoid pin reprogramming. PRTs typically include entries - * with redundant pin->gsi mappings (but unique PCI devices); - * we only program the IOAPIC on the first. - */ - ioapic = irq_attr->ioapic; - pin = irq_attr->ioapic_pin; - if (test_bit(pin, mp_ioapic_routing[ioapic].pin_programmed)) { - pr_debug("Pin %d-%d already programmed\n", - mp_ioapics[ioapic].apicid, pin); - return 0; - } - set_bit(pin, mp_ioapic_routing[ioapic].pin_programmed); - - return __io_apic_set_pci_routing(dev, irq, irq_attr); -} + node = dev ? dev_to_node(dev) : cpu_to_node(0); -u8 __init io_apic_unique_id(u8 id) -{ -#ifdef CONFIG_X86_32 - if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && - !APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) - return io_apic_get_unique_id(nr_ioapics, id); - else - return id; -#else - int i; - DECLARE_BITMAP(used, 256); - - bitmap_zero(used, 256); - for (i = 0; i < nr_ioapics; i++) { - struct mpc_ioapic *ia = &mp_ioapics[i]; - __set_bit(ia->apicid, used); - } - if (!test_bit(id, used)) - return id; - return find_first_zero_bit(used, 256); -#endif + return io_apic_setup_irq_pin_once(irq, node, irq_attr); } #ifdef CONFIG_X86_32 -int __init io_apic_get_unique_id(int ioapic, int apic_id) +static int __init io_apic_get_unique_id(int ioapic, int apic_id) { union IO_APIC_reg_00 reg_00; static physid_mask_t apic_id_map = PHYSID_MASK_NONE; @@ -3834,9 +3747,33 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id) return apic_id; } + +static u8 __init io_apic_unique_id(u8 id) +{ + if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && + !APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) + return io_apic_get_unique_id(nr_ioapics, id); + else + return id; +} +#else +static u8 __init io_apic_unique_id(u8 id) +{ + int i; + DECLARE_BITMAP(used, 256); + + bitmap_zero(used, 256); + for (i = 0; i < nr_ioapics; i++) { + struct mpc_ioapic *ia = &mp_ioapics[i]; + __set_bit(ia->apicid, used); + } + if (!test_bit(id, used)) + return id; + return find_first_zero_bit(used, 256); +} #endif -int __init io_apic_get_version(int ioapic) +static int __init io_apic_get_version(int ioapic) { union IO_APIC_reg_01 reg_01; unsigned long flags; @@ -3881,8 +3818,8 @@ int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity) void __init setup_ioapic_dest(void) { int pin, ioapic, irq, irq_entry; - struct irq_desc *desc; const struct cpumask *mask; + struct irq_data *idata; if (skip_ioapic_setup == 1) return; @@ -3897,21 +3834,20 @@ void __init setup_ioapic_dest(void) if ((ioapic > 0) && (irq > 16)) continue; - desc = irq_to_desc(irq); + idata = irq_get_irq_data(irq); /* * Honour affinities which have been set in early boot */ - if (desc->status & - (IRQ_NO_BALANCING | IRQ_AFFINITY_SET)) - mask = desc->irq_data.affinity; + if (!irqd_can_balance(idata) || irqd_affinity_was_set(idata)) + mask = idata->affinity; else mask = apic->target_cpus(); if (intr_remapping_enabled) - ir_ioapic_set_affinity(&desc->irq_data, mask, false); + ir_ioapic_set_affinity(idata, mask, false); else - ioapic_set_affinity(&desc->irq_data, mask, false); + ioapic_set_affinity(idata, mask, false); } } @@ -3951,7 +3887,7 @@ static struct resource * __init ioapic_setup_resources(int nr_ioapics) return res; } -void __init ioapic_init_mappings(void) +void __init ioapic_and_gsi_init(void) { unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; struct resource *ioapic_res; @@ -3989,6 +3925,8 @@ fake_ioapic_page: ioapic_res->end = ioapic_phys + IO_APIC_SLOT_SIZE - 1; ioapic_res++; } + + probe_nr_irqs_gsi(); } void __init ioapic_insert_resources(void) @@ -4013,6 +3951,9 @@ int mp_find_ioapic(u32 gsi) { int i = 0; + if (nr_ioapics == 0) + return -1; + /* Find the IOAPIC that manages this GSI. */ for (i = 0; i < nr_ioapics; i++) { if ((gsi >= mp_gsi_routing[i].gsi_base) @@ -4034,10 +3975,10 @@ int mp_find_ioapic_pin(int ioapic, u32 gsi) return gsi - mp_gsi_routing[ioapic].gsi_base; } -static int bad_ioapic(unsigned long address) +static __init int bad_ioapic(unsigned long address) { if (nr_ioapics >= MAX_IO_APICS) { - printk(KERN_WARNING "WARING: Max # of I/O APICs (%d) exceeded " + printk(KERN_WARNING "WARNING: Max # of I/O APICs (%d) exceeded " "(found %d), skipping\n", MAX_IO_APICS, nr_ioapics); return 1; } @@ -4094,19 +4035,16 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) /* Enable IOAPIC early just for system timer */ void __init pre_init_apic_IRQ0(void) { - struct irq_cfg *cfg; + struct io_apic_irq_attr attr = { 0, 0, 0, 0 }; printk(KERN_INFO "Early APIC setup for system timer0\n"); #ifndef CONFIG_SMP - phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid); + physid_set_mask_of_physid(boot_cpu_physical_apicid, + &phys_cpu_present_map); #endif - /* Make sure the irq descriptor is set up */ - cfg = alloc_irq_and_cfg_at(0, 0); - setup_local_APIC(); - add_pin_to_irq_node(cfg, 0, 0, 0); - set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge"); - - setup_ioapic_irq(0, 0, 0, cfg, 0, 0); + io_apic_setup_irq_pin(0, 0, &attr); + irq_set_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, + "edge"); } diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index 08385e090a6..cce91bf2667 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -56,6 +56,8 @@ void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask, local_irq_restore(flags); } +#ifdef CONFIG_X86_32 + void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, int vector) { @@ -71,8 +73,8 @@ void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, local_irq_save(flags); for_each_cpu(query_cpu, mask) __default_send_IPI_dest_field( - apic->cpu_to_logical_apicid(query_cpu), vector, - apic->dest_logical); + early_per_cpu(x86_cpu_to_logical_apicid, query_cpu), + vector, apic->dest_logical); local_irq_restore(flags); } @@ -90,14 +92,12 @@ void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask, if (query_cpu == this_cpu) continue; __default_send_IPI_dest_field( - apic->cpu_to_logical_apicid(query_cpu), vector, - apic->dest_logical); + early_per_cpu(x86_cpu_to_logical_apicid, query_cpu), + vector, apic->dest_logical); } local_irq_restore(flags); } -#ifdef CONFIG_X86_32 - /* * This is only used on smaller machines. */ diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c deleted file mode 100644 index c90041ccb74..00000000000 --- a/arch/x86/kernel/apic/nmi.c +++ /dev/null @@ -1,567 +0,0 @@ -/* - * NMI watchdog support on APIC systems - * - * Started by Ingo Molnar <mingo@redhat.com> - * - * Fixes: - * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog. - * Mikael Pettersson : Power Management for local APIC NMI watchdog. - * Mikael Pettersson : Pentium 4 support for local APIC NMI watchdog. - * Pavel Machek and - * Mikael Pettersson : PM converted to driver model. Disable/enable API. - */ - -#include <asm/apic.h> - -#include <linux/nmi.h> -#include <linux/mm.h> -#include <linux/delay.h> -#include <linux/interrupt.h> -#include <linux/module.h> -#include <linux/slab.h> -#include <linux/sysdev.h> -#include <linux/sysctl.h> -#include <linux/percpu.h> -#include <linux/kprobes.h> -#include <linux/cpumask.h> -#include <linux/kernel_stat.h> -#include <linux/kdebug.h> -#include <linux/smp.h> - -#include <asm/i8259.h> -#include <asm/io_apic.h> -#include <asm/proto.h> -#include <asm/timer.h> - -#include <asm/mce.h> - -#include <asm/mach_traps.h> - -int unknown_nmi_panic; -int nmi_watchdog_enabled; - -/* For reliability, we're prepared to waste bits here. */ -static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; - -/* nmi_active: - * >0: the lapic NMI watchdog is active, but can be disabled - * <0: the lapic NMI watchdog has not been set up, and cannot - * be enabled - * 0: the lapic NMI watchdog is disabled, but can be enabled - */ -atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ -EXPORT_SYMBOL(nmi_active); - -unsigned int nmi_watchdog = NMI_NONE; -EXPORT_SYMBOL(nmi_watchdog); - -static int panic_on_timeout; - -static unsigned int nmi_hz = HZ; -static DEFINE_PER_CPU(short, wd_enabled); -static int endflag __initdata; - -static inline unsigned int get_nmi_count(int cpu) -{ - return per_cpu(irq_stat, cpu).__nmi_count; -} - -static inline int mce_in_progress(void) -{ -#if defined(CONFIG_X86_MCE) - return atomic_read(&mce_entry) > 0; -#endif - return 0; -} - -/* - * Take the local apic timer and PIT/HPET into account. We don't - * know which one is active, when we have highres/dyntick on - */ -static inline unsigned int get_timer_irqs(int cpu) -{ - return per_cpu(irq_stat, cpu).apic_timer_irqs + - per_cpu(irq_stat, cpu).irq0_irqs; -} - -#ifdef CONFIG_SMP -/* - * The performance counters used by NMI_LOCAL_APIC don't trigger when - * the CPU is idle. To make sure the NMI watchdog really ticks on all - * CPUs during the test make them busy. - */ -static __init void nmi_cpu_busy(void *data) -{ - local_irq_enable_in_hardirq(); - /* - * Intentionally don't use cpu_relax here. This is - * to make sure that the performance counter really ticks, - * even if there is a simulator or similar that catches the - * pause instruction. On a real HT machine this is fine because - * all other CPUs are busy with "useless" delay loops and don't - * care if they get somewhat less cycles. - */ - while (endflag == 0) - mb(); -} -#endif - -static void report_broken_nmi(int cpu, unsigned int *prev_nmi_count) -{ - printk(KERN_CONT "\n"); - - printk(KERN_WARNING - "WARNING: CPU#%d: NMI appears to be stuck (%d->%d)!\n", - cpu, prev_nmi_count[cpu], get_nmi_count(cpu)); - - printk(KERN_WARNING - "Please report this to bugzilla.kernel.org,\n"); - printk(KERN_WARNING - "and attach the output of the 'dmesg' command.\n"); - - per_cpu(wd_enabled, cpu) = 0; - atomic_dec(&nmi_active); -} - -static void __acpi_nmi_disable(void *__unused) -{ - apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); -} - -int __init check_nmi_watchdog(void) -{ - unsigned int *prev_nmi_count; - int cpu; - - if (!nmi_watchdog_active() || !atomic_read(&nmi_active)) - return 0; - - prev_nmi_count = kmalloc(nr_cpu_ids * sizeof(int), GFP_KERNEL); - if (!prev_nmi_count) - goto error; - - printk(KERN_INFO "Testing NMI watchdog ... "); - -#ifdef CONFIG_SMP - if (nmi_watchdog == NMI_LOCAL_APIC) - smp_call_function(nmi_cpu_busy, (void *)&endflag, 0); -#endif - - for_each_possible_cpu(cpu) - prev_nmi_count[cpu] = get_nmi_count(cpu); - local_irq_enable(); - mdelay((20 * 1000) / nmi_hz); /* wait 20 ticks */ - - for_each_online_cpu(cpu) { - if (!per_cpu(wd_enabled, cpu)) - continue; - if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5) - report_broken_nmi(cpu, prev_nmi_count); - } - endflag = 1; - if (!atomic_read(&nmi_active)) { - kfree(prev_nmi_count); - atomic_set(&nmi_active, -1); - goto error; - } - printk("OK.\n"); - - /* - * now that we know it works we can reduce NMI frequency to - * something more reasonable; makes a difference in some configs - */ - if (nmi_watchdog == NMI_LOCAL_APIC) - nmi_hz = lapic_adjust_nmi_hz(1); - - kfree(prev_nmi_count); - return 0; -error: - if (nmi_watchdog == NMI_IO_APIC) { - if (!timer_through_8259) - legacy_pic->mask(0); - on_each_cpu(__acpi_nmi_disable, NULL, 1); - } - -#ifdef CONFIG_X86_32 - timer_ack = 0; -#endif - return -1; -} - -static int __init setup_nmi_watchdog(char *str) -{ - unsigned int nmi; - - if (!strncmp(str, "panic", 5)) { - panic_on_timeout = 1; - str = strchr(str, ','); - if (!str) - return 1; - ++str; - } - - if (!strncmp(str, "lapic", 5)) - nmi_watchdog = NMI_LOCAL_APIC; - else if (!strncmp(str, "ioapic", 6)) - nmi_watchdog = NMI_IO_APIC; - else { - get_option(&str, &nmi); - if (nmi >= NMI_INVALID) - return 0; - nmi_watchdog = nmi; - } - - return 1; -} -__setup("nmi_watchdog=", setup_nmi_watchdog); - -/* - * Suspend/resume support - */ -#ifdef CONFIG_PM - -static int nmi_pm_active; /* nmi_active before suspend */ - -static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state) -{ - /* only CPU0 goes here, other CPUs should be offline */ - nmi_pm_active = atomic_read(&nmi_active); - stop_apic_nmi_watchdog(NULL); - BUG_ON(atomic_read(&nmi_active) != 0); - return 0; -} - -static int lapic_nmi_resume(struct sys_device *dev) -{ - /* only CPU0 goes here, other CPUs should be offline */ - if (nmi_pm_active > 0) { - setup_apic_nmi_watchdog(NULL); - touch_nmi_watchdog(); - } - return 0; -} - -static struct sysdev_class nmi_sysclass = { - .name = "lapic_nmi", - .resume = lapic_nmi_resume, - .suspend = lapic_nmi_suspend, -}; - -static struct sys_device device_lapic_nmi = { - .id = 0, - .cls = &nmi_sysclass, -}; - -static int __init init_lapic_nmi_sysfs(void) -{ - int error; - - /* - * should really be a BUG_ON but b/c this is an - * init call, it just doesn't work. -dcz - */ - if (nmi_watchdog != NMI_LOCAL_APIC) - return 0; - - if (atomic_read(&nmi_active) < 0) - return 0; - - error = sysdev_class_register(&nmi_sysclass); - if (!error) - error = sysdev_register(&device_lapic_nmi); - return error; -} - -/* must come after the local APIC's device_initcall() */ -late_initcall(init_lapic_nmi_sysfs); - -#endif /* CONFIG_PM */ - -static void __acpi_nmi_enable(void *__unused) -{ - apic_write(APIC_LVT0, APIC_DM_NMI); -} - -/* - * Enable timer based NMIs on all CPUs: - */ -void acpi_nmi_enable(void) -{ - if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) - on_each_cpu(__acpi_nmi_enable, NULL, 1); -} - -/* - * Disable timer based NMIs on all CPUs: - */ -void acpi_nmi_disable(void) -{ - if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) - on_each_cpu(__acpi_nmi_disable, NULL, 1); -} - -/* - * This function is called as soon the LAPIC NMI watchdog driver has everything - * in place and it's ready to check if the NMIs belong to the NMI watchdog - */ -void cpu_nmi_set_wd_enabled(void) -{ - __get_cpu_var(wd_enabled) = 1; -} - -void setup_apic_nmi_watchdog(void *unused) -{ - if (__get_cpu_var(wd_enabled)) - return; - - /* cheap hack to support suspend/resume */ - /* if cpu0 is not active neither should the other cpus */ - if (smp_processor_id() != 0 && atomic_read(&nmi_active) <= 0) - return; - - switch (nmi_watchdog) { - case NMI_LOCAL_APIC: - if (lapic_watchdog_init(nmi_hz) < 0) { - __get_cpu_var(wd_enabled) = 0; - return; - } - /* FALL THROUGH */ - case NMI_IO_APIC: - __get_cpu_var(wd_enabled) = 1; - atomic_inc(&nmi_active); - } -} - -void stop_apic_nmi_watchdog(void *unused) -{ - /* only support LOCAL and IO APICs for now */ - if (!nmi_watchdog_active()) - return; - if (__get_cpu_var(wd_enabled) == 0) - return; - if (nmi_watchdog == NMI_LOCAL_APIC) - lapic_watchdog_stop(); - else - __acpi_nmi_disable(NULL); - __get_cpu_var(wd_enabled) = 0; - atomic_dec(&nmi_active); -} - -/* - * the best way to detect whether a CPU has a 'hard lockup' problem - * is to check it's local APIC timer IRQ counts. If they are not - * changing then that CPU has some problem. - * - * as these watchdog NMI IRQs are generated on every CPU, we only - * have to check the current processor. - * - * since NMIs don't listen to _any_ locks, we have to be extremely - * careful not to rely on unsafe variables. The printk might lock - * up though, so we have to break up any console locks first ... - * [when there will be more tty-related locks, break them up here too!] - */ - -static DEFINE_PER_CPU(unsigned, last_irq_sum); -static DEFINE_PER_CPU(long, alert_counter); -static DEFINE_PER_CPU(int, nmi_touch); - -void touch_nmi_watchdog(void) -{ - if (nmi_watchdog_active()) { - unsigned cpu; - - /* - * Tell other CPUs to reset their alert counters. We cannot - * do it ourselves because the alert count increase is not - * atomic. - */ - for_each_present_cpu(cpu) { - if (per_cpu(nmi_touch, cpu) != 1) - per_cpu(nmi_touch, cpu) = 1; - } - } - - /* - * Tickle the softlockup detector too: - */ - touch_softlockup_watchdog(); -} -EXPORT_SYMBOL(touch_nmi_watchdog); - -notrace __kprobes int -nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) -{ - /* - * Since current_thread_info()-> is always on the stack, and we - * always switch the stack NMI-atomically, it's safe to use - * smp_processor_id(). - */ - unsigned int sum; - int touched = 0; - int cpu = smp_processor_id(); - int rc = 0; - - sum = get_timer_irqs(cpu); - - if (__get_cpu_var(nmi_touch)) { - __get_cpu_var(nmi_touch) = 0; - touched = 1; - } - - /* We can be called before check_nmi_watchdog, hence NULL check. */ - if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { - static DEFINE_RAW_SPINLOCK(lock); /* Serialise the printks */ - - raw_spin_lock(&lock); - printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); - show_regs(regs); - dump_stack(); - raw_spin_unlock(&lock); - cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); - - rc = 1; - } - - /* Could check oops_in_progress here too, but it's safer not to */ - if (mce_in_progress()) - touched = 1; - - /* if the none of the timers isn't firing, this cpu isn't doing much */ - if (!touched && __get_cpu_var(last_irq_sum) == sum) { - /* - * Ayiee, looks like this CPU is stuck ... - * wait a few IRQs (5 seconds) before doing the oops ... - */ - __this_cpu_inc(alert_counter); - if (__this_cpu_read(alert_counter) == 5 * nmi_hz) - /* - * die_nmi will return ONLY if NOTIFY_STOP happens.. - */ - die_nmi("BUG: NMI Watchdog detected LOCKUP", - regs, panic_on_timeout); - } else { - __get_cpu_var(last_irq_sum) = sum; - __this_cpu_write(alert_counter, 0); - } - - /* see if the nmi watchdog went off */ - if (!__get_cpu_var(wd_enabled)) - return rc; - switch (nmi_watchdog) { - case NMI_LOCAL_APIC: - rc |= lapic_wd_event(nmi_hz); - break; - case NMI_IO_APIC: - /* - * don't know how to accurately check for this. - * just assume it was a watchdog timer interrupt - * This matches the old behaviour. - */ - rc = 1; - break; - } - return rc; -} - -#ifdef CONFIG_SYSCTL - -static void enable_ioapic_nmi_watchdog_single(void *unused) -{ - __get_cpu_var(wd_enabled) = 1; - atomic_inc(&nmi_active); - __acpi_nmi_enable(NULL); -} - -static void enable_ioapic_nmi_watchdog(void) -{ - on_each_cpu(enable_ioapic_nmi_watchdog_single, NULL, 1); - touch_nmi_watchdog(); -} - -static void disable_ioapic_nmi_watchdog(void) -{ - on_each_cpu(stop_apic_nmi_watchdog, NULL, 1); -} - -static int __init setup_unknown_nmi_panic(char *str) -{ - unknown_nmi_panic = 1; - return 1; -} -__setup("unknown_nmi_panic", setup_unknown_nmi_panic); - -static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) -{ - unsigned char reason = get_nmi_reason(); - char buf[64]; - - sprintf(buf, "NMI received for unknown reason %02x\n", reason); - die_nmi(buf, regs, 1); /* Always panic here */ - return 0; -} - -/* - * proc handler for /proc/sys/kernel/nmi - */ -int proc_nmi_enabled(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) -{ - int old_state; - - nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0; - old_state = nmi_watchdog_enabled; - proc_dointvec(table, write, buffer, length, ppos); - if (!!old_state == !!nmi_watchdog_enabled) - return 0; - - if (atomic_read(&nmi_active) < 0 || !nmi_watchdog_active()) { - printk(KERN_WARNING - "NMI watchdog is permanently disabled\n"); - return -EIO; - } - - if (nmi_watchdog == NMI_LOCAL_APIC) { - if (nmi_watchdog_enabled) - enable_lapic_nmi_watchdog(); - else - disable_lapic_nmi_watchdog(); - } else if (nmi_watchdog == NMI_IO_APIC) { - if (nmi_watchdog_enabled) - enable_ioapic_nmi_watchdog(); - else - disable_ioapic_nmi_watchdog(); - } else { - printk(KERN_WARNING - "NMI watchdog doesn't know what hardware to touch\n"); - return -EIO; - } - return 0; -} - -#endif /* CONFIG_SYSCTL */ - -int do_nmi_callback(struct pt_regs *regs, int cpu) -{ -#ifdef CONFIG_SYSCTL - if (unknown_nmi_panic) - return unknown_nmi_panic_callback(regs, cpu); -#endif - return 0; -} - -void arch_trigger_all_cpu_backtrace(void) -{ - int i; - - cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask); - - printk(KERN_INFO "sending NMI to all CPUs:\n"); - apic->send_IPI_all(NMI_VECTOR); - - /* Wait for up to 10 seconds for all CPUs to do the backtrace */ - for (i = 0; i < 10 * 1000; i++) { - if (cpumask_empty(to_cpumask(backtrace_mask))) - break; - mdelay(1); - } -} diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 960f26ab5c9..30f13319e24 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -48,8 +48,6 @@ #include <asm/e820.h> #include <asm/ipi.h> -#define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT)) - int found_numaq; /* @@ -79,31 +77,20 @@ int quad_local_to_mp_bus_id[NR_CPUS/4][4]; static inline void numaq_register_node(int node, struct sys_cfg_data *scd) { struct eachquadmem *eq = scd->eq + node; + u64 start = (u64)(eq->hi_shrd_mem_start - eq->priv_mem_size) << 20; + u64 end = (u64)(eq->hi_shrd_mem_start + eq->hi_shrd_mem_size) << 20; + int ret; - node_set_online(node); - - /* Convert to pages */ - node_start_pfn[node] = - MB_TO_PAGES(eq->hi_shrd_mem_start - eq->priv_mem_size); - - node_end_pfn[node] = - MB_TO_PAGES(eq->hi_shrd_mem_start + eq->hi_shrd_mem_size); - - memblock_x86_register_active_regions(node, node_start_pfn[node], - node_end_pfn[node]); - - memory_present(node, node_start_pfn[node], node_end_pfn[node]); - - node_remap_size[node] = node_memmap_size_bytes(node, - node_start_pfn[node], - node_end_pfn[node]); + node_set(node, numa_nodes_parsed); + ret = numa_add_memblk(node, start, end); + BUG_ON(ret < 0); } /* * Function: smp_dump_qct() * * Description: gets memory layout from the quad config table. This - * function also updates node_online_map with the nodes (quads) present. + * function also updates numa_nodes_parsed with the nodes (quads) present. */ static void __init smp_dump_qct(void) { @@ -112,7 +99,6 @@ static void __init smp_dump_qct(void) scd = (void *)__va(SYS_CFG_DATA_PRIV_ADDR); - nodes_clear(node_online_map); for_each_node(node) { if (scd->quads_present31_0 & (1 << node)) numaq_register_node(node, scd); @@ -282,14 +268,14 @@ static __init void early_check_numaq(void) } } -int __init get_memcfg_numaq(void) +int __init numaq_numa_init(void) { early_check_numaq(); if (!found_numaq) - return 0; + return -ENOENT; smp_dump_qct(); - return 1; + return 0; } #define NUMAQ_APIC_DFR_VALUE (APIC_DFR_CLUSTER) @@ -373,13 +359,6 @@ static inline void numaq_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask return physids_promote(0xFUL, retmap); } -static inline int numaq_cpu_to_logical_apicid(int cpu) -{ - if (cpu >= nr_cpu_ids) - return BAD_APICID; - return cpu_2_logical_apicid[cpu]; -} - /* * Supporting over 60 cpus on NUMA-Q requires a locality-dependent * cpu to APIC ID relation to properly interact with the intelligent @@ -398,6 +377,15 @@ static inline int numaq_apicid_to_node(int logical_apicid) return logical_apicid >> 4; } +static int numaq_numa_cpu_node(int cpu) +{ + int logical_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu); + + if (logical_apicid != BAD_APICID) + return numaq_apicid_to_node(logical_apicid); + return NUMA_NO_NODE; +} + static void numaq_apicid_to_cpu_present(int logical_apicid, physid_mask_t *retmap) { int node = numaq_apicid_to_node(logical_apicid); @@ -508,8 +496,6 @@ struct apic __refdata apic_numaq = { .ioapic_phys_id_map = numaq_ioapic_phys_id_map, .setup_apic_routing = numaq_setup_apic_routing, .multi_timer_check = numaq_multi_timer_check, - .apicid_to_node = numaq_apicid_to_node, - .cpu_to_logical_apicid = numaq_cpu_to_logical_apicid, .cpu_present_to_apicid = numaq_cpu_present_to_apicid, .apicid_to_cpu_present = numaq_apicid_to_cpu_present, .setup_portio_remap = numaq_setup_portio_remap, @@ -547,4 +533,7 @@ struct apic __refdata apic_numaq = { .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, + + .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid, + .x86_32_numa_cpu_node = numaq_numa_cpu_node, }; diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 99d2fe01608..6541e471fd9 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -77,6 +77,11 @@ void __init default_setup_apic_routing(void) apic->setup_apic_routing(); } +static int default_x86_32_early_logical_apicid(int cpu) +{ + return 1 << cpu; +} + static void setup_apic_flat_routing(void) { #ifdef CONFIG_X86_IO_APIC @@ -130,8 +135,6 @@ struct apic apic_default = { .ioapic_phys_id_map = default_ioapic_phys_id_map, .setup_apic_routing = setup_apic_flat_routing, .multi_timer_check = NULL, - .apicid_to_node = default_apicid_to_node, - .cpu_to_logical_apicid = default_cpu_to_logical_apicid, .cpu_present_to_apicid = default_cpu_present_to_apicid, .apicid_to_cpu_present = physid_set_mask_of_physid, .setup_portio_remap = NULL, @@ -167,6 +170,8 @@ struct apic apic_default = { .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, + + .x86_32_early_logical_apicid = default_x86_32_early_logical_apicid, }; extern struct apic apic_numaq; diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index f9e4e6a5407..d8c4a6feb28 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -79,13 +79,6 @@ void __init default_setup_apic_routing(void) /* need to update phys_pkg_id */ apic->phys_pkg_id = apicid_phys_pkg_id; } - - /* - * Now that apic routing model is selected, configure the - * fault handling for intr remapping. - */ - if (intr_remapping_enabled) - enable_drhd_fault_handling(); } /* Same for both flat and physical. */ diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index 9b419263d90..35bcd7d995a 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -194,11 +194,10 @@ static unsigned long summit_check_apicid_present(int bit) return 1; } -static void summit_init_apic_ldr(void) +static int summit_early_logical_apicid(int cpu) { - unsigned long val, id; int count = 0; - u8 my_id = (u8)hard_smp_processor_id(); + u8 my_id = early_per_cpu(x86_cpu_to_apicid, cpu); u8 my_cluster = APIC_CLUSTER(my_id); #ifdef CONFIG_SMP u8 lid; @@ -206,7 +205,7 @@ static void summit_init_apic_ldr(void) /* Create logical APIC IDs by counting CPUs already in cluster. */ for (count = 0, i = nr_cpu_ids; --i >= 0; ) { - lid = cpu_2_logical_apicid[i]; + lid = early_per_cpu(x86_cpu_to_logical_apicid, i); if (lid != BAD_APICID && APIC_CLUSTER(lid) == my_cluster) ++count; } @@ -214,7 +213,15 @@ static void summit_init_apic_ldr(void) /* We only have a 4 wide bitmap in cluster mode. If a deranged * BIOS puts 5 CPUs in one APIC cluster, we're hosed. */ BUG_ON(count >= XAPIC_DEST_CPUS_SHIFT); - id = my_cluster | (1UL << count); + return my_cluster | (1UL << count); +} + +static void summit_init_apic_ldr(void) +{ + int cpu = smp_processor_id(); + unsigned long id = early_per_cpu(x86_cpu_to_logical_apicid, cpu); + unsigned long val; + apic_write(APIC_DFR, SUMMIT_APIC_DFR_VALUE); val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; val |= SET_APIC_LOGICAL_ID(id); @@ -232,27 +239,6 @@ static void summit_setup_apic_routing(void) nr_ioapics); } -static int summit_apicid_to_node(int logical_apicid) -{ -#ifdef CONFIG_SMP - return apicid_2_node[hard_smp_processor_id()]; -#else - return 0; -#endif -} - -/* Mapping from cpu number to logical apicid */ -static inline int summit_cpu_to_logical_apicid(int cpu) -{ -#ifdef CONFIG_SMP - if (cpu >= nr_cpu_ids) - return BAD_APICID; - return cpu_2_logical_apicid[cpu]; -#else - return logical_smp_processor_id(); -#endif -} - static int summit_cpu_present_to_apicid(int mps_cpu) { if (mps_cpu < nr_cpu_ids) @@ -286,7 +272,7 @@ static unsigned int summit_cpu_mask_to_apicid(const struct cpumask *cpumask) * The cpus in the mask must all be on the apic cluster. */ for_each_cpu(cpu, cpumask) { - int new_apicid = summit_cpu_to_logical_apicid(cpu); + int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu); if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) { printk("%s: Not a valid mask!\n", __func__); @@ -301,7 +287,7 @@ static unsigned int summit_cpu_mask_to_apicid(const struct cpumask *cpumask) static unsigned int summit_cpu_mask_to_apicid_and(const struct cpumask *inmask, const struct cpumask *andmask) { - int apicid = summit_cpu_to_logical_apicid(0); + int apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0); cpumask_var_t cpumask; if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC)) @@ -528,8 +514,6 @@ struct apic apic_summit = { .ioapic_phys_id_map = summit_ioapic_phys_id_map, .setup_apic_routing = summit_setup_apic_routing, .multi_timer_check = NULL, - .apicid_to_node = summit_apicid_to_node, - .cpu_to_logical_apicid = summit_cpu_to_logical_apicid, .cpu_present_to_apicid = summit_cpu_present_to_apicid, .apicid_to_cpu_present = summit_apicid_to_cpu_present, .setup_portio_remap = NULL, @@ -565,4 +549,6 @@ struct apic apic_summit = { .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, + + .x86_32_early_logical_apicid = summit_early_logical_apicid, }; diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index cf69c59f491..90949bbd566 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -206,8 +206,6 @@ struct apic apic_x2apic_cluster = { .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, .multi_timer_check = NULL, - .apicid_to_node = NULL, - .cpu_to_logical_apicid = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, .apicid_to_cpu_present = NULL, .setup_portio_remap = NULL, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 8972f38c5ce..c7e6d6645bf 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -195,8 +195,6 @@ struct apic apic_x2apic_phys = { .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, .multi_timer_check = NULL, - .apicid_to_node = NULL, - .cpu_to_logical_apicid = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, .apicid_to_cpu_present = NULL, .setup_portio_remap = NULL, diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index f744f54cb24..7acd2d2ac96 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -5,7 +5,7 @@ * * SGI UV APIC functions (note: not an Intel compatible APIC) * - * Copyright (C) 2007-2009 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2007-2010 Silicon Graphics, Inc. All rights reserved. */ #include <linux/cpumask.h> #include <linux/hardirq.h> @@ -23,6 +23,8 @@ #include <linux/io.h> #include <linux/pci.h> #include <linux/kdebug.h> +#include <linux/delay.h> +#include <linux/crash_dump.h> #include <asm/uv/uv_mmrs.h> #include <asm/uv/uv_hub.h> @@ -34,6 +36,14 @@ #include <asm/ipi.h> #include <asm/smp.h> #include <asm/x86_init.h> +#include <asm/emergency-restart.h> +#include <asm/nmi.h> + +/* BMC sets a bit this MMR non-zero before sending an NMI */ +#define UVH_NMI_MMR UVH_SCRATCH5 +#define UVH_NMI_MMR_CLEAR (UVH_NMI_MMR + 8) +#define UV_NMI_PENDING_MASK (1UL << 63) +DEFINE_PER_CPU(unsigned long, cpu_last_nmi_count); DEFINE_PER_CPU(int, x2apic_extra_bits); @@ -41,10 +51,23 @@ DEFINE_PER_CPU(int, x2apic_extra_bits); static enum uv_system_type uv_system_type; static u64 gru_start_paddr, gru_end_paddr; +static union uvh_apicid uvh_apicid; int uv_min_hub_revision_id; EXPORT_SYMBOL_GPL(uv_min_hub_revision_id); +unsigned int uv_apicid_hibits; +EXPORT_SYMBOL_GPL(uv_apicid_hibits); static DEFINE_SPINLOCK(uv_nmi_lock); +static unsigned long __init uv_early_read_mmr(unsigned long addr) +{ + unsigned long val, *mmr; + + mmr = early_ioremap(UV_LOCAL_MMR_BASE | addr, sizeof(*mmr)); + val = *mmr; + early_iounmap(mmr, sizeof(*mmr)); + return val; +} + static inline bool is_GRU_range(u64 start, u64 end) { return start >= gru_start_paddr && end <= gru_end_paddr; @@ -55,27 +78,51 @@ static bool uv_is_untracked_pat_range(u64 start, u64 end) return is_ISA_range(start, end) || is_GRU_range(start, end); } -static int early_get_nodeid(void) +static int __init early_get_pnodeid(void) { union uvh_node_id_u node_id; - unsigned long *mmr; - - mmr = early_ioremap(UV_LOCAL_MMR_BASE | UVH_NODE_ID, sizeof(*mmr)); - node_id.v = *mmr; - early_iounmap(mmr, sizeof(*mmr)); + union uvh_rh_gam_config_mmr_u m_n_config; + int pnode; /* Currently, all blades have same revision number */ + node_id.v = uv_early_read_mmr(UVH_NODE_ID); + m_n_config.v = uv_early_read_mmr(UVH_RH_GAM_CONFIG_MMR); uv_min_hub_revision_id = node_id.s.revision; - return node_id.s.node_id; + pnode = (node_id.s.node_id >> 1) & ((1 << m_n_config.s.n_skt) - 1); + return pnode; +} + +static void __init early_get_apic_pnode_shift(void) +{ + uvh_apicid.v = uv_early_read_mmr(UVH_APICID); + if (!uvh_apicid.v) + /* + * Old bios, use default value + */ + uvh_apicid.s.pnode_shift = UV_APIC_PNODE_SHIFT; +} + +/* + * Add an extra bit as dictated by bios to the destination apicid of + * interrupts potentially passing through the UV HUB. This prevents + * a deadlock between interrupts and IO port operations. + */ +static void __init uv_set_apicid_hibit(void) +{ + union uvh_lb_target_physical_apic_id_mask_u apicid_mask; + + apicid_mask.v = uv_early_read_mmr(UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK); + uv_apicid_hibits = apicid_mask.s.bit_enables & UV_APICID_HIBIT_MASK; } static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { - int nodeid; + int pnodeid; if (!strcmp(oem_id, "SGI")) { - nodeid = early_get_nodeid(); + pnodeid = early_get_pnodeid(); + early_get_apic_pnode_shift(); x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range; x86_platform.nmi_init = uv_nmi_init; if (!strcmp(oem_table_id, "UVL")) @@ -83,9 +130,10 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) else if (!strcmp(oem_table_id, "UVX")) uv_system_type = UV_X2APIC; else if (!strcmp(oem_table_id, "UVH")) { - __get_cpu_var(x2apic_extra_bits) = - nodeid << (UV_APIC_PNODE_SHIFT - 1); + __this_cpu_write(x2apic_extra_bits, + pnodeid << uvh_apicid.s.pnode_shift); uv_system_type = UV_NON_UNIQUE_APIC; + uv_set_apicid_hibit(); return 1; } } @@ -139,6 +187,7 @@ static int __cpuinit uv_wakeup_secondary(int phys_apicid, unsigned long start_ri int pnode; pnode = uv_apicid_to_pnode(phys_apicid); + phys_apicid |= uv_apicid_hibits; val = (1UL << UVH_IPI_INT_SEND_SHFT) | (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) | ((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) | @@ -220,7 +269,7 @@ static unsigned int uv_cpu_mask_to_apicid(const struct cpumask *cpumask) int cpu = cpumask_first(cpumask); if ((unsigned)cpu < nr_cpu_ids) - return per_cpu(x86_cpu_to_apicid, cpu); + return per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits; else return BAD_APICID; } @@ -239,7 +288,7 @@ uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask, if (cpumask_test_cpu(cpu, cpu_online_mask)) break; } - return per_cpu(x86_cpu_to_apicid, cpu); + return per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits; } static unsigned int x2apic_get_apic_id(unsigned long x) @@ -247,7 +296,7 @@ static unsigned int x2apic_get_apic_id(unsigned long x) unsigned int id; WARN_ON(preemptible() && num_online_cpus() > 1); - id = x | __get_cpu_var(x2apic_extra_bits); + id = x | __this_cpu_read(x2apic_extra_bits); return id; } @@ -299,8 +348,6 @@ struct apic __refdata apic_x2apic_uv_x = { .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, .multi_timer_check = NULL, - .apicid_to_node = NULL, - .cpu_to_logical_apicid = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, .apicid_to_cpu_present = NULL, .setup_portio_remap = NULL, @@ -339,7 +386,7 @@ struct apic __refdata apic_x2apic_uv_x = { static __cpuinit void set_x2apic_extra_bits(int pnode) { - __get_cpu_var(x2apic_extra_bits) = (pnode << 6); + __this_cpu_write(x2apic_extra_bits, pnode << uvh_apicid.s.pnode_shift); } /* @@ -363,14 +410,14 @@ struct redir_addr { #define DEST_SHIFT UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT static __initdata struct redir_addr redir_addrs[] = { - {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR, UVH_SI_ALIAS0_OVERLAY_CONFIG}, - {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR, UVH_SI_ALIAS1_OVERLAY_CONFIG}, - {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR, UVH_SI_ALIAS2_OVERLAY_CONFIG}, + {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR, UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR}, + {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR, UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR}, + {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR, UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR}, }; static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size) { - union uvh_si_alias0_overlay_config_u alias; + union uvh_rh_gam_alias210_overlay_config_2_mmr_u alias; union uvh_rh_gam_alias210_redirect_config_2_mmr_u redirect; int i; @@ -602,18 +649,46 @@ void __cpuinit uv_cpu_init(void) */ int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data) { - if (reason != DIE_NMI_IPI) + unsigned long real_uv_nmi; + int bid; + + if (reason != DIE_NMIUNKNOWN) return NOTIFY_OK; if (in_crash_kexec) /* do nothing if entering the crash kernel */ return NOTIFY_OK; + + /* + * Each blade has an MMR that indicates when an NMI has been sent + * to cpus on the blade. If an NMI is detected, atomically + * clear the MMR and update a per-blade NMI count used to + * cause each cpu on the blade to notice a new NMI. + */ + bid = uv_numa_blade_id(); + real_uv_nmi = (uv_read_local_mmr(UVH_NMI_MMR) & UV_NMI_PENDING_MASK); + + if (unlikely(real_uv_nmi)) { + spin_lock(&uv_blade_info[bid].nmi_lock); + real_uv_nmi = (uv_read_local_mmr(UVH_NMI_MMR) & UV_NMI_PENDING_MASK); + if (real_uv_nmi) { + uv_blade_info[bid].nmi_count++; + uv_write_local_mmr(UVH_NMI_MMR_CLEAR, UV_NMI_PENDING_MASK); + } + spin_unlock(&uv_blade_info[bid].nmi_lock); + } + + if (likely(__get_cpu_var(cpu_last_nmi_count) == uv_blade_info[bid].nmi_count)) + return NOTIFY_DONE; + + __get_cpu_var(cpu_last_nmi_count) = uv_blade_info[bid].nmi_count; + /* - * Use a lock so only one cpu prints at a time - * to prevent intermixed output. + * Use a lock so only one cpu prints at a time. + * This prevents intermixed output. */ spin_lock(&uv_nmi_lock); - pr_info("NMI stack dump cpu %u:\n", smp_processor_id()); + pr_info("UV NMI stack dump cpu %u:\n", smp_processor_id()); dump_stack(); spin_unlock(&uv_nmi_lock); @@ -621,7 +696,8 @@ int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data) } static struct notifier_block uv_dump_stack_nmi_nb = { - .notifier_call = uv_handle_nmi + .notifier_call = uv_handle_nmi, + .priority = NMI_LOCAL_LOW_PRIOR - 1, }; void uv_register_nmi_notifier(void) @@ -644,28 +720,33 @@ void uv_nmi_init(void) void __init uv_system_init(void) { - union uvh_si_addr_map_config_u m_n_config; + union uvh_rh_gam_config_mmr_u m_n_config; + union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh; union uvh_node_id_u node_id; unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size; - int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val; + int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val, n_io; int gnode_extra, max_pnode = 0; unsigned long mmr_base, present, paddr; - unsigned short pnode_mask; + unsigned short pnode_mask, pnode_io_mask; map_low_mmrs(); - m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG); + m_n_config.v = uv_read_local_mmr(UVH_RH_GAM_CONFIG_MMR ); m_val = m_n_config.s.m_skt; n_val = m_n_config.s.n_skt; + mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR); + n_io = mmioh.s.n_io; mmr_base = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) & ~UV_MMR_ENABLE; pnode_mask = (1 << n_val) - 1; + pnode_io_mask = (1 << n_io) - 1; + node_id.v = uv_read_local_mmr(UVH_NODE_ID); gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1; gnode_upper = ((unsigned long)gnode_extra << m_val); - printk(KERN_DEBUG "UV: N %d, M %d, gnode_upper 0x%lx, gnode_extra 0x%x\n", - n_val, m_val, gnode_upper, gnode_extra); + printk(KERN_INFO "UV: N %d, M %d, N_IO: %d, gnode_upper 0x%lx, gnode_extra 0x%x, pnode_mask 0x%x, pnode_io_mask 0x%x\n", + n_val, m_val, n_io, gnode_upper, gnode_extra, pnode_mask, pnode_io_mask); printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base); @@ -675,8 +756,9 @@ void __init uv_system_init(void) printk(KERN_DEBUG "UV: Found %d blades\n", uv_num_possible_blades()); bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades(); - uv_blade_info = kmalloc(bytes, GFP_KERNEL); + uv_blade_info = kzalloc(bytes, GFP_KERNEL); BUG_ON(!uv_blade_info); + for (blade = 0; blade < uv_num_possible_blades(); blade++) uv_blade_info[blade].memory_nid = -1; @@ -698,10 +780,11 @@ void __init uv_system_init(void) for (j = 0; j < 64; j++) { if (!test_bit(j, &present)) continue; - pnode = (i * 64 + j); + pnode = (i * 64 + j) & pnode_mask; uv_blade_info[blade].pnode = pnode; uv_blade_info[blade].nr_possible_cpus = 0; uv_blade_info[blade].nr_online_cpus = 0; + spin_lock_init(&uv_blade_info[blade].nmi_lock); max_pnode = max(pnode, max_pnode); blade++; } @@ -716,6 +799,11 @@ void __init uv_system_init(void) int apicid = per_cpu(x86_cpu_to_apicid, cpu); nid = cpu_to_node(cpu); + /* + * apic_pnode_shift must be set before calling uv_apicid_to_pnode(); + */ + uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask; + uv_cpu_hub_info(cpu)->apic_pnode_shift = uvh_apicid.s.pnode_shift; pnode = uv_apicid_to_pnode(apicid); blade = boot_pnode_to_blade(pnode); lcpu = uv_blade_info[blade].nr_possible_cpus; @@ -731,7 +819,6 @@ void __init uv_system_init(void) uv_cpu_hub_info(cpu)->numa_blade_id = blade; uv_cpu_hub_info(cpu)->blade_processor_id = lcpu; uv_cpu_hub_info(cpu)->pnode = pnode; - uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask; uv_cpu_hub_info(cpu)->gpa_mask = (1UL << (m_val + n_val)) - 1; uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra; @@ -755,7 +842,7 @@ void __init uv_system_init(void) map_gru_high(max_pnode); map_mmr_high(max_pnode); - map_mmioh_high(max_pnode); + map_mmioh_high(max_pnode & pnode_io_mask); uv_cpu_init(); uv_scir_register_cpu_notifier(); @@ -764,4 +851,11 @@ void __init uv_system_init(void) /* register Legacy VGA I/O redirection handler */ pci_register_set_vga_state(uv_set_vga_state); + + /* + * For a kdump kernel the reset must be BOOT_ACPI, not BOOT_EFI, as + * EFI is not enabled in the kdump kernel. + */ + if (is_kdump_kernel()) + reboot_type = BOOT_ACPI; } diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 0e4f24c2a74..3bfa0223596 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -66,7 +66,7 @@ * 1.5: Fix segment register reloading (in case of bad segments saved * across BIOS call). * Stephen Rothwell - * 1.6: Cope with complier/assembler differences. + * 1.6: Cope with compiler/assembler differences. * Only try to turn off the first display device. * Fix OOPS at power off with no APM BIOS by Jan Echternach * <echter@informatik.uni-rostock.de> @@ -227,6 +227,8 @@ #include <linux/suspend.h> #include <linux/kthread.h> #include <linux/jiffies.h> +#include <linux/acpi.h> +#include <linux/syscore_ops.h> #include <asm/system.h> #include <asm/uaccess.h> @@ -975,20 +977,10 @@ recalc: static void apm_power_off(void) { - unsigned char po_bios_call[] = { - 0xb8, 0x00, 0x10, /* movw $0x1000,ax */ - 0x8e, 0xd0, /* movw ax,ss */ - 0xbc, 0x00, 0xf0, /* movw $0xf000,sp */ - 0xb8, 0x07, 0x53, /* movw $0x5307,ax */ - 0xbb, 0x01, 0x00, /* movw $0x0001,bx */ - 0xb9, 0x03, 0x00, /* movw $0x0003,cx */ - 0xcd, 0x15 /* int $0x15 */ - }; - /* Some bioses don't like being called from CPU != 0 */ if (apm_info.realmode_power_off) { set_cpus_allowed_ptr(current, cpumask_of(0)); - machine_real_restart(po_bios_call, sizeof(po_bios_call)); + machine_real_restart(MRR_APM); } else { (void)set_system_power_state(APM_STATE_OFF); } @@ -1246,7 +1238,7 @@ static int suspend(int vetoable) dpm_suspend_noirq(PMSG_SUSPEND); local_irq_disable(); - sysdev_suspend(PMSG_SUSPEND); + syscore_suspend(); local_irq_enable(); @@ -1264,7 +1256,7 @@ static int suspend(int vetoable) apm_error("suspend", err); err = (err == APM_SUCCESS) ? 0 : -EIO; - sysdev_resume(); + syscore_resume(); local_irq_enable(); dpm_resume_noirq(PMSG_RESUME); @@ -1288,7 +1280,7 @@ static void standby(void) dpm_suspend_noirq(PMSG_SUSPEND); local_irq_disable(); - sysdev_suspend(PMSG_SUSPEND); + syscore_suspend(); local_irq_enable(); err = set_system_power_state(APM_STATE_STANDBY); @@ -1296,7 +1288,7 @@ static void standby(void) apm_error("standby", err); local_irq_disable(); - sysdev_resume(); + syscore_resume(); local_irq_enable(); dpm_resume_noirq(PMSG_RESUME); @@ -2331,12 +2323,11 @@ static int __init apm_init(void) apm_info.disabled = 1; return -ENODEV; } - if (pm_flags & PM_ACPI) { + if (!acpi_disabled) { printk(KERN_NOTICE "apm: overridden by ACPI.\n"); apm_info.disabled = 1; return -ENODEV; } - pm_flags |= PM_APM; /* * Set up the long jump entry point to the APM BIOS, which is called @@ -2428,7 +2419,6 @@ static void __exit apm_exit(void) kthread_stop(kapmd_task); kapmd_task = NULL; } - pm_flags &= ~PM_APM; } module_init(apm_init); diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index cfa82c899f4..4f13fafc526 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -1,5 +1,70 @@ +/* + * Generate definitions needed by assembly language modules. + * This code generates raw asm output which is post-processed to extract + * and format the required data. + */ +#define COMPILE_OFFSETS + +#include <linux/crypto.h> +#include <linux/sched.h> +#include <linux/stddef.h> +#include <linux/hardirq.h> +#include <linux/suspend.h> +#include <linux/kbuild.h> +#include <asm/processor.h> +#include <asm/thread_info.h> +#include <asm/sigframe.h> +#include <asm/bootparam.h> +#include <asm/suspend.h> + +#ifdef CONFIG_XEN +#include <xen/interface/xen.h> +#endif + #ifdef CONFIG_X86_32 # include "asm-offsets_32.c" #else # include "asm-offsets_64.c" #endif + +void common(void) { + BLANK(); + OFFSET(TI_flags, thread_info, flags); + OFFSET(TI_status, thread_info, status); + OFFSET(TI_addr_limit, thread_info, addr_limit); + OFFSET(TI_preempt_count, thread_info, preempt_count); + + BLANK(); + OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); + + BLANK(); + OFFSET(pbe_address, pbe, address); + OFFSET(pbe_orig_address, pbe, orig_address); + OFFSET(pbe_next, pbe, next); + +#ifdef CONFIG_PARAVIRT + BLANK(); + OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled); + OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops); + OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops); + OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable); + OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable); + OFFSET(PV_CPU_iret, pv_cpu_ops, iret); + OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit); + OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0); + OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2); +#endif + +#ifdef CONFIG_XEN + BLANK(); + OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask); + OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending); +#endif + + BLANK(); + OFFSET(BP_scratch, boot_params, scratch); + OFFSET(BP_loadflags, boot_params, hdr.loadflags); + OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); + OFFSET(BP_version, boot_params, hdr.version); + OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment); +} diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 1a4088dda37..c29d631af6f 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -1,26 +1,4 @@ -/* - * Generate definitions needed by assembly language modules. - * This code generates raw asm output which is post-processed - * to extract and format the required data. - */ - -#include <linux/crypto.h> -#include <linux/sched.h> -#include <linux/signal.h> -#include <linux/personality.h> -#include <linux/suspend.h> -#include <linux/kbuild.h> #include <asm/ucontext.h> -#include <asm/sigframe.h> -#include <asm/pgtable.h> -#include <asm/fixmap.h> -#include <asm/processor.h> -#include <asm/thread_info.h> -#include <asm/bootparam.h> -#include <asm/elf.h> -#include <asm/suspend.h> - -#include <xen/interface/xen.h> #include <linux/lguest.h> #include "../../../drivers/lguest/lg.h" @@ -51,21 +29,10 @@ void foo(void) OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id); BLANK(); - OFFSET(TI_task, thread_info, task); - OFFSET(TI_exec_domain, thread_info, exec_domain); - OFFSET(TI_flags, thread_info, flags); - OFFSET(TI_status, thread_info, status); - OFFSET(TI_preempt_count, thread_info, preempt_count); - OFFSET(TI_addr_limit, thread_info, addr_limit); - OFFSET(TI_restart_block, thread_info, restart_block); OFFSET(TI_sysenter_return, thread_info, sysenter_return); OFFSET(TI_cpu, thread_info, cpu); BLANK(); - OFFSET(GDS_size, desc_ptr, size); - OFFSET(GDS_address, desc_ptr, address); - BLANK(); - OFFSET(PT_EBX, pt_regs, bx); OFFSET(PT_ECX, pt_regs, cx); OFFSET(PT_EDX, pt_regs, dx); @@ -85,42 +52,13 @@ void foo(void) OFFSET(PT_OLDSS, pt_regs, ss); BLANK(); - OFFSET(EXEC_DOMAIN_handler, exec_domain, handler); OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext); BLANK(); - OFFSET(pbe_address, pbe, address); - OFFSET(pbe_orig_address, pbe, orig_address); - OFFSET(pbe_next, pbe, next); - /* Offset from the sysenter stack to tss.sp0 */ DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) - sizeof(struct tss_struct)); - DEFINE(PAGE_SIZE_asm, PAGE_SIZE); - DEFINE(PAGE_SHIFT_asm, PAGE_SHIFT); - DEFINE(THREAD_SIZE_asm, THREAD_SIZE); - - OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); - -#ifdef CONFIG_PARAVIRT - BLANK(); - OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled); - OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops); - OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops); - OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable); - OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable); - OFFSET(PV_CPU_iret, pv_cpu_ops, iret); - OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit); - OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0); -#endif - -#ifdef CONFIG_XEN - BLANK(); - OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask); - OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending); -#endif - #if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) BLANK(); OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); @@ -139,11 +77,4 @@ void foo(void) OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode); OFFSET(LGUEST_PAGES_regs, lguest_pages, regs); #endif - - BLANK(); - OFFSET(BP_scratch, boot_params, scratch); - OFFSET(BP_loadflags, boot_params, hdr.loadflags); - OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); - OFFSET(BP_version, boot_params, hdr.version); - OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment); } diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 4a6aeedcd96..e72a1194af2 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -1,27 +1,4 @@ -/* - * Generate definitions needed by assembly language modules. - * This code generates raw asm output which is post-processed to extract - * and format the required data. - */ -#define COMPILE_OFFSETS - -#include <linux/crypto.h> -#include <linux/sched.h> -#include <linux/stddef.h> -#include <linux/errno.h> -#include <linux/hardirq.h> -#include <linux/suspend.h> -#include <linux/kbuild.h> -#include <asm/processor.h> -#include <asm/segment.h> -#include <asm/thread_info.h> #include <asm/ia32.h> -#include <asm/bootparam.h> -#include <asm/suspend.h> - -#include <xen/interface/xen.h> - -#include <asm/sigframe.h> #define __NO_STUBS 1 #undef __SYSCALL @@ -33,41 +10,19 @@ static char syscalls[] = { int main(void) { -#define ENTRY(entry) DEFINE(tsk_ ## entry, offsetof(struct task_struct, entry)) - ENTRY(state); - ENTRY(flags); - ENTRY(pid); - BLANK(); -#undef ENTRY -#define ENTRY(entry) DEFINE(TI_ ## entry, offsetof(struct thread_info, entry)) - ENTRY(flags); - ENTRY(addr_limit); - ENTRY(preempt_count); - ENTRY(status); -#ifdef CONFIG_IA32_EMULATION - ENTRY(sysenter_return); -#endif - BLANK(); -#undef ENTRY #ifdef CONFIG_PARAVIRT - BLANK(); - OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled); - OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops); - OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops); - OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable); - OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable); OFFSET(PV_IRQ_adjust_exception_frame, pv_irq_ops, adjust_exception_frame); - OFFSET(PV_CPU_iret, pv_cpu_ops, iret); OFFSET(PV_CPU_usergs_sysret32, pv_cpu_ops, usergs_sysret32); OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64); - OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit); OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs); - OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2); + BLANK(); #endif - #ifdef CONFIG_IA32_EMULATION -#define ENTRY(entry) DEFINE(IA32_SIGCONTEXT_ ## entry, offsetof(struct sigcontext_ia32, entry)) + OFFSET(TI_sysenter_return, thread_info, sysenter_return); + BLANK(); + +#define ENTRY(entry) OFFSET(IA32_SIGCONTEXT_ ## entry, sigcontext_ia32, entry) ENTRY(ax); ENTRY(bx); ENTRY(cx); @@ -79,15 +34,12 @@ int main(void) ENTRY(ip); BLANK(); #undef ENTRY - DEFINE(IA32_RT_SIGFRAME_sigcontext, - offsetof (struct rt_sigframe_ia32, uc.uc_mcontext)); + + OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe_ia32, uc.uc_mcontext); BLANK(); #endif - DEFINE(pbe_address, offsetof(struct pbe, address)); - DEFINE(pbe_orig_address, offsetof(struct pbe, orig_address)); - DEFINE(pbe_next, offsetof(struct pbe, next)); - BLANK(); -#define ENTRY(entry) DEFINE(pt_regs_ ## entry, offsetof(struct pt_regs, entry)) + +#define ENTRY(entry) OFFSET(pt_regs_ ## entry, pt_regs, entry) ENTRY(bx); ENTRY(bx); ENTRY(cx); @@ -107,7 +59,8 @@ int main(void) ENTRY(flags); BLANK(); #undef ENTRY -#define ENTRY(entry) DEFINE(saved_context_ ## entry, offsetof(struct saved_context, entry)) + +#define ENTRY(entry) OFFSET(saved_context_ ## entry, saved_context, entry) ENTRY(cr0); ENTRY(cr2); ENTRY(cr3); @@ -115,26 +68,11 @@ int main(void) ENTRY(cr8); BLANK(); #undef ENTRY - DEFINE(TSS_ist, offsetof(struct tss_struct, x86_tss.ist)); - BLANK(); - DEFINE(crypto_tfm_ctx_offset, offsetof(struct crypto_tfm, __crt_ctx)); - BLANK(); - DEFINE(__NR_syscall_max, sizeof(syscalls) - 1); + OFFSET(TSS_ist, tss_struct, x86_tss.ist); BLANK(); - OFFSET(BP_scratch, boot_params, scratch); - OFFSET(BP_loadflags, boot_params, hdr.loadflags); - OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); - OFFSET(BP_version, boot_params, hdr.version); - OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment); - BLANK(); - DEFINE(PAGE_SIZE_asm, PAGE_SIZE); -#ifdef CONFIG_XEN - BLANK(); - OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask); - OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending); -#undef ENTRY -#endif + DEFINE(__NR_syscall_max, sizeof(syscalls) - 1); + return 0; } diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c deleted file mode 100644 index 8bc57baaa9a..00000000000 --- a/arch/x86/kernel/bios_uv.c +++ /dev/null @@ -1,215 +0,0 @@ -/* - * BIOS run time interface routines. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - * Copyright (c) 2008-2009 Silicon Graphics, Inc. All Rights Reserved. - * Copyright (c) Russ Anderson <rja@sgi.com> - */ - -#include <linux/efi.h> -#include <asm/efi.h> -#include <linux/io.h> -#include <asm/uv/bios.h> -#include <asm/uv/uv_hub.h> - -static struct uv_systab uv_systab; - -s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5) -{ - struct uv_systab *tab = &uv_systab; - s64 ret; - - if (!tab->function) - /* - * BIOS does not support UV systab - */ - return BIOS_STATUS_UNIMPLEMENTED; - - ret = efi_call6((void *)__va(tab->function), (u64)which, - a1, a2, a3, a4, a5); - return ret; -} -EXPORT_SYMBOL_GPL(uv_bios_call); - -s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, - u64 a4, u64 a5) -{ - unsigned long bios_flags; - s64 ret; - - local_irq_save(bios_flags); - ret = uv_bios_call(which, a1, a2, a3, a4, a5); - local_irq_restore(bios_flags); - - return ret; -} - -s64 uv_bios_call_reentrant(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, - u64 a4, u64 a5) -{ - s64 ret; - - preempt_disable(); - ret = uv_bios_call(which, a1, a2, a3, a4, a5); - preempt_enable(); - - return ret; -} - - -long sn_partition_id; -EXPORT_SYMBOL_GPL(sn_partition_id); -long sn_coherency_id; -EXPORT_SYMBOL_GPL(sn_coherency_id); -long sn_region_size; -EXPORT_SYMBOL_GPL(sn_region_size); -long system_serial_number; -EXPORT_SYMBOL_GPL(system_serial_number); -int uv_type; -EXPORT_SYMBOL_GPL(uv_type); - - -s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher, - long *region, long *ssn) -{ - s64 ret; - u64 v0, v1; - union partition_info_u part; - - ret = uv_bios_call_irqsave(UV_BIOS_GET_SN_INFO, fc, - (u64)(&v0), (u64)(&v1), 0, 0); - if (ret != BIOS_STATUS_SUCCESS) - return ret; - - part.val = v0; - if (uvtype) - *uvtype = part.hub_version; - if (partid) - *partid = part.partition_id; - if (coher) - *coher = part.coherence_id; - if (region) - *region = part.region_size; - if (ssn) - *ssn = v1; - return ret; -} -EXPORT_SYMBOL_GPL(uv_bios_get_sn_info); - -int -uv_bios_mq_watchlist_alloc(unsigned long addr, unsigned int mq_size, - unsigned long *intr_mmr_offset) -{ - u64 watchlist; - s64 ret; - - /* - * bios returns watchlist number or negative error number. - */ - ret = (int)uv_bios_call_irqsave(UV_BIOS_WATCHLIST_ALLOC, addr, - mq_size, (u64)intr_mmr_offset, - (u64)&watchlist, 0); - if (ret < BIOS_STATUS_SUCCESS) - return ret; - - return watchlist; -} -EXPORT_SYMBOL_GPL(uv_bios_mq_watchlist_alloc); - -int -uv_bios_mq_watchlist_free(int blade, int watchlist_num) -{ - return (int)uv_bios_call_irqsave(UV_BIOS_WATCHLIST_FREE, - blade, watchlist_num, 0, 0, 0); -} -EXPORT_SYMBOL_GPL(uv_bios_mq_watchlist_free); - -s64 -uv_bios_change_memprotect(u64 paddr, u64 len, enum uv_memprotect perms) -{ - return uv_bios_call_irqsave(UV_BIOS_MEMPROTECT, paddr, len, - perms, 0, 0); -} -EXPORT_SYMBOL_GPL(uv_bios_change_memprotect); - -s64 -uv_bios_reserved_page_pa(u64 buf, u64 *cookie, u64 *addr, u64 *len) -{ - s64 ret; - - ret = uv_bios_call_irqsave(UV_BIOS_GET_PARTITION_ADDR, (u64)cookie, - (u64)addr, buf, (u64)len, 0); - return ret; -} -EXPORT_SYMBOL_GPL(uv_bios_reserved_page_pa); - -s64 uv_bios_freq_base(u64 clock_type, u64 *ticks_per_second) -{ - return uv_bios_call(UV_BIOS_FREQ_BASE, clock_type, - (u64)ticks_per_second, 0, 0, 0); -} -EXPORT_SYMBOL_GPL(uv_bios_freq_base); - -/* - * uv_bios_set_legacy_vga_target - Set Legacy VGA I/O Target - * @decode: true to enable target, false to disable target - * @domain: PCI domain number - * @bus: PCI bus number - * - * Returns: - * 0: Success - * -EINVAL: Invalid domain or bus number - * -ENOSYS: Capability not available - * -EBUSY: Legacy VGA I/O cannot be retargeted at this time - */ -int uv_bios_set_legacy_vga_target(bool decode, int domain, int bus) -{ - return uv_bios_call(UV_BIOS_SET_LEGACY_VGA_TARGET, - (u64)decode, (u64)domain, (u64)bus, 0, 0); -} -EXPORT_SYMBOL_GPL(uv_bios_set_legacy_vga_target); - - -#ifdef CONFIG_EFI -void uv_bios_init(void) -{ - struct uv_systab *tab; - - if ((efi.uv_systab == EFI_INVALID_TABLE_ADDR) || - (efi.uv_systab == (unsigned long)NULL)) { - printk(KERN_CRIT "No EFI UV System Table.\n"); - uv_systab.function = (unsigned long)NULL; - return; - } - - tab = (struct uv_systab *)ioremap(efi.uv_systab, - sizeof(struct uv_systab)); - if (strncmp(tab->signature, "UVST", 4) != 0) - printk(KERN_ERR "bad signature in UV system table!"); - - /* - * Copy table to permanent spot for later use. - */ - memcpy(&uv_systab, tab, sizeof(struct uv_systab)); - iounmap(tab); - - printk(KERN_INFO "EFI UV System Table Revision %d\n", - uv_systab.revision); -} -#else /* !CONFIG_EFI */ - -void uv_bios_init(void) { } -#endif diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c index 13a38917951..452932d3473 100644 --- a/arch/x86/kernel/check.c +++ b/arch/x86/kernel/check.c @@ -106,8 +106,8 @@ void __init setup_bios_corruption_check(void) addr += size; } - printk(KERN_INFO "Scanning %d areas for low memory corruption\n", - num_scan_areas); + if (num_scan_areas) + printk(KERN_INFO "Scanning %d areas for low memory corruption\n", num_scan_areas); } @@ -143,12 +143,12 @@ static void check_corruption(struct work_struct *dummy) { check_for_bios_corruption(); schedule_delayed_work(&bios_check_work, - round_jiffies_relative(corruption_check_period*HZ)); + round_jiffies_relative(corruption_check_period*HZ)); } static int start_periodic_check_for_corruption(void) { - if (!memory_corruption_check || corruption_check_period == 0) + if (!num_scan_areas || !memory_corruption_check || corruption_check_period == 0) return 0; printk(KERN_INFO "Scanning for low memory corruption every %d seconds\n", diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 3f0ebe429a0..6042981d030 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -30,7 +30,6 @@ obj-$(CONFIG_PERF_EVENTS) += perf_event.o obj-$(CONFIG_X86_MCE) += mcheck/ obj-$(CONFIG_MTRR) += mtrr/ -obj-$(CONFIG_CPU_FREQ) += cpufreq/ obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 9e093f8fe78..6f9d1f6063e 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -233,18 +233,22 @@ static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c) } #endif -#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) +#ifdef CONFIG_NUMA +/* + * To workaround broken NUMA config. Read the comment in + * srat_detect_node(). + */ static int __cpuinit nearby_node(int apicid) { int i, node; for (i = apicid - 1; i >= 0; i--) { - node = apicid_to_node[i]; + node = __apicid_to_node[i]; if (node != NUMA_NO_NODE && node_online(node)) return node; } for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) { - node = apicid_to_node[i]; + node = __apicid_to_node[i]; if (node != NUMA_NO_NODE && node_online(node)) return node; } @@ -261,7 +265,7 @@ static int __cpuinit nearby_node(int apicid) #ifdef CONFIG_X86_HT static void __cpuinit amd_get_topology(struct cpuinfo_x86 *c) { - u32 nodes; + u32 nodes, cores_per_cu = 1; u8 node_id; int cpu = smp_processor_id(); @@ -276,6 +280,7 @@ static void __cpuinit amd_get_topology(struct cpuinfo_x86 *c) /* get compute unit information */ smp_num_siblings = ((ebx >> 8) & 3) + 1; c->compute_unit_id = ebx & 0xff; + cores_per_cu += ((ebx >> 8) & 3); } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) { u64 value; @@ -288,15 +293,18 @@ static void __cpuinit amd_get_topology(struct cpuinfo_x86 *c) /* fixup multi-node processor information */ if (nodes > 1) { u32 cores_per_node; + u32 cus_per_node; set_cpu_cap(c, X86_FEATURE_AMD_DCM); cores_per_node = c->x86_max_cores / nodes; + cus_per_node = cores_per_node / cores_per_cu; /* store NodeID, use llc_shared_map to store sibling info */ per_cpu(cpu_llc_id, cpu) = node_id; - /* core id to be in range from 0 to (cores_per_node - 1) */ - c->cpu_core_id = c->cpu_core_id % cores_per_node; + /* core id has to be in the [0 .. cores_per_node - 1] range */ + c->cpu_core_id %= cores_per_node; + c->compute_unit_id %= cus_per_node; } } #endif @@ -334,31 +342,40 @@ EXPORT_SYMBOL_GPL(amd_get_nb_id); static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) { -#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) +#ifdef CONFIG_NUMA int cpu = smp_processor_id(); int node; unsigned apicid = c->apicid; - node = per_cpu(cpu_llc_id, cpu); + node = numa_cpu_node(cpu); + if (node == NUMA_NO_NODE) + node = per_cpu(cpu_llc_id, cpu); - if (apicid_to_node[apicid] != NUMA_NO_NODE) - node = apicid_to_node[apicid]; if (!node_online(node)) { - /* Two possibilities here: - - The CPU is missing memory and no node was created. - In that case try picking one from a nearby CPU - - The APIC IDs differ from the HyperTransport node IDs - which the K8 northbridge parsing fills in. - Assume they are all increased by a constant offset, - but in the same order as the HT nodeids. - If that doesn't result in a usable node fall back to the - path for the previous case. */ - + /* + * Two possibilities here: + * + * - The CPU is missing memory and no node was created. In + * that case try picking one from a nearby CPU. + * + * - The APIC IDs differ from the HyperTransport node IDs + * which the K8 northbridge parsing fills in. Assume + * they are all increased by a constant offset, but in + * the same order as the HT nodeids. If that doesn't + * result in a usable node fall back to the path for the + * previous case. + * + * This workaround operates directly on the mapping between + * APIC ID and NUMA node, assuming certain relationship + * between APIC ID, HT node ID and NUMA topology. As going + * through CPU mapping may alter the outcome, directly + * access __apicid_to_node[]. + */ int ht_nodeid = c->initial_apicid; if (ht_nodeid >= 0 && - apicid_to_node[ht_nodeid] != NUMA_NO_NODE) - node = apicid_to_node[ht_nodeid]; + __apicid_to_node[ht_nodeid] != NUMA_NO_NODE) + node = __apicid_to_node[ht_nodeid]; /* Pick a nearby node */ if (!node_online(node)) node = nearby_node(apicid); @@ -594,6 +611,29 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) } } #endif + + /* As a rule processors have APIC timer running in deep C states */ + if (c->x86 > 0xf && !cpu_has_amd_erratum(amd_erratum_400)) + set_cpu_cap(c, X86_FEATURE_ARAT); + + /* + * Disable GART TLB Walk Errors on Fam10h. We do this here + * because this is always needed when GART is enabled, even in a + * kernel which has no MCE support built in. + */ + if (c->x86 == 0x10) { + /* + * BIOS should disable GartTlbWlk Errors themself. If + * it doesn't do it here as suggested by the BKDG. + * + * Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=33012 + */ + u64 mask; + + rdmsrl(MSR_AMD64_MCx_MASK(4), mask); + mask |= (1 << 10); + wrmsrl(MSR_AMD64_MCx_MASK(4), mask); + } } #ifdef CONFIG_X86_32 @@ -668,7 +708,7 @@ EXPORT_SYMBOL_GPL(amd_erratum_383); bool cpu_has_amd_erratum(const int *erratum) { - struct cpuinfo_x86 *cpu = ¤t_cpu_data; + struct cpuinfo_x86 *cpu = __this_cpu_ptr(&cpu_info); int osvw_id = *erratum++; u32 range; u32 ms; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 4b68bda3093..c8b41623377 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -254,6 +254,25 @@ static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c) } #endif +static int disable_smep __cpuinitdata; +static __init int setup_disable_smep(char *arg) +{ + disable_smep = 1; + return 1; +} +__setup("nosmep", setup_disable_smep); + +static __cpuinit void setup_smep(struct cpuinfo_x86 *c) +{ + if (cpu_has(c, X86_FEATURE_SMEP)) { + if (unlikely(disable_smep)) { + setup_clear_cpu_cap(X86_FEATURE_SMEP); + clear_in_cr4(X86_CR4_SMEP); + } else + set_in_cr4(X86_CR4_SMEP); + } +} + /* * Some CPU features depend on higher CPUID levels, which may not always * be available due to CPUID level capping or broken virtualization @@ -565,8 +584,7 @@ void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx); - if (eax > 0) - c->x86_capability[9] = ebx; + c->x86_capability[9] = ebx; } /* AMD-defined flags: level 0x80000001 */ @@ -668,6 +686,8 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) c->cpu_index = 0; #endif filter_cpuid_features(c, false); + + setup_smep(c); } void __init early_cpu_init(void) @@ -675,7 +695,7 @@ void __init early_cpu_init(void) const struct cpu_dev *const *cdev; int count = 0; -#ifdef PROCESSOR_SELECT +#ifdef CONFIG_PROCESSOR_SELECT printk(KERN_INFO "KERNEL supported cpus:\n"); #endif @@ -687,7 +707,7 @@ void __init early_cpu_init(void) cpu_devs[count] = cpudev; count++; -#ifdef PROCESSOR_SELECT +#ifdef CONFIG_PROCESSOR_SELECT { unsigned int j; @@ -753,6 +773,8 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c) #endif } + setup_smep(c); + get_model_name(c); /* Default name */ detect_nopl(c); @@ -869,7 +891,7 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) select_idle_routine(c); -#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) +#ifdef CONFIG_NUMA numa_add_cpu(smp_processor_id()); #endif } @@ -894,7 +916,6 @@ void __init identify_boot_cpu(void) #else vgetcpu_set_mode(); #endif - init_hw_perf_events(); } void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig deleted file mode 100644 index 870e6cc6ad2..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/Kconfig +++ /dev/null @@ -1,266 +0,0 @@ -# -# CPU Frequency scaling -# - -menu "CPU Frequency scaling" - -source "drivers/cpufreq/Kconfig" - -if CPU_FREQ - -comment "CPUFreq processor drivers" - -config X86_PCC_CPUFREQ - tristate "Processor Clocking Control interface driver" - depends on ACPI && ACPI_PROCESSOR - help - This driver adds support for the PCC interface. - - For details, take a look at: - <file:Documentation/cpu-freq/pcc-cpufreq.txt>. - - To compile this driver as a module, choose M here: the - module will be called pcc-cpufreq. - - If in doubt, say N. - -config X86_ACPI_CPUFREQ - tristate "ACPI Processor P-States driver" - select CPU_FREQ_TABLE - depends on ACPI_PROCESSOR - help - This driver adds a CPUFreq driver which utilizes the ACPI - Processor Performance States. - This driver also supports Intel Enhanced Speedstep. - - To compile this driver as a module, choose M here: the - module will be called acpi-cpufreq. - - For details, take a look at <file:Documentation/cpu-freq/>. - - If in doubt, say N. - -config ELAN_CPUFREQ - tristate "AMD Elan SC400 and SC410" - select CPU_FREQ_TABLE - depends on X86_ELAN - ---help--- - This adds the CPUFreq driver for AMD Elan SC400 and SC410 - processors. - - You need to specify the processor maximum speed as boot - parameter: elanfreq=maxspeed (in kHz) or as module - parameter "max_freq". - - For details, take a look at <file:Documentation/cpu-freq/>. - - If in doubt, say N. - -config SC520_CPUFREQ - tristate "AMD Elan SC520" - select CPU_FREQ_TABLE - depends on X86_ELAN - ---help--- - This adds the CPUFreq driver for AMD Elan SC520 processor. - - For details, take a look at <file:Documentation/cpu-freq/>. - - If in doubt, say N. - - -config X86_POWERNOW_K6 - tristate "AMD Mobile K6-2/K6-3 PowerNow!" - select CPU_FREQ_TABLE - depends on X86_32 - help - This adds the CPUFreq driver for mobile AMD K6-2+ and mobile - AMD K6-3+ processors. - - For details, take a look at <file:Documentation/cpu-freq/>. - - If in doubt, say N. - -config X86_POWERNOW_K7 - tristate "AMD Mobile Athlon/Duron PowerNow!" - select CPU_FREQ_TABLE - depends on X86_32 - help - This adds the CPUFreq driver for mobile AMD K7 mobile processors. - - For details, take a look at <file:Documentation/cpu-freq/>. - - If in doubt, say N. - -config X86_POWERNOW_K7_ACPI - bool - depends on X86_POWERNOW_K7 && ACPI_PROCESSOR - depends on !(X86_POWERNOW_K7 = y && ACPI_PROCESSOR = m) - depends on X86_32 - default y - -config X86_POWERNOW_K8 - tristate "AMD Opteron/Athlon64 PowerNow!" - select CPU_FREQ_TABLE - depends on ACPI && ACPI_PROCESSOR - help - This adds the CPUFreq driver for K8/K10 Opteron/Athlon64 processors. - - To compile this driver as a module, choose M here: the - module will be called powernow-k8. - - For details, take a look at <file:Documentation/cpu-freq/>. - -config X86_GX_SUSPMOD - tristate "Cyrix MediaGX/NatSemi Geode Suspend Modulation" - depends on X86_32 && PCI - help - This add the CPUFreq driver for NatSemi Geode processors which - support suspend modulation. - - For details, take a look at <file:Documentation/cpu-freq/>. - - If in doubt, say N. - -config X86_SPEEDSTEP_CENTRINO - tristate "Intel Enhanced SpeedStep (deprecated)" - select CPU_FREQ_TABLE - select X86_SPEEDSTEP_CENTRINO_TABLE if X86_32 - depends on X86_32 || (X86_64 && ACPI_PROCESSOR) - help - This is deprecated and this functionality is now merged into - acpi_cpufreq (X86_ACPI_CPUFREQ). Use that driver instead of - speedstep_centrino. - This adds the CPUFreq driver for Enhanced SpeedStep enabled - mobile CPUs. This means Intel Pentium M (Centrino) CPUs - or 64bit enabled Intel Xeons. - - To compile this driver as a module, choose M here: the - module will be called speedstep-centrino. - - For details, take a look at <file:Documentation/cpu-freq/>. - - If in doubt, say N. - -config X86_SPEEDSTEP_CENTRINO_TABLE - bool "Built-in tables for Banias CPUs" - depends on X86_32 && X86_SPEEDSTEP_CENTRINO - default y - help - Use built-in tables for Banias CPUs if ACPI encoding - is not available. - - If in doubt, say N. - -config X86_SPEEDSTEP_ICH - tristate "Intel Speedstep on ICH-M chipsets (ioport interface)" - select CPU_FREQ_TABLE - depends on X86_32 - help - This adds the CPUFreq driver for certain mobile Intel Pentium III - (Coppermine), all mobile Intel Pentium III-M (Tualatin) and all - mobile Intel Pentium 4 P4-M on systems which have an Intel ICH2, - ICH3 or ICH4 southbridge. - - For details, take a look at <file:Documentation/cpu-freq/>. - - If in doubt, say N. - -config X86_SPEEDSTEP_SMI - tristate "Intel SpeedStep on 440BX/ZX/MX chipsets (SMI interface)" - select CPU_FREQ_TABLE - depends on X86_32 && EXPERIMENTAL - help - This adds the CPUFreq driver for certain mobile Intel Pentium III - (Coppermine), all mobile Intel Pentium III-M (Tualatin) - on systems which have an Intel 440BX/ZX/MX southbridge. - - For details, take a look at <file:Documentation/cpu-freq/>. - - If in doubt, say N. - -config X86_P4_CLOCKMOD - tristate "Intel Pentium 4 clock modulation" - select CPU_FREQ_TABLE - help - This adds the CPUFreq driver for Intel Pentium 4 / XEON - processors. When enabled it will lower CPU temperature by skipping - clocks. - - This driver should be only used in exceptional - circumstances when very low power is needed because it causes severe - slowdowns and noticeable latencies. Normally Speedstep should be used - instead. - - To compile this driver as a module, choose M here: the - module will be called p4-clockmod. - - For details, take a look at <file:Documentation/cpu-freq/>. - - Unless you are absolutely sure say N. - -config X86_CPUFREQ_NFORCE2 - tristate "nVidia nForce2 FSB changing" - depends on X86_32 && EXPERIMENTAL - help - This adds the CPUFreq driver for FSB changing on nVidia nForce2 - platforms. - - For details, take a look at <file:Documentation/cpu-freq/>. - - If in doubt, say N. - -config X86_LONGRUN - tristate "Transmeta LongRun" - depends on X86_32 - help - This adds the CPUFreq driver for Transmeta Crusoe and Efficeon processors - which support LongRun. - - For details, take a look at <file:Documentation/cpu-freq/>. - - If in doubt, say N. - -config X86_LONGHAUL - tristate "VIA Cyrix III Longhaul" - select CPU_FREQ_TABLE - depends on X86_32 && ACPI_PROCESSOR - help - This adds the CPUFreq driver for VIA Samuel/CyrixIII, - VIA Cyrix Samuel/C3, VIA Cyrix Ezra and VIA Cyrix Ezra-T - processors. - - For details, take a look at <file:Documentation/cpu-freq/>. - - If in doubt, say N. - -config X86_E_POWERSAVER - tristate "VIA C7 Enhanced PowerSaver (DANGEROUS)" - select CPU_FREQ_TABLE - depends on X86_32 && EXPERIMENTAL - help - This adds the CPUFreq driver for VIA C7 processors. However, this driver - does not have any safeguards to prevent operating the CPU out of spec - and is thus considered dangerous. Please use the regular ACPI cpufreq - driver, enabled by CONFIG_X86_ACPI_CPUFREQ. - - If in doubt, say N. - -comment "shared options" - -config X86_SPEEDSTEP_LIB - tristate - default (X86_SPEEDSTEP_ICH || X86_SPEEDSTEP_SMI || X86_P4_CLOCKMOD) - -config X86_SPEEDSTEP_RELAXED_CAP_CHECK - bool "Relaxed speedstep capability checks" - depends on X86_32 && (X86_SPEEDSTEP_SMI || X86_SPEEDSTEP_ICH) - help - Don't perform all checks for a speedstep capable system which would - normally be done. Some ancient or strange systems, though speedstep - capable, don't always indicate that they are speedstep capable. This - option lets the probing code bypass some of those checks if the - parameter "relaxed_check=1" is passed to the module. - -endif # CPU_FREQ - -endmenu diff --git a/arch/x86/kernel/cpu/cpufreq/Makefile b/arch/x86/kernel/cpu/cpufreq/Makefile deleted file mode 100644 index bd54bf67e6f..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/Makefile +++ /dev/null @@ -1,21 +0,0 @@ -# Link order matters. K8 is preferred to ACPI because of firmware bugs in early -# K8 systems. ACPI is preferred to all other hardware-specific drivers. -# speedstep-* is preferred over p4-clockmod. - -obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o mperf.o -obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o mperf.o -obj-$(CONFIG_X86_PCC_CPUFREQ) += pcc-cpufreq.o -obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o -obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o -obj-$(CONFIG_X86_LONGHAUL) += longhaul.o -obj-$(CONFIG_X86_E_POWERSAVER) += e_powersaver.o -obj-$(CONFIG_ELAN_CPUFREQ) += elanfreq.o -obj-$(CONFIG_SC520_CPUFREQ) += sc520_freq.o -obj-$(CONFIG_X86_LONGRUN) += longrun.o -obj-$(CONFIG_X86_GX_SUSPMOD) += gx-suspmod.o -obj-$(CONFIG_X86_SPEEDSTEP_ICH) += speedstep-ich.o -obj-$(CONFIG_X86_SPEEDSTEP_LIB) += speedstep-lib.o -obj-$(CONFIG_X86_SPEEDSTEP_SMI) += speedstep-smi.o -obj-$(CONFIG_X86_SPEEDSTEP_CENTRINO) += speedstep-centrino.o -obj-$(CONFIG_X86_P4_CLOCKMOD) += p4-clockmod.o -obj-$(CONFIG_X86_CPUFREQ_NFORCE2) += cpufreq-nforce2.o diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c deleted file mode 100644 index cd8da247dda..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ /dev/null @@ -1,775 +0,0 @@ -/* - * acpi-cpufreq.c - ACPI Processor P-States Driver - * - * Copyright (C) 2001, 2002 Andy Grover <andrew.grover@intel.com> - * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com> - * Copyright (C) 2002 - 2004 Dominik Brodowski <linux@brodo.de> - * Copyright (C) 2006 Denis Sadykov <denis.m.sadykov@intel.com> - * - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or (at - * your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write to the Free Software Foundation, Inc., - * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. - * - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/smp.h> -#include <linux/sched.h> -#include <linux/cpufreq.h> -#include <linux/compiler.h> -#include <linux/dmi.h> -#include <linux/slab.h> - -#include <linux/acpi.h> -#include <linux/io.h> -#include <linux/delay.h> -#include <linux/uaccess.h> - -#include <acpi/processor.h> - -#include <asm/msr.h> -#include <asm/processor.h> -#include <asm/cpufeature.h> -#include "mperf.h" - -#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ - "acpi-cpufreq", msg) - -MODULE_AUTHOR("Paul Diefenbaugh, Dominik Brodowski"); -MODULE_DESCRIPTION("ACPI Processor P-States Driver"); -MODULE_LICENSE("GPL"); - -enum { - UNDEFINED_CAPABLE = 0, - SYSTEM_INTEL_MSR_CAPABLE, - SYSTEM_IO_CAPABLE, -}; - -#define INTEL_MSR_RANGE (0xffff) - -struct acpi_cpufreq_data { - struct acpi_processor_performance *acpi_data; - struct cpufreq_frequency_table *freq_table; - unsigned int resume; - unsigned int cpu_feature; -}; - -static DEFINE_PER_CPU(struct acpi_cpufreq_data *, acfreq_data); - -/* acpi_perf_data is a pointer to percpu data. */ -static struct acpi_processor_performance __percpu *acpi_perf_data; - -static struct cpufreq_driver acpi_cpufreq_driver; - -static unsigned int acpi_pstate_strict; - -static int check_est_cpu(unsigned int cpuid) -{ - struct cpuinfo_x86 *cpu = &cpu_data(cpuid); - - return cpu_has(cpu, X86_FEATURE_EST); -} - -static unsigned extract_io(u32 value, struct acpi_cpufreq_data *data) -{ - struct acpi_processor_performance *perf; - int i; - - perf = data->acpi_data; - - for (i = 0; i < perf->state_count; i++) { - if (value == perf->states[i].status) - return data->freq_table[i].frequency; - } - return 0; -} - -static unsigned extract_msr(u32 msr, struct acpi_cpufreq_data *data) -{ - int i; - struct acpi_processor_performance *perf; - - msr &= INTEL_MSR_RANGE; - perf = data->acpi_data; - - for (i = 0; data->freq_table[i].frequency != CPUFREQ_TABLE_END; i++) { - if (msr == perf->states[data->freq_table[i].index].status) - return data->freq_table[i].frequency; - } - return data->freq_table[0].frequency; -} - -static unsigned extract_freq(u32 val, struct acpi_cpufreq_data *data) -{ - switch (data->cpu_feature) { - case SYSTEM_INTEL_MSR_CAPABLE: - return extract_msr(val, data); - case SYSTEM_IO_CAPABLE: - return extract_io(val, data); - default: - return 0; - } -} - -struct msr_addr { - u32 reg; -}; - -struct io_addr { - u16 port; - u8 bit_width; -}; - -struct drv_cmd { - unsigned int type; - const struct cpumask *mask; - union { - struct msr_addr msr; - struct io_addr io; - } addr; - u32 val; -}; - -/* Called via smp_call_function_single(), on the target CPU */ -static void do_drv_read(void *_cmd) -{ - struct drv_cmd *cmd = _cmd; - u32 h; - - switch (cmd->type) { - case SYSTEM_INTEL_MSR_CAPABLE: - rdmsr(cmd->addr.msr.reg, cmd->val, h); - break; - case SYSTEM_IO_CAPABLE: - acpi_os_read_port((acpi_io_address)cmd->addr.io.port, - &cmd->val, - (u32)cmd->addr.io.bit_width); - break; - default: - break; - } -} - -/* Called via smp_call_function_many(), on the target CPUs */ -static void do_drv_write(void *_cmd) -{ - struct drv_cmd *cmd = _cmd; - u32 lo, hi; - - switch (cmd->type) { - case SYSTEM_INTEL_MSR_CAPABLE: - rdmsr(cmd->addr.msr.reg, lo, hi); - lo = (lo & ~INTEL_MSR_RANGE) | (cmd->val & INTEL_MSR_RANGE); - wrmsr(cmd->addr.msr.reg, lo, hi); - break; - case SYSTEM_IO_CAPABLE: - acpi_os_write_port((acpi_io_address)cmd->addr.io.port, - cmd->val, - (u32)cmd->addr.io.bit_width); - break; - default: - break; - } -} - -static void drv_read(struct drv_cmd *cmd) -{ - int err; - cmd->val = 0; - - err = smp_call_function_any(cmd->mask, do_drv_read, cmd, 1); - WARN_ON_ONCE(err); /* smp_call_function_any() was buggy? */ -} - -static void drv_write(struct drv_cmd *cmd) -{ - int this_cpu; - - this_cpu = get_cpu(); - if (cpumask_test_cpu(this_cpu, cmd->mask)) - do_drv_write(cmd); - smp_call_function_many(cmd->mask, do_drv_write, cmd, 1); - put_cpu(); -} - -static u32 get_cur_val(const struct cpumask *mask) -{ - struct acpi_processor_performance *perf; - struct drv_cmd cmd; - - if (unlikely(cpumask_empty(mask))) - return 0; - - switch (per_cpu(acfreq_data, cpumask_first(mask))->cpu_feature) { - case SYSTEM_INTEL_MSR_CAPABLE: - cmd.type = SYSTEM_INTEL_MSR_CAPABLE; - cmd.addr.msr.reg = MSR_IA32_PERF_STATUS; - break; - case SYSTEM_IO_CAPABLE: - cmd.type = SYSTEM_IO_CAPABLE; - perf = per_cpu(acfreq_data, cpumask_first(mask))->acpi_data; - cmd.addr.io.port = perf->control_register.address; - cmd.addr.io.bit_width = perf->control_register.bit_width; - break; - default: - return 0; - } - - cmd.mask = mask; - drv_read(&cmd); - - dprintk("get_cur_val = %u\n", cmd.val); - - return cmd.val; -} - -static unsigned int get_cur_freq_on_cpu(unsigned int cpu) -{ - struct acpi_cpufreq_data *data = per_cpu(acfreq_data, cpu); - unsigned int freq; - unsigned int cached_freq; - - dprintk("get_cur_freq_on_cpu (%d)\n", cpu); - - if (unlikely(data == NULL || - data->acpi_data == NULL || data->freq_table == NULL)) { - return 0; - } - - cached_freq = data->freq_table[data->acpi_data->state].frequency; - freq = extract_freq(get_cur_val(cpumask_of(cpu)), data); - if (freq != cached_freq) { - /* - * The dreaded BIOS frequency change behind our back. - * Force set the frequency on next target call. - */ - data->resume = 1; - } - - dprintk("cur freq = %u\n", freq); - - return freq; -} - -static unsigned int check_freqs(const struct cpumask *mask, unsigned int freq, - struct acpi_cpufreq_data *data) -{ - unsigned int cur_freq; - unsigned int i; - - for (i = 0; i < 100; i++) { - cur_freq = extract_freq(get_cur_val(mask), data); - if (cur_freq == freq) - return 1; - udelay(10); - } - return 0; -} - -static int acpi_cpufreq_target(struct cpufreq_policy *policy, - unsigned int target_freq, unsigned int relation) -{ - struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu); - struct acpi_processor_performance *perf; - struct cpufreq_freqs freqs; - struct drv_cmd cmd; - unsigned int next_state = 0; /* Index into freq_table */ - unsigned int next_perf_state = 0; /* Index into perf table */ - unsigned int i; - int result = 0; - - dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu); - - if (unlikely(data == NULL || - data->acpi_data == NULL || data->freq_table == NULL)) { - return -ENODEV; - } - - perf = data->acpi_data; - result = cpufreq_frequency_table_target(policy, - data->freq_table, - target_freq, - relation, &next_state); - if (unlikely(result)) { - result = -ENODEV; - goto out; - } - - next_perf_state = data->freq_table[next_state].index; - if (perf->state == next_perf_state) { - if (unlikely(data->resume)) { - dprintk("Called after resume, resetting to P%d\n", - next_perf_state); - data->resume = 0; - } else { - dprintk("Already at target state (P%d)\n", - next_perf_state); - goto out; - } - } - - switch (data->cpu_feature) { - case SYSTEM_INTEL_MSR_CAPABLE: - cmd.type = SYSTEM_INTEL_MSR_CAPABLE; - cmd.addr.msr.reg = MSR_IA32_PERF_CTL; - cmd.val = (u32) perf->states[next_perf_state].control; - break; - case SYSTEM_IO_CAPABLE: - cmd.type = SYSTEM_IO_CAPABLE; - cmd.addr.io.port = perf->control_register.address; - cmd.addr.io.bit_width = perf->control_register.bit_width; - cmd.val = (u32) perf->states[next_perf_state].control; - break; - default: - result = -ENODEV; - goto out; - } - - /* cpufreq holds the hotplug lock, so we are safe from here on */ - if (policy->shared_type != CPUFREQ_SHARED_TYPE_ANY) - cmd.mask = policy->cpus; - else - cmd.mask = cpumask_of(policy->cpu); - - freqs.old = perf->states[perf->state].core_frequency * 1000; - freqs.new = data->freq_table[next_state].frequency; - for_each_cpu(i, policy->cpus) { - freqs.cpu = i; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - } - - drv_write(&cmd); - - if (acpi_pstate_strict) { - if (!check_freqs(cmd.mask, freqs.new, data)) { - dprintk("acpi_cpufreq_target failed (%d)\n", - policy->cpu); - result = -EAGAIN; - goto out; - } - } - - for_each_cpu(i, policy->cpus) { - freqs.cpu = i; - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - } - perf->state = next_perf_state; - -out: - return result; -} - -static int acpi_cpufreq_verify(struct cpufreq_policy *policy) -{ - struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu); - - dprintk("acpi_cpufreq_verify\n"); - - return cpufreq_frequency_table_verify(policy, data->freq_table); -} - -static unsigned long -acpi_cpufreq_guess_freq(struct acpi_cpufreq_data *data, unsigned int cpu) -{ - struct acpi_processor_performance *perf = data->acpi_data; - - if (cpu_khz) { - /* search the closest match to cpu_khz */ - unsigned int i; - unsigned long freq; - unsigned long freqn = perf->states[0].core_frequency * 1000; - - for (i = 0; i < (perf->state_count-1); i++) { - freq = freqn; - freqn = perf->states[i+1].core_frequency * 1000; - if ((2 * cpu_khz) > (freqn + freq)) { - perf->state = i; - return freq; - } - } - perf->state = perf->state_count-1; - return freqn; - } else { - /* assume CPU is at P0... */ - perf->state = 0; - return perf->states[0].core_frequency * 1000; - } -} - -static void free_acpi_perf_data(void) -{ - unsigned int i; - - /* Freeing a NULL pointer is OK, and alloc_percpu zeroes. */ - for_each_possible_cpu(i) - free_cpumask_var(per_cpu_ptr(acpi_perf_data, i) - ->shared_cpu_map); - free_percpu(acpi_perf_data); -} - -/* - * acpi_cpufreq_early_init - initialize ACPI P-States library - * - * Initialize the ACPI P-States library (drivers/acpi/processor_perflib.c) - * in order to determine correct frequency and voltage pairings. We can - * do _PDC and _PSD and find out the processor dependency for the - * actual init that will happen later... - */ -static int __init acpi_cpufreq_early_init(void) -{ - unsigned int i; - dprintk("acpi_cpufreq_early_init\n"); - - acpi_perf_data = alloc_percpu(struct acpi_processor_performance); - if (!acpi_perf_data) { - dprintk("Memory allocation error for acpi_perf_data.\n"); - return -ENOMEM; - } - for_each_possible_cpu(i) { - if (!zalloc_cpumask_var_node( - &per_cpu_ptr(acpi_perf_data, i)->shared_cpu_map, - GFP_KERNEL, cpu_to_node(i))) { - - /* Freeing a NULL pointer is OK: alloc_percpu zeroes. */ - free_acpi_perf_data(); - return -ENOMEM; - } - } - - /* Do initialization in ACPI core */ - acpi_processor_preregister_performance(acpi_perf_data); - return 0; -} - -#ifdef CONFIG_SMP -/* - * Some BIOSes do SW_ANY coordination internally, either set it up in hw - * or do it in BIOS firmware and won't inform about it to OS. If not - * detected, this has a side effect of making CPU run at a different speed - * than OS intended it to run at. Detect it and handle it cleanly. - */ -static int bios_with_sw_any_bug; - -static int sw_any_bug_found(const struct dmi_system_id *d) -{ - bios_with_sw_any_bug = 1; - return 0; -} - -static const struct dmi_system_id sw_any_bug_dmi_table[] = { - { - .callback = sw_any_bug_found, - .ident = "Supermicro Server X6DLP", - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "Supermicro"), - DMI_MATCH(DMI_BIOS_VERSION, "080010"), - DMI_MATCH(DMI_PRODUCT_NAME, "X6DLP"), - }, - }, - { } -}; - -static int acpi_cpufreq_blacklist(struct cpuinfo_x86 *c) -{ - /* Intel Xeon Processor 7100 Series Specification Update - * http://www.intel.com/Assets/PDF/specupdate/314554.pdf - * AL30: A Machine Check Exception (MCE) Occurring during an - * Enhanced Intel SpeedStep Technology Ratio Change May Cause - * Both Processor Cores to Lock Up. */ - if (c->x86_vendor == X86_VENDOR_INTEL) { - if ((c->x86 == 15) && - (c->x86_model == 6) && - (c->x86_mask == 8)) { - printk(KERN_INFO "acpi-cpufreq: Intel(R) " - "Xeon(R) 7100 Errata AL30, processors may " - "lock up on frequency changes: disabling " - "acpi-cpufreq.\n"); - return -ENODEV; - } - } - return 0; -} -#endif - -static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) -{ - unsigned int i; - unsigned int valid_states = 0; - unsigned int cpu = policy->cpu; - struct acpi_cpufreq_data *data; - unsigned int result = 0; - struct cpuinfo_x86 *c = &cpu_data(policy->cpu); - struct acpi_processor_performance *perf; -#ifdef CONFIG_SMP - static int blacklisted; -#endif - - dprintk("acpi_cpufreq_cpu_init\n"); - -#ifdef CONFIG_SMP - if (blacklisted) - return blacklisted; - blacklisted = acpi_cpufreq_blacklist(c); - if (blacklisted) - return blacklisted; -#endif - - data = kzalloc(sizeof(struct acpi_cpufreq_data), GFP_KERNEL); - if (!data) - return -ENOMEM; - - data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu); - per_cpu(acfreq_data, cpu) = data; - - if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) - acpi_cpufreq_driver.flags |= CPUFREQ_CONST_LOOPS; - - result = acpi_processor_register_performance(data->acpi_data, cpu); - if (result) - goto err_free; - - perf = data->acpi_data; - policy->shared_type = perf->shared_type; - - /* - * Will let policy->cpus know about dependency only when software - * coordination is required. - */ - if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL || - policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) { - cpumask_copy(policy->cpus, perf->shared_cpu_map); - } - cpumask_copy(policy->related_cpus, perf->shared_cpu_map); - -#ifdef CONFIG_SMP - dmi_check_system(sw_any_bug_dmi_table); - if (bios_with_sw_any_bug && cpumask_weight(policy->cpus) == 1) { - policy->shared_type = CPUFREQ_SHARED_TYPE_ALL; - cpumask_copy(policy->cpus, cpu_core_mask(cpu)); - } -#endif - - /* capability check */ - if (perf->state_count <= 1) { - dprintk("No P-States\n"); - result = -ENODEV; - goto err_unreg; - } - - if (perf->control_register.space_id != perf->status_register.space_id) { - result = -ENODEV; - goto err_unreg; - } - - switch (perf->control_register.space_id) { - case ACPI_ADR_SPACE_SYSTEM_IO: - dprintk("SYSTEM IO addr space\n"); - data->cpu_feature = SYSTEM_IO_CAPABLE; - break; - case ACPI_ADR_SPACE_FIXED_HARDWARE: - dprintk("HARDWARE addr space\n"); - if (!check_est_cpu(cpu)) { - result = -ENODEV; - goto err_unreg; - } - data->cpu_feature = SYSTEM_INTEL_MSR_CAPABLE; - break; - default: - dprintk("Unknown addr space %d\n", - (u32) (perf->control_register.space_id)); - result = -ENODEV; - goto err_unreg; - } - - data->freq_table = kmalloc(sizeof(struct cpufreq_frequency_table) * - (perf->state_count+1), GFP_KERNEL); - if (!data->freq_table) { - result = -ENOMEM; - goto err_unreg; - } - - /* detect transition latency */ - policy->cpuinfo.transition_latency = 0; - for (i = 0; i < perf->state_count; i++) { - if ((perf->states[i].transition_latency * 1000) > - policy->cpuinfo.transition_latency) - policy->cpuinfo.transition_latency = - perf->states[i].transition_latency * 1000; - } - - /* Check for high latency (>20uS) from buggy BIOSes, like on T42 */ - if (perf->control_register.space_id == ACPI_ADR_SPACE_FIXED_HARDWARE && - policy->cpuinfo.transition_latency > 20 * 1000) { - policy->cpuinfo.transition_latency = 20 * 1000; - printk_once(KERN_INFO - "P-state transition latency capped at 20 uS\n"); - } - - /* table init */ - for (i = 0; i < perf->state_count; i++) { - if (i > 0 && perf->states[i].core_frequency >= - data->freq_table[valid_states-1].frequency / 1000) - continue; - - data->freq_table[valid_states].index = i; - data->freq_table[valid_states].frequency = - perf->states[i].core_frequency * 1000; - valid_states++; - } - data->freq_table[valid_states].frequency = CPUFREQ_TABLE_END; - perf->state = 0; - - result = cpufreq_frequency_table_cpuinfo(policy, data->freq_table); - if (result) - goto err_freqfree; - - if (perf->states[0].core_frequency * 1000 != policy->cpuinfo.max_freq) - printk(KERN_WARNING FW_WARN "P-state 0 is not max freq\n"); - - switch (perf->control_register.space_id) { - case ACPI_ADR_SPACE_SYSTEM_IO: - /* Current speed is unknown and not detectable by IO port */ - policy->cur = acpi_cpufreq_guess_freq(data, policy->cpu); - break; - case ACPI_ADR_SPACE_FIXED_HARDWARE: - acpi_cpufreq_driver.get = get_cur_freq_on_cpu; - policy->cur = get_cur_freq_on_cpu(cpu); - break; - default: - break; - } - - /* notify BIOS that we exist */ - acpi_processor_notify_smm(THIS_MODULE); - - /* Check for APERF/MPERF support in hardware */ - if (cpu_has(c, X86_FEATURE_APERFMPERF)) - acpi_cpufreq_driver.getavg = cpufreq_get_measured_perf; - - dprintk("CPU%u - ACPI performance management activated.\n", cpu); - for (i = 0; i < perf->state_count; i++) - dprintk(" %cP%d: %d MHz, %d mW, %d uS\n", - (i == perf->state ? '*' : ' '), i, - (u32) perf->states[i].core_frequency, - (u32) perf->states[i].power, - (u32) perf->states[i].transition_latency); - - cpufreq_frequency_table_get_attr(data->freq_table, policy->cpu); - - /* - * the first call to ->target() should result in us actually - * writing something to the appropriate registers. - */ - data->resume = 1; - - return result; - -err_freqfree: - kfree(data->freq_table); -err_unreg: - acpi_processor_unregister_performance(perf, cpu); -err_free: - kfree(data); - per_cpu(acfreq_data, cpu) = NULL; - - return result; -} - -static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy) -{ - struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu); - - dprintk("acpi_cpufreq_cpu_exit\n"); - - if (data) { - cpufreq_frequency_table_put_attr(policy->cpu); - per_cpu(acfreq_data, policy->cpu) = NULL; - acpi_processor_unregister_performance(data->acpi_data, - policy->cpu); - kfree(data); - } - - return 0; -} - -static int acpi_cpufreq_resume(struct cpufreq_policy *policy) -{ - struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu); - - dprintk("acpi_cpufreq_resume\n"); - - data->resume = 1; - - return 0; -} - -static struct freq_attr *acpi_cpufreq_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - NULL, -}; - -static struct cpufreq_driver acpi_cpufreq_driver = { - .verify = acpi_cpufreq_verify, - .target = acpi_cpufreq_target, - .bios_limit = acpi_processor_get_bios_limit, - .init = acpi_cpufreq_cpu_init, - .exit = acpi_cpufreq_cpu_exit, - .resume = acpi_cpufreq_resume, - .name = "acpi-cpufreq", - .owner = THIS_MODULE, - .attr = acpi_cpufreq_attr, -}; - -static int __init acpi_cpufreq_init(void) -{ - int ret; - - if (acpi_disabled) - return 0; - - dprintk("acpi_cpufreq_init\n"); - - ret = acpi_cpufreq_early_init(); - if (ret) - return ret; - - ret = cpufreq_register_driver(&acpi_cpufreq_driver); - if (ret) - free_acpi_perf_data(); - - return ret; -} - -static void __exit acpi_cpufreq_exit(void) -{ - dprintk("acpi_cpufreq_exit\n"); - - cpufreq_unregister_driver(&acpi_cpufreq_driver); - - free_percpu(acpi_perf_data); -} - -module_param(acpi_pstate_strict, uint, 0644); -MODULE_PARM_DESC(acpi_pstate_strict, - "value 0 or non-zero. non-zero -> strict ACPI checks are " - "performed during frequency changes."); - -late_initcall(acpi_cpufreq_init); -module_exit(acpi_cpufreq_exit); - -MODULE_ALIAS("acpi"); diff --git a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c b/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c deleted file mode 100644 index 733093d6043..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c +++ /dev/null @@ -1,446 +0,0 @@ -/* - * (C) 2004-2006 Sebastian Witt <se.witt@gmx.net> - * - * Licensed under the terms of the GNU GPL License version 2. - * Based upon reverse engineered information - * - * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous* - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/moduleparam.h> -#include <linux/init.h> -#include <linux/cpufreq.h> -#include <linux/pci.h> -#include <linux/delay.h> - -#define NFORCE2_XTAL 25 -#define NFORCE2_BOOTFSB 0x48 -#define NFORCE2_PLLENABLE 0xa8 -#define NFORCE2_PLLREG 0xa4 -#define NFORCE2_PLLADR 0xa0 -#define NFORCE2_PLL(mul, div) (0x100000 | (mul << 8) | div) - -#define NFORCE2_MIN_FSB 50 -#define NFORCE2_SAFE_DISTANCE 50 - -/* Delay in ms between FSB changes */ -/* #define NFORCE2_DELAY 10 */ - -/* - * nforce2_chipset: - * FSB is changed using the chipset - */ -static struct pci_dev *nforce2_dev; - -/* fid: - * multiplier * 10 - */ -static int fid; - -/* min_fsb, max_fsb: - * minimum and maximum FSB (= FSB at boot time) - */ -static int min_fsb; -static int max_fsb; - -MODULE_AUTHOR("Sebastian Witt <se.witt@gmx.net>"); -MODULE_DESCRIPTION("nForce2 FSB changing cpufreq driver"); -MODULE_LICENSE("GPL"); - -module_param(fid, int, 0444); -module_param(min_fsb, int, 0444); - -MODULE_PARM_DESC(fid, "CPU multiplier to use (11.5 = 115)"); -MODULE_PARM_DESC(min_fsb, - "Minimum FSB to use, if not defined: current FSB - 50"); - -#define PFX "cpufreq-nforce2: " -#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ - "cpufreq-nforce2", msg) - -/** - * nforce2_calc_fsb - calculate FSB - * @pll: PLL value - * - * Calculates FSB from PLL value - */ -static int nforce2_calc_fsb(int pll) -{ - unsigned char mul, div; - - mul = (pll >> 8) & 0xff; - div = pll & 0xff; - - if (div > 0) - return NFORCE2_XTAL * mul / div; - - return 0; -} - -/** - * nforce2_calc_pll - calculate PLL value - * @fsb: FSB - * - * Calculate PLL value for given FSB - */ -static int nforce2_calc_pll(unsigned int fsb) -{ - unsigned char xmul, xdiv; - unsigned char mul = 0, div = 0; - int tried = 0; - - /* Try to calculate multiplier and divider up to 4 times */ - while (((mul == 0) || (div == 0)) && (tried <= 3)) { - for (xdiv = 2; xdiv <= 0x80; xdiv++) - for (xmul = 1; xmul <= 0xfe; xmul++) - if (nforce2_calc_fsb(NFORCE2_PLL(xmul, xdiv)) == - fsb + tried) { - mul = xmul; - div = xdiv; - } - tried++; - } - - if ((mul == 0) || (div == 0)) - return -1; - - return NFORCE2_PLL(mul, div); -} - -/** - * nforce2_write_pll - write PLL value to chipset - * @pll: PLL value - * - * Writes new FSB PLL value to chipset - */ -static void nforce2_write_pll(int pll) -{ - int temp; - - /* Set the pll addr. to 0x00 */ - pci_write_config_dword(nforce2_dev, NFORCE2_PLLADR, 0); - - /* Now write the value in all 64 registers */ - for (temp = 0; temp <= 0x3f; temp++) - pci_write_config_dword(nforce2_dev, NFORCE2_PLLREG, pll); - - return; -} - -/** - * nforce2_fsb_read - Read FSB - * - * Read FSB from chipset - * If bootfsb != 0, return FSB at boot-time - */ -static unsigned int nforce2_fsb_read(int bootfsb) -{ - struct pci_dev *nforce2_sub5; - u32 fsb, temp = 0; - - /* Get chipset boot FSB from subdevice 5 (FSB at boot-time) */ - nforce2_sub5 = pci_get_subsys(PCI_VENDOR_ID_NVIDIA, 0x01EF, - PCI_ANY_ID, PCI_ANY_ID, NULL); - if (!nforce2_sub5) - return 0; - - pci_read_config_dword(nforce2_sub5, NFORCE2_BOOTFSB, &fsb); - fsb /= 1000000; - - /* Check if PLL register is already set */ - pci_read_config_byte(nforce2_dev, NFORCE2_PLLENABLE, (u8 *)&temp); - - if (bootfsb || !temp) - return fsb; - - /* Use PLL register FSB value */ - pci_read_config_dword(nforce2_dev, NFORCE2_PLLREG, &temp); - fsb = nforce2_calc_fsb(temp); - - return fsb; -} - -/** - * nforce2_set_fsb - set new FSB - * @fsb: New FSB - * - * Sets new FSB - */ -static int nforce2_set_fsb(unsigned int fsb) -{ - u32 temp = 0; - unsigned int tfsb; - int diff; - int pll = 0; - - if ((fsb > max_fsb) || (fsb < NFORCE2_MIN_FSB)) { - printk(KERN_ERR PFX "FSB %d is out of range!\n", fsb); - return -EINVAL; - } - - tfsb = nforce2_fsb_read(0); - if (!tfsb) { - printk(KERN_ERR PFX "Error while reading the FSB\n"); - return -EINVAL; - } - - /* First write? Then set actual value */ - pci_read_config_byte(nforce2_dev, NFORCE2_PLLENABLE, (u8 *)&temp); - if (!temp) { - pll = nforce2_calc_pll(tfsb); - - if (pll < 0) - return -EINVAL; - - nforce2_write_pll(pll); - } - - /* Enable write access */ - temp = 0x01; - pci_write_config_byte(nforce2_dev, NFORCE2_PLLENABLE, (u8)temp); - - diff = tfsb - fsb; - - if (!diff) - return 0; - - while ((tfsb != fsb) && (tfsb <= max_fsb) && (tfsb >= min_fsb)) { - if (diff < 0) - tfsb++; - else - tfsb--; - - /* Calculate the PLL reg. value */ - pll = nforce2_calc_pll(tfsb); - if (pll == -1) - return -EINVAL; - - nforce2_write_pll(pll); -#ifdef NFORCE2_DELAY - mdelay(NFORCE2_DELAY); -#endif - } - - temp = 0x40; - pci_write_config_byte(nforce2_dev, NFORCE2_PLLADR, (u8)temp); - - return 0; -} - -/** - * nforce2_get - get the CPU frequency - * @cpu: CPU number - * - * Returns the CPU frequency - */ -static unsigned int nforce2_get(unsigned int cpu) -{ - if (cpu) - return 0; - return nforce2_fsb_read(0) * fid * 100; -} - -/** - * nforce2_target - set a new CPUFreq policy - * @policy: new policy - * @target_freq: the target frequency - * @relation: how that frequency relates to achieved frequency - * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H) - * - * Sets a new CPUFreq policy. - */ -static int nforce2_target(struct cpufreq_policy *policy, - unsigned int target_freq, unsigned int relation) -{ -/* unsigned long flags; */ - struct cpufreq_freqs freqs; - unsigned int target_fsb; - - if ((target_freq > policy->max) || (target_freq < policy->min)) - return -EINVAL; - - target_fsb = target_freq / (fid * 100); - - freqs.old = nforce2_get(policy->cpu); - freqs.new = target_fsb * fid * 100; - freqs.cpu = 0; /* Only one CPU on nForce2 platforms */ - - if (freqs.old == freqs.new) - return 0; - - dprintk("Old CPU frequency %d kHz, new %d kHz\n", - freqs.old, freqs.new); - - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - - /* Disable IRQs */ - /* local_irq_save(flags); */ - - if (nforce2_set_fsb(target_fsb) < 0) - printk(KERN_ERR PFX "Changing FSB to %d failed\n", - target_fsb); - else - dprintk("Changed FSB successfully to %d\n", - target_fsb); - - /* Enable IRQs */ - /* local_irq_restore(flags); */ - - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - - return 0; -} - -/** - * nforce2_verify - verifies a new CPUFreq policy - * @policy: new policy - */ -static int nforce2_verify(struct cpufreq_policy *policy) -{ - unsigned int fsb_pol_max; - - fsb_pol_max = policy->max / (fid * 100); - - if (policy->min < (fsb_pol_max * fid * 100)) - policy->max = (fsb_pol_max + 1) * fid * 100; - - cpufreq_verify_within_limits(policy, - policy->cpuinfo.min_freq, - policy->cpuinfo.max_freq); - return 0; -} - -static int nforce2_cpu_init(struct cpufreq_policy *policy) -{ - unsigned int fsb; - unsigned int rfid; - - /* capability check */ - if (policy->cpu != 0) - return -ENODEV; - - /* Get current FSB */ - fsb = nforce2_fsb_read(0); - - if (!fsb) - return -EIO; - - /* FIX: Get FID from CPU */ - if (!fid) { - if (!cpu_khz) { - printk(KERN_WARNING PFX - "cpu_khz not set, can't calculate multiplier!\n"); - return -ENODEV; - } - - fid = cpu_khz / (fsb * 100); - rfid = fid % 5; - - if (rfid) { - if (rfid > 2) - fid += 5 - rfid; - else - fid -= rfid; - } - } - - printk(KERN_INFO PFX "FSB currently at %i MHz, FID %d.%d\n", fsb, - fid / 10, fid % 10); - - /* Set maximum FSB to FSB at boot time */ - max_fsb = nforce2_fsb_read(1); - - if (!max_fsb) - return -EIO; - - if (!min_fsb) - min_fsb = max_fsb - NFORCE2_SAFE_DISTANCE; - - if (min_fsb < NFORCE2_MIN_FSB) - min_fsb = NFORCE2_MIN_FSB; - - /* cpuinfo and default policy values */ - policy->cpuinfo.min_freq = min_fsb * fid * 100; - policy->cpuinfo.max_freq = max_fsb * fid * 100; - policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; - policy->cur = nforce2_get(policy->cpu); - policy->min = policy->cpuinfo.min_freq; - policy->max = policy->cpuinfo.max_freq; - - return 0; -} - -static int nforce2_cpu_exit(struct cpufreq_policy *policy) -{ - return 0; -} - -static struct cpufreq_driver nforce2_driver = { - .name = "nforce2", - .verify = nforce2_verify, - .target = nforce2_target, - .get = nforce2_get, - .init = nforce2_cpu_init, - .exit = nforce2_cpu_exit, - .owner = THIS_MODULE, -}; - -/** - * nforce2_detect_chipset - detect the Southbridge which contains FSB PLL logic - * - * Detects nForce2 A2 and C1 stepping - * - */ -static unsigned int nforce2_detect_chipset(void) -{ - nforce2_dev = pci_get_subsys(PCI_VENDOR_ID_NVIDIA, - PCI_DEVICE_ID_NVIDIA_NFORCE2, - PCI_ANY_ID, PCI_ANY_ID, NULL); - - if (nforce2_dev == NULL) - return -ENODEV; - - printk(KERN_INFO PFX "Detected nForce2 chipset revision %X\n", - nforce2_dev->revision); - printk(KERN_INFO PFX - "FSB changing is maybe unstable and can lead to " - "crashes and data loss.\n"); - - return 0; -} - -/** - * nforce2_init - initializes the nForce2 CPUFreq driver - * - * Initializes the nForce2 FSB support. Returns -ENODEV on unsupported - * devices, -EINVAL on problems during initiatization, and zero on - * success. - */ -static int __init nforce2_init(void) -{ - /* TODO: do we need to detect the processor? */ - - /* detect chipset */ - if (nforce2_detect_chipset()) { - printk(KERN_INFO PFX "No nForce2 chipset.\n"); - return -ENODEV; - } - - return cpufreq_register_driver(&nforce2_driver); -} - -/** - * nforce2_exit - unregisters cpufreq module - * - * Unregisters nForce2 FSB change support. - */ -static void __exit nforce2_exit(void) -{ - cpufreq_unregister_driver(&nforce2_driver); -} - -module_init(nforce2_init); -module_exit(nforce2_exit); - diff --git a/arch/x86/kernel/cpu/cpufreq/e_powersaver.c b/arch/x86/kernel/cpu/cpufreq/e_powersaver.c deleted file mode 100644 index 35a257dd4bb..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/e_powersaver.c +++ /dev/null @@ -1,367 +0,0 @@ -/* - * Based on documentation provided by Dave Jones. Thanks! - * - * Licensed under the terms of the GNU GPL License version 2. - * - * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous* - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/cpufreq.h> -#include <linux/ioport.h> -#include <linux/slab.h> -#include <linux/timex.h> -#include <linux/io.h> -#include <linux/delay.h> - -#include <asm/msr.h> -#include <asm/tsc.h> - -#define EPS_BRAND_C7M 0 -#define EPS_BRAND_C7 1 -#define EPS_BRAND_EDEN 2 -#define EPS_BRAND_C3 3 -#define EPS_BRAND_C7D 4 - -struct eps_cpu_data { - u32 fsb; - struct cpufreq_frequency_table freq_table[]; -}; - -static struct eps_cpu_data *eps_cpu[NR_CPUS]; - - -static unsigned int eps_get(unsigned int cpu) -{ - struct eps_cpu_data *centaur; - u32 lo, hi; - - if (cpu) - return 0; - centaur = eps_cpu[cpu]; - if (centaur == NULL) - return 0; - - /* Return current frequency */ - rdmsr(MSR_IA32_PERF_STATUS, lo, hi); - return centaur->fsb * ((lo >> 8) & 0xff); -} - -static int eps_set_state(struct eps_cpu_data *centaur, - unsigned int cpu, - u32 dest_state) -{ - struct cpufreq_freqs freqs; - u32 lo, hi; - int err = 0; - int i; - - freqs.old = eps_get(cpu); - freqs.new = centaur->fsb * ((dest_state >> 8) & 0xff); - freqs.cpu = cpu; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - - /* Wait while CPU is busy */ - rdmsr(MSR_IA32_PERF_STATUS, lo, hi); - i = 0; - while (lo & ((1 << 16) | (1 << 17))) { - udelay(16); - rdmsr(MSR_IA32_PERF_STATUS, lo, hi); - i++; - if (unlikely(i > 64)) { - err = -ENODEV; - goto postchange; - } - } - /* Set new multiplier and voltage */ - wrmsr(MSR_IA32_PERF_CTL, dest_state & 0xffff, 0); - /* Wait until transition end */ - i = 0; - do { - udelay(16); - rdmsr(MSR_IA32_PERF_STATUS, lo, hi); - i++; - if (unlikely(i > 64)) { - err = -ENODEV; - goto postchange; - } - } while (lo & ((1 << 16) | (1 << 17))); - - /* Return current frequency */ -postchange: - rdmsr(MSR_IA32_PERF_STATUS, lo, hi); - freqs.new = centaur->fsb * ((lo >> 8) & 0xff); - -#ifdef DEBUG - { - u8 current_multiplier, current_voltage; - - /* Print voltage and multiplier */ - rdmsr(MSR_IA32_PERF_STATUS, lo, hi); - current_voltage = lo & 0xff; - printk(KERN_INFO "eps: Current voltage = %dmV\n", - current_voltage * 16 + 700); - current_multiplier = (lo >> 8) & 0xff; - printk(KERN_INFO "eps: Current multiplier = %d\n", - current_multiplier); - } -#endif - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - return err; -} - -static int eps_target(struct cpufreq_policy *policy, - unsigned int target_freq, - unsigned int relation) -{ - struct eps_cpu_data *centaur; - unsigned int newstate = 0; - unsigned int cpu = policy->cpu; - unsigned int dest_state; - int ret; - - if (unlikely(eps_cpu[cpu] == NULL)) - return -ENODEV; - centaur = eps_cpu[cpu]; - - if (unlikely(cpufreq_frequency_table_target(policy, - &eps_cpu[cpu]->freq_table[0], - target_freq, - relation, - &newstate))) { - return -EINVAL; - } - - /* Make frequency transition */ - dest_state = centaur->freq_table[newstate].index & 0xffff; - ret = eps_set_state(centaur, cpu, dest_state); - if (ret) - printk(KERN_ERR "eps: Timeout!\n"); - return ret; -} - -static int eps_verify(struct cpufreq_policy *policy) -{ - return cpufreq_frequency_table_verify(policy, - &eps_cpu[policy->cpu]->freq_table[0]); -} - -static int eps_cpu_init(struct cpufreq_policy *policy) -{ - unsigned int i; - u32 lo, hi; - u64 val; - u8 current_multiplier, current_voltage; - u8 max_multiplier, max_voltage; - u8 min_multiplier, min_voltage; - u8 brand = 0; - u32 fsb; - struct eps_cpu_data *centaur; - struct cpuinfo_x86 *c = &cpu_data(0); - struct cpufreq_frequency_table *f_table; - int k, step, voltage; - int ret; - int states; - - if (policy->cpu != 0) - return -ENODEV; - - /* Check brand */ - printk(KERN_INFO "eps: Detected VIA "); - - switch (c->x86_model) { - case 10: - rdmsr(0x1153, lo, hi); - brand = (((lo >> 2) ^ lo) >> 18) & 3; - printk(KERN_CONT "Model A "); - break; - case 13: - rdmsr(0x1154, lo, hi); - brand = (((lo >> 4) ^ (lo >> 2))) & 0x000000ff; - printk(KERN_CONT "Model D "); - break; - } - - switch (brand) { - case EPS_BRAND_C7M: - printk(KERN_CONT "C7-M\n"); - break; - case EPS_BRAND_C7: - printk(KERN_CONT "C7\n"); - break; - case EPS_BRAND_EDEN: - printk(KERN_CONT "Eden\n"); - break; - case EPS_BRAND_C7D: - printk(KERN_CONT "C7-D\n"); - break; - case EPS_BRAND_C3: - printk(KERN_CONT "C3\n"); - return -ENODEV; - break; - } - /* Enable Enhanced PowerSaver */ - rdmsrl(MSR_IA32_MISC_ENABLE, val); - if (!(val & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) { - val |= MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP; - wrmsrl(MSR_IA32_MISC_ENABLE, val); - /* Can be locked at 0 */ - rdmsrl(MSR_IA32_MISC_ENABLE, val); - if (!(val & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) { - printk(KERN_INFO "eps: Can't enable Enhanced PowerSaver\n"); - return -ENODEV; - } - } - - /* Print voltage and multiplier */ - rdmsr(MSR_IA32_PERF_STATUS, lo, hi); - current_voltage = lo & 0xff; - printk(KERN_INFO "eps: Current voltage = %dmV\n", - current_voltage * 16 + 700); - current_multiplier = (lo >> 8) & 0xff; - printk(KERN_INFO "eps: Current multiplier = %d\n", current_multiplier); - - /* Print limits */ - max_voltage = hi & 0xff; - printk(KERN_INFO "eps: Highest voltage = %dmV\n", - max_voltage * 16 + 700); - max_multiplier = (hi >> 8) & 0xff; - printk(KERN_INFO "eps: Highest multiplier = %d\n", max_multiplier); - min_voltage = (hi >> 16) & 0xff; - printk(KERN_INFO "eps: Lowest voltage = %dmV\n", - min_voltage * 16 + 700); - min_multiplier = (hi >> 24) & 0xff; - printk(KERN_INFO "eps: Lowest multiplier = %d\n", min_multiplier); - - /* Sanity checks */ - if (current_multiplier == 0 || max_multiplier == 0 - || min_multiplier == 0) - return -EINVAL; - if (current_multiplier > max_multiplier - || max_multiplier <= min_multiplier) - return -EINVAL; - if (current_voltage > 0x1f || max_voltage > 0x1f) - return -EINVAL; - if (max_voltage < min_voltage) - return -EINVAL; - - /* Calc FSB speed */ - fsb = cpu_khz / current_multiplier; - /* Calc number of p-states supported */ - if (brand == EPS_BRAND_C7M) - states = max_multiplier - min_multiplier + 1; - else - states = 2; - - /* Allocate private data and frequency table for current cpu */ - centaur = kzalloc(sizeof(struct eps_cpu_data) - + (states + 1) * sizeof(struct cpufreq_frequency_table), - GFP_KERNEL); - if (!centaur) - return -ENOMEM; - eps_cpu[0] = centaur; - - /* Copy basic values */ - centaur->fsb = fsb; - - /* Fill frequency and MSR value table */ - f_table = ¢aur->freq_table[0]; - if (brand != EPS_BRAND_C7M) { - f_table[0].frequency = fsb * min_multiplier; - f_table[0].index = (min_multiplier << 8) | min_voltage; - f_table[1].frequency = fsb * max_multiplier; - f_table[1].index = (max_multiplier << 8) | max_voltage; - f_table[2].frequency = CPUFREQ_TABLE_END; - } else { - k = 0; - step = ((max_voltage - min_voltage) * 256) - / (max_multiplier - min_multiplier); - for (i = min_multiplier; i <= max_multiplier; i++) { - voltage = (k * step) / 256 + min_voltage; - f_table[k].frequency = fsb * i; - f_table[k].index = (i << 8) | voltage; - k++; - } - f_table[k].frequency = CPUFREQ_TABLE_END; - } - - policy->cpuinfo.transition_latency = 140000; /* 844mV -> 700mV in ns */ - policy->cur = fsb * current_multiplier; - - ret = cpufreq_frequency_table_cpuinfo(policy, ¢aur->freq_table[0]); - if (ret) { - kfree(centaur); - return ret; - } - - cpufreq_frequency_table_get_attr(¢aur->freq_table[0], policy->cpu); - return 0; -} - -static int eps_cpu_exit(struct cpufreq_policy *policy) -{ - unsigned int cpu = policy->cpu; - struct eps_cpu_data *centaur; - u32 lo, hi; - - if (eps_cpu[cpu] == NULL) - return -ENODEV; - centaur = eps_cpu[cpu]; - - /* Get max frequency */ - rdmsr(MSR_IA32_PERF_STATUS, lo, hi); - /* Set max frequency */ - eps_set_state(centaur, cpu, hi & 0xffff); - /* Bye */ - cpufreq_frequency_table_put_attr(policy->cpu); - kfree(eps_cpu[cpu]); - eps_cpu[cpu] = NULL; - return 0; -} - -static struct freq_attr *eps_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - NULL, -}; - -static struct cpufreq_driver eps_driver = { - .verify = eps_verify, - .target = eps_target, - .init = eps_cpu_init, - .exit = eps_cpu_exit, - .get = eps_get, - .name = "e_powersaver", - .owner = THIS_MODULE, - .attr = eps_attr, -}; - -static int __init eps_init(void) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - - /* This driver will work only on Centaur C7 processors with - * Enhanced SpeedStep/PowerSaver registers */ - if (c->x86_vendor != X86_VENDOR_CENTAUR - || c->x86 != 6 || c->x86_model < 10) - return -ENODEV; - if (!cpu_has(c, X86_FEATURE_EST)) - return -ENODEV; - - if (cpufreq_register_driver(&eps_driver)) - return -EINVAL; - return 0; -} - -static void __exit eps_exit(void) -{ - cpufreq_unregister_driver(&eps_driver); -} - -MODULE_AUTHOR("Rafal Bilski <rafalbilski@interia.pl>"); -MODULE_DESCRIPTION("Enhanced PowerSaver driver for VIA C7 CPU's."); -MODULE_LICENSE("GPL"); - -module_init(eps_init); -module_exit(eps_exit); diff --git a/arch/x86/kernel/cpu/cpufreq/elanfreq.c b/arch/x86/kernel/cpu/cpufreq/elanfreq.c deleted file mode 100644 index c587db472a7..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/elanfreq.c +++ /dev/null @@ -1,309 +0,0 @@ -/* - * elanfreq: cpufreq driver for the AMD ELAN family - * - * (c) Copyright 2002 Robert Schwebel <r.schwebel@pengutronix.de> - * - * Parts of this code are (c) Sven Geggus <sven@geggus.net> - * - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * 2002-02-13: - initial revision for 2.4.18-pre9 by Robert Schwebel - * - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> - -#include <linux/delay.h> -#include <linux/cpufreq.h> - -#include <asm/msr.h> -#include <linux/timex.h> -#include <linux/io.h> - -#define REG_CSCIR 0x22 /* Chip Setup and Control Index Register */ -#define REG_CSCDR 0x23 /* Chip Setup and Control Data Register */ - -/* Module parameter */ -static int max_freq; - -struct s_elan_multiplier { - int clock; /* frequency in kHz */ - int val40h; /* PMU Force Mode register */ - int val80h; /* CPU Clock Speed Register */ -}; - -/* - * It is important that the frequencies - * are listed in ascending order here! - */ -static struct s_elan_multiplier elan_multiplier[] = { - {1000, 0x02, 0x18}, - {2000, 0x02, 0x10}, - {4000, 0x02, 0x08}, - {8000, 0x00, 0x00}, - {16000, 0x00, 0x02}, - {33000, 0x00, 0x04}, - {66000, 0x01, 0x04}, - {99000, 0x01, 0x05} -}; - -static struct cpufreq_frequency_table elanfreq_table[] = { - {0, 1000}, - {1, 2000}, - {2, 4000}, - {3, 8000}, - {4, 16000}, - {5, 33000}, - {6, 66000}, - {7, 99000}, - {0, CPUFREQ_TABLE_END}, -}; - - -/** - * elanfreq_get_cpu_frequency: determine current cpu speed - * - * Finds out at which frequency the CPU of the Elan SOC runs - * at the moment. Frequencies from 1 to 33 MHz are generated - * the normal way, 66 and 99 MHz are called "Hyperspeed Mode" - * and have the rest of the chip running with 33 MHz. - */ - -static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu) -{ - u8 clockspeed_reg; /* Clock Speed Register */ - - local_irq_disable(); - outb_p(0x80, REG_CSCIR); - clockspeed_reg = inb_p(REG_CSCDR); - local_irq_enable(); - - if ((clockspeed_reg & 0xE0) == 0xE0) - return 0; - - /* Are we in CPU clock multiplied mode (66/99 MHz)? */ - if ((clockspeed_reg & 0xE0) == 0xC0) { - if ((clockspeed_reg & 0x01) == 0) - return 66000; - else - return 99000; - } - - /* 33 MHz is not 32 MHz... */ - if ((clockspeed_reg & 0xE0) == 0xA0) - return 33000; - - return (1<<((clockspeed_reg & 0xE0) >> 5)) * 1000; -} - - -/** - * elanfreq_set_cpu_frequency: Change the CPU core frequency - * @cpu: cpu number - * @freq: frequency in kHz - * - * This function takes a frequency value and changes the CPU frequency - * according to this. Note that the frequency has to be checked by - * elanfreq_validatespeed() for correctness! - * - * There is no return value. - */ - -static void elanfreq_set_cpu_state(unsigned int state) -{ - struct cpufreq_freqs freqs; - - freqs.old = elanfreq_get_cpu_frequency(0); - freqs.new = elan_multiplier[state].clock; - freqs.cpu = 0; /* elanfreq.c is UP only driver */ - - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - - printk(KERN_INFO "elanfreq: attempting to set frequency to %i kHz\n", - elan_multiplier[state].clock); - - - /* - * Access to the Elan's internal registers is indexed via - * 0x22: Chip Setup & Control Register Index Register (CSCI) - * 0x23: Chip Setup & Control Register Data Register (CSCD) - * - */ - - /* - * 0x40 is the Power Management Unit's Force Mode Register. - * Bit 6 enables Hyperspeed Mode (66/100 MHz core frequency) - */ - - local_irq_disable(); - outb_p(0x40, REG_CSCIR); /* Disable hyperspeed mode */ - outb_p(0x00, REG_CSCDR); - local_irq_enable(); /* wait till internal pipelines and */ - udelay(1000); /* buffers have cleaned up */ - - local_irq_disable(); - - /* now, set the CPU clock speed register (0x80) */ - outb_p(0x80, REG_CSCIR); - outb_p(elan_multiplier[state].val80h, REG_CSCDR); - - /* now, the hyperspeed bit in PMU Force Mode Register (0x40) */ - outb_p(0x40, REG_CSCIR); - outb_p(elan_multiplier[state].val40h, REG_CSCDR); - udelay(10000); - local_irq_enable(); - - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); -}; - - -/** - * elanfreq_validatespeed: test if frequency range is valid - * @policy: the policy to validate - * - * This function checks if a given frequency range in kHz is valid - * for the hardware supported by the driver. - */ - -static int elanfreq_verify(struct cpufreq_policy *policy) -{ - return cpufreq_frequency_table_verify(policy, &elanfreq_table[0]); -} - -static int elanfreq_target(struct cpufreq_policy *policy, - unsigned int target_freq, - unsigned int relation) -{ - unsigned int newstate = 0; - - if (cpufreq_frequency_table_target(policy, &elanfreq_table[0], - target_freq, relation, &newstate)) - return -EINVAL; - - elanfreq_set_cpu_state(newstate); - - return 0; -} - - -/* - * Module init and exit code - */ - -static int elanfreq_cpu_init(struct cpufreq_policy *policy) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - unsigned int i; - int result; - - /* capability check */ - if ((c->x86_vendor != X86_VENDOR_AMD) || - (c->x86 != 4) || (c->x86_model != 10)) - return -ENODEV; - - /* max freq */ - if (!max_freq) - max_freq = elanfreq_get_cpu_frequency(0); - - /* table init */ - for (i = 0; (elanfreq_table[i].frequency != CPUFREQ_TABLE_END); i++) { - if (elanfreq_table[i].frequency > max_freq) - elanfreq_table[i].frequency = CPUFREQ_ENTRY_INVALID; - } - - /* cpuinfo and default policy values */ - policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; - policy->cur = elanfreq_get_cpu_frequency(0); - - result = cpufreq_frequency_table_cpuinfo(policy, elanfreq_table); - if (result) - return result; - - cpufreq_frequency_table_get_attr(elanfreq_table, policy->cpu); - return 0; -} - - -static int elanfreq_cpu_exit(struct cpufreq_policy *policy) -{ - cpufreq_frequency_table_put_attr(policy->cpu); - return 0; -} - - -#ifndef MODULE -/** - * elanfreq_setup - elanfreq command line parameter parsing - * - * elanfreq command line parameter. Use: - * elanfreq=66000 - * to set the maximum CPU frequency to 66 MHz. Note that in - * case you do not give this boot parameter, the maximum - * frequency will fall back to _current_ CPU frequency which - * might be lower. If you build this as a module, use the - * max_freq module parameter instead. - */ -static int __init elanfreq_setup(char *str) -{ - max_freq = simple_strtoul(str, &str, 0); - printk(KERN_WARNING "You're using the deprecated elanfreq command line option. Use elanfreq.max_freq instead, please!\n"); - return 1; -} -__setup("elanfreq=", elanfreq_setup); -#endif - - -static struct freq_attr *elanfreq_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - NULL, -}; - - -static struct cpufreq_driver elanfreq_driver = { - .get = elanfreq_get_cpu_frequency, - .verify = elanfreq_verify, - .target = elanfreq_target, - .init = elanfreq_cpu_init, - .exit = elanfreq_cpu_exit, - .name = "elanfreq", - .owner = THIS_MODULE, - .attr = elanfreq_attr, -}; - - -static int __init elanfreq_init(void) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - - /* Test if we have the right hardware */ - if ((c->x86_vendor != X86_VENDOR_AMD) || - (c->x86 != 4) || (c->x86_model != 10)) { - printk(KERN_INFO "elanfreq: error: no Elan processor found!\n"); - return -ENODEV; - } - return cpufreq_register_driver(&elanfreq_driver); -} - - -static void __exit elanfreq_exit(void) -{ - cpufreq_unregister_driver(&elanfreq_driver); -} - - -module_param(max_freq, int, 0444); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Robert Schwebel <r.schwebel@pengutronix.de>, " - "Sven Geggus <sven@geggus.net>"); -MODULE_DESCRIPTION("cpufreq driver for AMD's Elan CPUs"); - -module_init(elanfreq_init); -module_exit(elanfreq_exit); diff --git a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c deleted file mode 100644 index 32974cf8423..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c +++ /dev/null @@ -1,517 +0,0 @@ -/* - * Cyrix MediaGX and NatSemi Geode Suspend Modulation - * (C) 2002 Zwane Mwaikambo <zwane@commfireservices.com> - * (C) 2002 Hiroshi Miura <miura@da-cha.org> - * All Rights Reserved - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * version 2 as published by the Free Software Foundation - * - * The author(s) of this software shall not be held liable for damages - * of any nature resulting due to the use of this software. This - * software is provided AS-IS with no warranties. - * - * Theoretical note: - * - * (see Geode(tm) CS5530 manual (rev.4.1) page.56) - * - * CPU frequency control on NatSemi Geode GX1/GXLV processor and CS55x0 - * are based on Suspend Modulation. - * - * Suspend Modulation works by asserting and de-asserting the SUSP# pin - * to CPU(GX1/GXLV) for configurable durations. When asserting SUSP# - * the CPU enters an idle state. GX1 stops its core clock when SUSP# is - * asserted then power consumption is reduced. - * - * Suspend Modulation's OFF/ON duration are configurable - * with 'Suspend Modulation OFF Count Register' - * and 'Suspend Modulation ON Count Register'. - * These registers are 8bit counters that represent the number of - * 32us intervals which the SUSP# pin is asserted(ON)/de-asserted(OFF) - * to the processor. - * - * These counters define a ratio which is the effective frequency - * of operation of the system. - * - * OFF Count - * F_eff = Fgx * ---------------------- - * OFF Count + ON Count - * - * 0 <= On Count, Off Count <= 255 - * - * From these limits, we can get register values - * - * off_duration + on_duration <= MAX_DURATION - * on_duration = off_duration * (stock_freq - freq) / freq - * - * off_duration = (freq * DURATION) / stock_freq - * on_duration = DURATION - off_duration - * - * - *--------------------------------------------------------------------------- - * - * ChangeLog: - * Dec. 12, 2003 Hiroshi Miura <miura@da-cha.org> - * - fix on/off register mistake - * - fix cpu_khz calc when it stops cpu modulation. - * - * Dec. 11, 2002 Hiroshi Miura <miura@da-cha.org> - * - rewrite for Cyrix MediaGX Cx5510/5520 and - * NatSemi Geode Cs5530(A). - * - * Jul. ??, 2002 Zwane Mwaikambo <zwane@commfireservices.com> - * - cs5530_mod patch for 2.4.19-rc1. - * - *--------------------------------------------------------------------------- - * - * Todo - * Test on machines with 5510, 5530, 5530A - */ - -/************************************************************************ - * Suspend Modulation - Definitions * - ************************************************************************/ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/smp.h> -#include <linux/cpufreq.h> -#include <linux/pci.h> -#include <linux/errno.h> -#include <linux/slab.h> - -#include <asm/processor-cyrix.h> - -/* PCI config registers, all at F0 */ -#define PCI_PMER1 0x80 /* power management enable register 1 */ -#define PCI_PMER2 0x81 /* power management enable register 2 */ -#define PCI_PMER3 0x82 /* power management enable register 3 */ -#define PCI_IRQTC 0x8c /* irq speedup timer counter register:typical 2 to 4ms */ -#define PCI_VIDTC 0x8d /* video speedup timer counter register: typical 50 to 100ms */ -#define PCI_MODOFF 0x94 /* suspend modulation OFF counter register, 1 = 32us */ -#define PCI_MODON 0x95 /* suspend modulation ON counter register */ -#define PCI_SUSCFG 0x96 /* suspend configuration register */ - -/* PMER1 bits */ -#define GPM (1<<0) /* global power management */ -#define GIT (1<<1) /* globally enable PM device idle timers */ -#define GTR (1<<2) /* globally enable IO traps */ -#define IRQ_SPDUP (1<<3) /* disable clock throttle during interrupt handling */ -#define VID_SPDUP (1<<4) /* disable clock throttle during vga video handling */ - -/* SUSCFG bits */ -#define SUSMOD (1<<0) /* enable/disable suspend modulation */ -/* the below is supported only with cs5530 (after rev.1.2)/cs5530A */ -#define SMISPDUP (1<<1) /* select how SMI re-enable suspend modulation: */ - /* IRQTC timer or read SMI speedup disable reg.(F1BAR[08-09h]) */ -#define SUSCFG (1<<2) /* enable powering down a GXLV processor. "Special 3Volt Suspend" mode */ -/* the below is supported only with cs5530A */ -#define PWRSVE_ISA (1<<3) /* stop ISA clock */ -#define PWRSVE (1<<4) /* active idle */ - -struct gxfreq_params { - u8 on_duration; - u8 off_duration; - u8 pci_suscfg; - u8 pci_pmer1; - u8 pci_pmer2; - struct pci_dev *cs55x0; -}; - -static struct gxfreq_params *gx_params; -static int stock_freq; - -/* PCI bus clock - defaults to 30.000 if cpu_khz is not available */ -static int pci_busclk; -module_param(pci_busclk, int, 0444); - -/* maximum duration for which the cpu may be suspended - * (32us * MAX_DURATION). If no parameter is given, this defaults - * to 255. - * Note that this leads to a maximum of 8 ms(!) where the CPU clock - * is suspended -- processing power is just 0.39% of what it used to be, - * though. 781.25 kHz(!) for a 200 MHz processor -- wow. */ -static int max_duration = 255; -module_param(max_duration, int, 0444); - -/* For the default policy, we want at least some processing power - * - let's say 5%. (min = maxfreq / POLICY_MIN_DIV) - */ -#define POLICY_MIN_DIV 20 - - -#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ - "gx-suspmod", msg) - -/** - * we can detect a core multipiler from dir0_lsb - * from GX1 datasheet p.56, - * MULT[3:0]: - * 0000 = SYSCLK multiplied by 4 (test only) - * 0001 = SYSCLK multiplied by 10 - * 0010 = SYSCLK multiplied by 4 - * 0011 = SYSCLK multiplied by 6 - * 0100 = SYSCLK multiplied by 9 - * 0101 = SYSCLK multiplied by 5 - * 0110 = SYSCLK multiplied by 7 - * 0111 = SYSCLK multiplied by 8 - * of 33.3MHz - **/ -static int gx_freq_mult[16] = { - 4, 10, 4, 6, 9, 5, 7, 8, - 0, 0, 0, 0, 0, 0, 0, 0 -}; - - -/**************************************************************** - * Low Level chipset interface * - ****************************************************************/ -static struct pci_device_id gx_chipset_tbl[] __initdata = { - { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY), }, - { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5520), }, - { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5510), }, - { 0, }, -}; - -static void gx_write_byte(int reg, int value) -{ - pci_write_config_byte(gx_params->cs55x0, reg, value); -} - -/** - * gx_detect_chipset: - * - **/ -static __init struct pci_dev *gx_detect_chipset(void) -{ - struct pci_dev *gx_pci = NULL; - - /* check if CPU is a MediaGX or a Geode. */ - if ((boot_cpu_data.x86_vendor != X86_VENDOR_NSC) && - (boot_cpu_data.x86_vendor != X86_VENDOR_CYRIX)) { - dprintk("error: no MediaGX/Geode processor found!\n"); - return NULL; - } - - /* detect which companion chip is used */ - for_each_pci_dev(gx_pci) { - if ((pci_match_id(gx_chipset_tbl, gx_pci)) != NULL) - return gx_pci; - } - - dprintk("error: no supported chipset found!\n"); - return NULL; -} - -/** - * gx_get_cpuspeed: - * - * Finds out at which efficient frequency the Cyrix MediaGX/NatSemi - * Geode CPU runs. - */ -static unsigned int gx_get_cpuspeed(unsigned int cpu) -{ - if ((gx_params->pci_suscfg & SUSMOD) == 0) - return stock_freq; - - return (stock_freq * gx_params->off_duration) - / (gx_params->on_duration + gx_params->off_duration); -} - -/** - * gx_validate_speed: - * determine current cpu speed - * - **/ - -static unsigned int gx_validate_speed(unsigned int khz, u8 *on_duration, - u8 *off_duration) -{ - unsigned int i; - u8 tmp_on, tmp_off; - int old_tmp_freq = stock_freq; - int tmp_freq; - - *off_duration = 1; - *on_duration = 0; - - for (i = max_duration; i > 0; i--) { - tmp_off = ((khz * i) / stock_freq) & 0xff; - tmp_on = i - tmp_off; - tmp_freq = (stock_freq * tmp_off) / i; - /* if this relation is closer to khz, use this. If it's equal, - * prefer it, too - lower latency */ - if (abs(tmp_freq - khz) <= abs(old_tmp_freq - khz)) { - *on_duration = tmp_on; - *off_duration = tmp_off; - old_tmp_freq = tmp_freq; - } - } - - return old_tmp_freq; -} - - -/** - * gx_set_cpuspeed: - * set cpu speed in khz. - **/ - -static void gx_set_cpuspeed(unsigned int khz) -{ - u8 suscfg, pmer1; - unsigned int new_khz; - unsigned long flags; - struct cpufreq_freqs freqs; - - freqs.cpu = 0; - freqs.old = gx_get_cpuspeed(0); - - new_khz = gx_validate_speed(khz, &gx_params->on_duration, - &gx_params->off_duration); - - freqs.new = new_khz; - - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - local_irq_save(flags); - - - - if (new_khz != stock_freq) { - /* if new khz == 100% of CPU speed, it is special case */ - switch (gx_params->cs55x0->device) { - case PCI_DEVICE_ID_CYRIX_5530_LEGACY: - pmer1 = gx_params->pci_pmer1 | IRQ_SPDUP | VID_SPDUP; - /* FIXME: need to test other values -- Zwane,Miura */ - /* typical 2 to 4ms */ - gx_write_byte(PCI_IRQTC, 4); - /* typical 50 to 100ms */ - gx_write_byte(PCI_VIDTC, 100); - gx_write_byte(PCI_PMER1, pmer1); - - if (gx_params->cs55x0->revision < 0x10) { - /* CS5530(rev 1.2, 1.3) */ - suscfg = gx_params->pci_suscfg|SUSMOD; - } else { - /* CS5530A,B.. */ - suscfg = gx_params->pci_suscfg|SUSMOD|PWRSVE; - } - break; - case PCI_DEVICE_ID_CYRIX_5520: - case PCI_DEVICE_ID_CYRIX_5510: - suscfg = gx_params->pci_suscfg | SUSMOD; - break; - default: - local_irq_restore(flags); - dprintk("fatal: try to set unknown chipset.\n"); - return; - } - } else { - suscfg = gx_params->pci_suscfg & ~(SUSMOD); - gx_params->off_duration = 0; - gx_params->on_duration = 0; - dprintk("suspend modulation disabled: cpu runs 100%% speed.\n"); - } - - gx_write_byte(PCI_MODOFF, gx_params->off_duration); - gx_write_byte(PCI_MODON, gx_params->on_duration); - - gx_write_byte(PCI_SUSCFG, suscfg); - pci_read_config_byte(gx_params->cs55x0, PCI_SUSCFG, &suscfg); - - local_irq_restore(flags); - - gx_params->pci_suscfg = suscfg; - - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - - dprintk("suspend modulation w/ duration of ON:%d us, OFF:%d us\n", - gx_params->on_duration * 32, gx_params->off_duration * 32); - dprintk("suspend modulation w/ clock speed: %d kHz.\n", freqs.new); -} - -/**************************************************************** - * High level functions * - ****************************************************************/ - -/* - * cpufreq_gx_verify: test if frequency range is valid - * - * This function checks if a given frequency range in kHz is valid - * for the hardware supported by the driver. - */ - -static int cpufreq_gx_verify(struct cpufreq_policy *policy) -{ - unsigned int tmp_freq = 0; - u8 tmp1, tmp2; - - if (!stock_freq || !policy) - return -EINVAL; - - policy->cpu = 0; - cpufreq_verify_within_limits(policy, (stock_freq / max_duration), - stock_freq); - - /* it needs to be assured that at least one supported frequency is - * within policy->min and policy->max. If it is not, policy->max - * needs to be increased until one freuqency is supported. - * policy->min may not be decreased, though. This way we guarantee a - * specific processing capacity. - */ - tmp_freq = gx_validate_speed(policy->min, &tmp1, &tmp2); - if (tmp_freq < policy->min) - tmp_freq += stock_freq / max_duration; - policy->min = tmp_freq; - if (policy->min > policy->max) - policy->max = tmp_freq; - tmp_freq = gx_validate_speed(policy->max, &tmp1, &tmp2); - if (tmp_freq > policy->max) - tmp_freq -= stock_freq / max_duration; - policy->max = tmp_freq; - if (policy->max < policy->min) - policy->max = policy->min; - cpufreq_verify_within_limits(policy, (stock_freq / max_duration), - stock_freq); - - return 0; -} - -/* - * cpufreq_gx_target: - * - */ -static int cpufreq_gx_target(struct cpufreq_policy *policy, - unsigned int target_freq, - unsigned int relation) -{ - u8 tmp1, tmp2; - unsigned int tmp_freq; - - if (!stock_freq || !policy) - return -EINVAL; - - policy->cpu = 0; - - tmp_freq = gx_validate_speed(target_freq, &tmp1, &tmp2); - while (tmp_freq < policy->min) { - tmp_freq += stock_freq / max_duration; - tmp_freq = gx_validate_speed(tmp_freq, &tmp1, &tmp2); - } - while (tmp_freq > policy->max) { - tmp_freq -= stock_freq / max_duration; - tmp_freq = gx_validate_speed(tmp_freq, &tmp1, &tmp2); - } - - gx_set_cpuspeed(tmp_freq); - - return 0; -} - -static int cpufreq_gx_cpu_init(struct cpufreq_policy *policy) -{ - unsigned int maxfreq, curfreq; - - if (!policy || policy->cpu != 0) - return -ENODEV; - - /* determine maximum frequency */ - if (pci_busclk) - maxfreq = pci_busclk * gx_freq_mult[getCx86(CX86_DIR1) & 0x0f]; - else if (cpu_khz) - maxfreq = cpu_khz; - else - maxfreq = 30000 * gx_freq_mult[getCx86(CX86_DIR1) & 0x0f]; - - stock_freq = maxfreq; - curfreq = gx_get_cpuspeed(0); - - dprintk("cpu max frequency is %d.\n", maxfreq); - dprintk("cpu current frequency is %dkHz.\n", curfreq); - - /* setup basic struct for cpufreq API */ - policy->cpu = 0; - - if (max_duration < POLICY_MIN_DIV) - policy->min = maxfreq / max_duration; - else - policy->min = maxfreq / POLICY_MIN_DIV; - policy->max = maxfreq; - policy->cur = curfreq; - policy->cpuinfo.min_freq = maxfreq / max_duration; - policy->cpuinfo.max_freq = maxfreq; - policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; - - return 0; -} - -/* - * cpufreq_gx_init: - * MediaGX/Geode GX initialize cpufreq driver - */ -static struct cpufreq_driver gx_suspmod_driver = { - .get = gx_get_cpuspeed, - .verify = cpufreq_gx_verify, - .target = cpufreq_gx_target, - .init = cpufreq_gx_cpu_init, - .name = "gx-suspmod", - .owner = THIS_MODULE, -}; - -static int __init cpufreq_gx_init(void) -{ - int ret; - struct gxfreq_params *params; - struct pci_dev *gx_pci; - - /* Test if we have the right hardware */ - gx_pci = gx_detect_chipset(); - if (gx_pci == NULL) - return -ENODEV; - - /* check whether module parameters are sane */ - if (max_duration > 0xff) - max_duration = 0xff; - - dprintk("geode suspend modulation available.\n"); - - params = kzalloc(sizeof(struct gxfreq_params), GFP_KERNEL); - if (params == NULL) - return -ENOMEM; - - params->cs55x0 = gx_pci; - gx_params = params; - - /* keep cs55x0 configurations */ - pci_read_config_byte(params->cs55x0, PCI_SUSCFG, &(params->pci_suscfg)); - pci_read_config_byte(params->cs55x0, PCI_PMER1, &(params->pci_pmer1)); - pci_read_config_byte(params->cs55x0, PCI_PMER2, &(params->pci_pmer2)); - pci_read_config_byte(params->cs55x0, PCI_MODON, &(params->on_duration)); - pci_read_config_byte(params->cs55x0, PCI_MODOFF, - &(params->off_duration)); - - ret = cpufreq_register_driver(&gx_suspmod_driver); - if (ret) { - kfree(params); - return ret; /* register error! */ - } - - return 0; -} - -static void __exit cpufreq_gx_exit(void) -{ - cpufreq_unregister_driver(&gx_suspmod_driver); - pci_dev_put(gx_params->cs55x0); - kfree(gx_params); -} - -MODULE_AUTHOR("Hiroshi Miura <miura@da-cha.org>"); -MODULE_DESCRIPTION("Cpufreq driver for Cyrix MediaGX and NatSemi Geode"); -MODULE_LICENSE("GPL"); - -module_init(cpufreq_gx_init); -module_exit(cpufreq_gx_exit); - diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c deleted file mode 100644 index 03162dac627..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/longhaul.c +++ /dev/null @@ -1,1029 +0,0 @@ -/* - * (C) 2001-2004 Dave Jones. <davej@redhat.com> - * (C) 2002 Padraig Brady. <padraig@antefacto.com> - * - * Licensed under the terms of the GNU GPL License version 2. - * Based upon datasheets & sample CPUs kindly provided by VIA. - * - * VIA have currently 3 different versions of Longhaul. - * Version 1 (Longhaul) uses the BCR2 MSR at 0x1147. - * It is present only in Samuel 1 (C5A), Samuel 2 (C5B) stepping 0. - * Version 2 of longhaul is backward compatible with v1, but adds - * LONGHAUL MSR for purpose of both frequency and voltage scaling. - * Present in Samuel 2 (steppings 1-7 only) (C5B), and Ezra (C5C). - * Version 3 of longhaul got renamed to Powersaver and redesigned - * to use only the POWERSAVER MSR at 0x110a. - * It is present in Ezra-T (C5M), Nehemiah (C5X) and above. - * It's pretty much the same feature wise to longhaul v2, though - * there is provision for scaling FSB too, but this doesn't work - * too well in practice so we don't even try to use this. - * - * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous* - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/moduleparam.h> -#include <linux/init.h> -#include <linux/cpufreq.h> -#include <linux/pci.h> -#include <linux/slab.h> -#include <linux/string.h> -#include <linux/delay.h> -#include <linux/timex.h> -#include <linux/io.h> -#include <linux/acpi.h> - -#include <asm/msr.h> -#include <acpi/processor.h> - -#include "longhaul.h" - -#define PFX "longhaul: " - -#define TYPE_LONGHAUL_V1 1 -#define TYPE_LONGHAUL_V2 2 -#define TYPE_POWERSAVER 3 - -#define CPU_SAMUEL 1 -#define CPU_SAMUEL2 2 -#define CPU_EZRA 3 -#define CPU_EZRA_T 4 -#define CPU_NEHEMIAH 5 -#define CPU_NEHEMIAH_C 6 - -/* Flags */ -#define USE_ACPI_C3 (1 << 1) -#define USE_NORTHBRIDGE (1 << 2) - -static int cpu_model; -static unsigned int numscales = 16; -static unsigned int fsb; - -static const struct mV_pos *vrm_mV_table; -static const unsigned char *mV_vrm_table; - -static unsigned int highest_speed, lowest_speed; /* kHz */ -static unsigned int minmult, maxmult; -static int can_scale_voltage; -static struct acpi_processor *pr; -static struct acpi_processor_cx *cx; -static u32 acpi_regs_addr; -static u8 longhaul_flags; -static unsigned int longhaul_index; - -/* Module parameters */ -static int scale_voltage; -static int disable_acpi_c3; -static int revid_errata; - -#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ - "longhaul", msg) - - -/* Clock ratios multiplied by 10 */ -static int mults[32]; -static int eblcr[32]; -static int longhaul_version; -static struct cpufreq_frequency_table *longhaul_table; - -#ifdef CONFIG_CPU_FREQ_DEBUG -static char speedbuffer[8]; - -static char *print_speed(int speed) -{ - if (speed < 1000) { - snprintf(speedbuffer, sizeof(speedbuffer), "%dMHz", speed); - return speedbuffer; - } - - if (speed%1000 == 0) - snprintf(speedbuffer, sizeof(speedbuffer), - "%dGHz", speed/1000); - else - snprintf(speedbuffer, sizeof(speedbuffer), - "%d.%dGHz", speed/1000, (speed%1000)/100); - - return speedbuffer; -} -#endif - - -static unsigned int calc_speed(int mult) -{ - int khz; - khz = (mult/10)*fsb; - if (mult%10) - khz += fsb/2; - khz *= 1000; - return khz; -} - - -static int longhaul_get_cpu_mult(void) -{ - unsigned long invalue = 0, lo, hi; - - rdmsr(MSR_IA32_EBL_CR_POWERON, lo, hi); - invalue = (lo & (1<<22|1<<23|1<<24|1<<25))>>22; - if (longhaul_version == TYPE_LONGHAUL_V2 || - longhaul_version == TYPE_POWERSAVER) { - if (lo & (1<<27)) - invalue += 16; - } - return eblcr[invalue]; -} - -/* For processor with BCR2 MSR */ - -static void do_longhaul1(unsigned int mults_index) -{ - union msr_bcr2 bcr2; - - rdmsrl(MSR_VIA_BCR2, bcr2.val); - /* Enable software clock multiplier */ - bcr2.bits.ESOFTBF = 1; - bcr2.bits.CLOCKMUL = mults_index & 0xff; - - /* Sync to timer tick */ - safe_halt(); - /* Change frequency on next halt or sleep */ - wrmsrl(MSR_VIA_BCR2, bcr2.val); - /* Invoke transition */ - ACPI_FLUSH_CPU_CACHE(); - halt(); - - /* Disable software clock multiplier */ - local_irq_disable(); - rdmsrl(MSR_VIA_BCR2, bcr2.val); - bcr2.bits.ESOFTBF = 0; - wrmsrl(MSR_VIA_BCR2, bcr2.val); -} - -/* For processor with Longhaul MSR */ - -static void do_powersaver(int cx_address, unsigned int mults_index, - unsigned int dir) -{ - union msr_longhaul longhaul; - u32 t; - - rdmsrl(MSR_VIA_LONGHAUL, longhaul.val); - /* Setup new frequency */ - if (!revid_errata) - longhaul.bits.RevisionKey = longhaul.bits.RevisionID; - else - longhaul.bits.RevisionKey = 0; - longhaul.bits.SoftBusRatio = mults_index & 0xf; - longhaul.bits.SoftBusRatio4 = (mults_index & 0x10) >> 4; - /* Setup new voltage */ - if (can_scale_voltage) - longhaul.bits.SoftVID = (mults_index >> 8) & 0x1f; - /* Sync to timer tick */ - safe_halt(); - /* Raise voltage if necessary */ - if (can_scale_voltage && dir) { - longhaul.bits.EnableSoftVID = 1; - wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); - /* Change voltage */ - if (!cx_address) { - ACPI_FLUSH_CPU_CACHE(); - halt(); - } else { - ACPI_FLUSH_CPU_CACHE(); - /* Invoke C3 */ - inb(cx_address); - /* Dummy op - must do something useless after P_LVL3 - * read */ - t = inl(acpi_gbl_FADT.xpm_timer_block.address); - } - longhaul.bits.EnableSoftVID = 0; - wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); - } - - /* Change frequency on next halt or sleep */ - longhaul.bits.EnableSoftBusRatio = 1; - wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); - if (!cx_address) { - ACPI_FLUSH_CPU_CACHE(); - halt(); - } else { - ACPI_FLUSH_CPU_CACHE(); - /* Invoke C3 */ - inb(cx_address); - /* Dummy op - must do something useless after P_LVL3 read */ - t = inl(acpi_gbl_FADT.xpm_timer_block.address); - } - /* Disable bus ratio bit */ - longhaul.bits.EnableSoftBusRatio = 0; - wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); - - /* Reduce voltage if necessary */ - if (can_scale_voltage && !dir) { - longhaul.bits.EnableSoftVID = 1; - wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); - /* Change voltage */ - if (!cx_address) { - ACPI_FLUSH_CPU_CACHE(); - halt(); - } else { - ACPI_FLUSH_CPU_CACHE(); - /* Invoke C3 */ - inb(cx_address); - /* Dummy op - must do something useless after P_LVL3 - * read */ - t = inl(acpi_gbl_FADT.xpm_timer_block.address); - } - longhaul.bits.EnableSoftVID = 0; - wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); - } -} - -/** - * longhaul_set_cpu_frequency() - * @mults_index : bitpattern of the new multiplier. - * - * Sets a new clock ratio. - */ - -static void longhaul_setstate(unsigned int table_index) -{ - unsigned int mults_index; - int speed, mult; - struct cpufreq_freqs freqs; - unsigned long flags; - unsigned int pic1_mask, pic2_mask; - u16 bm_status = 0; - u32 bm_timeout = 1000; - unsigned int dir = 0; - - mults_index = longhaul_table[table_index].index; - /* Safety precautions */ - mult = mults[mults_index & 0x1f]; - if (mult == -1) - return; - speed = calc_speed(mult); - if ((speed > highest_speed) || (speed < lowest_speed)) - return; - /* Voltage transition before frequency transition? */ - if (can_scale_voltage && longhaul_index < table_index) - dir = 1; - - freqs.old = calc_speed(longhaul_get_cpu_mult()); - freqs.new = speed; - freqs.cpu = 0; /* longhaul.c is UP only driver */ - - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - - dprintk("Setting to FSB:%dMHz Mult:%d.%dx (%s)\n", - fsb, mult/10, mult%10, print_speed(speed/1000)); -retry_loop: - preempt_disable(); - local_irq_save(flags); - - pic2_mask = inb(0xA1); - pic1_mask = inb(0x21); /* works on C3. save mask. */ - outb(0xFF, 0xA1); /* Overkill */ - outb(0xFE, 0x21); /* TMR0 only */ - - /* Wait while PCI bus is busy. */ - if (acpi_regs_addr && (longhaul_flags & USE_NORTHBRIDGE - || ((pr != NULL) && pr->flags.bm_control))) { - bm_status = inw(acpi_regs_addr); - bm_status &= 1 << 4; - while (bm_status && bm_timeout) { - outw(1 << 4, acpi_regs_addr); - bm_timeout--; - bm_status = inw(acpi_regs_addr); - bm_status &= 1 << 4; - } - } - - if (longhaul_flags & USE_NORTHBRIDGE) { - /* Disable AGP and PCI arbiters */ - outb(3, 0x22); - } else if ((pr != NULL) && pr->flags.bm_control) { - /* Disable bus master arbitration */ - acpi_write_bit_register(ACPI_BITREG_ARB_DISABLE, 1); - } - switch (longhaul_version) { - - /* - * Longhaul v1. (Samuel[C5A] and Samuel2 stepping 0[C5B]) - * Software controlled multipliers only. - */ - case TYPE_LONGHAUL_V1: - do_longhaul1(mults_index); - break; - - /* - * Longhaul v2 appears in Samuel2 Steppings 1->7 [C5B] and Ezra [C5C] - * - * Longhaul v3 (aka Powersaver). (Ezra-T [C5M] & Nehemiah [C5N]) - * Nehemiah can do FSB scaling too, but this has never been proven - * to work in practice. - */ - case TYPE_LONGHAUL_V2: - case TYPE_POWERSAVER: - if (longhaul_flags & USE_ACPI_C3) { - /* Don't allow wakeup */ - acpi_write_bit_register(ACPI_BITREG_BUS_MASTER_RLD, 0); - do_powersaver(cx->address, mults_index, dir); - } else { - do_powersaver(0, mults_index, dir); - } - break; - } - - if (longhaul_flags & USE_NORTHBRIDGE) { - /* Enable arbiters */ - outb(0, 0x22); - } else if ((pr != NULL) && pr->flags.bm_control) { - /* Enable bus master arbitration */ - acpi_write_bit_register(ACPI_BITREG_ARB_DISABLE, 0); - } - outb(pic2_mask, 0xA1); /* restore mask */ - outb(pic1_mask, 0x21); - - local_irq_restore(flags); - preempt_enable(); - - freqs.new = calc_speed(longhaul_get_cpu_mult()); - /* Check if requested frequency is set. */ - if (unlikely(freqs.new != speed)) { - printk(KERN_INFO PFX "Failed to set requested frequency!\n"); - /* Revision ID = 1 but processor is expecting revision key - * equal to 0. Jumpers at the bottom of processor will change - * multiplier and FSB, but will not change bits in Longhaul - * MSR nor enable voltage scaling. */ - if (!revid_errata) { - printk(KERN_INFO PFX "Enabling \"Ignore Revision ID\" " - "option.\n"); - revid_errata = 1; - msleep(200); - goto retry_loop; - } - /* Why ACPI C3 sometimes doesn't work is a mystery for me. - * But it does happen. Processor is entering ACPI C3 state, - * but it doesn't change frequency. I tried poking various - * bits in northbridge registers, but without success. */ - if (longhaul_flags & USE_ACPI_C3) { - printk(KERN_INFO PFX "Disabling ACPI C3 support.\n"); - longhaul_flags &= ~USE_ACPI_C3; - if (revid_errata) { - printk(KERN_INFO PFX "Disabling \"Ignore " - "Revision ID\" option.\n"); - revid_errata = 0; - } - msleep(200); - goto retry_loop; - } - /* This shouldn't happen. Longhaul ver. 2 was reported not - * working on processors without voltage scaling, but with - * RevID = 1. RevID errata will make things right. Just - * to be 100% sure. */ - if (longhaul_version == TYPE_LONGHAUL_V2) { - printk(KERN_INFO PFX "Switching to Longhaul ver. 1\n"); - longhaul_version = TYPE_LONGHAUL_V1; - msleep(200); - goto retry_loop; - } - } - /* Report true CPU frequency */ - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - - if (!bm_timeout) - printk(KERN_INFO PFX "Warning: Timeout while waiting for " - "idle PCI bus.\n"); -} - -/* - * Centaur decided to make life a little more tricky. - * Only longhaul v1 is allowed to read EBLCR BSEL[0:1]. - * Samuel2 and above have to try and guess what the FSB is. - * We do this by assuming we booted at maximum multiplier, and interpolate - * between that value multiplied by possible FSBs and cpu_mhz which - * was calculated at boot time. Really ugly, but no other way to do this. - */ - -#define ROUNDING 0xf - -static int guess_fsb(int mult) -{ - int speed = cpu_khz / 1000; - int i; - int speeds[] = { 666, 1000, 1333, 2000 }; - int f_max, f_min; - - for (i = 0; i < 4; i++) { - f_max = ((speeds[i] * mult) + 50) / 100; - f_max += (ROUNDING / 2); - f_min = f_max - ROUNDING; - if ((speed <= f_max) && (speed >= f_min)) - return speeds[i] / 10; - } - return 0; -} - - -static int __cpuinit longhaul_get_ranges(void) -{ - unsigned int i, j, k = 0; - unsigned int ratio; - int mult; - - /* Get current frequency */ - mult = longhaul_get_cpu_mult(); - if (mult == -1) { - printk(KERN_INFO PFX "Invalid (reserved) multiplier!\n"); - return -EINVAL; - } - fsb = guess_fsb(mult); - if (fsb == 0) { - printk(KERN_INFO PFX "Invalid (reserved) FSB!\n"); - return -EINVAL; - } - /* Get max multiplier - as we always did. - * Longhaul MSR is usefull only when voltage scaling is enabled. - * C3 is booting at max anyway. */ - maxmult = mult; - /* Get min multiplier */ - switch (cpu_model) { - case CPU_NEHEMIAH: - minmult = 50; - break; - case CPU_NEHEMIAH_C: - minmult = 40; - break; - default: - minmult = 30; - break; - } - - dprintk("MinMult:%d.%dx MaxMult:%d.%dx\n", - minmult/10, minmult%10, maxmult/10, maxmult%10); - - highest_speed = calc_speed(maxmult); - lowest_speed = calc_speed(minmult); - dprintk("FSB:%dMHz Lowest speed: %s Highest speed:%s\n", fsb, - print_speed(lowest_speed/1000), - print_speed(highest_speed/1000)); - - if (lowest_speed == highest_speed) { - printk(KERN_INFO PFX "highestspeed == lowest, aborting.\n"); - return -EINVAL; - } - if (lowest_speed > highest_speed) { - printk(KERN_INFO PFX "nonsense! lowest (%d > %d) !\n", - lowest_speed, highest_speed); - return -EINVAL; - } - - longhaul_table = kmalloc((numscales + 1) * sizeof(*longhaul_table), - GFP_KERNEL); - if (!longhaul_table) - return -ENOMEM; - - for (j = 0; j < numscales; j++) { - ratio = mults[j]; - if (ratio == -1) - continue; - if (ratio > maxmult || ratio < minmult) - continue; - longhaul_table[k].frequency = calc_speed(ratio); - longhaul_table[k].index = j; - k++; - } - if (k <= 1) { - kfree(longhaul_table); - return -ENODEV; - } - /* Sort */ - for (j = 0; j < k - 1; j++) { - unsigned int min_f, min_i; - min_f = longhaul_table[j].frequency; - min_i = j; - for (i = j + 1; i < k; i++) { - if (longhaul_table[i].frequency < min_f) { - min_f = longhaul_table[i].frequency; - min_i = i; - } - } - if (min_i != j) { - swap(longhaul_table[j].frequency, - longhaul_table[min_i].frequency); - swap(longhaul_table[j].index, - longhaul_table[min_i].index); - } - } - - longhaul_table[k].frequency = CPUFREQ_TABLE_END; - - /* Find index we are running on */ - for (j = 0; j < k; j++) { - if (mults[longhaul_table[j].index & 0x1f] == mult) { - longhaul_index = j; - break; - } - } - return 0; -} - - -static void __cpuinit longhaul_setup_voltagescaling(void) -{ - union msr_longhaul longhaul; - struct mV_pos minvid, maxvid, vid; - unsigned int j, speed, pos, kHz_step, numvscales; - int min_vid_speed; - - rdmsrl(MSR_VIA_LONGHAUL, longhaul.val); - if (!(longhaul.bits.RevisionID & 1)) { - printk(KERN_INFO PFX "Voltage scaling not supported by CPU.\n"); - return; - } - - if (!longhaul.bits.VRMRev) { - printk(KERN_INFO PFX "VRM 8.5\n"); - vrm_mV_table = &vrm85_mV[0]; - mV_vrm_table = &mV_vrm85[0]; - } else { - printk(KERN_INFO PFX "Mobile VRM\n"); - if (cpu_model < CPU_NEHEMIAH) - return; - vrm_mV_table = &mobilevrm_mV[0]; - mV_vrm_table = &mV_mobilevrm[0]; - } - - minvid = vrm_mV_table[longhaul.bits.MinimumVID]; - maxvid = vrm_mV_table[longhaul.bits.MaximumVID]; - - if (minvid.mV == 0 || maxvid.mV == 0 || minvid.mV > maxvid.mV) { - printk(KERN_INFO PFX "Bogus values Min:%d.%03d Max:%d.%03d. " - "Voltage scaling disabled.\n", - minvid.mV/1000, minvid.mV%1000, - maxvid.mV/1000, maxvid.mV%1000); - return; - } - - if (minvid.mV == maxvid.mV) { - printk(KERN_INFO PFX "Claims to support voltage scaling but " - "min & max are both %d.%03d. " - "Voltage scaling disabled\n", - maxvid.mV/1000, maxvid.mV%1000); - return; - } - - /* How many voltage steps*/ - numvscales = maxvid.pos - minvid.pos + 1; - printk(KERN_INFO PFX - "Max VID=%d.%03d " - "Min VID=%d.%03d, " - "%d possible voltage scales\n", - maxvid.mV/1000, maxvid.mV%1000, - minvid.mV/1000, minvid.mV%1000, - numvscales); - - /* Calculate max frequency at min voltage */ - j = longhaul.bits.MinMHzBR; - if (longhaul.bits.MinMHzBR4) - j += 16; - min_vid_speed = eblcr[j]; - if (min_vid_speed == -1) - return; - switch (longhaul.bits.MinMHzFSB) { - case 0: - min_vid_speed *= 13333; - break; - case 1: - min_vid_speed *= 10000; - break; - case 3: - min_vid_speed *= 6666; - break; - default: - return; - break; - } - if (min_vid_speed >= highest_speed) - return; - /* Calculate kHz for one voltage step */ - kHz_step = (highest_speed - min_vid_speed) / numvscales; - - j = 0; - while (longhaul_table[j].frequency != CPUFREQ_TABLE_END) { - speed = longhaul_table[j].frequency; - if (speed > min_vid_speed) - pos = (speed - min_vid_speed) / kHz_step + minvid.pos; - else - pos = minvid.pos; - longhaul_table[j].index |= mV_vrm_table[pos] << 8; - vid = vrm_mV_table[mV_vrm_table[pos]]; - printk(KERN_INFO PFX "f: %d kHz, index: %d, vid: %d mV\n", - speed, j, vid.mV); - j++; - } - - can_scale_voltage = 1; - printk(KERN_INFO PFX "Voltage scaling enabled.\n"); -} - - -static int longhaul_verify(struct cpufreq_policy *policy) -{ - return cpufreq_frequency_table_verify(policy, longhaul_table); -} - - -static int longhaul_target(struct cpufreq_policy *policy, - unsigned int target_freq, unsigned int relation) -{ - unsigned int table_index = 0; - unsigned int i; - unsigned int dir = 0; - u8 vid, current_vid; - - if (cpufreq_frequency_table_target(policy, longhaul_table, target_freq, - relation, &table_index)) - return -EINVAL; - - /* Don't set same frequency again */ - if (longhaul_index == table_index) - return 0; - - if (!can_scale_voltage) - longhaul_setstate(table_index); - else { - /* On test system voltage transitions exceeding single - * step up or down were turning motherboard off. Both - * "ondemand" and "userspace" are unsafe. C7 is doing - * this in hardware, C3 is old and we need to do this - * in software. */ - i = longhaul_index; - current_vid = (longhaul_table[longhaul_index].index >> 8); - current_vid &= 0x1f; - if (table_index > longhaul_index) - dir = 1; - while (i != table_index) { - vid = (longhaul_table[i].index >> 8) & 0x1f; - if (vid != current_vid) { - longhaul_setstate(i); - current_vid = vid; - msleep(200); - } - if (dir) - i++; - else - i--; - } - longhaul_setstate(table_index); - } - longhaul_index = table_index; - return 0; -} - - -static unsigned int longhaul_get(unsigned int cpu) -{ - if (cpu) - return 0; - return calc_speed(longhaul_get_cpu_mult()); -} - -static acpi_status longhaul_walk_callback(acpi_handle obj_handle, - u32 nesting_level, - void *context, void **return_value) -{ - struct acpi_device *d; - - if (acpi_bus_get_device(obj_handle, &d)) - return 0; - - *return_value = acpi_driver_data(d); - return 1; -} - -/* VIA don't support PM2 reg, but have something similar */ -static int enable_arbiter_disable(void) -{ - struct pci_dev *dev; - int status = 1; - int reg; - u8 pci_cmd; - - /* Find PLE133 host bridge */ - reg = 0x78; - dev = pci_get_device(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8601_0, - NULL); - /* Find PM133/VT8605 host bridge */ - if (dev == NULL) - dev = pci_get_device(PCI_VENDOR_ID_VIA, - PCI_DEVICE_ID_VIA_8605_0, NULL); - /* Find CLE266 host bridge */ - if (dev == NULL) { - reg = 0x76; - dev = pci_get_device(PCI_VENDOR_ID_VIA, - PCI_DEVICE_ID_VIA_862X_0, NULL); - /* Find CN400 V-Link host bridge */ - if (dev == NULL) - dev = pci_get_device(PCI_VENDOR_ID_VIA, 0x7259, NULL); - } - if (dev != NULL) { - /* Enable access to port 0x22 */ - pci_read_config_byte(dev, reg, &pci_cmd); - if (!(pci_cmd & 1<<7)) { - pci_cmd |= 1<<7; - pci_write_config_byte(dev, reg, pci_cmd); - pci_read_config_byte(dev, reg, &pci_cmd); - if (!(pci_cmd & 1<<7)) { - printk(KERN_ERR PFX - "Can't enable access to port 0x22.\n"); - status = 0; - } - } - pci_dev_put(dev); - return status; - } - return 0; -} - -static int longhaul_setup_southbridge(void) -{ - struct pci_dev *dev; - u8 pci_cmd; - - /* Find VT8235 southbridge */ - dev = pci_get_device(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235, NULL); - if (dev == NULL) - /* Find VT8237 southbridge */ - dev = pci_get_device(PCI_VENDOR_ID_VIA, - PCI_DEVICE_ID_VIA_8237, NULL); - if (dev != NULL) { - /* Set transition time to max */ - pci_read_config_byte(dev, 0xec, &pci_cmd); - pci_cmd &= ~(1 << 2); - pci_write_config_byte(dev, 0xec, pci_cmd); - pci_read_config_byte(dev, 0xe4, &pci_cmd); - pci_cmd &= ~(1 << 7); - pci_write_config_byte(dev, 0xe4, pci_cmd); - pci_read_config_byte(dev, 0xe5, &pci_cmd); - pci_cmd |= 1 << 7; - pci_write_config_byte(dev, 0xe5, pci_cmd); - /* Get address of ACPI registers block*/ - pci_read_config_byte(dev, 0x81, &pci_cmd); - if (pci_cmd & 1 << 7) { - pci_read_config_dword(dev, 0x88, &acpi_regs_addr); - acpi_regs_addr &= 0xff00; - printk(KERN_INFO PFX "ACPI I/O at 0x%x\n", - acpi_regs_addr); - } - - pci_dev_put(dev); - return 1; - } - return 0; -} - -static int __cpuinit longhaul_cpu_init(struct cpufreq_policy *policy) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - char *cpuname = NULL; - int ret; - u32 lo, hi; - - /* Check what we have on this motherboard */ - switch (c->x86_model) { - case 6: - cpu_model = CPU_SAMUEL; - cpuname = "C3 'Samuel' [C5A]"; - longhaul_version = TYPE_LONGHAUL_V1; - memcpy(mults, samuel1_mults, sizeof(samuel1_mults)); - memcpy(eblcr, samuel1_eblcr, sizeof(samuel1_eblcr)); - break; - - case 7: - switch (c->x86_mask) { - case 0: - longhaul_version = TYPE_LONGHAUL_V1; - cpu_model = CPU_SAMUEL2; - cpuname = "C3 'Samuel 2' [C5B]"; - /* Note, this is not a typo, early Samuel2's had - * Samuel1 ratios. */ - memcpy(mults, samuel1_mults, sizeof(samuel1_mults)); - memcpy(eblcr, samuel2_eblcr, sizeof(samuel2_eblcr)); - break; - case 1 ... 15: - longhaul_version = TYPE_LONGHAUL_V2; - if (c->x86_mask < 8) { - cpu_model = CPU_SAMUEL2; - cpuname = "C3 'Samuel 2' [C5B]"; - } else { - cpu_model = CPU_EZRA; - cpuname = "C3 'Ezra' [C5C]"; - } - memcpy(mults, ezra_mults, sizeof(ezra_mults)); - memcpy(eblcr, ezra_eblcr, sizeof(ezra_eblcr)); - break; - } - break; - - case 8: - cpu_model = CPU_EZRA_T; - cpuname = "C3 'Ezra-T' [C5M]"; - longhaul_version = TYPE_POWERSAVER; - numscales = 32; - memcpy(mults, ezrat_mults, sizeof(ezrat_mults)); - memcpy(eblcr, ezrat_eblcr, sizeof(ezrat_eblcr)); - break; - - case 9: - longhaul_version = TYPE_POWERSAVER; - numscales = 32; - memcpy(mults, nehemiah_mults, sizeof(nehemiah_mults)); - memcpy(eblcr, nehemiah_eblcr, sizeof(nehemiah_eblcr)); - switch (c->x86_mask) { - case 0 ... 1: - cpu_model = CPU_NEHEMIAH; - cpuname = "C3 'Nehemiah A' [C5XLOE]"; - break; - case 2 ... 4: - cpu_model = CPU_NEHEMIAH; - cpuname = "C3 'Nehemiah B' [C5XLOH]"; - break; - case 5 ... 15: - cpu_model = CPU_NEHEMIAH_C; - cpuname = "C3 'Nehemiah C' [C5P]"; - break; - } - break; - - default: - cpuname = "Unknown"; - break; - } - /* Check Longhaul ver. 2 */ - if (longhaul_version == TYPE_LONGHAUL_V2) { - rdmsr(MSR_VIA_LONGHAUL, lo, hi); - if (lo == 0 && hi == 0) - /* Looks like MSR isn't present */ - longhaul_version = TYPE_LONGHAUL_V1; - } - - printk(KERN_INFO PFX "VIA %s CPU detected. ", cpuname); - switch (longhaul_version) { - case TYPE_LONGHAUL_V1: - case TYPE_LONGHAUL_V2: - printk(KERN_CONT "Longhaul v%d supported.\n", longhaul_version); - break; - case TYPE_POWERSAVER: - printk(KERN_CONT "Powersaver supported.\n"); - break; - }; - - /* Doesn't hurt */ - longhaul_setup_southbridge(); - - /* Find ACPI data for processor */ - acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT, - ACPI_UINT32_MAX, &longhaul_walk_callback, NULL, - NULL, (void *)&pr); - - /* Check ACPI support for C3 state */ - if (pr != NULL && longhaul_version == TYPE_POWERSAVER) { - cx = &pr->power.states[ACPI_STATE_C3]; - if (cx->address > 0 && cx->latency <= 1000) - longhaul_flags |= USE_ACPI_C3; - } - /* Disable if it isn't working */ - if (disable_acpi_c3) - longhaul_flags &= ~USE_ACPI_C3; - /* Check if northbridge is friendly */ - if (enable_arbiter_disable()) - longhaul_flags |= USE_NORTHBRIDGE; - - /* Check ACPI support for bus master arbiter disable */ - if (!(longhaul_flags & USE_ACPI_C3 - || longhaul_flags & USE_NORTHBRIDGE) - && ((pr == NULL) || !(pr->flags.bm_control))) { - printk(KERN_ERR PFX - "No ACPI support. Unsupported northbridge.\n"); - return -ENODEV; - } - - if (longhaul_flags & USE_NORTHBRIDGE) - printk(KERN_INFO PFX "Using northbridge support.\n"); - if (longhaul_flags & USE_ACPI_C3) - printk(KERN_INFO PFX "Using ACPI support.\n"); - - ret = longhaul_get_ranges(); - if (ret != 0) - return ret; - - if ((longhaul_version != TYPE_LONGHAUL_V1) && (scale_voltage != 0)) - longhaul_setup_voltagescaling(); - - policy->cpuinfo.transition_latency = 200000; /* nsec */ - policy->cur = calc_speed(longhaul_get_cpu_mult()); - - ret = cpufreq_frequency_table_cpuinfo(policy, longhaul_table); - if (ret) - return ret; - - cpufreq_frequency_table_get_attr(longhaul_table, policy->cpu); - - return 0; -} - -static int __devexit longhaul_cpu_exit(struct cpufreq_policy *policy) -{ - cpufreq_frequency_table_put_attr(policy->cpu); - return 0; -} - -static struct freq_attr *longhaul_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - NULL, -}; - -static struct cpufreq_driver longhaul_driver = { - .verify = longhaul_verify, - .target = longhaul_target, - .get = longhaul_get, - .init = longhaul_cpu_init, - .exit = __devexit_p(longhaul_cpu_exit), - .name = "longhaul", - .owner = THIS_MODULE, - .attr = longhaul_attr, -}; - - -static int __init longhaul_init(void) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - - if (c->x86_vendor != X86_VENDOR_CENTAUR || c->x86 != 6) - return -ENODEV; - -#ifdef CONFIG_SMP - if (num_online_cpus() > 1) { - printk(KERN_ERR PFX "More than 1 CPU detected, " - "longhaul disabled.\n"); - return -ENODEV; - } -#endif -#ifdef CONFIG_X86_IO_APIC - if (cpu_has_apic) { - printk(KERN_ERR PFX "APIC detected. Longhaul is currently " - "broken in this configuration.\n"); - return -ENODEV; - } -#endif - switch (c->x86_model) { - case 6 ... 9: - return cpufreq_register_driver(&longhaul_driver); - case 10: - printk(KERN_ERR PFX "Use acpi-cpufreq driver for VIA C7\n"); - default: - ; - } - - return -ENODEV; -} - - -static void __exit longhaul_exit(void) -{ - int i; - - for (i = 0; i < numscales; i++) { - if (mults[i] == maxmult) { - longhaul_setstate(i); - break; - } - } - - cpufreq_unregister_driver(&longhaul_driver); - kfree(longhaul_table); -} - -/* Even if BIOS is exporting ACPI C3 state, and it is used - * with success when CPU is idle, this state doesn't - * trigger frequency transition in some cases. */ -module_param(disable_acpi_c3, int, 0644); -MODULE_PARM_DESC(disable_acpi_c3, "Don't use ACPI C3 support"); -/* Change CPU voltage with frequency. Very usefull to save - * power, but most VIA C3 processors aren't supporting it. */ -module_param(scale_voltage, int, 0644); -MODULE_PARM_DESC(scale_voltage, "Scale voltage of processor"); -/* Force revision key to 0 for processors which doesn't - * support voltage scaling, but are introducing itself as - * such. */ -module_param(revid_errata, int, 0644); -MODULE_PARM_DESC(revid_errata, "Ignore CPU Revision ID"); - -MODULE_AUTHOR("Dave Jones <davej@redhat.com>"); -MODULE_DESCRIPTION("Longhaul driver for VIA Cyrix processors."); -MODULE_LICENSE("GPL"); - -late_initcall(longhaul_init); -module_exit(longhaul_exit); diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.h b/arch/x86/kernel/cpu/cpufreq/longhaul.h deleted file mode 100644 index cbf48fbca88..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/longhaul.h +++ /dev/null @@ -1,353 +0,0 @@ -/* - * longhaul.h - * (C) 2003 Dave Jones. - * - * Licensed under the terms of the GNU GPL License version 2. - * - * VIA-specific information - */ - -union msr_bcr2 { - struct { - unsigned Reseved:19, // 18:0 - ESOFTBF:1, // 19 - Reserved2:3, // 22:20 - CLOCKMUL:4, // 26:23 - Reserved3:5; // 31:27 - } bits; - unsigned long val; -}; - -union msr_longhaul { - struct { - unsigned RevisionID:4, // 3:0 - RevisionKey:4, // 7:4 - EnableSoftBusRatio:1, // 8 - EnableSoftVID:1, // 9 - EnableSoftBSEL:1, // 10 - Reserved:3, // 11:13 - SoftBusRatio4:1, // 14 - VRMRev:1, // 15 - SoftBusRatio:4, // 19:16 - SoftVID:5, // 24:20 - Reserved2:3, // 27:25 - SoftBSEL:2, // 29:28 - Reserved3:2, // 31:30 - MaxMHzBR:4, // 35:32 - MaximumVID:5, // 40:36 - MaxMHzFSB:2, // 42:41 - MaxMHzBR4:1, // 43 - Reserved4:4, // 47:44 - MinMHzBR:4, // 51:48 - MinimumVID:5, // 56:52 - MinMHzFSB:2, // 58:57 - MinMHzBR4:1, // 59 - Reserved5:4; // 63:60 - } bits; - unsigned long long val; -}; - -/* - * Clock ratio tables. Div/Mod by 10 to get ratio. - * The eblcr values specify the ratio read from the CPU. - * The mults values specify what to write to the CPU. - */ - -/* - * VIA C3 Samuel 1 & Samuel 2 (stepping 0) - */ -static const int __cpuinitdata samuel1_mults[16] = { - -1, /* 0000 -> RESERVED */ - 30, /* 0001 -> 3.0x */ - 40, /* 0010 -> 4.0x */ - -1, /* 0011 -> RESERVED */ - -1, /* 0100 -> RESERVED */ - 35, /* 0101 -> 3.5x */ - 45, /* 0110 -> 4.5x */ - 55, /* 0111 -> 5.5x */ - 60, /* 1000 -> 6.0x */ - 70, /* 1001 -> 7.0x */ - 80, /* 1010 -> 8.0x */ - 50, /* 1011 -> 5.0x */ - 65, /* 1100 -> 6.5x */ - 75, /* 1101 -> 7.5x */ - -1, /* 1110 -> RESERVED */ - -1, /* 1111 -> RESERVED */ -}; - -static const int __cpuinitdata samuel1_eblcr[16] = { - 50, /* 0000 -> RESERVED */ - 30, /* 0001 -> 3.0x */ - 40, /* 0010 -> 4.0x */ - -1, /* 0011 -> RESERVED */ - 55, /* 0100 -> 5.5x */ - 35, /* 0101 -> 3.5x */ - 45, /* 0110 -> 4.5x */ - -1, /* 0111 -> RESERVED */ - -1, /* 1000 -> RESERVED */ - 70, /* 1001 -> 7.0x */ - 80, /* 1010 -> 8.0x */ - 60, /* 1011 -> 6.0x */ - -1, /* 1100 -> RESERVED */ - 75, /* 1101 -> 7.5x */ - -1, /* 1110 -> RESERVED */ - 65, /* 1111 -> 6.5x */ -}; - -/* - * VIA C3 Samuel2 Stepping 1->15 - */ -static const int __cpuinitdata samuel2_eblcr[16] = { - 50, /* 0000 -> 5.0x */ - 30, /* 0001 -> 3.0x */ - 40, /* 0010 -> 4.0x */ - 100, /* 0011 -> 10.0x */ - 55, /* 0100 -> 5.5x */ - 35, /* 0101 -> 3.5x */ - 45, /* 0110 -> 4.5x */ - 110, /* 0111 -> 11.0x */ - 90, /* 1000 -> 9.0x */ - 70, /* 1001 -> 7.0x */ - 80, /* 1010 -> 8.0x */ - 60, /* 1011 -> 6.0x */ - 120, /* 1100 -> 12.0x */ - 75, /* 1101 -> 7.5x */ - 130, /* 1110 -> 13.0x */ - 65, /* 1111 -> 6.5x */ -}; - -/* - * VIA C3 Ezra - */ -static const int __cpuinitdata ezra_mults[16] = { - 100, /* 0000 -> 10.0x */ - 30, /* 0001 -> 3.0x */ - 40, /* 0010 -> 4.0x */ - 90, /* 0011 -> 9.0x */ - 95, /* 0100 -> 9.5x */ - 35, /* 0101 -> 3.5x */ - 45, /* 0110 -> 4.5x */ - 55, /* 0111 -> 5.5x */ - 60, /* 1000 -> 6.0x */ - 70, /* 1001 -> 7.0x */ - 80, /* 1010 -> 8.0x */ - 50, /* 1011 -> 5.0x */ - 65, /* 1100 -> 6.5x */ - 75, /* 1101 -> 7.5x */ - 85, /* 1110 -> 8.5x */ - 120, /* 1111 -> 12.0x */ -}; - -static const int __cpuinitdata ezra_eblcr[16] = { - 50, /* 0000 -> 5.0x */ - 30, /* 0001 -> 3.0x */ - 40, /* 0010 -> 4.0x */ - 100, /* 0011 -> 10.0x */ - 55, /* 0100 -> 5.5x */ - 35, /* 0101 -> 3.5x */ - 45, /* 0110 -> 4.5x */ - 95, /* 0111 -> 9.5x */ - 90, /* 1000 -> 9.0x */ - 70, /* 1001 -> 7.0x */ - 80, /* 1010 -> 8.0x */ - 60, /* 1011 -> 6.0x */ - 120, /* 1100 -> 12.0x */ - 75, /* 1101 -> 7.5x */ - 85, /* 1110 -> 8.5x */ - 65, /* 1111 -> 6.5x */ -}; - -/* - * VIA C3 (Ezra-T) [C5M]. - */ -static const int __cpuinitdata ezrat_mults[32] = { - 100, /* 0000 -> 10.0x */ - 30, /* 0001 -> 3.0x */ - 40, /* 0010 -> 4.0x */ - 90, /* 0011 -> 9.0x */ - 95, /* 0100 -> 9.5x */ - 35, /* 0101 -> 3.5x */ - 45, /* 0110 -> 4.5x */ - 55, /* 0111 -> 5.5x */ - 60, /* 1000 -> 6.0x */ - 70, /* 1001 -> 7.0x */ - 80, /* 1010 -> 8.0x */ - 50, /* 1011 -> 5.0x */ - 65, /* 1100 -> 6.5x */ - 75, /* 1101 -> 7.5x */ - 85, /* 1110 -> 8.5x */ - 120, /* 1111 -> 12.0x */ - - -1, /* 0000 -> RESERVED (10.0x) */ - 110, /* 0001 -> 11.0x */ - -1, /* 0010 -> 12.0x */ - -1, /* 0011 -> RESERVED (9.0x)*/ - 105, /* 0100 -> 10.5x */ - 115, /* 0101 -> 11.5x */ - 125, /* 0110 -> 12.5x */ - 135, /* 0111 -> 13.5x */ - 140, /* 1000 -> 14.0x */ - 150, /* 1001 -> 15.0x */ - 160, /* 1010 -> 16.0x */ - 130, /* 1011 -> 13.0x */ - 145, /* 1100 -> 14.5x */ - 155, /* 1101 -> 15.5x */ - -1, /* 1110 -> RESERVED (13.0x) */ - -1, /* 1111 -> RESERVED (12.0x) */ -}; - -static const int __cpuinitdata ezrat_eblcr[32] = { - 50, /* 0000 -> 5.0x */ - 30, /* 0001 -> 3.0x */ - 40, /* 0010 -> 4.0x */ - 100, /* 0011 -> 10.0x */ - 55, /* 0100 -> 5.5x */ - 35, /* 0101 -> 3.5x */ - 45, /* 0110 -> 4.5x */ - 95, /* 0111 -> 9.5x */ - 90, /* 1000 -> 9.0x */ - 70, /* 1001 -> 7.0x */ - 80, /* 1010 -> 8.0x */ - 60, /* 1011 -> 6.0x */ - 120, /* 1100 -> 12.0x */ - 75, /* 1101 -> 7.5x */ - 85, /* 1110 -> 8.5x */ - 65, /* 1111 -> 6.5x */ - - -1, /* 0000 -> RESERVED (9.0x) */ - 110, /* 0001 -> 11.0x */ - 120, /* 0010 -> 12.0x */ - -1, /* 0011 -> RESERVED (10.0x)*/ - 135, /* 0100 -> 13.5x */ - 115, /* 0101 -> 11.5x */ - 125, /* 0110 -> 12.5x */ - 105, /* 0111 -> 10.5x */ - 130, /* 1000 -> 13.0x */ - 150, /* 1001 -> 15.0x */ - 160, /* 1010 -> 16.0x */ - 140, /* 1011 -> 14.0x */ - -1, /* 1100 -> RESERVED (12.0x) */ - 155, /* 1101 -> 15.5x */ - -1, /* 1110 -> RESERVED (13.0x) */ - 145, /* 1111 -> 14.5x */ -}; - -/* - * VIA C3 Nehemiah */ - -static const int __cpuinitdata nehemiah_mults[32] = { - 100, /* 0000 -> 10.0x */ - -1, /* 0001 -> 16.0x */ - 40, /* 0010 -> 4.0x */ - 90, /* 0011 -> 9.0x */ - 95, /* 0100 -> 9.5x */ - -1, /* 0101 -> RESERVED */ - 45, /* 0110 -> 4.5x */ - 55, /* 0111 -> 5.5x */ - 60, /* 1000 -> 6.0x */ - 70, /* 1001 -> 7.0x */ - 80, /* 1010 -> 8.0x */ - 50, /* 1011 -> 5.0x */ - 65, /* 1100 -> 6.5x */ - 75, /* 1101 -> 7.5x */ - 85, /* 1110 -> 8.5x */ - 120, /* 1111 -> 12.0x */ - -1, /* 0000 -> 10.0x */ - 110, /* 0001 -> 11.0x */ - -1, /* 0010 -> 12.0x */ - -1, /* 0011 -> 9.0x */ - 105, /* 0100 -> 10.5x */ - 115, /* 0101 -> 11.5x */ - 125, /* 0110 -> 12.5x */ - 135, /* 0111 -> 13.5x */ - 140, /* 1000 -> 14.0x */ - 150, /* 1001 -> 15.0x */ - 160, /* 1010 -> 16.0x */ - 130, /* 1011 -> 13.0x */ - 145, /* 1100 -> 14.5x */ - 155, /* 1101 -> 15.5x */ - -1, /* 1110 -> RESERVED (13.0x) */ - -1, /* 1111 -> 12.0x */ -}; - -static const int __cpuinitdata nehemiah_eblcr[32] = { - 50, /* 0000 -> 5.0x */ - 160, /* 0001 -> 16.0x */ - 40, /* 0010 -> 4.0x */ - 100, /* 0011 -> 10.0x */ - 55, /* 0100 -> 5.5x */ - -1, /* 0101 -> RESERVED */ - 45, /* 0110 -> 4.5x */ - 95, /* 0111 -> 9.5x */ - 90, /* 1000 -> 9.0x */ - 70, /* 1001 -> 7.0x */ - 80, /* 1010 -> 8.0x */ - 60, /* 1011 -> 6.0x */ - 120, /* 1100 -> 12.0x */ - 75, /* 1101 -> 7.5x */ - 85, /* 1110 -> 8.5x */ - 65, /* 1111 -> 6.5x */ - 90, /* 0000 -> 9.0x */ - 110, /* 0001 -> 11.0x */ - 120, /* 0010 -> 12.0x */ - 100, /* 0011 -> 10.0x */ - 135, /* 0100 -> 13.5x */ - 115, /* 0101 -> 11.5x */ - 125, /* 0110 -> 12.5x */ - 105, /* 0111 -> 10.5x */ - 130, /* 1000 -> 13.0x */ - 150, /* 1001 -> 15.0x */ - 160, /* 1010 -> 16.0x */ - 140, /* 1011 -> 14.0x */ - 120, /* 1100 -> 12.0x */ - 155, /* 1101 -> 15.5x */ - -1, /* 1110 -> RESERVED (13.0x) */ - 145 /* 1111 -> 14.5x */ -}; - -/* - * Voltage scales. Div/Mod by 1000 to get actual voltage. - * Which scale to use depends on the VRM type in use. - */ - -struct mV_pos { - unsigned short mV; - unsigned short pos; -}; - -static const struct mV_pos __cpuinitdata vrm85_mV[32] = { - {1250, 8}, {1200, 6}, {1150, 4}, {1100, 2}, - {1050, 0}, {1800, 30}, {1750, 28}, {1700, 26}, - {1650, 24}, {1600, 22}, {1550, 20}, {1500, 18}, - {1450, 16}, {1400, 14}, {1350, 12}, {1300, 10}, - {1275, 9}, {1225, 7}, {1175, 5}, {1125, 3}, - {1075, 1}, {1825, 31}, {1775, 29}, {1725, 27}, - {1675, 25}, {1625, 23}, {1575, 21}, {1525, 19}, - {1475, 17}, {1425, 15}, {1375, 13}, {1325, 11} -}; - -static const unsigned char __cpuinitdata mV_vrm85[32] = { - 0x04, 0x14, 0x03, 0x13, 0x02, 0x12, 0x01, 0x11, - 0x00, 0x10, 0x0f, 0x1f, 0x0e, 0x1e, 0x0d, 0x1d, - 0x0c, 0x1c, 0x0b, 0x1b, 0x0a, 0x1a, 0x09, 0x19, - 0x08, 0x18, 0x07, 0x17, 0x06, 0x16, 0x05, 0x15 -}; - -static const struct mV_pos __cpuinitdata mobilevrm_mV[32] = { - {1750, 31}, {1700, 30}, {1650, 29}, {1600, 28}, - {1550, 27}, {1500, 26}, {1450, 25}, {1400, 24}, - {1350, 23}, {1300, 22}, {1250, 21}, {1200, 20}, - {1150, 19}, {1100, 18}, {1050, 17}, {1000, 16}, - {975, 15}, {950, 14}, {925, 13}, {900, 12}, - {875, 11}, {850, 10}, {825, 9}, {800, 8}, - {775, 7}, {750, 6}, {725, 5}, {700, 4}, - {675, 3}, {650, 2}, {625, 1}, {600, 0} -}; - -static const unsigned char __cpuinitdata mV_mobilevrm[32] = { - 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18, - 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, - 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, - 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00 -}; - diff --git a/arch/x86/kernel/cpu/cpufreq/longrun.c b/arch/x86/kernel/cpu/cpufreq/longrun.c deleted file mode 100644 index fc09f142d94..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/longrun.c +++ /dev/null @@ -1,327 +0,0 @@ -/* - * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de> - * - * Licensed under the terms of the GNU GPL License version 2. - * - * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous* - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/cpufreq.h> -#include <linux/timex.h> - -#include <asm/msr.h> -#include <asm/processor.h> - -#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ - "longrun", msg) - -static struct cpufreq_driver longrun_driver; - -/** - * longrun_{low,high}_freq is needed for the conversion of cpufreq kHz - * values into per cent values. In TMTA microcode, the following is valid: - * performance_pctg = (current_freq - low_freq)/(high_freq - low_freq) - */ -static unsigned int longrun_low_freq, longrun_high_freq; - - -/** - * longrun_get_policy - get the current LongRun policy - * @policy: struct cpufreq_policy where current policy is written into - * - * Reads the current LongRun policy by access to MSR_TMTA_LONGRUN_FLAGS - * and MSR_TMTA_LONGRUN_CTRL - */ -static void __init longrun_get_policy(struct cpufreq_policy *policy) -{ - u32 msr_lo, msr_hi; - - rdmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi); - dprintk("longrun flags are %x - %x\n", msr_lo, msr_hi); - if (msr_lo & 0x01) - policy->policy = CPUFREQ_POLICY_PERFORMANCE; - else - policy->policy = CPUFREQ_POLICY_POWERSAVE; - - rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi); - dprintk("longrun ctrl is %x - %x\n", msr_lo, msr_hi); - msr_lo &= 0x0000007F; - msr_hi &= 0x0000007F; - - if (longrun_high_freq <= longrun_low_freq) { - /* Assume degenerate Longrun table */ - policy->min = policy->max = longrun_high_freq; - } else { - policy->min = longrun_low_freq + msr_lo * - ((longrun_high_freq - longrun_low_freq) / 100); - policy->max = longrun_low_freq + msr_hi * - ((longrun_high_freq - longrun_low_freq) / 100); - } - policy->cpu = 0; -} - - -/** - * longrun_set_policy - sets a new CPUFreq policy - * @policy: new policy - * - * Sets a new CPUFreq policy on LongRun-capable processors. This function - * has to be called with cpufreq_driver locked. - */ -static int longrun_set_policy(struct cpufreq_policy *policy) -{ - u32 msr_lo, msr_hi; - u32 pctg_lo, pctg_hi; - - if (!policy) - return -EINVAL; - - if (longrun_high_freq <= longrun_low_freq) { - /* Assume degenerate Longrun table */ - pctg_lo = pctg_hi = 100; - } else { - pctg_lo = (policy->min - longrun_low_freq) / - ((longrun_high_freq - longrun_low_freq) / 100); - pctg_hi = (policy->max - longrun_low_freq) / - ((longrun_high_freq - longrun_low_freq) / 100); - } - - if (pctg_hi > 100) - pctg_hi = 100; - if (pctg_lo > pctg_hi) - pctg_lo = pctg_hi; - - /* performance or economy mode */ - rdmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi); - msr_lo &= 0xFFFFFFFE; - switch (policy->policy) { - case CPUFREQ_POLICY_PERFORMANCE: - msr_lo |= 0x00000001; - break; - case CPUFREQ_POLICY_POWERSAVE: - break; - } - wrmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi); - - /* lower and upper boundary */ - rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi); - msr_lo &= 0xFFFFFF80; - msr_hi &= 0xFFFFFF80; - msr_lo |= pctg_lo; - msr_hi |= pctg_hi; - wrmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi); - - return 0; -} - - -/** - * longrun_verify_poliy - verifies a new CPUFreq policy - * @policy: the policy to verify - * - * Validates a new CPUFreq policy. This function has to be called with - * cpufreq_driver locked. - */ -static int longrun_verify_policy(struct cpufreq_policy *policy) -{ - if (!policy) - return -EINVAL; - - policy->cpu = 0; - cpufreq_verify_within_limits(policy, - policy->cpuinfo.min_freq, - policy->cpuinfo.max_freq); - - if ((policy->policy != CPUFREQ_POLICY_POWERSAVE) && - (policy->policy != CPUFREQ_POLICY_PERFORMANCE)) - return -EINVAL; - - return 0; -} - -static unsigned int longrun_get(unsigned int cpu) -{ - u32 eax, ebx, ecx, edx; - - if (cpu) - return 0; - - cpuid(0x80860007, &eax, &ebx, &ecx, &edx); - dprintk("cpuid eax is %u\n", eax); - - return eax * 1000; -} - -/** - * longrun_determine_freqs - determines the lowest and highest possible core frequency - * @low_freq: an int to put the lowest frequency into - * @high_freq: an int to put the highest frequency into - * - * Determines the lowest and highest possible core frequencies on this CPU. - * This is necessary to calculate the performance percentage according to - * TMTA rules: - * performance_pctg = (target_freq - low_freq)/(high_freq - low_freq) - */ -static unsigned int __cpuinit longrun_determine_freqs(unsigned int *low_freq, - unsigned int *high_freq) -{ - u32 msr_lo, msr_hi; - u32 save_lo, save_hi; - u32 eax, ebx, ecx, edx; - u32 try_hi; - struct cpuinfo_x86 *c = &cpu_data(0); - - if (!low_freq || !high_freq) - return -EINVAL; - - if (cpu_has(c, X86_FEATURE_LRTI)) { - /* if the LongRun Table Interface is present, the - * detection is a bit easier: - * For minimum frequency, read out the maximum - * level (msr_hi), write that into "currently - * selected level", and read out the frequency. - * For maximum frequency, read out level zero. - */ - /* minimum */ - rdmsr(MSR_TMTA_LRTI_READOUT, msr_lo, msr_hi); - wrmsr(MSR_TMTA_LRTI_READOUT, msr_hi, msr_hi); - rdmsr(MSR_TMTA_LRTI_VOLT_MHZ, msr_lo, msr_hi); - *low_freq = msr_lo * 1000; /* to kHz */ - - /* maximum */ - wrmsr(MSR_TMTA_LRTI_READOUT, 0, msr_hi); - rdmsr(MSR_TMTA_LRTI_VOLT_MHZ, msr_lo, msr_hi); - *high_freq = msr_lo * 1000; /* to kHz */ - - dprintk("longrun table interface told %u - %u kHz\n", - *low_freq, *high_freq); - - if (*low_freq > *high_freq) - *low_freq = *high_freq; - return 0; - } - - /* set the upper border to the value determined during TSC init */ - *high_freq = (cpu_khz / 1000); - *high_freq = *high_freq * 1000; - dprintk("high frequency is %u kHz\n", *high_freq); - - /* get current borders */ - rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi); - save_lo = msr_lo & 0x0000007F; - save_hi = msr_hi & 0x0000007F; - - /* if current perf_pctg is larger than 90%, we need to decrease the - * upper limit to make the calculation more accurate. - */ - cpuid(0x80860007, &eax, &ebx, &ecx, &edx); - /* try decreasing in 10% steps, some processors react only - * on some barrier values */ - for (try_hi = 80; try_hi > 0 && ecx > 90; try_hi -= 10) { - /* set to 0 to try_hi perf_pctg */ - msr_lo &= 0xFFFFFF80; - msr_hi &= 0xFFFFFF80; - msr_hi |= try_hi; - wrmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi); - - /* read out current core MHz and current perf_pctg */ - cpuid(0x80860007, &eax, &ebx, &ecx, &edx); - - /* restore values */ - wrmsr(MSR_TMTA_LONGRUN_CTRL, save_lo, save_hi); - } - dprintk("percentage is %u %%, freq is %u MHz\n", ecx, eax); - - /* performance_pctg = (current_freq - low_freq)/(high_freq - low_freq) - * eqals - * low_freq * (1 - perf_pctg) = (cur_freq - high_freq * perf_pctg) - * - * high_freq * perf_pctg is stored tempoarily into "ebx". - */ - ebx = (((cpu_khz / 1000) * ecx) / 100); /* to MHz */ - - if ((ecx > 95) || (ecx == 0) || (eax < ebx)) - return -EIO; - - edx = ((eax - ebx) * 100) / (100 - ecx); - *low_freq = edx * 1000; /* back to kHz */ - - dprintk("low frequency is %u kHz\n", *low_freq); - - if (*low_freq > *high_freq) - *low_freq = *high_freq; - - return 0; -} - - -static int __cpuinit longrun_cpu_init(struct cpufreq_policy *policy) -{ - int result = 0; - - /* capability check */ - if (policy->cpu != 0) - return -ENODEV; - - /* detect low and high frequency */ - result = longrun_determine_freqs(&longrun_low_freq, &longrun_high_freq); - if (result) - return result; - - /* cpuinfo and default policy values */ - policy->cpuinfo.min_freq = longrun_low_freq; - policy->cpuinfo.max_freq = longrun_high_freq; - policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; - longrun_get_policy(policy); - - return 0; -} - - -static struct cpufreq_driver longrun_driver = { - .flags = CPUFREQ_CONST_LOOPS, - .verify = longrun_verify_policy, - .setpolicy = longrun_set_policy, - .get = longrun_get, - .init = longrun_cpu_init, - .name = "longrun", - .owner = THIS_MODULE, -}; - - -/** - * longrun_init - initializes the Transmeta Crusoe LongRun CPUFreq driver - * - * Initializes the LongRun support. - */ -static int __init longrun_init(void) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - - if (c->x86_vendor != X86_VENDOR_TRANSMETA || - !cpu_has(c, X86_FEATURE_LONGRUN)) - return -ENODEV; - - return cpufreq_register_driver(&longrun_driver); -} - - -/** - * longrun_exit - unregisters LongRun support - */ -static void __exit longrun_exit(void) -{ - cpufreq_unregister_driver(&longrun_driver); -} - - -MODULE_AUTHOR("Dominik Brodowski <linux@brodo.de>"); -MODULE_DESCRIPTION("LongRun driver for Transmeta Crusoe and " - "Efficeon processors."); -MODULE_LICENSE("GPL"); - -module_init(longrun_init); -module_exit(longrun_exit); diff --git a/arch/x86/kernel/cpu/cpufreq/mperf.c b/arch/x86/kernel/cpu/cpufreq/mperf.c deleted file mode 100644 index 911e193018a..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/mperf.c +++ /dev/null @@ -1,51 +0,0 @@ -#include <linux/kernel.h> -#include <linux/smp.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/cpufreq.h> -#include <linux/slab.h> - -#include "mperf.h" - -static DEFINE_PER_CPU(struct aperfmperf, acfreq_old_perf); - -/* Called via smp_call_function_single(), on the target CPU */ -static void read_measured_perf_ctrs(void *_cur) -{ - struct aperfmperf *am = _cur; - - get_aperfmperf(am); -} - -/* - * Return the measured active (C0) frequency on this CPU since last call - * to this function. - * Input: cpu number - * Return: Average CPU frequency in terms of max frequency (zero on error) - * - * We use IA32_MPERF and IA32_APERF MSRs to get the measured performance - * over a period of time, while CPU is in C0 state. - * IA32_MPERF counts at the rate of max advertised frequency - * IA32_APERF counts at the rate of actual CPU frequency - * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and - * no meaning should be associated with absolute values of these MSRs. - */ -unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy, - unsigned int cpu) -{ - struct aperfmperf perf; - unsigned long ratio; - unsigned int retval; - - if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1)) - return 0; - - ratio = calc_aperfmperf_ratio(&per_cpu(acfreq_old_perf, cpu), &perf); - per_cpu(acfreq_old_perf, cpu) = perf; - - retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT; - - return retval; -} -EXPORT_SYMBOL_GPL(cpufreq_get_measured_perf); -MODULE_LICENSE("GPL"); diff --git a/arch/x86/kernel/cpu/cpufreq/mperf.h b/arch/x86/kernel/cpu/cpufreq/mperf.h deleted file mode 100644 index 5dbf2950dc2..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/mperf.h +++ /dev/null @@ -1,9 +0,0 @@ -/* - * (c) 2010 Advanced Micro Devices, Inc. - * Your use of this code is subject to the terms and conditions of the - * GNU general public license version 2. See "COPYING" or - * http://www.gnu.org/licenses/gpl.html - */ - -unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy, - unsigned int cpu); diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c deleted file mode 100644 index bd1cac747f6..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c +++ /dev/null @@ -1,331 +0,0 @@ -/* - * Pentium 4/Xeon CPU on demand clock modulation/speed scaling - * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de> - * (C) 2002 Zwane Mwaikambo <zwane@commfireservices.com> - * (C) 2002 Arjan van de Ven <arjanv@redhat.com> - * (C) 2002 Tora T. Engstad - * All Rights Reserved - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * The author(s) of this software shall not be held liable for damages - * of any nature resulting due to the use of this software. This - * software is provided AS-IS with no warranties. - * - * Date Errata Description - * 20020525 N44, O17 12.5% or 25% DC causes lockup - * - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/smp.h> -#include <linux/cpufreq.h> -#include <linux/cpumask.h> -#include <linux/timex.h> - -#include <asm/processor.h> -#include <asm/msr.h> -#include <asm/timer.h> - -#include "speedstep-lib.h" - -#define PFX "p4-clockmod: " -#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ - "p4-clockmod", msg) - -/* - * Duty Cycle (3bits), note DC_DISABLE is not specified in - * intel docs i just use it to mean disable - */ -enum { - DC_RESV, DC_DFLT, DC_25PT, DC_38PT, DC_50PT, - DC_64PT, DC_75PT, DC_88PT, DC_DISABLE -}; - -#define DC_ENTRIES 8 - - -static int has_N44_O17_errata[NR_CPUS]; -static unsigned int stock_freq; -static struct cpufreq_driver p4clockmod_driver; -static unsigned int cpufreq_p4_get(unsigned int cpu); - -static int cpufreq_p4_setdc(unsigned int cpu, unsigned int newstate) -{ - u32 l, h; - - if (!cpu_online(cpu) || - (newstate > DC_DISABLE) || (newstate == DC_RESV)) - return -EINVAL; - - rdmsr_on_cpu(cpu, MSR_IA32_THERM_STATUS, &l, &h); - - if (l & 0x01) - dprintk("CPU#%d currently thermal throttled\n", cpu); - - if (has_N44_O17_errata[cpu] && - (newstate == DC_25PT || newstate == DC_DFLT)) - newstate = DC_38PT; - - rdmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, &l, &h); - if (newstate == DC_DISABLE) { - dprintk("CPU#%d disabling modulation\n", cpu); - wrmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, l & ~(1<<4), h); - } else { - dprintk("CPU#%d setting duty cycle to %d%%\n", - cpu, ((125 * newstate) / 10)); - /* bits 63 - 5 : reserved - * bit 4 : enable/disable - * bits 3-1 : duty cycle - * bit 0 : reserved - */ - l = (l & ~14); - l = l | (1<<4) | ((newstate & 0x7)<<1); - wrmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, l, h); - } - - return 0; -} - - -static struct cpufreq_frequency_table p4clockmod_table[] = { - {DC_RESV, CPUFREQ_ENTRY_INVALID}, - {DC_DFLT, 0}, - {DC_25PT, 0}, - {DC_38PT, 0}, - {DC_50PT, 0}, - {DC_64PT, 0}, - {DC_75PT, 0}, - {DC_88PT, 0}, - {DC_DISABLE, 0}, - {DC_RESV, CPUFREQ_TABLE_END}, -}; - - -static int cpufreq_p4_target(struct cpufreq_policy *policy, - unsigned int target_freq, - unsigned int relation) -{ - unsigned int newstate = DC_RESV; - struct cpufreq_freqs freqs; - int i; - - if (cpufreq_frequency_table_target(policy, &p4clockmod_table[0], - target_freq, relation, &newstate)) - return -EINVAL; - - freqs.old = cpufreq_p4_get(policy->cpu); - freqs.new = stock_freq * p4clockmod_table[newstate].index / 8; - - if (freqs.new == freqs.old) - return 0; - - /* notifiers */ - for_each_cpu(i, policy->cpus) { - freqs.cpu = i; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - } - - /* run on each logical CPU, - * see section 13.15.3 of IA32 Intel Architecture Software - * Developer's Manual, Volume 3 - */ - for_each_cpu(i, policy->cpus) - cpufreq_p4_setdc(i, p4clockmod_table[newstate].index); - - /* notifiers */ - for_each_cpu(i, policy->cpus) { - freqs.cpu = i; - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - } - - return 0; -} - - -static int cpufreq_p4_verify(struct cpufreq_policy *policy) -{ - return cpufreq_frequency_table_verify(policy, &p4clockmod_table[0]); -} - - -static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c) -{ - if (c->x86 == 0x06) { - if (cpu_has(c, X86_FEATURE_EST)) - printk(KERN_WARNING PFX "Warning: EST-capable CPU " - "detected. The acpi-cpufreq module offers " - "voltage scaling in addition of frequency " - "scaling. You should use that instead of " - "p4-clockmod, if possible.\n"); - switch (c->x86_model) { - case 0x0E: /* Core */ - case 0x0F: /* Core Duo */ - case 0x16: /* Celeron Core */ - case 0x1C: /* Atom */ - p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS; - return speedstep_get_frequency(SPEEDSTEP_CPU_PCORE); - case 0x0D: /* Pentium M (Dothan) */ - p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS; - /* fall through */ - case 0x09: /* Pentium M (Banias) */ - return speedstep_get_frequency(SPEEDSTEP_CPU_PM); - } - } - - if (c->x86 != 0xF) - return 0; - - /* on P-4s, the TSC runs with constant frequency independent whether - * throttling is active or not. */ - p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS; - - if (speedstep_detect_processor() == SPEEDSTEP_CPU_P4M) { - printk(KERN_WARNING PFX "Warning: Pentium 4-M detected. " - "The speedstep-ich or acpi cpufreq modules offer " - "voltage scaling in addition of frequency scaling. " - "You should use either one instead of p4-clockmod, " - "if possible.\n"); - return speedstep_get_frequency(SPEEDSTEP_CPU_P4M); - } - - return speedstep_get_frequency(SPEEDSTEP_CPU_P4D); -} - - - -static int cpufreq_p4_cpu_init(struct cpufreq_policy *policy) -{ - struct cpuinfo_x86 *c = &cpu_data(policy->cpu); - int cpuid = 0; - unsigned int i; - -#ifdef CONFIG_SMP - cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu)); -#endif - - /* Errata workaround */ - cpuid = (c->x86 << 8) | (c->x86_model << 4) | c->x86_mask; - switch (cpuid) { - case 0x0f07: - case 0x0f0a: - case 0x0f11: - case 0x0f12: - has_N44_O17_errata[policy->cpu] = 1; - dprintk("has errata -- disabling low frequencies\n"); - } - - if (speedstep_detect_processor() == SPEEDSTEP_CPU_P4D && - c->x86_model < 2) { - /* switch to maximum frequency and measure result */ - cpufreq_p4_setdc(policy->cpu, DC_DISABLE); - recalibrate_cpu_khz(); - } - /* get max frequency */ - stock_freq = cpufreq_p4_get_frequency(c); - if (!stock_freq) - return -EINVAL; - - /* table init */ - for (i = 1; (p4clockmod_table[i].frequency != CPUFREQ_TABLE_END); i++) { - if ((i < 2) && (has_N44_O17_errata[policy->cpu])) - p4clockmod_table[i].frequency = CPUFREQ_ENTRY_INVALID; - else - p4clockmod_table[i].frequency = (stock_freq * i)/8; - } - cpufreq_frequency_table_get_attr(p4clockmod_table, policy->cpu); - - /* cpuinfo and default policy values */ - - /* the transition latency is set to be 1 higher than the maximum - * transition latency of the ondemand governor */ - policy->cpuinfo.transition_latency = 10000001; - policy->cur = stock_freq; - - return cpufreq_frequency_table_cpuinfo(policy, &p4clockmod_table[0]); -} - - -static int cpufreq_p4_cpu_exit(struct cpufreq_policy *policy) -{ - cpufreq_frequency_table_put_attr(policy->cpu); - return 0; -} - -static unsigned int cpufreq_p4_get(unsigned int cpu) -{ - u32 l, h; - - rdmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, &l, &h); - - if (l & 0x10) { - l = l >> 1; - l &= 0x7; - } else - l = DC_DISABLE; - - if (l != DC_DISABLE) - return stock_freq * l / 8; - - return stock_freq; -} - -static struct freq_attr *p4clockmod_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - NULL, -}; - -static struct cpufreq_driver p4clockmod_driver = { - .verify = cpufreq_p4_verify, - .target = cpufreq_p4_target, - .init = cpufreq_p4_cpu_init, - .exit = cpufreq_p4_cpu_exit, - .get = cpufreq_p4_get, - .name = "p4-clockmod", - .owner = THIS_MODULE, - .attr = p4clockmod_attr, -}; - - -static int __init cpufreq_p4_init(void) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - int ret; - - /* - * THERM_CONTROL is architectural for IA32 now, so - * we can rely on the capability checks - */ - if (c->x86_vendor != X86_VENDOR_INTEL) - return -ENODEV; - - if (!test_cpu_cap(c, X86_FEATURE_ACPI) || - !test_cpu_cap(c, X86_FEATURE_ACC)) - return -ENODEV; - - ret = cpufreq_register_driver(&p4clockmod_driver); - if (!ret) - printk(KERN_INFO PFX "P4/Xeon(TM) CPU On-Demand Clock " - "Modulation available\n"); - - return ret; -} - - -static void __exit cpufreq_p4_exit(void) -{ - cpufreq_unregister_driver(&p4clockmod_driver); -} - - -MODULE_AUTHOR("Zwane Mwaikambo <zwane@commfireservices.com>"); -MODULE_DESCRIPTION("cpufreq driver for Pentium(TM) 4/Xeon(TM)"); -MODULE_LICENSE("GPL"); - -late_initcall(cpufreq_p4_init); -module_exit(cpufreq_p4_exit); diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c deleted file mode 100644 index 4f6f679f279..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c +++ /dev/null @@ -1,626 +0,0 @@ -/* - * pcc-cpufreq.c - Processor Clocking Control firmware cpufreq interface - * - * Copyright (C) 2009 Red Hat, Matthew Garrett <mjg@redhat.com> - * Copyright (C) 2009 Hewlett-Packard Development Company, L.P. - * Nagananda Chumbalkar <nagananda.chumbalkar@hp.com> - * - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; version 2 of the License. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or NON - * INFRINGEMENT. See the GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write to the Free Software Foundation, Inc., - * 675 Mass Ave, Cambridge, MA 02139, USA. - * - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/smp.h> -#include <linux/sched.h> -#include <linux/cpufreq.h> -#include <linux/compiler.h> -#include <linux/slab.h> - -#include <linux/acpi.h> -#include <linux/io.h> -#include <linux/spinlock.h> -#include <linux/uaccess.h> - -#include <acpi/processor.h> - -#define PCC_VERSION "1.00.00" -#define POLL_LOOPS 300 - -#define CMD_COMPLETE 0x1 -#define CMD_GET_FREQ 0x0 -#define CMD_SET_FREQ 0x1 - -#define BUF_SZ 4 - -#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ - "pcc-cpufreq", msg) - -struct pcc_register_resource { - u8 descriptor; - u16 length; - u8 space_id; - u8 bit_width; - u8 bit_offset; - u8 access_size; - u64 address; -} __attribute__ ((packed)); - -struct pcc_memory_resource { - u8 descriptor; - u16 length; - u8 space_id; - u8 resource_usage; - u8 type_specific; - u64 granularity; - u64 minimum; - u64 maximum; - u64 translation_offset; - u64 address_length; -} __attribute__ ((packed)); - -static struct cpufreq_driver pcc_cpufreq_driver; - -struct pcc_header { - u32 signature; - u16 length; - u8 major; - u8 minor; - u32 features; - u16 command; - u16 status; - u32 latency; - u32 minimum_time; - u32 maximum_time; - u32 nominal; - u32 throttled_frequency; - u32 minimum_frequency; -}; - -static void __iomem *pcch_virt_addr; -static struct pcc_header __iomem *pcch_hdr; - -static DEFINE_SPINLOCK(pcc_lock); - -static struct acpi_generic_address doorbell; - -static u64 doorbell_preserve; -static u64 doorbell_write; - -static u8 OSC_UUID[16] = {0x63, 0x9B, 0x2C, 0x9F, 0x70, 0x91, 0x49, 0x1f, - 0xBB, 0x4F, 0xA5, 0x98, 0x2F, 0xA1, 0xB5, 0x46}; - -struct pcc_cpu { - u32 input_offset; - u32 output_offset; -}; - -static struct pcc_cpu __percpu *pcc_cpu_info; - -static int pcc_cpufreq_verify(struct cpufreq_policy *policy) -{ - cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq, - policy->cpuinfo.max_freq); - return 0; -} - -static inline void pcc_cmd(void) -{ - u64 doorbell_value; - int i; - - acpi_read(&doorbell_value, &doorbell); - acpi_write((doorbell_value & doorbell_preserve) | doorbell_write, - &doorbell); - - for (i = 0; i < POLL_LOOPS; i++) { - if (ioread16(&pcch_hdr->status) & CMD_COMPLETE) - break; - } -} - -static inline void pcc_clear_mapping(void) -{ - if (pcch_virt_addr) - iounmap(pcch_virt_addr); - pcch_virt_addr = NULL; -} - -static unsigned int pcc_get_freq(unsigned int cpu) -{ - struct pcc_cpu *pcc_cpu_data; - unsigned int curr_freq; - unsigned int freq_limit; - u16 status; - u32 input_buffer; - u32 output_buffer; - - spin_lock(&pcc_lock); - - dprintk("get: get_freq for CPU %d\n", cpu); - pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu); - - input_buffer = 0x1; - iowrite32(input_buffer, - (pcch_virt_addr + pcc_cpu_data->input_offset)); - iowrite16(CMD_GET_FREQ, &pcch_hdr->command); - - pcc_cmd(); - - output_buffer = - ioread32(pcch_virt_addr + pcc_cpu_data->output_offset); - - /* Clear the input buffer - we are done with the current command */ - memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ); - - status = ioread16(&pcch_hdr->status); - if (status != CMD_COMPLETE) { - dprintk("get: FAILED: for CPU %d, status is %d\n", - cpu, status); - goto cmd_incomplete; - } - iowrite16(0, &pcch_hdr->status); - curr_freq = (((ioread32(&pcch_hdr->nominal) * (output_buffer & 0xff)) - / 100) * 1000); - - dprintk("get: SUCCESS: (virtual) output_offset for cpu %d is " - "0x%x, contains a value of: 0x%x. Speed is: %d MHz\n", - cpu, (pcch_virt_addr + pcc_cpu_data->output_offset), - output_buffer, curr_freq); - - freq_limit = (output_buffer >> 8) & 0xff; - if (freq_limit != 0xff) { - dprintk("get: frequency for cpu %d is being temporarily" - " capped at %d\n", cpu, curr_freq); - } - - spin_unlock(&pcc_lock); - return curr_freq; - -cmd_incomplete: - iowrite16(0, &pcch_hdr->status); - spin_unlock(&pcc_lock); - return -EINVAL; -} - -static int pcc_cpufreq_target(struct cpufreq_policy *policy, - unsigned int target_freq, - unsigned int relation) -{ - struct pcc_cpu *pcc_cpu_data; - struct cpufreq_freqs freqs; - u16 status; - u32 input_buffer; - int cpu; - - spin_lock(&pcc_lock); - cpu = policy->cpu; - pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu); - - dprintk("target: CPU %d should go to target freq: %d " - "(virtual) input_offset is 0x%x\n", - cpu, target_freq, - (pcch_virt_addr + pcc_cpu_data->input_offset)); - - freqs.new = target_freq; - freqs.cpu = cpu; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - - input_buffer = 0x1 | (((target_freq * 100) - / (ioread32(&pcch_hdr->nominal) * 1000)) << 8); - iowrite32(input_buffer, - (pcch_virt_addr + pcc_cpu_data->input_offset)); - iowrite16(CMD_SET_FREQ, &pcch_hdr->command); - - pcc_cmd(); - - /* Clear the input buffer - we are done with the current command */ - memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ); - - status = ioread16(&pcch_hdr->status); - if (status != CMD_COMPLETE) { - dprintk("target: FAILED for cpu %d, with status: 0x%x\n", - cpu, status); - goto cmd_incomplete; - } - iowrite16(0, &pcch_hdr->status); - - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - dprintk("target: was SUCCESSFUL for cpu %d\n", cpu); - spin_unlock(&pcc_lock); - - return 0; - -cmd_incomplete: - iowrite16(0, &pcch_hdr->status); - spin_unlock(&pcc_lock); - return -EINVAL; -} - -static int pcc_get_offset(int cpu) -{ - acpi_status status; - struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL}; - union acpi_object *pccp, *offset; - struct pcc_cpu *pcc_cpu_data; - struct acpi_processor *pr; - int ret = 0; - - pr = per_cpu(processors, cpu); - pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu); - - status = acpi_evaluate_object(pr->handle, "PCCP", NULL, &buffer); - if (ACPI_FAILURE(status)) - return -ENODEV; - - pccp = buffer.pointer; - if (!pccp || pccp->type != ACPI_TYPE_PACKAGE) { - ret = -ENODEV; - goto out_free; - }; - - offset = &(pccp->package.elements[0]); - if (!offset || offset->type != ACPI_TYPE_INTEGER) { - ret = -ENODEV; - goto out_free; - } - - pcc_cpu_data->input_offset = offset->integer.value; - - offset = &(pccp->package.elements[1]); - if (!offset || offset->type != ACPI_TYPE_INTEGER) { - ret = -ENODEV; - goto out_free; - } - - pcc_cpu_data->output_offset = offset->integer.value; - - memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ); - memset_io((pcch_virt_addr + pcc_cpu_data->output_offset), 0, BUF_SZ); - - dprintk("pcc_get_offset: for CPU %d: pcc_cpu_data " - "input_offset: 0x%x, pcc_cpu_data output_offset: 0x%x\n", - cpu, pcc_cpu_data->input_offset, pcc_cpu_data->output_offset); -out_free: - kfree(buffer.pointer); - return ret; -} - -static int __init pcc_cpufreq_do_osc(acpi_handle *handle) -{ - acpi_status status; - struct acpi_object_list input; - struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL}; - union acpi_object in_params[4]; - union acpi_object *out_obj; - u32 capabilities[2]; - u32 errors; - u32 supported; - int ret = 0; - - input.count = 4; - input.pointer = in_params; - input.count = 4; - input.pointer = in_params; - in_params[0].type = ACPI_TYPE_BUFFER; - in_params[0].buffer.length = 16; - in_params[0].buffer.pointer = OSC_UUID; - in_params[1].type = ACPI_TYPE_INTEGER; - in_params[1].integer.value = 1; - in_params[2].type = ACPI_TYPE_INTEGER; - in_params[2].integer.value = 2; - in_params[3].type = ACPI_TYPE_BUFFER; - in_params[3].buffer.length = 8; - in_params[3].buffer.pointer = (u8 *)&capabilities; - - capabilities[0] = OSC_QUERY_ENABLE; - capabilities[1] = 0x1; - - status = acpi_evaluate_object(*handle, "_OSC", &input, &output); - if (ACPI_FAILURE(status)) - return -ENODEV; - - if (!output.length) - return -ENODEV; - - out_obj = output.pointer; - if (out_obj->type != ACPI_TYPE_BUFFER) { - ret = -ENODEV; - goto out_free; - } - - errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0); - if (errors) { - ret = -ENODEV; - goto out_free; - } - - supported = *((u32 *)(out_obj->buffer.pointer + 4)); - if (!(supported & 0x1)) { - ret = -ENODEV; - goto out_free; - } - - kfree(output.pointer); - capabilities[0] = 0x0; - capabilities[1] = 0x1; - - status = acpi_evaluate_object(*handle, "_OSC", &input, &output); - if (ACPI_FAILURE(status)) - return -ENODEV; - - if (!output.length) - return -ENODEV; - - out_obj = output.pointer; - if (out_obj->type != ACPI_TYPE_BUFFER) { - ret = -ENODEV; - goto out_free; - } - - errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0); - if (errors) { - ret = -ENODEV; - goto out_free; - } - - supported = *((u32 *)(out_obj->buffer.pointer + 4)); - if (!(supported & 0x1)) { - ret = -ENODEV; - goto out_free; - } - -out_free: - kfree(output.pointer); - return ret; -} - -static int __init pcc_cpufreq_probe(void) -{ - acpi_status status; - struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL}; - struct pcc_memory_resource *mem_resource; - struct pcc_register_resource *reg_resource; - union acpi_object *out_obj, *member; - acpi_handle handle, osc_handle, pcch_handle; - int ret = 0; - - status = acpi_get_handle(NULL, "\\_SB", &handle); - if (ACPI_FAILURE(status)) - return -ENODEV; - - status = acpi_get_handle(handle, "PCCH", &pcch_handle); - if (ACPI_FAILURE(status)) - return -ENODEV; - - status = acpi_get_handle(handle, "_OSC", &osc_handle); - if (ACPI_SUCCESS(status)) { - ret = pcc_cpufreq_do_osc(&osc_handle); - if (ret) - dprintk("probe: _OSC evaluation did not succeed\n"); - /* Firmware's use of _OSC is optional */ - ret = 0; - } - - status = acpi_evaluate_object(handle, "PCCH", NULL, &output); - if (ACPI_FAILURE(status)) - return -ENODEV; - - out_obj = output.pointer; - if (out_obj->type != ACPI_TYPE_PACKAGE) { - ret = -ENODEV; - goto out_free; - } - - member = &out_obj->package.elements[0]; - if (member->type != ACPI_TYPE_BUFFER) { - ret = -ENODEV; - goto out_free; - } - - mem_resource = (struct pcc_memory_resource *)member->buffer.pointer; - - dprintk("probe: mem_resource descriptor: 0x%x," - " length: %d, space_id: %d, resource_usage: %d," - " type_specific: %d, granularity: 0x%llx," - " minimum: 0x%llx, maximum: 0x%llx," - " translation_offset: 0x%llx, address_length: 0x%llx\n", - mem_resource->descriptor, mem_resource->length, - mem_resource->space_id, mem_resource->resource_usage, - mem_resource->type_specific, mem_resource->granularity, - mem_resource->minimum, mem_resource->maximum, - mem_resource->translation_offset, - mem_resource->address_length); - - if (mem_resource->space_id != ACPI_ADR_SPACE_SYSTEM_MEMORY) { - ret = -ENODEV; - goto out_free; - } - - pcch_virt_addr = ioremap_nocache(mem_resource->minimum, - mem_resource->address_length); - if (pcch_virt_addr == NULL) { - dprintk("probe: could not map shared mem region\n"); - goto out_free; - } - pcch_hdr = pcch_virt_addr; - - dprintk("probe: PCCH header (virtual) addr: 0x%p\n", pcch_hdr); - dprintk("probe: PCCH header is at physical address: 0x%llx," - " signature: 0x%x, length: %d bytes, major: %d, minor: %d," - " supported features: 0x%x, command field: 0x%x," - " status field: 0x%x, nominal latency: %d us\n", - mem_resource->minimum, ioread32(&pcch_hdr->signature), - ioread16(&pcch_hdr->length), ioread8(&pcch_hdr->major), - ioread8(&pcch_hdr->minor), ioread32(&pcch_hdr->features), - ioread16(&pcch_hdr->command), ioread16(&pcch_hdr->status), - ioread32(&pcch_hdr->latency)); - - dprintk("probe: min time between commands: %d us," - " max time between commands: %d us," - " nominal CPU frequency: %d MHz," - " minimum CPU frequency: %d MHz," - " minimum CPU frequency without throttling: %d MHz\n", - ioread32(&pcch_hdr->minimum_time), - ioread32(&pcch_hdr->maximum_time), - ioread32(&pcch_hdr->nominal), - ioread32(&pcch_hdr->throttled_frequency), - ioread32(&pcch_hdr->minimum_frequency)); - - member = &out_obj->package.elements[1]; - if (member->type != ACPI_TYPE_BUFFER) { - ret = -ENODEV; - goto pcch_free; - } - - reg_resource = (struct pcc_register_resource *)member->buffer.pointer; - - doorbell.space_id = reg_resource->space_id; - doorbell.bit_width = reg_resource->bit_width; - doorbell.bit_offset = reg_resource->bit_offset; - doorbell.access_width = 64; - doorbell.address = reg_resource->address; - - dprintk("probe: doorbell: space_id is %d, bit_width is %d, " - "bit_offset is %d, access_width is %d, address is 0x%llx\n", - doorbell.space_id, doorbell.bit_width, doorbell.bit_offset, - doorbell.access_width, reg_resource->address); - - member = &out_obj->package.elements[2]; - if (member->type != ACPI_TYPE_INTEGER) { - ret = -ENODEV; - goto pcch_free; - } - - doorbell_preserve = member->integer.value; - - member = &out_obj->package.elements[3]; - if (member->type != ACPI_TYPE_INTEGER) { - ret = -ENODEV; - goto pcch_free; - } - - doorbell_write = member->integer.value; - - dprintk("probe: doorbell_preserve: 0x%llx," - " doorbell_write: 0x%llx\n", - doorbell_preserve, doorbell_write); - - pcc_cpu_info = alloc_percpu(struct pcc_cpu); - if (!pcc_cpu_info) { - ret = -ENOMEM; - goto pcch_free; - } - - printk(KERN_DEBUG "pcc-cpufreq: (v%s) driver loaded with frequency" - " limits: %d MHz, %d MHz\n", PCC_VERSION, - ioread32(&pcch_hdr->minimum_frequency), - ioread32(&pcch_hdr->nominal)); - kfree(output.pointer); - return ret; -pcch_free: - pcc_clear_mapping(); -out_free: - kfree(output.pointer); - return ret; -} - -static int pcc_cpufreq_cpu_init(struct cpufreq_policy *policy) -{ - unsigned int cpu = policy->cpu; - unsigned int result = 0; - - if (!pcch_virt_addr) { - result = -1; - goto out; - } - - result = pcc_get_offset(cpu); - if (result) { - dprintk("init: PCCP evaluation failed\n"); - goto out; - } - - policy->max = policy->cpuinfo.max_freq = - ioread32(&pcch_hdr->nominal) * 1000; - policy->min = policy->cpuinfo.min_freq = - ioread32(&pcch_hdr->minimum_frequency) * 1000; - policy->cur = pcc_get_freq(cpu); - - if (!policy->cur) { - dprintk("init: Unable to get current CPU frequency\n"); - result = -EINVAL; - goto out; - } - - dprintk("init: policy->max is %d, policy->min is %d\n", - policy->max, policy->min); -out: - return result; -} - -static int pcc_cpufreq_cpu_exit(struct cpufreq_policy *policy) -{ - return 0; -} - -static struct cpufreq_driver pcc_cpufreq_driver = { - .flags = CPUFREQ_CONST_LOOPS, - .get = pcc_get_freq, - .verify = pcc_cpufreq_verify, - .target = pcc_cpufreq_target, - .init = pcc_cpufreq_cpu_init, - .exit = pcc_cpufreq_cpu_exit, - .name = "pcc-cpufreq", - .owner = THIS_MODULE, -}; - -static int __init pcc_cpufreq_init(void) -{ - int ret; - - if (acpi_disabled) - return 0; - - ret = pcc_cpufreq_probe(); - if (ret) { - dprintk("pcc_cpufreq_init: PCCH evaluation failed\n"); - return ret; - } - - ret = cpufreq_register_driver(&pcc_cpufreq_driver); - - return ret; -} - -static void __exit pcc_cpufreq_exit(void) -{ - cpufreq_unregister_driver(&pcc_cpufreq_driver); - - pcc_clear_mapping(); - - free_percpu(pcc_cpu_info); -} - -MODULE_AUTHOR("Matthew Garrett, Naga Chumbalkar"); -MODULE_VERSION(PCC_VERSION); -MODULE_DESCRIPTION("Processor Clocking Control interface driver"); -MODULE_LICENSE("GPL"); - -late_initcall(pcc_cpufreq_init); -module_exit(pcc_cpufreq_exit); diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c deleted file mode 100644 index b3379d6a5c5..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c +++ /dev/null @@ -1,261 +0,0 @@ -/* - * This file was based upon code in Powertweak Linux (http://powertweak.sf.net) - * (C) 2000-2003 Dave Jones, Arjan van de Ven, Janne Pänkälä, - * Dominik Brodowski. - * - * Licensed under the terms of the GNU GPL License version 2. - * - * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous* - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/cpufreq.h> -#include <linux/ioport.h> -#include <linux/timex.h> -#include <linux/io.h> - -#include <asm/msr.h> - -#define POWERNOW_IOPORT 0xfff0 /* it doesn't matter where, as long - as it is unused */ - -#define PFX "powernow-k6: " -static unsigned int busfreq; /* FSB, in 10 kHz */ -static unsigned int max_multiplier; - - -/* Clock ratio multiplied by 10 - see table 27 in AMD#23446 */ -static struct cpufreq_frequency_table clock_ratio[] = { - {45, /* 000 -> 4.5x */ 0}, - {50, /* 001 -> 5.0x */ 0}, - {40, /* 010 -> 4.0x */ 0}, - {55, /* 011 -> 5.5x */ 0}, - {20, /* 100 -> 2.0x */ 0}, - {30, /* 101 -> 3.0x */ 0}, - {60, /* 110 -> 6.0x */ 0}, - {35, /* 111 -> 3.5x */ 0}, - {0, CPUFREQ_TABLE_END} -}; - - -/** - * powernow_k6_get_cpu_multiplier - returns the current FSB multiplier - * - * Returns the current setting of the frequency multiplier. Core clock - * speed is frequency of the Front-Side Bus multiplied with this value. - */ -static int powernow_k6_get_cpu_multiplier(void) -{ - u64 invalue = 0; - u32 msrval; - - msrval = POWERNOW_IOPORT + 0x1; - wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */ - invalue = inl(POWERNOW_IOPORT + 0x8); - msrval = POWERNOW_IOPORT + 0x0; - wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */ - - return clock_ratio[(invalue >> 5)&7].index; -} - - -/** - * powernow_k6_set_state - set the PowerNow! multiplier - * @best_i: clock_ratio[best_i] is the target multiplier - * - * Tries to change the PowerNow! multiplier - */ -static void powernow_k6_set_state(unsigned int best_i) -{ - unsigned long outvalue = 0, invalue = 0; - unsigned long msrval; - struct cpufreq_freqs freqs; - - if (clock_ratio[best_i].index > max_multiplier) { - printk(KERN_ERR PFX "invalid target frequency\n"); - return; - } - - freqs.old = busfreq * powernow_k6_get_cpu_multiplier(); - freqs.new = busfreq * clock_ratio[best_i].index; - freqs.cpu = 0; /* powernow-k6.c is UP only driver */ - - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - - /* we now need to transform best_i to the BVC format, see AMD#23446 */ - - outvalue = (1<<12) | (1<<10) | (1<<9) | (best_i<<5); - - msrval = POWERNOW_IOPORT + 0x1; - wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */ - invalue = inl(POWERNOW_IOPORT + 0x8); - invalue = invalue & 0xf; - outvalue = outvalue | invalue; - outl(outvalue , (POWERNOW_IOPORT + 0x8)); - msrval = POWERNOW_IOPORT + 0x0; - wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */ - - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - - return; -} - - -/** - * powernow_k6_verify - verifies a new CPUfreq policy - * @policy: new policy - * - * Policy must be within lowest and highest possible CPU Frequency, - * and at least one possible state must be within min and max. - */ -static int powernow_k6_verify(struct cpufreq_policy *policy) -{ - return cpufreq_frequency_table_verify(policy, &clock_ratio[0]); -} - - -/** - * powernow_k6_setpolicy - sets a new CPUFreq policy - * @policy: new policy - * @target_freq: the target frequency - * @relation: how that frequency relates to achieved frequency - * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H) - * - * sets a new CPUFreq policy - */ -static int powernow_k6_target(struct cpufreq_policy *policy, - unsigned int target_freq, - unsigned int relation) -{ - unsigned int newstate = 0; - - if (cpufreq_frequency_table_target(policy, &clock_ratio[0], - target_freq, relation, &newstate)) - return -EINVAL; - - powernow_k6_set_state(newstate); - - return 0; -} - - -static int powernow_k6_cpu_init(struct cpufreq_policy *policy) -{ - unsigned int i, f; - int result; - - if (policy->cpu != 0) - return -ENODEV; - - /* get frequencies */ - max_multiplier = powernow_k6_get_cpu_multiplier(); - busfreq = cpu_khz / max_multiplier; - - /* table init */ - for (i = 0; (clock_ratio[i].frequency != CPUFREQ_TABLE_END); i++) { - f = clock_ratio[i].index; - if (f > max_multiplier) - clock_ratio[i].frequency = CPUFREQ_ENTRY_INVALID; - else - clock_ratio[i].frequency = busfreq * f; - } - - /* cpuinfo and default policy values */ - policy->cpuinfo.transition_latency = 200000; - policy->cur = busfreq * max_multiplier; - - result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio); - if (result) - return result; - - cpufreq_frequency_table_get_attr(clock_ratio, policy->cpu); - - return 0; -} - - -static int powernow_k6_cpu_exit(struct cpufreq_policy *policy) -{ - unsigned int i; - for (i = 0; i < 8; i++) { - if (i == max_multiplier) - powernow_k6_set_state(i); - } - cpufreq_frequency_table_put_attr(policy->cpu); - return 0; -} - -static unsigned int powernow_k6_get(unsigned int cpu) -{ - unsigned int ret; - ret = (busfreq * powernow_k6_get_cpu_multiplier()); - return ret; -} - -static struct freq_attr *powernow_k6_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - NULL, -}; - -static struct cpufreq_driver powernow_k6_driver = { - .verify = powernow_k6_verify, - .target = powernow_k6_target, - .init = powernow_k6_cpu_init, - .exit = powernow_k6_cpu_exit, - .get = powernow_k6_get, - .name = "powernow-k6", - .owner = THIS_MODULE, - .attr = powernow_k6_attr, -}; - - -/** - * powernow_k6_init - initializes the k6 PowerNow! CPUFreq driver - * - * Initializes the K6 PowerNow! support. Returns -ENODEV on unsupported - * devices, -EINVAL or -ENOMEM on problems during initiatization, and zero - * on success. - */ -static int __init powernow_k6_init(void) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - - if ((c->x86_vendor != X86_VENDOR_AMD) || (c->x86 != 5) || - ((c->x86_model != 12) && (c->x86_model != 13))) - return -ENODEV; - - if (!request_region(POWERNOW_IOPORT, 16, "PowerNow!")) { - printk(KERN_INFO PFX "PowerNow IOPORT region already used.\n"); - return -EIO; - } - - if (cpufreq_register_driver(&powernow_k6_driver)) { - release_region(POWERNOW_IOPORT, 16); - return -EINVAL; - } - - return 0; -} - - -/** - * powernow_k6_exit - unregisters AMD K6-2+/3+ PowerNow! support - * - * Unregisters AMD K6-2+ / K6-3+ PowerNow! support. - */ -static void __exit powernow_k6_exit(void) -{ - cpufreq_unregister_driver(&powernow_k6_driver); - release_region(POWERNOW_IOPORT, 16); -} - - -MODULE_AUTHOR("Arjan van de Ven, Dave Jones <davej@redhat.com>, " - "Dominik Brodowski <linux@brodo.de>"); -MODULE_DESCRIPTION("PowerNow! driver for AMD K6-2+ / K6-3+ processors."); -MODULE_LICENSE("GPL"); - -module_init(powernow_k6_init); -module_exit(powernow_k6_exit); diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c deleted file mode 100644 index 4a45fd6e41b..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c +++ /dev/null @@ -1,752 +0,0 @@ -/* - * AMD K7 Powernow driver. - * (C) 2003 Dave Jones on behalf of SuSE Labs. - * (C) 2003-2004 Dave Jones <davej@redhat.com> - * - * Licensed under the terms of the GNU GPL License version 2. - * Based upon datasheets & sample CPUs kindly provided by AMD. - * - * Errata 5: - * CPU may fail to execute a FID/VID change in presence of interrupt. - * - We cli/sti on stepping A0 CPUs around the FID/VID transition. - * Errata 15: - * CPU with half frequency multipliers may hang upon wakeup from disconnect. - * - We disable half multipliers if ACPI is used on A0 stepping CPUs. - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/moduleparam.h> -#include <linux/init.h> -#include <linux/cpufreq.h> -#include <linux/slab.h> -#include <linux/string.h> -#include <linux/dmi.h> -#include <linux/timex.h> -#include <linux/io.h> - -#include <asm/timer.h> /* Needed for recalibrate_cpu_khz() */ -#include <asm/msr.h> -#include <asm/system.h> - -#ifdef CONFIG_X86_POWERNOW_K7_ACPI -#include <linux/acpi.h> -#include <acpi/processor.h> -#endif - -#include "powernow-k7.h" - -#define PFX "powernow: " - - -struct psb_s { - u8 signature[10]; - u8 tableversion; - u8 flags; - u16 settlingtime; - u8 reserved1; - u8 numpst; -}; - -struct pst_s { - u32 cpuid; - u8 fsbspeed; - u8 maxfid; - u8 startvid; - u8 numpstates; -}; - -#ifdef CONFIG_X86_POWERNOW_K7_ACPI -union powernow_acpi_control_t { - struct { - unsigned long fid:5, - vid:5, - sgtc:20, - res1:2; - } bits; - unsigned long val; -}; -#endif - -#ifdef CONFIG_CPU_FREQ_DEBUG -/* divide by 1000 to get VCore voltage in V. */ -static const int mobile_vid_table[32] = { - 2000, 1950, 1900, 1850, 1800, 1750, 1700, 1650, - 1600, 1550, 1500, 1450, 1400, 1350, 1300, 0, - 1275, 1250, 1225, 1200, 1175, 1150, 1125, 1100, - 1075, 1050, 1025, 1000, 975, 950, 925, 0, -}; -#endif - -/* divide by 10 to get FID. */ -static const int fid_codes[32] = { - 110, 115, 120, 125, 50, 55, 60, 65, - 70, 75, 80, 85, 90, 95, 100, 105, - 30, 190, 40, 200, 130, 135, 140, 210, - 150, 225, 160, 165, 170, 180, -1, -1, -}; - -/* This parameter is used in order to force ACPI instead of legacy method for - * configuration purpose. - */ - -static int acpi_force; - -static struct cpufreq_frequency_table *powernow_table; - -static unsigned int can_scale_bus; -static unsigned int can_scale_vid; -static unsigned int minimum_speed = -1; -static unsigned int maximum_speed; -static unsigned int number_scales; -static unsigned int fsb; -static unsigned int latency; -static char have_a0; - -#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ - "powernow-k7", msg) - -static int check_fsb(unsigned int fsbspeed) -{ - int delta; - unsigned int f = fsb / 1000; - - delta = (fsbspeed > f) ? fsbspeed - f : f - fsbspeed; - return delta < 5; -} - -static int check_powernow(void) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - unsigned int maxei, eax, ebx, ecx, edx; - - if ((c->x86_vendor != X86_VENDOR_AMD) || (c->x86 != 6)) { -#ifdef MODULE - printk(KERN_INFO PFX "This module only works with " - "AMD K7 CPUs\n"); -#endif - return 0; - } - - /* Get maximum capabilities */ - maxei = cpuid_eax(0x80000000); - if (maxei < 0x80000007) { /* Any powernow info ? */ -#ifdef MODULE - printk(KERN_INFO PFX "No powernow capabilities detected\n"); -#endif - return 0; - } - - if ((c->x86_model == 6) && (c->x86_mask == 0)) { - printk(KERN_INFO PFX "K7 660[A0] core detected, " - "enabling errata workarounds\n"); - have_a0 = 1; - } - - cpuid(0x80000007, &eax, &ebx, &ecx, &edx); - - /* Check we can actually do something before we say anything.*/ - if (!(edx & (1 << 1 | 1 << 2))) - return 0; - - printk(KERN_INFO PFX "PowerNOW! Technology present. Can scale: "); - - if (edx & 1 << 1) { - printk("frequency"); - can_scale_bus = 1; - } - - if ((edx & (1 << 1 | 1 << 2)) == 0x6) - printk(" and "); - - if (edx & 1 << 2) { - printk("voltage"); - can_scale_vid = 1; - } - - printk(".\n"); - return 1; -} - -#ifdef CONFIG_X86_POWERNOW_K7_ACPI -static void invalidate_entry(unsigned int entry) -{ - powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID; -} -#endif - -static int get_ranges(unsigned char *pst) -{ - unsigned int j; - unsigned int speed; - u8 fid, vid; - - powernow_table = kzalloc((sizeof(struct cpufreq_frequency_table) * - (number_scales + 1)), GFP_KERNEL); - if (!powernow_table) - return -ENOMEM; - - for (j = 0 ; j < number_scales; j++) { - fid = *pst++; - - powernow_table[j].frequency = (fsb * fid_codes[fid]) / 10; - powernow_table[j].index = fid; /* lower 8 bits */ - - speed = powernow_table[j].frequency; - - if ((fid_codes[fid] % 10) == 5) { -#ifdef CONFIG_X86_POWERNOW_K7_ACPI - if (have_a0 == 1) - invalidate_entry(j); -#endif - } - - if (speed < minimum_speed) - minimum_speed = speed; - if (speed > maximum_speed) - maximum_speed = speed; - - vid = *pst++; - powernow_table[j].index |= (vid << 8); /* upper 8 bits */ - - dprintk(" FID: 0x%x (%d.%dx [%dMHz]) " - "VID: 0x%x (%d.%03dV)\n", fid, fid_codes[fid] / 10, - fid_codes[fid] % 10, speed/1000, vid, - mobile_vid_table[vid]/1000, - mobile_vid_table[vid]%1000); - } - powernow_table[number_scales].frequency = CPUFREQ_TABLE_END; - powernow_table[number_scales].index = 0; - - return 0; -} - - -static void change_FID(int fid) -{ - union msr_fidvidctl fidvidctl; - - rdmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val); - if (fidvidctl.bits.FID != fid) { - fidvidctl.bits.SGTC = latency; - fidvidctl.bits.FID = fid; - fidvidctl.bits.VIDC = 0; - fidvidctl.bits.FIDC = 1; - wrmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val); - } -} - - -static void change_VID(int vid) -{ - union msr_fidvidctl fidvidctl; - - rdmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val); - if (fidvidctl.bits.VID != vid) { - fidvidctl.bits.SGTC = latency; - fidvidctl.bits.VID = vid; - fidvidctl.bits.FIDC = 0; - fidvidctl.bits.VIDC = 1; - wrmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val); - } -} - - -static void change_speed(unsigned int index) -{ - u8 fid, vid; - struct cpufreq_freqs freqs; - union msr_fidvidstatus fidvidstatus; - int cfid; - - /* fid are the lower 8 bits of the index we stored into - * the cpufreq frequency table in powernow_decode_bios, - * vid are the upper 8 bits. - */ - - fid = powernow_table[index].index & 0xFF; - vid = (powernow_table[index].index & 0xFF00) >> 8; - - freqs.cpu = 0; - - rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val); - cfid = fidvidstatus.bits.CFID; - freqs.old = fsb * fid_codes[cfid] / 10; - - freqs.new = powernow_table[index].frequency; - - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - - /* Now do the magic poking into the MSRs. */ - - if (have_a0 == 1) /* A0 errata 5 */ - local_irq_disable(); - - if (freqs.old > freqs.new) { - /* Going down, so change FID first */ - change_FID(fid); - change_VID(vid); - } else { - /* Going up, so change VID first */ - change_VID(vid); - change_FID(fid); - } - - - if (have_a0 == 1) - local_irq_enable(); - - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); -} - - -#ifdef CONFIG_X86_POWERNOW_K7_ACPI - -static struct acpi_processor_performance *acpi_processor_perf; - -static int powernow_acpi_init(void) -{ - int i; - int retval = 0; - union powernow_acpi_control_t pc; - - if (acpi_processor_perf != NULL && powernow_table != NULL) { - retval = -EINVAL; - goto err0; - } - - acpi_processor_perf = kzalloc(sizeof(struct acpi_processor_performance), - GFP_KERNEL); - if (!acpi_processor_perf) { - retval = -ENOMEM; - goto err0; - } - - if (!zalloc_cpumask_var(&acpi_processor_perf->shared_cpu_map, - GFP_KERNEL)) { - retval = -ENOMEM; - goto err05; - } - - if (acpi_processor_register_performance(acpi_processor_perf, 0)) { - retval = -EIO; - goto err1; - } - - if (acpi_processor_perf->control_register.space_id != - ACPI_ADR_SPACE_FIXED_HARDWARE) { - retval = -ENODEV; - goto err2; - } - - if (acpi_processor_perf->status_register.space_id != - ACPI_ADR_SPACE_FIXED_HARDWARE) { - retval = -ENODEV; - goto err2; - } - - number_scales = acpi_processor_perf->state_count; - - if (number_scales < 2) { - retval = -ENODEV; - goto err2; - } - - powernow_table = kzalloc((sizeof(struct cpufreq_frequency_table) * - (number_scales + 1)), GFP_KERNEL); - if (!powernow_table) { - retval = -ENOMEM; - goto err2; - } - - pc.val = (unsigned long) acpi_processor_perf->states[0].control; - for (i = 0; i < number_scales; i++) { - u8 fid, vid; - struct acpi_processor_px *state = - &acpi_processor_perf->states[i]; - unsigned int speed, speed_mhz; - - pc.val = (unsigned long) state->control; - dprintk("acpi: P%d: %d MHz %d mW %d uS control %08x SGTC %d\n", - i, - (u32) state->core_frequency, - (u32) state->power, - (u32) state->transition_latency, - (u32) state->control, - pc.bits.sgtc); - - vid = pc.bits.vid; - fid = pc.bits.fid; - - powernow_table[i].frequency = fsb * fid_codes[fid] / 10; - powernow_table[i].index = fid; /* lower 8 bits */ - powernow_table[i].index |= (vid << 8); /* upper 8 bits */ - - speed = powernow_table[i].frequency; - speed_mhz = speed / 1000; - - /* processor_perflib will multiply the MHz value by 1000 to - * get a KHz value (e.g. 1266000). However, powernow-k7 works - * with true KHz values (e.g. 1266768). To ensure that all - * powernow frequencies are available, we must ensure that - * ACPI doesn't restrict them, so we round up the MHz value - * to ensure that perflib's computed KHz value is greater than - * or equal to powernow's KHz value. - */ - if (speed % 1000 > 0) - speed_mhz++; - - if ((fid_codes[fid] % 10) == 5) { - if (have_a0 == 1) - invalidate_entry(i); - } - - dprintk(" FID: 0x%x (%d.%dx [%dMHz]) " - "VID: 0x%x (%d.%03dV)\n", fid, fid_codes[fid] / 10, - fid_codes[fid] % 10, speed_mhz, vid, - mobile_vid_table[vid]/1000, - mobile_vid_table[vid]%1000); - - if (state->core_frequency != speed_mhz) { - state->core_frequency = speed_mhz; - dprintk(" Corrected ACPI frequency to %d\n", - speed_mhz); - } - - if (latency < pc.bits.sgtc) - latency = pc.bits.sgtc; - - if (speed < minimum_speed) - minimum_speed = speed; - if (speed > maximum_speed) - maximum_speed = speed; - } - - powernow_table[i].frequency = CPUFREQ_TABLE_END; - powernow_table[i].index = 0; - - /* notify BIOS that we exist */ - acpi_processor_notify_smm(THIS_MODULE); - - return 0; - -err2: - acpi_processor_unregister_performance(acpi_processor_perf, 0); -err1: - free_cpumask_var(acpi_processor_perf->shared_cpu_map); -err05: - kfree(acpi_processor_perf); -err0: - printk(KERN_WARNING PFX "ACPI perflib can not be used on " - "this platform\n"); - acpi_processor_perf = NULL; - return retval; -} -#else -static int powernow_acpi_init(void) -{ - printk(KERN_INFO PFX "no support for ACPI processor found." - " Please recompile your kernel with ACPI processor\n"); - return -EINVAL; -} -#endif - -static void print_pst_entry(struct pst_s *pst, unsigned int j) -{ - dprintk("PST:%d (@%p)\n", j, pst); - dprintk(" cpuid: 0x%x fsb: %d maxFID: 0x%x startvid: 0x%x\n", - pst->cpuid, pst->fsbspeed, pst->maxfid, pst->startvid); -} - -static int powernow_decode_bios(int maxfid, int startvid) -{ - struct psb_s *psb; - struct pst_s *pst; - unsigned int i, j; - unsigned char *p; - unsigned int etuple; - unsigned int ret; - - etuple = cpuid_eax(0x80000001); - - for (i = 0xC0000; i < 0xffff0 ; i += 16) { - - p = phys_to_virt(i); - - if (memcmp(p, "AMDK7PNOW!", 10) == 0) { - dprintk("Found PSB header at %p\n", p); - psb = (struct psb_s *) p; - dprintk("Table version: 0x%x\n", psb->tableversion); - if (psb->tableversion != 0x12) { - printk(KERN_INFO PFX "Sorry, only v1.2 tables" - " supported right now\n"); - return -ENODEV; - } - - dprintk("Flags: 0x%x\n", psb->flags); - if ((psb->flags & 1) == 0) - dprintk("Mobile voltage regulator\n"); - else - dprintk("Desktop voltage regulator\n"); - - latency = psb->settlingtime; - if (latency < 100) { - printk(KERN_INFO PFX "BIOS set settling time " - "to %d microseconds. " - "Should be at least 100. " - "Correcting.\n", latency); - latency = 100; - } - dprintk("Settling Time: %d microseconds.\n", - psb->settlingtime); - dprintk("Has %d PST tables. (Only dumping ones " - "relevant to this CPU).\n", - psb->numpst); - - p += sizeof(struct psb_s); - - pst = (struct pst_s *) p; - - for (j = 0; j < psb->numpst; j++) { - pst = (struct pst_s *) p; - number_scales = pst->numpstates; - - if ((etuple == pst->cpuid) && - check_fsb(pst->fsbspeed) && - (maxfid == pst->maxfid) && - (startvid == pst->startvid)) { - print_pst_entry(pst, j); - p = (char *)pst + sizeof(struct pst_s); - ret = get_ranges(p); - return ret; - } else { - unsigned int k; - p = (char *)pst + sizeof(struct pst_s); - for (k = 0; k < number_scales; k++) - p += 2; - } - } - printk(KERN_INFO PFX "No PST tables match this cpuid " - "(0x%x)\n", etuple); - printk(KERN_INFO PFX "This is indicative of a broken " - "BIOS.\n"); - - return -EINVAL; - } - p++; - } - - return -ENODEV; -} - - -static int powernow_target(struct cpufreq_policy *policy, - unsigned int target_freq, - unsigned int relation) -{ - unsigned int newstate; - - if (cpufreq_frequency_table_target(policy, powernow_table, target_freq, - relation, &newstate)) - return -EINVAL; - - change_speed(newstate); - - return 0; -} - - -static int powernow_verify(struct cpufreq_policy *policy) -{ - return cpufreq_frequency_table_verify(policy, powernow_table); -} - -/* - * We use the fact that the bus frequency is somehow - * a multiple of 100000/3 khz, then we compute sgtc according - * to this multiple. - * That way, we match more how AMD thinks all of that work. - * We will then get the same kind of behaviour already tested under - * the "well-known" other OS. - */ -static int __cpuinit fixup_sgtc(void) -{ - unsigned int sgtc; - unsigned int m; - - m = fsb / 3333; - if ((m % 10) >= 5) - m += 5; - - m /= 10; - - sgtc = 100 * m * latency; - sgtc = sgtc / 3; - if (sgtc > 0xfffff) { - printk(KERN_WARNING PFX "SGTC too large %d\n", sgtc); - sgtc = 0xfffff; - } - return sgtc; -} - -static unsigned int powernow_get(unsigned int cpu) -{ - union msr_fidvidstatus fidvidstatus; - unsigned int cfid; - - if (cpu) - return 0; - rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val); - cfid = fidvidstatus.bits.CFID; - - return fsb * fid_codes[cfid] / 10; -} - - -static int __cpuinit acer_cpufreq_pst(const struct dmi_system_id *d) -{ - printk(KERN_WARNING PFX - "%s laptop with broken PST tables in BIOS detected.\n", - d->ident); - printk(KERN_WARNING PFX - "You need to downgrade to 3A21 (09/09/2002), or try a newer " - "BIOS than 3A71 (01/20/2003)\n"); - printk(KERN_WARNING PFX - "cpufreq scaling has been disabled as a result of this.\n"); - return 0; -} - -/* - * Some Athlon laptops have really fucked PST tables. - * A BIOS update is all that can save them. - * Mention this, and disable cpufreq. - */ -static struct dmi_system_id __cpuinitdata powernow_dmi_table[] = { - { - .callback = acer_cpufreq_pst, - .ident = "Acer Aspire", - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "Insyde Software"), - DMI_MATCH(DMI_BIOS_VERSION, "3A71"), - }, - }, - { } -}; - -static int __cpuinit powernow_cpu_init(struct cpufreq_policy *policy) -{ - union msr_fidvidstatus fidvidstatus; - int result; - - if (policy->cpu != 0) - return -ENODEV; - - rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val); - - recalibrate_cpu_khz(); - - fsb = (10 * cpu_khz) / fid_codes[fidvidstatus.bits.CFID]; - if (!fsb) { - printk(KERN_WARNING PFX "can not determine bus frequency\n"); - return -EINVAL; - } - dprintk("FSB: %3dMHz\n", fsb/1000); - - if (dmi_check_system(powernow_dmi_table) || acpi_force) { - printk(KERN_INFO PFX "PSB/PST known to be broken. " - "Trying ACPI instead\n"); - result = powernow_acpi_init(); - } else { - result = powernow_decode_bios(fidvidstatus.bits.MFID, - fidvidstatus.bits.SVID); - if (result) { - printk(KERN_INFO PFX "Trying ACPI perflib\n"); - maximum_speed = 0; - minimum_speed = -1; - latency = 0; - result = powernow_acpi_init(); - if (result) { - printk(KERN_INFO PFX - "ACPI and legacy methods failed\n"); - } - } else { - /* SGTC use the bus clock as timer */ - latency = fixup_sgtc(); - printk(KERN_INFO PFX "SGTC: %d\n", latency); - } - } - - if (result) - return result; - - printk(KERN_INFO PFX "Minimum speed %d MHz. Maximum speed %d MHz.\n", - minimum_speed/1000, maximum_speed/1000); - - policy->cpuinfo.transition_latency = - cpufreq_scale(2000000UL, fsb, latency); - - policy->cur = powernow_get(0); - - cpufreq_frequency_table_get_attr(powernow_table, policy->cpu); - - return cpufreq_frequency_table_cpuinfo(policy, powernow_table); -} - -static int powernow_cpu_exit(struct cpufreq_policy *policy) -{ - cpufreq_frequency_table_put_attr(policy->cpu); - -#ifdef CONFIG_X86_POWERNOW_K7_ACPI - if (acpi_processor_perf) { - acpi_processor_unregister_performance(acpi_processor_perf, 0); - free_cpumask_var(acpi_processor_perf->shared_cpu_map); - kfree(acpi_processor_perf); - } -#endif - - kfree(powernow_table); - return 0; -} - -static struct freq_attr *powernow_table_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - NULL, -}; - -static struct cpufreq_driver powernow_driver = { - .verify = powernow_verify, - .target = powernow_target, - .get = powernow_get, -#ifdef CONFIG_X86_POWERNOW_K7_ACPI - .bios_limit = acpi_processor_get_bios_limit, -#endif - .init = powernow_cpu_init, - .exit = powernow_cpu_exit, - .name = "powernow-k7", - .owner = THIS_MODULE, - .attr = powernow_table_attr, -}; - -static int __init powernow_init(void) -{ - if (check_powernow() == 0) - return -ENODEV; - return cpufreq_register_driver(&powernow_driver); -} - - -static void __exit powernow_exit(void) -{ - cpufreq_unregister_driver(&powernow_driver); -} - -module_param(acpi_force, int, 0444); -MODULE_PARM_DESC(acpi_force, "Force ACPI to be used."); - -MODULE_AUTHOR("Dave Jones <davej@redhat.com>"); -MODULE_DESCRIPTION("Powernow driver for AMD K7 processors."); -MODULE_LICENSE("GPL"); - -late_initcall(powernow_init); -module_exit(powernow_exit); - diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.h b/arch/x86/kernel/cpu/cpufreq/powernow-k7.h deleted file mode 100644 index 35fb4eaf6e1..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.h +++ /dev/null @@ -1,43 +0,0 @@ -/* - * (C) 2003 Dave Jones. - * - * Licensed under the terms of the GNU GPL License version 2. - * - * AMD-specific information - * - */ - -union msr_fidvidctl { - struct { - unsigned FID:5, // 4:0 - reserved1:3, // 7:5 - VID:5, // 12:8 - reserved2:3, // 15:13 - FIDC:1, // 16 - VIDC:1, // 17 - reserved3:2, // 19:18 - FIDCHGRATIO:1, // 20 - reserved4:11, // 31-21 - SGTC:20, // 32:51 - reserved5:12; // 63:52 - } bits; - unsigned long long val; -}; - -union msr_fidvidstatus { - struct { - unsigned CFID:5, // 4:0 - reserved1:3, // 7:5 - SFID:5, // 12:8 - reserved2:3, // 15:13 - MFID:5, // 20:16 - reserved3:11, // 31:21 - CVID:5, // 36:32 - reserved4:3, // 39:37 - SVID:5, // 44:40 - reserved5:3, // 47:45 - MVID:5, // 52:48 - reserved6:11; // 63:53 - } bits; - unsigned long long val; -}; diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c deleted file mode 100644 index 491977baf6c..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ /dev/null @@ -1,1601 +0,0 @@ -/* - * (c) 2003-2010 Advanced Micro Devices, Inc. - * Your use of this code is subject to the terms and conditions of the - * GNU general public license version 2. See "COPYING" or - * http://www.gnu.org/licenses/gpl.html - * - * Support : mark.langsdorf@amd.com - * - * Based on the powernow-k7.c module written by Dave Jones. - * (C) 2003 Dave Jones on behalf of SuSE Labs - * (C) 2004 Dominik Brodowski <linux@brodo.de> - * (C) 2004 Pavel Machek <pavel@ucw.cz> - * Licensed under the terms of the GNU GPL License version 2. - * Based upon datasheets & sample CPUs kindly provided by AMD. - * - * Valuable input gratefully received from Dave Jones, Pavel Machek, - * Dominik Brodowski, Jacob Shin, and others. - * Originally developed by Paul Devriendt. - * Processor information obtained from Chapter 9 (Power and Thermal Management) - * of the "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD - * Opteron Processors" available for download from www.amd.com - * - * Tables for specific CPUs can be inferred from - * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/30430.pdf - */ - -#include <linux/kernel.h> -#include <linux/smp.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/cpufreq.h> -#include <linux/slab.h> -#include <linux/string.h> -#include <linux/cpumask.h> -#include <linux/sched.h> /* for current / set_cpus_allowed() */ -#include <linux/io.h> -#include <linux/delay.h> - -#include <asm/msr.h> - -#include <linux/acpi.h> -#include <linux/mutex.h> -#include <acpi/processor.h> - -#define PFX "powernow-k8: " -#define VERSION "version 2.20.00" -#include "powernow-k8.h" -#include "mperf.h" - -/* serialize freq changes */ -static DEFINE_MUTEX(fidvid_mutex); - -static DEFINE_PER_CPU(struct powernow_k8_data *, powernow_data); - -static int cpu_family = CPU_OPTERON; - -/* core performance boost */ -static bool cpb_capable, cpb_enabled; -static struct msr __percpu *msrs; - -static struct cpufreq_driver cpufreq_amd64_driver; - -#ifndef CONFIG_SMP -static inline const struct cpumask *cpu_core_mask(int cpu) -{ - return cpumask_of(0); -} -#endif - -/* Return a frequency in MHz, given an input fid */ -static u32 find_freq_from_fid(u32 fid) -{ - return 800 + (fid * 100); -} - -/* Return a frequency in KHz, given an input fid */ -static u32 find_khz_freq_from_fid(u32 fid) -{ - return 1000 * find_freq_from_fid(fid); -} - -static u32 find_khz_freq_from_pstate(struct cpufreq_frequency_table *data, - u32 pstate) -{ - return data[pstate].frequency; -} - -/* Return the vco fid for an input fid - * - * Each "low" fid has corresponding "high" fid, and you can get to "low" fids - * only from corresponding high fids. This returns "high" fid corresponding to - * "low" one. - */ -static u32 convert_fid_to_vco_fid(u32 fid) -{ - if (fid < HI_FID_TABLE_BOTTOM) - return 8 + (2 * fid); - else - return fid; -} - -/* - * Return 1 if the pending bit is set. Unless we just instructed the processor - * to transition to a new state, seeing this bit set is really bad news. - */ -static int pending_bit_stuck(void) -{ - u32 lo, hi; - - if (cpu_family == CPU_HW_PSTATE) - return 0; - - rdmsr(MSR_FIDVID_STATUS, lo, hi); - return lo & MSR_S_LO_CHANGE_PENDING ? 1 : 0; -} - -/* - * Update the global current fid / vid values from the status msr. - * Returns 1 on error. - */ -static int query_current_values_with_pending_wait(struct powernow_k8_data *data) -{ - u32 lo, hi; - u32 i = 0; - - if (cpu_family == CPU_HW_PSTATE) { - rdmsr(MSR_PSTATE_STATUS, lo, hi); - i = lo & HW_PSTATE_MASK; - data->currpstate = i; - - /* - * a workaround for family 11h erratum 311 might cause - * an "out-of-range Pstate if the core is in Pstate-0 - */ - if ((boot_cpu_data.x86 == 0x11) && (i >= data->numps)) - data->currpstate = HW_PSTATE_0; - - return 0; - } - do { - if (i++ > 10000) { - dprintk("detected change pending stuck\n"); - return 1; - } - rdmsr(MSR_FIDVID_STATUS, lo, hi); - } while (lo & MSR_S_LO_CHANGE_PENDING); - - data->currvid = hi & MSR_S_HI_CURRENT_VID; - data->currfid = lo & MSR_S_LO_CURRENT_FID; - - return 0; -} - -/* the isochronous relief time */ -static void count_off_irt(struct powernow_k8_data *data) -{ - udelay((1 << data->irt) * 10); - return; -} - -/* the voltage stabilization time */ -static void count_off_vst(struct powernow_k8_data *data) -{ - udelay(data->vstable * VST_UNITS_20US); - return; -} - -/* need to init the control msr to a safe value (for each cpu) */ -static void fidvid_msr_init(void) -{ - u32 lo, hi; - u8 fid, vid; - - rdmsr(MSR_FIDVID_STATUS, lo, hi); - vid = hi & MSR_S_HI_CURRENT_VID; - fid = lo & MSR_S_LO_CURRENT_FID; - lo = fid | (vid << MSR_C_LO_VID_SHIFT); - hi = MSR_C_HI_STP_GNT_BENIGN; - dprintk("cpu%d, init lo 0x%x, hi 0x%x\n", smp_processor_id(), lo, hi); - wrmsr(MSR_FIDVID_CTL, lo, hi); -} - -/* write the new fid value along with the other control fields to the msr */ -static int write_new_fid(struct powernow_k8_data *data, u32 fid) -{ - u32 lo; - u32 savevid = data->currvid; - u32 i = 0; - - if ((fid & INVALID_FID_MASK) || (data->currvid & INVALID_VID_MASK)) { - printk(KERN_ERR PFX "internal error - overflow on fid write\n"); - return 1; - } - - lo = fid; - lo |= (data->currvid << MSR_C_LO_VID_SHIFT); - lo |= MSR_C_LO_INIT_FID_VID; - - dprintk("writing fid 0x%x, lo 0x%x, hi 0x%x\n", - fid, lo, data->plllock * PLL_LOCK_CONVERSION); - - do { - wrmsr(MSR_FIDVID_CTL, lo, data->plllock * PLL_LOCK_CONVERSION); - if (i++ > 100) { - printk(KERN_ERR PFX - "Hardware error - pending bit very stuck - " - "no further pstate changes possible\n"); - return 1; - } - } while (query_current_values_with_pending_wait(data)); - - count_off_irt(data); - - if (savevid != data->currvid) { - printk(KERN_ERR PFX - "vid change on fid trans, old 0x%x, new 0x%x\n", - savevid, data->currvid); - return 1; - } - - if (fid != data->currfid) { - printk(KERN_ERR PFX - "fid trans failed, fid 0x%x, curr 0x%x\n", fid, - data->currfid); - return 1; - } - - return 0; -} - -/* Write a new vid to the hardware */ -static int write_new_vid(struct powernow_k8_data *data, u32 vid) -{ - u32 lo; - u32 savefid = data->currfid; - int i = 0; - - if ((data->currfid & INVALID_FID_MASK) || (vid & INVALID_VID_MASK)) { - printk(KERN_ERR PFX "internal error - overflow on vid write\n"); - return 1; - } - - lo = data->currfid; - lo |= (vid << MSR_C_LO_VID_SHIFT); - lo |= MSR_C_LO_INIT_FID_VID; - - dprintk("writing vid 0x%x, lo 0x%x, hi 0x%x\n", - vid, lo, STOP_GRANT_5NS); - - do { - wrmsr(MSR_FIDVID_CTL, lo, STOP_GRANT_5NS); - if (i++ > 100) { - printk(KERN_ERR PFX "internal error - pending bit " - "very stuck - no further pstate " - "changes possible\n"); - return 1; - } - } while (query_current_values_with_pending_wait(data)); - - if (savefid != data->currfid) { - printk(KERN_ERR PFX "fid changed on vid trans, old " - "0x%x new 0x%x\n", - savefid, data->currfid); - return 1; - } - - if (vid != data->currvid) { - printk(KERN_ERR PFX "vid trans failed, vid 0x%x, " - "curr 0x%x\n", - vid, data->currvid); - return 1; - } - - return 0; -} - -/* - * Reduce the vid by the max of step or reqvid. - * Decreasing vid codes represent increasing voltages: - * vid of 0 is 1.550V, vid of 0x1e is 0.800V, vid of VID_OFF is off. - */ -static int decrease_vid_code_by_step(struct powernow_k8_data *data, - u32 reqvid, u32 step) -{ - if ((data->currvid - reqvid) > step) - reqvid = data->currvid - step; - - if (write_new_vid(data, reqvid)) - return 1; - - count_off_vst(data); - - return 0; -} - -/* Change hardware pstate by single MSR write */ -static int transition_pstate(struct powernow_k8_data *data, u32 pstate) -{ - wrmsr(MSR_PSTATE_CTRL, pstate, 0); - data->currpstate = pstate; - return 0; -} - -/* Change Opteron/Athlon64 fid and vid, by the 3 phases. */ -static int transition_fid_vid(struct powernow_k8_data *data, - u32 reqfid, u32 reqvid) -{ - if (core_voltage_pre_transition(data, reqvid, reqfid)) - return 1; - - if (core_frequency_transition(data, reqfid)) - return 1; - - if (core_voltage_post_transition(data, reqvid)) - return 1; - - if (query_current_values_with_pending_wait(data)) - return 1; - - if ((reqfid != data->currfid) || (reqvid != data->currvid)) { - printk(KERN_ERR PFX "failed (cpu%d): req 0x%x 0x%x, " - "curr 0x%x 0x%x\n", - smp_processor_id(), - reqfid, reqvid, data->currfid, data->currvid); - return 1; - } - - dprintk("transitioned (cpu%d): new fid 0x%x, vid 0x%x\n", - smp_processor_id(), data->currfid, data->currvid); - - return 0; -} - -/* Phase 1 - core voltage transition ... setup voltage */ -static int core_voltage_pre_transition(struct powernow_k8_data *data, - u32 reqvid, u32 reqfid) -{ - u32 rvosteps = data->rvo; - u32 savefid = data->currfid; - u32 maxvid, lo, rvomult = 1; - - dprintk("ph1 (cpu%d): start, currfid 0x%x, currvid 0x%x, " - "reqvid 0x%x, rvo 0x%x\n", - smp_processor_id(), - data->currfid, data->currvid, reqvid, data->rvo); - - if ((savefid < LO_FID_TABLE_TOP) && (reqfid < LO_FID_TABLE_TOP)) - rvomult = 2; - rvosteps *= rvomult; - rdmsr(MSR_FIDVID_STATUS, lo, maxvid); - maxvid = 0x1f & (maxvid >> 16); - dprintk("ph1 maxvid=0x%x\n", maxvid); - if (reqvid < maxvid) /* lower numbers are higher voltages */ - reqvid = maxvid; - - while (data->currvid > reqvid) { - dprintk("ph1: curr 0x%x, req vid 0x%x\n", - data->currvid, reqvid); - if (decrease_vid_code_by_step(data, reqvid, data->vidmvs)) - return 1; - } - - while ((rvosteps > 0) && - ((rvomult * data->rvo + data->currvid) > reqvid)) { - if (data->currvid == maxvid) { - rvosteps = 0; - } else { - dprintk("ph1: changing vid for rvo, req 0x%x\n", - data->currvid - 1); - if (decrease_vid_code_by_step(data, data->currvid-1, 1)) - return 1; - rvosteps--; - } - } - - if (query_current_values_with_pending_wait(data)) - return 1; - - if (savefid != data->currfid) { - printk(KERN_ERR PFX "ph1 err, currfid changed 0x%x\n", - data->currfid); - return 1; - } - - dprintk("ph1 complete, currfid 0x%x, currvid 0x%x\n", - data->currfid, data->currvid); - - return 0; -} - -/* Phase 2 - core frequency transition */ -static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid) -{ - u32 vcoreqfid, vcocurrfid, vcofiddiff; - u32 fid_interval, savevid = data->currvid; - - if (data->currfid == reqfid) { - printk(KERN_ERR PFX "ph2 null fid transition 0x%x\n", - data->currfid); - return 0; - } - - dprintk("ph2 (cpu%d): starting, currfid 0x%x, currvid 0x%x, " - "reqfid 0x%x\n", - smp_processor_id(), - data->currfid, data->currvid, reqfid); - - vcoreqfid = convert_fid_to_vco_fid(reqfid); - vcocurrfid = convert_fid_to_vco_fid(data->currfid); - vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid - : vcoreqfid - vcocurrfid; - - if ((reqfid <= LO_FID_TABLE_TOP) && (data->currfid <= LO_FID_TABLE_TOP)) - vcofiddiff = 0; - - while (vcofiddiff > 2) { - (data->currfid & 1) ? (fid_interval = 1) : (fid_interval = 2); - - if (reqfid > data->currfid) { - if (data->currfid > LO_FID_TABLE_TOP) { - if (write_new_fid(data, - data->currfid + fid_interval)) - return 1; - } else { - if (write_new_fid - (data, - 2 + convert_fid_to_vco_fid(data->currfid))) - return 1; - } - } else { - if (write_new_fid(data, data->currfid - fid_interval)) - return 1; - } - - vcocurrfid = convert_fid_to_vco_fid(data->currfid); - vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid - : vcoreqfid - vcocurrfid; - } - - if (write_new_fid(data, reqfid)) - return 1; - - if (query_current_values_with_pending_wait(data)) - return 1; - - if (data->currfid != reqfid) { - printk(KERN_ERR PFX - "ph2: mismatch, failed fid transition, " - "curr 0x%x, req 0x%x\n", - data->currfid, reqfid); - return 1; - } - - if (savevid != data->currvid) { - printk(KERN_ERR PFX "ph2: vid changed, save 0x%x, curr 0x%x\n", - savevid, data->currvid); - return 1; - } - - dprintk("ph2 complete, currfid 0x%x, currvid 0x%x\n", - data->currfid, data->currvid); - - return 0; -} - -/* Phase 3 - core voltage transition flow ... jump to the final vid. */ -static int core_voltage_post_transition(struct powernow_k8_data *data, - u32 reqvid) -{ - u32 savefid = data->currfid; - u32 savereqvid = reqvid; - - dprintk("ph3 (cpu%d): starting, currfid 0x%x, currvid 0x%x\n", - smp_processor_id(), - data->currfid, data->currvid); - - if (reqvid != data->currvid) { - if (write_new_vid(data, reqvid)) - return 1; - - if (savefid != data->currfid) { - printk(KERN_ERR PFX - "ph3: bad fid change, save 0x%x, curr 0x%x\n", - savefid, data->currfid); - return 1; - } - - if (data->currvid != reqvid) { - printk(KERN_ERR PFX - "ph3: failed vid transition\n, " - "req 0x%x, curr 0x%x", - reqvid, data->currvid); - return 1; - } - } - - if (query_current_values_with_pending_wait(data)) - return 1; - - if (savereqvid != data->currvid) { - dprintk("ph3 failed, currvid 0x%x\n", data->currvid); - return 1; - } - - if (savefid != data->currfid) { - dprintk("ph3 failed, currfid changed 0x%x\n", - data->currfid); - return 1; - } - - dprintk("ph3 complete, currfid 0x%x, currvid 0x%x\n", - data->currfid, data->currvid); - - return 0; -} - -static void check_supported_cpu(void *_rc) -{ - u32 eax, ebx, ecx, edx; - int *rc = _rc; - - *rc = -ENODEV; - - if (current_cpu_data.x86_vendor != X86_VENDOR_AMD) - return; - - eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE); - if (((eax & CPUID_XFAM) != CPUID_XFAM_K8) && - ((eax & CPUID_XFAM) < CPUID_XFAM_10H)) - return; - - if ((eax & CPUID_XFAM) == CPUID_XFAM_K8) { - if (((eax & CPUID_USE_XFAM_XMOD) != CPUID_USE_XFAM_XMOD) || - ((eax & CPUID_XMOD) > CPUID_XMOD_REV_MASK)) { - printk(KERN_INFO PFX - "Processor cpuid %x not supported\n", eax); - return; - } - - eax = cpuid_eax(CPUID_GET_MAX_CAPABILITIES); - if (eax < CPUID_FREQ_VOLT_CAPABILITIES) { - printk(KERN_INFO PFX - "No frequency change capabilities detected\n"); - return; - } - - cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx); - if ((edx & P_STATE_TRANSITION_CAPABLE) - != P_STATE_TRANSITION_CAPABLE) { - printk(KERN_INFO PFX - "Power state transitions not supported\n"); - return; - } - } else { /* must be a HW Pstate capable processor */ - cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx); - if ((edx & USE_HW_PSTATE) == USE_HW_PSTATE) - cpu_family = CPU_HW_PSTATE; - else - return; - } - - *rc = 0; -} - -static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst, - u8 maxvid) -{ - unsigned int j; - u8 lastfid = 0xff; - - for (j = 0; j < data->numps; j++) { - if (pst[j].vid > LEAST_VID) { - printk(KERN_ERR FW_BUG PFX "vid %d invalid : 0x%x\n", - j, pst[j].vid); - return -EINVAL; - } - if (pst[j].vid < data->rvo) { - /* vid + rvo >= 0 */ - printk(KERN_ERR FW_BUG PFX "0 vid exceeded with pstate" - " %d\n", j); - return -ENODEV; - } - if (pst[j].vid < maxvid + data->rvo) { - /* vid + rvo >= maxvid */ - printk(KERN_ERR FW_BUG PFX "maxvid exceeded with pstate" - " %d\n", j); - return -ENODEV; - } - if (pst[j].fid > MAX_FID) { - printk(KERN_ERR FW_BUG PFX "maxfid exceeded with pstate" - " %d\n", j); - return -ENODEV; - } - if (j && (pst[j].fid < HI_FID_TABLE_BOTTOM)) { - /* Only first fid is allowed to be in "low" range */ - printk(KERN_ERR FW_BUG PFX "two low fids - %d : " - "0x%x\n", j, pst[j].fid); - return -EINVAL; - } - if (pst[j].fid < lastfid) - lastfid = pst[j].fid; - } - if (lastfid & 1) { - printk(KERN_ERR FW_BUG PFX "lastfid invalid\n"); - return -EINVAL; - } - if (lastfid > LO_FID_TABLE_TOP) - printk(KERN_INFO FW_BUG PFX - "first fid not from lo freq table\n"); - - return 0; -} - -static void invalidate_entry(struct cpufreq_frequency_table *powernow_table, - unsigned int entry) -{ - powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID; -} - -static void print_basics(struct powernow_k8_data *data) -{ - int j; - for (j = 0; j < data->numps; j++) { - if (data->powernow_table[j].frequency != - CPUFREQ_ENTRY_INVALID) { - if (cpu_family == CPU_HW_PSTATE) { - printk(KERN_INFO PFX - " %d : pstate %d (%d MHz)\n", j, - data->powernow_table[j].index, - data->powernow_table[j].frequency/1000); - } else { - printk(KERN_INFO PFX - " %d : fid 0x%x (%d MHz), vid 0x%x\n", - j, - data->powernow_table[j].index & 0xff, - data->powernow_table[j].frequency/1000, - data->powernow_table[j].index >> 8); - } - } - } - if (data->batps) - printk(KERN_INFO PFX "Only %d pstates on battery\n", - data->batps); -} - -static u32 freq_from_fid_did(u32 fid, u32 did) -{ - u32 mhz = 0; - - if (boot_cpu_data.x86 == 0x10) - mhz = (100 * (fid + 0x10)) >> did; - else if (boot_cpu_data.x86 == 0x11) - mhz = (100 * (fid + 8)) >> did; - else - BUG(); - - return mhz * 1000; -} - -static int fill_powernow_table(struct powernow_k8_data *data, - struct pst_s *pst, u8 maxvid) -{ - struct cpufreq_frequency_table *powernow_table; - unsigned int j; - - if (data->batps) { - /* use ACPI support to get full speed on mains power */ - printk(KERN_WARNING PFX - "Only %d pstates usable (use ACPI driver for full " - "range\n", data->batps); - data->numps = data->batps; - } - - for (j = 1; j < data->numps; j++) { - if (pst[j-1].fid >= pst[j].fid) { - printk(KERN_ERR PFX "PST out of sequence\n"); - return -EINVAL; - } - } - - if (data->numps < 2) { - printk(KERN_ERR PFX "no p states to transition\n"); - return -ENODEV; - } - - if (check_pst_table(data, pst, maxvid)) - return -EINVAL; - - powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table) - * (data->numps + 1)), GFP_KERNEL); - if (!powernow_table) { - printk(KERN_ERR PFX "powernow_table memory alloc failure\n"); - return -ENOMEM; - } - - for (j = 0; j < data->numps; j++) { - int freq; - powernow_table[j].index = pst[j].fid; /* lower 8 bits */ - powernow_table[j].index |= (pst[j].vid << 8); /* upper 8 bits */ - freq = find_khz_freq_from_fid(pst[j].fid); - powernow_table[j].frequency = freq; - } - powernow_table[data->numps].frequency = CPUFREQ_TABLE_END; - powernow_table[data->numps].index = 0; - - if (query_current_values_with_pending_wait(data)) { - kfree(powernow_table); - return -EIO; - } - - dprintk("cfid 0x%x, cvid 0x%x\n", data->currfid, data->currvid); - data->powernow_table = powernow_table; - if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu) - print_basics(data); - - for (j = 0; j < data->numps; j++) - if ((pst[j].fid == data->currfid) && - (pst[j].vid == data->currvid)) - return 0; - - dprintk("currfid/vid do not match PST, ignoring\n"); - return 0; -} - -/* Find and validate the PSB/PST table in BIOS. */ -static int find_psb_table(struct powernow_k8_data *data) -{ - struct psb_s *psb; - unsigned int i; - u32 mvs; - u8 maxvid; - u32 cpst = 0; - u32 thiscpuid; - - for (i = 0xc0000; i < 0xffff0; i += 0x10) { - /* Scan BIOS looking for the signature. */ - /* It can not be at ffff0 - it is too big. */ - - psb = phys_to_virt(i); - if (memcmp(psb, PSB_ID_STRING, PSB_ID_STRING_LEN) != 0) - continue; - - dprintk("found PSB header at 0x%p\n", psb); - - dprintk("table vers: 0x%x\n", psb->tableversion); - if (psb->tableversion != PSB_VERSION_1_4) { - printk(KERN_ERR FW_BUG PFX "PSB table is not v1.4\n"); - return -ENODEV; - } - - dprintk("flags: 0x%x\n", psb->flags1); - if (psb->flags1) { - printk(KERN_ERR FW_BUG PFX "unknown flags\n"); - return -ENODEV; - } - - data->vstable = psb->vstable; - dprintk("voltage stabilization time: %d(*20us)\n", - data->vstable); - - dprintk("flags2: 0x%x\n", psb->flags2); - data->rvo = psb->flags2 & 3; - data->irt = ((psb->flags2) >> 2) & 3; - mvs = ((psb->flags2) >> 4) & 3; - data->vidmvs = 1 << mvs; - data->batps = ((psb->flags2) >> 6) & 3; - - dprintk("ramp voltage offset: %d\n", data->rvo); - dprintk("isochronous relief time: %d\n", data->irt); - dprintk("maximum voltage step: %d - 0x%x\n", mvs, data->vidmvs); - - dprintk("numpst: 0x%x\n", psb->num_tables); - cpst = psb->num_tables; - if ((psb->cpuid == 0x00000fc0) || - (psb->cpuid == 0x00000fe0)) { - thiscpuid = cpuid_eax(CPUID_PROCESSOR_SIGNATURE); - if ((thiscpuid == 0x00000fc0) || - (thiscpuid == 0x00000fe0)) - cpst = 1; - } - if (cpst != 1) { - printk(KERN_ERR FW_BUG PFX "numpst must be 1\n"); - return -ENODEV; - } - - data->plllock = psb->plllocktime; - dprintk("plllocktime: 0x%x (units 1us)\n", psb->plllocktime); - dprintk("maxfid: 0x%x\n", psb->maxfid); - dprintk("maxvid: 0x%x\n", psb->maxvid); - maxvid = psb->maxvid; - - data->numps = psb->numps; - dprintk("numpstates: 0x%x\n", data->numps); - return fill_powernow_table(data, - (struct pst_s *)(psb+1), maxvid); - } - /* - * If you see this message, complain to BIOS manufacturer. If - * he tells you "we do not support Linux" or some similar - * nonsense, remember that Windows 2000 uses the same legacy - * mechanism that the old Linux PSB driver uses. Tell them it - * is broken with Windows 2000. - * - * The reference to the AMD documentation is chapter 9 in the - * BIOS and Kernel Developer's Guide, which is available on - * www.amd.com - */ - printk(KERN_ERR FW_BUG PFX "No PSB or ACPI _PSS objects\n"); - printk(KERN_ERR PFX "Make sure that your BIOS is up to date" - " and Cool'N'Quiet support is enabled in BIOS setup\n"); - return -ENODEV; -} - -static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, - unsigned int index) -{ - u64 control; - - if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE)) - return; - - control = data->acpi_data.states[index].control; - data->irt = (control >> IRT_SHIFT) & IRT_MASK; - data->rvo = (control >> RVO_SHIFT) & RVO_MASK; - data->exttype = (control >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK; - data->plllock = (control >> PLL_L_SHIFT) & PLL_L_MASK; - data->vidmvs = 1 << ((control >> MVS_SHIFT) & MVS_MASK); - data->vstable = (control >> VST_SHIFT) & VST_MASK; -} - -static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) -{ - struct cpufreq_frequency_table *powernow_table; - int ret_val = -ENODEV; - u64 control, status; - - if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) { - dprintk("register performance failed: bad ACPI data\n"); - return -EIO; - } - - /* verify the data contained in the ACPI structures */ - if (data->acpi_data.state_count <= 1) { - dprintk("No ACPI P-States\n"); - goto err_out; - } - - control = data->acpi_data.control_register.space_id; - status = data->acpi_data.status_register.space_id; - - if ((control != ACPI_ADR_SPACE_FIXED_HARDWARE) || - (status != ACPI_ADR_SPACE_FIXED_HARDWARE)) { - dprintk("Invalid control/status registers (%x - %x)\n", - control, status); - goto err_out; - } - - /* fill in data->powernow_table */ - powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table) - * (data->acpi_data.state_count + 1)), GFP_KERNEL); - if (!powernow_table) { - dprintk("powernow_table memory alloc failure\n"); - goto err_out; - } - - /* fill in data */ - data->numps = data->acpi_data.state_count; - powernow_k8_acpi_pst_values(data, 0); - - if (cpu_family == CPU_HW_PSTATE) - ret_val = fill_powernow_table_pstate(data, powernow_table); - else - ret_val = fill_powernow_table_fidvid(data, powernow_table); - if (ret_val) - goto err_out_mem; - - powernow_table[data->acpi_data.state_count].frequency = - CPUFREQ_TABLE_END; - powernow_table[data->acpi_data.state_count].index = 0; - data->powernow_table = powernow_table; - - if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu) - print_basics(data); - - /* notify BIOS that we exist */ - acpi_processor_notify_smm(THIS_MODULE); - - if (!zalloc_cpumask_var(&data->acpi_data.shared_cpu_map, GFP_KERNEL)) { - printk(KERN_ERR PFX - "unable to alloc powernow_k8_data cpumask\n"); - ret_val = -ENOMEM; - goto err_out_mem; - } - - return 0; - -err_out_mem: - kfree(powernow_table); - -err_out: - acpi_processor_unregister_performance(&data->acpi_data, data->cpu); - - /* data->acpi_data.state_count informs us at ->exit() - * whether ACPI was used */ - data->acpi_data.state_count = 0; - - return ret_val; -} - -static int fill_powernow_table_pstate(struct powernow_k8_data *data, - struct cpufreq_frequency_table *powernow_table) -{ - int i; - u32 hi = 0, lo = 0; - rdmsr(MSR_PSTATE_CUR_LIMIT, lo, hi); - data->max_hw_pstate = (lo & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT; - - for (i = 0; i < data->acpi_data.state_count; i++) { - u32 index; - - index = data->acpi_data.states[i].control & HW_PSTATE_MASK; - if (index > data->max_hw_pstate) { - printk(KERN_ERR PFX "invalid pstate %d - " - "bad value %d.\n", i, index); - printk(KERN_ERR PFX "Please report to BIOS " - "manufacturer\n"); - invalidate_entry(powernow_table, i); - continue; - } - rdmsr(MSR_PSTATE_DEF_BASE + index, lo, hi); - if (!(hi & HW_PSTATE_VALID_MASK)) { - dprintk("invalid pstate %d, ignoring\n", index); - invalidate_entry(powernow_table, i); - continue; - } - - powernow_table[i].index = index; - - /* Frequency may be rounded for these */ - if ((boot_cpu_data.x86 == 0x10 && boot_cpu_data.x86_model < 10) - || boot_cpu_data.x86 == 0x11) { - powernow_table[i].frequency = - freq_from_fid_did(lo & 0x3f, (lo >> 6) & 7); - } else - powernow_table[i].frequency = - data->acpi_data.states[i].core_frequency * 1000; - } - return 0; -} - -static int fill_powernow_table_fidvid(struct powernow_k8_data *data, - struct cpufreq_frequency_table *powernow_table) -{ - int i; - - for (i = 0; i < data->acpi_data.state_count; i++) { - u32 fid; - u32 vid; - u32 freq, index; - u64 status, control; - - if (data->exttype) { - status = data->acpi_data.states[i].status; - fid = status & EXT_FID_MASK; - vid = (status >> VID_SHIFT) & EXT_VID_MASK; - } else { - control = data->acpi_data.states[i].control; - fid = control & FID_MASK; - vid = (control >> VID_SHIFT) & VID_MASK; - } - - dprintk(" %d : fid 0x%x, vid 0x%x\n", i, fid, vid); - - index = fid | (vid<<8); - powernow_table[i].index = index; - - freq = find_khz_freq_from_fid(fid); - powernow_table[i].frequency = freq; - - /* verify frequency is OK */ - if ((freq > (MAX_FREQ * 1000)) || (freq < (MIN_FREQ * 1000))) { - dprintk("invalid freq %u kHz, ignoring\n", freq); - invalidate_entry(powernow_table, i); - continue; - } - - /* verify voltage is OK - - * BIOSs are using "off" to indicate invalid */ - if (vid == VID_OFF) { - dprintk("invalid vid %u, ignoring\n", vid); - invalidate_entry(powernow_table, i); - continue; - } - - if (freq != (data->acpi_data.states[i].core_frequency * 1000)) { - printk(KERN_INFO PFX "invalid freq entries " - "%u kHz vs. %u kHz\n", freq, - (unsigned int) - (data->acpi_data.states[i].core_frequency - * 1000)); - invalidate_entry(powernow_table, i); - continue; - } - } - return 0; -} - -static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data) -{ - if (data->acpi_data.state_count) - acpi_processor_unregister_performance(&data->acpi_data, - data->cpu); - free_cpumask_var(data->acpi_data.shared_cpu_map); -} - -static int get_transition_latency(struct powernow_k8_data *data) -{ - int max_latency = 0; - int i; - for (i = 0; i < data->acpi_data.state_count; i++) { - int cur_latency = data->acpi_data.states[i].transition_latency - + data->acpi_data.states[i].bus_master_latency; - if (cur_latency > max_latency) - max_latency = cur_latency; - } - if (max_latency == 0) { - /* - * Fam 11h and later may return 0 as transition latency. This - * is intended and means "very fast". While cpufreq core and - * governors currently can handle that gracefully, better set it - * to 1 to avoid problems in the future. - */ - if (boot_cpu_data.x86 < 0x11) - printk(KERN_ERR FW_WARN PFX "Invalid zero transition " - "latency\n"); - max_latency = 1; - } - /* value in usecs, needs to be in nanoseconds */ - return 1000 * max_latency; -} - -/* Take a frequency, and issue the fid/vid transition command */ -static int transition_frequency_fidvid(struct powernow_k8_data *data, - unsigned int index) -{ - u32 fid = 0; - u32 vid = 0; - int res, i; - struct cpufreq_freqs freqs; - - dprintk("cpu %d transition to index %u\n", smp_processor_id(), index); - - /* fid/vid correctness check for k8 */ - /* fid are the lower 8 bits of the index we stored into - * the cpufreq frequency table in find_psb_table, vid - * are the upper 8 bits. - */ - fid = data->powernow_table[index].index & 0xFF; - vid = (data->powernow_table[index].index & 0xFF00) >> 8; - - dprintk("table matched fid 0x%x, giving vid 0x%x\n", fid, vid); - - if (query_current_values_with_pending_wait(data)) - return 1; - - if ((data->currvid == vid) && (data->currfid == fid)) { - dprintk("target matches current values (fid 0x%x, vid 0x%x)\n", - fid, vid); - return 0; - } - - dprintk("cpu %d, changing to fid 0x%x, vid 0x%x\n", - smp_processor_id(), fid, vid); - freqs.old = find_khz_freq_from_fid(data->currfid); - freqs.new = find_khz_freq_from_fid(fid); - - for_each_cpu(i, data->available_cores) { - freqs.cpu = i; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - } - - res = transition_fid_vid(data, fid, vid); - freqs.new = find_khz_freq_from_fid(data->currfid); - - for_each_cpu(i, data->available_cores) { - freqs.cpu = i; - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - } - return res; -} - -/* Take a frequency, and issue the hardware pstate transition command */ -static int transition_frequency_pstate(struct powernow_k8_data *data, - unsigned int index) -{ - u32 pstate = 0; - int res, i; - struct cpufreq_freqs freqs; - - dprintk("cpu %d transition to index %u\n", smp_processor_id(), index); - - /* get MSR index for hardware pstate transition */ - pstate = index & HW_PSTATE_MASK; - if (pstate > data->max_hw_pstate) - return 0; - freqs.old = find_khz_freq_from_pstate(data->powernow_table, - data->currpstate); - freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); - - for_each_cpu(i, data->available_cores) { - freqs.cpu = i; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - } - - res = transition_pstate(data, pstate); - freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); - - for_each_cpu(i, data->available_cores) { - freqs.cpu = i; - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - } - return res; -} - -/* Driver entry point to switch to the target frequency */ -static int powernowk8_target(struct cpufreq_policy *pol, - unsigned targfreq, unsigned relation) -{ - cpumask_var_t oldmask; - struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu); - u32 checkfid; - u32 checkvid; - unsigned int newstate; - int ret = -EIO; - - if (!data) - return -EINVAL; - - checkfid = data->currfid; - checkvid = data->currvid; - - /* only run on specific CPU from here on. */ - /* This is poor form: use a workqueue or smp_call_function_single */ - if (!alloc_cpumask_var(&oldmask, GFP_KERNEL)) - return -ENOMEM; - - cpumask_copy(oldmask, tsk_cpus_allowed(current)); - set_cpus_allowed_ptr(current, cpumask_of(pol->cpu)); - - if (smp_processor_id() != pol->cpu) { - printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu); - goto err_out; - } - - if (pending_bit_stuck()) { - printk(KERN_ERR PFX "failing targ, change pending bit set\n"); - goto err_out; - } - - dprintk("targ: cpu %d, %d kHz, min %d, max %d, relation %d\n", - pol->cpu, targfreq, pol->min, pol->max, relation); - - if (query_current_values_with_pending_wait(data)) - goto err_out; - - if (cpu_family != CPU_HW_PSTATE) { - dprintk("targ: curr fid 0x%x, vid 0x%x\n", - data->currfid, data->currvid); - - if ((checkvid != data->currvid) || - (checkfid != data->currfid)) { - printk(KERN_INFO PFX - "error - out of sync, fix 0x%x 0x%x, " - "vid 0x%x 0x%x\n", - checkfid, data->currfid, - checkvid, data->currvid); - } - } - - if (cpufreq_frequency_table_target(pol, data->powernow_table, - targfreq, relation, &newstate)) - goto err_out; - - mutex_lock(&fidvid_mutex); - - powernow_k8_acpi_pst_values(data, newstate); - - if (cpu_family == CPU_HW_PSTATE) - ret = transition_frequency_pstate(data, newstate); - else - ret = transition_frequency_fidvid(data, newstate); - if (ret) { - printk(KERN_ERR PFX "transition frequency failed\n"); - ret = 1; - mutex_unlock(&fidvid_mutex); - goto err_out; - } - mutex_unlock(&fidvid_mutex); - - if (cpu_family == CPU_HW_PSTATE) - pol->cur = find_khz_freq_from_pstate(data->powernow_table, - newstate); - else - pol->cur = find_khz_freq_from_fid(data->currfid); - ret = 0; - -err_out: - set_cpus_allowed_ptr(current, oldmask); - free_cpumask_var(oldmask); - return ret; -} - -/* Driver entry point to verify the policy and range of frequencies */ -static int powernowk8_verify(struct cpufreq_policy *pol) -{ - struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu); - - if (!data) - return -EINVAL; - - return cpufreq_frequency_table_verify(pol, data->powernow_table); -} - -struct init_on_cpu { - struct powernow_k8_data *data; - int rc; -}; - -static void __cpuinit powernowk8_cpu_init_on_cpu(void *_init_on_cpu) -{ - struct init_on_cpu *init_on_cpu = _init_on_cpu; - - if (pending_bit_stuck()) { - printk(KERN_ERR PFX "failing init, change pending bit set\n"); - init_on_cpu->rc = -ENODEV; - return; - } - - if (query_current_values_with_pending_wait(init_on_cpu->data)) { - init_on_cpu->rc = -ENODEV; - return; - } - - if (cpu_family == CPU_OPTERON) - fidvid_msr_init(); - - init_on_cpu->rc = 0; -} - -/* per CPU init entry point to the driver */ -static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) -{ - static const char ACPI_PSS_BIOS_BUG_MSG[] = - KERN_ERR FW_BUG PFX "No compatible ACPI _PSS objects found.\n" - FW_BUG PFX "Try again with latest BIOS.\n"; - struct powernow_k8_data *data; - struct init_on_cpu init_on_cpu; - int rc; - struct cpuinfo_x86 *c = &cpu_data(pol->cpu); - - if (!cpu_online(pol->cpu)) - return -ENODEV; - - smp_call_function_single(pol->cpu, check_supported_cpu, &rc, 1); - if (rc) - return -ENODEV; - - data = kzalloc(sizeof(struct powernow_k8_data), GFP_KERNEL); - if (!data) { - printk(KERN_ERR PFX "unable to alloc powernow_k8_data"); - return -ENOMEM; - } - - data->cpu = pol->cpu; - data->currpstate = HW_PSTATE_INVALID; - - if (powernow_k8_cpu_init_acpi(data)) { - /* - * Use the PSB BIOS structure. This is only availabe on - * an UP version, and is deprecated by AMD. - */ - if (num_online_cpus() != 1) { - printk_once(ACPI_PSS_BIOS_BUG_MSG); - goto err_out; - } - if (pol->cpu != 0) { - printk(KERN_ERR FW_BUG PFX "No ACPI _PSS objects for " - "CPU other than CPU0. Complain to your BIOS " - "vendor.\n"); - goto err_out; - } - rc = find_psb_table(data); - if (rc) - goto err_out; - - /* Take a crude guess here. - * That guess was in microseconds, so multiply with 1000 */ - pol->cpuinfo.transition_latency = ( - ((data->rvo + 8) * data->vstable * VST_UNITS_20US) + - ((1 << data->irt) * 30)) * 1000; - } else /* ACPI _PSS objects available */ - pol->cpuinfo.transition_latency = get_transition_latency(data); - - /* only run on specific CPU from here on */ - init_on_cpu.data = data; - smp_call_function_single(data->cpu, powernowk8_cpu_init_on_cpu, - &init_on_cpu, 1); - rc = init_on_cpu.rc; - if (rc != 0) - goto err_out_exit_acpi; - - if (cpu_family == CPU_HW_PSTATE) - cpumask_copy(pol->cpus, cpumask_of(pol->cpu)); - else - cpumask_copy(pol->cpus, cpu_core_mask(pol->cpu)); - data->available_cores = pol->cpus; - - if (cpu_family == CPU_HW_PSTATE) - pol->cur = find_khz_freq_from_pstate(data->powernow_table, - data->currpstate); - else - pol->cur = find_khz_freq_from_fid(data->currfid); - dprintk("policy current frequency %d kHz\n", pol->cur); - - /* min/max the cpu is capable of */ - if (cpufreq_frequency_table_cpuinfo(pol, data->powernow_table)) { - printk(KERN_ERR FW_BUG PFX "invalid powernow_table\n"); - powernow_k8_cpu_exit_acpi(data); - kfree(data->powernow_table); - kfree(data); - return -EINVAL; - } - - /* Check for APERF/MPERF support in hardware */ - if (cpu_has(c, X86_FEATURE_APERFMPERF)) - cpufreq_amd64_driver.getavg = cpufreq_get_measured_perf; - - cpufreq_frequency_table_get_attr(data->powernow_table, pol->cpu); - - if (cpu_family == CPU_HW_PSTATE) - dprintk("cpu_init done, current pstate 0x%x\n", - data->currpstate); - else - dprintk("cpu_init done, current fid 0x%x, vid 0x%x\n", - data->currfid, data->currvid); - - per_cpu(powernow_data, pol->cpu) = data; - - return 0; - -err_out_exit_acpi: - powernow_k8_cpu_exit_acpi(data); - -err_out: - kfree(data); - return -ENODEV; -} - -static int __devexit powernowk8_cpu_exit(struct cpufreq_policy *pol) -{ - struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu); - - if (!data) - return -EINVAL; - - powernow_k8_cpu_exit_acpi(data); - - cpufreq_frequency_table_put_attr(pol->cpu); - - kfree(data->powernow_table); - kfree(data); - per_cpu(powernow_data, pol->cpu) = NULL; - - return 0; -} - -static void query_values_on_cpu(void *_err) -{ - int *err = _err; - struct powernow_k8_data *data = __get_cpu_var(powernow_data); - - *err = query_current_values_with_pending_wait(data); -} - -static unsigned int powernowk8_get(unsigned int cpu) -{ - struct powernow_k8_data *data = per_cpu(powernow_data, cpu); - unsigned int khz = 0; - int err; - - if (!data) - return 0; - - smp_call_function_single(cpu, query_values_on_cpu, &err, true); - if (err) - goto out; - - if (cpu_family == CPU_HW_PSTATE) - khz = find_khz_freq_from_pstate(data->powernow_table, - data->currpstate); - else - khz = find_khz_freq_from_fid(data->currfid); - - -out: - return khz; -} - -static void _cpb_toggle_msrs(bool t) -{ - int cpu; - - get_online_cpus(); - - rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs); - - for_each_cpu(cpu, cpu_online_mask) { - struct msr *reg = per_cpu_ptr(msrs, cpu); - if (t) - reg->l &= ~BIT(25); - else - reg->l |= BIT(25); - } - wrmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs); - - put_online_cpus(); -} - -/* - * Switch on/off core performance boosting. - * - * 0=disable - * 1=enable. - */ -static void cpb_toggle(bool t) -{ - if (!cpb_capable) - return; - - if (t && !cpb_enabled) { - cpb_enabled = true; - _cpb_toggle_msrs(t); - printk(KERN_INFO PFX "Core Boosting enabled.\n"); - } else if (!t && cpb_enabled) { - cpb_enabled = false; - _cpb_toggle_msrs(t); - printk(KERN_INFO PFX "Core Boosting disabled.\n"); - } -} - -static ssize_t store_cpb(struct cpufreq_policy *policy, const char *buf, - size_t count) -{ - int ret = -EINVAL; - unsigned long val = 0; - - ret = strict_strtoul(buf, 10, &val); - if (!ret && (val == 0 || val == 1) && cpb_capable) - cpb_toggle(val); - else - return -EINVAL; - - return count; -} - -static ssize_t show_cpb(struct cpufreq_policy *policy, char *buf) -{ - return sprintf(buf, "%u\n", cpb_enabled); -} - -#define define_one_rw(_name) \ -static struct freq_attr _name = \ -__ATTR(_name, 0644, show_##_name, store_##_name) - -define_one_rw(cpb); - -static struct freq_attr *powernow_k8_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - &cpb, - NULL, -}; - -static struct cpufreq_driver cpufreq_amd64_driver = { - .verify = powernowk8_verify, - .target = powernowk8_target, - .bios_limit = acpi_processor_get_bios_limit, - .init = powernowk8_cpu_init, - .exit = __devexit_p(powernowk8_cpu_exit), - .get = powernowk8_get, - .name = "powernow-k8", - .owner = THIS_MODULE, - .attr = powernow_k8_attr, -}; - -/* - * Clear the boost-disable flag on the CPU_DOWN path so that this cpu - * cannot block the remaining ones from boosting. On the CPU_UP path we - * simply keep the boost-disable flag in sync with the current global - * state. - */ -static int cpb_notify(struct notifier_block *nb, unsigned long action, - void *hcpu) -{ - unsigned cpu = (long)hcpu; - u32 lo, hi; - - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - - if (!cpb_enabled) { - rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi); - lo |= BIT(25); - wrmsr_on_cpu(cpu, MSR_K7_HWCR, lo, hi); - } - break; - - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi); - lo &= ~BIT(25); - wrmsr_on_cpu(cpu, MSR_K7_HWCR, lo, hi); - break; - - default: - break; - } - - return NOTIFY_OK; -} - -static struct notifier_block cpb_nb = { - .notifier_call = cpb_notify, -}; - -/* driver entry point for init */ -static int __cpuinit powernowk8_init(void) -{ - unsigned int i, supported_cpus = 0, cpu; - - for_each_online_cpu(i) { - int rc; - smp_call_function_single(i, check_supported_cpu, &rc, 1); - if (rc == 0) - supported_cpus++; - } - - if (supported_cpus != num_online_cpus()) - return -ENODEV; - - printk(KERN_INFO PFX "Found %d %s (%d cpu cores) (" VERSION ")\n", - num_online_nodes(), boot_cpu_data.x86_model_id, supported_cpus); - - if (boot_cpu_has(X86_FEATURE_CPB)) { - - cpb_capable = true; - - register_cpu_notifier(&cpb_nb); - - msrs = msrs_alloc(); - if (!msrs) { - printk(KERN_ERR "%s: Error allocating msrs!\n", __func__); - return -ENOMEM; - } - - rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs); - - for_each_cpu(cpu, cpu_online_mask) { - struct msr *reg = per_cpu_ptr(msrs, cpu); - cpb_enabled |= !(!!(reg->l & BIT(25))); - } - - printk(KERN_INFO PFX "Core Performance Boosting: %s.\n", - (cpb_enabled ? "on" : "off")); - } - - return cpufreq_register_driver(&cpufreq_amd64_driver); -} - -/* driver entry point for term */ -static void __exit powernowk8_exit(void) -{ - dprintk("exit\n"); - - if (boot_cpu_has(X86_FEATURE_CPB)) { - msrs_free(msrs); - msrs = NULL; - - unregister_cpu_notifier(&cpb_nb); - } - - cpufreq_unregister_driver(&cpufreq_amd64_driver); -} - -MODULE_AUTHOR("Paul Devriendt <paul.devriendt@amd.com> and " - "Mark Langsdorf <mark.langsdorf@amd.com>"); -MODULE_DESCRIPTION("AMD Athlon 64 and Opteron processor frequency driver."); -MODULE_LICENSE("GPL"); - -late_initcall(powernowk8_init); -module_exit(powernowk8_exit); diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h deleted file mode 100644 index df3529b1c02..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h +++ /dev/null @@ -1,224 +0,0 @@ -/* - * (c) 2003-2006 Advanced Micro Devices, Inc. - * Your use of this code is subject to the terms and conditions of the - * GNU general public license version 2. See "COPYING" or - * http://www.gnu.org/licenses/gpl.html - */ - -enum pstate { - HW_PSTATE_INVALID = 0xff, - HW_PSTATE_0 = 0, - HW_PSTATE_1 = 1, - HW_PSTATE_2 = 2, - HW_PSTATE_3 = 3, - HW_PSTATE_4 = 4, - HW_PSTATE_5 = 5, - HW_PSTATE_6 = 6, - HW_PSTATE_7 = 7, -}; - -struct powernow_k8_data { - unsigned int cpu; - - u32 numps; /* number of p-states */ - u32 batps; /* number of p-states supported on battery */ - u32 max_hw_pstate; /* maximum legal hardware pstate */ - - /* these values are constant when the PSB is used to determine - * vid/fid pairings, but are modified during the ->target() call - * when ACPI is used */ - u32 rvo; /* ramp voltage offset */ - u32 irt; /* isochronous relief time */ - u32 vidmvs; /* usable value calculated from mvs */ - u32 vstable; /* voltage stabilization time, units 20 us */ - u32 plllock; /* pll lock time, units 1 us */ - u32 exttype; /* extended interface = 1 */ - - /* keep track of the current fid / vid or pstate */ - u32 currvid; - u32 currfid; - enum pstate currpstate; - - /* the powernow_table includes all frequency and vid/fid pairings: - * fid are the lower 8 bits of the index, vid are the upper 8 bits. - * frequency is in kHz */ - struct cpufreq_frequency_table *powernow_table; - - /* the acpi table needs to be kept. it's only available if ACPI was - * used to determine valid frequency/vid/fid states */ - struct acpi_processor_performance acpi_data; - - /* we need to keep track of associated cores, but let cpufreq - * handle hotplug events - so just point at cpufreq pol->cpus - * structure */ - struct cpumask *available_cores; -}; - -/* processor's cpuid instruction support */ -#define CPUID_PROCESSOR_SIGNATURE 1 /* function 1 */ -#define CPUID_XFAM 0x0ff00000 /* extended family */ -#define CPUID_XFAM_K8 0 -#define CPUID_XMOD 0x000f0000 /* extended model */ -#define CPUID_XMOD_REV_MASK 0x000c0000 -#define CPUID_XFAM_10H 0x00100000 /* family 0x10 */ -#define CPUID_USE_XFAM_XMOD 0x00000f00 -#define CPUID_GET_MAX_CAPABILITIES 0x80000000 -#define CPUID_FREQ_VOLT_CAPABILITIES 0x80000007 -#define P_STATE_TRANSITION_CAPABLE 6 - -/* Model Specific Registers for p-state transitions. MSRs are 64-bit. For */ -/* writes (wrmsr - opcode 0f 30), the register number is placed in ecx, and */ -/* the value to write is placed in edx:eax. For reads (rdmsr - opcode 0f 32), */ -/* the register number is placed in ecx, and the data is returned in edx:eax. */ - -#define MSR_FIDVID_CTL 0xc0010041 -#define MSR_FIDVID_STATUS 0xc0010042 - -/* Field definitions within the FID VID Low Control MSR : */ -#define MSR_C_LO_INIT_FID_VID 0x00010000 -#define MSR_C_LO_NEW_VID 0x00003f00 -#define MSR_C_LO_NEW_FID 0x0000003f -#define MSR_C_LO_VID_SHIFT 8 - -/* Field definitions within the FID VID High Control MSR : */ -#define MSR_C_HI_STP_GNT_TO 0x000fffff - -/* Field definitions within the FID VID Low Status MSR : */ -#define MSR_S_LO_CHANGE_PENDING 0x80000000 /* cleared when completed */ -#define MSR_S_LO_MAX_RAMP_VID 0x3f000000 -#define MSR_S_LO_MAX_FID 0x003f0000 -#define MSR_S_LO_START_FID 0x00003f00 -#define MSR_S_LO_CURRENT_FID 0x0000003f - -/* Field definitions within the FID VID High Status MSR : */ -#define MSR_S_HI_MIN_WORKING_VID 0x3f000000 -#define MSR_S_HI_MAX_WORKING_VID 0x003f0000 -#define MSR_S_HI_START_VID 0x00003f00 -#define MSR_S_HI_CURRENT_VID 0x0000003f -#define MSR_C_HI_STP_GNT_BENIGN 0x00000001 - - -/* Hardware Pstate _PSS and MSR definitions */ -#define USE_HW_PSTATE 0x00000080 -#define HW_PSTATE_MASK 0x00000007 -#define HW_PSTATE_VALID_MASK 0x80000000 -#define HW_PSTATE_MAX_MASK 0x000000f0 -#define HW_PSTATE_MAX_SHIFT 4 -#define MSR_PSTATE_DEF_BASE 0xc0010064 /* base of Pstate MSRs */ -#define MSR_PSTATE_STATUS 0xc0010063 /* Pstate Status MSR */ -#define MSR_PSTATE_CTRL 0xc0010062 /* Pstate control MSR */ -#define MSR_PSTATE_CUR_LIMIT 0xc0010061 /* pstate current limit MSR */ - -/* define the two driver architectures */ -#define CPU_OPTERON 0 -#define CPU_HW_PSTATE 1 - - -/* - * There are restrictions frequencies have to follow: - * - only 1 entry in the low fid table ( <=1.4GHz ) - * - lowest entry in the high fid table must be >= 2 * the entry in the - * low fid table - * - lowest entry in the high fid table must be a <= 200MHz + 2 * the entry - * in the low fid table - * - the parts can only step at <= 200 MHz intervals, odd fid values are - * supported in revision G and later revisions. - * - lowest frequency must be >= interprocessor hypertransport link speed - * (only applies to MP systems obviously) - */ - -/* fids (frequency identifiers) are arranged in 2 tables - lo and hi */ -#define LO_FID_TABLE_TOP 7 /* fid values marking the boundary */ -#define HI_FID_TABLE_BOTTOM 8 /* between the low and high tables */ - -#define LO_VCOFREQ_TABLE_TOP 1400 /* corresponding vco frequency values */ -#define HI_VCOFREQ_TABLE_BOTTOM 1600 - -#define MIN_FREQ_RESOLUTION 200 /* fids jump by 2 matching freq jumps by 200 */ - -#define MAX_FID 0x2a /* Spec only gives FID values as far as 5 GHz */ -#define LEAST_VID 0x3e /* Lowest (numerically highest) useful vid value */ - -#define MIN_FREQ 800 /* Min and max freqs, per spec */ -#define MAX_FREQ 5000 - -#define INVALID_FID_MASK 0xffffffc0 /* not a valid fid if these bits are set */ -#define INVALID_VID_MASK 0xffffffc0 /* not a valid vid if these bits are set */ - -#define VID_OFF 0x3f - -#define STOP_GRANT_5NS 1 /* min poss memory access latency for voltage change */ - -#define PLL_LOCK_CONVERSION (1000/5) /* ms to ns, then divide by clock period */ - -#define MAXIMUM_VID_STEPS 1 /* Current cpus only allow a single step of 25mV */ -#define VST_UNITS_20US 20 /* Voltage Stabilization Time is in units of 20us */ - -/* - * Most values of interest are encoded in a single field of the _PSS - * entries: the "control" value. - */ - -#define IRT_SHIFT 30 -#define RVO_SHIFT 28 -#define EXT_TYPE_SHIFT 27 -#define PLL_L_SHIFT 20 -#define MVS_SHIFT 18 -#define VST_SHIFT 11 -#define VID_SHIFT 6 -#define IRT_MASK 3 -#define RVO_MASK 3 -#define EXT_TYPE_MASK 1 -#define PLL_L_MASK 0x7f -#define MVS_MASK 3 -#define VST_MASK 0x7f -#define VID_MASK 0x1f -#define FID_MASK 0x1f -#define EXT_VID_MASK 0x3f -#define EXT_FID_MASK 0x3f - - -/* - * Version 1.4 of the PSB table. This table is constructed by BIOS and is - * to tell the OS's power management driver which VIDs and FIDs are - * supported by this particular processor. - * If the data in the PSB / PST is wrong, then this driver will program the - * wrong values into hardware, which is very likely to lead to a crash. - */ - -#define PSB_ID_STRING "AMDK7PNOW!" -#define PSB_ID_STRING_LEN 10 - -#define PSB_VERSION_1_4 0x14 - -struct psb_s { - u8 signature[10]; - u8 tableversion; - u8 flags1; - u16 vstable; - u8 flags2; - u8 num_tables; - u32 cpuid; - u8 plllocktime; - u8 maxfid; - u8 maxvid; - u8 numps; -}; - -/* Pairs of fid/vid values are appended to the version 1.4 PSB table. */ -struct pst_s { - u8 fid; - u8 vid; -}; - -#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "powernow-k8", msg) - -static int core_voltage_pre_transition(struct powernow_k8_data *data, - u32 reqvid, u32 regfid); -static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvid); -static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid); - -static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index); - -static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table); -static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table); diff --git a/arch/x86/kernel/cpu/cpufreq/sc520_freq.c b/arch/x86/kernel/cpu/cpufreq/sc520_freq.c deleted file mode 100644 index 435a996a613..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/sc520_freq.c +++ /dev/null @@ -1,194 +0,0 @@ -/* - * sc520_freq.c: cpufreq driver for the AMD Elan sc520 - * - * Copyright (C) 2005 Sean Young <sean@mess.org> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Based on elanfreq.c - * - * 2005-03-30: - initial revision - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> - -#include <linux/delay.h> -#include <linux/cpufreq.h> -#include <linux/timex.h> -#include <linux/io.h> - -#include <asm/msr.h> - -#define MMCR_BASE 0xfffef000 /* The default base address */ -#define OFFS_CPUCTL 0x2 /* CPU Control Register */ - -static __u8 __iomem *cpuctl; - -#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ - "sc520_freq", msg) -#define PFX "sc520_freq: " - -static struct cpufreq_frequency_table sc520_freq_table[] = { - {0x01, 100000}, - {0x02, 133000}, - {0, CPUFREQ_TABLE_END}, -}; - -static unsigned int sc520_freq_get_cpu_frequency(unsigned int cpu) -{ - u8 clockspeed_reg = *cpuctl; - - switch (clockspeed_reg & 0x03) { - default: - printk(KERN_ERR PFX "error: cpuctl register has unexpected " - "value %02x\n", clockspeed_reg); - case 0x01: - return 100000; - case 0x02: - return 133000; - } -} - -static void sc520_freq_set_cpu_state(unsigned int state) -{ - - struct cpufreq_freqs freqs; - u8 clockspeed_reg; - - freqs.old = sc520_freq_get_cpu_frequency(0); - freqs.new = sc520_freq_table[state].frequency; - freqs.cpu = 0; /* AMD Elan is UP */ - - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - - dprintk("attempting to set frequency to %i kHz\n", - sc520_freq_table[state].frequency); - - local_irq_disable(); - - clockspeed_reg = *cpuctl & ~0x03; - *cpuctl = clockspeed_reg | sc520_freq_table[state].index; - - local_irq_enable(); - - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); -}; - -static int sc520_freq_verify(struct cpufreq_policy *policy) -{ - return cpufreq_frequency_table_verify(policy, &sc520_freq_table[0]); -} - -static int sc520_freq_target(struct cpufreq_policy *policy, - unsigned int target_freq, - unsigned int relation) -{ - unsigned int newstate = 0; - - if (cpufreq_frequency_table_target(policy, sc520_freq_table, - target_freq, relation, &newstate)) - return -EINVAL; - - sc520_freq_set_cpu_state(newstate); - - return 0; -} - - -/* - * Module init and exit code - */ - -static int sc520_freq_cpu_init(struct cpufreq_policy *policy) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - int result; - - /* capability check */ - if (c->x86_vendor != X86_VENDOR_AMD || - c->x86 != 4 || c->x86_model != 9) - return -ENODEV; - - /* cpuinfo and default policy values */ - policy->cpuinfo.transition_latency = 1000000; /* 1ms */ - policy->cur = sc520_freq_get_cpu_frequency(0); - - result = cpufreq_frequency_table_cpuinfo(policy, sc520_freq_table); - if (result) - return result; - - cpufreq_frequency_table_get_attr(sc520_freq_table, policy->cpu); - - return 0; -} - - -static int sc520_freq_cpu_exit(struct cpufreq_policy *policy) -{ - cpufreq_frequency_table_put_attr(policy->cpu); - return 0; -} - - -static struct freq_attr *sc520_freq_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - NULL, -}; - - -static struct cpufreq_driver sc520_freq_driver = { - .get = sc520_freq_get_cpu_frequency, - .verify = sc520_freq_verify, - .target = sc520_freq_target, - .init = sc520_freq_cpu_init, - .exit = sc520_freq_cpu_exit, - .name = "sc520_freq", - .owner = THIS_MODULE, - .attr = sc520_freq_attr, -}; - - -static int __init sc520_freq_init(void) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - int err; - - /* Test if we have the right hardware */ - if (c->x86_vendor != X86_VENDOR_AMD || - c->x86 != 4 || c->x86_model != 9) { - dprintk("no Elan SC520 processor found!\n"); - return -ENODEV; - } - cpuctl = ioremap((unsigned long)(MMCR_BASE + OFFS_CPUCTL), 1); - if (!cpuctl) { - printk(KERN_ERR "sc520_freq: error: failed to remap memory\n"); - return -ENOMEM; - } - - err = cpufreq_register_driver(&sc520_freq_driver); - if (err) - iounmap(cpuctl); - - return err; -} - - -static void __exit sc520_freq_exit(void) -{ - cpufreq_unregister_driver(&sc520_freq_driver); - iounmap(cpuctl); -} - - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Sean Young <sean@mess.org>"); -MODULE_DESCRIPTION("cpufreq driver for AMD's Elan sc520 CPU"); - -module_init(sc520_freq_init); -module_exit(sc520_freq_exit); - diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c deleted file mode 100644 index 9b1ff37de46..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c +++ /dev/null @@ -1,636 +0,0 @@ -/* - * cpufreq driver for Enhanced SpeedStep, as found in Intel's Pentium - * M (part of the Centrino chipset). - * - * Since the original Pentium M, most new Intel CPUs support Enhanced - * SpeedStep. - * - * Despite the "SpeedStep" in the name, this is almost entirely unlike - * traditional SpeedStep. - * - * Modelled on speedstep.c - * - * Copyright (C) 2003 Jeremy Fitzhardinge <jeremy@goop.org> - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/cpufreq.h> -#include <linux/sched.h> /* current */ -#include <linux/delay.h> -#include <linux/compiler.h> -#include <linux/gfp.h> - -#include <asm/msr.h> -#include <asm/processor.h> -#include <asm/cpufeature.h> - -#define PFX "speedstep-centrino: " -#define MAINTAINER "cpufreq@vger.kernel.org" - -#define dprintk(msg...) \ - cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg) - -#define INTEL_MSR_RANGE (0xffff) - -struct cpu_id -{ - __u8 x86; /* CPU family */ - __u8 x86_model; /* model */ - __u8 x86_mask; /* stepping */ -}; - -enum { - CPU_BANIAS, - CPU_DOTHAN_A1, - CPU_DOTHAN_A2, - CPU_DOTHAN_B0, - CPU_MP4HT_D0, - CPU_MP4HT_E0, -}; - -static const struct cpu_id cpu_ids[] = { - [CPU_BANIAS] = { 6, 9, 5 }, - [CPU_DOTHAN_A1] = { 6, 13, 1 }, - [CPU_DOTHAN_A2] = { 6, 13, 2 }, - [CPU_DOTHAN_B0] = { 6, 13, 6 }, - [CPU_MP4HT_D0] = {15, 3, 4 }, - [CPU_MP4HT_E0] = {15, 4, 1 }, -}; -#define N_IDS ARRAY_SIZE(cpu_ids) - -struct cpu_model -{ - const struct cpu_id *cpu_id; - const char *model_name; - unsigned max_freq; /* max clock in kHz */ - - struct cpufreq_frequency_table *op_points; /* clock/voltage pairs */ -}; -static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, - const struct cpu_id *x); - -/* Operating points for current CPU */ -static DEFINE_PER_CPU(struct cpu_model *, centrino_model); -static DEFINE_PER_CPU(const struct cpu_id *, centrino_cpu); - -static struct cpufreq_driver centrino_driver; - -#ifdef CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE - -/* Computes the correct form for IA32_PERF_CTL MSR for a particular - frequency/voltage operating point; frequency in MHz, volts in mV. - This is stored as "index" in the structure. */ -#define OP(mhz, mv) \ - { \ - .frequency = (mhz) * 1000, \ - .index = (((mhz)/100) << 8) | ((mv - 700) / 16) \ - } - -/* - * These voltage tables were derived from the Intel Pentium M - * datasheet, document 25261202.pdf, Table 5. I have verified they - * are consistent with my IBM ThinkPad X31, which has a 1.3GHz Pentium - * M. - */ - -/* Ultra Low Voltage Intel Pentium M processor 900MHz (Banias) */ -static struct cpufreq_frequency_table banias_900[] = -{ - OP(600, 844), - OP(800, 988), - OP(900, 1004), - { .frequency = CPUFREQ_TABLE_END } -}; - -/* Ultra Low Voltage Intel Pentium M processor 1000MHz (Banias) */ -static struct cpufreq_frequency_table banias_1000[] = -{ - OP(600, 844), - OP(800, 972), - OP(900, 988), - OP(1000, 1004), - { .frequency = CPUFREQ_TABLE_END } -}; - -/* Low Voltage Intel Pentium M processor 1.10GHz (Banias) */ -static struct cpufreq_frequency_table banias_1100[] = -{ - OP( 600, 956), - OP( 800, 1020), - OP( 900, 1100), - OP(1000, 1164), - OP(1100, 1180), - { .frequency = CPUFREQ_TABLE_END } -}; - - -/* Low Voltage Intel Pentium M processor 1.20GHz (Banias) */ -static struct cpufreq_frequency_table banias_1200[] = -{ - OP( 600, 956), - OP( 800, 1004), - OP( 900, 1020), - OP(1000, 1100), - OP(1100, 1164), - OP(1200, 1180), - { .frequency = CPUFREQ_TABLE_END } -}; - -/* Intel Pentium M processor 1.30GHz (Banias) */ -static struct cpufreq_frequency_table banias_1300[] = -{ - OP( 600, 956), - OP( 800, 1260), - OP(1000, 1292), - OP(1200, 1356), - OP(1300, 1388), - { .frequency = CPUFREQ_TABLE_END } -}; - -/* Intel Pentium M processor 1.40GHz (Banias) */ -static struct cpufreq_frequency_table banias_1400[] = -{ - OP( 600, 956), - OP( 800, 1180), - OP(1000, 1308), - OP(1200, 1436), - OP(1400, 1484), - { .frequency = CPUFREQ_TABLE_END } -}; - -/* Intel Pentium M processor 1.50GHz (Banias) */ -static struct cpufreq_frequency_table banias_1500[] = -{ - OP( 600, 956), - OP( 800, 1116), - OP(1000, 1228), - OP(1200, 1356), - OP(1400, 1452), - OP(1500, 1484), - { .frequency = CPUFREQ_TABLE_END } -}; - -/* Intel Pentium M processor 1.60GHz (Banias) */ -static struct cpufreq_frequency_table banias_1600[] = -{ - OP( 600, 956), - OP( 800, 1036), - OP(1000, 1164), - OP(1200, 1276), - OP(1400, 1420), - OP(1600, 1484), - { .frequency = CPUFREQ_TABLE_END } -}; - -/* Intel Pentium M processor 1.70GHz (Banias) */ -static struct cpufreq_frequency_table banias_1700[] = -{ - OP( 600, 956), - OP( 800, 1004), - OP(1000, 1116), - OP(1200, 1228), - OP(1400, 1308), - OP(1700, 1484), - { .frequency = CPUFREQ_TABLE_END } -}; -#undef OP - -#define _BANIAS(cpuid, max, name) \ -{ .cpu_id = cpuid, \ - .model_name = "Intel(R) Pentium(R) M processor " name "MHz", \ - .max_freq = (max)*1000, \ - .op_points = banias_##max, \ -} -#define BANIAS(max) _BANIAS(&cpu_ids[CPU_BANIAS], max, #max) - -/* CPU models, their operating frequency range, and freq/voltage - operating points */ -static struct cpu_model models[] = -{ - _BANIAS(&cpu_ids[CPU_BANIAS], 900, " 900"), - BANIAS(1000), - BANIAS(1100), - BANIAS(1200), - BANIAS(1300), - BANIAS(1400), - BANIAS(1500), - BANIAS(1600), - BANIAS(1700), - - /* NULL model_name is a wildcard */ - { &cpu_ids[CPU_DOTHAN_A1], NULL, 0, NULL }, - { &cpu_ids[CPU_DOTHAN_A2], NULL, 0, NULL }, - { &cpu_ids[CPU_DOTHAN_B0], NULL, 0, NULL }, - { &cpu_ids[CPU_MP4HT_D0], NULL, 0, NULL }, - { &cpu_ids[CPU_MP4HT_E0], NULL, 0, NULL }, - - { NULL, } -}; -#undef _BANIAS -#undef BANIAS - -static int centrino_cpu_init_table(struct cpufreq_policy *policy) -{ - struct cpuinfo_x86 *cpu = &cpu_data(policy->cpu); - struct cpu_model *model; - - for(model = models; model->cpu_id != NULL; model++) - if (centrino_verify_cpu_id(cpu, model->cpu_id) && - (model->model_name == NULL || - strcmp(cpu->x86_model_id, model->model_name) == 0)) - break; - - if (model->cpu_id == NULL) { - /* No match at all */ - dprintk("no support for CPU model \"%s\": " - "send /proc/cpuinfo to " MAINTAINER "\n", - cpu->x86_model_id); - return -ENOENT; - } - - if (model->op_points == NULL) { - /* Matched a non-match */ - dprintk("no table support for CPU model \"%s\"\n", - cpu->x86_model_id); - dprintk("try using the acpi-cpufreq driver\n"); - return -ENOENT; - } - - per_cpu(centrino_model, policy->cpu) = model; - - dprintk("found \"%s\": max frequency: %dkHz\n", - model->model_name, model->max_freq); - - return 0; -} - -#else -static inline int centrino_cpu_init_table(struct cpufreq_policy *policy) -{ - return -ENODEV; -} -#endif /* CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE */ - -static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, - const struct cpu_id *x) -{ - if ((c->x86 == x->x86) && - (c->x86_model == x->x86_model) && - (c->x86_mask == x->x86_mask)) - return 1; - return 0; -} - -/* To be called only after centrino_model is initialized */ -static unsigned extract_clock(unsigned msr, unsigned int cpu, int failsafe) -{ - int i; - - /* - * Extract clock in kHz from PERF_CTL value - * for centrino, as some DSDTs are buggy. - * Ideally, this can be done using the acpi_data structure. - */ - if ((per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_BANIAS]) || - (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_A1]) || - (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_B0])) { - msr = (msr >> 8) & 0xff; - return msr * 100000; - } - - if ((!per_cpu(centrino_model, cpu)) || - (!per_cpu(centrino_model, cpu)->op_points)) - return 0; - - msr &= 0xffff; - for (i = 0; - per_cpu(centrino_model, cpu)->op_points[i].frequency - != CPUFREQ_TABLE_END; - i++) { - if (msr == per_cpu(centrino_model, cpu)->op_points[i].index) - return per_cpu(centrino_model, cpu)-> - op_points[i].frequency; - } - if (failsafe) - return per_cpu(centrino_model, cpu)->op_points[i-1].frequency; - else - return 0; -} - -/* Return the current CPU frequency in kHz */ -static unsigned int get_cur_freq(unsigned int cpu) -{ - unsigned l, h; - unsigned clock_freq; - - rdmsr_on_cpu(cpu, MSR_IA32_PERF_STATUS, &l, &h); - clock_freq = extract_clock(l, cpu, 0); - - if (unlikely(clock_freq == 0)) { - /* - * On some CPUs, we can see transient MSR values (which are - * not present in _PSS), while CPU is doing some automatic - * P-state transition (like TM2). Get the last freq set - * in PERF_CTL. - */ - rdmsr_on_cpu(cpu, MSR_IA32_PERF_CTL, &l, &h); - clock_freq = extract_clock(l, cpu, 1); - } - return clock_freq; -} - - -static int centrino_cpu_init(struct cpufreq_policy *policy) -{ - struct cpuinfo_x86 *cpu = &cpu_data(policy->cpu); - unsigned freq; - unsigned l, h; - int ret; - int i; - - /* Only Intel makes Enhanced Speedstep-capable CPUs */ - if (cpu->x86_vendor != X86_VENDOR_INTEL || - !cpu_has(cpu, X86_FEATURE_EST)) - return -ENODEV; - - if (cpu_has(cpu, X86_FEATURE_CONSTANT_TSC)) - centrino_driver.flags |= CPUFREQ_CONST_LOOPS; - - if (policy->cpu != 0) - return -ENODEV; - - for (i = 0; i < N_IDS; i++) - if (centrino_verify_cpu_id(cpu, &cpu_ids[i])) - break; - - if (i != N_IDS) - per_cpu(centrino_cpu, policy->cpu) = &cpu_ids[i]; - - if (!per_cpu(centrino_cpu, policy->cpu)) { - dprintk("found unsupported CPU with " - "Enhanced SpeedStep: send /proc/cpuinfo to " - MAINTAINER "\n"); - return -ENODEV; - } - - if (centrino_cpu_init_table(policy)) { - return -ENODEV; - } - - /* Check to see if Enhanced SpeedStep is enabled, and try to - enable it if not. */ - rdmsr(MSR_IA32_MISC_ENABLE, l, h); - - if (!(l & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) { - l |= MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP; - dprintk("trying to enable Enhanced SpeedStep (%x)\n", l); - wrmsr(MSR_IA32_MISC_ENABLE, l, h); - - /* check to see if it stuck */ - rdmsr(MSR_IA32_MISC_ENABLE, l, h); - if (!(l & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) { - printk(KERN_INFO PFX - "couldn't enable Enhanced SpeedStep\n"); - return -ENODEV; - } - } - - freq = get_cur_freq(policy->cpu); - policy->cpuinfo.transition_latency = 10000; - /* 10uS transition latency */ - policy->cur = freq; - - dprintk("centrino_cpu_init: cur=%dkHz\n", policy->cur); - - ret = cpufreq_frequency_table_cpuinfo(policy, - per_cpu(centrino_model, policy->cpu)->op_points); - if (ret) - return (ret); - - cpufreq_frequency_table_get_attr( - per_cpu(centrino_model, policy->cpu)->op_points, policy->cpu); - - return 0; -} - -static int centrino_cpu_exit(struct cpufreq_policy *policy) -{ - unsigned int cpu = policy->cpu; - - if (!per_cpu(centrino_model, cpu)) - return -ENODEV; - - cpufreq_frequency_table_put_attr(cpu); - - per_cpu(centrino_model, cpu) = NULL; - - return 0; -} - -/** - * centrino_verify - verifies a new CPUFreq policy - * @policy: new policy - * - * Limit must be within this model's frequency range at least one - * border included. - */ -static int centrino_verify (struct cpufreq_policy *policy) -{ - return cpufreq_frequency_table_verify(policy, - per_cpu(centrino_model, policy->cpu)->op_points); -} - -/** - * centrino_setpolicy - set a new CPUFreq policy - * @policy: new policy - * @target_freq: the target frequency - * @relation: how that frequency relates to achieved frequency - * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H) - * - * Sets a new CPUFreq policy. - */ -static int centrino_target (struct cpufreq_policy *policy, - unsigned int target_freq, - unsigned int relation) -{ - unsigned int newstate = 0; - unsigned int msr, oldmsr = 0, h = 0, cpu = policy->cpu; - struct cpufreq_freqs freqs; - int retval = 0; - unsigned int j, k, first_cpu, tmp; - cpumask_var_t covered_cpus; - - if (unlikely(!zalloc_cpumask_var(&covered_cpus, GFP_KERNEL))) - return -ENOMEM; - - if (unlikely(per_cpu(centrino_model, cpu) == NULL)) { - retval = -ENODEV; - goto out; - } - - if (unlikely(cpufreq_frequency_table_target(policy, - per_cpu(centrino_model, cpu)->op_points, - target_freq, - relation, - &newstate))) { - retval = -EINVAL; - goto out; - } - - first_cpu = 1; - for_each_cpu(j, policy->cpus) { - int good_cpu; - - /* cpufreq holds the hotplug lock, so we are safe here */ - if (!cpu_online(j)) - continue; - - /* - * Support for SMP systems. - * Make sure we are running on CPU that wants to change freq - */ - if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) - good_cpu = cpumask_any_and(policy->cpus, - cpu_online_mask); - else - good_cpu = j; - - if (good_cpu >= nr_cpu_ids) { - dprintk("couldn't limit to CPUs in this domain\n"); - retval = -EAGAIN; - if (first_cpu) { - /* We haven't started the transition yet. */ - goto out; - } - break; - } - - msr = per_cpu(centrino_model, cpu)->op_points[newstate].index; - - if (first_cpu) { - rdmsr_on_cpu(good_cpu, MSR_IA32_PERF_CTL, &oldmsr, &h); - if (msr == (oldmsr & 0xffff)) { - dprintk("no change needed - msr was and needs " - "to be %x\n", oldmsr); - retval = 0; - goto out; - } - - freqs.old = extract_clock(oldmsr, cpu, 0); - freqs.new = extract_clock(msr, cpu, 0); - - dprintk("target=%dkHz old=%d new=%d msr=%04x\n", - target_freq, freqs.old, freqs.new, msr); - - for_each_cpu(k, policy->cpus) { - if (!cpu_online(k)) - continue; - freqs.cpu = k; - cpufreq_notify_transition(&freqs, - CPUFREQ_PRECHANGE); - } - - first_cpu = 0; - /* all but 16 LSB are reserved, treat them with care */ - oldmsr &= ~0xffff; - msr &= 0xffff; - oldmsr |= msr; - } - - wrmsr_on_cpu(good_cpu, MSR_IA32_PERF_CTL, oldmsr, h); - if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) - break; - - cpumask_set_cpu(j, covered_cpus); - } - - for_each_cpu(k, policy->cpus) { - if (!cpu_online(k)) - continue; - freqs.cpu = k; - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - } - - if (unlikely(retval)) { - /* - * We have failed halfway through the frequency change. - * We have sent callbacks to policy->cpus and - * MSRs have already been written on coverd_cpus. - * Best effort undo.. - */ - - for_each_cpu(j, covered_cpus) - wrmsr_on_cpu(j, MSR_IA32_PERF_CTL, oldmsr, h); - - tmp = freqs.new; - freqs.new = freqs.old; - freqs.old = tmp; - for_each_cpu(j, policy->cpus) { - if (!cpu_online(j)) - continue; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - } - } - retval = 0; - -out: - free_cpumask_var(covered_cpus); - return retval; -} - -static struct freq_attr* centrino_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - NULL, -}; - -static struct cpufreq_driver centrino_driver = { - .name = "centrino", /* should be speedstep-centrino, - but there's a 16 char limit */ - .init = centrino_cpu_init, - .exit = centrino_cpu_exit, - .verify = centrino_verify, - .target = centrino_target, - .get = get_cur_freq, - .attr = centrino_attr, - .owner = THIS_MODULE, -}; - - -/** - * centrino_init - initializes the Enhanced SpeedStep CPUFreq driver - * - * Initializes the Enhanced SpeedStep support. Returns -ENODEV on - * unsupported devices, -ENOENT if there's no voltage table for this - * particular CPU model, -EINVAL on problems during initiatization, - * and zero on success. - * - * This is quite picky. Not only does the CPU have to advertise the - * "est" flag in the cpuid capability flags, we look for a specific - * CPU model and stepping, and we need to have the exact model name in - * our voltage tables. That is, be paranoid about not releasing - * someone's valuable magic smoke. - */ -static int __init centrino_init(void) -{ - struct cpuinfo_x86 *cpu = &cpu_data(0); - - if (!cpu_has(cpu, X86_FEATURE_EST)) - return -ENODEV; - - return cpufreq_register_driver(¢rino_driver); -} - -static void __exit centrino_exit(void) -{ - cpufreq_unregister_driver(¢rino_driver); -} - -MODULE_AUTHOR ("Jeremy Fitzhardinge <jeremy@goop.org>"); -MODULE_DESCRIPTION ("Enhanced SpeedStep driver for Intel Pentium M processors."); -MODULE_LICENSE ("GPL"); - -late_initcall(centrino_init); -module_exit(centrino_exit); diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c deleted file mode 100644 index 561758e9518..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c +++ /dev/null @@ -1,452 +0,0 @@ -/* - * (C) 2001 Dave Jones, Arjan van de ven. - * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de> - * - * Licensed under the terms of the GNU GPL License version 2. - * Based upon reverse engineered information, and on Intel documentation - * for chipsets ICH2-M and ICH3-M. - * - * Many thanks to Ducrot Bruno for finding and fixing the last - * "missing link" for ICH2-M/ICH3-M support, and to Thomas Winkler - * for extensive testing. - * - * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous* - */ - - -/********************************************************************* - * SPEEDSTEP - DEFINITIONS * - *********************************************************************/ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/cpufreq.h> -#include <linux/pci.h> -#include <linux/sched.h> - -#include "speedstep-lib.h" - - -/* speedstep_chipset: - * It is necessary to know which chipset is used. As accesses to - * this device occur at various places in this module, we need a - * static struct pci_dev * pointing to that device. - */ -static struct pci_dev *speedstep_chipset_dev; - - -/* speedstep_processor - */ -static enum speedstep_processor speedstep_processor; - -static u32 pmbase; - -/* - * There are only two frequency states for each processor. Values - * are in kHz for the time being. - */ -static struct cpufreq_frequency_table speedstep_freqs[] = { - {SPEEDSTEP_HIGH, 0}, - {SPEEDSTEP_LOW, 0}, - {0, CPUFREQ_TABLE_END}, -}; - - -#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ - "speedstep-ich", msg) - - -/** - * speedstep_find_register - read the PMBASE address - * - * Returns: -ENODEV if no register could be found - */ -static int speedstep_find_register(void) -{ - if (!speedstep_chipset_dev) - return -ENODEV; - - /* get PMBASE */ - pci_read_config_dword(speedstep_chipset_dev, 0x40, &pmbase); - if (!(pmbase & 0x01)) { - printk(KERN_ERR "speedstep-ich: could not find speedstep register\n"); - return -ENODEV; - } - - pmbase &= 0xFFFFFFFE; - if (!pmbase) { - printk(KERN_ERR "speedstep-ich: could not find speedstep register\n"); - return -ENODEV; - } - - dprintk("pmbase is 0x%x\n", pmbase); - return 0; -} - -/** - * speedstep_set_state - set the SpeedStep state - * @state: new processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH) - * - * Tries to change the SpeedStep state. Can be called from - * smp_call_function_single. - */ -static void speedstep_set_state(unsigned int state) -{ - u8 pm2_blk; - u8 value; - unsigned long flags; - - if (state > 0x1) - return; - - /* Disable IRQs */ - local_irq_save(flags); - - /* read state */ - value = inb(pmbase + 0x50); - - dprintk("read at pmbase 0x%x + 0x50 returned 0x%x\n", pmbase, value); - - /* write new state */ - value &= 0xFE; - value |= state; - - dprintk("writing 0x%x to pmbase 0x%x + 0x50\n", value, pmbase); - - /* Disable bus master arbitration */ - pm2_blk = inb(pmbase + 0x20); - pm2_blk |= 0x01; - outb(pm2_blk, (pmbase + 0x20)); - - /* Actual transition */ - outb(value, (pmbase + 0x50)); - - /* Restore bus master arbitration */ - pm2_blk &= 0xfe; - outb(pm2_blk, (pmbase + 0x20)); - - /* check if transition was successful */ - value = inb(pmbase + 0x50); - - /* Enable IRQs */ - local_irq_restore(flags); - - dprintk("read at pmbase 0x%x + 0x50 returned 0x%x\n", pmbase, value); - - if (state == (value & 0x1)) - dprintk("change to %u MHz succeeded\n", - speedstep_get_frequency(speedstep_processor) / 1000); - else - printk(KERN_ERR "cpufreq: change failed - I/O error\n"); - - return; -} - -/* Wrapper for smp_call_function_single. */ -static void _speedstep_set_state(void *_state) -{ - speedstep_set_state(*(unsigned int *)_state); -} - -/** - * speedstep_activate - activate SpeedStep control in the chipset - * - * Tries to activate the SpeedStep status and control registers. - * Returns -EINVAL on an unsupported chipset, and zero on success. - */ -static int speedstep_activate(void) -{ - u16 value = 0; - - if (!speedstep_chipset_dev) - return -EINVAL; - - pci_read_config_word(speedstep_chipset_dev, 0x00A0, &value); - if (!(value & 0x08)) { - value |= 0x08; - dprintk("activating SpeedStep (TM) registers\n"); - pci_write_config_word(speedstep_chipset_dev, 0x00A0, value); - } - - return 0; -} - - -/** - * speedstep_detect_chipset - detect the Southbridge which contains SpeedStep logic - * - * Detects ICH2-M, ICH3-M and ICH4-M so far. The pci_dev points to - * the LPC bridge / PM module which contains all power-management - * functions. Returns the SPEEDSTEP_CHIPSET_-number for the detected - * chipset, or zero on failure. - */ -static unsigned int speedstep_detect_chipset(void) -{ - speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL, - PCI_DEVICE_ID_INTEL_82801DB_12, - PCI_ANY_ID, PCI_ANY_ID, - NULL); - if (speedstep_chipset_dev) - return 4; /* 4-M */ - - speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL, - PCI_DEVICE_ID_INTEL_82801CA_12, - PCI_ANY_ID, PCI_ANY_ID, - NULL); - if (speedstep_chipset_dev) - return 3; /* 3-M */ - - - speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL, - PCI_DEVICE_ID_INTEL_82801BA_10, - PCI_ANY_ID, PCI_ANY_ID, - NULL); - if (speedstep_chipset_dev) { - /* speedstep.c causes lockups on Dell Inspirons 8000 and - * 8100 which use a pretty old revision of the 82815 - * host brige. Abort on these systems. - */ - static struct pci_dev *hostbridge; - - hostbridge = pci_get_subsys(PCI_VENDOR_ID_INTEL, - PCI_DEVICE_ID_INTEL_82815_MC, - PCI_ANY_ID, PCI_ANY_ID, - NULL); - - if (!hostbridge) - return 2; /* 2-M */ - - if (hostbridge->revision < 5) { - dprintk("hostbridge does not support speedstep\n"); - speedstep_chipset_dev = NULL; - pci_dev_put(hostbridge); - return 0; - } - - pci_dev_put(hostbridge); - return 2; /* 2-M */ - } - - return 0; -} - -static void get_freq_data(void *_speed) -{ - unsigned int *speed = _speed; - - *speed = speedstep_get_frequency(speedstep_processor); -} - -static unsigned int speedstep_get(unsigned int cpu) -{ - unsigned int speed; - - /* You're supposed to ensure CPU is online. */ - if (smp_call_function_single(cpu, get_freq_data, &speed, 1) != 0) - BUG(); - - dprintk("detected %u kHz as current frequency\n", speed); - return speed; -} - -/** - * speedstep_target - set a new CPUFreq policy - * @policy: new policy - * @target_freq: the target frequency - * @relation: how that frequency relates to achieved frequency - * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H) - * - * Sets a new CPUFreq policy. - */ -static int speedstep_target(struct cpufreq_policy *policy, - unsigned int target_freq, - unsigned int relation) -{ - unsigned int newstate = 0, policy_cpu; - struct cpufreq_freqs freqs; - int i; - - if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0], - target_freq, relation, &newstate)) - return -EINVAL; - - policy_cpu = cpumask_any_and(policy->cpus, cpu_online_mask); - freqs.old = speedstep_get(policy_cpu); - freqs.new = speedstep_freqs[newstate].frequency; - freqs.cpu = policy->cpu; - - dprintk("transiting from %u to %u kHz\n", freqs.old, freqs.new); - - /* no transition necessary */ - if (freqs.old == freqs.new) - return 0; - - for_each_cpu(i, policy->cpus) { - freqs.cpu = i; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - } - - smp_call_function_single(policy_cpu, _speedstep_set_state, &newstate, - true); - - for_each_cpu(i, policy->cpus) { - freqs.cpu = i; - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - } - - return 0; -} - - -/** - * speedstep_verify - verifies a new CPUFreq policy - * @policy: new policy - * - * Limit must be within speedstep_low_freq and speedstep_high_freq, with - * at least one border included. - */ -static int speedstep_verify(struct cpufreq_policy *policy) -{ - return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]); -} - -struct get_freqs { - struct cpufreq_policy *policy; - int ret; -}; - -static void get_freqs_on_cpu(void *_get_freqs) -{ - struct get_freqs *get_freqs = _get_freqs; - - get_freqs->ret = - speedstep_get_freqs(speedstep_processor, - &speedstep_freqs[SPEEDSTEP_LOW].frequency, - &speedstep_freqs[SPEEDSTEP_HIGH].frequency, - &get_freqs->policy->cpuinfo.transition_latency, - &speedstep_set_state); -} - -static int speedstep_cpu_init(struct cpufreq_policy *policy) -{ - int result; - unsigned int policy_cpu, speed; - struct get_freqs gf; - - /* only run on CPU to be set, or on its sibling */ -#ifdef CONFIG_SMP - cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu)); -#endif - policy_cpu = cpumask_any_and(policy->cpus, cpu_online_mask); - - /* detect low and high frequency and transition latency */ - gf.policy = policy; - smp_call_function_single(policy_cpu, get_freqs_on_cpu, &gf, 1); - if (gf.ret) - return gf.ret; - - /* get current speed setting */ - speed = speedstep_get(policy_cpu); - if (!speed) - return -EIO; - - dprintk("currently at %s speed setting - %i MHz\n", - (speed == speedstep_freqs[SPEEDSTEP_LOW].frequency) - ? "low" : "high", - (speed / 1000)); - - /* cpuinfo and default policy values */ - policy->cur = speed; - - result = cpufreq_frequency_table_cpuinfo(policy, speedstep_freqs); - if (result) - return result; - - cpufreq_frequency_table_get_attr(speedstep_freqs, policy->cpu); - - return 0; -} - - -static int speedstep_cpu_exit(struct cpufreq_policy *policy) -{ - cpufreq_frequency_table_put_attr(policy->cpu); - return 0; -} - -static struct freq_attr *speedstep_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - NULL, -}; - - -static struct cpufreq_driver speedstep_driver = { - .name = "speedstep-ich", - .verify = speedstep_verify, - .target = speedstep_target, - .init = speedstep_cpu_init, - .exit = speedstep_cpu_exit, - .get = speedstep_get, - .owner = THIS_MODULE, - .attr = speedstep_attr, -}; - - -/** - * speedstep_init - initializes the SpeedStep CPUFreq driver - * - * Initializes the SpeedStep support. Returns -ENODEV on unsupported - * devices, -EINVAL on problems during initiatization, and zero on - * success. - */ -static int __init speedstep_init(void) -{ - /* detect processor */ - speedstep_processor = speedstep_detect_processor(); - if (!speedstep_processor) { - dprintk("Intel(R) SpeedStep(TM) capable processor " - "not found\n"); - return -ENODEV; - } - - /* detect chipset */ - if (!speedstep_detect_chipset()) { - dprintk("Intel(R) SpeedStep(TM) for this chipset not " - "(yet) available.\n"); - return -ENODEV; - } - - /* activate speedstep support */ - if (speedstep_activate()) { - pci_dev_put(speedstep_chipset_dev); - return -EINVAL; - } - - if (speedstep_find_register()) - return -ENODEV; - - return cpufreq_register_driver(&speedstep_driver); -} - - -/** - * speedstep_exit - unregisters SpeedStep support - * - * Unregisters SpeedStep support. - */ -static void __exit speedstep_exit(void) -{ - pci_dev_put(speedstep_chipset_dev); - cpufreq_unregister_driver(&speedstep_driver); -} - - -MODULE_AUTHOR("Dave Jones <davej@redhat.com>, " - "Dominik Brodowski <linux@brodo.de>"); -MODULE_DESCRIPTION("Speedstep driver for Intel mobile processors on chipsets " - "with ICH-M southbridges."); -MODULE_LICENSE("GPL"); - -module_init(speedstep_init); -module_exit(speedstep_exit); diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c deleted file mode 100644 index a94ec6be69f..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c +++ /dev/null @@ -1,481 +0,0 @@ -/* - * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de> - * - * Licensed under the terms of the GNU GPL License version 2. - * - * Library for common functions for Intel SpeedStep v.1 and v.2 support - * - * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous* - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/moduleparam.h> -#include <linux/init.h> -#include <linux/cpufreq.h> - -#include <asm/msr.h> -#include <asm/tsc.h> -#include "speedstep-lib.h" - -#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ - "speedstep-lib", msg) - -#define PFX "speedstep-lib: " - -#ifdef CONFIG_X86_SPEEDSTEP_RELAXED_CAP_CHECK -static int relaxed_check; -#else -#define relaxed_check 0 -#endif - -/********************************************************************* - * GET PROCESSOR CORE SPEED IN KHZ * - *********************************************************************/ - -static unsigned int pentium3_get_frequency(enum speedstep_processor processor) -{ - /* See table 14 of p3_ds.pdf and table 22 of 29834003.pdf */ - struct { - unsigned int ratio; /* Frequency Multiplier (x10) */ - u8 bitmap; /* power on configuration bits - [27, 25:22] (in MSR 0x2a) */ - } msr_decode_mult[] = { - { 30, 0x01 }, - { 35, 0x05 }, - { 40, 0x02 }, - { 45, 0x06 }, - { 50, 0x00 }, - { 55, 0x04 }, - { 60, 0x0b }, - { 65, 0x0f }, - { 70, 0x09 }, - { 75, 0x0d }, - { 80, 0x0a }, - { 85, 0x26 }, - { 90, 0x20 }, - { 100, 0x2b }, - { 0, 0xff } /* error or unknown value */ - }; - - /* PIII(-M) FSB settings: see table b1-b of 24547206.pdf */ - struct { - unsigned int value; /* Front Side Bus speed in MHz */ - u8 bitmap; /* power on configuration bits [18: 19] - (in MSR 0x2a) */ - } msr_decode_fsb[] = { - { 66, 0x0 }, - { 100, 0x2 }, - { 133, 0x1 }, - { 0, 0xff} - }; - - u32 msr_lo, msr_tmp; - int i = 0, j = 0; - - /* read MSR 0x2a - we only need the low 32 bits */ - rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp); - dprintk("P3 - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", msr_lo, msr_tmp); - msr_tmp = msr_lo; - - /* decode the FSB */ - msr_tmp &= 0x00c0000; - msr_tmp >>= 18; - while (msr_tmp != msr_decode_fsb[i].bitmap) { - if (msr_decode_fsb[i].bitmap == 0xff) - return 0; - i++; - } - - /* decode the multiplier */ - if (processor == SPEEDSTEP_CPU_PIII_C_EARLY) { - dprintk("workaround for early PIIIs\n"); - msr_lo &= 0x03c00000; - } else - msr_lo &= 0x0bc00000; - msr_lo >>= 22; - while (msr_lo != msr_decode_mult[j].bitmap) { - if (msr_decode_mult[j].bitmap == 0xff) - return 0; - j++; - } - - dprintk("speed is %u\n", - (msr_decode_mult[j].ratio * msr_decode_fsb[i].value * 100)); - - return msr_decode_mult[j].ratio * msr_decode_fsb[i].value * 100; -} - - -static unsigned int pentiumM_get_frequency(void) -{ - u32 msr_lo, msr_tmp; - - rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp); - dprintk("PM - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", msr_lo, msr_tmp); - - /* see table B-2 of 24547212.pdf */ - if (msr_lo & 0x00040000) { - printk(KERN_DEBUG PFX "PM - invalid FSB: 0x%x 0x%x\n", - msr_lo, msr_tmp); - return 0; - } - - msr_tmp = (msr_lo >> 22) & 0x1f; - dprintk("bits 22-26 are 0x%x, speed is %u\n", - msr_tmp, (msr_tmp * 100 * 1000)); - - return msr_tmp * 100 * 1000; -} - -static unsigned int pentium_core_get_frequency(void) -{ - u32 fsb = 0; - u32 msr_lo, msr_tmp; - int ret; - - rdmsr(MSR_FSB_FREQ, msr_lo, msr_tmp); - /* see table B-2 of 25366920.pdf */ - switch (msr_lo & 0x07) { - case 5: - fsb = 100000; - break; - case 1: - fsb = 133333; - break; - case 3: - fsb = 166667; - break; - case 2: - fsb = 200000; - break; - case 0: - fsb = 266667; - break; - case 4: - fsb = 333333; - break; - default: - printk(KERN_ERR "PCORE - MSR_FSB_FREQ undefined value"); - } - - rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp); - dprintk("PCORE - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", - msr_lo, msr_tmp); - - msr_tmp = (msr_lo >> 22) & 0x1f; - dprintk("bits 22-26 are 0x%x, speed is %u\n", - msr_tmp, (msr_tmp * fsb)); - - ret = (msr_tmp * fsb); - return ret; -} - - -static unsigned int pentium4_get_frequency(void) -{ - struct cpuinfo_x86 *c = &boot_cpu_data; - u32 msr_lo, msr_hi, mult; - unsigned int fsb = 0; - unsigned int ret; - u8 fsb_code; - - /* Pentium 4 Model 0 and 1 do not have the Core Clock Frequency - * to System Bus Frequency Ratio Field in the Processor Frequency - * Configuration Register of the MSR. Therefore the current - * frequency cannot be calculated and has to be measured. - */ - if (c->x86_model < 2) - return cpu_khz; - - rdmsr(0x2c, msr_lo, msr_hi); - - dprintk("P4 - MSR_EBC_FREQUENCY_ID: 0x%x 0x%x\n", msr_lo, msr_hi); - - /* decode the FSB: see IA-32 Intel (C) Architecture Software - * Developer's Manual, Volume 3: System Prgramming Guide, - * revision #12 in Table B-1: MSRs in the Pentium 4 and - * Intel Xeon Processors, on page B-4 and B-5. - */ - fsb_code = (msr_lo >> 16) & 0x7; - switch (fsb_code) { - case 0: - fsb = 100 * 1000; - break; - case 1: - fsb = 13333 * 10; - break; - case 2: - fsb = 200 * 1000; - break; - } - - if (!fsb) - printk(KERN_DEBUG PFX "couldn't detect FSB speed. " - "Please send an e-mail to <linux@brodo.de>\n"); - - /* Multiplier. */ - mult = msr_lo >> 24; - - dprintk("P4 - FSB %u kHz; Multiplier %u; Speed %u kHz\n", - fsb, mult, (fsb * mult)); - - ret = (fsb * mult); - return ret; -} - - -/* Warning: may get called from smp_call_function_single. */ -unsigned int speedstep_get_frequency(enum speedstep_processor processor) -{ - switch (processor) { - case SPEEDSTEP_CPU_PCORE: - return pentium_core_get_frequency(); - case SPEEDSTEP_CPU_PM: - return pentiumM_get_frequency(); - case SPEEDSTEP_CPU_P4D: - case SPEEDSTEP_CPU_P4M: - return pentium4_get_frequency(); - case SPEEDSTEP_CPU_PIII_T: - case SPEEDSTEP_CPU_PIII_C: - case SPEEDSTEP_CPU_PIII_C_EARLY: - return pentium3_get_frequency(processor); - default: - return 0; - }; - return 0; -} -EXPORT_SYMBOL_GPL(speedstep_get_frequency); - - -/********************************************************************* - * DETECT SPEEDSTEP-CAPABLE PROCESSOR * - *********************************************************************/ - -unsigned int speedstep_detect_processor(void) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - u32 ebx, msr_lo, msr_hi; - - dprintk("x86: %x, model: %x\n", c->x86, c->x86_model); - - if ((c->x86_vendor != X86_VENDOR_INTEL) || - ((c->x86 != 6) && (c->x86 != 0xF))) - return 0; - - if (c->x86 == 0xF) { - /* Intel Mobile Pentium 4-M - * or Intel Mobile Pentium 4 with 533 MHz FSB */ - if (c->x86_model != 2) - return 0; - - ebx = cpuid_ebx(0x00000001); - ebx &= 0x000000FF; - - dprintk("ebx value is %x, x86_mask is %x\n", ebx, c->x86_mask); - - switch (c->x86_mask) { - case 4: - /* - * B-stepping [M-P4-M] - * sample has ebx = 0x0f, production has 0x0e. - */ - if ((ebx == 0x0e) || (ebx == 0x0f)) - return SPEEDSTEP_CPU_P4M; - break; - case 7: - /* - * C-stepping [M-P4-M] - * needs to have ebx=0x0e, else it's a celeron: - * cf. 25130917.pdf / page 7, footnote 5 even - * though 25072120.pdf / page 7 doesn't say - * samples are only of B-stepping... - */ - if (ebx == 0x0e) - return SPEEDSTEP_CPU_P4M; - break; - case 9: - /* - * D-stepping [M-P4-M or M-P4/533] - * - * this is totally strange: CPUID 0x0F29 is - * used by M-P4-M, M-P4/533 and(!) Celeron CPUs. - * The latter need to be sorted out as they don't - * support speedstep. - * Celerons with CPUID 0x0F29 may have either - * ebx=0x8 or 0xf -- 25130917.pdf doesn't say anything - * specific. - * M-P4-Ms may have either ebx=0xe or 0xf [see above] - * M-P4/533 have either ebx=0xe or 0xf. [25317607.pdf] - * also, M-P4M HTs have ebx=0x8, too - * For now, they are distinguished by the model_id - * string - */ - if ((ebx == 0x0e) || - (strstr(c->x86_model_id, - "Mobile Intel(R) Pentium(R) 4") != NULL)) - return SPEEDSTEP_CPU_P4M; - break; - default: - break; - } - return 0; - } - - switch (c->x86_model) { - case 0x0B: /* Intel PIII [Tualatin] */ - /* cpuid_ebx(1) is 0x04 for desktop PIII, - * 0x06 for mobile PIII-M */ - ebx = cpuid_ebx(0x00000001); - dprintk("ebx is %x\n", ebx); - - ebx &= 0x000000FF; - - if (ebx != 0x06) - return 0; - - /* So far all PIII-M processors support SpeedStep. See - * Intel's 24540640.pdf of June 2003 - */ - return SPEEDSTEP_CPU_PIII_T; - - case 0x08: /* Intel PIII [Coppermine] */ - - /* all mobile PIII Coppermines have FSB 100 MHz - * ==> sort out a few desktop PIIIs. */ - rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_hi); - dprintk("Coppermine: MSR_IA32_EBL_CR_POWERON is 0x%x, 0x%x\n", - msr_lo, msr_hi); - msr_lo &= 0x00c0000; - if (msr_lo != 0x0080000) - return 0; - - /* - * If the processor is a mobile version, - * platform ID has bit 50 set - * it has SpeedStep technology if either - * bit 56 or 57 is set - */ - rdmsr(MSR_IA32_PLATFORM_ID, msr_lo, msr_hi); - dprintk("Coppermine: MSR_IA32_PLATFORM ID is 0x%x, 0x%x\n", - msr_lo, msr_hi); - if ((msr_hi & (1<<18)) && - (relaxed_check ? 1 : (msr_hi & (3<<24)))) { - if (c->x86_mask == 0x01) { - dprintk("early PIII version\n"); - return SPEEDSTEP_CPU_PIII_C_EARLY; - } else - return SPEEDSTEP_CPU_PIII_C; - } - - default: - return 0; - } -} -EXPORT_SYMBOL_GPL(speedstep_detect_processor); - - -/********************************************************************* - * DETECT SPEEDSTEP SPEEDS * - *********************************************************************/ - -unsigned int speedstep_get_freqs(enum speedstep_processor processor, - unsigned int *low_speed, - unsigned int *high_speed, - unsigned int *transition_latency, - void (*set_state) (unsigned int state)) -{ - unsigned int prev_speed; - unsigned int ret = 0; - unsigned long flags; - struct timeval tv1, tv2; - - if ((!processor) || (!low_speed) || (!high_speed) || (!set_state)) - return -EINVAL; - - dprintk("trying to determine both speeds\n"); - - /* get current speed */ - prev_speed = speedstep_get_frequency(processor); - if (!prev_speed) - return -EIO; - - dprintk("previous speed is %u\n", prev_speed); - - local_irq_save(flags); - - /* switch to low state */ - set_state(SPEEDSTEP_LOW); - *low_speed = speedstep_get_frequency(processor); - if (!*low_speed) { - ret = -EIO; - goto out; - } - - dprintk("low speed is %u\n", *low_speed); - - /* start latency measurement */ - if (transition_latency) - do_gettimeofday(&tv1); - - /* switch to high state */ - set_state(SPEEDSTEP_HIGH); - - /* end latency measurement */ - if (transition_latency) - do_gettimeofday(&tv2); - - *high_speed = speedstep_get_frequency(processor); - if (!*high_speed) { - ret = -EIO; - goto out; - } - - dprintk("high speed is %u\n", *high_speed); - - if (*low_speed == *high_speed) { - ret = -ENODEV; - goto out; - } - - /* switch to previous state, if necessary */ - if (*high_speed != prev_speed) - set_state(SPEEDSTEP_LOW); - - if (transition_latency) { - *transition_latency = (tv2.tv_sec - tv1.tv_sec) * USEC_PER_SEC + - tv2.tv_usec - tv1.tv_usec; - dprintk("transition latency is %u uSec\n", *transition_latency); - - /* convert uSec to nSec and add 20% for safety reasons */ - *transition_latency *= 1200; - - /* check if the latency measurement is too high or too low - * and set it to a safe value (500uSec) in that case - */ - if (*transition_latency > 10000000 || - *transition_latency < 50000) { - printk(KERN_WARNING PFX "frequency transition " - "measured seems out of range (%u " - "nSec), falling back to a safe one of" - "%u nSec.\n", - *transition_latency, 500000); - *transition_latency = 500000; - } - } - -out: - local_irq_restore(flags); - return ret; -} -EXPORT_SYMBOL_GPL(speedstep_get_freqs); - -#ifdef CONFIG_X86_SPEEDSTEP_RELAXED_CAP_CHECK -module_param(relaxed_check, int, 0444); -MODULE_PARM_DESC(relaxed_check, - "Don't do all checks for speedstep capability."); -#endif - -MODULE_AUTHOR("Dominik Brodowski <linux@brodo.de>"); -MODULE_DESCRIPTION("Library for Intel SpeedStep 1 or 2 cpufreq drivers."); -MODULE_LICENSE("GPL"); diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h deleted file mode 100644 index 70d9cea1219..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h +++ /dev/null @@ -1,49 +0,0 @@ -/* - * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de> - * - * Licensed under the terms of the GNU GPL License version 2. - * - * Library for common functions for Intel SpeedStep v.1 and v.2 support - * - * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous* - */ - - - -/* processors */ -enum speedstep_processor { - SPEEDSTEP_CPU_PIII_C_EARLY = 0x00000001, /* Coppermine core */ - SPEEDSTEP_CPU_PIII_C = 0x00000002, /* Coppermine core */ - SPEEDSTEP_CPU_PIII_T = 0x00000003, /* Tualatin core */ - SPEEDSTEP_CPU_P4M = 0x00000004, /* P4-M */ -/* the following processors are not speedstep-capable and are not auto-detected - * in speedstep_detect_processor(). However, their speed can be detected using - * the speedstep_get_frequency() call. */ - SPEEDSTEP_CPU_PM = 0xFFFFFF03, /* Pentium M */ - SPEEDSTEP_CPU_P4D = 0xFFFFFF04, /* desktop P4 */ - SPEEDSTEP_CPU_PCORE = 0xFFFFFF05, /* Core */ -}; - -/* speedstep states -- only two of them */ - -#define SPEEDSTEP_HIGH 0x00000000 -#define SPEEDSTEP_LOW 0x00000001 - - -/* detect a speedstep-capable processor */ -extern enum speedstep_processor speedstep_detect_processor(void); - -/* detect the current speed (in khz) of the processor */ -extern unsigned int speedstep_get_frequency(enum speedstep_processor processor); - - -/* detect the low and high speeds of the processor. The callback - * set_state"'s first argument is either SPEEDSTEP_HIGH or - * SPEEDSTEP_LOW; the second argument is zero so that no - * cpufreq_notify_transition calls are initiated. - */ -extern unsigned int speedstep_get_freqs(enum speedstep_processor processor, - unsigned int *low_speed, - unsigned int *high_speed, - unsigned int *transition_latency, - void (*set_state) (unsigned int state)); diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c deleted file mode 100644 index 8abd869baab..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c +++ /dev/null @@ -1,467 +0,0 @@ -/* - * Intel SpeedStep SMI driver. - * - * (C) 2003 Hiroshi Miura <miura@da-cha.org> - * - * Licensed under the terms of the GNU GPL License version 2. - * - */ - - -/********************************************************************* - * SPEEDSTEP - DEFINITIONS * - *********************************************************************/ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/moduleparam.h> -#include <linux/init.h> -#include <linux/cpufreq.h> -#include <linux/delay.h> -#include <linux/io.h> -#include <asm/ist.h> - -#include "speedstep-lib.h" - -/* speedstep system management interface port/command. - * - * These parameters are got from IST-SMI BIOS call. - * If user gives it, these are used. - * - */ -static int smi_port; -static int smi_cmd; -static unsigned int smi_sig; - -/* info about the processor */ -static enum speedstep_processor speedstep_processor; - -/* - * There are only two frequency states for each processor. Values - * are in kHz for the time being. - */ -static struct cpufreq_frequency_table speedstep_freqs[] = { - {SPEEDSTEP_HIGH, 0}, - {SPEEDSTEP_LOW, 0}, - {0, CPUFREQ_TABLE_END}, -}; - -#define GET_SPEEDSTEP_OWNER 0 -#define GET_SPEEDSTEP_STATE 1 -#define SET_SPEEDSTEP_STATE 2 -#define GET_SPEEDSTEP_FREQS 4 - -/* how often shall the SMI call be tried if it failed, e.g. because - * of DMA activity going on? */ -#define SMI_TRIES 5 - -#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ - "speedstep-smi", msg) - -/** - * speedstep_smi_ownership - */ -static int speedstep_smi_ownership(void) -{ - u32 command, result, magic, dummy; - u32 function = GET_SPEEDSTEP_OWNER; - unsigned char magic_data[] = "Copyright (c) 1999 Intel Corporation"; - - command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff); - magic = virt_to_phys(magic_data); - - dprintk("trying to obtain ownership with command %x at port %x\n", - command, smi_port); - - __asm__ __volatile__( - "push %%ebp\n" - "out %%al, (%%dx)\n" - "pop %%ebp\n" - : "=D" (result), - "=a" (dummy), "=b" (dummy), "=c" (dummy), "=d" (dummy), - "=S" (dummy) - : "a" (command), "b" (function), "c" (0), "d" (smi_port), - "D" (0), "S" (magic) - : "memory" - ); - - dprintk("result is %x\n", result); - - return result; -} - -/** - * speedstep_smi_get_freqs - get SpeedStep preferred & current freq. - * @low: the low frequency value is placed here - * @high: the high frequency value is placed here - * - * Only available on later SpeedStep-enabled systems, returns false results or - * even hangs [cf. bugme.osdl.org # 1422] on earlier systems. Empirical testing - * shows that the latter occurs if !(ist_info.event & 0xFFFF). - */ -static int speedstep_smi_get_freqs(unsigned int *low, unsigned int *high) -{ - u32 command, result = 0, edi, high_mhz, low_mhz, dummy; - u32 state = 0; - u32 function = GET_SPEEDSTEP_FREQS; - - if (!(ist_info.event & 0xFFFF)) { - dprintk("bug #1422 -- can't read freqs from BIOS\n"); - return -ENODEV; - } - - command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff); - - dprintk("trying to determine frequencies with command %x at port %x\n", - command, smi_port); - - __asm__ __volatile__( - "push %%ebp\n" - "out %%al, (%%dx)\n" - "pop %%ebp" - : "=a" (result), - "=b" (high_mhz), - "=c" (low_mhz), - "=d" (state), "=D" (edi), "=S" (dummy) - : "a" (command), - "b" (function), - "c" (state), - "d" (smi_port), "S" (0), "D" (0) - ); - - dprintk("result %x, low_freq %u, high_freq %u\n", - result, low_mhz, high_mhz); - - /* abort if results are obviously incorrect... */ - if ((high_mhz + low_mhz) < 600) - return -EINVAL; - - *high = high_mhz * 1000; - *low = low_mhz * 1000; - - return result; -} - -/** - * speedstep_get_state - set the SpeedStep state - * @state: processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH) - * - */ -static int speedstep_get_state(void) -{ - u32 function = GET_SPEEDSTEP_STATE; - u32 result, state, edi, command, dummy; - - command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff); - - dprintk("trying to determine current setting with command %x " - "at port %x\n", command, smi_port); - - __asm__ __volatile__( - "push %%ebp\n" - "out %%al, (%%dx)\n" - "pop %%ebp\n" - : "=a" (result), - "=b" (state), "=D" (edi), - "=c" (dummy), "=d" (dummy), "=S" (dummy) - : "a" (command), "b" (function), "c" (0), - "d" (smi_port), "S" (0), "D" (0) - ); - - dprintk("state is %x, result is %x\n", state, result); - - return state & 1; -} - - -/** - * speedstep_set_state - set the SpeedStep state - * @state: new processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH) - * - */ -static void speedstep_set_state(unsigned int state) -{ - unsigned int result = 0, command, new_state, dummy; - unsigned long flags; - unsigned int function = SET_SPEEDSTEP_STATE; - unsigned int retry = 0; - - if (state > 0x1) - return; - - /* Disable IRQs */ - local_irq_save(flags); - - command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff); - - dprintk("trying to set frequency to state %u " - "with command %x at port %x\n", - state, command, smi_port); - - do { - if (retry) { - dprintk("retry %u, previous result %u, waiting...\n", - retry, result); - mdelay(retry * 50); - } - retry++; - __asm__ __volatile__( - "push %%ebp\n" - "out %%al, (%%dx)\n" - "pop %%ebp" - : "=b" (new_state), "=D" (result), - "=c" (dummy), "=a" (dummy), - "=d" (dummy), "=S" (dummy) - : "a" (command), "b" (function), "c" (state), - "d" (smi_port), "S" (0), "D" (0) - ); - } while ((new_state != state) && (retry <= SMI_TRIES)); - - /* enable IRQs */ - local_irq_restore(flags); - - if (new_state == state) - dprintk("change to %u MHz succeeded after %u tries " - "with result %u\n", - (speedstep_freqs[new_state].frequency / 1000), - retry, result); - else - printk(KERN_ERR "cpufreq: change to state %u " - "failed with new_state %u and result %u\n", - state, new_state, result); - - return; -} - - -/** - * speedstep_target - set a new CPUFreq policy - * @policy: new policy - * @target_freq: new freq - * @relation: - * - * Sets a new CPUFreq policy/freq. - */ -static int speedstep_target(struct cpufreq_policy *policy, - unsigned int target_freq, unsigned int relation) -{ - unsigned int newstate = 0; - struct cpufreq_freqs freqs; - - if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0], - target_freq, relation, &newstate)) - return -EINVAL; - - freqs.old = speedstep_freqs[speedstep_get_state()].frequency; - freqs.new = speedstep_freqs[newstate].frequency; - freqs.cpu = 0; /* speedstep.c is UP only driver */ - - if (freqs.old == freqs.new) - return 0; - - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - speedstep_set_state(newstate); - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - - return 0; -} - - -/** - * speedstep_verify - verifies a new CPUFreq policy - * @policy: new policy - * - * Limit must be within speedstep_low_freq and speedstep_high_freq, with - * at least one border included. - */ -static int speedstep_verify(struct cpufreq_policy *policy) -{ - return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]); -} - - -static int speedstep_cpu_init(struct cpufreq_policy *policy) -{ - int result; - unsigned int speed, state; - unsigned int *low, *high; - - /* capability check */ - if (policy->cpu != 0) - return -ENODEV; - - result = speedstep_smi_ownership(); - if (result) { - dprintk("fails in aquiring ownership of a SMI interface.\n"); - return -EINVAL; - } - - /* detect low and high frequency */ - low = &speedstep_freqs[SPEEDSTEP_LOW].frequency; - high = &speedstep_freqs[SPEEDSTEP_HIGH].frequency; - - result = speedstep_smi_get_freqs(low, high); - if (result) { - /* fall back to speedstep_lib.c dection mechanism: - * try both states out */ - dprintk("could not detect low and high frequencies " - "by SMI call.\n"); - result = speedstep_get_freqs(speedstep_processor, - low, high, - NULL, - &speedstep_set_state); - - if (result) { - dprintk("could not detect two different speeds" - " -- aborting.\n"); - return result; - } else - dprintk("workaround worked.\n"); - } - - /* get current speed setting */ - state = speedstep_get_state(); - speed = speedstep_freqs[state].frequency; - - dprintk("currently at %s speed setting - %i MHz\n", - (speed == speedstep_freqs[SPEEDSTEP_LOW].frequency) - ? "low" : "high", - (speed / 1000)); - - /* cpuinfo and default policy values */ - policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; - policy->cur = speed; - - result = cpufreq_frequency_table_cpuinfo(policy, speedstep_freqs); - if (result) - return result; - - cpufreq_frequency_table_get_attr(speedstep_freqs, policy->cpu); - - return 0; -} - -static int speedstep_cpu_exit(struct cpufreq_policy *policy) -{ - cpufreq_frequency_table_put_attr(policy->cpu); - return 0; -} - -static unsigned int speedstep_get(unsigned int cpu) -{ - if (cpu) - return -ENODEV; - return speedstep_get_frequency(speedstep_processor); -} - - -static int speedstep_resume(struct cpufreq_policy *policy) -{ - int result = speedstep_smi_ownership(); - - if (result) - dprintk("fails in re-aquiring ownership of a SMI interface.\n"); - - return result; -} - -static struct freq_attr *speedstep_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - NULL, -}; - -static struct cpufreq_driver speedstep_driver = { - .name = "speedstep-smi", - .verify = speedstep_verify, - .target = speedstep_target, - .init = speedstep_cpu_init, - .exit = speedstep_cpu_exit, - .get = speedstep_get, - .resume = speedstep_resume, - .owner = THIS_MODULE, - .attr = speedstep_attr, -}; - -/** - * speedstep_init - initializes the SpeedStep CPUFreq driver - * - * Initializes the SpeedStep support. Returns -ENODEV on unsupported - * BIOS, -EINVAL on problems during initiatization, and zero on - * success. - */ -static int __init speedstep_init(void) -{ - speedstep_processor = speedstep_detect_processor(); - - switch (speedstep_processor) { - case SPEEDSTEP_CPU_PIII_T: - case SPEEDSTEP_CPU_PIII_C: - case SPEEDSTEP_CPU_PIII_C_EARLY: - break; - default: - speedstep_processor = 0; - } - - if (!speedstep_processor) { - dprintk("No supported Intel CPU detected.\n"); - return -ENODEV; - } - - dprintk("signature:0x%.8lx, command:0x%.8lx, " - "event:0x%.8lx, perf_level:0x%.8lx.\n", - ist_info.signature, ist_info.command, - ist_info.event, ist_info.perf_level); - - /* Error if no IST-SMI BIOS or no PARM - sig= 'ISGE' aka 'Intel Speedstep Gate E' */ - if ((ist_info.signature != 0x47534943) && ( - (smi_port == 0) || (smi_cmd == 0))) - return -ENODEV; - - if (smi_sig == 1) - smi_sig = 0x47534943; - else - smi_sig = ist_info.signature; - - /* setup smi_port from MODLULE_PARM or BIOS */ - if ((smi_port > 0xff) || (smi_port < 0)) - return -EINVAL; - else if (smi_port == 0) - smi_port = ist_info.command & 0xff; - - if ((smi_cmd > 0xff) || (smi_cmd < 0)) - return -EINVAL; - else if (smi_cmd == 0) - smi_cmd = (ist_info.command >> 16) & 0xff; - - return cpufreq_register_driver(&speedstep_driver); -} - - -/** - * speedstep_exit - unregisters SpeedStep support - * - * Unregisters SpeedStep support. - */ -static void __exit speedstep_exit(void) -{ - cpufreq_unregister_driver(&speedstep_driver); -} - -module_param(smi_port, int, 0444); -module_param(smi_cmd, int, 0444); -module_param(smi_sig, uint, 0444); - -MODULE_PARM_DESC(smi_port, "Override the BIOS-given IST port with this value " - "-- Intel's default setting is 0xb2"); -MODULE_PARM_DESC(smi_cmd, "Override the BIOS-given IST command with this value " - "-- Intel's default setting is 0x82"); -MODULE_PARM_DESC(smi_sig, "Set to 1 to fake the IST signature when using the " - "SMI interface."); - -MODULE_AUTHOR("Hiroshi Miura"); -MODULE_DESCRIPTION("Speedstep driver for IST applet SMI interface."); -MODULE_LICENSE("GPL"); - -module_init(speedstep_init); -module_exit(speedstep_exit); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index d16c2c53d6b..1edf5ba4fb2 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -29,10 +29,10 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) { + u64 misc_enable; + /* Unmask CPUID levels if masked: */ if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) { - u64 misc_enable; - rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); if (misc_enable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID) { @@ -118,8 +118,6 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) * (model 2) with the same problem. */ if (c->x86 == 15) { - u64 misc_enable; - rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) { @@ -130,6 +128,19 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) } } #endif + + /* + * If fast string is not enabled in IA32_MISC_ENABLE for any reason, + * clear the fast string and enhanced fast string CPU capabilities. + */ + if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) { + rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); + if (!(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) { + printk(KERN_INFO "Disabled fast string operations\n"); + setup_clear_cpu_cap(X86_FEATURE_REP_GOOD); + setup_clear_cpu_cap(X86_FEATURE_ERMS); + } + } } #ifdef CONFIG_X86_32 @@ -276,14 +287,13 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) { -#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) +#ifdef CONFIG_NUMA unsigned node; int cpu = smp_processor_id(); - int apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid; /* Don't do the funky fallback heuristics the AMD version employs for now. */ - node = apicid_to_node[apicid]; + node = numa_cpu_node(cpu); if (node == NUMA_NO_NODE || !node_online(node)) { /* reuse the value from init_cpu_to_node() */ node = cpu_to_node(cpu); @@ -401,12 +411,10 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) switch (c->x86_model) { case 5: - if (c->x86_mask == 0) { - if (l2 == 0) - p = "Celeron (Covington)"; - else if (l2 == 256) - p = "Mobile Pentium II (Dixon)"; - } + if (l2 == 0) + p = "Celeron (Covington)"; + else if (l2 == 256) + p = "Mobile Pentium II (Dixon)"; break; case 6: diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 12cd823c8d0..c105c533ed9 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -45,6 +45,7 @@ static const struct _cache_table __cpuinitconst cache_table[] = { 0x0a, LVL_1_DATA, 8 }, /* 2 way set assoc, 32 byte line size */ { 0x0c, LVL_1_DATA, 16 }, /* 4-way set assoc, 32 byte line size */ { 0x0d, LVL_1_DATA, 16 }, /* 4-way set assoc, 64 byte line size */ + { 0x0e, LVL_1_DATA, 24 }, /* 6-way set assoc, 64 byte line size */ { 0x21, LVL_2, 256 }, /* 8-way set assoc, 64 byte line size */ { 0x22, LVL_3, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */ { 0x23, LVL_3, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */ @@ -66,6 +67,7 @@ static const struct _cache_table __cpuinitconst cache_table[] = { 0x45, LVL_2, MB(2) }, /* 4-way set assoc, 32 byte line size */ { 0x46, LVL_3, MB(4) }, /* 4-way set assoc, 64 byte line size */ { 0x47, LVL_3, MB(8) }, /* 8-way set assoc, 64 byte line size */ + { 0x48, LVL_2, MB(3) }, /* 12-way set assoc, 64 byte line size */ { 0x49, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */ { 0x4a, LVL_3, MB(6) }, /* 12-way set assoc, 64 byte line size */ { 0x4b, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */ @@ -87,6 +89,7 @@ static const struct _cache_table __cpuinitconst cache_table[] = { 0x7c, LVL_2, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */ { 0x7d, LVL_2, MB(2) }, /* 8-way set assoc, 64 byte line size */ { 0x7f, LVL_2, 512 }, /* 2-way set assoc, 64 byte line size */ + { 0x80, LVL_2, 512 }, /* 8-way set assoc, 64 byte line size */ { 0x82, LVL_2, 256 }, /* 8-way set assoc, 32 byte line size */ { 0x83, LVL_2, 512 }, /* 8-way set assoc, 32 byte line size */ { 0x84, LVL_2, MB(1) }, /* 8-way set assoc, 32 byte line size */ @@ -149,8 +152,7 @@ union _cpuid4_leaf_ecx { }; struct amd_l3_cache { - struct pci_dev *dev; - bool can_disable; + struct amd_northbridge *nb; unsigned indices; u8 subcaches[4]; }; @@ -266,7 +268,7 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, line_size = l2.line_size; lines_per_tag = l2.lines_per_tag; /* cpu_data has errata corrections for K7 applied */ - size_in_kb = current_cpu_data.x86_cache_size; + size_in_kb = __this_cpu_read(cpu_info.x86_cache_size); break; case 3: if (!l3.val) @@ -288,7 +290,7 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, eax->split.type = types[leaf]; eax->split.level = levels[leaf]; eax->split.num_threads_sharing = 0; - eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1; + eax->split.num_cores_on_die = __this_cpu_read(cpu_info.x86_max_cores) - 1; if (assoc == 0xffff) @@ -302,8 +304,9 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, struct _cache_attr { struct attribute attr; - ssize_t (*show)(struct _cpuid4_info *, char *); - ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count); + ssize_t (*show)(struct _cpuid4_info *, char *, unsigned int); + ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count, + unsigned int); }; #ifdef CONFIG_AMD_NB @@ -311,14 +314,12 @@ struct _cache_attr { /* * L3 cache descriptors */ -static struct amd_l3_cache **__cpuinitdata l3_caches; - static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3) { unsigned int sc0, sc1, sc2, sc3; u32 val = 0; - pci_read_config_dword(l3->dev, 0x1C4, &val); + pci_read_config_dword(l3->nb->misc, 0x1C4, &val); /* calculate subcache sizes */ l3->subcaches[0] = sc0 = !(val & BIT(0)); @@ -326,50 +327,17 @@ static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3) l3->subcaches[2] = sc2 = !(val & BIT(8)) + !(val & BIT(9)); l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13)); - l3->indices = (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1; + l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1; } -static struct amd_l3_cache * __cpuinit amd_init_l3_cache(int node) -{ - struct amd_l3_cache *l3; - struct pci_dev *dev = node_to_k8_nb_misc(node); - - l3 = kzalloc(sizeof(struct amd_l3_cache), GFP_ATOMIC); - if (!l3) { - printk(KERN_WARNING "Error allocating L3 struct\n"); - return NULL; - } - - l3->dev = dev; - - amd_calc_l3_indices(l3); - - return l3; -} - -static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, - int index) +static void __cpuinit amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, + int index) { + static struct amd_l3_cache *__cpuinitdata l3_caches; int node; - if (boot_cpu_data.x86 != 0x10) - return; - - if (index < 3) - return; - - /* see errata #382 and #388 */ - if (boot_cpu_data.x86_model < 0x8) - return; - - if ((boot_cpu_data.x86_model == 0x8 || - boot_cpu_data.x86_model == 0x9) - && - boot_cpu_data.x86_mask < 0x1) - return; - - /* not in virtualized environments */ - if (k8_northbridges.num == 0) + /* only for L3, and not in virtualized environments */ + if (index < 3 || amd_nb_num() == 0) return; /* @@ -377,7 +345,7 @@ static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, * never freed but this is done only on shutdown so it doesn't matter. */ if (!l3_caches) { - int size = k8_northbridges.num * sizeof(struct amd_l3_cache *); + int size = amd_nb_num() * sizeof(struct amd_l3_cache); l3_caches = kzalloc(size, GFP_ATOMIC); if (!l3_caches) @@ -386,14 +354,12 @@ static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, node = amd_get_nb_id(smp_processor_id()); - if (!l3_caches[node]) { - l3_caches[node] = amd_init_l3_cache(node); - l3_caches[node]->can_disable = true; + if (!l3_caches[node].nb) { + l3_caches[node].nb = node_to_amd_nb(node); + amd_calc_l3_indices(&l3_caches[node]); } - WARN_ON(!l3_caches[node]); - - this_leaf->l3 = l3_caches[node]; + this_leaf->l3 = &l3_caches[node]; } /* @@ -407,7 +373,7 @@ int amd_get_l3_disable_slot(struct amd_l3_cache *l3, unsigned slot) { unsigned int reg = 0; - pci_read_config_dword(l3->dev, 0x1BC + slot * 4, ®); + pci_read_config_dword(l3->nb->misc, 0x1BC + slot * 4, ®); /* check whether this slot is activated already */ if (reg & (3UL << 30)) @@ -421,7 +387,8 @@ static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, { int index; - if (!this_leaf->l3 || !this_leaf->l3->can_disable) + if (!this_leaf->l3 || + !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) return -EINVAL; index = amd_get_l3_disable_slot(this_leaf->l3, slot); @@ -433,7 +400,8 @@ static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, #define SHOW_CACHE_DISABLE(slot) \ static ssize_t \ -show_cache_disable_##slot(struct _cpuid4_info *this_leaf, char *buf) \ +show_cache_disable_##slot(struct _cpuid4_info *this_leaf, char *buf, \ + unsigned int cpu) \ { \ return show_cache_disable(this_leaf, buf, slot); \ } @@ -456,7 +424,7 @@ static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu, if (!l3->subcaches[i]) continue; - pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg); + pci_write_config_dword(l3->nb->misc, 0x1BC + slot * 4, reg); /* * We need to WBINVD on a core on the node containing the L3 @@ -466,7 +434,7 @@ static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu, wbinvd_on_cpu(cpu); reg |= BIT(31); - pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg); + pci_write_config_dword(l3->nb->misc, 0x1BC + slot * 4, reg); } } @@ -485,27 +453,16 @@ int amd_set_l3_disable_slot(struct amd_l3_cache *l3, int cpu, unsigned slot, { int ret = 0; -#define SUBCACHE_MASK (3UL << 20) -#define SUBCACHE_INDEX 0xfff - - /* - * check whether this slot is already used or - * the index is already disabled - */ + /* check if @slot is already used or the index is already disabled */ ret = amd_get_l3_disable_slot(l3, slot); if (ret >= 0) return -EINVAL; - /* - * check whether the other slot has disabled the - * same index already - */ - if (index == amd_get_l3_disable_slot(l3, !slot)) + if (index > l3->indices) return -EINVAL; - /* do not allow writes outside of allowed bits */ - if ((index & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) || - ((index & SUBCACHE_INDEX) > l3->indices)) + /* check whether the other slot has disabled the same index already */ + if (index == amd_get_l3_disable_slot(l3, !slot)) return -EINVAL; amd_l3_disable_index(l3, cpu, slot, index); @@ -523,7 +480,8 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!this_leaf->l3 || !this_leaf->l3->can_disable) + if (!this_leaf->l3 || + !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) return -EINVAL; cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); @@ -544,7 +502,8 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, #define STORE_CACHE_DISABLE(slot) \ static ssize_t \ store_cache_disable_##slot(struct _cpuid4_info *this_leaf, \ - const char *buf, size_t count) \ + const char *buf, size_t count, \ + unsigned int cpu) \ { \ return store_cache_disable(this_leaf, buf, count, slot); \ } @@ -556,25 +515,55 @@ static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644, static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, show_cache_disable_1, store_cache_disable_1); -#else /* CONFIG_AMD_NB */ -static void __cpuinit -amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, int index) +static ssize_t +show_subcaches(struct _cpuid4_info *this_leaf, char *buf, unsigned int cpu) { -}; + if (!this_leaf->l3 || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + return -EINVAL; + + return sprintf(buf, "%x\n", amd_get_subcaches(cpu)); +} + +static ssize_t +store_subcaches(struct _cpuid4_info *this_leaf, const char *buf, size_t count, + unsigned int cpu) +{ + unsigned long val; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!this_leaf->l3 || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + return -EINVAL; + + if (strict_strtoul(buf, 16, &val) < 0) + return -EINVAL; + + if (amd_set_subcaches(cpu, val)) + return -EINVAL; + + return count; +} + +static struct _cache_attr subcaches = + __ATTR(subcaches, 0644, show_subcaches, store_subcaches); + +#else /* CONFIG_AMD_NB */ +#define amd_init_l3_cache(x, y) #endif /* CONFIG_AMD_NB */ static int __cpuinit cpuid4_cache_lookup_regs(int index, struct _cpuid4_info_regs *this_leaf) { - union _cpuid4_leaf_eax eax; - union _cpuid4_leaf_ebx ebx; - union _cpuid4_leaf_ecx ecx; + union _cpuid4_leaf_eax eax; + union _cpuid4_leaf_ebx ebx; + union _cpuid4_leaf_ecx ecx; unsigned edx; if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { amd_cpuid4(index, &eax, &ebx, &ecx); - amd_check_l3_disable(this_leaf, index); + amd_init_l3_cache(this_leaf, index); } else { cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); } @@ -767,11 +756,11 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) struct cpuinfo_x86 *c = &cpu_data(cpu); if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) { - for_each_cpu(i, c->llc_shared_map) { + for_each_cpu(i, cpu_llc_shared_mask(cpu)) { if (!per_cpu(ici_cpuid4_info, i)) continue; this_leaf = CPUID4_INFO_IDX(i, index); - for_each_cpu(sibling, c->llc_shared_map) { + for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) { if (!cpu_online(sibling)) continue; set_bit(sibling, this_leaf->shared_cpu_map); @@ -905,8 +894,8 @@ static DEFINE_PER_CPU(struct _index_kobject *, ici_index_kobject); #define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(ici_index_kobject, x))[y])) #define show_one_plus(file_name, object, val) \ -static ssize_t show_##file_name \ - (struct _cpuid4_info *this_leaf, char *buf) \ +static ssize_t show_##file_name(struct _cpuid4_info *this_leaf, char *buf, \ + unsigned int cpu) \ { \ return sprintf(buf, "%lu\n", (unsigned long)this_leaf->object + val); \ } @@ -917,7 +906,8 @@ show_one_plus(physical_line_partition, ebx.split.physical_line_partition, 1); show_one_plus(ways_of_associativity, ebx.split.ways_of_associativity, 1); show_one_plus(number_of_sets, ecx.split.number_of_sets, 1); -static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf) +static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf, + unsigned int cpu) { return sprintf(buf, "%luK\n", this_leaf->size / 1024); } @@ -941,17 +931,20 @@ static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf, return n; } -static inline ssize_t show_shared_cpu_map(struct _cpuid4_info *leaf, char *buf) +static inline ssize_t show_shared_cpu_map(struct _cpuid4_info *leaf, char *buf, + unsigned int cpu) { return show_shared_cpu_map_func(leaf, 0, buf); } -static inline ssize_t show_shared_cpu_list(struct _cpuid4_info *leaf, char *buf) +static inline ssize_t show_shared_cpu_list(struct _cpuid4_info *leaf, char *buf, + unsigned int cpu) { return show_shared_cpu_map_func(leaf, 1, buf); } -static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) +static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf, + unsigned int cpu) { switch (this_leaf->eax.split.type) { case CACHE_TYPE_DATA: @@ -982,30 +975,54 @@ define_one_ro(size); define_one_ro(shared_cpu_map); define_one_ro(shared_cpu_list); -#define DEFAULT_SYSFS_CACHE_ATTRS \ - &type.attr, \ - &level.attr, \ - &coherency_line_size.attr, \ - &physical_line_partition.attr, \ - &ways_of_associativity.attr, \ - &number_of_sets.attr, \ - &size.attr, \ - &shared_cpu_map.attr, \ - &shared_cpu_list.attr - static struct attribute *default_attrs[] = { - DEFAULT_SYSFS_CACHE_ATTRS, + &type.attr, + &level.attr, + &coherency_line_size.attr, + &physical_line_partition.attr, + &ways_of_associativity.attr, + &number_of_sets.attr, + &size.attr, + &shared_cpu_map.attr, + &shared_cpu_list.attr, NULL }; -static struct attribute *default_l3_attrs[] = { - DEFAULT_SYSFS_CACHE_ATTRS, #ifdef CONFIG_AMD_NB - &cache_disable_0.attr, - &cache_disable_1.attr, +static struct attribute ** __cpuinit amd_l3_attrs(void) +{ + static struct attribute **attrs; + int n; + + if (attrs) + return attrs; + + n = sizeof (default_attrs) / sizeof (struct attribute *); + + if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) + n += 2; + + if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + n += 1; + + attrs = kzalloc(n * sizeof (struct attribute *), GFP_KERNEL); + if (attrs == NULL) + return attrs = default_attrs; + + for (n = 0; default_attrs[n]; n++) + attrs[n] = default_attrs[n]; + + if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) { + attrs[n++] = &cache_disable_0.attr; + attrs[n++] = &cache_disable_1.attr; + } + + if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + attrs[n++] = &subcaches.attr; + + return attrs; +} #endif - NULL -}; static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) { @@ -1015,7 +1032,7 @@ static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) ret = fattr->show ? fattr->show(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index), - buf) : + buf, this_leaf->cpu) : 0; return ret; } @@ -1029,7 +1046,7 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr, ret = fattr->store ? fattr->store(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index), - buf, count) : + buf, count, this_leaf->cpu) : 0; return ret; } @@ -1116,11 +1133,11 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) this_leaf = CPUID4_INFO_IDX(cpu, i); - if (this_leaf->l3 && this_leaf->l3->can_disable) - ktype_cache.default_attrs = default_l3_attrs; - else - ktype_cache.default_attrs = default_attrs; - + ktype_cache.default_attrs = default_attrs; +#ifdef CONFIG_AMD_NB + if (this_leaf->l3) + ktype_cache.default_attrs = amd_l3_attrs(); +#endif retval = kobject_init_and_add(&(this_object->kobj), &ktype_cache, per_cpu(ici_cache_kobject, cpu), diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c index 8209472b27a..83930deec3c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-apei.c +++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c @@ -106,24 +106,34 @@ int apei_write_mce(struct mce *m) ssize_t apei_read_mce(struct mce *m, u64 *record_id) { struct cper_mce_record rcd; - ssize_t len; - - len = erst_read_next(&rcd.hdr, sizeof(rcd)); - if (len <= 0) - return len; - /* Can not skip other records in storage via ERST unless clear them */ - else if (len != sizeof(rcd) || - uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE)) { - if (printk_ratelimit()) - pr_warning( - "MCE-APEI: Can not skip the unknown record in ERST"); - return -EIO; - } - + int rc, pos; + + rc = erst_get_record_id_begin(&pos); + if (rc) + return rc; +retry: + rc = erst_get_record_id_next(&pos, record_id); + if (rc) + goto out; + /* no more record */ + if (*record_id == APEI_ERST_INVALID_RECORD_ID) + goto out; + rc = erst_read(*record_id, &rcd.hdr, sizeof(rcd)); + /* someone else has cleared the record, try next one */ + if (rc == -ENOENT) + goto retry; + else if (rc < 0) + goto out; + /* try to skip other type records in storage */ + else if (rc != sizeof(rcd) || + uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE)) + goto retry; memcpy(m, &rcd.mce, sizeof(*m)); - *record_id = rcd.hdr.record_id; + rc = sizeof(*m); +out: + erst_get_record_id_end(); - return sizeof(*m); + return rc; } /* Check whether there is record in ERST */ diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index e7dbde7bfed..0ed633c5048 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -25,13 +25,14 @@ #include <linux/gfp.h> #include <asm/mce.h> #include <asm/apic.h> +#include <asm/nmi.h> /* Update fake mce registers on current CPU. */ static void inject_mce(struct mce *m) { struct mce *i = &per_cpu(injectm, m->extcpu); - /* Make sure noone reads partially written injectm */ + /* Make sure no one reads partially written injectm */ i->finished = 0; mb(); m->finished = 0; @@ -83,7 +84,7 @@ static int mce_raise_notify(struct notifier_block *self, struct die_args *args = (struct die_args *)data; int cpu = smp_processor_id(); struct mce *m = &__get_cpu_var(injectm); - if (val != DIE_NMI_IPI || !cpumask_test_cpu(cpu, mce_inject_cpumask)) + if (val != DIE_NMI || !cpumask_test_cpu(cpu, mce_inject_cpumask)) return NOTIFY_DONE; cpumask_clear_cpu(cpu, mce_inject_cpumask); if (m->inject_flags & MCJ_EXCEPTION) @@ -95,7 +96,7 @@ static int mce_raise_notify(struct notifier_block *self, static struct notifier_block mce_raise_nb = { .notifier_call = mce_raise_notify, - .priority = 1000, + .priority = NMI_LOCAL_NORMAL_PRIOR, }; /* Inject mce on current CPU */ diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 7a35b72d7c0..ff1ae9b6464 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -21,6 +21,7 @@ #include <linux/percpu.h> #include <linux/string.h> #include <linux/sysdev.h> +#include <linux/syscore_ops.h> #include <linux/delay.h> #include <linux/ctype.h> #include <linux/sched.h> @@ -104,20 +105,6 @@ static int cpu_missing; ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain); EXPORT_SYMBOL_GPL(x86_mce_decoder_chain); -static int default_decode_mce(struct notifier_block *nb, unsigned long val, - void *data) -{ - pr_emerg(HW_ERR "No human readable MCE decoding support on this CPU type.\n"); - pr_emerg(HW_ERR "Run the message through 'mcelog --ascii' to decode.\n"); - - return NOTIFY_STOP; -} - -static struct notifier_block mce_dec_nb = { - .notifier_call = default_decode_mce, - .priority = -1, -}; - /* MCA banks polled by the period polling timer for corrected events */ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL @@ -211,6 +198,8 @@ void mce_log(struct mce *mce) static void print_mce(struct mce *m) { + int ret = 0; + pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n", m->extcpu, m->mcgstatus, m->bank, m->status); @@ -238,7 +227,11 @@ static void print_mce(struct mce *m) * Print out human-readable details about the MCE error, * (if the CPU has an implementation for that) */ - atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); + ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); + if (ret == NOTIFY_STOP) + return; + + pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n"); } #define PANIC_TIMEOUT 5 /* 5 seconds */ @@ -326,7 +319,7 @@ static void mce_panic(char *msg, struct mce *final, char *exp) static int msr_to_offset(u32 msr) { - unsigned bank = __get_cpu_var(injectm.bank); + unsigned bank = __this_cpu_read(injectm.bank); if (msr == rip_msr) return offsetof(struct mce, ip); @@ -346,7 +339,7 @@ static u64 mce_rdmsrl(u32 msr) { u64 v; - if (__get_cpu_var(injectm).finished) { + if (__this_cpu_read(injectm.finished)) { int offset = msr_to_offset(msr); if (offset < 0) @@ -369,7 +362,7 @@ static u64 mce_rdmsrl(u32 msr) static void mce_wrmsrl(u32 msr, u64 v) { - if (__get_cpu_var(injectm).finished) { + if (__this_cpu_read(injectm.finished)) { int offset = msr_to_offset(msr); if (offset >= 0) @@ -589,7 +582,6 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) { mce_log(&m); atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, &m); - add_taint(TAINT_MACHINE_CHECK); } /* @@ -881,7 +873,7 @@ reset: * Check if the address reported by the CPU is in a format we can parse. * It would be possible to add code for most other cases, but all would * be somewhat complicated (e.g. segment offset would require an instruction - * parser). So only support physical addresses upto page granuality for now. + * parser). So only support physical addresses up to page granuality for now. */ static int mce_usable_address(struct mce *m) { @@ -1159,7 +1151,7 @@ static void mce_start_timer(unsigned long data) WARN_ON(smp_processor_id() != data); - if (mce_available(¤t_cpu_data)) { + if (mce_available(__this_cpu_ptr(&cpu_info))) { machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_poll_banks)); } @@ -1625,7 +1617,7 @@ out: static unsigned int mce_poll(struct file *file, poll_table *wait) { poll_wait(file, &mce_wait, wait); - if (rcu_dereference_check_mce(mcelog.next)) + if (rcu_access_index(mcelog.next)) return POLLIN | POLLRDNORM; if (!mce_apei_read_done && apei_check_mce()) return POLLIN | POLLRDNORM; @@ -1721,8 +1713,6 @@ __setup("mce", mcheck_enable); int __init mcheck_init(void) { - atomic_notifier_chain_register(&x86_mce_decoder_chain, &mce_dec_nb); - mcheck_intel_therm_init(); return 0; @@ -1749,14 +1739,14 @@ static int mce_disable_error_reporting(void) return 0; } -static int mce_suspend(struct sys_device *dev, pm_message_t state) +static int mce_suspend(void) { return mce_disable_error_reporting(); } -static int mce_shutdown(struct sys_device *dev) +static void mce_shutdown(void) { - return mce_disable_error_reporting(); + mce_disable_error_reporting(); } /* @@ -1764,18 +1754,22 @@ static int mce_shutdown(struct sys_device *dev) * Only one CPU is active at this time, the others get re-added later using * CPU hotplug: */ -static int mce_resume(struct sys_device *dev) +static void mce_resume(void) { __mcheck_cpu_init_generic(); - __mcheck_cpu_init_vendor(¤t_cpu_data); - - return 0; + __mcheck_cpu_init_vendor(__this_cpu_ptr(&cpu_info)); } +static struct syscore_ops mce_syscore_ops = { + .suspend = mce_suspend, + .shutdown = mce_shutdown, + .resume = mce_resume, +}; + static void mce_cpu_restart(void *data) { del_timer_sync(&__get_cpu_var(mce_timer)); - if (!mce_available(¤t_cpu_data)) + if (!mce_available(__this_cpu_ptr(&cpu_info))) return; __mcheck_cpu_init_generic(); __mcheck_cpu_init_timer(); @@ -1790,7 +1784,7 @@ static void mce_restart(void) /* Toggle features for corrected errors */ static void mce_disable_ce(void *all) { - if (!mce_available(¤t_cpu_data)) + if (!mce_available(__this_cpu_ptr(&cpu_info))) return; if (all) del_timer_sync(&__get_cpu_var(mce_timer)); @@ -1799,7 +1793,7 @@ static void mce_disable_ce(void *all) static void mce_enable_ce(void *all) { - if (!mce_available(¤t_cpu_data)) + if (!mce_available(__this_cpu_ptr(&cpu_info))) return; cmci_reenable(); cmci_recheck(); @@ -1808,9 +1802,6 @@ static void mce_enable_ce(void *all) } static struct sysdev_class mce_sysclass = { - .suspend = mce_suspend, - .shutdown = mce_shutdown, - .resume = mce_resume, .name = "machinecheck", }; @@ -2022,7 +2013,7 @@ static void __cpuinit mce_disable_cpu(void *h) unsigned long action = *(unsigned long *)h; int i; - if (!mce_available(¤t_cpu_data)) + if (!mce_available(__this_cpu_ptr(&cpu_info))) return; if (!(action & CPU_TASKS_FROZEN)) @@ -2040,7 +2031,7 @@ static void __cpuinit mce_reenable_cpu(void *h) unsigned long action = *(unsigned long *)h; int i; - if (!mce_available(¤t_cpu_data)) + if (!mce_available(__this_cpu_ptr(&cpu_info))) return; if (!(action & CPU_TASKS_FROZEN)) @@ -2139,6 +2130,7 @@ static __init int mcheck_init_device(void) return err; } + register_syscore_ops(&mce_syscore_ops); register_hotcpu_notifier(&mce_cpu_notifier); misc_register(&mce_log_device); diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 80c482382d5..bb0adad3514 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -31,8 +31,6 @@ #include <asm/mce.h> #include <asm/msr.h> -#define PFX "mce_threshold: " -#define VERSION "version 1.1.1" #define NR_BANKS 6 #define NR_BLOCKS 9 #define THRESHOLD_MAX 0xFFF @@ -59,12 +57,6 @@ struct threshold_block { struct list_head miscj; }; -/* defaults used early on boot */ -static struct threshold_block threshold_defaults = { - .interrupt_enable = 0, - .threshold_limit = THRESHOLD_MAX, -}; - struct threshold_bank { struct kobject *kobj; struct threshold_block *blocks; @@ -89,50 +81,101 @@ static void amd_threshold_interrupt(void); struct thresh_restart { struct threshold_block *b; int reset; + int set_lvt_off; + int lvt_off; u16 old_limit; }; +static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi) +{ + int msr = (hi & MASK_LVTOFF_HI) >> 20; + + if (apic < 0) { + pr_err(FW_BUG "cpu %d, failed to setup threshold interrupt " + "for bank %d, block %d (MSR%08X=0x%x%08x)\n", b->cpu, + b->bank, b->block, b->address, hi, lo); + return 0; + } + + if (apic != msr) { + pr_err(FW_BUG "cpu %d, invalid threshold interrupt offset %d " + "for bank %d, block %d (MSR%08X=0x%x%08x)\n", + b->cpu, apic, b->bank, b->block, b->address, hi, lo); + return 0; + } + + return 1; +}; + /* must be called with correct cpu affinity */ /* Called via smp_call_function_single() */ static void threshold_restart_bank(void *_tr) { struct thresh_restart *tr = _tr; - u32 mci_misc_hi, mci_misc_lo; + u32 hi, lo; - rdmsr(tr->b->address, mci_misc_lo, mci_misc_hi); + rdmsr(tr->b->address, lo, hi); - if (tr->b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX)) + if (tr->b->threshold_limit < (hi & THRESHOLD_MAX)) tr->reset = 1; /* limit cannot be lower than err count */ if (tr->reset) { /* reset err count and overflow bit */ - mci_misc_hi = - (mci_misc_hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) | + hi = + (hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) | (THRESHOLD_MAX - tr->b->threshold_limit); } else if (tr->old_limit) { /* change limit w/o reset */ - int new_count = (mci_misc_hi & THRESHOLD_MAX) + + int new_count = (hi & THRESHOLD_MAX) + (tr->old_limit - tr->b->threshold_limit); - mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) | + hi = (hi & ~MASK_ERR_COUNT_HI) | (new_count & THRESHOLD_MAX); } + if (tr->set_lvt_off) { + if (lvt_off_valid(tr->b, tr->lvt_off, lo, hi)) { + /* set new lvt offset */ + hi &= ~MASK_LVTOFF_HI; + hi |= tr->lvt_off << 20; + } + } + tr->b->interrupt_enable ? - (mci_misc_hi = (mci_misc_hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) : - (mci_misc_hi &= ~MASK_INT_TYPE_HI); + (hi = (hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) : + (hi &= ~MASK_INT_TYPE_HI); + + hi |= MASK_COUNT_EN_HI; + wrmsr(tr->b->address, lo, hi); +} - mci_misc_hi |= MASK_COUNT_EN_HI; - wrmsr(tr->b->address, mci_misc_lo, mci_misc_hi); +static void mce_threshold_block_init(struct threshold_block *b, int offset) +{ + struct thresh_restart tr = { + .b = b, + .set_lvt_off = 1, + .lvt_off = offset, + }; + + b->threshold_limit = THRESHOLD_MAX; + threshold_restart_bank(&tr); +}; + +static int setup_APIC_mce(int reserved, int new) +{ + if (reserved < 0 && !setup_APIC_eilvt(new, THRESHOLD_APIC_VECTOR, + APIC_EILVT_MSG_FIX, 0)) + return new; + + return reserved; } /* cpu init entry point, called from mce.c with preempt off */ void mce_amd_feature_init(struct cpuinfo_x86 *c) { + struct threshold_block b; unsigned int cpu = smp_processor_id(); u32 low = 0, high = 0, address = 0; unsigned int bank, block; - struct thresh_restart tr; - int lvt_off = -1; - u8 offset; + int offset = -1; for (bank = 0; bank < NR_BANKS; ++bank) { for (block = 0; block < NR_BLOCKS; ++block) { @@ -163,39 +206,16 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) if (shared_bank[bank] && c->cpu_core_id) break; #endif - offset = (high & MASK_LVTOFF_HI) >> 20; - if (lvt_off < 0) { - if (setup_APIC_eilvt(offset, - THRESHOLD_APIC_VECTOR, - APIC_EILVT_MSG_FIX, 0)) { - pr_err(FW_BUG "cpu %d, failed to " - "setup threshold interrupt " - "for bank %d, block %d " - "(MSR%08X=0x%x%08x)", - smp_processor_id(), bank, block, - address, high, low); - continue; - } - lvt_off = offset; - } else if (lvt_off != offset) { - pr_err(FW_BUG "cpu %d, invalid threshold " - "interrupt offset %d for bank %d," - "block %d (MSR%08X=0x%x%08x)", - smp_processor_id(), lvt_off, bank, - block, address, high, low); - continue; - } - - high &= ~MASK_LVTOFF_HI; - high |= lvt_off << 20; - wrmsr(address, low, high); + offset = setup_APIC_mce(offset, + (high & MASK_LVTOFF_HI) >> 20); - threshold_defaults.address = address; - tr.b = &threshold_defaults; - tr.reset = 0; - tr.old_limit = 0; - threshold_restart_bank(&tr); + memset(&b, 0, sizeof(b)); + b.cpu = cpu; + b.bank = bank; + b.block = block; + b.address = address; + mce_threshold_block_init(&b, offset); mce_threshold_vector = amd_threshold_interrupt; } } @@ -298,9 +318,8 @@ store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size) b->interrupt_enable = !!new; + memset(&tr, 0, sizeof(tr)); tr.b = b; - tr.reset = 0; - tr.old_limit = 0; smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); @@ -321,10 +340,10 @@ store_threshold_limit(struct threshold_block *b, const char *buf, size_t size) if (new < 1) new = 1; + memset(&tr, 0, sizeof(tr)); tr.old_limit = b->threshold_limit; b->threshold_limit = new; tr.b = b; - tr.reset = 0; smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); @@ -490,6 +509,7 @@ recurse: out_free: if (b) { kobject_put(&b->kobj); + list_del(&b->miscj); kfree(b); } return err; @@ -508,15 +528,12 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) int i, err = 0; struct threshold_bank *b = NULL; char name[32]; -#ifdef CONFIG_SMP - struct cpuinfo_x86 *c = &cpu_data(cpu); -#endif sprintf(name, "threshold_bank%i", bank); #ifdef CONFIG_SMP if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */ - i = cpumask_first(c->llc_shared_map); + i = cpumask_first(cpu_llc_shared_mask(cpu)); /* first core not up yet */ if (cpu_data(i).cpu_core_id) @@ -536,7 +553,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) if (err) goto out; - cpumask_copy(b->cpus, c->llc_shared_map); + cpumask_copy(b->cpus, cpu_llc_shared_mask(cpu)); per_cpu(threshold_banks, cpu)[bank] = b; goto out; @@ -603,9 +620,9 @@ static __cpuinit int threshold_create_device(unsigned int cpu) continue; err = threshold_create_bank(cpu, bank); if (err) - goto out; + return err; } -out: + return err; } diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 6fcd0936194..8694ef56459 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -130,7 +130,7 @@ void cmci_recheck(void) unsigned long flags; int banks; - if (!mce_available(¤t_cpu_data) || !cmci_supported(&banks)) + if (!mce_available(__this_cpu_ptr(&cpu_info)) || !cmci_supported(&banks)) return; local_irq_save(flags); machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 4b683267eca..27c625178bf 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -53,8 +53,14 @@ struct thermal_state { struct _thermal_state core_power_limit; struct _thermal_state package_throttle; struct _thermal_state package_power_limit; + struct _thermal_state core_thresh0; + struct _thermal_state core_thresh1; }; +/* Callback to handle core threshold interrupts */ +int (*platform_thermal_notify)(__u64 msr_val); +EXPORT_SYMBOL(platform_thermal_notify); + static DEFINE_PER_CPU(struct thermal_state, thermal_state); static atomic_t therm_throt_en = ATOMIC_INIT(0); @@ -181,8 +187,6 @@ static int therm_throt_process(bool new_event, int event, int level) this_cpu, level == CORE_LEVEL ? "Core" : "Package", state->count); - - add_taint(TAINT_MACHINE_CHECK); return 1; } if (old_event) { @@ -200,6 +204,22 @@ static int therm_throt_process(bool new_event, int event, int level) return 0; } +static int thresh_event_valid(int event) +{ + struct _thermal_state *state; + unsigned int this_cpu = smp_processor_id(); + struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu); + u64 now = get_jiffies_64(); + + state = (event == 0) ? &pstate->core_thresh0 : &pstate->core_thresh1; + + if (time_before64(now, state->next_check)) + return 0; + + state->next_check = now + CHECK_INTERVAL; + return 1; +} + #ifdef CONFIG_SYSFS /* Add/Remove thermal_throttle interface for CPU device: */ static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev, @@ -313,32 +333,50 @@ device_initcall(thermal_throttle_init_device); #define PACKAGE_THROTTLED ((__u64)2 << 62) #define PACKAGE_POWER_LIMIT ((__u64)3 << 62) +static void notify_thresholds(__u64 msr_val) +{ + /* check whether the interrupt handler is defined; + * otherwise simply return + */ + if (!platform_thermal_notify) + return; + + /* lower threshold reached */ + if ((msr_val & THERM_LOG_THRESHOLD0) && thresh_event_valid(0)) + platform_thermal_notify(msr_val); + /* higher threshold reached */ + if ((msr_val & THERM_LOG_THRESHOLD1) && thresh_event_valid(1)) + platform_thermal_notify(msr_val); +} + /* Thermal transition interrupt handler */ static void intel_thermal_interrupt(void) { __u64 msr_val; - struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); rdmsrl(MSR_IA32_THERM_STATUS, msr_val); + /* Check for violation of core thermal thresholds*/ + notify_thresholds(msr_val); + if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT, THERMAL_THROTTLING_EVENT, CORE_LEVEL) != 0) mce_log_therm_throt_event(CORE_THROTTLED | msr_val); - if (cpu_has(c, X86_FEATURE_PLN)) + if (this_cpu_has(X86_FEATURE_PLN)) if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT, POWER_LIMIT_EVENT, CORE_LEVEL) != 0) mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val); - if (cpu_has(c, X86_FEATURE_PTS)) { + if (this_cpu_has(X86_FEATURE_PTS)) { rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val); if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT, THERMAL_THROTTLING_EVENT, PACKAGE_LEVEL) != 0) mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val); - if (cpu_has(c, X86_FEATURE_PLN)) + if (this_cpu_has(X86_FEATURE_PLN)) if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_POWER_LIMIT, POWER_LIMIT_EVENT, @@ -352,7 +390,6 @@ static void unexpected_thermal_interrupt(void) { printk(KERN_ERR "CPU%d: Unexpected LVT thermal interrupt!\n", smp_processor_id()); - add_taint(TAINT_MACHINE_CHECK); } static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt; @@ -405,18 +442,20 @@ void intel_init_thermal(struct cpuinfo_x86 *c) */ rdmsr(MSR_IA32_MISC_ENABLE, l, h); + h = lvtthmr_init; /* * The initial value of thermal LVT entries on all APs always reads * 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI * sequence to them and LVT registers are reset to 0s except for * the mask bits which are set to 1s when APs receive INIT IPI. - * Always restore the value that BIOS has programmed on AP based on - * BSP's info we saved since BIOS is always setting the same value - * for all threads/cores + * If BIOS takes over the thermal interrupt and sets its interrupt + * delivery mode to SMI (not fixed), it restores the value that the + * BIOS has programmed on AP based on BSP's info we saved since BIOS + * is always setting the same value for all threads/cores. */ - apic_write(APIC_LVTTHMR, lvtthmr_init); + if ((h & APIC_DM_FIXED_MASK) != APIC_DM_FIXED) + apic_write(APIC_LVTTHMR, lvtthmr_init); - h = lvtthmr_init; if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { printk(KERN_DEBUG diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 9f27228ceff..a71efcdbb09 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -1,6 +1,6 @@ /* * This only handles 32bit MTRR on 32bit hosts. This is strictly wrong - * because MTRRs can span upto 40 bits (36bits on most modern x86) + * because MTRRs can span up to 40 bits (36bits on most modern x86) */ #define DEBUG diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 01c0f3ee6cc..929739a653d 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -45,6 +45,7 @@ #include <linux/cpu.h> #include <linux/pci.h> #include <linux/smp.h> +#include <linux/syscore_ops.h> #include <asm/processor.h> #include <asm/e820.h> @@ -292,14 +293,24 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ /* * HACK! - * We use this same function to initialize the mtrrs on boot. - * The state of the boot cpu's mtrrs has been saved, and we want - * to replicate across all the APs. - * If we're doing that @reg is set to something special... + * + * We use this same function to initialize the mtrrs during boot, + * resume, runtime cpu online and on an explicit request to set a + * specific MTRR. + * + * During boot or suspend, the state of the boot cpu's mtrrs has been + * saved, and we want to replicate that across all the cpus that come + * online (either at the end of boot or resume or during a runtime cpu + * online). If we're doing that, @reg is set to something special and on + * this cpu we still do mtrr_if->set_all(). During boot/resume, this + * is unnecessary if at this point we are still on the cpu that started + * the boot/resume sequence. But there is no guarantee that we are still + * on the same cpu. So we do mtrr_if->set_all() on this cpu aswell to be + * sure that we are in sync with everyone else. */ if (reg != ~0U) mtrr_if->set(reg, base, size, type); - else if (!mtrr_aps_delayed_init) + else mtrr_if->set_all(); /* Wait for the others */ @@ -630,7 +641,7 @@ struct mtrr_value { static struct mtrr_value mtrr_value[MTRR_MAX_VAR_RANGES]; -static int mtrr_save(struct sys_device *sysdev, pm_message_t state) +static int mtrr_save(void) { int i; @@ -642,7 +653,7 @@ static int mtrr_save(struct sys_device *sysdev, pm_message_t state) return 0; } -static int mtrr_restore(struct sys_device *sysdev) +static void mtrr_restore(void) { int i; @@ -653,12 +664,11 @@ static int mtrr_restore(struct sys_device *sysdev) mtrr_value[i].ltype); } } - return 0; } -static struct sysdev_driver mtrr_sysdev_driver = { +static struct syscore_ops mtrr_syscore_ops = { .suspend = mtrr_save, .resume = mtrr_restore, }; @@ -793,13 +803,21 @@ void set_mtrr_aps_delayed_init(void) } /* - * MTRR initialization for all AP's + * Delayed MTRR initialization for all AP's */ void mtrr_aps_init(void) { if (!use_intel()) return; + /* + * Check if someone has requested the delay of AP MTRR initialization, + * by doing set_mtrr_aps_delayed_init(), prior to this point. If not, + * then we are done. + */ + if (!mtrr_aps_delayed_init) + return; + set_mtrr(~0U, 0, 0, 0); mtrr_aps_delayed_init = false; } @@ -831,7 +849,7 @@ static int __init mtrr_init_finialize(void) * TBD: is there any system with such CPU which supports * suspend/resume? If no, we should remove the code. */ - sysdev_driver_register(&cpu_sysdev_class, &mtrr_sysdev_driver); + register_syscore_ops(&mtrr_syscore_ops); return 0; } diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index fe73c1844a9..3a0338b4b17 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -30,6 +30,8 @@ #include <asm/stacktrace.h> #include <asm/nmi.h> #include <asm/compat.h> +#include <asm/smp.h> +#include <asm/alternative.h> #if 0 #undef wrmsrl @@ -49,7 +51,6 @@ static unsigned long copy_from_user_nmi(void *to, const void __user *from, unsigned long n) { unsigned long offset, addr = (unsigned long)from; - int type = in_nmi() ? KM_NMI : KM_IRQ0; unsigned long size, len = 0; struct page *page; void *map; @@ -63,9 +64,9 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n) offset = addr & (PAGE_SIZE - 1); size = min(PAGE_SIZE - offset, n - len); - map = kmap_atomic(page, type); + map = kmap_atomic(page); memcpy(to, map+offset, size); - kunmap_atomic(map, type); + kunmap_atomic(map); put_page(page); len += size; @@ -94,6 +95,8 @@ struct amd_nb { struct event_constraint event_constraints[X86_PMC_IDX_MAX]; }; +struct intel_percore; + #define MAX_LBR_ENTRIES 16 struct cpu_hw_events { @@ -129,6 +132,13 @@ struct cpu_hw_events { struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; /* + * Intel percore register state. + * Coordinate shared resources between HT threads. + */ + int percore_used; /* Used by this CPU? */ + struct intel_percore *per_core; + + /* * AMD specific bits */ struct amd_nb *amd_nb; @@ -167,7 +177,7 @@ struct cpu_hw_events { /* * Constraint on the Event code + UMask */ -#define PEBS_EVENT_CONSTRAINT(c, n) \ +#define INTEL_UEVENT_CONSTRAINT(c, n) \ EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK) #define EVENT_CONSTRAINT_END \ @@ -176,6 +186,28 @@ struct cpu_hw_events { #define for_each_event_constraint(e, c) \ for ((e) = (c); (e)->weight; (e)++) +/* + * Extra registers for specific events. + * Some events need large masks and require external MSRs. + * Define a mapping to these extra registers. + */ +struct extra_reg { + unsigned int event; + unsigned int msr; + u64 config_mask; + u64 valid_mask; +}; + +#define EVENT_EXTRA_REG(e, ms, m, vm) { \ + .event = (e), \ + .msr = (ms), \ + .config_mask = (m), \ + .valid_mask = (vm), \ + } +#define INTEL_EVENT_EXTRA_REG(event, msr, vm) \ + EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm) +#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0) + union perf_capabilities { struct { u64 lbr_format : 6; @@ -220,6 +252,7 @@ struct x86_pmu { void (*put_event_constraints)(struct cpu_hw_events *cpuc, struct perf_event *event); struct event_constraint *event_constraints; + struct event_constraint *percore_constraints; void (*quirks)(void); int perfctr_second_write; @@ -238,6 +271,7 @@ struct x86_pmu { * Intel DebugStore bits */ int bts, pebs; + int bts_active, pebs_active; int pebs_record_size; void (*drain_pebs)(struct pt_regs *regs); struct event_constraint *pebs_constraints; @@ -247,6 +281,11 @@ struct x86_pmu { */ unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */ int lbr_nr; /* hardware stack size */ + + /* + * Extra registers for events + */ + struct extra_reg *extra_regs; }; static struct x86_pmu x86_pmu __read_mostly; @@ -271,6 +310,10 @@ static u64 __read_mostly hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX]; +static u64 __read_mostly hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX]; /* * Propagate event elapsed time into the generic event. @@ -298,7 +341,7 @@ x86_perf_event_update(struct perf_event *event) */ again: prev_raw_count = local64_read(&hwc->prev_count); - rdmsrl(hwc->event_base + idx, new_raw_count); + rdmsrl(hwc->event_base, new_raw_count); if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, new_raw_count) != prev_raw_count) @@ -321,6 +364,55 @@ again: return new_raw_count; } +static inline int x86_pmu_addr_offset(int index) +{ + int offset; + + /* offset = X86_FEATURE_PERFCTR_CORE ? index << 1 : index */ + alternative_io(ASM_NOP2, + "shll $1, %%eax", + X86_FEATURE_PERFCTR_CORE, + "=a" (offset), + "a" (index)); + + return offset; +} + +static inline unsigned int x86_pmu_config_addr(int index) +{ + return x86_pmu.eventsel + x86_pmu_addr_offset(index); +} + +static inline unsigned int x86_pmu_event_addr(int index) +{ + return x86_pmu.perfctr + x86_pmu_addr_offset(index); +} + +/* + * Find and validate any extra registers to set up. + */ +static int x86_pmu_extra_regs(u64 config, struct perf_event *event) +{ + struct extra_reg *er; + + event->hw.extra_reg = 0; + event->hw.extra_config = 0; + + if (!x86_pmu.extra_regs) + return 0; + + for (er = x86_pmu.extra_regs; er->msr; er++) { + if (er->event != (config & er->config_mask)) + continue; + if (event->attr.config1 & ~er->valid_mask) + return -EINVAL; + event->hw.extra_reg = er->msr; + event->hw.extra_config = event->attr.config1; + break; + } + return 0; +} + static atomic_t active_events; static DEFINE_MUTEX(pmc_reserve_mutex); @@ -330,16 +422,13 @@ static bool reserve_pmc_hardware(void) { int i; - if (nmi_watchdog == NMI_LOCAL_APIC) - disable_lapic_nmi_watchdog(); - for (i = 0; i < x86_pmu.num_counters; i++) { - if (!reserve_perfctr_nmi(x86_pmu.perfctr + i)) + if (!reserve_perfctr_nmi(x86_pmu_event_addr(i))) goto perfctr_fail; } for (i = 0; i < x86_pmu.num_counters; i++) { - if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) + if (!reserve_evntsel_nmi(x86_pmu_config_addr(i))) goto eventsel_fail; } @@ -347,16 +436,13 @@ static bool reserve_pmc_hardware(void) eventsel_fail: for (i--; i >= 0; i--) - release_evntsel_nmi(x86_pmu.eventsel + i); + release_evntsel_nmi(x86_pmu_config_addr(i)); i = x86_pmu.num_counters; perfctr_fail: for (i--; i >= 0; i--) - release_perfctr_nmi(x86_pmu.perfctr + i); - - if (nmi_watchdog == NMI_LOCAL_APIC) - enable_lapic_nmi_watchdog(); + release_perfctr_nmi(x86_pmu_event_addr(i)); return false; } @@ -366,12 +452,9 @@ static void release_pmc_hardware(void) int i; for (i = 0; i < x86_pmu.num_counters; i++) { - release_perfctr_nmi(x86_pmu.perfctr + i); - release_evntsel_nmi(x86_pmu.eventsel + i); + release_perfctr_nmi(x86_pmu_event_addr(i)); + release_evntsel_nmi(x86_pmu_config_addr(i)); } - - if (nmi_watchdog == NMI_LOCAL_APIC) - enable_lapic_nmi_watchdog(); } #else @@ -381,7 +464,64 @@ static void release_pmc_hardware(void) {} #endif -static int reserve_ds_buffers(void); +static bool check_hw_exists(void) +{ + u64 val, val_new = 0; + int i, reg, ret = 0; + + /* + * Check to see if the BIOS enabled any of the counters, if so + * complain and bail. + */ + for (i = 0; i < x86_pmu.num_counters; i++) { + reg = x86_pmu_config_addr(i); + ret = rdmsrl_safe(reg, &val); + if (ret) + goto msr_fail; + if (val & ARCH_PERFMON_EVENTSEL_ENABLE) + goto bios_fail; + } + + if (x86_pmu.num_counters_fixed) { + reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; + ret = rdmsrl_safe(reg, &val); + if (ret) + goto msr_fail; + for (i = 0; i < x86_pmu.num_counters_fixed; i++) { + if (val & (0x03 << i*4)) + goto bios_fail; + } + } + + /* + * Now write a value and read it back to see if it matches, + * this is needed to detect certain hardware emulators (qemu/kvm) + * that don't trap on the MSR access and always return 0s. + */ + val = 0xabcdUL; + ret = checking_wrmsrl(x86_pmu_event_addr(0), val); + ret |= rdmsrl_safe(x86_pmu_event_addr(0), &val_new); + if (ret || val != val_new) + goto msr_fail; + + return true; + +bios_fail: + /* + * We still allow the PMU driver to operate: + */ + printk(KERN_CONT "Broken BIOS detected, complain to your hardware vendor.\n"); + printk(KERN_ERR FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", reg, val); + + return true; + +msr_fail: + printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n"); + + return false; +} + +static void reserve_ds_buffers(void); static void release_ds_buffers(void); static void hw_perf_event_destroy(struct perf_event *event) @@ -399,8 +539,9 @@ static inline int x86_pmu_initialized(void) } static inline int -set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr) +set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event) { + struct perf_event_attr *attr = &event->attr; unsigned int cache_type, cache_op, cache_result; u64 config, val; @@ -427,8 +568,8 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr) return -EINVAL; hwc->config |= val; - - return 0; + attr->config1 = hw_cache_extra_regs[cache_type][cache_op][cache_result]; + return x86_pmu_extra_regs(val, event); } static int x86_setup_perfctr(struct perf_event *event) @@ -437,7 +578,7 @@ static int x86_setup_perfctr(struct perf_event *event) struct hw_perf_event *hwc = &event->hw; u64 config; - if (!hwc->sample_period) { + if (!is_sampling_event(event)) { hwc->sample_period = x86_pmu.max_period; hwc->last_period = hwc->sample_period; local64_set(&hwc->period_left, hwc->sample_period); @@ -452,11 +593,15 @@ static int x86_setup_perfctr(struct perf_event *event) return -EOPNOTSUPP; } + /* + * Do not allow config1 (extended registers) to propagate, + * there's no sane user-space generalization yet: + */ if (attr->type == PERF_TYPE_RAW) return 0; if (attr->type == PERF_TYPE_HW_CACHE) - return set_ext_hw_attr(hwc, attr); + return set_ext_hw_attr(hwc, event); if (attr->config >= x86_pmu.max_events) return -EINVAL; @@ -475,10 +620,10 @@ static int x86_setup_perfctr(struct perf_event *event) /* * Branch tracing: */ - if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) && - (hwc->sample_period == 1)) { + if (attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS && + !attr->freq && hwc->sample_period == 1) { /* BTS is not supported by this architecture. */ - if (!x86_pmu.bts) + if (!x86_pmu.bts_active) return -EOPNOTSUPP; /* BTS is currently only allowed for user-mode. */ @@ -497,12 +642,13 @@ static int x86_pmu_hw_config(struct perf_event *event) int precise = 0; /* Support for constant skid */ - if (x86_pmu.pebs) + if (x86_pmu.pebs_active) { precise++; - /* Support for IP fixup */ - if (x86_pmu.lbr_nr) - precise++; + /* Support for IP fixup */ + if (x86_pmu.lbr_nr) + precise++; + } if (event->attr.precise_ip > precise) return -EOPNOTSUPP; @@ -544,11 +690,8 @@ static int __x86_pmu_event_init(struct perf_event *event) if (atomic_read(&active_events) == 0) { if (!reserve_pmc_hardware()) err = -EBUSY; - else { - err = reserve_ds_buffers(); - if (err) - release_pmc_hardware(); - } + else + reserve_ds_buffers(); } if (!err) atomic_inc(&active_events); @@ -576,11 +719,11 @@ static void x86_pmu_disable_all(void) if (!test_bit(idx, cpuc->active_mask)) continue; - rdmsrl(x86_pmu.eventsel + idx, val); + rdmsrl(x86_pmu_config_addr(idx), val); if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE)) continue; val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; - wrmsrl(x86_pmu.eventsel + idx, val); + wrmsrl(x86_pmu_config_addr(idx), val); } } @@ -601,21 +744,26 @@ static void x86_pmu_disable(struct pmu *pmu) x86_pmu.disable_all(); } +static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, + u64 enable_mask) +{ + if (hwc->extra_reg) + wrmsrl(hwc->extra_reg, hwc->extra_config); + wrmsrl(hwc->config_base, hwc->config | enable_mask); +} + static void x86_pmu_enable_all(int added) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); int idx; for (idx = 0; idx < x86_pmu.num_counters; idx++) { - struct perf_event *event = cpuc->events[idx]; - u64 val; + struct hw_perf_event *hwc = &cpuc->events[idx]->hw; if (!test_bit(idx, cpuc->active_mask)) continue; - val = event->hw.config; - val |= ARCH_PERFMON_EVENTSEL_ENABLE; - wrmsrl(x86_pmu.eventsel + idx, val); + __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); } } @@ -780,15 +928,10 @@ static inline void x86_assign_hw_event(struct perf_event *event, hwc->event_base = 0; } else if (hwc->idx >= X86_PMC_IDX_FIXED) { hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; - /* - * We set it so that event_base + idx in wrmsr/rdmsr maps to - * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2: - */ - hwc->event_base = - MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED; + hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - X86_PMC_IDX_FIXED); } else { - hwc->config_base = x86_pmu.eventsel; - hwc->event_base = x86_pmu.perfctr; + hwc->config_base = x86_pmu_config_addr(hwc->idx); + hwc->event_base = x86_pmu_event_addr(hwc->idx); } } @@ -874,17 +1017,11 @@ static void x86_pmu_enable(struct pmu *pmu) x86_pmu.enable_all(added); } -static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, - u64 enable_mask) -{ - wrmsrl(hwc->config_base + hwc->idx, hwc->config | enable_mask); -} - static inline void x86_pmu_disable_event(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; - wrmsrl(hwc->config_base + hwc->idx, hwc->config); + wrmsrl(hwc->config_base, hwc->config); } static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); @@ -937,7 +1074,7 @@ x86_perf_event_set_period(struct perf_event *event) */ local64_set(&hwc->prev_count, (u64)-left); - wrmsrl(hwc->event_base + idx, (u64)(-left) & x86_pmu.cntval_mask); + wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask); /* * Due to erratum on certan cpu we need @@ -945,7 +1082,7 @@ x86_perf_event_set_period(struct perf_event *event) * is updated properly */ if (x86_pmu.perfctr_second_write) { - wrmsrl(hwc->event_base + idx, + wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask); } @@ -956,8 +1093,7 @@ x86_perf_event_set_period(struct perf_event *event) static void x86_pmu_enable_event(struct perf_event *event) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - if (cpuc->enabled) + if (__this_cpu_read(cpu_hw_events.enabled)) __x86_pmu_enable_event(&event->hw, ARCH_PERFMON_EVENTSEL_ENABLE); } @@ -989,7 +1125,7 @@ static int x86_pmu_add(struct perf_event *event, int flags) /* * If group events scheduling transaction was started, - * skip the schedulability test here, it will be peformed + * skip the schedulability test here, it will be performed * at commit time (->commit_txn) as a whole */ if (cpuc->group_flag & PERF_EVENT_TXN) @@ -1073,8 +1209,8 @@ void perf_event_print_debug(void) pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask); for (idx = 0; idx < x86_pmu.num_counters; idx++) { - rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); - rdmsrl(x86_pmu.perfctr + idx, pmc_count); + rdmsrl(x86_pmu_config_addr(idx), pmc_ctrl); + rdmsrl(x86_pmu_event_addr(idx), pmc_count); prev_left = per_cpu(pmc_prev_left[idx], cpu); @@ -1159,6 +1295,16 @@ static int x86_pmu_handle_irq(struct pt_regs *regs) cpuc = &__get_cpu_var(cpu_hw_events); + /* + * Some chipsets need to unmask the LVTPC in a particular spot + * inside the nmi handler. As a result, the unmasking was pushed + * into all the nmi handlers. + * + * This generic handler doesn't seem to have any issues where the + * unmasking occurs so it was left at the top. + */ + apic_write(APIC_LVTPC, APIC_DM_NMI); + for (idx = 0; idx < x86_pmu.num_counters; idx++) { if (!test_bit(idx, cpuc->active_mask)) { /* @@ -1227,11 +1373,10 @@ perf_event_nmi_handler(struct notifier_block *self, switch (cmd) { case DIE_NMI: - case DIE_NMI_IPI: break; case DIE_NMIUNKNOWN: this_nmi = percpu_read(irq_stat.__nmi_count); - if (this_nmi != __get_cpu_var(pmu_nmi).marked) + if (this_nmi != __this_cpu_read(pmu_nmi.marked)) /* let the kernel handle the unknown nmi */ return NOTIFY_DONE; /* @@ -1246,8 +1391,6 @@ perf_event_nmi_handler(struct notifier_block *self, return NOTIFY_DONE; } - apic_write(APIC_LVTPC, APIC_DM_NMI); - handled = x86_pmu.handle_irq(args->regs); if (!handled) return NOTIFY_DONE; @@ -1255,8 +1398,8 @@ perf_event_nmi_handler(struct notifier_block *self, this_nmi = percpu_read(irq_stat.__nmi_count); if ((handled > 1) || /* the next nmi could be a back-to-back nmi */ - ((__get_cpu_var(pmu_nmi).marked == this_nmi) && - (__get_cpu_var(pmu_nmi).handled > 1))) { + ((__this_cpu_read(pmu_nmi.marked) == this_nmi) && + (__this_cpu_read(pmu_nmi.handled) > 1))) { /* * We could have two subsequent back-to-back nmis: The * first handles more than one counter, the 2nd @@ -1267,8 +1410,8 @@ perf_event_nmi_handler(struct notifier_block *self, * handling more than one counter. We will mark the * next (3rd) and then drop it if unhandled. */ - __get_cpu_var(pmu_nmi).marked = this_nmi + 1; - __get_cpu_var(pmu_nmi).handled = handled; + __this_cpu_write(pmu_nmi.marked, this_nmi + 1); + __this_cpu_write(pmu_nmi.handled, handled); } return NOTIFY_STOP; @@ -1277,7 +1420,7 @@ perf_event_nmi_handler(struct notifier_block *self, static __read_mostly struct notifier_block perf_event_nmi_notifier = { .notifier_call = perf_event_nmi_handler, .next = NULL, - .priority = 1 + .priority = NMI_LOCAL_LOW_PRIOR, }; static struct event_constraint unconstrained; @@ -1350,7 +1493,7 @@ static void __init pmu_check_apic(void) pr_info("no hardware sampling interrupt available.\n"); } -void __init init_hw_perf_events(void) +static int __init init_hw_perf_events(void) { struct event_constraint *c; int err; @@ -1365,15 +1508,19 @@ void __init init_hw_perf_events(void) err = amd_pmu_init(); break; default: - return; + return 0; } if (err != 0) { pr_cont("no PMU driver, software events only.\n"); - return; + return 0; } pmu_check_apic(); + /* sanity check that the hardware exists or is emulated */ + if (!check_hw_exists()) + return 0; + pr_cont("%s PMU driver.\n", x86_pmu.name); if (x86_pmu.quirks) @@ -1420,9 +1567,12 @@ void __init init_hw_perf_events(void) pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed); pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl); - perf_pmu_register(&pmu); + perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW); perf_cpu_notifier(x86_pmu_notifier); + + return 0; } +early_initcall(init_hw_perf_events); static inline void x86_pmu_read(struct perf_event *event) { @@ -1436,11 +1586,9 @@ static inline void x86_pmu_read(struct perf_event *event) */ static void x86_pmu_start_txn(struct pmu *pmu) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - perf_pmu_disable(pmu); - cpuc->group_flag |= PERF_EVENT_TXN; - cpuc->n_txn = 0; + __this_cpu_or(cpu_hw_events.group_flag, PERF_EVENT_TXN); + __this_cpu_write(cpu_hw_events.n_txn, 0); } /* @@ -1450,14 +1598,12 @@ static void x86_pmu_start_txn(struct pmu *pmu) */ static void x86_pmu_cancel_txn(struct pmu *pmu) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - - cpuc->group_flag &= ~PERF_EVENT_TXN; + __this_cpu_and(cpu_hw_events.group_flag, ~PERF_EVENT_TXN); /* * Truncate the collected events. */ - cpuc->n_added -= cpuc->n_txn; - cpuc->n_events -= cpuc->n_txn; + __this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn)); + __this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn)); perf_pmu_enable(pmu); } @@ -1566,7 +1712,7 @@ out: return ret; } -int x86_pmu_event_init(struct perf_event *event) +static int x86_pmu_event_init(struct perf_event *event) { struct pmu *tmp; int err; @@ -1627,17 +1773,6 @@ static struct pmu pmu = { * callchain support */ -static void -backtrace_warning_symbol(void *data, char *msg, unsigned long symbol) -{ - /* Ignore warnings */ -} - -static void backtrace_warning(void *data, char *msg) -{ - /* Ignore warnings */ -} - static int backtrace_stack(void *data, char *name) { return 0; @@ -1651,8 +1786,6 @@ static void backtrace_address(void *data, unsigned long addr, int reliable) } static const struct stacktrace_ops backtrace_ops = { - .warning = backtrace_warning, - .warning_symbol = backtrace_warning_symbol, .stack = backtrace_stack, .address = backtrace_address, .walk_stack = print_context_stack_bp, @@ -1668,7 +1801,7 @@ perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) perf_callchain_store(entry, regs->ip); - dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry); + dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry); } #ifdef CONFIG_COMPAT diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 46d58448c3a..fe29c1d2219 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -1,7 +1,5 @@ #ifdef CONFIG_CPU_SUP_AMD -static DEFINE_RAW_SPINLOCK(amd_nb_lock); - static __initconst const u64 amd_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] @@ -10,7 +8,7 @@ static __initconst const u64 amd_hw_cache_event_ids [ C(L1D) ] = { [ C(OP_READ) ] = { [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ - [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */ + [ C(RESULT_MISS) ] = 0x0141, /* Data Cache Misses */ }, [ C(OP_WRITE) ] = { [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */ @@ -98,12 +96,14 @@ static __initconst const u64 amd_hw_cache_event_ids */ static const u64 amd_perfmon_event_map[] = { - [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, - [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080, - [PERF_COUNT_HW_CACHE_MISSES] = 0x0081, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2, - [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3, + [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080, + [PERF_COUNT_HW_CACHE_MISSES] = 0x0081, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3, + [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00d0, /* "Decoder empty" event */ + [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x00d1, /* "Dispatch stalls" event */ }; static u64 amd_pmu_event_map(int hw_event) @@ -129,6 +129,11 @@ static int amd_pmu_hw_config(struct perf_event *event) /* * AMD64 events are detected based on their event codes. */ +static inline unsigned int amd_get_event_code(struct hw_perf_event *hwc) +{ + return ((hwc->config >> 24) & 0x0f00) | (hwc->config & 0x00ff); +} + static inline int amd_is_nb_event(struct hw_perf_event *hwc) { return (hwc->config & 0xe0) == 0xe0; @@ -275,17 +280,17 @@ done: return &emptyconstraint; } -static struct amd_nb *amd_alloc_nb(int cpu, int nb_id) +static struct amd_nb *amd_alloc_nb(int cpu) { struct amd_nb *nb; int i; - nb = kmalloc(sizeof(struct amd_nb), GFP_KERNEL); + nb = kmalloc_node(sizeof(struct amd_nb), GFP_KERNEL | __GFP_ZERO, + cpu_to_node(cpu)); if (!nb) return NULL; - memset(nb, 0, sizeof(*nb)); - nb->nb_id = nb_id; + nb->nb_id = -1; /* * initialize all possible NB constraints @@ -306,7 +311,7 @@ static int amd_pmu_cpu_prepare(int cpu) if (boot_cpu_data.x86_max_cores < 2) return NOTIFY_OK; - cpuc->amd_nb = amd_alloc_nb(cpu, -1); + cpuc->amd_nb = amd_alloc_nb(cpu); if (!cpuc->amd_nb) return NOTIFY_BAD; @@ -325,8 +330,6 @@ static void amd_pmu_cpu_starting(int cpu) nb_id = amd_get_nb_id(cpu); WARN_ON_ONCE(nb_id == BAD_APICID); - raw_spin_lock(&amd_nb_lock); - for_each_online_cpu(i) { nb = per_cpu(cpu_hw_events, i).amd_nb; if (WARN_ON_ONCE(!nb)) @@ -341,8 +344,6 @@ static void amd_pmu_cpu_starting(int cpu) cpuc->amd_nb->nb_id = nb_id; cpuc->amd_nb->refcnt++; - - raw_spin_unlock(&amd_nb_lock); } static void amd_pmu_cpu_dead(int cpu) @@ -354,8 +355,6 @@ static void amd_pmu_cpu_dead(int cpu) cpuhw = &per_cpu(cpu_hw_events, cpu); - raw_spin_lock(&amd_nb_lock); - if (cpuhw->amd_nb) { struct amd_nb *nb = cpuhw->amd_nb; @@ -364,8 +363,6 @@ static void amd_pmu_cpu_dead(int cpu) cpuhw->amd_nb = NULL; } - - raw_spin_unlock(&amd_nb_lock); } static __initconst const struct x86_pmu amd_pmu = { @@ -395,13 +392,195 @@ static __initconst const struct x86_pmu amd_pmu = { .cpu_dead = amd_pmu_cpu_dead, }; +/* AMD Family 15h */ + +#define AMD_EVENT_TYPE_MASK 0x000000F0ULL + +#define AMD_EVENT_FP 0x00000000ULL ... 0x00000010ULL +#define AMD_EVENT_LS 0x00000020ULL ... 0x00000030ULL +#define AMD_EVENT_DC 0x00000040ULL ... 0x00000050ULL +#define AMD_EVENT_CU 0x00000060ULL ... 0x00000070ULL +#define AMD_EVENT_IC_DE 0x00000080ULL ... 0x00000090ULL +#define AMD_EVENT_EX_LS 0x000000C0ULL +#define AMD_EVENT_DE 0x000000D0ULL +#define AMD_EVENT_NB 0x000000E0ULL ... 0x000000F0ULL + +/* + * AMD family 15h event code/PMC mappings: + * + * type = event_code & 0x0F0: + * + * 0x000 FP PERF_CTL[5:3] + * 0x010 FP PERF_CTL[5:3] + * 0x020 LS PERF_CTL[5:0] + * 0x030 LS PERF_CTL[5:0] + * 0x040 DC PERF_CTL[5:0] + * 0x050 DC PERF_CTL[5:0] + * 0x060 CU PERF_CTL[2:0] + * 0x070 CU PERF_CTL[2:0] + * 0x080 IC/DE PERF_CTL[2:0] + * 0x090 IC/DE PERF_CTL[2:0] + * 0x0A0 --- + * 0x0B0 --- + * 0x0C0 EX/LS PERF_CTL[5:0] + * 0x0D0 DE PERF_CTL[2:0] + * 0x0E0 NB NB_PERF_CTL[3:0] + * 0x0F0 NB NB_PERF_CTL[3:0] + * + * Exceptions: + * + * 0x000 FP PERF_CTL[3], PERF_CTL[5:3] (*) + * 0x003 FP PERF_CTL[3] + * 0x004 FP PERF_CTL[3], PERF_CTL[5:3] (*) + * 0x00B FP PERF_CTL[3] + * 0x00D FP PERF_CTL[3] + * 0x023 DE PERF_CTL[2:0] + * 0x02D LS PERF_CTL[3] + * 0x02E LS PERF_CTL[3,0] + * 0x043 CU PERF_CTL[2:0] + * 0x045 CU PERF_CTL[2:0] + * 0x046 CU PERF_CTL[2:0] + * 0x054 CU PERF_CTL[2:0] + * 0x055 CU PERF_CTL[2:0] + * 0x08F IC PERF_CTL[0] + * 0x187 DE PERF_CTL[0] + * 0x188 DE PERF_CTL[0] + * 0x0DB EX PERF_CTL[5:0] + * 0x0DC LS PERF_CTL[5:0] + * 0x0DD LS PERF_CTL[5:0] + * 0x0DE LS PERF_CTL[5:0] + * 0x0DF LS PERF_CTL[5:0] + * 0x1D6 EX PERF_CTL[5:0] + * 0x1D8 EX PERF_CTL[5:0] + * + * (*) depending on the umask all FPU counters may be used + */ + +static struct event_constraint amd_f15_PMC0 = EVENT_CONSTRAINT(0, 0x01, 0); +static struct event_constraint amd_f15_PMC20 = EVENT_CONSTRAINT(0, 0x07, 0); +static struct event_constraint amd_f15_PMC3 = EVENT_CONSTRAINT(0, 0x08, 0); +static struct event_constraint amd_f15_PMC30 = EVENT_CONSTRAINT(0, 0x09, 0); +static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0); +static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0); + +static struct event_constraint * +amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + unsigned int event_code = amd_get_event_code(hwc); + + switch (event_code & AMD_EVENT_TYPE_MASK) { + case AMD_EVENT_FP: + switch (event_code) { + case 0x000: + if (!(hwc->config & 0x0000F000ULL)) + break; + if (!(hwc->config & 0x00000F00ULL)) + break; + return &amd_f15_PMC3; + case 0x004: + if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1) + break; + return &amd_f15_PMC3; + case 0x003: + case 0x00B: + case 0x00D: + return &amd_f15_PMC3; + } + return &amd_f15_PMC53; + case AMD_EVENT_LS: + case AMD_EVENT_DC: + case AMD_EVENT_EX_LS: + switch (event_code) { + case 0x023: + case 0x043: + case 0x045: + case 0x046: + case 0x054: + case 0x055: + return &amd_f15_PMC20; + case 0x02D: + return &amd_f15_PMC3; + case 0x02E: + return &amd_f15_PMC30; + default: + return &amd_f15_PMC50; + } + case AMD_EVENT_CU: + case AMD_EVENT_IC_DE: + case AMD_EVENT_DE: + switch (event_code) { + case 0x08F: + case 0x187: + case 0x188: + return &amd_f15_PMC0; + case 0x0DB ... 0x0DF: + case 0x1D6: + case 0x1D8: + return &amd_f15_PMC50; + default: + return &amd_f15_PMC20; + } + case AMD_EVENT_NB: + /* not yet implemented */ + return &emptyconstraint; + default: + return &emptyconstraint; + } +} + +static __initconst const struct x86_pmu amd_pmu_f15h = { + .name = "AMD Family 15h", + .handle_irq = x86_pmu_handle_irq, + .disable_all = x86_pmu_disable_all, + .enable_all = x86_pmu_enable_all, + .enable = x86_pmu_enable_event, + .disable = x86_pmu_disable_event, + .hw_config = amd_pmu_hw_config, + .schedule_events = x86_schedule_events, + .eventsel = MSR_F15H_PERF_CTL, + .perfctr = MSR_F15H_PERF_CTR, + .event_map = amd_pmu_event_map, + .max_events = ARRAY_SIZE(amd_perfmon_event_map), + .num_counters = 6, + .cntval_bits = 48, + .cntval_mask = (1ULL << 48) - 1, + .apic = 1, + /* use highest bit to detect overflow */ + .max_period = (1ULL << 47) - 1, + .get_event_constraints = amd_get_event_constraints_f15h, + /* nortbridge counters not yet implemented: */ +#if 0 + .put_event_constraints = amd_put_event_constraints, + + .cpu_prepare = amd_pmu_cpu_prepare, + .cpu_starting = amd_pmu_cpu_starting, + .cpu_dead = amd_pmu_cpu_dead, +#endif +}; + static __init int amd_pmu_init(void) { /* Performance-monitoring supported from K7 and later: */ if (boot_cpu_data.x86 < 6) return -ENODEV; - x86_pmu = amd_pmu; + /* + * If core performance counter extensions exists, it must be + * family 15h, otherwise fail. See x86_pmu_addr_offset(). + */ + switch (boot_cpu_data.x86) { + case 0x15: + if (!cpu_has_perfctr_core) + return -ENODEV; + x86_pmu = amd_pmu_f15h; + break; + default: + if (cpu_has_perfctr_core) + return -ENODEV; + x86_pmu = amd_pmu; + break; + } /* Events are common for all AMDs */ memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index c8f5c088cad..41178c826c4 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1,9 +1,31 @@ #ifdef CONFIG_CPU_SUP_INTEL +#define MAX_EXTRA_REGS 2 + +/* + * Per register state. + */ +struct er_account { + int ref; /* reference count */ + unsigned int extra_reg; /* extra MSR number */ + u64 extra_config; /* extra MSR config */ +}; + +/* + * Per core state + * This used to coordinate shared registers for HT threads. + */ +struct intel_percore { + raw_spinlock_t lock; /* protect structure */ + struct er_account regs[MAX_EXTRA_REGS]; + int refcnt; /* number of threads */ + unsigned core_id; +}; + /* * Intel PerfMon, used on Core and later. */ -static const u64 intel_perfmon_event_map[] = +static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly = { [PERF_COUNT_HW_CPU_CYCLES] = 0x003c, [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, @@ -14,7 +36,7 @@ static const u64 intel_perfmon_event_map[] = [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, }; -static struct event_constraint intel_core_event_constraints[] = +static struct event_constraint intel_core_event_constraints[] __read_mostly = { INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ @@ -25,7 +47,7 @@ static struct event_constraint intel_core_event_constraints[] = EVENT_CONSTRAINT_END }; -static struct event_constraint intel_core2_event_constraints[] = +static struct event_constraint intel_core2_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ @@ -48,7 +70,7 @@ static struct event_constraint intel_core2_event_constraints[] = EVENT_CONSTRAINT_END }; -static struct event_constraint intel_nehalem_event_constraints[] = +static struct event_constraint intel_nehalem_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ @@ -64,7 +86,19 @@ static struct event_constraint intel_nehalem_event_constraints[] = EVENT_CONSTRAINT_END }; -static struct event_constraint intel_westmere_event_constraints[] = +static struct extra_reg intel_nehalem_extra_regs[] __read_mostly = +{ + INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff), + EVENT_EXTRA_END +}; + +static struct event_constraint intel_nehalem_percore_constraints[] __read_mostly = +{ + INTEL_EVENT_CONSTRAINT(0xb7, 0), + EVENT_CONSTRAINT_END +}; + +static struct event_constraint intel_westmere_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ @@ -76,7 +110,34 @@ static struct event_constraint intel_westmere_event_constraints[] = EVENT_CONSTRAINT_END }; -static struct event_constraint intel_gen_event_constraints[] = +static struct event_constraint intel_snb_event_constraints[] __read_mostly = +{ + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ + INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */ + INTEL_EVENT_CONSTRAINT(0xb7, 0x1), /* OFF_CORE_RESPONSE_0 */ + INTEL_EVENT_CONSTRAINT(0xbb, 0x8), /* OFF_CORE_RESPONSE_1 */ + INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */ + INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */ + EVENT_CONSTRAINT_END +}; + +static struct extra_reg intel_westmere_extra_regs[] __read_mostly = +{ + INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff), + INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0xffff), + EVENT_EXTRA_END +}; + +static struct event_constraint intel_westmere_percore_constraints[] __read_mostly = +{ + INTEL_EVENT_CONSTRAINT(0xb7, 0), + INTEL_EVENT_CONSTRAINT(0xbb, 0), + EVENT_CONSTRAINT_END +}; + +static struct event_constraint intel_gen_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ @@ -89,6 +150,103 @@ static u64 intel_pmu_event_map(int hw_event) return intel_perfmon_event_map[hw_event]; } +static __initconst const u64 snb_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0xf1d0, /* MEM_UOP_RETIRED.LOADS */ + [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPLACEMENT */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0xf2d0, /* MEM_UOP_RETIRED.STORES */ + [ C(RESULT_MISS) ] = 0x0851, /* L1D.ALL_M_REPLACEMENT */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x024e, /* HW_PRE_REQ.DL1_MISS */ + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0280, /* ICACHE.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_WRITE) ] = { + /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_PREFETCH) ] = { + /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOP_RETIRED.ALL_LOADS */ + [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.CAUSES_A_WALK */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOP_RETIRED.ALL_STORES */ + [ C(RESULT_MISS) ] = 0x0149, /* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x1085, /* ITLB_MISSES.STLB_HIT */ + [ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISSES.CAUSES_A_WALK */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ + [ C(RESULT_MISS) ] = 0x00c5, /* BR_MISP_RETIRED.ALL_BRANCHES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + static __initconst const u64 westmere_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] @@ -124,16 +282,26 @@ static __initconst const u64 westmere_hw_cache_event_ids }, [ C(LL ) ] = { [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */ - [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */ + /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, }, + /* + * Use RFO, not WRITEBACK, because a write miss would typically occur + * on RFO. + */ [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */ - [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */ + /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, }, [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */ - [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */ + /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, }, }, [ C(DTLB) ] = { @@ -180,6 +348,59 @@ static __initconst const u64 westmere_hw_cache_event_ids }, }; +/* + * Nehalem/Westmere MSR_OFFCORE_RESPONSE bits; + * See IA32 SDM Vol 3B 30.6.1.3 + */ + +#define NHM_DMND_DATA_RD (1 << 0) +#define NHM_DMND_RFO (1 << 1) +#define NHM_DMND_IFETCH (1 << 2) +#define NHM_DMND_WB (1 << 3) +#define NHM_PF_DATA_RD (1 << 4) +#define NHM_PF_DATA_RFO (1 << 5) +#define NHM_PF_IFETCH (1 << 6) +#define NHM_OFFCORE_OTHER (1 << 7) +#define NHM_UNCORE_HIT (1 << 8) +#define NHM_OTHER_CORE_HIT_SNP (1 << 9) +#define NHM_OTHER_CORE_HITM (1 << 10) + /* reserved */ +#define NHM_REMOTE_CACHE_FWD (1 << 12) +#define NHM_REMOTE_DRAM (1 << 13) +#define NHM_LOCAL_DRAM (1 << 14) +#define NHM_NON_DRAM (1 << 15) + +#define NHM_ALL_DRAM (NHM_REMOTE_DRAM|NHM_LOCAL_DRAM) + +#define NHM_DMND_READ (NHM_DMND_DATA_RD) +#define NHM_DMND_WRITE (NHM_DMND_RFO|NHM_DMND_WB) +#define NHM_DMND_PREFETCH (NHM_PF_DATA_RD|NHM_PF_DATA_RFO) + +#define NHM_L3_HIT (NHM_UNCORE_HIT|NHM_OTHER_CORE_HIT_SNP|NHM_OTHER_CORE_HITM) +#define NHM_L3_MISS (NHM_NON_DRAM|NHM_ALL_DRAM|NHM_REMOTE_CACHE_FWD) +#define NHM_L3_ACCESS (NHM_L3_HIT|NHM_L3_MISS) + +static __initconst const u64 nehalem_hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_L3_ACCESS, + [ C(RESULT_MISS) ] = NHM_DMND_READ|NHM_L3_MISS, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_L3_ACCESS, + [ C(RESULT_MISS) ] = NHM_DMND_WRITE|NHM_L3_MISS, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_L3_ACCESS, + [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_L3_MISS, + }, + } +}; + static __initconst const u64 nehalem_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] @@ -187,12 +408,12 @@ static __initconst const u64 nehalem_hw_cache_event_ids { [ C(L1D) ] = { [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ - [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ + [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */ + [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPL */ }, [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ - [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ + [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */ + [ C(RESULT_MISS) ] = 0x0251, /* L1D.M_REPL */ }, [ C(OP_PREFETCH) ] = { [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */ @@ -215,16 +436,26 @@ static __initconst const u64 nehalem_hw_cache_event_ids }, [ C(LL ) ] = { [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */ - [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */ + /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, }, + /* + * Use RFO, not WRITEBACK, because a write miss would typically occur + * on RFO. + */ [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */ - [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */ + /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, }, [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */ - [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */ + /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, }, }, [ C(DTLB) ] = { @@ -649,7 +880,7 @@ static void intel_pmu_enable_event(struct perf_event *event) struct hw_perf_event *hwc = &event->hw; if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) { - if (!__get_cpu_var(cpu_hw_events).enabled) + if (!__this_cpu_read(cpu_hw_events.enabled)) return; intel_pmu_enable_bts(hwc->config); @@ -679,7 +910,7 @@ static int intel_pmu_save_and_restart(struct perf_event *event) static void intel_pmu_reset(void) { - struct debug_store *ds = __get_cpu_var(cpu_hw_events).ds; + struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds); unsigned long flags; int idx; @@ -691,8 +922,8 @@ static void intel_pmu_reset(void) printk("clearing PMU state on CPU#%d\n", smp_processor_id()); for (idx = 0; idx < x86_pmu.num_counters; idx++) { - checking_wrmsrl(x86_pmu.eventsel + idx, 0ull); - checking_wrmsrl(x86_pmu.perfctr + idx, 0ull); + checking_wrmsrl(x86_pmu_config_addr(idx), 0ull); + checking_wrmsrl(x86_pmu_event_addr(idx), 0ull); } for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); @@ -719,6 +950,16 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) cpuc = &__get_cpu_var(cpu_hw_events); + /* + * Some chipsets need to unmask the LVTPC in a particular spot + * inside the nmi handler. As a result, the unmasking was pushed + * into all the nmi handlers. + * + * This handler doesn't seem to have any issues with the unmasking + * so it was left at the top. + */ + apic_write(APIC_LVTPC, APIC_DM_NMI); + intel_pmu_disable_all(); handled = intel_pmu_drain_bts_buffer(); status = intel_pmu_get_status(); @@ -784,6 +1025,9 @@ intel_bts_constraints(struct perf_event *event) struct hw_perf_event *hwc = &event->hw; unsigned int hw_event, bts_event; + if (event->attr.freq) + return NULL; + hw_event = hwc->config & INTEL_ARCH_EVENT_MASK; bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS); @@ -794,6 +1038,67 @@ intel_bts_constraints(struct perf_event *event) } static struct event_constraint * +intel_percore_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + unsigned int e = hwc->config & ARCH_PERFMON_EVENTSEL_EVENT; + struct event_constraint *c; + struct intel_percore *pc; + struct er_account *era; + int i; + int free_slot; + int found; + + if (!x86_pmu.percore_constraints || hwc->extra_alloc) + return NULL; + + for (c = x86_pmu.percore_constraints; c->cmask; c++) { + if (e != c->code) + continue; + + /* + * Allocate resource per core. + */ + pc = cpuc->per_core; + if (!pc) + break; + c = &emptyconstraint; + raw_spin_lock(&pc->lock); + free_slot = -1; + found = 0; + for (i = 0; i < MAX_EXTRA_REGS; i++) { + era = &pc->regs[i]; + if (era->ref > 0 && hwc->extra_reg == era->extra_reg) { + /* Allow sharing same config */ + if (hwc->extra_config == era->extra_config) { + era->ref++; + cpuc->percore_used = 1; + hwc->extra_alloc = 1; + c = NULL; + } + /* else conflict */ + found = 1; + break; + } else if (era->ref == 0 && free_slot == -1) + free_slot = i; + } + if (!found && free_slot != -1) { + era = &pc->regs[free_slot]; + era->ref = 1; + era->extra_reg = hwc->extra_reg; + era->extra_config = hwc->extra_config; + cpuc->percore_used = 1; + hwc->extra_alloc = 1; + c = NULL; + } + raw_spin_unlock(&pc->lock); + return c; + } + + return NULL; +} + +static struct event_constraint * intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) { struct event_constraint *c; @@ -806,9 +1111,51 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event if (c) return c; + c = intel_percore_constraints(cpuc, event); + if (c) + return c; + return x86_get_event_constraints(cpuc, event); } +static void intel_put_event_constraints(struct cpu_hw_events *cpuc, + struct perf_event *event) +{ + struct extra_reg *er; + struct intel_percore *pc; + struct er_account *era; + struct hw_perf_event *hwc = &event->hw; + int i, allref; + + if (!cpuc->percore_used) + return; + + for (er = x86_pmu.extra_regs; er->msr; er++) { + if (er->event != (hwc->config & er->config_mask)) + continue; + + pc = cpuc->per_core; + raw_spin_lock(&pc->lock); + for (i = 0; i < MAX_EXTRA_REGS; i++) { + era = &pc->regs[i]; + if (era->ref > 0 && + era->extra_config == hwc->extra_config && + era->extra_reg == er->msr) { + era->ref--; + hwc->extra_alloc = 0; + break; + } + } + allref = 0; + for (i = 0; i < MAX_EXTRA_REGS; i++) + allref += pc->regs[i].ref; + if (allref == 0) + cpuc->percore_used = 0; + raw_spin_unlock(&pc->lock); + break; + } +} + static int intel_pmu_hw_config(struct perf_event *event) { int ret = x86_pmu_hw_config(event); @@ -816,6 +1163,32 @@ static int intel_pmu_hw_config(struct perf_event *event) if (ret) return ret; + if (event->attr.precise_ip && + (event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) { + /* + * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P + * (0x003c) so that we can use it with PEBS. + * + * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't + * PEBS capable. However we can use INST_RETIRED.ANY_P + * (0x00c0), which is a PEBS capable event, to get the same + * count. + * + * INST_RETIRED.ANY_P counts the number of cycles that retires + * CNTMASK instructions. By setting CNTMASK to a value (16) + * larger than the maximum number of instructions that can be + * retired per cycle (4) and then inverting the condition, we + * count all cycles that retire 16 or less instructions, which + * is every cycle. + * + * Thereby we gain a PEBS capable cycle counter. + */ + u64 alt_config = 0x108000c0; /* INST_RETIRED.TOTAL_CYCLES */ + + alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK); + event->hw.config = alt_config; + } + if (event->attr.type != PERF_TYPE_RAW) return 0; @@ -854,20 +1227,67 @@ static __initconst const struct x86_pmu core_pmu = { */ .max_period = (1ULL << 31) - 1, .get_event_constraints = intel_get_event_constraints, + .put_event_constraints = intel_put_event_constraints, .event_constraints = intel_core_event_constraints, }; +static int intel_pmu_cpu_prepare(int cpu) +{ + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + + if (!cpu_has_ht_siblings()) + return NOTIFY_OK; + + cpuc->per_core = kzalloc_node(sizeof(struct intel_percore), + GFP_KERNEL, cpu_to_node(cpu)); + if (!cpuc->per_core) + return NOTIFY_BAD; + + raw_spin_lock_init(&cpuc->per_core->lock); + cpuc->per_core->core_id = -1; + return NOTIFY_OK; +} + static void intel_pmu_cpu_starting(int cpu) { + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + int core_id = topology_core_id(cpu); + int i; + init_debug_store_on_cpu(cpu); /* * Deal with CPUs that don't clear their LBRs on power-up. */ intel_pmu_lbr_reset(); + + if (!cpu_has_ht_siblings()) + return; + + for_each_cpu(i, topology_thread_cpumask(cpu)) { + struct intel_percore *pc = per_cpu(cpu_hw_events, i).per_core; + + if (pc && pc->core_id == core_id) { + kfree(cpuc->per_core); + cpuc->per_core = pc; + break; + } + } + + cpuc->per_core->core_id = core_id; + cpuc->per_core->refcnt++; } static void intel_pmu_cpu_dying(int cpu) { + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + struct intel_percore *pc = cpuc->per_core; + + if (pc) { + if (pc->core_id == -1 || --pc->refcnt == 0) + kfree(pc); + cpuc->per_core = NULL; + } + fini_debug_store_on_cpu(cpu); } @@ -892,7 +1312,9 @@ static __initconst const struct x86_pmu intel_pmu = { */ .max_period = (1ULL << 31) - 1, .get_event_constraints = intel_get_event_constraints, + .put_event_constraints = intel_put_event_constraints, + .cpu_prepare = intel_pmu_cpu_prepare, .cpu_starting = intel_pmu_cpu_starting, .cpu_dying = intel_pmu_cpu_dying, }; @@ -913,7 +1335,7 @@ static void intel_clovertown_quirks(void) * AJ106 could possibly be worked around by not allowing LBR * usage from PEBS, including the fixup. * AJ68 could possibly be worked around by always programming - * a pebs_event_reset[0] value and coping with the lost events. + * a pebs_event_reset[0] value and coping with the lost events. * * But taken together it might just make sense to not enable PEBS on * these chips. @@ -998,6 +1420,7 @@ static __init int intel_pmu_init(void) intel_pmu_lbr_init_core(); x86_pmu.event_constraints = intel_core2_event_constraints; + x86_pmu.pebs_constraints = intel_core2_pebs_event_constraints; pr_cont("Core2 events, "); break; @@ -1006,11 +1429,33 @@ static __init int intel_pmu_init(void) case 46: /* 45 nm nehalem-ex, "Beckton" */ memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs, + sizeof(hw_cache_extra_regs)); intel_pmu_lbr_init_nhm(); x86_pmu.event_constraints = intel_nehalem_event_constraints; + x86_pmu.pebs_constraints = intel_nehalem_pebs_event_constraints; + x86_pmu.percore_constraints = intel_nehalem_percore_constraints; x86_pmu.enable_all = intel_pmu_nhm_enable_all; + x86_pmu.extra_regs = intel_nehalem_extra_regs; + + /* UOPS_ISSUED.STALLED_CYCLES */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e; + /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1; + + if (ebx & 0x40) { + /* + * Erratum AAJ80 detected, we work it around by using + * the BR_MISP_EXEC.ANY event. This will over-count + * branch-misses, but it's still much better than the + * architectural event which is often completely bogus: + */ + intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89; + + pr_cont("erratum AAJ80 worked around, "); + } pr_cont("Nehalem events, "); break; @@ -1021,21 +1466,51 @@ static __init int intel_pmu_init(void) intel_pmu_lbr_init_atom(); x86_pmu.event_constraints = intel_gen_event_constraints; + x86_pmu.pebs_constraints = intel_atom_pebs_event_constraints; pr_cont("Atom events, "); break; case 37: /* 32 nm nehalem, "Clarkdale" */ case 44: /* 32 nm nehalem, "Gulftown" */ + case 47: /* 32 nm Xeon E7 */ memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs, + sizeof(hw_cache_extra_regs)); intel_pmu_lbr_init_nhm(); x86_pmu.event_constraints = intel_westmere_event_constraints; + x86_pmu.percore_constraints = intel_westmere_percore_constraints; x86_pmu.enable_all = intel_pmu_nhm_enable_all; + x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints; + x86_pmu.extra_regs = intel_westmere_extra_regs; + + /* UOPS_ISSUED.STALLED_CYCLES */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e; + /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1; + pr_cont("Westmere events, "); break; + case 42: /* SandyBridge */ + memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + intel_pmu_lbr_init_nhm(); + + x86_pmu.event_constraints = intel_snb_event_constraints; + x86_pmu.pebs_constraints = intel_snb_pebs_events; + + /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e; + /* UOPS_DISPATCHED.THREAD,c=1,i=1 to count stall cycles*/ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x18001b1; + + pr_cont("SandyBridge events, "); + break; + default: /* * default constraints for v2 and up diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 4977f9c400e..bab491b8ee2 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -74,6 +74,107 @@ static void fini_debug_store_on_cpu(int cpu) wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0); } +static int alloc_pebs_buffer(int cpu) +{ + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + int node = cpu_to_node(cpu); + int max, thresh = 1; /* always use a single PEBS record */ + void *buffer; + + if (!x86_pmu.pebs) + return 0; + + buffer = kmalloc_node(PEBS_BUFFER_SIZE, GFP_KERNEL | __GFP_ZERO, node); + if (unlikely(!buffer)) + return -ENOMEM; + + max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size; + + ds->pebs_buffer_base = (u64)(unsigned long)buffer; + ds->pebs_index = ds->pebs_buffer_base; + ds->pebs_absolute_maximum = ds->pebs_buffer_base + + max * x86_pmu.pebs_record_size; + + ds->pebs_interrupt_threshold = ds->pebs_buffer_base + + thresh * x86_pmu.pebs_record_size; + + return 0; +} + +static void release_pebs_buffer(int cpu) +{ + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + + if (!ds || !x86_pmu.pebs) + return; + + kfree((void *)(unsigned long)ds->pebs_buffer_base); + ds->pebs_buffer_base = 0; +} + +static int alloc_bts_buffer(int cpu) +{ + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + int node = cpu_to_node(cpu); + int max, thresh; + void *buffer; + + if (!x86_pmu.bts) + return 0; + + buffer = kmalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_ZERO, node); + if (unlikely(!buffer)) + return -ENOMEM; + + max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE; + thresh = max / 16; + + ds->bts_buffer_base = (u64)(unsigned long)buffer; + ds->bts_index = ds->bts_buffer_base; + ds->bts_absolute_maximum = ds->bts_buffer_base + + max * BTS_RECORD_SIZE; + ds->bts_interrupt_threshold = ds->bts_absolute_maximum - + thresh * BTS_RECORD_SIZE; + + return 0; +} + +static void release_bts_buffer(int cpu) +{ + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + + if (!ds || !x86_pmu.bts) + return; + + kfree((void *)(unsigned long)ds->bts_buffer_base); + ds->bts_buffer_base = 0; +} + +static int alloc_ds_buffer(int cpu) +{ + int node = cpu_to_node(cpu); + struct debug_store *ds; + + ds = kmalloc_node(sizeof(*ds), GFP_KERNEL | __GFP_ZERO, node); + if (unlikely(!ds)) + return -ENOMEM; + + per_cpu(cpu_hw_events, cpu).ds = ds; + + return 0; +} + +static void release_ds_buffer(int cpu) +{ + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + + if (!ds) + return; + + per_cpu(cpu_hw_events, cpu).ds = NULL; + kfree(ds); +} + static void release_ds_buffers(void) { int cpu; @@ -82,93 +183,77 @@ static void release_ds_buffers(void) return; get_online_cpus(); - for_each_online_cpu(cpu) fini_debug_store_on_cpu(cpu); for_each_possible_cpu(cpu) { - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; - - if (!ds) - continue; - - per_cpu(cpu_hw_events, cpu).ds = NULL; - - kfree((void *)(unsigned long)ds->pebs_buffer_base); - kfree((void *)(unsigned long)ds->bts_buffer_base); - kfree(ds); + release_pebs_buffer(cpu); + release_bts_buffer(cpu); + release_ds_buffer(cpu); } - put_online_cpus(); } -static int reserve_ds_buffers(void) +static void reserve_ds_buffers(void) { - int cpu, err = 0; + int bts_err = 0, pebs_err = 0; + int cpu; + + x86_pmu.bts_active = 0; + x86_pmu.pebs_active = 0; if (!x86_pmu.bts && !x86_pmu.pebs) - return 0; + return; + + if (!x86_pmu.bts) + bts_err = 1; + + if (!x86_pmu.pebs) + pebs_err = 1; get_online_cpus(); for_each_possible_cpu(cpu) { - struct debug_store *ds; - void *buffer; - int max, thresh; + if (alloc_ds_buffer(cpu)) { + bts_err = 1; + pebs_err = 1; + } - err = -ENOMEM; - ds = kzalloc(sizeof(*ds), GFP_KERNEL); - if (unlikely(!ds)) + if (!bts_err && alloc_bts_buffer(cpu)) + bts_err = 1; + + if (!pebs_err && alloc_pebs_buffer(cpu)) + pebs_err = 1; + + if (bts_err && pebs_err) break; - per_cpu(cpu_hw_events, cpu).ds = ds; - - if (x86_pmu.bts) { - buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL); - if (unlikely(!buffer)) - break; - - max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE; - thresh = max / 16; - - ds->bts_buffer_base = (u64)(unsigned long)buffer; - ds->bts_index = ds->bts_buffer_base; - ds->bts_absolute_maximum = ds->bts_buffer_base + - max * BTS_RECORD_SIZE; - ds->bts_interrupt_threshold = ds->bts_absolute_maximum - - thresh * BTS_RECORD_SIZE; - } + } - if (x86_pmu.pebs) { - buffer = kzalloc(PEBS_BUFFER_SIZE, GFP_KERNEL); - if (unlikely(!buffer)) - break; - - max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size; - - ds->pebs_buffer_base = (u64)(unsigned long)buffer; - ds->pebs_index = ds->pebs_buffer_base; - ds->pebs_absolute_maximum = ds->pebs_buffer_base + - max * x86_pmu.pebs_record_size; - /* - * Always use single record PEBS - */ - ds->pebs_interrupt_threshold = ds->pebs_buffer_base + - x86_pmu.pebs_record_size; - } + if (bts_err) { + for_each_possible_cpu(cpu) + release_bts_buffer(cpu); + } - err = 0; + if (pebs_err) { + for_each_possible_cpu(cpu) + release_pebs_buffer(cpu); } - if (err) - release_ds_buffers(); - else { + if (bts_err && pebs_err) { + for_each_possible_cpu(cpu) + release_ds_buffer(cpu); + } else { + if (x86_pmu.bts && !bts_err) + x86_pmu.bts_active = 1; + + if (x86_pmu.pebs && !pebs_err) + x86_pmu.pebs_active = 1; + for_each_online_cpu(cpu) init_debug_store_on_cpu(cpu); } put_online_cpus(); - - return err; } /* @@ -233,7 +318,7 @@ static int intel_pmu_drain_bts_buffer(void) if (!event) return 0; - if (!ds) + if (!x86_pmu.bts_active) return 0; at = (struct bts_record *)(unsigned long)ds->bts_buffer_base; @@ -276,30 +361,70 @@ static int intel_pmu_drain_bts_buffer(void) /* * PEBS */ +static struct event_constraint intel_core2_pebs_event_constraints[] = { + INTEL_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */ + INTEL_UEVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */ + INTEL_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */ + INTEL_UEVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETURED.ANY */ + INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */ + EVENT_CONSTRAINT_END +}; + +static struct event_constraint intel_atom_pebs_event_constraints[] = { + INTEL_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */ + INTEL_UEVENT_CONSTRAINT(0x00c5, 0x1), /* MISPREDICTED_BRANCH_RETIRED */ + INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */ + EVENT_CONSTRAINT_END +}; -static struct event_constraint intel_core_pebs_events[] = { - PEBS_EVENT_CONSTRAINT(0x00c0, 0x1), /* INSTR_RETIRED.ANY */ - PEBS_EVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */ - PEBS_EVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */ - PEBS_EVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETURED.ANY */ - PEBS_EVENT_CONSTRAINT(0x01cb, 0x1), /* MEM_LOAD_RETIRED.L1D_MISS */ - PEBS_EVENT_CONSTRAINT(0x02cb, 0x1), /* MEM_LOAD_RETIRED.L1D_LINE_MISS */ - PEBS_EVENT_CONSTRAINT(0x04cb, 0x1), /* MEM_LOAD_RETIRED.L2_MISS */ - PEBS_EVENT_CONSTRAINT(0x08cb, 0x1), /* MEM_LOAD_RETIRED.L2_LINE_MISS */ - PEBS_EVENT_CONSTRAINT(0x10cb, 0x1), /* MEM_LOAD_RETIRED.DTLB_MISS */ +static struct event_constraint intel_nehalem_pebs_event_constraints[] = { + INTEL_EVENT_CONSTRAINT(0x0b, 0xf), /* MEM_INST_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */ + INTEL_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */ + INTEL_EVENT_CONSTRAINT(0xc0, 0xf), /* INST_RETIRED.ANY */ + INTEL_EVENT_CONSTRAINT(0xc2, 0xf), /* UOPS_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ + INTEL_UEVENT_CONSTRAINT(0x02c5, 0xf), /* BR_MISP_RETIRED.NEAR_CALL */ + INTEL_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */ + INTEL_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */ + INTEL_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */ EVENT_CONSTRAINT_END }; -static struct event_constraint intel_nehalem_pebs_events[] = { - PEBS_EVENT_CONSTRAINT(0x00c0, 0xf), /* INSTR_RETIRED.ANY */ - PEBS_EVENT_CONSTRAINT(0xfec1, 0xf), /* X87_OPS_RETIRED.ANY */ - PEBS_EVENT_CONSTRAINT(0x00c5, 0xf), /* BR_INST_RETIRED.MISPRED */ - PEBS_EVENT_CONSTRAINT(0x1fc7, 0xf), /* SIMD_INST_RETURED.ANY */ - PEBS_EVENT_CONSTRAINT(0x01cb, 0xf), /* MEM_LOAD_RETIRED.L1D_MISS */ - PEBS_EVENT_CONSTRAINT(0x02cb, 0xf), /* MEM_LOAD_RETIRED.L1D_LINE_MISS */ - PEBS_EVENT_CONSTRAINT(0x04cb, 0xf), /* MEM_LOAD_RETIRED.L2_MISS */ - PEBS_EVENT_CONSTRAINT(0x08cb, 0xf), /* MEM_LOAD_RETIRED.L2_LINE_MISS */ - PEBS_EVENT_CONSTRAINT(0x10cb, 0xf), /* MEM_LOAD_RETIRED.DTLB_MISS */ +static struct event_constraint intel_westmere_pebs_event_constraints[] = { + INTEL_EVENT_CONSTRAINT(0x0b, 0xf), /* MEM_INST_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */ + INTEL_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */ + INTEL_EVENT_CONSTRAINT(0xc0, 0xf), /* INSTR_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xc2, 0xf), /* UOPS_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */ + INTEL_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */ + INTEL_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */ + EVENT_CONSTRAINT_END +}; + +static struct event_constraint intel_snb_pebs_events[] = { + INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ + INTEL_UEVENT_CONSTRAINT(0x01c2, 0xf), /* UOPS_RETIRED.ALL */ + INTEL_UEVENT_CONSTRAINT(0x02c2, 0xf), /* UOPS_RETIRED.RETIRE_SLOTS */ + INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.* */ + INTEL_UEVENT_CONSTRAINT(0x11d0, 0xf), /* MEM_UOP_RETIRED.STLB_MISS_LOADS */ + INTEL_UEVENT_CONSTRAINT(0x12d0, 0xf), /* MEM_UOP_RETIRED.STLB_MISS_STORES */ + INTEL_UEVENT_CONSTRAINT(0x21d0, 0xf), /* MEM_UOP_RETIRED.LOCK_LOADS */ + INTEL_UEVENT_CONSTRAINT(0x22d0, 0xf), /* MEM_UOP_RETIRED.LOCK_STORES */ + INTEL_UEVENT_CONSTRAINT(0x41d0, 0xf), /* MEM_UOP_RETIRED.SPLIT_LOADS */ + INTEL_UEVENT_CONSTRAINT(0x42d0, 0xf), /* MEM_UOP_RETIRED.SPLIT_STORES */ + INTEL_UEVENT_CONSTRAINT(0x81d0, 0xf), /* MEM_UOP_RETIRED.ANY_LOADS */ + INTEL_UEVENT_CONSTRAINT(0x82d0, 0xf), /* MEM_UOP_RETIRED.ANY_STORES */ + INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ + INTEL_UEVENT_CONSTRAINT(0x02d4, 0xf), /* MEM_LOAD_UOPS_MISC_RETIRED.LLC_MISS */ EVENT_CONSTRAINT_END }; @@ -503,7 +628,7 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs) struct pebs_record_core *at, *top; int n; - if (!ds || !x86_pmu.pebs) + if (!x86_pmu.pebs_active) return; at = (struct pebs_record_core *)(unsigned long)ds->pebs_buffer_base; @@ -545,7 +670,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) u64 status = 0; int bit, n; - if (!ds || !x86_pmu.pebs) + if (!x86_pmu.pebs_active) return; at = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base; @@ -610,29 +735,25 @@ static void intel_ds_init(void) printk(KERN_CONT "PEBS fmt0%c, ", pebs_type); x86_pmu.pebs_record_size = sizeof(struct pebs_record_core); x86_pmu.drain_pebs = intel_pmu_drain_pebs_core; - x86_pmu.pebs_constraints = intel_core_pebs_events; break; case 1: printk(KERN_CONT "PEBS fmt1%c, ", pebs_type); x86_pmu.pebs_record_size = sizeof(struct pebs_record_nhm); x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm; - x86_pmu.pebs_constraints = intel_nehalem_pebs_events; break; default: printk(KERN_CONT "no PEBS fmt%d%c, ", format, pebs_type); x86_pmu.pebs = 0; - break; } } } #else /* CONFIG_CPU_SUP_INTEL */ -static int reserve_ds_buffers(void) +static void reserve_ds_buffers(void) { - return 0; } static void release_ds_buffers(void) diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index 81400b93e69..ead584fb6a7 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -1,5 +1,5 @@ /* - * Netburst Perfomance Events (P4, old Xeon) + * Netburst Performance Events (P4, old Xeon) * * Copyright (C) 2010 Parallels, Inc., Cyrill Gorcunov <gorcunov@openvz.org> * Copyright (C) 2010 Intel Corporation, Lin Ming <ming.m.lin@intel.com> @@ -468,7 +468,7 @@ static struct p4_event_bind p4_event_bind_map[] = { .opcode = P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED), .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS), + P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS), .cntr = { {12, 13, 16}, {14, 15, 17} }, }, [P4_EVENT_X87_ASSIST] = { @@ -679,10 +679,10 @@ static int p4_validate_raw_event(struct perf_event *event) */ /* - * if an event is shared accross the logical threads + * if an event is shared across the logical threads * the user needs special permissions to be able to use it */ - if (p4_event_bind_map[v].shared) { + if (p4_ht_active() && p4_event_bind_map[v].shared) { if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) return -EACCES; } @@ -727,7 +727,8 @@ static int p4_hw_config(struct perf_event *event) event->hw.config = p4_set_ht_bit(event->hw.config); if (event->attr.type == PERF_TYPE_RAW) { - + struct p4_event_bind *bind; + unsigned int esel; /* * Clear bits we reserve to be managed by kernel itself * and never allowed from a user space @@ -743,6 +744,13 @@ static int p4_hw_config(struct perf_event *event) * bits since we keep additional info here (for cache events and etc) */ event->hw.config |= event->attr.config; + bind = p4_config_get_bind(event->attr.config); + if (!bind) { + rc = -EINVAL; + goto out; + } + esel = P4_OPCODE_ESEL(bind->opcode); + event->hw.config |= p4_config_pack_cccr(P4_CCCR_ESEL(esel)); } rc = x86_setup_perfctr(event); @@ -753,19 +761,27 @@ out: static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc) { - int overflow = 0; - u32 low, high; - - rdmsr(hwc->config_base + hwc->idx, low, high); + u64 v; - /* we need to check high bit for unflagged overflows */ - if ((low & P4_CCCR_OVF) || !(high & (1 << 31))) { - overflow = 1; - (void)checking_wrmsrl(hwc->config_base + hwc->idx, - ((u64)low) & ~P4_CCCR_OVF); + /* an official way for overflow indication */ + rdmsrl(hwc->config_base, v); + if (v & P4_CCCR_OVF) { + wrmsrl(hwc->config_base, v & ~P4_CCCR_OVF); + return 1; } - return overflow; + /* + * In some circumstances the overflow might issue an NMI but did + * not set P4_CCCR_OVF bit. Because a counter holds a negative value + * we simply check for high bit being set, if it's cleared it means + * the counter has reached zero value and continued counting before + * real NMI signal was received: + */ + rdmsrl(hwc->event_base, v); + if (!(v & ARCH_P4_UNFLAGGED_BIT)) + return 1; + + return 0; } static void p4_pmu_disable_pebs(void) @@ -775,13 +791,13 @@ static void p4_pmu_disable_pebs(void) * * It's still allowed that two threads setup same cache * events so we can't simply clear metrics until we knew - * noone is depending on us, so we need kind of counter + * no one is depending on us, so we need kind of counter * for "ReplayEvent" users. * * What is more complex -- RAW events, if user (for some * reason) will pass some cache event metric with improper * event opcode -- it's fine from hardware point of view - * but completely nonsence from "meaning" of such action. + * but completely nonsense from "meaning" of such action. * * So at moment let leave metrics turned on forever -- it's * ok for now but need to be revisited! @@ -800,7 +816,7 @@ static inline void p4_pmu_disable_event(struct perf_event *event) * state we need to clear P4_CCCR_OVF, otherwise interrupt get * asserted again and again */ - (void)checking_wrmsrl(hwc->config_base + hwc->idx, + (void)checking_wrmsrl(hwc->config_base, (u64)(p4_config_unpack_cccr(hwc->config)) & ~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED); } @@ -870,7 +886,7 @@ static void p4_pmu_enable_event(struct perf_event *event) p4_pmu_enable_pebs(hwc->config); (void)checking_wrmsrl(escr_addr, escr_conf); - (void)checking_wrmsrl(hwc->config_base + hwc->idx, + (void)checking_wrmsrl(hwc->config_base, (cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE); } @@ -896,8 +912,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) int idx, handled = 0; u64 val; - data.addr = 0; - data.raw = NULL; + perf_sample_data_init(&data, 0); cpuc = &__get_cpu_var(cpu_hw_events); @@ -931,14 +946,23 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) if (!x86_perf_event_set_period(event)) continue; if (perf_event_overflow(event, 1, &data, regs)) - p4_pmu_disable_event(event); + x86_pmu_stop(event, 0); } - if (handled) { - /* p4 quirk: unmask it again */ - apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED); + if (handled) inc_irq_stat(apic_perf_irqs); - } + + /* + * When dealing with the unmasking of the LVTPC on P4 perf hw, it has + * been observed that the OVF bit flag has to be cleared first _before_ + * the LVTPC can be unmasked. + * + * The reason is the NMI line will continue to be asserted while the OVF + * bit is set. This causes a second NMI to generate if the LVTPC is + * unmasked before the OVF bit is cleared, leading to unknown NMI + * messages. + */ + apic_write(APIC_LVTPC, APIC_DM_NMI); return handled; } @@ -1152,9 +1176,9 @@ static __initconst const struct x86_pmu p4_pmu = { */ .num_counters = ARCH_P4_MAX_CCCR, .apic = 1, - .cntval_bits = 40, - .cntval_mask = (1ULL << 40) - 1, - .max_period = (1ULL << 39) - 1, + .cntval_bits = ARCH_P4_CNTRVAL_BITS, + .cntval_mask = ARCH_P4_CNTRVAL_MASK, + .max_period = (1ULL << (ARCH_P4_CNTRVAL_BITS - 1)) - 1, .hw_config = p4_hw_config, .schedule_events = p4_pmu_schedule_events, /* @@ -1172,7 +1196,7 @@ static __init int p4_pmu_init(void) { unsigned int low, high; - /* If we get stripped -- indexig fails */ + /* If we get stripped -- indexing fails */ BUILD_BUG_ON(ARCH_P4_MAX_CCCR > X86_PMC_MAX_GENERIC); rdmsr(MSR_IA32_MISC_ENABLE, low, high); diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c index 34ba07be2cd..20c097e3386 100644 --- a/arch/x86/kernel/cpu/perf_event_p6.c +++ b/arch/x86/kernel/cpu/perf_event_p6.c @@ -68,7 +68,7 @@ p6_pmu_disable_event(struct perf_event *event) if (cpuc->enabled) val |= ARCH_PERFMON_EVENTSEL_ENABLE; - (void)checking_wrmsrl(hwc->config_base + hwc->idx, val); + (void)checking_wrmsrl(hwc->config_base, val); } static void p6_pmu_enable_event(struct perf_event *event) @@ -81,7 +81,7 @@ static void p6_pmu_enable_event(struct perf_event *event) if (cpuc->enabled) val |= ARCH_PERFMON_EVENTSEL_ENABLE; - (void)checking_wrmsrl(hwc->config_base + hwc->idx, val); + (void)checking_wrmsrl(hwc->config_base, val); } static __initconst const struct x86_pmu p6_pmu = { diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index d9f4ff8fcd6..966512b2cac 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -16,32 +16,12 @@ #include <linux/kernel.h> #include <linux/bitops.h> #include <linux/smp.h> -#include <linux/nmi.h> +#include <asm/nmi.h> #include <linux/kprobes.h> #include <asm/apic.h> #include <asm/perf_event.h> -struct nmi_watchdog_ctlblk { - unsigned int cccr_msr; - unsigned int perfctr_msr; /* the MSR to reset in NMI handler */ - unsigned int evntsel_msr; /* the MSR to select the events to handle */ -}; - -/* Interface defining a CPU specific perfctr watchdog */ -struct wd_ops { - int (*reserve)(void); - void (*unreserve)(void); - int (*setup)(unsigned nmi_hz); - void (*rearm)(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz); - void (*stop)(void); - unsigned perfctr; - unsigned evntsel; - u64 checkbit; -}; - -static const struct wd_ops *wd_ops; - /* * this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's * offset from MSR_P4_BSU_ESCR0. @@ -60,14 +40,14 @@ static const struct wd_ops *wd_ops; static DECLARE_BITMAP(perfctr_nmi_owner, NMI_MAX_COUNTER_BITS); static DECLARE_BITMAP(evntsel_nmi_owner, NMI_MAX_COUNTER_BITS); -static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk); - /* converts an msr to an appropriate reservation bit */ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr) { /* returns the bit offset of the performance counter register */ switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: + if (msr >= MSR_F15H_PERF_CTR) + return (msr - MSR_F15H_PERF_CTR) >> 1; return msr - MSR_K7_PERFCTR0; case X86_VENDOR_INTEL: if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) @@ -92,6 +72,8 @@ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr) /* returns the bit offset of the event selection register */ switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: + if (msr >= MSR_F15H_PERF_CTL) + return (msr - MSR_F15H_PERF_CTL) >> 1; return msr - MSR_K7_EVNTSEL0; case X86_VENDOR_INTEL: if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) @@ -172,623 +154,3 @@ void release_evntsel_nmi(unsigned int msr) clear_bit(counter, evntsel_nmi_owner); } EXPORT_SYMBOL(release_evntsel_nmi); - -void disable_lapic_nmi_watchdog(void) -{ - BUG_ON(nmi_watchdog != NMI_LOCAL_APIC); - - if (atomic_read(&nmi_active) <= 0) - return; - - on_each_cpu(stop_apic_nmi_watchdog, NULL, 1); - - if (wd_ops) - wd_ops->unreserve(); - - BUG_ON(atomic_read(&nmi_active) != 0); -} - -void enable_lapic_nmi_watchdog(void) -{ - BUG_ON(nmi_watchdog != NMI_LOCAL_APIC); - - /* are we already enabled */ - if (atomic_read(&nmi_active) != 0) - return; - - /* are we lapic aware */ - if (!wd_ops) - return; - if (!wd_ops->reserve()) { - printk(KERN_ERR "NMI watchdog: cannot reserve perfctrs\n"); - return; - } - - on_each_cpu(setup_apic_nmi_watchdog, NULL, 1); - touch_nmi_watchdog(); -} - -/* - * Activate the NMI watchdog via the local APIC. - */ - -static unsigned int adjust_for_32bit_ctr(unsigned int hz) -{ - u64 counter_val; - unsigned int retval = hz; - - /* - * On Intel CPUs with P6/ARCH_PERFMON only 32 bits in the counter - * are writable, with higher bits sign extending from bit 31. - * So, we can only program the counter with 31 bit values and - * 32nd bit should be 1, for 33.. to be 1. - * Find the appropriate nmi_hz - */ - counter_val = (u64)cpu_khz * 1000; - do_div(counter_val, retval); - if (counter_val > 0x7fffffffULL) { - u64 count = (u64)cpu_khz * 1000; - do_div(count, 0x7fffffffUL); - retval = count + 1; - } - return retval; -} - -static void write_watchdog_counter(unsigned int perfctr_msr, - const char *descr, unsigned nmi_hz) -{ - u64 count = (u64)cpu_khz * 1000; - - do_div(count, nmi_hz); - if (descr) - pr_debug("setting %s to -0x%08Lx\n", descr, count); - wrmsrl(perfctr_msr, 0 - count); -} - -static void write_watchdog_counter32(unsigned int perfctr_msr, - const char *descr, unsigned nmi_hz) -{ - u64 count = (u64)cpu_khz * 1000; - - do_div(count, nmi_hz); - if (descr) - pr_debug("setting %s to -0x%08Lx\n", descr, count); - wrmsr(perfctr_msr, (u32)(-count), 0); -} - -/* - * AMD K7/K8/Family10h/Family11h support. - * AMD keeps this interface nicely stable so there is not much variety - */ -#define K7_EVNTSEL_ENABLE (1 << 22) -#define K7_EVNTSEL_INT (1 << 20) -#define K7_EVNTSEL_OS (1 << 17) -#define K7_EVNTSEL_USR (1 << 16) -#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76 -#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING - -static int setup_k7_watchdog(unsigned nmi_hz) -{ - unsigned int perfctr_msr, evntsel_msr; - unsigned int evntsel; - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - perfctr_msr = wd_ops->perfctr; - evntsel_msr = wd_ops->evntsel; - - wrmsrl(perfctr_msr, 0UL); - - evntsel = K7_EVNTSEL_INT - | K7_EVNTSEL_OS - | K7_EVNTSEL_USR - | K7_NMI_EVENT; - - /* setup the timer */ - wrmsr(evntsel_msr, evntsel, 0); - write_watchdog_counter(perfctr_msr, "K7_PERFCTR0", nmi_hz); - - /* initialize the wd struct before enabling */ - wd->perfctr_msr = perfctr_msr; - wd->evntsel_msr = evntsel_msr; - wd->cccr_msr = 0; /* unused */ - - /* ok, everything is initialized, announce that we're set */ - cpu_nmi_set_wd_enabled(); - - apic_write(APIC_LVTPC, APIC_DM_NMI); - evntsel |= K7_EVNTSEL_ENABLE; - wrmsr(evntsel_msr, evntsel, 0); - - return 1; -} - -static void single_msr_stop_watchdog(void) -{ - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - wrmsr(wd->evntsel_msr, 0, 0); -} - -static int single_msr_reserve(void) -{ - if (!reserve_perfctr_nmi(wd_ops->perfctr)) - return 0; - - if (!reserve_evntsel_nmi(wd_ops->evntsel)) { - release_perfctr_nmi(wd_ops->perfctr); - return 0; - } - return 1; -} - -static void single_msr_unreserve(void) -{ - release_evntsel_nmi(wd_ops->evntsel); - release_perfctr_nmi(wd_ops->perfctr); -} - -static void __kprobes -single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) -{ - /* start the cycle over again */ - write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz); -} - -static const struct wd_ops k7_wd_ops = { - .reserve = single_msr_reserve, - .unreserve = single_msr_unreserve, - .setup = setup_k7_watchdog, - .rearm = single_msr_rearm, - .stop = single_msr_stop_watchdog, - .perfctr = MSR_K7_PERFCTR0, - .evntsel = MSR_K7_EVNTSEL0, - .checkbit = 1ULL << 47, -}; - -/* - * Intel Model 6 (PPro+,P2,P3,P-M,Core1) - */ -#define P6_EVNTSEL0_ENABLE (1 << 22) -#define P6_EVNTSEL_INT (1 << 20) -#define P6_EVNTSEL_OS (1 << 17) -#define P6_EVNTSEL_USR (1 << 16) -#define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79 -#define P6_NMI_EVENT P6_EVENT_CPU_CLOCKS_NOT_HALTED - -static int setup_p6_watchdog(unsigned nmi_hz) -{ - unsigned int perfctr_msr, evntsel_msr; - unsigned int evntsel; - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - perfctr_msr = wd_ops->perfctr; - evntsel_msr = wd_ops->evntsel; - - /* KVM doesn't implement this MSR */ - if (wrmsr_safe(perfctr_msr, 0, 0) < 0) - return 0; - - evntsel = P6_EVNTSEL_INT - | P6_EVNTSEL_OS - | P6_EVNTSEL_USR - | P6_NMI_EVENT; - - /* setup the timer */ - wrmsr(evntsel_msr, evntsel, 0); - nmi_hz = adjust_for_32bit_ctr(nmi_hz); - write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0", nmi_hz); - - /* initialize the wd struct before enabling */ - wd->perfctr_msr = perfctr_msr; - wd->evntsel_msr = evntsel_msr; - wd->cccr_msr = 0; /* unused */ - - /* ok, everything is initialized, announce that we're set */ - cpu_nmi_set_wd_enabled(); - - apic_write(APIC_LVTPC, APIC_DM_NMI); - evntsel |= P6_EVNTSEL0_ENABLE; - wrmsr(evntsel_msr, evntsel, 0); - - return 1; -} - -static void __kprobes p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) -{ - /* - * P6 based Pentium M need to re-unmask - * the apic vector but it doesn't hurt - * other P6 variant. - * ArchPerfom/Core Duo also needs this - */ - apic_write(APIC_LVTPC, APIC_DM_NMI); - - /* P6/ARCH_PERFMON has 32 bit counter write */ - write_watchdog_counter32(wd->perfctr_msr, NULL, nmi_hz); -} - -static const struct wd_ops p6_wd_ops = { - .reserve = single_msr_reserve, - .unreserve = single_msr_unreserve, - .setup = setup_p6_watchdog, - .rearm = p6_rearm, - .stop = single_msr_stop_watchdog, - .perfctr = MSR_P6_PERFCTR0, - .evntsel = MSR_P6_EVNTSEL0, - .checkbit = 1ULL << 39, -}; - -/* - * Intel P4 performance counters. - * By far the most complicated of all. - */ -#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1 << 7) -#define P4_ESCR_EVENT_SELECT(N) ((N) << 25) -#define P4_ESCR_OS (1 << 3) -#define P4_ESCR_USR (1 << 2) -#define P4_CCCR_OVF_PMI0 (1 << 26) -#define P4_CCCR_OVF_PMI1 (1 << 27) -#define P4_CCCR_THRESHOLD(N) ((N) << 20) -#define P4_CCCR_COMPLEMENT (1 << 19) -#define P4_CCCR_COMPARE (1 << 18) -#define P4_CCCR_REQUIRED (3 << 16) -#define P4_CCCR_ESCR_SELECT(N) ((N) << 13) -#define P4_CCCR_ENABLE (1 << 12) -#define P4_CCCR_OVF (1 << 31) - -#define P4_CONTROLS 18 -static unsigned int p4_controls[18] = { - MSR_P4_BPU_CCCR0, - MSR_P4_BPU_CCCR1, - MSR_P4_BPU_CCCR2, - MSR_P4_BPU_CCCR3, - MSR_P4_MS_CCCR0, - MSR_P4_MS_CCCR1, - MSR_P4_MS_CCCR2, - MSR_P4_MS_CCCR3, - MSR_P4_FLAME_CCCR0, - MSR_P4_FLAME_CCCR1, - MSR_P4_FLAME_CCCR2, - MSR_P4_FLAME_CCCR3, - MSR_P4_IQ_CCCR0, - MSR_P4_IQ_CCCR1, - MSR_P4_IQ_CCCR2, - MSR_P4_IQ_CCCR3, - MSR_P4_IQ_CCCR4, - MSR_P4_IQ_CCCR5, -}; -/* - * Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter - * CRU_ESCR0 (with any non-null event selector) through a complemented - * max threshold. [IA32-Vol3, Section 14.9.9] - */ -static int setup_p4_watchdog(unsigned nmi_hz) -{ - unsigned int perfctr_msr, evntsel_msr, cccr_msr; - unsigned int evntsel, cccr_val; - unsigned int misc_enable, dummy; - unsigned int ht_num; - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy); - if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL)) - return 0; - -#ifdef CONFIG_SMP - /* detect which hyperthread we are on */ - if (smp_num_siblings == 2) { - unsigned int ebx, apicid; - - ebx = cpuid_ebx(1); - apicid = (ebx >> 24) & 0xff; - ht_num = apicid & 1; - } else -#endif - ht_num = 0; - - /* - * performance counters are shared resources - * assign each hyperthread its own set - * (re-use the ESCR0 register, seems safe - * and keeps the cccr_val the same) - */ - if (!ht_num) { - /* logical cpu 0 */ - perfctr_msr = MSR_P4_IQ_PERFCTR0; - evntsel_msr = MSR_P4_CRU_ESCR0; - cccr_msr = MSR_P4_IQ_CCCR0; - cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4); - - /* - * If we're on the kdump kernel or other situation, we may - * still have other performance counter registers set to - * interrupt and they'll keep interrupting forever because - * of the P4_CCCR_OVF quirk. So we need to ACK all the - * pending interrupts and disable all the registers here, - * before reenabling the NMI delivery. Refer to p4_rearm() - * about the P4_CCCR_OVF quirk. - */ - if (reset_devices) { - unsigned int low, high; - int i; - - for (i = 0; i < P4_CONTROLS; i++) { - rdmsr(p4_controls[i], low, high); - low &= ~(P4_CCCR_ENABLE | P4_CCCR_OVF); - wrmsr(p4_controls[i], low, high); - } - } - } else { - /* logical cpu 1 */ - perfctr_msr = MSR_P4_IQ_PERFCTR1; - evntsel_msr = MSR_P4_CRU_ESCR0; - cccr_msr = MSR_P4_IQ_CCCR1; - - /* Pentium 4 D processors don't support P4_CCCR_OVF_PMI1 */ - if (boot_cpu_data.x86_model == 4 && boot_cpu_data.x86_mask == 4) - cccr_val = P4_CCCR_OVF_PMI0; - else - cccr_val = P4_CCCR_OVF_PMI1; - cccr_val |= P4_CCCR_ESCR_SELECT(4); - } - - evntsel = P4_ESCR_EVENT_SELECT(0x3F) - | P4_ESCR_OS - | P4_ESCR_USR; - - cccr_val |= P4_CCCR_THRESHOLD(15) - | P4_CCCR_COMPLEMENT - | P4_CCCR_COMPARE - | P4_CCCR_REQUIRED; - - wrmsr(evntsel_msr, evntsel, 0); - wrmsr(cccr_msr, cccr_val, 0); - write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0", nmi_hz); - - wd->perfctr_msr = perfctr_msr; - wd->evntsel_msr = evntsel_msr; - wd->cccr_msr = cccr_msr; - - /* ok, everything is initialized, announce that we're set */ - cpu_nmi_set_wd_enabled(); - - apic_write(APIC_LVTPC, APIC_DM_NMI); - cccr_val |= P4_CCCR_ENABLE; - wrmsr(cccr_msr, cccr_val, 0); - return 1; -} - -static void stop_p4_watchdog(void) -{ - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - wrmsr(wd->cccr_msr, 0, 0); - wrmsr(wd->evntsel_msr, 0, 0); -} - -static int p4_reserve(void) -{ - if (!reserve_perfctr_nmi(MSR_P4_IQ_PERFCTR0)) - return 0; -#ifdef CONFIG_SMP - if (smp_num_siblings > 1 && !reserve_perfctr_nmi(MSR_P4_IQ_PERFCTR1)) - goto fail1; -#endif - if (!reserve_evntsel_nmi(MSR_P4_CRU_ESCR0)) - goto fail2; - /* RED-PEN why is ESCR1 not reserved here? */ - return 1; - fail2: -#ifdef CONFIG_SMP - if (smp_num_siblings > 1) - release_perfctr_nmi(MSR_P4_IQ_PERFCTR1); - fail1: -#endif - release_perfctr_nmi(MSR_P4_IQ_PERFCTR0); - return 0; -} - -static void p4_unreserve(void) -{ -#ifdef CONFIG_SMP - if (smp_num_siblings > 1) - release_perfctr_nmi(MSR_P4_IQ_PERFCTR1); -#endif - release_evntsel_nmi(MSR_P4_CRU_ESCR0); - release_perfctr_nmi(MSR_P4_IQ_PERFCTR0); -} - -static void __kprobes p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) -{ - unsigned dummy; - /* - * P4 quirks: - * - An overflown perfctr will assert its interrupt - * until the OVF flag in its CCCR is cleared. - * - LVTPC is masked on interrupt and must be - * unmasked by the LVTPC handler. - */ - rdmsrl(wd->cccr_msr, dummy); - dummy &= ~P4_CCCR_OVF; - wrmsrl(wd->cccr_msr, dummy); - apic_write(APIC_LVTPC, APIC_DM_NMI); - /* start the cycle over again */ - write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz); -} - -static const struct wd_ops p4_wd_ops = { - .reserve = p4_reserve, - .unreserve = p4_unreserve, - .setup = setup_p4_watchdog, - .rearm = p4_rearm, - .stop = stop_p4_watchdog, - /* RED-PEN this is wrong for the other sibling */ - .perfctr = MSR_P4_BPU_PERFCTR0, - .evntsel = MSR_P4_BSU_ESCR0, - .checkbit = 1ULL << 39, -}; - -/* - * Watchdog using the Intel architected PerfMon. - * Used for Core2 and hopefully all future Intel CPUs. - */ -#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL -#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK - -static struct wd_ops intel_arch_wd_ops; - -static int setup_intel_arch_watchdog(unsigned nmi_hz) -{ - unsigned int ebx; - union cpuid10_eax eax; - unsigned int unused; - unsigned int perfctr_msr, evntsel_msr; - unsigned int evntsel; - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - /* - * Check whether the Architectural PerfMon supports - * Unhalted Core Cycles Event or not. - * NOTE: Corresponding bit = 0 in ebx indicates event present. - */ - cpuid(10, &(eax.full), &ebx, &unused, &unused); - if ((eax.split.mask_length < - (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) || - (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) - return 0; - - perfctr_msr = wd_ops->perfctr; - evntsel_msr = wd_ops->evntsel; - - wrmsrl(perfctr_msr, 0UL); - - evntsel = ARCH_PERFMON_EVENTSEL_INT - | ARCH_PERFMON_EVENTSEL_OS - | ARCH_PERFMON_EVENTSEL_USR - | ARCH_PERFMON_NMI_EVENT_SEL - | ARCH_PERFMON_NMI_EVENT_UMASK; - - /* setup the timer */ - wrmsr(evntsel_msr, evntsel, 0); - nmi_hz = adjust_for_32bit_ctr(nmi_hz); - write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0", nmi_hz); - - wd->perfctr_msr = perfctr_msr; - wd->evntsel_msr = evntsel_msr; - wd->cccr_msr = 0; /* unused */ - - /* ok, everything is initialized, announce that we're set */ - cpu_nmi_set_wd_enabled(); - - apic_write(APIC_LVTPC, APIC_DM_NMI); - evntsel |= ARCH_PERFMON_EVENTSEL_ENABLE; - wrmsr(evntsel_msr, evntsel, 0); - intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1); - return 1; -} - -static struct wd_ops intel_arch_wd_ops __read_mostly = { - .reserve = single_msr_reserve, - .unreserve = single_msr_unreserve, - .setup = setup_intel_arch_watchdog, - .rearm = p6_rearm, - .stop = single_msr_stop_watchdog, - .perfctr = MSR_ARCH_PERFMON_PERFCTR1, - .evntsel = MSR_ARCH_PERFMON_EVENTSEL1, -}; - -static void probe_nmi_watchdog(void) -{ - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_AMD: - if (boot_cpu_data.x86 == 6 || - (boot_cpu_data.x86 >= 0xf && boot_cpu_data.x86 <= 0x15)) - wd_ops = &k7_wd_ops; - return; - case X86_VENDOR_INTEL: - /* Work around where perfctr1 doesn't have a working enable - * bit as described in the following errata: - * AE49 Core Duo and Intel Core Solo 65 nm - * AN49 Intel Pentium Dual-Core - * AF49 Dual-Core Intel Xeon Processor LV - */ - if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 14) || - ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 15 && - boot_cpu_data.x86_mask == 4))) { - intel_arch_wd_ops.perfctr = MSR_ARCH_PERFMON_PERFCTR0; - intel_arch_wd_ops.evntsel = MSR_ARCH_PERFMON_EVENTSEL0; - } - if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { - wd_ops = &intel_arch_wd_ops; - break; - } - switch (boot_cpu_data.x86) { - case 6: - if (boot_cpu_data.x86_model > 13) - return; - - wd_ops = &p6_wd_ops; - break; - case 15: - wd_ops = &p4_wd_ops; - break; - default: - return; - } - break; - } -} - -/* Interface to nmi.c */ - -int lapic_watchdog_init(unsigned nmi_hz) -{ - if (!wd_ops) { - probe_nmi_watchdog(); - if (!wd_ops) { - printk(KERN_INFO "NMI watchdog: CPU not supported\n"); - return -1; - } - - if (!wd_ops->reserve()) { - printk(KERN_ERR - "NMI watchdog: cannot reserve perfctrs\n"); - return -1; - } - } - - if (!(wd_ops->setup(nmi_hz))) { - printk(KERN_ERR "Cannot setup NMI watchdog on CPU %d\n", - raw_smp_processor_id()); - return -1; - } - - return 0; -} - -void lapic_watchdog_stop(void) -{ - if (wd_ops) - wd_ops->stop(); -} - -unsigned lapic_adjust_nmi_hz(unsigned hz) -{ - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - if (wd->perfctr_msr == MSR_P6_PERFCTR0 || - wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR1) - hz = adjust_for_32bit_ctr(hz); - return hz; -} - -int __kprobes lapic_wd_event(unsigned nmi_hz) -{ - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - u64 ctr; - - rdmsrl(wd->perfctr_msr, ctr); - if (ctr & wd_ops->checkbit) /* perfctr still running? */ - return 0; - - wd_ops->rearm(wd, nmi_hz); - return 1; -} diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 227b0448960..d22d0c4edcf 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -86,7 +86,7 @@ static void __init vmware_platform_setup(void) } /* - * While checking the dmi string infomation, just checking the product + * While checking the dmi string information, just checking the product * serial key should be enough, as this will always have a VMware * specific string when running under VMware hypervisor. */ diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 1b7b31ab7d8..212a6a42527 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -33,7 +33,6 @@ #include <linux/init.h> #include <linux/poll.h> #include <linux/smp.h> -#include <linux/smp_lock.h> #include <linux/major.h> #include <linux/fs.h> #include <linux/device.h> diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c index 67414550c3c..642f75a68cd 100644 --- a/arch/x86/kernel/crash_dump_32.c +++ b/arch/x86/kernel/crash_dump_32.c @@ -14,9 +14,6 @@ static void *kdump_buf_page; -/* Stores the physical address of elf header of crash image. */ -unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; - static inline bool is_crashed_pfn_valid(unsigned long pfn) { #ifndef CONFIG_X86_PAE @@ -61,7 +58,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, if (!is_crashed_pfn_valid(pfn)) return -EFAULT; - vaddr = kmap_atomic_pfn(pfn, KM_PTE0); + vaddr = kmap_atomic_pfn(pfn); if (!userbuf) { memcpy(buf, (vaddr + offset), csize); diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c index 994828899e0..afa64adb75e 100644 --- a/arch/x86/kernel/crash_dump_64.c +++ b/arch/x86/kernel/crash_dump_64.c @@ -10,9 +10,6 @@ #include <linux/uaccess.h> #include <linux/io.h> -/* Stores the physical address of elf header of crash image. */ -unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; - /** * copy_oldmem_page - copy one page from "oldmem" * @pfn: page frame number to be copied diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c new file mode 100644 index 00000000000..e90f08458e6 --- /dev/null +++ b/arch/x86/kernel/devicetree.c @@ -0,0 +1,439 @@ +/* + * Architecture specific OF callbacks. + */ +#include <linux/bootmem.h> +#include <linux/io.h> +#include <linux/interrupt.h> +#include <linux/list.h> +#include <linux/of.h> +#include <linux/of_fdt.h> +#include <linux/of_address.h> +#include <linux/of_platform.h> +#include <linux/of_irq.h> +#include <linux/slab.h> +#include <linux/pci.h> +#include <linux/of_pci.h> + +#include <asm/hpet.h> +#include <asm/irq_controller.h> +#include <asm/apic.h> +#include <asm/pci_x86.h> + +__initdata u64 initial_dtb; +char __initdata cmd_line[COMMAND_LINE_SIZE]; +static LIST_HEAD(irq_domains); +static DEFINE_RAW_SPINLOCK(big_irq_lock); + +int __initdata of_ioapic; + +#ifdef CONFIG_X86_IO_APIC +static void add_interrupt_host(struct irq_domain *ih) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&big_irq_lock, flags); + list_add(&ih->l, &irq_domains); + raw_spin_unlock_irqrestore(&big_irq_lock, flags); +} +#endif + +static struct irq_domain *get_ih_from_node(struct device_node *controller) +{ + struct irq_domain *ih, *found = NULL; + unsigned long flags; + + raw_spin_lock_irqsave(&big_irq_lock, flags); + list_for_each_entry(ih, &irq_domains, l) { + if (ih->controller == controller) { + found = ih; + break; + } + } + raw_spin_unlock_irqrestore(&big_irq_lock, flags); + return found; +} + +unsigned int irq_create_of_mapping(struct device_node *controller, + const u32 *intspec, unsigned int intsize) +{ + struct irq_domain *ih; + u32 virq, type; + int ret; + + ih = get_ih_from_node(controller); + if (!ih) + return 0; + ret = ih->xlate(ih, intspec, intsize, &virq, &type); + if (ret) + return 0; + if (type == IRQ_TYPE_NONE) + return virq; + irq_set_irq_type(virq, type); + return virq; +} +EXPORT_SYMBOL_GPL(irq_create_of_mapping); + +unsigned long pci_address_to_pio(phys_addr_t address) +{ + /* + * The ioport address can be directly used by inX / outX + */ + BUG_ON(address >= (1 << 16)); + return (unsigned long)address; +} +EXPORT_SYMBOL_GPL(pci_address_to_pio); + +void __init early_init_dt_scan_chosen_arch(unsigned long node) +{ + BUG(); +} + +void __init early_init_dt_add_memory_arch(u64 base, u64 size) +{ + BUG(); +} + +void * __init early_init_dt_alloc_memory_arch(u64 size, u64 align) +{ + return __alloc_bootmem(size, align, __pa(MAX_DMA_ADDRESS)); +} + +void __init add_dtb(u64 data) +{ + initial_dtb = data + offsetof(struct setup_data, data); +} + +/* + * CE4100 ids. Will be moved to machine_device_initcall() once we have it. + */ +static struct of_device_id __initdata ce4100_ids[] = { + { .compatible = "intel,ce4100-cp", }, + { .compatible = "isa", }, + { .compatible = "pci", }, + {}, +}; + +static int __init add_bus_probe(void) +{ + if (!of_have_populated_dt()) + return 0; + + return of_platform_bus_probe(NULL, ce4100_ids, NULL); +} +module_init(add_bus_probe); + +#ifdef CONFIG_PCI +static int x86_of_pci_irq_enable(struct pci_dev *dev) +{ + struct of_irq oirq; + u32 virq; + int ret; + u8 pin; + + ret = pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); + if (ret) + return ret; + if (!pin) + return 0; + + ret = of_irq_map_pci(dev, &oirq); + if (ret) + return ret; + + virq = irq_create_of_mapping(oirq.controller, oirq.specifier, + oirq.size); + if (virq == 0) + return -EINVAL; + dev->irq = virq; + return 0; +} + +static void x86_of_pci_irq_disable(struct pci_dev *dev) +{ +} + +void __cpuinit x86_of_pci_init(void) +{ + struct device_node *np; + + pcibios_enable_irq = x86_of_pci_irq_enable; + pcibios_disable_irq = x86_of_pci_irq_disable; + + for_each_node_by_type(np, "pci") { + const void *prop; + struct pci_bus *bus; + unsigned int bus_min; + struct device_node *child; + + prop = of_get_property(np, "bus-range", NULL); + if (!prop) + continue; + bus_min = be32_to_cpup(prop); + + bus = pci_find_bus(0, bus_min); + if (!bus) { + printk(KERN_ERR "Can't find a node for bus %s.\n", + np->full_name); + continue; + } + + if (bus->self) + bus->self->dev.of_node = np; + else + bus->dev.of_node = np; + + for_each_child_of_node(np, child) { + struct pci_dev *dev; + u32 devfn; + + prop = of_get_property(child, "reg", NULL); + if (!prop) + continue; + + devfn = (be32_to_cpup(prop) >> 8) & 0xff; + dev = pci_get_slot(bus, devfn); + if (!dev) + continue; + dev->dev.of_node = child; + pci_dev_put(dev); + } + } +} +#endif + +static void __init dtb_setup_hpet(void) +{ +#ifdef CONFIG_HPET_TIMER + struct device_node *dn; + struct resource r; + int ret; + + dn = of_find_compatible_node(NULL, NULL, "intel,ce4100-hpet"); + if (!dn) + return; + ret = of_address_to_resource(dn, 0, &r); + if (ret) { + WARN_ON(1); + return; + } + hpet_address = r.start; +#endif +} + +static void __init dtb_lapic_setup(void) +{ +#ifdef CONFIG_X86_LOCAL_APIC + struct device_node *dn; + struct resource r; + int ret; + + dn = of_find_compatible_node(NULL, NULL, "intel,ce4100-lapic"); + if (!dn) + return; + + ret = of_address_to_resource(dn, 0, &r); + if (WARN_ON(ret)) + return; + + /* Did the boot loader setup the local APIC ? */ + if (!cpu_has_apic) { + if (apic_force_enable(r.start)) + return; + } + smp_found_config = 1; + pic_mode = 1; + register_lapic_address(r.start); + generic_processor_info(boot_cpu_physical_apicid, + GET_APIC_VERSION(apic_read(APIC_LVR))); +#endif +} + +#ifdef CONFIG_X86_IO_APIC +static unsigned int ioapic_id; + +static void __init dtb_add_ioapic(struct device_node *dn) +{ + struct resource r; + int ret; + + ret = of_address_to_resource(dn, 0, &r); + if (ret) { + printk(KERN_ERR "Can't obtain address from node %s.\n", + dn->full_name); + return; + } + mp_register_ioapic(++ioapic_id, r.start, gsi_top); +} + +static void __init dtb_ioapic_setup(void) +{ + struct device_node *dn; + + for_each_compatible_node(dn, NULL, "intel,ce4100-ioapic") + dtb_add_ioapic(dn); + + if (nr_ioapics) { + of_ioapic = 1; + return; + } + printk(KERN_ERR "Error: No information about IO-APIC in OF.\n"); +} +#else +static void __init dtb_ioapic_setup(void) {} +#endif + +static void __init dtb_apic_setup(void) +{ + dtb_lapic_setup(); + dtb_ioapic_setup(); +} + +#ifdef CONFIG_OF_FLATTREE +static void __init x86_flattree_get_config(void) +{ + u32 size, map_len; + void *new_dtb; + + if (!initial_dtb) + return; + + map_len = max(PAGE_SIZE - (initial_dtb & ~PAGE_MASK), + (u64)sizeof(struct boot_param_header)); + + initial_boot_params = early_memremap(initial_dtb, map_len); + size = be32_to_cpu(initial_boot_params->totalsize); + if (map_len < size) { + early_iounmap(initial_boot_params, map_len); + initial_boot_params = early_memremap(initial_dtb, size); + map_len = size; + } + + new_dtb = alloc_bootmem(size); + memcpy(new_dtb, initial_boot_params, size); + early_iounmap(initial_boot_params, map_len); + + initial_boot_params = new_dtb; + + /* root level address cells */ + of_scan_flat_dt(early_init_dt_scan_root, NULL); + + unflatten_device_tree(); +} +#else +static inline void x86_flattree_get_config(void) { } +#endif + +void __init x86_dtb_init(void) +{ + x86_flattree_get_config(); + + if (!of_have_populated_dt()) + return; + + dtb_setup_hpet(); + dtb_apic_setup(); +} + +#ifdef CONFIG_X86_IO_APIC + +struct of_ioapic_type { + u32 out_type; + u32 trigger; + u32 polarity; +}; + +static struct of_ioapic_type of_ioapic_type[] = +{ + { + .out_type = IRQ_TYPE_EDGE_RISING, + .trigger = IOAPIC_EDGE, + .polarity = 1, + }, + { + .out_type = IRQ_TYPE_LEVEL_LOW, + .trigger = IOAPIC_LEVEL, + .polarity = 0, + }, + { + .out_type = IRQ_TYPE_LEVEL_HIGH, + .trigger = IOAPIC_LEVEL, + .polarity = 1, + }, + { + .out_type = IRQ_TYPE_EDGE_FALLING, + .trigger = IOAPIC_EDGE, + .polarity = 0, + }, +}; + +static int ioapic_xlate(struct irq_domain *id, const u32 *intspec, u32 intsize, + u32 *out_hwirq, u32 *out_type) +{ + struct io_apic_irq_attr attr; + struct of_ioapic_type *it; + u32 line, idx, type; + + if (intsize < 2) + return -EINVAL; + + line = *intspec; + idx = (u32) id->priv; + *out_hwirq = line + mp_gsi_routing[idx].gsi_base; + + intspec++; + type = *intspec; + + if (type >= ARRAY_SIZE(of_ioapic_type)) + return -EINVAL; + + it = of_ioapic_type + type; + *out_type = it->out_type; + + set_io_apic_irq_attr(&attr, idx, line, it->trigger, it->polarity); + + return io_apic_setup_irq_pin_once(*out_hwirq, cpu_to_node(0), &attr); +} + +static void __init ioapic_add_ofnode(struct device_node *np) +{ + struct resource r; + int i, ret; + + ret = of_address_to_resource(np, 0, &r); + if (ret) { + printk(KERN_ERR "Failed to obtain address for %s\n", + np->full_name); + return; + } + + for (i = 0; i < nr_ioapics; i++) { + if (r.start == mp_ioapics[i].apicaddr) { + struct irq_domain *id; + + id = kzalloc(sizeof(*id), GFP_KERNEL); + BUG_ON(!id); + id->controller = np; + id->xlate = ioapic_xlate; + id->priv = (void *)i; + add_interrupt_host(id); + return; + } + } + printk(KERN_ERR "IOxAPIC at %s is not registered.\n", np->full_name); +} + +void __init x86_add_irq_domains(void) +{ + struct device_node *dp; + + if (!of_have_populated_dt()) + return; + + for_each_node_with_property(dp, "interrupt-controller") { + if (of_device_is_compatible(dp, "intel,ce4100-ioapic")) + ioapic_add_ofnode(dp); + } +} +#else +void __init x86_add_irq_domains(void) { } +#endif diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 6e8752c1bd5..1aae78f775f 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -27,7 +27,7 @@ static int die_counter; void printk_address(unsigned long address, int reliable) { - printk(" [<%p>] %s%pS\n", (void *) address, + printk(" [<%p>] %s%pB\n", (void *) address, reliable ? "" : "? ", (void *) address); } @@ -135,20 +135,6 @@ print_context_stack_bp(struct thread_info *tinfo, } EXPORT_SYMBOL_GPL(print_context_stack_bp); - -static void -print_trace_warning_symbol(void *data, char *msg, unsigned long symbol) -{ - printk(data); - print_symbol(msg, symbol); - printk("\n"); -} - -static void print_trace_warning(void *data, char *msg) -{ - printk("%s%s\n", (char *)data, msg); -} - static int print_trace_stack(void *data, char *name) { printk("%s <%s> ", (char *)data, name); @@ -166,8 +152,6 @@ static void print_trace_address(void *data, unsigned long addr, int reliable) } static const struct stacktrace_ops print_trace_ops = { - .warning = print_trace_warning, - .warning_symbol = print_trace_warning_symbol, .stack = print_trace_stack, .address = print_trace_address, .walk_stack = print_context_stack, @@ -197,14 +181,10 @@ void show_stack(struct task_struct *task, unsigned long *sp) */ void dump_stack(void) { - unsigned long bp = 0; + unsigned long bp; unsigned long stack; -#ifdef CONFIG_FRAME_POINTER - if (!bp) - get_bp(bp); -#endif - + bp = stack_frame(current, NULL); printk("Pid: %d, comm: %.20s %s %s %.*s\n", current->pid, current->comm, print_tainted(), init_utsname()->release, @@ -240,6 +220,7 @@ unsigned __kprobes long oops_begin(void) bust_spinlocks(1); return flags; } +EXPORT_SYMBOL_GPL(oops_begin); void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) { @@ -282,7 +263,6 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err) printk("DEBUG_PAGEALLOC"); #endif printk("\n"); - sysfs_printk_last_file(); if (notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) return 1; @@ -325,41 +305,6 @@ void die(const char *str, struct pt_regs *regs, long err) oops_end(flags, regs, sig); } -void notrace __kprobes -die_nmi(char *str, struct pt_regs *regs, int do_panic) -{ - unsigned long flags; - - if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) - return; - - /* - * We are in trouble anyway, lets at least try - * to get a message out. - */ - flags = oops_begin(); - printk(KERN_EMERG "%s", str); - printk(" on CPU%d, ip %08lx, registers:\n", - smp_processor_id(), regs->ip); - show_registers(regs); - oops_end(flags, regs, 0); - if (do_panic || panic_on_oops) - panic("Non maskable interrupt"); - nmi_exit(); - local_irq_enable(); - do_exit(SIGBUS); -} - -static int __init oops_setup(char *s) -{ - if (!s) - return -EINVAL; - if (!strcmp(s, "panic")) - panic_on_oops = 1; - return 0; -} -early_param("oops", oops_setup); - static int __init kstack_setup(char *s) { if (!s) diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 0f6376ffa2d..3b97a80ce32 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -34,17 +34,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, stack = (unsigned long *)task->thread.sp; } -#ifdef CONFIG_FRAME_POINTER - if (!bp) { - if (task == current) { - /* Grab bp right from our regs */ - get_bp(bp); - } else { - /* bp is the last reg pushed by switch_to */ - bp = *(unsigned long *) task->thread.sp; - } - } -#endif + if (!bp) + bp = stack_frame(task, regs); for (;;) { struct thread_info *context; @@ -82,11 +73,11 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, if (kstack_end(stack)) break; if (i && ((i % STACKSLOTS_PER_LINE) == 0)) - printk("\n%s", log_lvl); - printk(" %08lx", *stack++); + printk(KERN_CONT "\n"); + printk(KERN_CONT " %08lx", *stack++); touch_nmi_watchdog(); } - printk("\n"); + printk(KERN_CONT "\n"); show_trace_log_lvl(task, regs, sp, bp, log_lvl); } @@ -112,8 +103,7 @@ void show_registers(struct pt_regs *regs) u8 *ip; printk(KERN_EMERG "Stack:\n"); - show_stack_log_lvl(NULL, regs, ®s->sp, - 0, KERN_EMERG); + show_stack_log_lvl(NULL, regs, ®s->sp, 0, KERN_EMERG); printk(KERN_EMERG "Code: "); diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 57a21f11c79..e71c98d3c0d 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -149,29 +149,19 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned used = 0; struct thread_info *tinfo; int graph = 0; + unsigned long dummy; if (!task) task = current; if (!stack) { - unsigned long dummy; stack = &dummy; if (task && task != current) stack = (unsigned long *)task->thread.sp; } -#ifdef CONFIG_FRAME_POINTER - if (!bp) { - if (task == current) { - /* Grab bp right from our regs */ - get_bp(bp); - } else { - /* bp is the last reg pushed by switch_to */ - bp = *(unsigned long *) task->thread.sp; - } - } -#endif - + if (!bp) + bp = stack_frame(task, regs); /* * Print function call entries in all stacks, starting at the * current stack address. If the stacks consist of nested @@ -265,20 +255,20 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, if (stack >= irq_stack && stack <= irq_stack_end) { if (stack == irq_stack_end) { stack = (unsigned long *) (irq_stack_end[-1]); - printk(" <EOI> "); + printk(KERN_CONT " <EOI> "); } } else { if (((long) stack & (THREAD_SIZE-1)) == 0) break; } if (i && ((i % STACKSLOTS_PER_LINE) == 0)) - printk("\n%s", log_lvl); - printk(" %016lx", *stack++); + printk(KERN_CONT "\n"); + printk(KERN_CONT " %016lx", *stack++); touch_nmi_watchdog(); } preempt_enable(); - printk("\n"); + printk(KERN_CONT "\n"); show_trace_log_lvl(task, regs, sp, bp, log_lvl); } @@ -308,7 +298,7 @@ void show_registers(struct pt_regs *regs) printk(KERN_EMERG "Stack:\n"); show_stack_log_lvl(NULL, regs, (unsigned long *)sp, - regs->bp, KERN_EMERG); + 0, KERN_EMERG); printk(KERN_EMERG "Code: "); diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 0c2b7ef7a34..3e2ef842531 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -11,9 +11,11 @@ #include <linux/kernel.h> #include <linux/types.h> #include <linux/init.h> +#include <linux/crash_dump.h> #include <linux/bootmem.h> #include <linux/pfn.h> #include <linux/suspend.h> +#include <linux/acpi.h> #include <linux/firmware-map.h> #include <linux/memblock.h> @@ -666,21 +668,15 @@ __init void e820_setup_gap(void) * boot_params.e820_map, others are passed via SETUP_E820_EXT node of * linked list of struct setup_data, which is parsed here. */ -void __init parse_e820_ext(struct setup_data *sdata, unsigned long pa_data) +void __init parse_e820_ext(struct setup_data *sdata) { - u32 map_len; int entries; struct e820entry *extmap; entries = sdata->len / sizeof(struct e820entry); - map_len = sdata->len + sizeof(struct setup_data); - if (map_len > PAGE_SIZE) - sdata = early_ioremap(pa_data, map_len); extmap = (struct e820entry *)(sdata->data); __append_e820_map(extmap, entries); sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); - if (map_len > PAGE_SIZE) - early_iounmap(sdata, map_len); printk(KERN_INFO "extended physical RAM map:\n"); e820_print_map("extended"); } @@ -846,15 +842,21 @@ static int __init parse_memopt(char *p) if (!p) return -EINVAL; -#ifdef CONFIG_X86_32 if (!strcmp(p, "nopentium")) { +#ifdef CONFIG_X86_32 setup_clear_cpu_cap(X86_FEATURE_PSE); return 0; - } +#else + printk(KERN_WARNING "mem=nopentium ignored! (only supported on x86_32)\n"); + return -EINVAL; #endif + } userdef = 1; mem_size = memparse(p, &p); + /* don't remove all of memory when handling "mem={invalid}" param */ + if (mem_size == 0) + return -EINVAL; e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1); return 0; diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 76b8cd953de..3755ef49439 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -143,15 +143,10 @@ static void __init ati_bugs(int num, int slot, int func) static u32 __init ati_sbx00_rev(int num, int slot, int func) { - u32 old, d; + u32 d; - d = read_pci_config(num, slot, func, 0x70); - old = d; - d &= ~(1<<8); - write_pci_config(num, slot, func, 0x70, d); d = read_pci_config(num, slot, func, 0x8); d &= 0xff; - write_pci_config(num, slot, func, 0x70, old); return d; } @@ -160,11 +155,19 @@ static void __init ati_bugs_contd(int num, int slot, int func) { u32 d, rev; - if (acpi_use_timer_override) + rev = ati_sbx00_rev(num, slot, func); + if (rev >= 0x40) + acpi_fix_pin2_polarity = 1; + + /* + * SB600: revisions 0x11, 0x12, 0x13, 0x14, ... + * SB700: revisions 0x39, 0x3a, ... + * SB800: revisions 0x40, 0x41, ... + */ + if (rev >= 0x39) return; - rev = ati_sbx00_rev(num, slot, func); - if (rev > 0x13) + if (acpi_use_timer_override) return; /* check for IRQ0 interrupt swap */ diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 4572f25f932..cd28a350f7f 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -240,7 +240,7 @@ static int __init setup_early_printk(char *buf) if (!strncmp(buf, "xen", 3)) early_console_register(&xenboot_console, keep); #endif -#ifdef CONFIG_X86_MRST_EARLY_PRINTK +#ifdef CONFIG_EARLY_PRINTK_MRST if (!strncmp(buf, "mrst", 4)) { mrst_early_console_init(); early_console_register(&early_mrst_console, keep); @@ -250,7 +250,6 @@ static int __init setup_early_printk(char *buf) hsu_early_console_init(); early_console_register(&early_hsu_console, keep); } - #endif buf++; } diff --git a/arch/x86/kernel/early_printk_mrst.c b/arch/x86/kernel/early_printk_mrst.c deleted file mode 100644 index 65df603622b..00000000000 --- a/arch/x86/kernel/early_printk_mrst.c +++ /dev/null @@ -1,319 +0,0 @@ -/* - * early_printk_mrst.c - early consoles for Intel MID platforms - * - * Copyright (c) 2008-2010, Intel Corporation - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. - */ - -/* - * This file implements two early consoles named mrst and hsu. - * mrst is based on Maxim3110 spi-uart device, it exists in both - * Moorestown and Medfield platforms, while hsu is based on a High - * Speed UART device which only exists in the Medfield platform - */ - -#include <linux/serial_reg.h> -#include <linux/serial_mfd.h> -#include <linux/kmsg_dump.h> -#include <linux/console.h> -#include <linux/kernel.h> -#include <linux/delay.h> -#include <linux/init.h> -#include <linux/io.h> - -#include <asm/fixmap.h> -#include <asm/pgtable.h> -#include <asm/mrst.h> - -#define MRST_SPI_TIMEOUT 0x200000 -#define MRST_REGBASE_SPI0 0xff128000 -#define MRST_REGBASE_SPI1 0xff128400 -#define MRST_CLK_SPI0_REG 0xff11d86c - -/* Bit fields in CTRLR0 */ -#define SPI_DFS_OFFSET 0 - -#define SPI_FRF_OFFSET 4 -#define SPI_FRF_SPI 0x0 -#define SPI_FRF_SSP 0x1 -#define SPI_FRF_MICROWIRE 0x2 -#define SPI_FRF_RESV 0x3 - -#define SPI_MODE_OFFSET 6 -#define SPI_SCPH_OFFSET 6 -#define SPI_SCOL_OFFSET 7 -#define SPI_TMOD_OFFSET 8 -#define SPI_TMOD_TR 0x0 /* xmit & recv */ -#define SPI_TMOD_TO 0x1 /* xmit only */ -#define SPI_TMOD_RO 0x2 /* recv only */ -#define SPI_TMOD_EPROMREAD 0x3 /* eeprom read mode */ - -#define SPI_SLVOE_OFFSET 10 -#define SPI_SRL_OFFSET 11 -#define SPI_CFS_OFFSET 12 - -/* Bit fields in SR, 7 bits */ -#define SR_MASK 0x7f /* cover 7 bits */ -#define SR_BUSY (1 << 0) -#define SR_TF_NOT_FULL (1 << 1) -#define SR_TF_EMPT (1 << 2) -#define SR_RF_NOT_EMPT (1 << 3) -#define SR_RF_FULL (1 << 4) -#define SR_TX_ERR (1 << 5) -#define SR_DCOL (1 << 6) - -struct dw_spi_reg { - u32 ctrl0; - u32 ctrl1; - u32 ssienr; - u32 mwcr; - u32 ser; - u32 baudr; - u32 txfltr; - u32 rxfltr; - u32 txflr; - u32 rxflr; - u32 sr; - u32 imr; - u32 isr; - u32 risr; - u32 txoicr; - u32 rxoicr; - u32 rxuicr; - u32 msticr; - u32 icr; - u32 dmacr; - u32 dmatdlr; - u32 dmardlr; - u32 idr; - u32 version; - - /* Currently operates as 32 bits, though only the low 16 bits matter */ - u32 dr; -} __packed; - -#define dw_readl(dw, name) __raw_readl(&(dw)->name) -#define dw_writel(dw, name, val) __raw_writel((val), &(dw)->name) - -/* Default use SPI0 register for mrst, we will detect Penwell and use SPI1 */ -static unsigned long mrst_spi_paddr = MRST_REGBASE_SPI0; - -static u32 *pclk_spi0; -/* Always contains an accessable address, start with 0 */ -static struct dw_spi_reg *pspi; - -static struct kmsg_dumper dw_dumper; -static int dumper_registered; - -static void dw_kmsg_dump(struct kmsg_dumper *dumper, - enum kmsg_dump_reason reason, - const char *s1, unsigned long l1, - const char *s2, unsigned long l2) -{ - int i; - - /* When run to this, we'd better re-init the HW */ - mrst_early_console_init(); - - for (i = 0; i < l1; i++) - early_mrst_console.write(&early_mrst_console, s1 + i, 1); - for (i = 0; i < l2; i++) - early_mrst_console.write(&early_mrst_console, s2 + i, 1); -} - -/* Set the ratio rate to 115200, 8n1, IRQ disabled */ -static void max3110_write_config(void) -{ - u16 config; - - config = 0xc001; - dw_writel(pspi, dr, config); -} - -/* Translate char to a eligible word and send to max3110 */ -static void max3110_write_data(char c) -{ - u16 data; - - data = 0x8000 | c; - dw_writel(pspi, dr, data); -} - -void mrst_early_console_init(void) -{ - u32 ctrlr0 = 0; - u32 spi0_cdiv; - u32 freq; /* Freqency info only need be searched once */ - - /* Base clk is 100 MHz, the actual clk = 100M / (clk_divider + 1) */ - pclk_spi0 = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE, - MRST_CLK_SPI0_REG); - spi0_cdiv = ((*pclk_spi0) & 0xe00) >> 9; - freq = 100000000 / (spi0_cdiv + 1); - - if (mrst_identify_cpu() == MRST_CPU_CHIP_PENWELL) - mrst_spi_paddr = MRST_REGBASE_SPI1; - - pspi = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE, - mrst_spi_paddr); - - /* Disable SPI controller */ - dw_writel(pspi, ssienr, 0); - - /* Set control param, 8 bits, transmit only mode */ - ctrlr0 = dw_readl(pspi, ctrl0); - - ctrlr0 &= 0xfcc0; - ctrlr0 |= 0xf | (SPI_FRF_SPI << SPI_FRF_OFFSET) - | (SPI_TMOD_TO << SPI_TMOD_OFFSET); - dw_writel(pspi, ctrl0, ctrlr0); - - /* - * Change the spi0 clk to comply with 115200 bps, use 100000 to - * calculate the clk dividor to make the clock a little slower - * than real baud rate. - */ - dw_writel(pspi, baudr, freq/100000); - - /* Disable all INT for early phase */ - dw_writel(pspi, imr, 0x0); - - /* Set the cs to spi-uart */ - dw_writel(pspi, ser, 0x2); - - /* Enable the HW, the last step for HW init */ - dw_writel(pspi, ssienr, 0x1); - - /* Set the default configuration */ - max3110_write_config(); - - /* Register the kmsg dumper */ - if (!dumper_registered) { - dw_dumper.dump = dw_kmsg_dump; - kmsg_dump_register(&dw_dumper); - dumper_registered = 1; - } -} - -/* Slave select should be called in the read/write function */ -static void early_mrst_spi_putc(char c) -{ - unsigned int timeout; - u32 sr; - - timeout = MRST_SPI_TIMEOUT; - /* Early putc needs to make sure the TX FIFO is not full */ - while (--timeout) { - sr = dw_readl(pspi, sr); - if (!(sr & SR_TF_NOT_FULL)) - cpu_relax(); - else - break; - } - - if (!timeout) - pr_warning("MRST earlycon: timed out\n"); - else - max3110_write_data(c); -} - -/* Early SPI only uses polling mode */ -static void early_mrst_spi_write(struct console *con, const char *str, unsigned n) -{ - int i; - - for (i = 0; i < n && *str; i++) { - if (*str == '\n') - early_mrst_spi_putc('\r'); - early_mrst_spi_putc(*str); - str++; - } -} - -struct console early_mrst_console = { - .name = "earlymrst", - .write = early_mrst_spi_write, - .flags = CON_PRINTBUFFER, - .index = -1, -}; - -/* - * Following is the early console based on Medfield HSU (High - * Speed UART) device. - */ -#define HSU_PORT2_PADDR 0xffa28180 - -static void __iomem *phsu; - -void hsu_early_console_init(void) -{ - u8 lcr; - - phsu = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE, - HSU_PORT2_PADDR); - - /* Disable FIFO */ - writeb(0x0, phsu + UART_FCR); - - /* Set to default 115200 bps, 8n1 */ - lcr = readb(phsu + UART_LCR); - writeb((0x80 | lcr), phsu + UART_LCR); - writeb(0x18, phsu + UART_DLL); - writeb(lcr, phsu + UART_LCR); - writel(0x3600, phsu + UART_MUL*4); - - writeb(0x8, phsu + UART_MCR); - writeb(0x7, phsu + UART_FCR); - writeb(0x3, phsu + UART_LCR); - - /* Clear IRQ status */ - readb(phsu + UART_LSR); - readb(phsu + UART_RX); - readb(phsu + UART_IIR); - readb(phsu + UART_MSR); - - /* Enable FIFO */ - writeb(0x7, phsu + UART_FCR); -} - -#define BOTH_EMPTY (UART_LSR_TEMT | UART_LSR_THRE) - -static void early_hsu_putc(char ch) -{ - unsigned int timeout = 10000; /* 10ms */ - u8 status; - - while (--timeout) { - status = readb(phsu + UART_LSR); - if (status & BOTH_EMPTY) - break; - udelay(1); - } - - /* Only write the char when there was no timeout */ - if (timeout) - writeb(ch, phsu + UART_TX); -} - -static void early_hsu_write(struct console *con, const char *str, unsigned n) -{ - int i; - - for (i = 0; i < n && *str; i++) { - if (*str == '\n') - early_hsu_putc('\r'); - early_hsu_putc(*str); - str++; - } -} - -struct console early_hsu_console = { - .name = "earlyhsu", - .write = early_hsu_write, - .flags = CON_PRINTBUFFER, - .index = -1, -}; diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c deleted file mode 100644 index 0fe27d7c625..00000000000 --- a/arch/x86/kernel/efi.c +++ /dev/null @@ -1,613 +0,0 @@ -/* - * Common EFI (Extensible Firmware Interface) support functions - * Based on Extensible Firmware Interface Specification version 1.0 - * - * Copyright (C) 1999 VA Linux Systems - * Copyright (C) 1999 Walt Drummond <drummond@valinux.com> - * Copyright (C) 1999-2002 Hewlett-Packard Co. - * David Mosberger-Tang <davidm@hpl.hp.com> - * Stephane Eranian <eranian@hpl.hp.com> - * Copyright (C) 2005-2008 Intel Co. - * Fenghua Yu <fenghua.yu@intel.com> - * Bibo Mao <bibo.mao@intel.com> - * Chandramouli Narayanan <mouli@linux.intel.com> - * Huang Ying <ying.huang@intel.com> - * - * Copied from efi_32.c to eliminate the duplicated code between EFI - * 32/64 support code. --ying 2007-10-26 - * - * All EFI Runtime Services are not implemented yet as EFI only - * supports physical mode addressing on SoftSDV. This is to be fixed - * in a future version. --drummond 1999-07-20 - * - * Implemented EFI runtime services and virtual mode calls. --davidm - * - * Goutham Rao: <goutham.rao@intel.com> - * Skip non-WB memory and ignore empty memory ranges. - */ - -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/efi.h> -#include <linux/bootmem.h> -#include <linux/memblock.h> -#include <linux/spinlock.h> -#include <linux/uaccess.h> -#include <linux/time.h> -#include <linux/io.h> -#include <linux/reboot.h> -#include <linux/bcd.h> - -#include <asm/setup.h> -#include <asm/efi.h> -#include <asm/time.h> -#include <asm/cacheflush.h> -#include <asm/tlbflush.h> -#include <asm/x86_init.h> - -#define EFI_DEBUG 1 -#define PFX "EFI: " - -int efi_enabled; -EXPORT_SYMBOL(efi_enabled); - -struct efi efi; -EXPORT_SYMBOL(efi); - -struct efi_memory_map memmap; - -static struct efi efi_phys __initdata; -static efi_system_table_t efi_systab __initdata; - -static int __init setup_noefi(char *arg) -{ - efi_enabled = 0; - return 0; -} -early_param("noefi", setup_noefi); - -int add_efi_memmap; -EXPORT_SYMBOL(add_efi_memmap); - -static int __init setup_add_efi_memmap(char *arg) -{ - add_efi_memmap = 1; - return 0; -} -early_param("add_efi_memmap", setup_add_efi_memmap); - - -static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc) -{ - return efi_call_virt2(get_time, tm, tc); -} - -static efi_status_t virt_efi_set_time(efi_time_t *tm) -{ - return efi_call_virt1(set_time, tm); -} - -static efi_status_t virt_efi_get_wakeup_time(efi_bool_t *enabled, - efi_bool_t *pending, - efi_time_t *tm) -{ - return efi_call_virt3(get_wakeup_time, - enabled, pending, tm); -} - -static efi_status_t virt_efi_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm) -{ - return efi_call_virt2(set_wakeup_time, - enabled, tm); -} - -static efi_status_t virt_efi_get_variable(efi_char16_t *name, - efi_guid_t *vendor, - u32 *attr, - unsigned long *data_size, - void *data) -{ - return efi_call_virt5(get_variable, - name, vendor, attr, - data_size, data); -} - -static efi_status_t virt_efi_get_next_variable(unsigned long *name_size, - efi_char16_t *name, - efi_guid_t *vendor) -{ - return efi_call_virt3(get_next_variable, - name_size, name, vendor); -} - -static efi_status_t virt_efi_set_variable(efi_char16_t *name, - efi_guid_t *vendor, - unsigned long attr, - unsigned long data_size, - void *data) -{ - return efi_call_virt5(set_variable, - name, vendor, attr, - data_size, data); -} - -static efi_status_t virt_efi_get_next_high_mono_count(u32 *count) -{ - return efi_call_virt1(get_next_high_mono_count, count); -} - -static void virt_efi_reset_system(int reset_type, - efi_status_t status, - unsigned long data_size, - efi_char16_t *data) -{ - efi_call_virt4(reset_system, reset_type, status, - data_size, data); -} - -static efi_status_t virt_efi_set_virtual_address_map( - unsigned long memory_map_size, - unsigned long descriptor_size, - u32 descriptor_version, - efi_memory_desc_t *virtual_map) -{ - return efi_call_virt4(set_virtual_address_map, - memory_map_size, descriptor_size, - descriptor_version, virtual_map); -} - -static efi_status_t __init phys_efi_set_virtual_address_map( - unsigned long memory_map_size, - unsigned long descriptor_size, - u32 descriptor_version, - efi_memory_desc_t *virtual_map) -{ - efi_status_t status; - - efi_call_phys_prelog(); - status = efi_call_phys4(efi_phys.set_virtual_address_map, - memory_map_size, descriptor_size, - descriptor_version, virtual_map); - efi_call_phys_epilog(); - return status; -} - -static efi_status_t __init phys_efi_get_time(efi_time_t *tm, - efi_time_cap_t *tc) -{ - efi_status_t status; - - efi_call_phys_prelog(); - status = efi_call_phys2(efi_phys.get_time, tm, tc); - efi_call_phys_epilog(); - return status; -} - -int efi_set_rtc_mmss(unsigned long nowtime) -{ - int real_seconds, real_minutes; - efi_status_t status; - efi_time_t eft; - efi_time_cap_t cap; - - status = efi.get_time(&eft, &cap); - if (status != EFI_SUCCESS) { - printk(KERN_ERR "Oops: efitime: can't read time!\n"); - return -1; - } - - real_seconds = nowtime % 60; - real_minutes = nowtime / 60; - if (((abs(real_minutes - eft.minute) + 15)/30) & 1) - real_minutes += 30; - real_minutes %= 60; - eft.minute = real_minutes; - eft.second = real_seconds; - - status = efi.set_time(&eft); - if (status != EFI_SUCCESS) { - printk(KERN_ERR "Oops: efitime: can't write time!\n"); - return -1; - } - return 0; -} - -unsigned long efi_get_time(void) -{ - efi_status_t status; - efi_time_t eft; - efi_time_cap_t cap; - - status = efi.get_time(&eft, &cap); - if (status != EFI_SUCCESS) - printk(KERN_ERR "Oops: efitime: can't read time!\n"); - - return mktime(eft.year, eft.month, eft.day, eft.hour, - eft.minute, eft.second); -} - -/* - * Tell the kernel about the EFI memory map. This might include - * more than the max 128 entries that can fit in the e820 legacy - * (zeropage) memory map. - */ - -static void __init do_add_efi_memmap(void) -{ - void *p; - - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - efi_memory_desc_t *md = p; - unsigned long long start = md->phys_addr; - unsigned long long size = md->num_pages << EFI_PAGE_SHIFT; - int e820_type; - - switch (md->type) { - case EFI_LOADER_CODE: - case EFI_LOADER_DATA: - case EFI_BOOT_SERVICES_CODE: - case EFI_BOOT_SERVICES_DATA: - case EFI_CONVENTIONAL_MEMORY: - if (md->attribute & EFI_MEMORY_WB) - e820_type = E820_RAM; - else - e820_type = E820_RESERVED; - break; - case EFI_ACPI_RECLAIM_MEMORY: - e820_type = E820_ACPI; - break; - case EFI_ACPI_MEMORY_NVS: - e820_type = E820_NVS; - break; - case EFI_UNUSABLE_MEMORY: - e820_type = E820_UNUSABLE; - break; - default: - /* - * EFI_RESERVED_TYPE EFI_RUNTIME_SERVICES_CODE - * EFI_RUNTIME_SERVICES_DATA EFI_MEMORY_MAPPED_IO - * EFI_MEMORY_MAPPED_IO_PORT_SPACE EFI_PAL_CODE - */ - e820_type = E820_RESERVED; - break; - } - e820_add_region(start, size, e820_type); - } - sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); -} - -void __init efi_memblock_x86_reserve_range(void) -{ - unsigned long pmap; - -#ifdef CONFIG_X86_32 - pmap = boot_params.efi_info.efi_memmap; -#else - pmap = (boot_params.efi_info.efi_memmap | - ((__u64)boot_params.efi_info.efi_memmap_hi<<32)); -#endif - memmap.phys_map = (void *)pmap; - memmap.nr_map = boot_params.efi_info.efi_memmap_size / - boot_params.efi_info.efi_memdesc_size; - memmap.desc_version = boot_params.efi_info.efi_memdesc_version; - memmap.desc_size = boot_params.efi_info.efi_memdesc_size; - memblock_x86_reserve_range(pmap, pmap + memmap.nr_map * memmap.desc_size, - "EFI memmap"); -} - -#if EFI_DEBUG -static void __init print_efi_memmap(void) -{ - efi_memory_desc_t *md; - void *p; - int i; - - for (p = memmap.map, i = 0; - p < memmap.map_end; - p += memmap.desc_size, i++) { - md = p; - printk(KERN_INFO PFX "mem%02u: type=%u, attr=0x%llx, " - "range=[0x%016llx-0x%016llx) (%lluMB)\n", - i, md->type, md->attribute, md->phys_addr, - md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT), - (md->num_pages >> (20 - EFI_PAGE_SHIFT))); - } -} -#endif /* EFI_DEBUG */ - -void __init efi_init(void) -{ - efi_config_table_t *config_tables; - efi_runtime_services_t *runtime; - efi_char16_t *c16; - char vendor[100] = "unknown"; - int i = 0; - void *tmp; - -#ifdef CONFIG_X86_32 - efi_phys.systab = (efi_system_table_t *)boot_params.efi_info.efi_systab; -#else - efi_phys.systab = (efi_system_table_t *) - (boot_params.efi_info.efi_systab | - ((__u64)boot_params.efi_info.efi_systab_hi<<32)); -#endif - - efi.systab = early_ioremap((unsigned long)efi_phys.systab, - sizeof(efi_system_table_t)); - if (efi.systab == NULL) - printk(KERN_ERR "Couldn't map the EFI system table!\n"); - memcpy(&efi_systab, efi.systab, sizeof(efi_system_table_t)); - early_iounmap(efi.systab, sizeof(efi_system_table_t)); - efi.systab = &efi_systab; - - /* - * Verify the EFI Table - */ - if (efi.systab->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) - printk(KERN_ERR "EFI system table signature incorrect!\n"); - if ((efi.systab->hdr.revision >> 16) == 0) - printk(KERN_ERR "Warning: EFI system table version " - "%d.%02d, expected 1.00 or greater!\n", - efi.systab->hdr.revision >> 16, - efi.systab->hdr.revision & 0xffff); - - /* - * Show what we know for posterity - */ - c16 = tmp = early_ioremap(efi.systab->fw_vendor, 2); - if (c16) { - for (i = 0; i < sizeof(vendor) - 1 && *c16; ++i) - vendor[i] = *c16++; - vendor[i] = '\0'; - } else - printk(KERN_ERR PFX "Could not map the firmware vendor!\n"); - early_iounmap(tmp, 2); - - printk(KERN_INFO "EFI v%u.%.02u by %s\n", - efi.systab->hdr.revision >> 16, - efi.systab->hdr.revision & 0xffff, vendor); - - /* - * Let's see what config tables the firmware passed to us. - */ - config_tables = early_ioremap( - efi.systab->tables, - efi.systab->nr_tables * sizeof(efi_config_table_t)); - if (config_tables == NULL) - printk(KERN_ERR "Could not map EFI Configuration Table!\n"); - - printk(KERN_INFO); - for (i = 0; i < efi.systab->nr_tables; i++) { - if (!efi_guidcmp(config_tables[i].guid, MPS_TABLE_GUID)) { - efi.mps = config_tables[i].table; - printk(" MPS=0x%lx ", config_tables[i].table); - } else if (!efi_guidcmp(config_tables[i].guid, - ACPI_20_TABLE_GUID)) { - efi.acpi20 = config_tables[i].table; - printk(" ACPI 2.0=0x%lx ", config_tables[i].table); - } else if (!efi_guidcmp(config_tables[i].guid, - ACPI_TABLE_GUID)) { - efi.acpi = config_tables[i].table; - printk(" ACPI=0x%lx ", config_tables[i].table); - } else if (!efi_guidcmp(config_tables[i].guid, - SMBIOS_TABLE_GUID)) { - efi.smbios = config_tables[i].table; - printk(" SMBIOS=0x%lx ", config_tables[i].table); -#ifdef CONFIG_X86_UV - } else if (!efi_guidcmp(config_tables[i].guid, - UV_SYSTEM_TABLE_GUID)) { - efi.uv_systab = config_tables[i].table; - printk(" UVsystab=0x%lx ", config_tables[i].table); -#endif - } else if (!efi_guidcmp(config_tables[i].guid, - HCDP_TABLE_GUID)) { - efi.hcdp = config_tables[i].table; - printk(" HCDP=0x%lx ", config_tables[i].table); - } else if (!efi_guidcmp(config_tables[i].guid, - UGA_IO_PROTOCOL_GUID)) { - efi.uga = config_tables[i].table; - printk(" UGA=0x%lx ", config_tables[i].table); - } - } - printk("\n"); - early_iounmap(config_tables, - efi.systab->nr_tables * sizeof(efi_config_table_t)); - - /* - * Check out the runtime services table. We need to map - * the runtime services table so that we can grab the physical - * address of several of the EFI runtime functions, needed to - * set the firmware into virtual mode. - */ - runtime = early_ioremap((unsigned long)efi.systab->runtime, - sizeof(efi_runtime_services_t)); - if (runtime != NULL) { - /* - * We will only need *early* access to the following - * two EFI runtime services before set_virtual_address_map - * is invoked. - */ - efi_phys.get_time = (efi_get_time_t *)runtime->get_time; - efi_phys.set_virtual_address_map = - (efi_set_virtual_address_map_t *) - runtime->set_virtual_address_map; - /* - * Make efi_get_time can be called before entering - * virtual mode. - */ - efi.get_time = phys_efi_get_time; - } else - printk(KERN_ERR "Could not map the EFI runtime service " - "table!\n"); - early_iounmap(runtime, sizeof(efi_runtime_services_t)); - - /* Map the EFI memory map */ - memmap.map = early_ioremap((unsigned long)memmap.phys_map, - memmap.nr_map * memmap.desc_size); - if (memmap.map == NULL) - printk(KERN_ERR "Could not map the EFI memory map!\n"); - memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size); - - if (memmap.desc_size != sizeof(efi_memory_desc_t)) - printk(KERN_WARNING - "Kernel-defined memdesc doesn't match the one from EFI!\n"); - - if (add_efi_memmap) - do_add_efi_memmap(); - -#ifdef CONFIG_X86_32 - x86_platform.get_wallclock = efi_get_time; - x86_platform.set_wallclock = efi_set_rtc_mmss; -#endif - - /* Setup for EFI runtime service */ - reboot_type = BOOT_EFI; - -#if EFI_DEBUG - print_efi_memmap(); -#endif -} - -static void __init runtime_code_page_mkexec(void) -{ - efi_memory_desc_t *md; - void *p; - u64 addr, npages; - - /* Make EFI runtime service code area executable */ - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; - - if (md->type != EFI_RUNTIME_SERVICES_CODE) - continue; - - addr = md->virt_addr; - npages = md->num_pages; - memrange_efi_to_native(&addr, &npages); - set_memory_x(addr, npages); - } -} - -/* - * This function will switch the EFI runtime services to virtual mode. - * Essentially, look through the EFI memmap and map every region that - * has the runtime attribute bit set in its memory descriptor and update - * that memory descriptor with the virtual address obtained from ioremap(). - * This enables the runtime services to be called without having to - * thunk back into physical mode for every invocation. - */ -void __init efi_enter_virtual_mode(void) -{ - efi_memory_desc_t *md; - efi_status_t status; - unsigned long size; - u64 end, systab, addr, npages, end_pfn; - void *p, *va; - - efi.systab = NULL; - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; - if (!(md->attribute & EFI_MEMORY_RUNTIME)) - continue; - - size = md->num_pages << EFI_PAGE_SHIFT; - end = md->phys_addr + size; - - end_pfn = PFN_UP(end); - if (end_pfn <= max_low_pfn_mapped - || (end_pfn > (1UL << (32 - PAGE_SHIFT)) - && end_pfn <= max_pfn_mapped)) - va = __va(md->phys_addr); - else - va = efi_ioremap(md->phys_addr, size, md->type); - - md->virt_addr = (u64) (unsigned long) va; - - if (!va) { - printk(KERN_ERR PFX "ioremap of 0x%llX failed!\n", - (unsigned long long)md->phys_addr); - continue; - } - - if (!(md->attribute & EFI_MEMORY_WB)) { - addr = md->virt_addr; - npages = md->num_pages; - memrange_efi_to_native(&addr, &npages); - set_memory_uc(addr, npages); - } - - systab = (u64) (unsigned long) efi_phys.systab; - if (md->phys_addr <= systab && systab < end) { - systab += md->virt_addr - md->phys_addr; - efi.systab = (efi_system_table_t *) (unsigned long) systab; - } - } - - BUG_ON(!efi.systab); - - status = phys_efi_set_virtual_address_map( - memmap.desc_size * memmap.nr_map, - memmap.desc_size, - memmap.desc_version, - memmap.phys_map); - - if (status != EFI_SUCCESS) { - printk(KERN_ALERT "Unable to switch EFI into virtual mode " - "(status=%lx)!\n", status); - panic("EFI call to SetVirtualAddressMap() failed!"); - } - - /* - * Now that EFI is in virtual mode, update the function - * pointers in the runtime service table to the new virtual addresses. - * - * Call EFI services through wrapper functions. - */ - efi.get_time = virt_efi_get_time; - efi.set_time = virt_efi_set_time; - efi.get_wakeup_time = virt_efi_get_wakeup_time; - efi.set_wakeup_time = virt_efi_set_wakeup_time; - efi.get_variable = virt_efi_get_variable; - efi.get_next_variable = virt_efi_get_next_variable; - efi.set_variable = virt_efi_set_variable; - efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count; - efi.reset_system = virt_efi_reset_system; - efi.set_virtual_address_map = virt_efi_set_virtual_address_map; - if (__supported_pte_mask & _PAGE_NX) - runtime_code_page_mkexec(); - early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size); - memmap.map = NULL; -} - -/* - * Convenience functions to obtain memory types and attributes - */ -u32 efi_mem_type(unsigned long phys_addr) -{ - efi_memory_desc_t *md; - void *p; - - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; - if ((md->phys_addr <= phys_addr) && - (phys_addr < (md->phys_addr + - (md->num_pages << EFI_PAGE_SHIFT)))) - return md->type; - } - return 0; -} - -u64 efi_mem_attributes(unsigned long phys_addr) -{ - efi_memory_desc_t *md; - void *p; - - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; - if ((md->phys_addr <= phys_addr) && - (phys_addr < (md->phys_addr + - (md->num_pages << EFI_PAGE_SHIFT)))) - return md->attribute; - } - return 0; -} diff --git a/arch/x86/kernel/efi_32.c b/arch/x86/kernel/efi_32.c deleted file mode 100644 index 5cab48ee61a..00000000000 --- a/arch/x86/kernel/efi_32.c +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Extensible Firmware Interface - * - * Based on Extensible Firmware Interface Specification version 1.0 - * - * Copyright (C) 1999 VA Linux Systems - * Copyright (C) 1999 Walt Drummond <drummond@valinux.com> - * Copyright (C) 1999-2002 Hewlett-Packard Co. - * David Mosberger-Tang <davidm@hpl.hp.com> - * Stephane Eranian <eranian@hpl.hp.com> - * - * All EFI Runtime Services are not implemented yet as EFI only - * supports physical mode addressing on SoftSDV. This is to be fixed - * in a future version. --drummond 1999-07-20 - * - * Implemented EFI runtime services and virtual mode calls. --davidm - * - * Goutham Rao: <goutham.rao@intel.com> - * Skip non-WB memory and ignore empty memory ranges. - */ - -#include <linux/kernel.h> -#include <linux/types.h> -#include <linux/ioport.h> -#include <linux/efi.h> - -#include <asm/io.h> -#include <asm/page.h> -#include <asm/pgtable.h> -#include <asm/tlbflush.h> -#include <asm/efi.h> - -/* - * To make EFI call EFI runtime service in physical addressing mode we need - * prelog/epilog before/after the invocation to disable interrupt, to - * claim EFI runtime service handler exclusively and to duplicate a memory in - * low memory space say 0 - 3G. - */ - -static unsigned long efi_rt_eflags; -static pgd_t efi_bak_pg_dir_pointer[2]; - -void efi_call_phys_prelog(void) -{ - unsigned long cr4; - unsigned long temp; - struct desc_ptr gdt_descr; - - local_irq_save(efi_rt_eflags); - - /* - * If I don't have PAE, I should just duplicate two entries in page - * directory. If I have PAE, I just need to duplicate one entry in - * page directory. - */ - cr4 = read_cr4_safe(); - - if (cr4 & X86_CR4_PAE) { - efi_bak_pg_dir_pointer[0].pgd = - swapper_pg_dir[pgd_index(0)].pgd; - swapper_pg_dir[0].pgd = - swapper_pg_dir[pgd_index(PAGE_OFFSET)].pgd; - } else { - efi_bak_pg_dir_pointer[0].pgd = - swapper_pg_dir[pgd_index(0)].pgd; - efi_bak_pg_dir_pointer[1].pgd = - swapper_pg_dir[pgd_index(0x400000)].pgd; - swapper_pg_dir[pgd_index(0)].pgd = - swapper_pg_dir[pgd_index(PAGE_OFFSET)].pgd; - temp = PAGE_OFFSET + 0x400000; - swapper_pg_dir[pgd_index(0x400000)].pgd = - swapper_pg_dir[pgd_index(temp)].pgd; - } - - /* - * After the lock is released, the original page table is restored. - */ - __flush_tlb_all(); - - gdt_descr.address = __pa(get_cpu_gdt_table(0)); - gdt_descr.size = GDT_SIZE - 1; - load_gdt(&gdt_descr); -} - -void efi_call_phys_epilog(void) -{ - unsigned long cr4; - struct desc_ptr gdt_descr; - - gdt_descr.address = (unsigned long)get_cpu_gdt_table(0); - gdt_descr.size = GDT_SIZE - 1; - load_gdt(&gdt_descr); - - cr4 = read_cr4_safe(); - - if (cr4 & X86_CR4_PAE) { - swapper_pg_dir[pgd_index(0)].pgd = - efi_bak_pg_dir_pointer[0].pgd; - } else { - swapper_pg_dir[pgd_index(0)].pgd = - efi_bak_pg_dir_pointer[0].pgd; - swapper_pg_dir[pgd_index(0x400000)].pgd = - efi_bak_pg_dir_pointer[1].pgd; - } - - /* - * After the lock is released, the original page table is restored. - */ - __flush_tlb_all(); - - local_irq_restore(efi_rt_eflags); -} diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c deleted file mode 100644 index ac0621a7ac3..00000000000 --- a/arch/x86/kernel/efi_64.c +++ /dev/null @@ -1,114 +0,0 @@ -/* - * x86_64 specific EFI support functions - * Based on Extensible Firmware Interface Specification version 1.0 - * - * Copyright (C) 2005-2008 Intel Co. - * Fenghua Yu <fenghua.yu@intel.com> - * Bibo Mao <bibo.mao@intel.com> - * Chandramouli Narayanan <mouli@linux.intel.com> - * Huang Ying <ying.huang@intel.com> - * - * Code to convert EFI to E820 map has been implemented in elilo bootloader - * based on a EFI patch by Edgar Hucek. Based on the E820 map, the page table - * is setup appropriately for EFI runtime code. - * - mouli 06/14/2007. - * - */ - -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/mm.h> -#include <linux/types.h> -#include <linux/spinlock.h> -#include <linux/bootmem.h> -#include <linux/ioport.h> -#include <linux/module.h> -#include <linux/efi.h> -#include <linux/uaccess.h> -#include <linux/io.h> -#include <linux/reboot.h> - -#include <asm/setup.h> -#include <asm/page.h> -#include <asm/e820.h> -#include <asm/pgtable.h> -#include <asm/tlbflush.h> -#include <asm/proto.h> -#include <asm/efi.h> -#include <asm/cacheflush.h> -#include <asm/fixmap.h> - -static pgd_t save_pgd __initdata; -static unsigned long efi_flags __initdata; - -static void __init early_mapping_set_exec(unsigned long start, - unsigned long end, - int executable) -{ - unsigned long num_pages; - - start &= PMD_MASK; - end = (end + PMD_SIZE - 1) & PMD_MASK; - num_pages = (end - start) >> PAGE_SHIFT; - if (executable) - set_memory_x((unsigned long)__va(start), num_pages); - else - set_memory_nx((unsigned long)__va(start), num_pages); -} - -static void __init early_runtime_code_mapping_set_exec(int executable) -{ - efi_memory_desc_t *md; - void *p; - - if (!(__supported_pte_mask & _PAGE_NX)) - return; - - /* Make EFI runtime service code area executable */ - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; - if (md->type == EFI_RUNTIME_SERVICES_CODE) { - unsigned long end; - end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT); - early_mapping_set_exec(md->phys_addr, end, executable); - } - } -} - -void __init efi_call_phys_prelog(void) -{ - unsigned long vaddress; - - early_runtime_code_mapping_set_exec(1); - local_irq_save(efi_flags); - vaddress = (unsigned long)__va(0x0UL); - save_pgd = *pgd_offset_k(0x0UL); - set_pgd(pgd_offset_k(0x0UL), *pgd_offset_k(vaddress)); - __flush_tlb_all(); -} - -void __init efi_call_phys_epilog(void) -{ - /* - * After the lock is released, the original page table is restored. - */ - set_pgd(pgd_offset_k(0x0UL), save_pgd); - __flush_tlb_all(); - local_irq_restore(efi_flags); - early_runtime_code_mapping_set_exec(0); -} - -void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size, - u32 type) -{ - unsigned long last_map_pfn; - - if (type == EFI_MEMORY_MAPPED_IO) - return ioremap(phys_addr, size); - - last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size); - if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size) - return NULL; - - return (void __iomem *)__va(phys_addr); -} diff --git a/arch/x86/kernel/efi_stub_32.S b/arch/x86/kernel/efi_stub_32.S deleted file mode 100644 index fbe66e626c0..00000000000 --- a/arch/x86/kernel/efi_stub_32.S +++ /dev/null @@ -1,123 +0,0 @@ -/* - * EFI call stub for IA32. - * - * This stub allows us to make EFI calls in physical mode with interrupts - * turned off. - */ - -#include <linux/linkage.h> -#include <asm/page_types.h> - -/* - * efi_call_phys(void *, ...) is a function with variable parameters. - * All the callers of this function assure that all the parameters are 4-bytes. - */ - -/* - * In gcc calling convention, EBX, ESP, EBP, ESI and EDI are all callee save. - * So we'd better save all of them at the beginning of this function and restore - * at the end no matter how many we use, because we can not assure EFI runtime - * service functions will comply with gcc calling convention, too. - */ - -.text -ENTRY(efi_call_phys) - /* - * 0. The function can only be called in Linux kernel. So CS has been - * set to 0x0010, DS and SS have been set to 0x0018. In EFI, I found - * the values of these registers are the same. And, the corresponding - * GDT entries are identical. So I will do nothing about segment reg - * and GDT, but change GDT base register in prelog and epilog. - */ - - /* - * 1. Now I am running with EIP = <physical address> + PAGE_OFFSET. - * But to make it smoothly switch from virtual mode to flat mode. - * The mapping of lower virtual memory has been created in prelog and - * epilog. - */ - movl $1f, %edx - subl $__PAGE_OFFSET, %edx - jmp *%edx -1: - - /* - * 2. Now on the top of stack is the return - * address in the caller of efi_call_phys(), then parameter 1, - * parameter 2, ..., param n. To make things easy, we save the return - * address of efi_call_phys in a global variable. - */ - popl %edx - movl %edx, saved_return_addr - /* get the function pointer into ECX*/ - popl %ecx - movl %ecx, efi_rt_function_ptr - movl $2f, %edx - subl $__PAGE_OFFSET, %edx - pushl %edx - - /* - * 3. Clear PG bit in %CR0. - */ - movl %cr0, %edx - andl $0x7fffffff, %edx - movl %edx, %cr0 - jmp 1f -1: - - /* - * 4. Adjust stack pointer. - */ - subl $__PAGE_OFFSET, %esp - - /* - * 5. Call the physical function. - */ - jmp *%ecx - -2: - /* - * 6. After EFI runtime service returns, control will return to - * following instruction. We'd better readjust stack pointer first. - */ - addl $__PAGE_OFFSET, %esp - - /* - * 7. Restore PG bit - */ - movl %cr0, %edx - orl $0x80000000, %edx - movl %edx, %cr0 - jmp 1f -1: - /* - * 8. Now restore the virtual mode from flat mode by - * adding EIP with PAGE_OFFSET. - */ - movl $1f, %edx - jmp *%edx -1: - - /* - * 9. Balance the stack. And because EAX contain the return value, - * we'd better not clobber it. - */ - leal efi_rt_function_ptr, %edx - movl (%edx), %ecx - pushl %ecx - - /* - * 10. Push the saved return address onto the stack and return. - */ - leal saved_return_addr, %edx - movl (%edx), %ecx - pushl %ecx - ret -ENDPROC(efi_call_phys) -.previous - -.data -saved_return_addr: - .long 0 -efi_rt_function_ptr: - .long 0 diff --git a/arch/x86/kernel/efi_stub_64.S b/arch/x86/kernel/efi_stub_64.S deleted file mode 100644 index 4c07ccab814..00000000000 --- a/arch/x86/kernel/efi_stub_64.S +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Function calling ABI conversion from Linux to EFI for x86_64 - * - * Copyright (C) 2007 Intel Corp - * Bibo Mao <bibo.mao@intel.com> - * Huang Ying <ying.huang@intel.com> - */ - -#include <linux/linkage.h> - -#define SAVE_XMM \ - mov %rsp, %rax; \ - subq $0x70, %rsp; \ - and $~0xf, %rsp; \ - mov %rax, (%rsp); \ - mov %cr0, %rax; \ - clts; \ - mov %rax, 0x8(%rsp); \ - movaps %xmm0, 0x60(%rsp); \ - movaps %xmm1, 0x50(%rsp); \ - movaps %xmm2, 0x40(%rsp); \ - movaps %xmm3, 0x30(%rsp); \ - movaps %xmm4, 0x20(%rsp); \ - movaps %xmm5, 0x10(%rsp) - -#define RESTORE_XMM \ - movaps 0x60(%rsp), %xmm0; \ - movaps 0x50(%rsp), %xmm1; \ - movaps 0x40(%rsp), %xmm2; \ - movaps 0x30(%rsp), %xmm3; \ - movaps 0x20(%rsp), %xmm4; \ - movaps 0x10(%rsp), %xmm5; \ - mov 0x8(%rsp), %rsi; \ - mov %rsi, %cr0; \ - mov (%rsp), %rsp - -ENTRY(efi_call0) - SAVE_XMM - subq $32, %rsp - call *%rdi - addq $32, %rsp - RESTORE_XMM - ret -ENDPROC(efi_call0) - -ENTRY(efi_call1) - SAVE_XMM - subq $32, %rsp - mov %rsi, %rcx - call *%rdi - addq $32, %rsp - RESTORE_XMM - ret -ENDPROC(efi_call1) - -ENTRY(efi_call2) - SAVE_XMM - subq $32, %rsp - mov %rsi, %rcx - call *%rdi - addq $32, %rsp - RESTORE_XMM - ret -ENDPROC(efi_call2) - -ENTRY(efi_call3) - SAVE_XMM - subq $32, %rsp - mov %rcx, %r8 - mov %rsi, %rcx - call *%rdi - addq $32, %rsp - RESTORE_XMM - ret -ENDPROC(efi_call3) - -ENTRY(efi_call4) - SAVE_XMM - subq $32, %rsp - mov %r8, %r9 - mov %rcx, %r8 - mov %rsi, %rcx - call *%rdi - addq $32, %rsp - RESTORE_XMM - ret -ENDPROC(efi_call4) - -ENTRY(efi_call5) - SAVE_XMM - subq $48, %rsp - mov %r9, 32(%rsp) - mov %r8, %r9 - mov %rcx, %r8 - mov %rsi, %rcx - call *%rdi - addq $48, %rsp - RESTORE_XMM - ret -ENDPROC(efi_call5) - -ENTRY(efi_call6) - SAVE_XMM - mov (%rsp), %rax - mov 8(%rax), %rax - subq $48, %rsp - mov %r9, 32(%rsp) - mov %rax, 40(%rsp) - mov %r8, %r9 - mov %rcx, %r8 - mov %rsi, %rcx - call *%rdi - addq $48, %rsp - RESTORE_XMM - ret -ENDPROC(efi_call6) diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 59e175e8959..5c1a9197491 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -65,6 +65,8 @@ #define sysexit_audit syscall_exit_work #endif + .section .entry.text, "ax" + /* * We use macros for low-level operations which need to be overridden * for paravirtualization. The following will never clobber any registers: @@ -395,7 +397,7 @@ sysenter_past_esp: * A tiny bit of offset fixup is necessary - 4*4 means the 4 words * pushed above; +8 corresponds to copy_thread's esp0 setting. */ - pushl_cfi (TI_sysenter_return-THREAD_SIZE_asm+8+4*4)(%esp) + pushl_cfi ((TI_sysenter_return)-THREAD_SIZE+8+4*4)(%esp) CFI_REL_OFFSET eip, 0 pushl_cfi %eax @@ -788,7 +790,7 @@ ENDPROC(ptregs_clone) */ .section .init.rodata,"a" ENTRY(interrupt) -.text +.section .entry.text, "ax" .p2align 5 .p2align CONFIG_X86_L1_CACHE_SHIFT ENTRY(irq_entries_start) @@ -807,7 +809,7 @@ vector=FIRST_EXTERNAL_VECTOR .endif .previous .long 1b - .text + .section .entry.text, "ax" vector=vector+1 .endif .endr @@ -1406,6 +1408,15 @@ ENTRY(general_protection) CFI_ENDPROC END(general_protection) +#ifdef CONFIG_KVM_GUEST +ENTRY(async_page_fault) + RING0_EC_FRAME + pushl_cfi $do_async_page_fault + jmp error_code + CFI_ENDPROC +END(async_page_fault) +#endif + /* * End of kprobes section */ diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index fe2690d71c0..8a445a0c989 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -18,7 +18,7 @@ * A note on terminology: * - top of stack: Architecture defined interrupt frame from SS to RIP * at the top of the kernel process stack. - * - partial stack frame: partially saved registers upto R11. + * - partial stack frame: partially saved registers up to R11. * - full stack frame: Like partial stack frame, but all register saved. * * Some macro usage: @@ -61,6 +61,8 @@ #define __AUDIT_ARCH_LE 0x40000000 .code64 + .section .entry.text, "ax" + #ifdef CONFIG_FUNCTION_TRACER #ifdef CONFIG_DYNAMIC_FTRACE ENTRY(mcount) @@ -295,20 +297,25 @@ ENDPROC(native_usergs_sysret64) .endm /* save partial stack frame */ + .pushsection .kprobes.text, "ax" ENTRY(save_args) XCPT_FRAME cld - movq_cfi rdi, RDI+16-ARGOFFSET - movq_cfi rsi, RSI+16-ARGOFFSET - movq_cfi rdx, RDX+16-ARGOFFSET - movq_cfi rcx, RCX+16-ARGOFFSET - movq_cfi rax, RAX+16-ARGOFFSET - movq_cfi r8, R8+16-ARGOFFSET - movq_cfi r9, R9+16-ARGOFFSET - movq_cfi r10, R10+16-ARGOFFSET - movq_cfi r11, R11+16-ARGOFFSET - - leaq -ARGOFFSET+16(%rsp),%rdi /* arg1 for handler */ + /* + * start from rbp in pt_regs and jump over + * return address. + */ + movq_cfi rdi, RDI+8-RBP + movq_cfi rsi, RSI+8-RBP + movq_cfi rdx, RDX+8-RBP + movq_cfi rcx, RCX+8-RBP + movq_cfi rax, RAX+8-RBP + movq_cfi r8, R8+8-RBP + movq_cfi r9, R9+8-RBP + movq_cfi r10, R10+8-RBP + movq_cfi r11, R11+8-RBP + + leaq -RBP+8(%rsp),%rdi /* arg1 for handler */ movq_cfi rbp, 8 /* push %rbp */ leaq 8(%rsp), %rbp /* mov %rsp, %ebp */ testl $3, CS(%rdi) @@ -334,6 +341,7 @@ ENTRY(save_args) ret CFI_ENDPROC END(save_args) + .popsection ENTRY(save_rest) PARTIAL_FRAME 1 REST_SKIP+8 @@ -414,7 +422,7 @@ ENTRY(ret_from_fork) END(ret_from_fork) /* - * System call entry. Upto 6 arguments in registers are supported. + * System call entry. Up to 6 arguments in registers are supported. * * SYSCALL does not save anything on the stack and does not change the * stack pointer. @@ -738,7 +746,7 @@ END(stub_rt_sigreturn) */ .section .init.rodata,"a" ENTRY(interrupt) - .text + .section .entry.text .p2align 5 .p2align CONFIG_X86_L1_CACHE_SHIFT ENTRY(irq_entries_start) @@ -757,7 +765,7 @@ vector=FIRST_EXTERNAL_VECTOR .endif .previous .quad 1b - .text + .section .entry.text vector=vector+1 .endif .endr @@ -780,8 +788,9 @@ END(interrupt) /* 0(%rsp): ~(interrupt number) */ .macro interrupt func - subq $ORIG_RAX-ARGOFFSET+8, %rsp - CFI_ADJUST_CFA_OFFSET ORIG_RAX-ARGOFFSET+8 + /* reserve pt_regs for scratch regs and rbp */ + subq $ORIG_RAX-RBP, %rsp + CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP call save_args PARTIAL_FRAME 0 call \func @@ -806,9 +815,14 @@ ret_from_intr: TRACE_IRQS_OFF decl PER_CPU_VAR(irq_count) leaveq + CFI_RESTORE rbp CFI_DEF_CFA_REGISTER rsp CFI_ADJUST_CFA_OFFSET -8 + + /* we did not save rbx, restore only from ARGOFFSET */ + addq $8, %rsp + CFI_ADJUST_CFA_OFFSET -8 exit_intr: GET_THREAD_INFO(%rcx) testl $3,CS-ARGOFFSET(%rsp) @@ -963,9 +977,12 @@ apicinterrupt X86_PLATFORM_IPI_VECTOR \ x86_platform_ipi smp_x86_platform_ipi #ifdef CONFIG_SMP -.irpc idx, "01234567" +.irp idx,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \ + 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 +.if NUM_INVALIDATE_TLB_VECTORS > \idx apicinterrupt (INVALIDATE_TLB_VECTOR_START)+\idx \ invalidate_interrupt\idx smp_invalidate_interrupt +.endif .endr #endif @@ -1236,7 +1253,7 @@ ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs) decl PER_CPU_VAR(irq_count) jmp error_exit CFI_ENDPROC -END(do_hypervisor_callback) +END(xen_do_hypervisor_callback) /* * Hypervisor uses this for application faults while it executes. @@ -1317,6 +1334,9 @@ errorentry xen_stack_segment do_stack_segment #endif errorentry general_protection do_general_protection errorentry page_fault do_page_fault +#ifdef CONFIG_KVM_GUEST +errorentry async_page_fault do_async_page_fault +#endif #ifdef CONFIG_X86_MCE paranoidzeroentry machine_check *machine_check_vector(%rip) #endif diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 3afb33f14d2..0ba15a6cc57 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -19,6 +19,7 @@ #include <linux/sched.h> #include <linux/init.h> #include <linux/list.h> +#include <linux/module.h> #include <trace/syscall.h> @@ -49,6 +50,7 @@ static DEFINE_PER_CPU(int, save_modifying_code); int ftrace_arch_code_modify_prepare(void) { set_kernel_text_rw(); + set_all_modules_text_rw(); modifying_code = 1; return 0; } @@ -56,6 +58,7 @@ int ftrace_arch_code_modify_prepare(void) int ftrace_arch_code_modify_post_process(void) { modifying_code = 0; + set_all_modules_text_ro(); set_kernel_text_ro(); return 0; } @@ -167,9 +170,9 @@ static void ftrace_mod_code(void) void ftrace_nmi_enter(void) { - __get_cpu_var(save_modifying_code) = modifying_code; + __this_cpu_write(save_modifying_code, modifying_code); - if (!__get_cpu_var(save_modifying_code)) + if (!__this_cpu_read(save_modifying_code)) return; if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) { @@ -183,7 +186,7 @@ void ftrace_nmi_enter(void) void ftrace_nmi_exit(void) { - if (!__get_cpu_var(save_modifying_code)) + if (!__this_cpu_read(save_modifying_code)) return; /* Finish all executions before clearing nmi_running */ @@ -257,9 +260,9 @@ do_ftrace_mod_code(unsigned long ip, void *new_code) return mod_code_status; } -static unsigned char *ftrace_nop_replace(void) +static const unsigned char *ftrace_nop_replace(void) { - return ideal_nop5; + return ideal_nops[NOP_ATOMIC5]; } static int @@ -434,18 +437,19 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, return; } - if (ftrace_push_return_trace(old, self_addr, &trace.depth, - frame_pointer) == -EBUSY) { - *parent = old; - return; - } - trace.func = self_addr; + trace.depth = current->curr_ret_stack + 1; /* Only trace if the calling function expects to */ if (!ftrace_graph_entry(&trace)) { - current->curr_ret_stack--; *parent = old; + return; + } + + if (ftrace_push_return_trace(old, self_addr, &trace.depth, + frame_pointer) == -EBUSY) { + *parent = old; + return; } } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 763310165fa..3bb08509a7a 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -23,7 +23,6 @@ static void __init i386_default_early_setup(void) { /* Initialize 32bit specific setup functions */ - x86_init.resources.probe_roms = probe_roms; x86_init.resources.reserve_resources = i386_reserve_resources; x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc; @@ -34,15 +33,6 @@ void __init i386_start_kernel(void) { memblock_init(); -#ifdef CONFIG_X86_TRAMPOLINE - /* - * But first pinch a few for the stack/trampoline stuff - * FIXME: Don't need the extra page at 4K, but need to fix - * trampoline before removing it. (see the GDT stuff) - */ - memblock_x86_reserve_range(PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE"); -#endif - memblock_x86_reserve_range(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); #ifdef CONFIG_BLK_DEV_INITRD @@ -61,6 +51,9 @@ void __init i386_start_kernel(void) case X86_SUBARCH_MRST: x86_mrst_early_setup(); break; + case X86_SUBARCH_CE4100: + x86_ce4100_early_setup(); + break; default: i386_default_early_setup(); break; diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 2d2673c28af..5655c2272ad 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -77,9 +77,6 @@ void __init x86_64_start_kernel(char * real_mode_data) /* Make NULL pointers segfault */ zap_identity_mappings(); - /* Cleanup the over mapped high alias */ - cleanup_highmap(); - max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT; for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) { diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index bcece91dd31..ce0be7cd085 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -60,18 +60,20 @@ #define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD) #endif +/* Number of possible pages in the lowmem region */ +LOWMEM_PAGES = (((1<<32) - __PAGE_OFFSET) >> PAGE_SHIFT) + /* Enough space to fit pagetables for the low memory linear map */ -MAPPING_BEYOND_END = \ - PAGE_TABLE_SIZE(((1<<32) - __PAGE_OFFSET) >> PAGE_SHIFT) << PAGE_SHIFT +MAPPING_BEYOND_END = PAGE_TABLE_SIZE(LOWMEM_PAGES) << PAGE_SHIFT /* * Worst-case size of the kernel mapping we need to make: - * the worst-case size of the kernel itself, plus the extra we need - * to map for the linear map. + * a relocatable kernel can live anywhere in lowmem, so we need to be able + * to map all of lowmem. */ -KERNEL_PAGES = (KERNEL_IMAGE_SIZE + MAPPING_BEYOND_END)>>PAGE_SHIFT +KERNEL_PAGES = LOWMEM_PAGES -INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE_asm +INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE RESERVE_BRK(pagetables, INIT_MAP_SIZE) /* @@ -83,6 +85,8 @@ RESERVE_BRK(pagetables, INIT_MAP_SIZE) */ __HEAD ENTRY(startup_32) + movl pa(stack_start),%ecx + /* test KEEP_SEGMENTS flag to see if the bootloader is asking us to not reload segments */ testb $(1<<6), BP_loadflags(%esi) @@ -97,7 +101,9 @@ ENTRY(startup_32) movl %eax,%es movl %eax,%fs movl %eax,%gs + movl %eax,%ss 2: + leal -__PAGE_OFFSET(%ecx),%esp /* * Clear BSS first so that there are no surprises... @@ -124,62 +130,26 @@ ENTRY(startup_32) movsl movl pa(boot_params) + NEW_CL_POINTER,%esi andl %esi,%esi - jz 1f # No comand line + jz 1f # No command line movl $pa(boot_command_line),%edi movl $(COMMAND_LINE_SIZE/4),%ecx rep movsl 1: -#ifdef CONFIG_OLPC_OPENFIRMWARE +#ifdef CONFIG_OLPC /* save OFW's pgdir table for later use when calling into OFW */ movl %cr3, %eax movl %eax, pa(olpc_ofw_pgd) #endif -#ifdef CONFIG_PARAVIRT - /* This is can only trip for a broken bootloader... */ - cmpw $0x207, pa(boot_params + BP_version) - jb default_entry - - /* Paravirt-compatible boot parameters. Look to see what architecture - we're booting under. */ - movl pa(boot_params + BP_hardware_subarch), %eax - cmpl $num_subarch_entries, %eax - jae bad_subarch - - movl pa(subarch_entries)(,%eax,4), %eax - subl $__PAGE_OFFSET, %eax - jmp *%eax - -bad_subarch: -WEAK(lguest_entry) -WEAK(xen_entry) - /* Unknown implementation; there's really - nothing we can do at this point. */ - ud2a - - __INITDATA - -subarch_entries: - .long default_entry /* normal x86/PC */ - .long lguest_entry /* lguest hypervisor */ - .long xen_entry /* Xen hypervisor */ - .long default_entry /* Moorestown MID */ -num_subarch_entries = (. - subarch_entries) / 4 -.previous -#endif /* CONFIG_PARAVIRT */ - /* * Initialize page tables. This creates a PDE and a set of page * tables, which are located immediately beyond __brk_base. The variable * _brk_end is set up to point to the first "safe" location. * Mappings are created both at virtual address 0 (identity mapping) * and PAGE_OFFSET for up to _end. - * - * Note that the stack is not yet set up! */ -default_entry: #ifdef CONFIG_X86_PAE /* @@ -259,7 +229,42 @@ page_pde_offset = (__PAGE_OFFSET >> 20); movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax movl %eax,pa(initial_page_table+0xffc) #endif - jmp 3f + +#ifdef CONFIG_PARAVIRT + /* This is can only trip for a broken bootloader... */ + cmpw $0x207, pa(boot_params + BP_version) + jb default_entry + + /* Paravirt-compatible boot parameters. Look to see what architecture + we're booting under. */ + movl pa(boot_params + BP_hardware_subarch), %eax + cmpl $num_subarch_entries, %eax + jae bad_subarch + + movl pa(subarch_entries)(,%eax,4), %eax + subl $__PAGE_OFFSET, %eax + jmp *%eax + +bad_subarch: +WEAK(lguest_entry) +WEAK(xen_entry) + /* Unknown implementation; there's really + nothing we can do at this point. */ + ud2a + + __INITDATA + +subarch_entries: + .long default_entry /* normal x86/PC */ + .long lguest_entry /* lguest hypervisor */ + .long xen_entry /* Xen hypervisor */ + .long default_entry /* Moorestown MID */ +num_subarch_entries = (. - subarch_entries) / 4 +.previous +#else + jmp default_entry +#endif /* CONFIG_PARAVIRT */ + /* * Non-boot CPU entry point; entered from trampoline.S * We can't lgdt here, because lgdt itself uses a data segment, but @@ -279,8 +284,11 @@ ENTRY(startup_32_smp) movl %eax,%es movl %eax,%fs movl %eax,%gs + movl pa(stack_start),%ecx + movl %eax,%ss + leal -__PAGE_OFFSET(%ecx),%esp #endif /* CONFIG_SMP */ -3: +default_entry: /* * New page tables may be in 4Mbyte page mode and may @@ -314,6 +322,10 @@ ENTRY(startup_32_smp) subl $0x80000001, %eax cmpl $(0x8000ffff-0x80000001), %eax ja 6f + + /* Clear bogus XD_DISABLE bits */ + call verify_cpu + mov $0x80000001, %eax cpuid /* Execute Disable bit supported? */ @@ -340,8 +352,8 @@ ENTRY(startup_32_smp) movl %eax,%cr0 /* ..and set paging (PG) bit */ ljmp $__BOOT_CS,$1f /* Clear prefetch and normalize %eip */ 1: - /* Set up the stack pointer */ - lss stack_start,%esp + /* Shift the stack pointer to a virtual address */ + addl $__PAGE_OFFSET, %esp /* * Initialize eflags. Some BIOS's leave bits like NT set. This would @@ -353,9 +365,7 @@ ENTRY(startup_32_smp) #ifdef CONFIG_SMP cmpb $0, ready - jz 1f /* Initial CPU cleans BSS */ - jmp checkCPUtype -1: + jnz checkCPUtype #endif /* CONFIG_SMP */ /* @@ -463,14 +473,7 @@ is386: movl $2,%ecx # set MP cld # gcc2 wants the direction flag cleared at all times pushl $0 # fake return address for unwinder -#ifdef CONFIG_SMP - movb ready, %cl movb $1, ready - cmpb $0,%cl # the first CPU calls start_kernel - je 1f - movl (stack_start), %esp -1: -#endif /* CONFIG_SMP */ jmp *(initial_code) /* @@ -609,6 +612,8 @@ ignore_int: #endif iret +#include "verify_cpu.S" + __REFDATA .align 4 ENTRY(initial_code) @@ -618,7 +623,7 @@ ENTRY(initial_code) * BSS section */ __PAGE_ALIGNED_BSS - .align PAGE_SIZE_asm + .align PAGE_SIZE #ifdef CONFIG_X86_PAE initial_pg_pmd: .fill 1024*KPMDS,4,0 @@ -639,7 +644,7 @@ ENTRY(swapper_pg_dir) #ifdef CONFIG_X86_PAE __PAGE_ALIGNED_DATA /* Page-aligned for the benefit of paravirt? */ - .align PAGE_SIZE_asm + .align PAGE_SIZE ENTRY(initial_page_table) .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */ # if KPMDS == 3 @@ -657,19 +662,19 @@ ENTRY(initial_page_table) # else # error "Kernel PMDs should be 1, 2 or 3" # endif - .align PAGE_SIZE_asm /* needs to be page-sized too */ + .align PAGE_SIZE /* needs to be page-sized too */ #endif .data +.balign 4 ENTRY(stack_start) .long init_thread_union+THREAD_SIZE - .long __BOOT_DS - -ready: .byte 0 early_recursion_flag: .long 0 +ready: .byte 0 + int_msg: .asciz "Unknown interrupt or fault at: %p %p %p\n" diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 239046bd447..e11e39478a4 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -136,10 +136,9 @@ ident_complete: /* Fixup phys_base */ addq %rbp, phys_base(%rip) -#ifdef CONFIG_X86_TRAMPOLINE + /* Fixup trampoline */ addq %rbp, trampoline_level4_pgt + 0(%rip) addq %rbp, trampoline_level4_pgt + (511*8)(%rip) -#endif /* Due to ENTRY(), sometimes the empty space gets filled with * zeros. Better take a jmp than relying on empty space being diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index aff0b3c2750..6781765b3a0 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -27,6 +27,9 @@ #define HPET_DEV_FSB_CAP 0x1000 #define HPET_DEV_PERI_CAP 0x2000 +#define HPET_MIN_CYCLES 128 +#define HPET_MIN_PROG_DELTA (HPET_MIN_CYCLES + (HPET_MIN_CYCLES >> 1)) + #define EVT_TO_HPET_DEV(evt) container_of(evt, struct hpet_dev, evt) /* @@ -214,7 +217,7 @@ static void hpet_reserve_platform_timers(unsigned int id) { } /* * Common hpet info */ -static unsigned long hpet_period; +static unsigned long hpet_freq; static void hpet_legacy_set_mode(enum clock_event_mode mode, struct clock_event_device *evt); @@ -229,7 +232,6 @@ static struct clock_event_device hpet_clockevent = { .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, .set_mode = hpet_legacy_set_mode, .set_next_event = hpet_legacy_next_event, - .shift = 32, .irq = 0, .rating = 50, }; @@ -287,27 +289,12 @@ static void hpet_legacy_clockevent_register(void) hpet_enable_legacy_int(); /* - * The mult factor is defined as (include/linux/clockchips.h) - * mult/2^shift = cyc/ns (in contrast to ns/cyc in clocksource.h) - * hpet_period is in units of femtoseconds (per cycle), so - * mult/2^shift = cyc/ns = 10^6/hpet_period - * mult = (10^6 * 2^shift)/hpet_period - * mult = (FSEC_PER_NSEC << hpet_clockevent.shift)/hpet_period - */ - hpet_clockevent.mult = div_sc((unsigned long) FSEC_PER_NSEC, - hpet_period, hpet_clockevent.shift); - /* Calculate the min / max delta */ - hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, - &hpet_clockevent); - /* 5 usec minimum reprogramming delta. */ - hpet_clockevent.min_delta_ns = 5000; - - /* * Start hpet with the boot cpu mask and make it * global after the IO_APIC has been initialized. */ hpet_clockevent.cpumask = cpumask_of(smp_processor_id()); - clockevents_register_device(&hpet_clockevent); + clockevents_config_and_register(&hpet_clockevent, hpet_freq, + HPET_MIN_PROG_DELTA, 0x7FFFFFFF); global_clock_event = &hpet_clockevent; printk(KERN_DEBUG "hpet clockevent registered\n"); } @@ -393,22 +380,24 @@ static int hpet_next_event(unsigned long delta, * the wraparound into account) nor a simple count down event * mode. Further the write to the comparator register is * delayed internally up to two HPET clock cycles in certain - * chipsets (ATI, ICH9,10). We worked around that by reading - * back the compare register, but that required another - * workaround for ICH9,10 chips where the first readout after - * write can return the old stale value. We already have a - * minimum delta of 5us enforced, but a NMI or SMI hitting + * chipsets (ATI, ICH9,10). Some newer AMD chipsets have even + * longer delays. We worked around that by reading back the + * compare register, but that required another workaround for + * ICH9,10 chips where the first readout after write can + * return the old stale value. We already had a minimum + * programming delta of 5us enforced, but a NMI or SMI hitting * between the counter readout and the comparator write can * move us behind that point easily. Now instead of reading * the compare register back several times, we make the ETIME * decision based on the following: Return ETIME if the - * counter value after the write is less than 8 HPET cycles + * counter value after the write is less than HPET_MIN_CYCLES * away from the event or if the counter is already ahead of - * the event. + * the event. The minimum programming delta for the generic + * clockevents code is set to 1.5 * HPET_MIN_CYCLES. */ res = (s32)(cnt - hpet_readl(HPET_COUNTER)); - return res < 8 ? -ETIME : 0; + return res < HPET_MIN_CYCLES ? -ETIME : 0; } static void hpet_legacy_set_mode(enum clock_event_mode mode, @@ -497,7 +486,7 @@ static int hpet_assign_irq(struct hpet_dev *dev) if (!irq) return -EINVAL; - set_irq_data(irq, dev); + irq_set_handler_data(irq, dev); if (hpet_setup_msi_irq(irq)) return -EINVAL; @@ -543,7 +532,6 @@ static int hpet_setup_irq(struct hpet_dev *dev) static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu) { struct clock_event_device *evt = &hdev->evt; - uint64_t hpet_freq; WARN_ON(cpu != smp_processor_id()); if (!(hdev->flags & HPET_DEV_VALID)) @@ -565,24 +553,10 @@ static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu) evt->set_mode = hpet_msi_set_mode; evt->set_next_event = hpet_msi_next_event; - evt->shift = 32; - - /* - * The period is a femto seconds value. We need to calculate the - * scaled math multiplication factor for nanosecond to hpet tick - * conversion. - */ - hpet_freq = FSEC_PER_SEC; - do_div(hpet_freq, hpet_period); - evt->mult = div_sc((unsigned long) hpet_freq, - NSEC_PER_SEC, evt->shift); - /* Calculate the max delta */ - evt->max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, evt); - /* 5 usec minimum reprogramming delta. */ - evt->min_delta_ns = 5000; - evt->cpumask = cpumask_of(hdev->cpu); - clockevents_register_device(evt); + + clockevents_config_and_register(evt, hpet_freq, HPET_MIN_PROG_DELTA, + 0x7FFFFFFF); } #ifdef CONFIG_HPET @@ -713,7 +687,7 @@ static int hpet_cpuhp_notify(struct notifier_block *n, switch (action & 0xf) { case CPU_ONLINE: - INIT_DELAYED_WORK_ON_STACK(&work.work, hpet_work); + INIT_DELAYED_WORK_ONSTACK(&work.work, hpet_work); init_completion(&work.complete); /* FIXME: add schedule_work_on() */ schedule_delayed_work_on(cpu, &work.work, 0); @@ -786,7 +760,6 @@ static struct clocksource clocksource_hpet = { static int hpet_clocksource_register(void) { u64 start, now; - u64 hpet_freq; cycle_t t1; /* Start the counter */ @@ -813,24 +786,7 @@ static int hpet_clocksource_register(void) return -ENODEV; } - /* - * The definition of mult is (include/linux/clocksource.h) - * mult/2^shift = ns/cyc and hpet_period is in units of fsec/cyc - * so we first need to convert hpet_period to ns/cyc units: - * mult/2^shift = ns/cyc = hpet_period/10^6 - * mult = (hpet_period * 2^shift)/10^6 - * mult = (hpet_period << shift)/FSEC_PER_NSEC - */ - - /* Need to convert hpet_period (fsec/cyc) to cyc/sec: - * - * cyc/sec = FSEC_PER_SEC/hpet_period(fsec/cyc) - * cyc/sec = (FSEC_PER_NSEC * NSEC_PER_SEC)/hpet_period - */ - hpet_freq = FSEC_PER_SEC; - do_div(hpet_freq, hpet_period); clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq); - return 0; } @@ -839,7 +795,9 @@ static int hpet_clocksource_register(void) */ int __init hpet_enable(void) { + unsigned long hpet_period; unsigned int id; + u64 freq; int i; if (!is_hpet_capable()) @@ -878,6 +836,14 @@ int __init hpet_enable(void) goto out_nohpet; /* + * The period is a femto seconds value. Convert it to a + * frequency. + */ + freq = FSEC_PER_SEC; + do_div(freq, hpet_period); + hpet_freq = freq; + + /* * Read the HPET ID register to retrieve the IRQ routing * information and the number of channels */ diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index ff15c9dcc25..02f07634d26 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -122,7 +122,7 @@ int arch_install_hw_breakpoint(struct perf_event *bp) return -EBUSY; set_debugreg(info->address, i); - __get_cpu_var(cpu_debugreg[i]) = info->address; + __this_cpu_write(cpu_debugreg[i], info->address); dr7 = &__get_cpu_var(cpu_dr7); *dr7 |= encode_dr7(i, info->len, info->type); @@ -397,12 +397,12 @@ void flush_ptrace_hw_breakpoint(struct task_struct *tsk) void hw_breakpoint_restore(void) { - set_debugreg(__get_cpu_var(cpu_debugreg[0]), 0); - set_debugreg(__get_cpu_var(cpu_debugreg[1]), 1); - set_debugreg(__get_cpu_var(cpu_debugreg[2]), 2); - set_debugreg(__get_cpu_var(cpu_debugreg[3]), 3); + set_debugreg(__this_cpu_read(cpu_debugreg[0]), 0); + set_debugreg(__this_cpu_read(cpu_debugreg[1]), 1); + set_debugreg(__this_cpu_read(cpu_debugreg[2]), 2); + set_debugreg(__this_cpu_read(cpu_debugreg[3]), 3); set_debugreg(current->thread.debugreg6, 6); - set_debugreg(__get_cpu_var(cpu_dr7), 7); + set_debugreg(__this_cpu_read(cpu_dr7), 7); } EXPORT_SYMBOL_GPL(hw_breakpoint_restore); @@ -433,6 +433,10 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args) dr6_p = (unsigned long *)ERR_PTR(args->err); dr6 = *dr6_p; + /* If it's a single step, TRAP bits are random */ + if (dr6 & DR_STEP) + return NOTIFY_DONE; + /* Do an early return if no trap bits are set in DR6 */ if ((dr6 & DR_TRAP_BITS) == 0) return NOTIFY_DONE; diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 58bb239a2fd..12aff253768 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -145,7 +145,7 @@ EXPORT_SYMBOL_GPL(fpu_finit); * The _current_ task is using the FPU for the first time * so initialize it and set the mxcsr to its default * value at reset if we support XMM instructions and then - * remeber the current task has used the FPU. + * remember the current task has used the FPU. */ int init_fpu(struct task_struct *tsk) { @@ -169,6 +169,7 @@ int init_fpu(struct task_struct *tsk) set_stopped_child_used_math(tsk); return 0; } +EXPORT_SYMBOL_GPL(init_fpu); /* * The xstateregs_active() routine is the same as the fpregs_active() routine, diff --git a/arch/x86/kernel/i8237.c b/arch/x86/kernel/i8237.c index b42ca694dc6..8eeaa81de06 100644 --- a/arch/x86/kernel/i8237.c +++ b/arch/x86/kernel/i8237.c @@ -10,7 +10,7 @@ */ #include <linux/init.h> -#include <linux/sysdev.h> +#include <linux/syscore_ops.h> #include <asm/dma.h> @@ -21,7 +21,7 @@ * in asm/dma.h. */ -static int i8237A_resume(struct sys_device *dev) +static void i8237A_resume(void) { unsigned long flags; int i; @@ -41,31 +41,15 @@ static int i8237A_resume(struct sys_device *dev) enable_dma(4); release_dma_lock(flags); - - return 0; } -static int i8237A_suspend(struct sys_device *dev, pm_message_t state) -{ - return 0; -} - -static struct sysdev_class i8237_sysdev_class = { - .name = "i8237", - .suspend = i8237A_suspend, +static struct syscore_ops i8237_syscore_ops = { .resume = i8237A_resume, }; -static struct sys_device device_i8237A = { - .id = 0, - .cls = &i8237_sysdev_class, -}; - -static int __init i8237A_init_sysfs(void) +static int __init i8237A_init_ops(void) { - int error = sysdev_class_register(&i8237_sysdev_class); - if (!error) - error = sysdev_register(&device_i8237A); - return error; + register_syscore_ops(&i8237_syscore_ops); + return 0; } -device_initcall(i8237A_init_sysfs); +device_initcall(i8237A_init_ops); diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c index 2dfd3159744..fb66dc9e36c 100644 --- a/arch/x86/kernel/i8253.c +++ b/arch/x86/kernel/i8253.c @@ -93,7 +93,6 @@ static struct clock_event_device pit_ce = { .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, .set_mode = init_pit_timer, .set_next_event = pit_next_event, - .shift = 32, .irq = 0, }; @@ -108,90 +107,12 @@ void __init setup_pit_timer(void) * IO_APIC has been initialized. */ pit_ce.cpumask = cpumask_of(smp_processor_id()); - pit_ce.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, pit_ce.shift); - pit_ce.max_delta_ns = clockevent_delta2ns(0x7FFF, &pit_ce); - pit_ce.min_delta_ns = clockevent_delta2ns(0xF, &pit_ce); - clockevents_register_device(&pit_ce); + clockevents_config_and_register(&pit_ce, CLOCK_TICK_RATE, 0xF, 0x7FFF); global_clock_event = &pit_ce; } #ifndef CONFIG_X86_64 -/* - * Since the PIT overflows every tick, its not very useful - * to just read by itself. So use jiffies to emulate a free - * running counter: - */ -static cycle_t pit_read(struct clocksource *cs) -{ - static int old_count; - static u32 old_jifs; - unsigned long flags; - int count; - u32 jifs; - - raw_spin_lock_irqsave(&i8253_lock, flags); - /* - * Although our caller may have the read side of xtime_lock, - * this is now a seqlock, and we are cheating in this routine - * by having side effects on state that we cannot undo if - * there is a collision on the seqlock and our caller has to - * retry. (Namely, old_jifs and old_count.) So we must treat - * jiffies as volatile despite the lock. We read jiffies - * before latching the timer count to guarantee that although - * the jiffies value might be older than the count (that is, - * the counter may underflow between the last point where - * jiffies was incremented and the point where we latch the - * count), it cannot be newer. - */ - jifs = jiffies; - outb_pit(0x00, PIT_MODE); /* latch the count ASAP */ - count = inb_pit(PIT_CH0); /* read the latched count */ - count |= inb_pit(PIT_CH0) << 8; - - /* VIA686a test code... reset the latch if count > max + 1 */ - if (count > LATCH) { - outb_pit(0x34, PIT_MODE); - outb_pit(LATCH & 0xff, PIT_CH0); - outb_pit(LATCH >> 8, PIT_CH0); - count = LATCH - 1; - } - - /* - * It's possible for count to appear to go the wrong way for a - * couple of reasons: - * - * 1. The timer counter underflows, but we haven't handled the - * resulting interrupt and incremented jiffies yet. - * 2. Hardware problem with the timer, not giving us continuous time, - * the counter does small "jumps" upwards on some Pentium systems, - * (see c't 95/10 page 335 for Neptun bug.) - * - * Previous attempts to handle these cases intelligently were - * buggy, so we just do the simple thing now. - */ - if (count > old_count && jifs == old_jifs) - count = old_count; - - old_count = count; - old_jifs = jifs; - - raw_spin_unlock_irqrestore(&i8253_lock, flags); - - count = (LATCH - 1) - count; - - return (cycle_t)(jifs * LATCH) + count; -} - -static struct clocksource pit_cs = { - .name = "pit", - .rating = 110, - .read = pit_read, - .mask = CLOCKSOURCE_MASK(32), - .mult = 0, - .shift = 20, -}; - static int __init init_pit_clocksource(void) { /* @@ -205,10 +126,7 @@ static int __init init_pit_clocksource(void) pit_ce.mode != CLOCK_EVT_MODE_PERIODIC) return 0; - pit_cs.mult = clocksource_hz2mult(CLOCK_TICK_RATE, pit_cs.shift); - - return clocksource_register(&pit_cs); + return clocksource_i8253_init(); } arch_initcall(init_pit_clocksource); - #endif /* !CONFIG_X86_64 */ diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 20757cb2efa..65b8f5c2eeb 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -8,7 +8,7 @@ #include <linux/random.h> #include <linux/init.h> #include <linux/kernel_stat.h> -#include <linux/sysdev.h> +#include <linux/syscore_ops.h> #include <linux/bitops.h> #include <linux/acpi.h> #include <linux/io.h> @@ -112,7 +112,7 @@ static void make_8259A_irq(unsigned int irq) { disable_irq_nosync(irq); io_apic_irqs &= ~(1<<irq); - set_irq_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq, + irq_set_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq, i8259A_chip.name); enable_irq(irq); } @@ -245,20 +245,19 @@ static void save_ELCR(char *trigger) trigger[1] = inb(0x4d1) & 0xDE; } -static int i8259A_resume(struct sys_device *dev) +static void i8259A_resume(void) { init_8259A(i8259A_auto_eoi); restore_ELCR(irq_trigger); - return 0; } -static int i8259A_suspend(struct sys_device *dev, pm_message_t state) +static int i8259A_suspend(void) { save_ELCR(irq_trigger); return 0; } -static int i8259A_shutdown(struct sys_device *dev) +static void i8259A_shutdown(void) { /* Put the i8259A into a quiescent state that * the kernel initialization code can get it @@ -266,21 +265,14 @@ static int i8259A_shutdown(struct sys_device *dev) */ outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-1 */ - return 0; } -static struct sysdev_class i8259_sysdev_class = { - .name = "i8259", +static struct syscore_ops i8259_syscore_ops = { .suspend = i8259A_suspend, .resume = i8259A_resume, .shutdown = i8259A_shutdown, }; -static struct sys_device device_i8259A = { - .id = 0, - .cls = &i8259_sysdev_class, -}; - static void mask_8259A(void) { unsigned long flags; @@ -399,17 +391,12 @@ struct legacy_pic default_legacy_pic = { struct legacy_pic *legacy_pic = &default_legacy_pic; -static int __init i8259A_init_sysfs(void) +static int __init i8259A_init_ops(void) { - int error; - - if (legacy_pic != &default_legacy_pic) - return 0; + if (legacy_pic == &default_legacy_pic) + register_syscore_ops(&i8259_syscore_ops); - error = sysdev_class_register(&i8259_sysdev_class); - if (!error) - error = sysdev_register(&device_i8259A); - return error; + return 0; } -device_initcall(i8259A_init_sysfs); +device_initcall(i8259A_init_ops); diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index 8eec0ec59af..8c968974253 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -14,22 +14,9 @@ #include <linux/slab.h> #include <linux/thread_info.h> #include <linux/syscalls.h> +#include <linux/bitmap.h> #include <asm/syscalls.h> -/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ -static void set_bitmap(unsigned long *bitmap, unsigned int base, - unsigned int extent, int new_value) -{ - unsigned int i; - - for (i = base; i < base + extent; i++) { - if (new_value) - __set_bit(i, bitmap); - else - __clear_bit(i, bitmap); - } -} - /* * this changes the io permissions bitmap in the current task. */ @@ -69,7 +56,10 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) */ tss = &per_cpu(init_tss, get_cpu()); - set_bitmap(t->io_bitmap_ptr, from, num, !turn_on); + if (turn_on) + bitmap_clear(t->io_bitmap_ptr, from, num); + else + bitmap_set(t->io_bitmap_ptr, from, num); /* * Search for a (possibly new) maximum. This is simple and stupid, diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 83ec0175f98..6c0802eb2f7 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -4,9 +4,11 @@ #include <linux/cpu.h> #include <linux/interrupt.h> #include <linux/kernel_stat.h> +#include <linux/of.h> #include <linux/seq_file.h> #include <linux/smp.h> #include <linux/ftrace.h> +#include <linux/delay.h> #include <asm/apic.h> #include <asm/io_apic.h> @@ -43,9 +45,9 @@ void ack_bad_irq(unsigned int irq) #define irq_stats(x) (&per_cpu(irq_stat, x)) /* - * /proc/interrupts printing: + * /proc/interrupts printing for arch specific interrupts */ -static int show_other_interrupts(struct seq_file *p, int prec) +int arch_show_interrupts(struct seq_file *p, int prec) { int j; @@ -121,59 +123,6 @@ static int show_other_interrupts(struct seq_file *p, int prec) return 0; } -int show_interrupts(struct seq_file *p, void *v) -{ - unsigned long flags, any_count = 0; - int i = *(loff_t *) v, j, prec; - struct irqaction *action; - struct irq_desc *desc; - - if (i > nr_irqs) - return 0; - - for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec) - j *= 10; - - if (i == nr_irqs) - return show_other_interrupts(p, prec); - - /* print header */ - if (i == 0) { - seq_printf(p, "%*s", prec + 8, ""); - for_each_online_cpu(j) - seq_printf(p, "CPU%-8d", j); - seq_putc(p, '\n'); - } - - desc = irq_to_desc(i); - if (!desc) - return 0; - - raw_spin_lock_irqsave(&desc->lock, flags); - for_each_online_cpu(j) - any_count |= kstat_irqs_cpu(i, j); - action = desc->action; - if (!action && !any_count) - goto out; - - seq_printf(p, "%*d: ", prec, i); - for_each_online_cpu(j) - seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); - seq_printf(p, " %8s", desc->irq_data.chip->name); - seq_printf(p, "-%-8s", desc->name); - - if (action) { - seq_printf(p, " %s", action->name); - while ((action = action->next) != NULL) - seq_printf(p, ", %s", action->name); - } - - seq_putc(p, '\n'); -out: - raw_spin_unlock_irqrestore(&desc->lock, flags); - return 0; -} - /* * /proc/stat helpers */ @@ -234,7 +183,7 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs) exit_idle(); irq_enter(); - irq = __get_cpu_var(vector_irq)[vector]; + irq = __this_cpu_read(vector_irq[vector]); if (!handle_irq(irq, regs)) { ack_APIC_irq(); @@ -283,6 +232,7 @@ void fixup_irqs(void) static int warned; struct irq_desc *desc; struct irq_data *data; + struct irq_chip *chip; for_each_irq_desc(irq, desc) { int break_affinity = 0; @@ -297,10 +247,10 @@ void fixup_irqs(void) /* interrupt's are disabled at this point */ raw_spin_lock(&desc->lock); - data = &desc->irq_data; + data = irq_desc_get_irq_data(desc); affinity = data->affinity; - if (!irq_has_action(irq) || - cpumask_equal(affinity, cpu_online_mask)) { + if (!irq_has_action(irq) || irqd_is_per_cpu(data) || + cpumask_subset(affinity, cpu_online_mask)) { raw_spin_unlock(&desc->lock); continue; } @@ -317,16 +267,18 @@ void fixup_irqs(void) affinity = cpu_all_mask; } - if (!(desc->status & IRQ_MOVE_PCNTXT) && data->chip->irq_mask) - data->chip->irq_mask(data); + chip = irq_data_get_irq_chip(data); + if (!irqd_can_move_in_process_context(data) && chip->irq_mask) + chip->irq_mask(data); - if (data->chip->irq_set_affinity) - data->chip->irq_set_affinity(data, affinity, true); + if (chip->irq_set_affinity) + chip->irq_set_affinity(data, affinity, true); else if (!(warned++)) set_affinity = 0; - if (!(desc->status & IRQ_MOVE_PCNTXT) && data->chip->irq_unmask) - data->chip->irq_unmask(data); + if (!irqd_can_move_in_process_context(data) && + !irqd_irq_disabled(data) && chip->irq_unmask) + chip->irq_unmask(data); raw_spin_unlock(&desc->lock); @@ -350,17 +302,19 @@ void fixup_irqs(void) for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { unsigned int irr; - if (__get_cpu_var(vector_irq)[vector] < 0) + if (__this_cpu_read(vector_irq[vector]) < 0) continue; irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); if (irr & (1 << (vector % 32))) { - irq = __get_cpu_var(vector_irq)[vector]; + irq = __this_cpu_read(vector_irq[vector]); - data = irq_get_irq_data(irq); + desc = irq_to_desc(irq); + data = irq_desc_get_irq_data(desc); + chip = irq_data_get_irq_chip(data); raw_spin_lock(&desc->lock); - if (data->chip->irq_retrigger) - data->chip->irq_retrigger(data); + if (chip->irq_retrigger) + chip->irq_retrigger(data); raw_spin_unlock(&desc->lock); } } diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 50fbbe60e50..72090705a65 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -17,6 +17,7 @@ #include <linux/delay.h> #include <linux/uaccess.h> #include <linux/percpu.h> +#include <linux/mm.h> #include <asm/apic.h> @@ -60,9 +61,6 @@ union irq_ctx { static DEFINE_PER_CPU(union irq_ctx *, hardirq_ctx); static DEFINE_PER_CPU(union irq_ctx *, softirq_ctx); -static DEFINE_PER_CPU_MULTIPAGE_ALIGNED(union irq_ctx, hardirq_stack, THREAD_SIZE); -static DEFINE_PER_CPU_MULTIPAGE_ALIGNED(union irq_ctx, softirq_stack, THREAD_SIZE); - static void call_on_stack(void *func, void *stack) { asm volatile("xchgl %%ebx,%%esp \n" @@ -81,7 +79,7 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) u32 *isp, arg1, arg2; curctx = (union irq_ctx *) current_thread_info(); - irqctx = __get_cpu_var(hardirq_ctx); + irqctx = __this_cpu_read(hardirq_ctx); /* * this is where we switch to the IRQ stack. However, if we are @@ -128,20 +126,21 @@ void __cpuinit irq_ctx_init(int cpu) if (per_cpu(hardirq_ctx, cpu)) return; - irqctx = &per_cpu(hardirq_stack, cpu); - irqctx->tinfo.task = NULL; - irqctx->tinfo.exec_domain = NULL; + irqctx = page_address(alloc_pages_node(cpu_to_node(cpu), + THREAD_FLAGS, + THREAD_ORDER)); + memset(&irqctx->tinfo, 0, sizeof(struct thread_info)); irqctx->tinfo.cpu = cpu; irqctx->tinfo.preempt_count = HARDIRQ_OFFSET; irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); per_cpu(hardirq_ctx, cpu) = irqctx; - irqctx = &per_cpu(softirq_stack, cpu); - irqctx->tinfo.task = NULL; - irqctx->tinfo.exec_domain = NULL; + irqctx = page_address(alloc_pages_node(cpu_to_node(cpu), + THREAD_FLAGS, + THREAD_ORDER)); + memset(&irqctx->tinfo, 0, sizeof(struct thread_info)); irqctx->tinfo.cpu = cpu; - irqctx->tinfo.preempt_count = 0; irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); per_cpu(softirq_ctx, cpu) = irqctx; @@ -150,11 +149,6 @@ void __cpuinit irq_ctx_init(int cpu) cpu, per_cpu(hardirq_ctx, cpu), per_cpu(softirq_ctx, cpu)); } -void irq_ctx_exit(int cpu) -{ - per_cpu(hardirq_ctx, cpu) = NULL; -} - asmlinkage void do_softirq(void) { unsigned long flags; @@ -169,7 +163,7 @@ asmlinkage void do_softirq(void) if (local_softirq_pending()) { curctx = current_thread_info(); - irqctx = __get_cpu_var(softirq_ctx); + irqctx = __this_cpu_read(softirq_ctx); irqctx->tinfo.task = curctx->task; irqctx->tinfo.previous_esp = current_stack_pointer; @@ -178,7 +172,7 @@ asmlinkage void do_softirq(void) call_on_stack(__do_softirq, isp); /* - * Shouldnt happen, we returned above if in_interrupt(): + * Shouldn't happen, we returned above if in_interrupt(): */ WARN_ON_ONCE(softirq_count()); } diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index c752e973958..f470e4ef993 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -25,6 +25,7 @@ #include <asm/setup.h> #include <asm/i8259.h> #include <asm/traps.h> +#include <asm/prom.h> /* * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: @@ -71,6 +72,7 @@ static irqreturn_t math_error_irq(int cpl, void *dev_id) static struct irqaction fpu_irq = { .handler = math_error_irq, .name = "fpu", + .flags = IRQF_NO_THREAD, }; #endif @@ -80,6 +82,7 @@ static struct irqaction fpu_irq = { static struct irqaction irq2 = { .handler = no_action, .name = "cascade", + .flags = IRQF_NO_THREAD, }; DEFINE_PER_CPU(vector_irq_t, vector_irq) = { @@ -110,7 +113,7 @@ void __init init_ISA_irqs(void) legacy_pic->init(0); for (i = 0; i < legacy_pic->nr_legacy_irqs; i++) - set_irq_chip_and_handler_name(i, chip, handle_level_irq, name); + irq_set_chip_and_handler_name(i, chip, handle_level_irq, name); } void __init init_IRQ(void) @@ -118,6 +121,12 @@ void __init init_IRQ(void) int i; /* + * We probably need a better place for this, but it works for + * now ... + */ + x86_add_irq_domains(); + + /* * On cpu 0, Assign IRQ0_VECTOR..IRQ15_VECTOR's to IRQ 0..15. * If these IRQ's are handled by legacy interrupt-controllers like PIC, * then this configuration will likely be static after the boot. If @@ -164,14 +173,77 @@ static void __init smp_intr_init(void) alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); /* IPIs for invalidation */ - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6); - alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7); +#define ALLOC_INVTLB_VEC(NR) \ + alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+NR, \ + invalidate_interrupt##NR) + + switch (NUM_INVALIDATE_TLB_VECTORS) { + default: + ALLOC_INVTLB_VEC(31); + case 31: + ALLOC_INVTLB_VEC(30); + case 30: + ALLOC_INVTLB_VEC(29); + case 29: + ALLOC_INVTLB_VEC(28); + case 28: + ALLOC_INVTLB_VEC(27); + case 27: + ALLOC_INVTLB_VEC(26); + case 26: + ALLOC_INVTLB_VEC(25); + case 25: + ALLOC_INVTLB_VEC(24); + case 24: + ALLOC_INVTLB_VEC(23); + case 23: + ALLOC_INVTLB_VEC(22); + case 22: + ALLOC_INVTLB_VEC(21); + case 21: + ALLOC_INVTLB_VEC(20); + case 20: + ALLOC_INVTLB_VEC(19); + case 19: + ALLOC_INVTLB_VEC(18); + case 18: + ALLOC_INVTLB_VEC(17); + case 17: + ALLOC_INVTLB_VEC(16); + case 16: + ALLOC_INVTLB_VEC(15); + case 15: + ALLOC_INVTLB_VEC(14); + case 14: + ALLOC_INVTLB_VEC(13); + case 13: + ALLOC_INVTLB_VEC(12); + case 12: + ALLOC_INVTLB_VEC(11); + case 11: + ALLOC_INVTLB_VEC(10); + case 10: + ALLOC_INVTLB_VEC(9); + case 9: + ALLOC_INVTLB_VEC(8); + case 8: + ALLOC_INVTLB_VEC(7); + case 7: + ALLOC_INVTLB_VEC(6); + case 6: + ALLOC_INVTLB_VEC(5); + case 5: + ALLOC_INVTLB_VEC(4); + case 4: + ALLOC_INVTLB_VEC(3); + case 3: + ALLOC_INVTLB_VEC(2); + case 2: + ALLOC_INVTLB_VEC(1); + case 1: + ALLOC_INVTLB_VEC(0); + break; + } /* IPI for generic function call */ alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); @@ -243,7 +315,7 @@ void __init native_init_IRQ(void) set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); } - if (!acpi_ioapic) + if (!acpi_ioapic && !of_ioapic) setup_irq(2, &irq2); #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index 961b6b30ba9..3fee346ef54 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -34,7 +34,7 @@ void arch_jump_label_transform(struct jump_entry *entry, code.offset = entry->target - (entry->code + JUMP_LABEL_NOP_SIZE); } else - memcpy(&code, ideal_nop5, JUMP_LABEL_NOP_SIZE); + memcpy(&code, ideal_nops[NOP_ATOMIC5], JUMP_LABEL_NOP_SIZE); get_online_cpus(); mutex_lock(&text_mutex); text_poke_smp((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE); @@ -44,7 +44,8 @@ void arch_jump_label_transform(struct jump_entry *entry, void arch_jump_label_text_poke_early(jump_label_t addr) { - text_poke_early((void *)addr, ideal_nop5, JUMP_LABEL_NOP_SIZE); + text_poke_early((void *)addr, ideal_nops[NOP_ATOMIC5], + JUMP_LABEL_NOP_SIZE); } #endif diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index d81cfebb848..5f9ecff328b 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -48,6 +48,7 @@ #include <asm/apicdef.h> #include <asm/system.h> #include <asm/apic.h> +#include <asm/nmi.h> struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = { @@ -120,8 +121,8 @@ char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs) memcpy(mem, (void *)regs + dbg_reg_def[regno].offset, dbg_reg_def[regno].size); - switch (regno) { #ifdef CONFIG_X86_32 + switch (regno) { case GDB_SS: if (!user_mode_vm(regs)) *(unsigned long *)mem = __KERNEL_DS; @@ -134,8 +135,8 @@ char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs) case GDB_FS: *(unsigned long *)mem = 0xFFFF; break; -#endif } +#endif return dbg_reg_def[regno].name; } @@ -277,7 +278,7 @@ static int hw_break_release_slot(int breakno) pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu); if (dbg_release_bp_slot(*pevent)) /* - * The debugger is responisble for handing the retry on + * The debugger is responsible for handing the retry on * remove failure. */ return -1; @@ -315,14 +316,18 @@ static void kgdb_remove_all_hw_break(void) if (!breakinfo[i].enabled) continue; bp = *per_cpu_ptr(breakinfo[i].pev, cpu); - if (bp->attr.disabled == 1) + if (!bp->attr.disabled) { + arch_uninstall_hw_breakpoint(bp); + bp->attr.disabled = 1; continue; + } if (dbg_is_early) early_dr7 &= ~encode_dr7(i, breakinfo[i].len, breakinfo[i].type); - else - arch_uninstall_hw_breakpoint(bp); - bp->attr.disabled = 1; + else if (hw_break_release_slot(i)) + printk(KERN_ERR "KGDB: hw bpt remove failed %lx\n", + breakinfo[i].addr); + breakinfo[i].enabled = 0; } } @@ -387,7 +392,7 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) * disable hardware debugging while it is processing gdb packets or * handling exception. */ -void kgdb_disable_hw_debug(struct pt_regs *regs) +static void kgdb_disable_hw_debug(struct pt_regs *regs) { int i; int cpu = raw_smp_processor_id(); @@ -521,10 +526,6 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd) } return NOTIFY_DONE; - case DIE_NMI_IPI: - /* Just ignore, we will handle the roundup on DIE_NMI. */ - return NOTIFY_DONE; - case DIE_NMIUNKNOWN: if (was_in_debug_nmi[raw_smp_processor_id()]) { was_in_debug_nmi[raw_smp_processor_id()] = 0; @@ -532,15 +533,6 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd) } return NOTIFY_DONE; - case DIE_NMIWATCHDOG: - if (atomic_read(&kgdb_active) != -1) { - /* KGDB CPU roundup: */ - kgdb_nmicallback(raw_smp_processor_id(), regs); - return NOTIFY_STOP; - } - /* Enter debugger: */ - break; - case DIE_DEBUG: if (atomic_read(&kgdb_cpu_doing_single_step) != -1) { if (user_mode(regs)) @@ -602,7 +594,7 @@ static struct notifier_block kgdb_notifier = { /* * Lowest-prio notifier priority, we want to be notified last: */ - .priority = -INT_MAX, + .priority = NMI_LOCAL_LOW_PRIOR, }; /** @@ -724,6 +716,7 @@ struct kgdb_arch arch_kgdb_ops = { .flags = KGDB_HW_BREAKPOINT, .set_hw_breakpoint = kgdb_set_hw_break, .remove_hw_breakpoint = kgdb_remove_hw_break, + .disable_hw_break = kgdb_disable_hw_debug, .remove_all_hw_break = kgdb_remove_all_hw_break, .correct_hw_break = kgdb_correct_hw_break, }; diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 1cbd54c0df9..f1a6244d7d9 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -403,7 +403,7 @@ static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb) static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb) { - __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp; + __this_cpu_write(current_kprobe, kcb->prev_kprobe.kp); kcb->kprobe_status = kcb->prev_kprobe.status; kcb->kprobe_old_flags = kcb->prev_kprobe.old_flags; kcb->kprobe_saved_flags = kcb->prev_kprobe.saved_flags; @@ -412,7 +412,7 @@ static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb) static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb) { - __get_cpu_var(current_kprobe) = p; + __this_cpu_write(current_kprobe, p); kcb->kprobe_saved_flags = kcb->kprobe_old_flags = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF)); if (is_IF_modifier(p->ainsn.insn)) @@ -586,7 +586,7 @@ static int __kprobes kprobe_handler(struct pt_regs *regs) preempt_enable_no_resched(); return 1; } else if (kprobe_running()) { - p = __get_cpu_var(current_kprobe); + p = __this_cpu_read(current_kprobe); if (p->break_handler && p->break_handler(p, regs)) { setup_singlestep(p, regs, kcb, 0); return 1; @@ -759,11 +759,11 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs) orig_ret_address = (unsigned long)ri->ret_addr; if (ri->rp && ri->rp->handler) { - __get_cpu_var(current_kprobe) = &ri->rp->kp; + __this_cpu_write(current_kprobe, &ri->rp->kp); get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE; ri->ret_addr = correct_ret_addr; ri->rp->handler(ri, regs); - __get_cpu_var(current_kprobe) = NULL; + __this_cpu_write(current_kprobe, NULL); } recycle_rp_inst(ri, &empty_rp); @@ -1183,8 +1183,13 @@ static void __kprobes optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs) { struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + unsigned long flags; - preempt_disable(); + /* This is possible if op is under delayed unoptimizing */ + if (kprobe_disabled(&op->kp)) + return; + + local_irq_save(flags); if (kprobe_running()) { kprobes_inc_nmissed_count(&op->kp); } else { @@ -1198,12 +1203,12 @@ static void __kprobes optimized_callback(struct optimized_kprobe *op, regs->ip = (unsigned long)op->kp.addr + INT3_SIZE; regs->orig_ax = ~0UL; - __get_cpu_var(current_kprobe) = &op->kp; + __this_cpu_write(current_kprobe, &op->kp); kcb->kprobe_status = KPROBE_HIT_ACTIVE; opt_pre_handler(&op->kp, regs); - __get_cpu_var(current_kprobe) = NULL; + __this_cpu_write(current_kprobe, NULL); } - preempt_enable_no_resched(); + local_irq_restore(flags); } static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src) @@ -1272,6 +1277,14 @@ static int __kprobes can_optimize(unsigned long paddr) if (!kallsyms_lookup_size_offset(paddr, &size, &offset)) return 0; + /* + * Do not optimize in the entry code due to the unstable + * stack handling. + */ + if ((paddr >= (unsigned long )__entry_text_start) && + (paddr < (unsigned long )__entry_text_end)) + return 0; + /* Check there is enough space for a relative jump. */ if (size - offset < RELATIVEJUMP_SIZE) return 0; @@ -1401,10 +1414,16 @@ int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op) return 0; } -/* Replace a breakpoint (int3) with a relative jump. */ -int __kprobes arch_optimize_kprobe(struct optimized_kprobe *op) +#define MAX_OPTIMIZE_PROBES 256 +static struct text_poke_param *jump_poke_params; +static struct jump_poke_buffer { + u8 buf[RELATIVEJUMP_SIZE]; +} *jump_poke_bufs; + +static void __kprobes setup_optimize_kprobe(struct text_poke_param *tprm, + u8 *insn_buf, + struct optimized_kprobe *op) { - unsigned char jmp_code[RELATIVEJUMP_SIZE]; s32 rel = (s32)((long)op->optinsn.insn - ((long)op->kp.addr + RELATIVEJUMP_SIZE)); @@ -1412,16 +1431,79 @@ int __kprobes arch_optimize_kprobe(struct optimized_kprobe *op) memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE, RELATIVE_ADDR_SIZE); - jmp_code[0] = RELATIVEJUMP_OPCODE; - *(s32 *)(&jmp_code[1]) = rel; + insn_buf[0] = RELATIVEJUMP_OPCODE; + *(s32 *)(&insn_buf[1]) = rel; + + tprm->addr = op->kp.addr; + tprm->opcode = insn_buf; + tprm->len = RELATIVEJUMP_SIZE; +} + +/* + * Replace breakpoints (int3) with relative jumps. + * Caller must call with locking kprobe_mutex and text_mutex. + */ +void __kprobes arch_optimize_kprobes(struct list_head *oplist) +{ + struct optimized_kprobe *op, *tmp; + int c = 0; + + list_for_each_entry_safe(op, tmp, oplist, list) { + WARN_ON(kprobe_disabled(&op->kp)); + /* Setup param */ + setup_optimize_kprobe(&jump_poke_params[c], + jump_poke_bufs[c].buf, op); + list_del_init(&op->list); + if (++c >= MAX_OPTIMIZE_PROBES) + break; + } /* * text_poke_smp doesn't support NMI/MCE code modifying. * However, since kprobes itself also doesn't support NMI/MCE * code probing, it's not a problem. */ - text_poke_smp(op->kp.addr, jmp_code, RELATIVEJUMP_SIZE); - return 0; + text_poke_smp_batch(jump_poke_params, c); +} + +static void __kprobes setup_unoptimize_kprobe(struct text_poke_param *tprm, + u8 *insn_buf, + struct optimized_kprobe *op) +{ + /* Set int3 to first byte for kprobes */ + insn_buf[0] = BREAKPOINT_INSTRUCTION; + memcpy(insn_buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); + + tprm->addr = op->kp.addr; + tprm->opcode = insn_buf; + tprm->len = RELATIVEJUMP_SIZE; +} + +/* + * Recover original instructions and breakpoints from relative jumps. + * Caller must call with locking kprobe_mutex. + */ +extern void arch_unoptimize_kprobes(struct list_head *oplist, + struct list_head *done_list) +{ + struct optimized_kprobe *op, *tmp; + int c = 0; + + list_for_each_entry_safe(op, tmp, oplist, list) { + /* Setup param */ + setup_unoptimize_kprobe(&jump_poke_params[c], + jump_poke_bufs[c].buf, op); + list_move(&op->list, done_list); + if (++c >= MAX_OPTIMIZE_PROBES) + break; + } + + /* + * text_poke_smp doesn't support NMI/MCE code modifying. + * However, since kprobes itself also doesn't support NMI/MCE + * code probing, it's not a problem. + */ + text_poke_smp_batch(jump_poke_params, c); } /* Replace a relative jump with a breakpoint (int3). */ @@ -1453,11 +1535,35 @@ static int __kprobes setup_detour_execution(struct kprobe *p, } return 0; } + +static int __kprobes init_poke_params(void) +{ + /* Allocate code buffer and parameter array */ + jump_poke_bufs = kmalloc(sizeof(struct jump_poke_buffer) * + MAX_OPTIMIZE_PROBES, GFP_KERNEL); + if (!jump_poke_bufs) + return -ENOMEM; + + jump_poke_params = kmalloc(sizeof(struct text_poke_param) * + MAX_OPTIMIZE_PROBES, GFP_KERNEL); + if (!jump_poke_params) { + kfree(jump_poke_bufs); + jump_poke_bufs = NULL; + return -ENOMEM; + } + + return 0; +} +#else /* !CONFIG_OPTPROBES */ +static int __kprobes init_poke_params(void) +{ + return 0; +} #endif int __init arch_init_kprobes(void) { - return 0; + return init_poke_params(); } int __kprobes arch_trampoline_kprobe(struct kprobe *p) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 63b0ec8d3d4..33c07b0b122 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -27,16 +27,37 @@ #include <linux/mm.h> #include <linux/highmem.h> #include <linux/hardirq.h> +#include <linux/notifier.h> +#include <linux/reboot.h> +#include <linux/hash.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/kprobes.h> #include <asm/timer.h> +#include <asm/cpu.h> +#include <asm/traps.h> +#include <asm/desc.h> +#include <asm/tlbflush.h> #define MMU_QUEUE_SIZE 1024 +static int kvmapf = 1; + +static int parse_no_kvmapf(char *arg) +{ + kvmapf = 0; + return 0; +} + +early_param("no-kvmapf", parse_no_kvmapf); + struct kvm_para_state { u8 mmu_queue[MMU_QUEUE_SIZE]; int mmu_queue_len; }; static DEFINE_PER_CPU(struct kvm_para_state, para_state); +static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); static struct kvm_para_state *kvm_para_state(void) { @@ -50,6 +71,195 @@ static void kvm_io_delay(void) { } +#define KVM_TASK_SLEEP_HASHBITS 8 +#define KVM_TASK_SLEEP_HASHSIZE (1<<KVM_TASK_SLEEP_HASHBITS) + +struct kvm_task_sleep_node { + struct hlist_node link; + wait_queue_head_t wq; + u32 token; + int cpu; + bool halted; + struct mm_struct *mm; +}; + +static struct kvm_task_sleep_head { + spinlock_t lock; + struct hlist_head list; +} async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE]; + +static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b, + u32 token) +{ + struct hlist_node *p; + + hlist_for_each(p, &b->list) { + struct kvm_task_sleep_node *n = + hlist_entry(p, typeof(*n), link); + if (n->token == token) + return n; + } + + return NULL; +} + +void kvm_async_pf_task_wait(u32 token) +{ + u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS); + struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; + struct kvm_task_sleep_node n, *e; + DEFINE_WAIT(wait); + int cpu, idle; + + cpu = get_cpu(); + idle = idle_cpu(cpu); + put_cpu(); + + spin_lock(&b->lock); + e = _find_apf_task(b, token); + if (e) { + /* dummy entry exist -> wake up was delivered ahead of PF */ + hlist_del(&e->link); + kfree(e); + spin_unlock(&b->lock); + return; + } + + n.token = token; + n.cpu = smp_processor_id(); + n.mm = current->active_mm; + n.halted = idle || preempt_count() > 1; + atomic_inc(&n.mm->mm_count); + init_waitqueue_head(&n.wq); + hlist_add_head(&n.link, &b->list); + spin_unlock(&b->lock); + + for (;;) { + if (!n.halted) + prepare_to_wait(&n.wq, &wait, TASK_UNINTERRUPTIBLE); + if (hlist_unhashed(&n.link)) + break; + + if (!n.halted) { + local_irq_enable(); + schedule(); + local_irq_disable(); + } else { + /* + * We cannot reschedule. So halt. + */ + native_safe_halt(); + local_irq_disable(); + } + } + if (!n.halted) + finish_wait(&n.wq, &wait); + + return; +} +EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait); + +static void apf_task_wake_one(struct kvm_task_sleep_node *n) +{ + hlist_del_init(&n->link); + if (!n->mm) + return; + mmdrop(n->mm); + if (n->halted) + smp_send_reschedule(n->cpu); + else if (waitqueue_active(&n->wq)) + wake_up(&n->wq); +} + +static void apf_task_wake_all(void) +{ + int i; + + for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) { + struct hlist_node *p, *next; + struct kvm_task_sleep_head *b = &async_pf_sleepers[i]; + spin_lock(&b->lock); + hlist_for_each_safe(p, next, &b->list) { + struct kvm_task_sleep_node *n = + hlist_entry(p, typeof(*n), link); + if (n->cpu == smp_processor_id()) + apf_task_wake_one(n); + } + spin_unlock(&b->lock); + } +} + +void kvm_async_pf_task_wake(u32 token) +{ + u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS); + struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; + struct kvm_task_sleep_node *n; + + if (token == ~0) { + apf_task_wake_all(); + return; + } + +again: + spin_lock(&b->lock); + n = _find_apf_task(b, token); + if (!n) { + /* + * async PF was not yet handled. + * Add dummy entry for the token. + */ + n = kmalloc(sizeof(*n), GFP_ATOMIC); + if (!n) { + /* + * Allocation failed! Busy wait while other cpu + * handles async PF. + */ + spin_unlock(&b->lock); + cpu_relax(); + goto again; + } + n->token = token; + n->cpu = smp_processor_id(); + n->mm = NULL; + init_waitqueue_head(&n->wq); + hlist_add_head(&n->link, &b->list); + } else + apf_task_wake_one(n); + spin_unlock(&b->lock); + return; +} +EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake); + +u32 kvm_read_and_reset_pf_reason(void) +{ + u32 reason = 0; + + if (__get_cpu_var(apf_reason).enabled) { + reason = __get_cpu_var(apf_reason).reason; + __get_cpu_var(apf_reason).reason = 0; + } + + return reason; +} +EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason); + +dotraplinkage void __kprobes +do_async_page_fault(struct pt_regs *regs, unsigned long error_code) +{ + switch (kvm_read_and_reset_pf_reason()) { + default: + do_page_fault(regs, error_code); + break; + case KVM_PV_REASON_PAGE_NOT_PRESENT: + /* page is swapped out by the host. */ + kvm_async_pf_task_wait((u32)read_cr2()); + break; + case KVM_PV_REASON_PAGE_READY: + kvm_async_pf_task_wake((u32)read_cr2()); + break; + } +} + static void kvm_mmu_op(void *buffer, unsigned len) { int r; @@ -231,10 +441,117 @@ static void __init paravirt_ops_setup(void) #endif } +void __cpuinit kvm_guest_cpu_init(void) +{ + if (!kvm_para_available()) + return; + + if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) { + u64 pa = __pa(&__get_cpu_var(apf_reason)); + +#ifdef CONFIG_PREEMPT + pa |= KVM_ASYNC_PF_SEND_ALWAYS; +#endif + wrmsrl(MSR_KVM_ASYNC_PF_EN, pa | KVM_ASYNC_PF_ENABLED); + __get_cpu_var(apf_reason).enabled = 1; + printk(KERN_INFO"KVM setup async PF for cpu %d\n", + smp_processor_id()); + } +} + +static void kvm_pv_disable_apf(void *unused) +{ + if (!__get_cpu_var(apf_reason).enabled) + return; + + wrmsrl(MSR_KVM_ASYNC_PF_EN, 0); + __get_cpu_var(apf_reason).enabled = 0; + + printk(KERN_INFO"Unregister pv shared memory for cpu %d\n", + smp_processor_id()); +} + +static int kvm_pv_reboot_notify(struct notifier_block *nb, + unsigned long code, void *unused) +{ + if (code == SYS_RESTART) + on_each_cpu(kvm_pv_disable_apf, NULL, 1); + return NOTIFY_DONE; +} + +static struct notifier_block kvm_pv_reboot_nb = { + .notifier_call = kvm_pv_reboot_notify, +}; + +#ifdef CONFIG_SMP +static void __init kvm_smp_prepare_boot_cpu(void) +{ +#ifdef CONFIG_KVM_CLOCK + WARN_ON(kvm_register_clock("primary cpu clock")); +#endif + kvm_guest_cpu_init(); + native_smp_prepare_boot_cpu(); +} + +static void __cpuinit kvm_guest_cpu_online(void *dummy) +{ + kvm_guest_cpu_init(); +} + +static void kvm_guest_cpu_offline(void *dummy) +{ + kvm_pv_disable_apf(NULL); + apf_task_wake_all(); +} + +static int __cpuinit kvm_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + int cpu = (unsigned long)hcpu; + switch (action) { + case CPU_ONLINE: + case CPU_DOWN_FAILED: + case CPU_ONLINE_FROZEN: + smp_call_function_single(cpu, kvm_guest_cpu_online, NULL, 0); + break; + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + smp_call_function_single(cpu, kvm_guest_cpu_offline, NULL, 1); + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata kvm_cpu_notifier = { + .notifier_call = kvm_cpu_notify, +}; +#endif + +static void __init kvm_apf_trap_init(void) +{ + set_intr_gate(14, &async_page_fault); +} + void __init kvm_guest_init(void) { + int i; + if (!kvm_para_available()) return; paravirt_ops_setup(); + register_reboot_notifier(&kvm_pv_reboot_nb); + for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) + spin_lock_init(&async_pf_sleepers[i].lock); + if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF)) + x86_init.irqs.trap_init = kvm_apf_trap_init; + +#ifdef CONFIG_SMP + smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; + register_cpu_notifier(&kvm_cpu_notifier); +#else + kvm_guest_cpu_init(); +#endif } diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index ca43ce31a19..6389a6bca11 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -26,8 +26,6 @@ #include <asm/x86_init.h> #include <asm/reboot.h> -#define KVM_SCALE 22 - static int kvmclock = 1; static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK; @@ -120,12 +118,10 @@ static struct clocksource kvm_clock = { .read = kvm_clock_get_cycles, .rating = 400, .mask = CLOCKSOURCE_MASK(64), - .mult = 1 << KVM_SCALE, - .shift = KVM_SCALE, .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; -static int kvm_register_clock(char *txt) +int kvm_register_clock(char *txt) { int cpu = smp_processor_id(); int low, high, ret; @@ -152,14 +148,6 @@ static void __cpuinit kvm_setup_secondary_clock(void) } #endif -#ifdef CONFIG_SMP -static void __init kvm_smp_prepare_boot_cpu(void) -{ - WARN_ON(kvm_register_clock("primary cpu clock")); - native_smp_prepare_boot_cpu(); -} -#endif - /* * After the clock is registered, the host will keep writing to the * registered memory location. If the guest happens to shutdown, this memory @@ -206,15 +194,12 @@ void __init kvmclock_init(void) x86_cpuinit.setup_percpu_clockev = kvm_setup_secondary_clock; #endif -#ifdef CONFIG_SMP - smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; -#endif machine_ops.shutdown = kvm_shutdown; #ifdef CONFIG_KEXEC machine_ops.crash_shutdown = kvm_crash_shutdown; #endif kvm_get_preset_lpj(); - clocksource_register(&kvm_clock); + clocksource_register_hz(&kvm_clock, NSEC_PER_SEC); pv_info.paravirt_enabled = 1; pv_info.name = "KVM"; diff --git a/arch/x86/kernel/mca_32.c b/arch/x86/kernel/mca_32.c index 63eaf659623..177183cbb6a 100644 --- a/arch/x86/kernel/mca_32.c +++ b/arch/x86/kernel/mca_32.c @@ -259,7 +259,7 @@ static int __init mca_init(void) /* * WARNING: Be careful when making changes here. Putting an adapter * and the motherboard simultaneously into setup mode may result in - * damage to chips (according to The Indispensible PC Hardware Book + * damage to chips (according to The Indispensable PC Hardware Book * by Hans-Peter Messmer). Also, we disable system interrupts (so * that we are not disturbed in the middle of this). */ diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index e1af7c055c7..c5610384ab1 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -66,7 +66,6 @@ struct microcode_amd { unsigned int mpb[0]; }; -#define UCODE_MAX_SIZE 2048 #define UCODE_CONTAINER_SECTION_HDR 8 #define UCODE_CONTAINER_HEADER_SIZE 12 @@ -77,20 +76,20 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) struct cpuinfo_x86 *c = &cpu_data(cpu); u32 dummy; - memset(csig, 0, sizeof(*csig)); if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) { - pr_warning("microcode: CPU%d: AMD CPU family 0x%x not " - "supported\n", cpu, c->x86); + pr_warning("CPU%d: family %d not supported\n", cpu, c->x86); return -1; } + rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy); - pr_info("CPU%d: patch_level=0x%x\n", cpu, csig->rev); + pr_info("CPU%d: patch_level=0x%08x\n", cpu, csig->rev); + return 0; } -static int get_matching_microcode(int cpu, void *mc, int rev) +static int get_matching_microcode(int cpu, struct microcode_header_amd *mc_hdr, + int rev) { - struct microcode_header_amd *mc_header = mc; unsigned int current_cpu_id; u16 equiv_cpu_id = 0; unsigned int i = 0; @@ -109,17 +108,17 @@ static int get_matching_microcode(int cpu, void *mc, int rev) if (!equiv_cpu_id) return 0; - if (mc_header->processor_rev_id != equiv_cpu_id) + if (mc_hdr->processor_rev_id != equiv_cpu_id) return 0; /* ucode might be chipset specific -- currently we don't support this */ - if (mc_header->nb_dev_id || mc_header->sb_dev_id) { - pr_err("CPU%d: loading of chipset specific code not yet supported\n", + if (mc_hdr->nb_dev_id || mc_hdr->sb_dev_id) { + pr_err("CPU%d: chipset specific code not yet supported\n", cpu); return 0; } - if (mc_header->patch_id <= rev) + if (mc_hdr->patch_id <= rev) return 0; return 1; @@ -144,85 +143,93 @@ static int apply_microcode_amd(int cpu) /* check current patch id and patch's id for match */ if (rev != mc_amd->hdr.patch_id) { - pr_err("CPU%d: update failed (for patch_level=0x%x)\n", + pr_err("CPU%d: update failed for patch_level=0x%08x\n", cpu, mc_amd->hdr.patch_id); return -1; } - pr_info("CPU%d: updated (new patch_level=0x%x)\n", cpu, rev); + pr_info("CPU%d: new patch_level=0x%08x\n", cpu, rev); uci->cpu_sig.rev = rev; return 0; } -static int get_ucode_data(void *to, const u8 *from, size_t n) +static unsigned int verify_ucode_size(int cpu, const u8 *buf, unsigned int size) { - memcpy(to, from, n); - return 0; + struct cpuinfo_x86 *c = &cpu_data(cpu); + unsigned int max_size, actual_size; + +#define F1XH_MPB_MAX_SIZE 2048 +#define F14H_MPB_MAX_SIZE 1824 +#define F15H_MPB_MAX_SIZE 4096 + + switch (c->x86) { + case 0x14: + max_size = F14H_MPB_MAX_SIZE; + break; + case 0x15: + max_size = F15H_MPB_MAX_SIZE; + break; + default: + max_size = F1XH_MPB_MAX_SIZE; + break; + } + + actual_size = buf[4] + (buf[5] << 8); + + if (actual_size > size || actual_size > max_size) { + pr_err("section size mismatch\n"); + return 0; + } + + return actual_size; } -static void * -get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size) +static struct microcode_header_amd * +get_next_ucode(int cpu, const u8 *buf, unsigned int size, unsigned int *mc_size) { - unsigned int total_size; - u8 section_hdr[UCODE_CONTAINER_SECTION_HDR]; - void *mc; + struct microcode_header_amd *mc = NULL; + unsigned int actual_size = 0; - if (get_ucode_data(section_hdr, buf, UCODE_CONTAINER_SECTION_HDR)) - return NULL; - - if (section_hdr[0] != UCODE_UCODE_TYPE) { - pr_err("error: invalid type field in container file section header\n"); - return NULL; + if (buf[0] != UCODE_UCODE_TYPE) { + pr_err("invalid type field in container file section header\n"); + goto out; } - total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8)); + actual_size = verify_ucode_size(cpu, buf, size); + if (!actual_size) + goto out; - if (total_size > size || total_size > UCODE_MAX_SIZE) { - pr_err("error: size mismatch\n"); - return NULL; - } + mc = vzalloc(actual_size); + if (!mc) + goto out; - mc = vmalloc(UCODE_MAX_SIZE); - if (mc) { - memset(mc, 0, UCODE_MAX_SIZE); - if (get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, - total_size)) { - vfree(mc); - mc = NULL; - } else - *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR; - } + get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, actual_size); + *mc_size = actual_size + UCODE_CONTAINER_SECTION_HDR; + +out: return mc; } static int install_equiv_cpu_table(const u8 *buf) { - u8 *container_hdr[UCODE_CONTAINER_HEADER_SIZE]; - unsigned int *buf_pos = (unsigned int *)container_hdr; - unsigned long size; - - if (get_ucode_data(&container_hdr, buf, UCODE_CONTAINER_HEADER_SIZE)) - return 0; - - size = buf_pos[2]; - - if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) { - pr_err("error: invalid type field in container file section header\n"); - return 0; + unsigned int *ibuf = (unsigned int *)buf; + unsigned int type = ibuf[1]; + unsigned int size = ibuf[2]; + + if (type != UCODE_EQUIV_CPU_TABLE_TYPE || !size) { + pr_err("empty section/" + "invalid type field in container file section header\n"); + return -EINVAL; } - equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size); + equiv_cpu_table = vmalloc(size); if (!equiv_cpu_table) { pr_err("failed to allocate equivalent CPU table\n"); - return 0; + return -ENOMEM; } - buf += UCODE_CONTAINER_HEADER_SIZE; - if (get_ucode_data(equiv_cpu_table, buf, size)) { - vfree(equiv_cpu_table); - return 0; - } + get_ucode_data(equiv_cpu_table, buf + UCODE_CONTAINER_HEADER_SIZE, size); return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */ } @@ -237,16 +244,16 @@ static enum ucode_state generic_load_microcode(int cpu, const u8 *data, size_t size) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + struct microcode_header_amd *mc_hdr = NULL; + unsigned int mc_size, leftover; + int offset; const u8 *ucode_ptr = data; void *new_mc = NULL; - void *mc; - int new_rev = uci->cpu_sig.rev; - unsigned int leftover; - unsigned long offset; + unsigned int new_rev = uci->cpu_sig.rev; enum ucode_state state = UCODE_OK; offset = install_equiv_cpu_table(ucode_ptr); - if (!offset) { + if (offset < 0) { pr_err("failed to create equivalent cpu table\n"); return UCODE_ERROR; } @@ -255,64 +262,65 @@ generic_load_microcode(int cpu, const u8 *data, size_t size) leftover = size - offset; while (leftover) { - unsigned int uninitialized_var(mc_size); - struct microcode_header_amd *mc_header; - - mc = get_next_ucode(ucode_ptr, leftover, &mc_size); - if (!mc) + mc_hdr = get_next_ucode(cpu, ucode_ptr, leftover, &mc_size); + if (!mc_hdr) break; - mc_header = (struct microcode_header_amd *)mc; - if (get_matching_microcode(cpu, mc, new_rev)) { + if (get_matching_microcode(cpu, mc_hdr, new_rev)) { vfree(new_mc); - new_rev = mc_header->patch_id; - new_mc = mc; + new_rev = mc_hdr->patch_id; + new_mc = mc_hdr; } else - vfree(mc); + vfree(mc_hdr); ucode_ptr += mc_size; leftover -= mc_size; } - if (new_mc) { - if (!leftover) { - vfree(uci->mc); - uci->mc = new_mc; - pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n", - cpu, new_rev, uci->cpu_sig.rev); - } else { - vfree(new_mc); - state = UCODE_ERROR; - } - } else + if (!new_mc) { state = UCODE_NFOUND; + goto free_table; + } + if (!leftover) { + vfree(uci->mc); + uci->mc = new_mc; + pr_debug("CPU%d update ucode (0x%08x -> 0x%08x)\n", + cpu, uci->cpu_sig.rev, new_rev); + } else { + vfree(new_mc); + state = UCODE_ERROR; + } + +free_table: free_equiv_cpu_table(); return state; } -static enum ucode_state request_microcode_fw(int cpu, struct device *device) +static enum ucode_state request_microcode_amd(int cpu, struct device *device) { const char *fw_name = "amd-ucode/microcode_amd.bin"; - const struct firmware *firmware; - enum ucode_state ret; + const struct firmware *fw; + enum ucode_state ret = UCODE_NFOUND; - if (request_firmware(&firmware, fw_name, device)) { - printk(KERN_ERR "microcode: failed to load file %s\n", fw_name); - return UCODE_NFOUND; + if (request_firmware(&fw, fw_name, device)) { + pr_err("failed to load file %s\n", fw_name); + goto out; } - if (*(u32 *)firmware->data != UCODE_MAGIC) { - pr_err("invalid UCODE_MAGIC (0x%08x)\n", - *(u32 *)firmware->data); - return UCODE_ERROR; + ret = UCODE_ERROR; + if (*(u32 *)fw->data != UCODE_MAGIC) { + pr_err("invalid magic value (0x%08x)\n", *(u32 *)fw->data); + goto fw_release; } - ret = generic_load_microcode(cpu, firmware->data, firmware->size); + ret = generic_load_microcode(cpu, fw->data, fw->size); - release_firmware(firmware); +fw_release: + release_firmware(fw); +out: return ret; } @@ -333,7 +341,7 @@ static void microcode_fini_cpu_amd(int cpu) static struct microcode_ops microcode_amd_ops = { .request_microcode_user = request_microcode_user, - .request_microcode_fw = request_microcode_fw, + .request_microcode_fw = request_microcode_amd, .collect_cpu_info = collect_cpu_info_amd, .apply_microcode = apply_microcode_amd, .microcode_fini_cpu = microcode_fini_cpu_amd, diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 1cca374a2ba..f9242800bc8 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -82,6 +82,7 @@ #include <linux/cpu.h> #include <linux/fs.h> #include <linux/mm.h> +#include <linux/syscore_ops.h> #include <asm/microcode.h> #include <asm/processor.h> @@ -417,8 +418,10 @@ static int mc_sysdev_add(struct sys_device *sys_dev) if (err) return err; - if (microcode_init_cpu(cpu) == UCODE_ERROR) - err = -EINVAL; + if (microcode_init_cpu(cpu) == UCODE_ERROR) { + sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); + return -EINVAL; + } return err; } @@ -436,33 +439,25 @@ static int mc_sysdev_remove(struct sys_device *sys_dev) return 0; } -static int mc_sysdev_resume(struct sys_device *dev) +static struct sysdev_driver mc_sysdev_driver = { + .add = mc_sysdev_add, + .remove = mc_sysdev_remove, +}; + +/** + * mc_bp_resume - Update boot CPU microcode during resume. + */ +static void mc_bp_resume(void) { - int cpu = dev->id; + int cpu = smp_processor_id(); struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - if (!cpu_online(cpu)) - return 0; - - /* - * All non-bootup cpus are still disabled, - * so only CPU 0 will apply ucode here. - * - * Moreover, there can be no concurrent - * updates from any other places at this point. - */ - WARN_ON(cpu != 0); - if (uci->valid && uci->mc) microcode_ops->apply_microcode(cpu); - - return 0; } -static struct sysdev_driver mc_sysdev_driver = { - .add = mc_sysdev_add, - .remove = mc_sysdev_remove, - .resume = mc_sysdev_resume, +static struct syscore_ops mc_syscore_ops = { + .resume = mc_bp_resume, }; static __cpuinit int @@ -540,6 +535,7 @@ static int __init microcode_init(void) if (error) return error; + register_syscore_ops(&mc_syscore_ops); register_hotcpu_notifier(&mc_cpu_notifier); pr_info("Microcode Update Driver: v" MICROCODE_VERSION @@ -554,6 +550,7 @@ static void __exit microcode_exit(void) microcode_dev_exit(); unregister_hotcpu_notifier(&mc_cpu_notifier); + unregister_syscore_ops(&mc_syscore_ops); get_online_cpus(); mutex_lock(µcode_mutex); diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index dcb65cc0a05..1a1b606d3e9 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c @@ -364,8 +364,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, /* For performance reasons, reuse mc area when possible */ if (!mc || mc_size > curr_mc_size) { - if (mc) - vfree(mc); + vfree(mc); mc = vmalloc(mc_size); if (!mc) break; @@ -374,13 +373,11 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, if (get_ucode_data(mc, ucode_ptr, mc_size) || microcode_sanity_check(mc) < 0) { - vfree(mc); break; } if (get_matching_microcode(&uci->cpu_sig, mc, new_rev)) { - if (new_mc) - vfree(new_mc); + vfree(new_mc); new_rev = mc_header.rev; new_mc = mc; mc = NULL; /* trigger new vmalloc */ @@ -390,12 +387,10 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, leftover -= mc_size; } - if (mc) - vfree(mc); + vfree(mc); if (leftover) { - if (new_mc) - vfree(new_mc); + vfree(new_mc); state = UCODE_ERROR; goto out; } @@ -405,8 +400,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, goto out; } - if (uci->mc) - vfree(uci->mc); + vfree(uci->mc); uci->mc = (struct microcode_intel *)new_mc; pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n", diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c index 71825806cd4..ac861b8348e 100644 --- a/arch/x86/kernel/mmconf-fam10h_64.c +++ b/arch/x86/kernel/mmconf-fam10h_64.c @@ -25,7 +25,6 @@ struct pci_hostbridge_probe { }; static u64 __cpuinitdata fam10h_pci_mmconf_base; -static int __cpuinitdata fam10h_pci_mmconf_base_status; static struct pci_hostbridge_probe pci_probes[] __cpuinitdata = { { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1200 }, @@ -44,10 +43,12 @@ static int __cpuinit cmp_range(const void *x1, const void *x2) return start1 - start2; } -/*[47:0] */ -/* need to avoid (0xfd<<32) and (0xfe<<32), ht used space */ +#define MMCONF_UNIT (1ULL << FAM10H_MMIO_CONF_BASE_SHIFT) +#define MMCONF_MASK (~(MMCONF_UNIT - 1)) +#define MMCONF_SIZE (MMCONF_UNIT << 8) +/* need to avoid (0xfd<<32), (0xfe<<32), and (0xff<<32), ht used space */ #define FAM10H_PCI_MMCONF_BASE (0xfcULL<<32) -#define BASE_VALID(b) ((b != (0xfdULL << 32)) && (b != (0xfeULL << 32))) +#define BASE_VALID(b) ((b) + MMCONF_SIZE <= (0xfdULL<<32) || (b) >= (1ULL<<40)) static void __cpuinit get_fam10h_pci_mmconf_base(void) { int i; @@ -64,12 +65,11 @@ static void __cpuinit get_fam10h_pci_mmconf_base(void) struct range range[8]; /* only try to get setting from BSP */ - /* -1 or 1 */ - if (fam10h_pci_mmconf_base_status) + if (fam10h_pci_mmconf_base) return; if (!early_pci_allowed()) - goto fail; + return; found = 0; for (i = 0; i < ARRAY_SIZE(pci_probes); i++) { @@ -91,7 +91,7 @@ static void __cpuinit get_fam10h_pci_mmconf_base(void) } if (!found) - goto fail; + return; /* SYS_CFG */ address = MSR_K8_SYSCFG; @@ -99,16 +99,16 @@ static void __cpuinit get_fam10h_pci_mmconf_base(void) /* TOP_MEM2 is not enabled? */ if (!(val & (1<<21))) { - tom2 = 0; + tom2 = 1ULL << 32; } else { /* TOP_MEM2 */ address = MSR_K8_TOP_MEM2; rdmsrl(address, val); - tom2 = val & (0xffffULL<<32); + tom2 = max(val & 0xffffff800000ULL, 1ULL << 32); } if (base <= tom2) - base = tom2 + (1ULL<<32); + base = (tom2 + 2 * MMCONF_UNIT - 1) & MMCONF_MASK; /* * need to check if the range is in the high mmio range that is @@ -123,11 +123,11 @@ static void __cpuinit get_fam10h_pci_mmconf_base(void) if (!(reg & 3)) continue; - start = (((u64)reg) << 8) & (0xffULL << 32); /* 39:16 on 31:8*/ + start = (u64)(reg & 0xffffff00) << 8; /* 39:16 on 31:8*/ reg = read_pci_config(bus, slot, 1, 0x84 + (i << 3)); - end = (((u64)reg) << 8) & (0xffULL << 32); /* 39:16 on 31:8*/ + end = ((u64)(reg & 0xffffff00) << 8) | 0xffff; /* 39:16 on 31:8*/ - if (!end) + if (end < tom2) continue; range[hi_mmio_num].start = start; @@ -143,32 +143,27 @@ static void __cpuinit get_fam10h_pci_mmconf_base(void) if (range[hi_mmio_num - 1].end < base) goto out; - if (range[0].start > base) + if (range[0].start > base + MMCONF_SIZE) goto out; /* need to find one window */ - base = range[0].start - (1ULL << 32); + base = (range[0].start & MMCONF_MASK) - MMCONF_UNIT; if ((base > tom2) && BASE_VALID(base)) goto out; - base = range[hi_mmio_num - 1].end + (1ULL << 32); - if ((base > tom2) && BASE_VALID(base)) + base = (range[hi_mmio_num - 1].end + MMCONF_UNIT) & MMCONF_MASK; + if (BASE_VALID(base)) goto out; /* need to find window between ranges */ - if (hi_mmio_num > 1) - for (i = 0; i < hi_mmio_num - 1; i++) { - if (range[i + 1].start > (range[i].end + (1ULL << 32))) { - base = range[i].end + (1ULL << 32); - if ((base > tom2) && BASE_VALID(base)) - goto out; - } + for (i = 1; i < hi_mmio_num; i++) { + base = (range[i - 1].end + MMCONF_UNIT) & MMCONF_MASK; + val = range[i].start & MMCONF_MASK; + if (val >= base + MMCONF_SIZE && BASE_VALID(base)) + goto out; } - -fail: - fam10h_pci_mmconf_base_status = -1; return; + out: fam10h_pci_mmconf_base = base; - fam10h_pci_mmconf_base_status = 1; } void __cpuinit fam10h_check_enable_mmcfg(void) @@ -190,11 +185,10 @@ void __cpuinit fam10h_check_enable_mmcfg(void) /* only trust the one handle 256 buses, if acpi=off */ if (!acpi_pci_disabled || busnbits >= 8) { - u64 base; - base = val & (0xffffULL << 32); - if (fam10h_pci_mmconf_base_status <= 0) { + u64 base = val & MMCONF_MASK; + + if (!fam10h_pci_mmconf_base) { fam10h_pci_mmconf_base = base; - fam10h_pci_mmconf_base_status = 1; return; } else if (fam10h_pci_mmconf_base == base) return; @@ -206,8 +200,10 @@ void __cpuinit fam10h_check_enable_mmcfg(void) * with 256 buses */ get_fam10h_pci_mmconf_base(); - if (fam10h_pci_mmconf_base_status <= 0) + if (!fam10h_pci_mmconf_base) { + pci_probe &= ~PCI_CHECK_ENABLE_AMD_MMCONF; return; + } printk(KERN_INFO "Enable MMCONFIG on AMD Family 10h\n"); val &= ~((FAM10H_MMIO_CONF_BASE_MASK<<FAM10H_MMIO_CONF_BASE_SHIFT) | @@ -217,13 +213,13 @@ void __cpuinit fam10h_check_enable_mmcfg(void) wrmsrl(address, val); } -static int __devinit set_check_enable_amd_mmconf(const struct dmi_system_id *d) +static int __init set_check_enable_amd_mmconf(const struct dmi_system_id *d) { pci_probe |= PCI_CHECK_ENABLE_AMD_MMCONF; return 0; } -static const struct dmi_system_id __cpuinitconst mmconf_dmi_table[] = { +static const struct dmi_system_id __initconst mmconf_dmi_table[] = { { .callback = set_check_enable_amd_mmconf, .ident = "Sun Microsystems Machine", @@ -234,7 +230,8 @@ static const struct dmi_system_id __cpuinitconst mmconf_dmi_table[] = { {} }; -void __cpuinit check_enable_amd_mmconf_dmi(void) +/* Called from a __cpuinit function, but only on the BSP. */ +void __ref check_enable_amd_mmconf_dmi(void) { dmi_check_system(mmconf_dmi_table); } diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 8f295609173..52f256f2cc8 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -24,6 +24,7 @@ #include <linux/bug.h> #include <linux/mm.h> #include <linux/gfp.h> +#include <linux/jump_label.h> #include <asm/system.h> #include <asm/page.h> @@ -37,20 +38,11 @@ void *module_alloc(unsigned long size) { - struct vm_struct *area; - - if (!size) - return NULL; - size = PAGE_ALIGN(size); - if (size > MODULES_LEN) + if (PAGE_ALIGN(size) > MODULES_LEN) return NULL; - - area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END); - if (!area) - return NULL; - - return __vmalloc_area(area, GFP_KERNEL | __GFP_HIGHMEM, - PAGE_KERNEL_EXEC); + return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, + GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC, + -1, __builtin_return_address(0)); } /* Free memory returned from module_alloc */ diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 9af64d9c4b6..6f9bfffb272 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -118,21 +118,8 @@ static void __init MP_bus_info(struct mpc_bus *m) static void __init MP_ioapic_info(struct mpc_ioapic *m) { - if (!(m->flags & MPC_APIC_USABLE)) - return; - - printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n", - m->apicid, m->apicver, m->apicaddr); - - mp_register_ioapic(m->apicid, m->apicaddr, gsi_top); -} - -static void print_MP_intsrc_info(struct mpc_intsrc *m) -{ - apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x," - " IRQ %02x, APIC ID %x, APIC INT %02x\n", - m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbus, - m->srcbusirq, m->dstapic, m->dstirq); + if (m->flags & MPC_APIC_USABLE) + mp_register_ioapic(m->apicid, m->apicaddr, gsi_top); } static void __init print_mp_irq_info(struct mpc_intsrc *mp_irq) @@ -144,73 +131,11 @@ static void __init print_mp_irq_info(struct mpc_intsrc *mp_irq) mp_irq->srcbusirq, mp_irq->dstapic, mp_irq->dstirq); } -static void __init assign_to_mp_irq(struct mpc_intsrc *m, - struct mpc_intsrc *mp_irq) -{ - mp_irq->dstapic = m->dstapic; - mp_irq->type = m->type; - mp_irq->irqtype = m->irqtype; - mp_irq->irqflag = m->irqflag; - mp_irq->srcbus = m->srcbus; - mp_irq->srcbusirq = m->srcbusirq; - mp_irq->dstirq = m->dstirq; -} - -static void __init assign_to_mpc_intsrc(struct mpc_intsrc *mp_irq, - struct mpc_intsrc *m) -{ - m->dstapic = mp_irq->dstapic; - m->type = mp_irq->type; - m->irqtype = mp_irq->irqtype; - m->irqflag = mp_irq->irqflag; - m->srcbus = mp_irq->srcbus; - m->srcbusirq = mp_irq->srcbusirq; - m->dstirq = mp_irq->dstirq; -} - -static int __init mp_irq_mpc_intsrc_cmp(struct mpc_intsrc *mp_irq, - struct mpc_intsrc *m) -{ - if (mp_irq->dstapic != m->dstapic) - return 1; - if (mp_irq->type != m->type) - return 2; - if (mp_irq->irqtype != m->irqtype) - return 3; - if (mp_irq->irqflag != m->irqflag) - return 4; - if (mp_irq->srcbus != m->srcbus) - return 5; - if (mp_irq->srcbusirq != m->srcbusirq) - return 6; - if (mp_irq->dstirq != m->dstirq) - return 7; - - return 0; -} - -static void __init MP_intsrc_info(struct mpc_intsrc *m) -{ - int i; - - print_MP_intsrc_info(m); - - for (i = 0; i < mp_irq_entries; i++) { - if (!mp_irq_mpc_intsrc_cmp(&mp_irqs[i], m)) - return; - } - - assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]); - if (++mp_irq_entries == MAX_IRQ_SOURCES) - panic("Max # of irq sources exceeded!!\n"); -} #else /* CONFIG_X86_IO_APIC */ static inline void __init MP_bus_info(struct mpc_bus *m) {} static inline void __init MP_ioapic_info(struct mpc_ioapic *m) {} -static inline void __init MP_intsrc_info(struct mpc_intsrc *m) {} #endif /* CONFIG_X86_IO_APIC */ - static void __init MP_lintsrc_info(struct mpc_lintsrc *m) { apic_printk(APIC_VERBOSE, "Lint: type %d, pol %d, trig %d, bus %02x," @@ -222,7 +147,6 @@ static void __init MP_lintsrc_info(struct mpc_lintsrc *m) /* * Read/parse the MPC */ - static int __init smp_check_mpc(struct mpc_table *mpc, char *oem, char *str) { @@ -275,18 +199,6 @@ static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt) void __init default_smp_read_mpc_oem(struct mpc_table *mpc) { } -static void __init smp_register_lapic_address(unsigned long address) -{ - mp_lapic_addr = address; - - set_fixmap_nocache(FIX_APIC_BASE, address); - if (boot_cpu_physical_apicid == -1U) { - boot_cpu_physical_apicid = read_apic_id(); - apic_version[boot_cpu_physical_apicid] = - GET_APIC_VERSION(apic_read(APIC_LVR)); - } -} - static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) { char str[16]; @@ -301,17 +213,13 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) #ifdef CONFIG_X86_32 generic_mps_oem_check(mpc, oem, str); #endif - /* save the local APIC address, it might be non-default */ + /* Initialize the lapic mapping */ if (!acpi_lapic) - mp_lapic_addr = mpc->lapic; + register_lapic_address(mpc->lapic); if (early) return 1; - /* Initialize the lapic mapping */ - if (!acpi_lapic) - smp_register_lapic_address(mpc->lapic); - if (mpc->oemptr) x86_init.mpparse.smp_read_mpc_oem(mpc); @@ -337,7 +245,7 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) skip_entry(&mpt, &count, sizeof(struct mpc_ioapic)); break; case MP_INTSRC: - MP_intsrc_info((struct mpc_intsrc *)mpt); + mp_save_irq((struct mpc_intsrc *)mpt); skip_entry(&mpt, &count, sizeof(struct mpc_intsrc)); break; case MP_LINTSRC: @@ -429,13 +337,13 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type) intsrc.srcbusirq = i; intsrc.dstirq = i ? i : 2; /* IRQ0 to INTIN2 */ - MP_intsrc_info(&intsrc); + mp_save_irq(&intsrc); } intsrc.irqtype = mp_ExtINT; intsrc.srcbusirq = 0; intsrc.dstirq = 0; /* 8259A to INTIN0 */ - MP_intsrc_info(&intsrc); + mp_save_irq(&intsrc); } @@ -784,11 +692,11 @@ static void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) int i; apic_printk(APIC_VERBOSE, "OLD "); - print_MP_intsrc_info(m); + print_mp_irq_info(m); i = get_MP_intsrc_index(m); if (i > 0) { - assign_to_mpc_intsrc(&mp_irqs[i], m); + memcpy(m, &mp_irqs[i], sizeof(*m)); apic_printk(APIC_VERBOSE, "NEW "); print_mp_irq_info(&mp_irqs[i]); return; @@ -806,23 +714,21 @@ static void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) *nr_m_spare += 1; } } -#else /* CONFIG_X86_IO_APIC */ -static -inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {} -#endif /* CONFIG_X86_IO_APIC */ -static int +static int __init check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count) { - int ret = 0; - if (!mpc_new_phys || count <= mpc_new_length) { WARN(1, "update_mptable: No spare slots (length: %x)\n", count); return -1; } - return ret; + return 0; } +#else /* CONFIG_X86_IO_APIC */ +static +inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {} +#endif /* CONFIG_X86_IO_APIC */ static int __init replace_intsrc_all(struct mpc_table *mpc, unsigned long mpc_new_phys, @@ -875,14 +781,14 @@ static int __init replace_intsrc_all(struct mpc_table *mpc, if (nr_m_spare > 0) { apic_printk(APIC_VERBOSE, "*NEW* found\n"); nr_m_spare--; - assign_to_mpc_intsrc(&mp_irqs[i], m_spare[nr_m_spare]); + memcpy(m_spare[nr_m_spare], &mp_irqs[i], sizeof(mp_irqs[i])); m_spare[nr_m_spare] = NULL; } else { struct mpc_intsrc *m = (struct mpc_intsrc *)mpt; count += sizeof(struct mpc_intsrc); if (check_slot(mpc_new_phys, mpc_new_length, count) < 0) goto out; - assign_to_mpc_intsrc(&mp_irqs[i], m); + memcpy(m, &mp_irqs[i], sizeof(*m)); mpc->length = count; mpt += sizeof(struct mpc_intsrc); } @@ -975,7 +881,7 @@ static int __init update_mp_table(void) if (!mpc_new_phys) { unsigned char old, new; - /* check if we can change the postion */ + /* check if we can change the position */ mpc->checksum = 0; old = mpf_checksum((unsigned char *)mpc, mpc->length); mpc->checksum = 0xff; @@ -984,7 +890,7 @@ static int __init update_mp_table(void) printk(KERN_INFO "mpc is readonly, please try alloc_mptable instead\n"); return 0; } - printk(KERN_INFO "use in-positon replacing\n"); + printk(KERN_INFO "use in-position replacing\n"); } else { mpf->physptr = mpc_new_phys; mpc_new = phys_to_virt(mpc_new_phys); diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c deleted file mode 100644 index 79ae68154e8..00000000000 --- a/arch/x86/kernel/mrst.c +++ /dev/null @@ -1,311 +0,0 @@ -/* - * mrst.c: Intel Moorestown platform specific setup code - * - * (C) Copyright 2008 Intel Corporation - * Author: Jacob Pan (jacob.jun.pan@intel.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. - */ -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/sfi.h> -#include <linux/irq.h> -#include <linux/module.h> - -#include <asm/setup.h> -#include <asm/mpspec_def.h> -#include <asm/hw_irq.h> -#include <asm/apic.h> -#include <asm/io_apic.h> -#include <asm/mrst.h> -#include <asm/io.h> -#include <asm/i8259.h> -#include <asm/apb_timer.h> - -/* - * the clockevent devices on Moorestown/Medfield can be APBT or LAPIC clock, - * cmdline option x86_mrst_timer can be used to override the configuration - * to prefer one or the other. - * at runtime, there are basically three timer configurations: - * 1. per cpu apbt clock only - * 2. per cpu always-on lapic clocks only, this is Penwell/Medfield only - * 3. per cpu lapic clock (C3STOP) and one apbt clock, with broadcast. - * - * by default (without cmdline option), platform code first detects cpu type - * to see if we are on lincroft or penwell, then set up both lapic or apbt - * clocks accordingly. - * i.e. by default, medfield uses configuration #2, moorestown uses #1. - * config #3 is supported but not recommended on medfield. - * - * rating and feature summary: - * lapic (with C3STOP) --------- 100 - * apbt (always-on) ------------ 110 - * lapic (always-on,ARAT) ------ 150 - */ - -__cpuinitdata enum mrst_timer_options mrst_timer_options; - -static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM]; -static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM]; -enum mrst_cpu_type __mrst_cpu_chip; -EXPORT_SYMBOL_GPL(__mrst_cpu_chip); - -int sfi_mtimer_num; - -struct sfi_rtc_table_entry sfi_mrtc_array[SFI_MRTC_MAX]; -EXPORT_SYMBOL_GPL(sfi_mrtc_array); -int sfi_mrtc_num; - -static inline void assign_to_mp_irq(struct mpc_intsrc *m, - struct mpc_intsrc *mp_irq) -{ - memcpy(mp_irq, m, sizeof(struct mpc_intsrc)); -} - -static inline int mp_irq_cmp(struct mpc_intsrc *mp_irq, - struct mpc_intsrc *m) -{ - return memcmp(mp_irq, m, sizeof(struct mpc_intsrc)); -} - -static void save_mp_irq(struct mpc_intsrc *m) -{ - int i; - - for (i = 0; i < mp_irq_entries; i++) { - if (!mp_irq_cmp(&mp_irqs[i], m)) - return; - } - - assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]); - if (++mp_irq_entries == MAX_IRQ_SOURCES) - panic("Max # of irq sources exceeded!!\n"); -} - -/* parse all the mtimer info to a static mtimer array */ -static int __init sfi_parse_mtmr(struct sfi_table_header *table) -{ - struct sfi_table_simple *sb; - struct sfi_timer_table_entry *pentry; - struct mpc_intsrc mp_irq; - int totallen; - - sb = (struct sfi_table_simple *)table; - if (!sfi_mtimer_num) { - sfi_mtimer_num = SFI_GET_NUM_ENTRIES(sb, - struct sfi_timer_table_entry); - pentry = (struct sfi_timer_table_entry *) sb->pentry; - totallen = sfi_mtimer_num * sizeof(*pentry); - memcpy(sfi_mtimer_array, pentry, totallen); - } - - printk(KERN_INFO "SFI: MTIMER info (num = %d):\n", sfi_mtimer_num); - pentry = sfi_mtimer_array; - for (totallen = 0; totallen < sfi_mtimer_num; totallen++, pentry++) { - printk(KERN_INFO "timer[%d]: paddr = 0x%08x, freq = %dHz," - " irq = %d\n", totallen, (u32)pentry->phys_addr, - pentry->freq_hz, pentry->irq); - if (!pentry->irq) - continue; - mp_irq.type = MP_IOAPIC; - mp_irq.irqtype = mp_INT; -/* triggering mode edge bit 2-3, active high polarity bit 0-1 */ - mp_irq.irqflag = 5; - mp_irq.srcbus = 0; - mp_irq.srcbusirq = pentry->irq; /* IRQ */ - mp_irq.dstapic = MP_APIC_ALL; - mp_irq.dstirq = pentry->irq; - save_mp_irq(&mp_irq); - } - - return 0; -} - -struct sfi_timer_table_entry *sfi_get_mtmr(int hint) -{ - int i; - if (hint < sfi_mtimer_num) { - if (!sfi_mtimer_usage[hint]) { - pr_debug("hint taken for timer %d irq %d\n",\ - hint, sfi_mtimer_array[hint].irq); - sfi_mtimer_usage[hint] = 1; - return &sfi_mtimer_array[hint]; - } - } - /* take the first timer available */ - for (i = 0; i < sfi_mtimer_num;) { - if (!sfi_mtimer_usage[i]) { - sfi_mtimer_usage[i] = 1; - return &sfi_mtimer_array[i]; - } - i++; - } - return NULL; -} - -void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr) -{ - int i; - for (i = 0; i < sfi_mtimer_num;) { - if (mtmr->irq == sfi_mtimer_array[i].irq) { - sfi_mtimer_usage[i] = 0; - return; - } - i++; - } -} - -/* parse all the mrtc info to a global mrtc array */ -int __init sfi_parse_mrtc(struct sfi_table_header *table) -{ - struct sfi_table_simple *sb; - struct sfi_rtc_table_entry *pentry; - struct mpc_intsrc mp_irq; - - int totallen; - - sb = (struct sfi_table_simple *)table; - if (!sfi_mrtc_num) { - sfi_mrtc_num = SFI_GET_NUM_ENTRIES(sb, - struct sfi_rtc_table_entry); - pentry = (struct sfi_rtc_table_entry *)sb->pentry; - totallen = sfi_mrtc_num * sizeof(*pentry); - memcpy(sfi_mrtc_array, pentry, totallen); - } - - printk(KERN_INFO "SFI: RTC info (num = %d):\n", sfi_mrtc_num); - pentry = sfi_mrtc_array; - for (totallen = 0; totallen < sfi_mrtc_num; totallen++, pentry++) { - printk(KERN_INFO "RTC[%d]: paddr = 0x%08x, irq = %d\n", - totallen, (u32)pentry->phys_addr, pentry->irq); - mp_irq.type = MP_IOAPIC; - mp_irq.irqtype = mp_INT; - mp_irq.irqflag = 0; - mp_irq.srcbus = 0; - mp_irq.srcbusirq = pentry->irq; /* IRQ */ - mp_irq.dstapic = MP_APIC_ALL; - mp_irq.dstirq = pentry->irq; - save_mp_irq(&mp_irq); - } - return 0; -} - -static unsigned long __init mrst_calibrate_tsc(void) -{ - unsigned long flags, fast_calibrate; - - local_irq_save(flags); - fast_calibrate = apbt_quick_calibrate(); - local_irq_restore(flags); - - if (fast_calibrate) - return fast_calibrate; - - return 0; -} - -void __init mrst_time_init(void) -{ - switch (mrst_timer_options) { - case MRST_TIMER_APBT_ONLY: - break; - case MRST_TIMER_LAPIC_APBT: - x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock; - x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock; - break; - default: - if (!boot_cpu_has(X86_FEATURE_ARAT)) - break; - x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock; - x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock; - return; - } - /* we need at least one APB timer */ - sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr); - pre_init_apic_IRQ0(); - apbt_time_init(); -} - -void __init mrst_rtc_init(void) -{ - sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc); -} - -void __cpuinit mrst_arch_setup(void) -{ - if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27) - __mrst_cpu_chip = MRST_CPU_CHIP_PENWELL; - else if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x26) - __mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT; - else { - pr_err("Unknown Moorestown CPU (%d:%d), default to Lincroft\n", - boot_cpu_data.x86, boot_cpu_data.x86_model); - __mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT; - } - pr_debug("Moorestown CPU %s identified\n", - (__mrst_cpu_chip == MRST_CPU_CHIP_LINCROFT) ? - "Lincroft" : "Penwell"); -} - -/* MID systems don't have i8042 controller */ -static int mrst_i8042_detect(void) -{ - return 0; -} - -/* - * Moorestown specific x86_init function overrides and early setup - * calls. - */ -void __init x86_mrst_early_setup(void) -{ - x86_init.resources.probe_roms = x86_init_noop; - x86_init.resources.reserve_resources = x86_init_noop; - - x86_init.timers.timer_init = mrst_time_init; - x86_init.timers.setup_percpu_clockev = x86_init_noop; - - x86_init.irqs.pre_vector_init = x86_init_noop; - - x86_init.oem.arch_setup = mrst_arch_setup; - - x86_cpuinit.setup_percpu_clockev = apbt_setup_secondary_clock; - - x86_platform.calibrate_tsc = mrst_calibrate_tsc; - x86_platform.i8042_detect = mrst_i8042_detect; - x86_init.pci.init = pci_mrst_init; - x86_init.pci.fixup_irqs = x86_init_noop; - - legacy_pic = &null_legacy_pic; - - /* Avoid searching for BIOS MP tables */ - x86_init.mpparse.find_smp_config = x86_init_noop; - x86_init.mpparse.get_smp_config = x86_init_uint_noop; - -} - -/* - * if user does not want to use per CPU apb timer, just give it a lower rating - * than local apic timer and skip the late per cpu timer init. - */ -static inline int __init setup_x86_mrst_timer(char *arg) -{ - if (!arg) - return -EINVAL; - - if (strcmp("apbt_only", arg) == 0) - mrst_timer_options = MRST_TIMER_APBT_ONLY; - else if (strcmp("lapic_and_apbt", arg) == 0) - mrst_timer_options = MRST_TIMER_LAPIC_APBT; - else { - pr_warning("X86 MRST timer option %s not recognised" - " use x86_mrst_timer=apbt_only or lapic_and_apbt\n", - arg); - return -EINVAL; - } - return 0; -} -__setup("x86_mrst_timer=", setup_x86_mrst_timer); diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 7bf2dc4c8f7..12fcbe2c143 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -30,7 +30,6 @@ #include <linux/init.h> #include <linux/poll.h> #include <linux/smp.h> -#include <linux/smp_lock.h> #include <linux/major.h> #include <linux/fs.h> #include <linux/device.h> diff --git a/arch/x86/kernel/olpc-xo1.c b/arch/x86/kernel/olpc-xo1.c deleted file mode 100644 index f5442c03abc..00000000000 --- a/arch/x86/kernel/olpc-xo1.c +++ /dev/null @@ -1,140 +0,0 @@ -/* - * Support for features of the OLPC XO-1 laptop - * - * Copyright (C) 2010 One Laptop per Child - * Copyright (C) 2006 Red Hat, Inc. - * Copyright (C) 2006 Advanced Micro Devices, Inc. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ - -#include <linux/module.h> -#include <linux/pci.h> -#include <linux/pci_ids.h> -#include <linux/platform_device.h> -#include <linux/pm.h> - -#include <asm/io.h> -#include <asm/olpc.h> - -#define DRV_NAME "olpc-xo1" - -#define PMS_BAR 4 -#define ACPI_BAR 5 - -/* PMC registers (PMS block) */ -#define PM_SCLK 0x10 -#define PM_IN_SLPCTL 0x20 -#define PM_WKXD 0x34 -#define PM_WKD 0x30 -#define PM_SSC 0x54 - -/* PM registers (ACPI block) */ -#define PM1_CNT 0x08 -#define PM_GPE0_STS 0x18 - -static unsigned long acpi_base; -static unsigned long pms_base; - -static void xo1_power_off(void) -{ - printk(KERN_INFO "OLPC XO-1 power off sequence...\n"); - - /* Enable all of these controls with 0 delay */ - outl(0x40000000, pms_base + PM_SCLK); - outl(0x40000000, pms_base + PM_IN_SLPCTL); - outl(0x40000000, pms_base + PM_WKXD); - outl(0x40000000, pms_base + PM_WKD); - - /* Clear status bits (possibly unnecessary) */ - outl(0x0002ffff, pms_base + PM_SSC); - outl(0xffffffff, acpi_base + PM_GPE0_STS); - - /* Write SLP_EN bit to start the machinery */ - outl(0x00002000, acpi_base + PM1_CNT); -} - -/* Read the base addresses from the PCI BAR info */ -static int __devinit setup_bases(struct pci_dev *pdev) -{ - int r; - - r = pci_enable_device_io(pdev); - if (r) { - dev_err(&pdev->dev, "can't enable device IO\n"); - return r; - } - - r = pci_request_region(pdev, ACPI_BAR, DRV_NAME); - if (r) { - dev_err(&pdev->dev, "can't alloc PCI BAR #%d\n", ACPI_BAR); - return r; - } - - r = pci_request_region(pdev, PMS_BAR, DRV_NAME); - if (r) { - dev_err(&pdev->dev, "can't alloc PCI BAR #%d\n", PMS_BAR); - pci_release_region(pdev, ACPI_BAR); - return r; - } - - acpi_base = pci_resource_start(pdev, ACPI_BAR); - pms_base = pci_resource_start(pdev, PMS_BAR); - - return 0; -} - -static int __devinit olpc_xo1_probe(struct platform_device *pdev) -{ - struct pci_dev *pcidev; - int r; - - pcidev = pci_get_device(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, - NULL); - if (!pdev) - return -ENODEV; - - r = setup_bases(pcidev); - if (r) - return r; - - pm_power_off = xo1_power_off; - - printk(KERN_INFO "OLPC XO-1 support registered\n"); - return 0; -} - -static int __devexit olpc_xo1_remove(struct platform_device *pdev) -{ - pm_power_off = NULL; - return 0; -} - -static struct platform_driver olpc_xo1_driver = { - .driver = { - .name = DRV_NAME, - .owner = THIS_MODULE, - }, - .probe = olpc_xo1_probe, - .remove = __devexit_p(olpc_xo1_remove), -}; - -static int __init olpc_xo1_init(void) -{ - return platform_driver_register(&olpc_xo1_driver); -} - -static void __exit olpc_xo1_exit(void) -{ - platform_driver_unregister(&olpc_xo1_driver); -} - -MODULE_AUTHOR("Daniel Drake <dsd@laptop.org>"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS("platform:olpc-xo1"); - -module_init(olpc_xo1_init); -module_exit(olpc_xo1_exit); diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c deleted file mode 100644 index edaf3fe8dc5..00000000000 --- a/arch/x86/kernel/olpc.c +++ /dev/null @@ -1,281 +0,0 @@ -/* - * Support for the OLPC DCON and OLPC EC access - * - * Copyright © 2006 Advanced Micro Devices, Inc. - * Copyright © 2007-2008 Andres Salomon <dilinger@debian.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ - -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/module.h> -#include <linux/delay.h> -#include <linux/spinlock.h> -#include <linux/io.h> -#include <linux/string.h> -#include <linux/platform_device.h> - -#include <asm/geode.h> -#include <asm/setup.h> -#include <asm/olpc.h> -#include <asm/olpc_ofw.h> - -struct olpc_platform_t olpc_platform_info; -EXPORT_SYMBOL_GPL(olpc_platform_info); - -static DEFINE_SPINLOCK(ec_lock); - -/* what the timeout *should* be (in ms) */ -#define EC_BASE_TIMEOUT 20 - -/* the timeout that bugs in the EC might force us to actually use */ -static int ec_timeout = EC_BASE_TIMEOUT; - -static int __init olpc_ec_timeout_set(char *str) -{ - if (get_option(&str, &ec_timeout) != 1) { - ec_timeout = EC_BASE_TIMEOUT; - printk(KERN_ERR "olpc-ec: invalid argument to " - "'olpc_ec_timeout=', ignoring!\n"); - } - printk(KERN_DEBUG "olpc-ec: using %d ms delay for EC commands.\n", - ec_timeout); - return 1; -} -__setup("olpc_ec_timeout=", olpc_ec_timeout_set); - -/* - * These {i,o}bf_status functions return whether the buffers are full or not. - */ - -static inline unsigned int ibf_status(unsigned int port) -{ - return !!(inb(port) & 0x02); -} - -static inline unsigned int obf_status(unsigned int port) -{ - return inb(port) & 0x01; -} - -#define wait_on_ibf(p, d) __wait_on_ibf(__LINE__, (p), (d)) -static int __wait_on_ibf(unsigned int line, unsigned int port, int desired) -{ - unsigned int timeo; - int state = ibf_status(port); - - for (timeo = ec_timeout; state != desired && timeo; timeo--) { - mdelay(1); - state = ibf_status(port); - } - - if ((state == desired) && (ec_timeout > EC_BASE_TIMEOUT) && - timeo < (ec_timeout - EC_BASE_TIMEOUT)) { - printk(KERN_WARNING "olpc-ec: %d: waited %u ms for IBF!\n", - line, ec_timeout - timeo); - } - - return !(state == desired); -} - -#define wait_on_obf(p, d) __wait_on_obf(__LINE__, (p), (d)) -static int __wait_on_obf(unsigned int line, unsigned int port, int desired) -{ - unsigned int timeo; - int state = obf_status(port); - - for (timeo = ec_timeout; state != desired && timeo; timeo--) { - mdelay(1); - state = obf_status(port); - } - - if ((state == desired) && (ec_timeout > EC_BASE_TIMEOUT) && - timeo < (ec_timeout - EC_BASE_TIMEOUT)) { - printk(KERN_WARNING "olpc-ec: %d: waited %u ms for OBF!\n", - line, ec_timeout - timeo); - } - - return !(state == desired); -} - -/* - * This allows the kernel to run Embedded Controller commands. The EC is - * documented at <http://wiki.laptop.org/go/Embedded_controller>, and the - * available EC commands are here: - * <http://wiki.laptop.org/go/Ec_specification>. Unfortunately, while - * OpenFirmware's source is available, the EC's is not. - */ -int olpc_ec_cmd(unsigned char cmd, unsigned char *inbuf, size_t inlen, - unsigned char *outbuf, size_t outlen) -{ - unsigned long flags; - int ret = -EIO; - int i; - int restarts = 0; - - spin_lock_irqsave(&ec_lock, flags); - - /* Clear OBF */ - for (i = 0; i < 10 && (obf_status(0x6c) == 1); i++) - inb(0x68); - if (i == 10) { - printk(KERN_ERR "olpc-ec: timeout while attempting to " - "clear OBF flag!\n"); - goto err; - } - - if (wait_on_ibf(0x6c, 0)) { - printk(KERN_ERR "olpc-ec: timeout waiting for EC to " - "quiesce!\n"); - goto err; - } - -restart: - /* - * Note that if we time out during any IBF checks, that's a failure; - * we have to return. There's no way for the kernel to clear that. - * - * If we time out during an OBF check, we can restart the command; - * reissuing it will clear the OBF flag, and we should be alright. - * The OBF flag will sometimes misbehave due to what we believe - * is a hardware quirk.. - */ - pr_devel("olpc-ec: running cmd 0x%x\n", cmd); - outb(cmd, 0x6c); - - if (wait_on_ibf(0x6c, 0)) { - printk(KERN_ERR "olpc-ec: timeout waiting for EC to read " - "command!\n"); - goto err; - } - - if (inbuf && inlen) { - /* write data to EC */ - for (i = 0; i < inlen; i++) { - if (wait_on_ibf(0x6c, 0)) { - printk(KERN_ERR "olpc-ec: timeout waiting for" - " EC accept data!\n"); - goto err; - } - pr_devel("olpc-ec: sending cmd arg 0x%x\n", inbuf[i]); - outb(inbuf[i], 0x68); - } - } - if (outbuf && outlen) { - /* read data from EC */ - for (i = 0; i < outlen; i++) { - if (wait_on_obf(0x6c, 1)) { - printk(KERN_ERR "olpc-ec: timeout waiting for" - " EC to provide data!\n"); - if (restarts++ < 10) - goto restart; - goto err; - } - outbuf[i] = inb(0x68); - pr_devel("olpc-ec: received 0x%x\n", outbuf[i]); - } - } - - ret = 0; -err: - spin_unlock_irqrestore(&ec_lock, flags); - return ret; -} -EXPORT_SYMBOL_GPL(olpc_ec_cmd); - -static bool __init check_ofw_architecture(void) -{ - size_t propsize; - char olpc_arch[5]; - const void *args[] = { NULL, "architecture", olpc_arch, (void *)5 }; - void *res[] = { &propsize }; - - if (olpc_ofw("getprop", args, res)) { - printk(KERN_ERR "ofw: getprop call failed!\n"); - return false; - } - return propsize == 5 && strncmp("OLPC", olpc_arch, 5) == 0; -} - -static u32 __init get_board_revision(void) -{ - size_t propsize; - __be32 rev; - const void *args[] = { NULL, "board-revision-int", &rev, (void *)4 }; - void *res[] = { &propsize }; - - if (olpc_ofw("getprop", args, res) || propsize != 4) { - printk(KERN_ERR "ofw: getprop call failed!\n"); - return cpu_to_be32(0); - } - return be32_to_cpu(rev); -} - -static bool __init platform_detect(void) -{ - if (!check_ofw_architecture()) - return false; - olpc_platform_info.flags |= OLPC_F_PRESENT; - olpc_platform_info.boardrev = get_board_revision(); - return true; -} - -static int __init add_xo1_platform_devices(void) -{ - struct platform_device *pdev; - - pdev = platform_device_register_simple("xo1-rfkill", -1, NULL, 0); - if (IS_ERR(pdev)) - return PTR_ERR(pdev); - - pdev = platform_device_register_simple("olpc-xo1", -1, NULL, 0); - if (IS_ERR(pdev)) - return PTR_ERR(pdev); - - return 0; -} - -static int __init olpc_init(void) -{ - int r = 0; - - if (!olpc_ofw_present() || !platform_detect()) - return 0; - - spin_lock_init(&ec_lock); - - /* assume B1 and above models always have a DCON */ - if (olpc_board_at_least(olpc_board(0xb1))) - olpc_platform_info.flags |= OLPC_F_DCON; - - /* get the EC revision */ - olpc_ec_cmd(EC_FIRMWARE_REV, NULL, 0, - (unsigned char *) &olpc_platform_info.ecver, 1); - -#ifdef CONFIG_PCI_OLPC - /* If the VSA exists let it emulate PCI, if not emulate in kernel. - * XO-1 only. */ - if (olpc_platform_info.boardrev < olpc_board_pre(0xd0) && - !cs5535_has_vsa2()) - x86_init.pci.arch_init = pci_olpc_init; -#endif - - printk(KERN_INFO "OLPC board revision %s%X (EC=%x)\n", - ((olpc_platform_info.boardrev & 0xf) < 8) ? "pre" : "", - olpc_platform_info.boardrev >> 4, - olpc_platform_info.ecver); - - if (olpc_platform_info.boardrev < olpc_board_pre(0xd0)) { /* XO-1 */ - r = add_xo1_platform_devices(); - if (r) - return r; - } - - return 0; -} - -postcore_initcall(olpc_init); diff --git a/arch/x86/kernel/olpc_ofw.c b/arch/x86/kernel/olpc_ofw.c deleted file mode 100644 index 78732046437..00000000000 --- a/arch/x86/kernel/olpc_ofw.c +++ /dev/null @@ -1,112 +0,0 @@ -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <asm/page.h> -#include <asm/setup.h> -#include <asm/io.h> -#include <asm/pgtable.h> -#include <asm/olpc_ofw.h> - -/* address of OFW callback interface; will be NULL if OFW isn't found */ -static int (*olpc_ofw_cif)(int *); - -/* page dir entry containing OFW's pgdir table; filled in by head_32.S */ -u32 olpc_ofw_pgd __initdata; - -static DEFINE_SPINLOCK(ofw_lock); - -#define MAXARGS 10 - -void __init setup_olpc_ofw_pgd(void) -{ - pgd_t *base, *ofw_pde; - - if (!olpc_ofw_cif) - return; - - /* fetch OFW's PDE */ - base = early_ioremap(olpc_ofw_pgd, sizeof(olpc_ofw_pgd) * PTRS_PER_PGD); - if (!base) { - printk(KERN_ERR "failed to remap OFW's pgd - disabling OFW!\n"); - olpc_ofw_cif = NULL; - return; - } - ofw_pde = &base[OLPC_OFW_PDE_NR]; - - /* install OFW's PDE permanently into the kernel's pgtable */ - set_pgd(&swapper_pg_dir[OLPC_OFW_PDE_NR], *ofw_pde); - /* implicit optimization barrier here due to uninline function return */ - - early_iounmap(base, sizeof(olpc_ofw_pgd) * PTRS_PER_PGD); -} - -int __olpc_ofw(const char *name, int nr_args, const void **args, int nr_res, - void **res) -{ - int ofw_args[MAXARGS + 3]; - unsigned long flags; - int ret, i, *p; - - BUG_ON(nr_args + nr_res > MAXARGS); - - if (!olpc_ofw_cif) - return -EIO; - - ofw_args[0] = (int)name; - ofw_args[1] = nr_args; - ofw_args[2] = nr_res; - - p = &ofw_args[3]; - for (i = 0; i < nr_args; i++, p++) - *p = (int)args[i]; - - /* call into ofw */ - spin_lock_irqsave(&ofw_lock, flags); - ret = olpc_ofw_cif(ofw_args); - spin_unlock_irqrestore(&ofw_lock, flags); - - if (!ret) { - for (i = 0; i < nr_res; i++, p++) - *((int *)res[i]) = *p; - } - - return ret; -} -EXPORT_SYMBOL_GPL(__olpc_ofw); - -bool olpc_ofw_present(void) -{ - return olpc_ofw_cif != NULL; -} -EXPORT_SYMBOL_GPL(olpc_ofw_present); - -/* OFW cif _should_ be above this address */ -#define OFW_MIN 0xff000000 - -/* OFW starts on a 1MB boundary */ -#define OFW_BOUND (1<<20) - -void __init olpc_ofw_detect(void) -{ - struct olpc_ofw_header *hdr = &boot_params.olpc_ofw_header; - unsigned long start; - - /* ensure OFW booted us by checking for "OFW " string */ - if (hdr->ofw_magic != OLPC_OFW_SIG) - return; - - olpc_ofw_cif = (int (*)(int *))hdr->cif_handler; - - if ((unsigned long)olpc_ofw_cif < OFW_MIN) { - printk(KERN_ERR "OFW detected, but cif has invalid address 0x%lx - disabling.\n", - (unsigned long)olpc_ofw_cif); - olpc_ofw_cif = NULL; - return; - } - - /* determine where OFW starts in memory */ - start = round_down((unsigned long)olpc_ofw_cif, OFW_BOUND); - printk(KERN_INFO "OFW detected in memory, cif @ 0x%lx (reserving top %ldMB)\n", - (unsigned long)olpc_ofw_cif, (-start) >> 20); - reserve_top_address(-start); -} diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index c5b250011fd..869e1aeeb71 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -421,8 +421,11 @@ struct pv_mmu_ops pv_mmu_ops = { .set_pte = native_set_pte, .set_pte_at = native_set_pte_at, .set_pmd = native_set_pmd, + .set_pmd_at = native_set_pmd_at, .pte_update = paravirt_nop, .pte_update_defer = paravirt_nop, + .pmd_update = paravirt_nop, + .pmd_update_defer = paravirt_nop, .ptep_modify_prot_start = __ptep_modify_prot_start, .ptep_modify_prot_commit = __ptep_modify_prot_commit, diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index f56a117cef6..e8c33a30200 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -1279,7 +1279,7 @@ static int __init calgary_bus_has_devices(int bus, unsigned short pci_dev) if (pci_dev == PCI_DEVICE_ID_IBM_CALIOC2) { /* - * FIXME: properly scan for devices accross the + * FIXME: properly scan for devices across the * PCI-to-PCI bridge on every CalIOC2 port. */ return 1; @@ -1295,7 +1295,7 @@ static int __init calgary_bus_has_devices(int bus, unsigned short pci_dev) /* * calgary_init_bitmap_from_tce_table(): - * Funtion for kdump case. In the second/kdump kernel initialize + * Function for kdump case. In the second/kdump kernel initialize * the bitmap based on the tce table entries obtained from first kernel */ static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl) diff --git a/arch/x86/kernel/pci-iommu_table.c b/arch/x86/kernel/pci-iommu_table.c index 55d745ec118..35ccf75696e 100644 --- a/arch/x86/kernel/pci-iommu_table.c +++ b/arch/x86/kernel/pci-iommu_table.c @@ -50,20 +50,14 @@ void __init check_iommu_entries(struct iommu_table_entry *start, struct iommu_table_entry *finish) { struct iommu_table_entry *p, *q, *x; - char sym_p[KSYM_SYMBOL_LEN]; - char sym_q[KSYM_SYMBOL_LEN]; /* Simple cyclic dependency checker. */ for (p = start; p < finish; p++) { q = find_dependents_of(start, finish, p); x = find_dependents_of(start, finish, q); if (p == x) { - sprint_symbol(sym_p, (unsigned long)p->detect); - sprint_symbol(sym_q, (unsigned long)q->detect); - - printk(KERN_ERR "CYCLIC DEPENDENCY FOUND! %s depends" \ - " on %s and vice-versa. BREAKING IT.\n", - sym_p, sym_q); + printk(KERN_ERR "CYCLIC DEPENDENCY FOUND! %pS depends on %pS and vice-versa. BREAKING IT.\n", + p->detect, q->detect); /* Heavy handed way..*/ x->depend = 0; } @@ -72,12 +66,8 @@ void __init check_iommu_entries(struct iommu_table_entry *start, for (p = start; p < finish; p++) { q = find_dependents_of(p, finish, p); if (q && q > p) { - sprint_symbol(sym_p, (unsigned long)p->detect); - sprint_symbol(sym_q, (unsigned long)q->detect); - - printk(KERN_ERR "EXECUTION ORDER INVALID! %s "\ - "should be called before %s!\n", - sym_p, sym_q); + printk(KERN_ERR "EXECUTION ORDER INVALID! %pS should be called before %pS!\n", + p->detect, q->detect); } } } diff --git a/arch/x86/kernel/probe_roms_32.c b/arch/x86/kernel/probe_roms.c index 071e7fea42e..ba0a4cce53b 100644 --- a/arch/x86/kernel/probe_roms_32.c +++ b/arch/x86/kernel/probe_roms.c @@ -73,6 +73,107 @@ static struct resource video_rom_resource = { .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM }; +/* does this oprom support the given pci device, or any of the devices + * that the driver supports? + */ +static bool match_id(struct pci_dev *pdev, unsigned short vendor, unsigned short device) +{ + struct pci_driver *drv = pdev->driver; + const struct pci_device_id *id; + + if (pdev->vendor == vendor && pdev->device == device) + return true; + + for (id = drv ? drv->id_table : NULL; id && id->vendor; id++) + if (id->vendor == vendor && id->device == device) + break; + + return id && id->vendor; +} + +static bool probe_list(struct pci_dev *pdev, unsigned short vendor, + const unsigned char *rom_list) +{ + unsigned short device; + + do { + if (probe_kernel_address(rom_list, device) != 0) + device = 0; + + if (device && match_id(pdev, vendor, device)) + break; + + rom_list += 2; + } while (device); + + return !!device; +} + +static struct resource *find_oprom(struct pci_dev *pdev) +{ + struct resource *oprom = NULL; + int i; + + for (i = 0; i < ARRAY_SIZE(adapter_rom_resources); i++) { + struct resource *res = &adapter_rom_resources[i]; + unsigned short offset, vendor, device, list, rev; + const unsigned char *rom; + + if (res->end == 0) + break; + + rom = isa_bus_to_virt(res->start); + if (probe_kernel_address(rom + 0x18, offset) != 0) + continue; + + if (probe_kernel_address(rom + offset + 0x4, vendor) != 0) + continue; + + if (probe_kernel_address(rom + offset + 0x6, device) != 0) + continue; + + if (match_id(pdev, vendor, device)) { + oprom = res; + break; + } + + if (probe_kernel_address(rom + offset + 0x8, list) == 0 && + probe_kernel_address(rom + offset + 0xc, rev) == 0 && + rev >= 3 && list && + probe_list(pdev, vendor, rom + offset + list)) { + oprom = res; + break; + } + } + + return oprom; +} + +void *pci_map_biosrom(struct pci_dev *pdev) +{ + struct resource *oprom = find_oprom(pdev); + + if (!oprom) + return NULL; + + return ioremap(oprom->start, resource_size(oprom)); +} +EXPORT_SYMBOL(pci_map_biosrom); + +void pci_unmap_biosrom(void __iomem *image) +{ + iounmap(image); +} +EXPORT_SYMBOL(pci_unmap_biosrom); + +size_t pci_biosrom_size(struct pci_dev *pdev) +{ + struct resource *oprom = find_oprom(pdev); + + return oprom ? resource_size(oprom) : 0; +} +EXPORT_SYMBOL(pci_biosrom_size); + #define ROMSIGNATURE 0xaa55 static int __init romsignature(const unsigned char *rom) diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 57d1868a86a..88a90a977f8 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -14,6 +14,7 @@ #include <linux/utsname.h> #include <trace/events/power.h> #include <linux/hw_breakpoint.h> +#include <asm/cpu.h> #include <asm/system.h> #include <asm/apic.h> #include <asm/syscalls.h> @@ -22,11 +23,6 @@ #include <asm/i387.h> #include <asm/debugreg.h> -unsigned long idle_halt; -EXPORT_SYMBOL(idle_halt); -unsigned long idle_nomwait; -EXPORT_SYMBOL(idle_nomwait); - struct kmem_cache *task_xstate_cachep; EXPORT_SYMBOL_GPL(task_xstate_cachep); @@ -91,27 +87,33 @@ void exit_thread(void) void show_regs(struct pt_regs *regs) { show_registers(regs); - show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs), - regs->bp); + show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs), 0); } void show_regs_common(void) { - const char *board, *product; + const char *vendor, *product, *board; - board = dmi_get_system_info(DMI_BOARD_NAME); - if (!board) - board = ""; + vendor = dmi_get_system_info(DMI_SYS_VENDOR); + if (!vendor) + vendor = ""; product = dmi_get_system_info(DMI_PRODUCT_NAME); if (!product) product = ""; + /* Board Name is optional */ + board = dmi_get_system_info(DMI_BOARD_NAME); + printk(KERN_CONT "\n"); - printk(KERN_DEFAULT "Pid: %d, comm: %.20s %s %s %.*s %s/%s\n", + printk(KERN_DEFAULT "Pid: %d, comm: %.20s %s %s %.*s", current->pid, current->comm, print_tainted(), init_utsname()->release, (int)strcspn(init_utsname()->version, " "), - init_utsname()->version, board, product); + init_utsname()->version); + printk(KERN_CONT " %s %s", vendor, product); + if (board) + printk(KERN_CONT "/%s", board); + printk(KERN_CONT "\n"); } void flush_thread(void) @@ -328,7 +330,7 @@ long sys_execve(const char __user *name, /* * Idle related variables and functions */ -unsigned long boot_option_idle_override = 0; +unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE; EXPORT_SYMBOL(boot_option_idle_override); /* @@ -374,6 +376,7 @@ void default_idle(void) { if (hlt_use_halt()) { trace_power_start(POWER_CSTATE, 1, smp_processor_id()); + trace_cpu_idle(1, smp_processor_id()); current_thread_info()->status &= ~TS_POLLING; /* * TS_POLLING-cleared state must be visible before we @@ -386,6 +389,8 @@ void default_idle(void) else local_irq_enable(); current_thread_info()->status |= TS_POLLING; + trace_power_end(smp_processor_id()); + trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); } else { local_irq_enable(); /* loop is done by the caller */ @@ -443,9 +448,8 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait); */ void mwait_idle_with_hints(unsigned long ax, unsigned long cx) { - trace_power_start(POWER_CSTATE, (ax>>4)+1, smp_processor_id()); if (!need_resched()) { - if (cpu_has(¤t_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) + if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) clflush((void *)¤t_thread_info()->flags); __monitor((void *)¤t_thread_info()->flags, 0, 0); @@ -460,7 +464,8 @@ static void mwait_idle(void) { if (!need_resched()) { trace_power_start(POWER_CSTATE, 1, smp_processor_id()); - if (cpu_has(¤t_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) + trace_cpu_idle(1, smp_processor_id()); + if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) clflush((void *)¤t_thread_info()->flags); __monitor((void *)¤t_thread_info()->flags, 0, 0); @@ -469,6 +474,8 @@ static void mwait_idle(void) __sti_mwait(0, 0); else local_irq_enable(); + trace_power_end(smp_processor_id()); + trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); } else local_irq_enable(); } @@ -481,10 +488,12 @@ static void mwait_idle(void) static void poll_idle(void) { trace_power_start(POWER_CSTATE, 0, smp_processor_id()); + trace_cpu_idle(0, smp_processor_id()); local_irq_enable(); while (!need_resched()) cpu_relax(); - trace_power_end(0); + trace_power_end(smp_processor_id()); + trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); } /* @@ -499,17 +508,16 @@ static void poll_idle(void) * * idle=mwait overrides this decision and forces the usage of mwait. */ -static int __cpuinitdata force_mwait; #define MWAIT_INFO 0x05 #define MWAIT_ECX_EXTENDED_INFO 0x01 #define MWAIT_EDX_C1 0xf0 -static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c) +int mwait_usable(const struct cpuinfo_x86 *c) { u32 eax, ebx, ecx, edx; - if (force_mwait) + if (boot_option_idle_override == IDLE_FORCE_MWAIT) return 1; if (c->cpuid_level < MWAIT_INFO) @@ -629,9 +637,10 @@ static int __init idle_setup(char *str) if (!strcmp(str, "poll")) { printk("using polling idle threads.\n"); pm_idle = poll_idle; - } else if (!strcmp(str, "mwait")) - force_mwait = 1; - else if (!strcmp(str, "halt")) { + boot_option_idle_override = IDLE_POLL; + } else if (!strcmp(str, "mwait")) { + boot_option_idle_override = IDLE_FORCE_MWAIT; + } else if (!strcmp(str, "halt")) { /* * When the boot option of idle=halt is added, halt is * forced to be used for CPU idle. In such case CPU C2/C3 @@ -640,8 +649,7 @@ static int __init idle_setup(char *str) * the boot_option_idle_override. */ pm_idle = default_idle; - idle_halt = 1; - return 0; + boot_option_idle_override = IDLE_HALT; } else if (!strcmp(str, "nomwait")) { /* * If the boot option of "idle=nomwait" is added, @@ -649,12 +657,10 @@ static int __init idle_setup(char *str) * states. In such case it won't touch the variable * of boot_option_idle_override. */ - idle_nomwait = 1; - return 0; + boot_option_idle_override = IDLE_NOMWAIT; } else return -1; - boot_option_idle_override = 1; return 0; } early_param("idle", idle_setup); diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 96586c3cbbb..8d128783af4 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -57,8 +57,6 @@ #include <asm/syscalls.h> #include <asm/debugreg.h> -#include <trace/events/power.h> - asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); /* @@ -113,8 +111,6 @@ void cpu_idle(void) stop_critical_timings(); pm_idle(); start_critical_timings(); - - trace_power_end(smp_processor_id()); } tick_nohz_restart_sched_tick(); preempt_enable_no_resched(); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index b3d7a3a04f3..6c9dd922ac0 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -51,8 +51,6 @@ #include <asm/syscalls.h> #include <asm/debugreg.h> -#include <trace/events/power.h> - asmlinkage extern void ret_from_fork(void); DEFINE_PER_CPU(unsigned long, old_rsp); @@ -141,8 +139,6 @@ void cpu_idle(void) pm_idle(); start_critical_timings(); - trace_power_end(smp_processor_id()); - /* In many cases the interrupt that ended idle has already called exit_idle. But some idle loops can be woken up without interrupt. */ @@ -505,6 +501,10 @@ void set_personality_64bit(void) /* Make sure to be in 64bit mode */ clear_thread_flag(TIF_IA32); + /* Ensure the corresponding mm is not marked. */ + if (current->mm) + current->mm->context.ia32_compat = 0; + /* TBD: overwrites user setup. Should have two bits. But 64bit processes have always behaved this way, so it's not too bad. The main problem is just that @@ -520,6 +520,10 @@ void set_personality_ia32(void) set_thread_flag(TIF_IA32); current->personality |= force_personality32; + /* Mark the associated mm as containing 32-bit tasks. */ + if (current->mm) + current->mm->context.ia32_compat = 1; + /* Prepare the first "return" to user space */ current_thread_info()->status |= TS_COMPAT; } diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 70c4872cd8a..f65e5b521db 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -608,6 +608,9 @@ static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data) unsigned len, type; struct perf_event *bp; + if (ptrace_get_breakpoints(tsk) < 0) + return -ESRCH; + data &= ~DR_CONTROL_RESERVED; old_dr7 = ptrace_get_dr7(thread->ptrace_bps); restore: @@ -655,6 +658,9 @@ restore: } goto restore; } + + ptrace_put_breakpoints(tsk); + return ((orig_ret < 0) ? orig_ret : rc); } @@ -668,10 +674,17 @@ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n) if (n < HBP_NUM) { struct perf_event *bp; + + if (ptrace_get_breakpoints(tsk) < 0) + return -ESRCH; + bp = thread->ptrace_bps[n]; if (!bp) - return 0; - val = bp->hw.info.address; + val = 0; + else + val = bp->hw.info.address; + + ptrace_put_breakpoints(tsk); } else if (n == 6) { val = thread->debugreg6; } else if (n == 7) { @@ -686,6 +699,10 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, struct perf_event *bp; struct thread_struct *t = &tsk->thread; struct perf_event_attr attr; + int err = 0; + + if (ptrace_get_breakpoints(tsk) < 0) + return -ESRCH; if (!t->ptrace_bps[nr]) { ptrace_breakpoint_init(&attr); @@ -709,24 +726,23 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, * writing for the user. And anyway this is the previous * behaviour. */ - if (IS_ERR(bp)) - return PTR_ERR(bp); + if (IS_ERR(bp)) { + err = PTR_ERR(bp); + goto put; + } t->ptrace_bps[nr] = bp; } else { - int err; - bp = t->ptrace_bps[nr]; attr = bp->attr; attr.bp_addr = addr; err = modify_user_hw_breakpoint(bp, &attr); - if (err) - return err; } - - return 0; +put: + ptrace_put_breakpoints(tsk); + return err; } /* @@ -801,7 +817,8 @@ void ptrace_disable(struct task_struct *child) static const struct user_regset_view user_x86_32_view; /* Initialized below. */ #endif -long arch_ptrace(struct task_struct *child, long request, long addr, long data) +long arch_ptrace(struct task_struct *child, long request, + unsigned long addr, unsigned long data) { int ret; unsigned long __user *datap = (unsigned long __user *)data; @@ -812,8 +829,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) unsigned long tmp; ret = -EIO; - if ((addr & (sizeof(data) - 1)) || addr < 0 || - addr >= sizeof(struct user)) + if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user)) break; tmp = 0; /* Default return condition */ @@ -830,8 +846,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) case PTRACE_POKEUSR: /* write the word at location addr in the USER area */ ret = -EIO; - if ((addr & (sizeof(data) - 1)) || addr < 0 || - addr >= sizeof(struct user)) + if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user)) break; if (addr < sizeof(struct user_regs_struct)) @@ -888,17 +903,17 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION case PTRACE_GET_THREAD_AREA: - if (addr < 0) + if ((int) addr < 0) return -EIO; ret = do_get_thread_area(child, addr, - (struct user_desc __user *) data); + (struct user_desc __user *)data); break; case PTRACE_SET_THREAD_AREA: - if (addr < 0) + if ((int) addr < 0) return -EIO; ret = do_set_thread_area(child, addr, - (struct user_desc __user *) data, 0); + (struct user_desc __user *)data, 0); break; #endif diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index bab3b9e6f66..42eb3300dfc 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -41,44 +41,6 @@ void pvclock_set_flags(u8 flags) valid_flags = flags; } -/* - * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, - * yielding a 64-bit result. - */ -static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift) -{ - u64 product; -#ifdef __i386__ - u32 tmp1, tmp2; -#endif - - if (shift < 0) - delta >>= -shift; - else - delta <<= shift; - -#ifdef __i386__ - __asm__ ( - "mul %5 ; " - "mov %4,%%eax ; " - "mov %%edx,%4 ; " - "mul %5 ; " - "xor %5,%5 ; " - "add %4,%%eax ; " - "adc %5,%%edx ; " - : "=A" (product), "=r" (tmp1), "=r" (tmp2) - : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); -#elif defined(__x86_64__) - __asm__ ( - "mul %%rdx ; shrd $32,%%rdx,%%rax" - : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) ); -#else -#error implement me! -#endif - - return product; -} - static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow) { u64 delta = native_read_tsc() - shadow->tsc_timestamp; @@ -121,6 +83,11 @@ unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src) static atomic64_t last_value = ATOMIC64_INIT(0); +void pvclock_resume(void) +{ + atomic64_set(&last_value, 0); +} + cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) { struct pvclock_shadow_time shadow; diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index f7f53dcd3e0..0c016f72769 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -6,6 +6,7 @@ #include <linux/dmi.h> #include <linux/sched.h> #include <linux/tboot.h> +#include <linux/delay.h> #include <acpi/reboot.h> #include <asm/io.h> #include <asm/apic.h> @@ -18,6 +19,7 @@ #include <asm/pci_x86.h> #include <asm/virtext.h> #include <asm/cpu.h> +#include <asm/nmi.h> #ifdef CONFIG_X86_32 # include <linux/ctype.h> @@ -34,7 +36,7 @@ EXPORT_SYMBOL(pm_power_off); static const struct desc_ptr no_idt = {}; static int reboot_mode; -enum reboot_type reboot_type = BOOT_KBD; +enum reboot_type reboot_type = BOOT_ACPI; int reboot_force; #if defined(CONFIG_X86_32) && defined(CONFIG_SMP) @@ -284,6 +286,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_BOARD_NAME, "P4S800"), }, }, + { /* Handle problems with rebooting on VersaLogic Menlow boards */ + .callback = set_bios_reboot, + .ident = "VersaLogic Menlow based board", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "VersaLogic Corporation"), + DMI_MATCH(DMI_BOARD_NAME, "VersaLogic Menlow board"), + }, + }, { } }; @@ -294,68 +304,16 @@ static int __init reboot_init(void) } core_initcall(reboot_init); -/* The following code and data reboots the machine by switching to real - mode and jumping to the BIOS reset entry point, as if the CPU has - really been reset. The previous version asked the keyboard - controller to pulse the CPU reset line, which is more thorough, but - doesn't work with at least one type of 486 motherboard. It is easy - to stop this code working; hence the copious comments. */ -static const unsigned long long -real_mode_gdt_entries [3] = -{ - 0x0000000000000000ULL, /* Null descriptor */ - 0x00009b000000ffffULL, /* 16-bit real-mode 64k code at 0x00000000 */ - 0x000093000100ffffULL /* 16-bit real-mode 64k data at 0x00000100 */ -}; +extern const unsigned char machine_real_restart_asm[]; +extern const u64 machine_real_restart_gdt[3]; -static const struct desc_ptr -real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, (long)real_mode_gdt_entries }, -real_mode_idt = { 0x3ff, 0 }; - -/* This is 16-bit protected mode code to disable paging and the cache, - switch to real mode and jump to the BIOS reset code. - - The instruction that switches to real mode by writing to CR0 must be - followed immediately by a far jump instruction, which set CS to a - valid value for real mode, and flushes the prefetch queue to avoid - running instructions that have already been decoded in protected - mode. - - Clears all the flags except ET, especially PG (paging), PE - (protected-mode enable) and TS (task switch for coprocessor state - save). Flushes the TLB after paging has been disabled. Sets CD and - NW, to disable the cache on a 486, and invalidates the cache. This - is more like the state of a 486 after reset. I don't know if - something else should be done for other chips. - - More could be done here to set up the registers as if a CPU reset had - occurred; hopefully real BIOSs don't assume much. */ -static const unsigned char real_mode_switch [] = +void machine_real_restart(unsigned int type) { - 0x66, 0x0f, 0x20, 0xc0, /* movl %cr0,%eax */ - 0x66, 0x83, 0xe0, 0x11, /* andl $0x00000011,%eax */ - 0x66, 0x0d, 0x00, 0x00, 0x00, 0x60, /* orl $0x60000000,%eax */ - 0x66, 0x0f, 0x22, 0xc0, /* movl %eax,%cr0 */ - 0x66, 0x0f, 0x22, 0xd8, /* movl %eax,%cr3 */ - 0x66, 0x0f, 0x20, 0xc3, /* movl %cr0,%ebx */ - 0x66, 0x81, 0xe3, 0x00, 0x00, 0x00, 0x60, /* andl $0x60000000,%ebx */ - 0x74, 0x02, /* jz f */ - 0x0f, 0x09, /* wbinvd */ - 0x24, 0x10, /* f: andb $0x10,al */ - 0x66, 0x0f, 0x22, 0xc0 /* movl %eax,%cr0 */ -}; -static const unsigned char jump_to_bios [] = -{ - 0xea, 0x00, 0x00, 0xff, 0xff /* ljmp $0xffff,$0x0000 */ -}; + void *restart_va; + unsigned long restart_pa; + void (*restart_lowmem)(unsigned int); + u64 *lowmem_gdt; -/* - * Switch to real mode and then execute the code - * specified by the code and length parameters. - * We assume that length will aways be less that 100! - */ -void machine_real_restart(const unsigned char *code, int length) -{ local_irq_disable(); /* Write zero to CMOS register number 0x0f, which the BIOS POST @@ -383,41 +341,23 @@ void machine_real_restart(const unsigned char *code, int length) too. */ *((unsigned short *)0x472) = reboot_mode; - /* For the switch to real mode, copy some code to low memory. It has - to be in the first 64k because it is running in 16-bit mode, and it - has to have the same physical and virtual address, because it turns - off paging. Copy it near the end of the first page, out of the way - of BIOS variables. */ - memcpy((void *)(0x1000 - sizeof(real_mode_switch) - 100), - real_mode_switch, sizeof (real_mode_switch)); - memcpy((void *)(0x1000 - 100), code, length); - - /* Set up the IDT for real mode. */ - load_idt(&real_mode_idt); - - /* Set up a GDT from which we can load segment descriptors for real - mode. The GDT is not used in real mode; it is just needed here to - prepare the descriptors. */ - load_gdt(&real_mode_gdt); - - /* Load the data segment registers, and thus the descriptors ready for - real mode. The base address of each segment is 0x100, 16 times the - selector value being loaded here. This is so that the segment - registers don't have to be reloaded after switching to real mode: - the values are consistent for real mode operation already. */ - __asm__ __volatile__ ("movl $0x0010,%%eax\n" - "\tmovl %%eax,%%ds\n" - "\tmovl %%eax,%%es\n" - "\tmovl %%eax,%%fs\n" - "\tmovl %%eax,%%gs\n" - "\tmovl %%eax,%%ss" : : : "eax"); - - /* Jump to the 16-bit code that we copied earlier. It disables paging - and the cache, switches to real mode, and jumps to the BIOS reset - entry point. */ - __asm__ __volatile__ ("ljmp $0x0008,%0" - : - : "i" ((void *)(0x1000 - sizeof (real_mode_switch) - 100))); + /* Patch the GDT in the low memory trampoline */ + lowmem_gdt = TRAMPOLINE_SYM(machine_real_restart_gdt); + + restart_va = TRAMPOLINE_SYM(machine_real_restart_asm); + restart_pa = virt_to_phys(restart_va); + restart_lowmem = (void (*)(unsigned int))restart_pa; + + /* GDT[0]: GDT self-pointer */ + lowmem_gdt[0] = + (u64)(sizeof(machine_real_restart_gdt) - 1) + + ((u64)virt_to_phys(lowmem_gdt) << 16); + /* GDT[1]: 64K real mode code segment */ + lowmem_gdt[1] = + GDT_ENTRY(0x009b, restart_pa, 0xffff); + + /* Jump to the identity-mapped low memory code */ + restart_lowmem(type); } #ifdef CONFIG_APM_MODULE EXPORT_SYMBOL(machine_real_restart); @@ -538,9 +478,24 @@ void __attribute__((weak)) mach_reboot_fixups(void) { } +/* + * Windows compatible x86 hardware expects the following on reboot: + * + * 1) If the FADT has the ACPI reboot register flag set, try it + * 2) If still alive, write to the keyboard controller + * 3) If still alive, write to the ACPI reboot register again + * 4) If still alive, write to the keyboard controller again + * + * If the machine is still alive at this stage, it gives up. We default to + * following the same pattern, except that if we're still alive after (4) we'll + * try to force a triple fault and then cycle between hitting the keyboard + * controller and doing that + */ static void native_machine_emergency_restart(void) { int i; + int attempt = 0; + int orig_reboot_type = reboot_type; if (reboot_emergency) emergency_vmx_disable_all(); @@ -562,6 +517,13 @@ static void native_machine_emergency_restart(void) outb(0xfe, 0x64); /* pulse reset low */ udelay(50); } + if (attempt == 0 && orig_reboot_type == BOOT_ACPI) { + attempt = 1; + reboot_type = BOOT_ACPI; + } else { + reboot_type = BOOT_TRIPLE; + } + break; case BOOT_TRIPLE: load_idt(&no_idt); @@ -572,7 +534,7 @@ static void native_machine_emergency_restart(void) #ifdef CONFIG_X86_32 case BOOT_BIOS: - machine_real_restart(jump_to_bios, sizeof(jump_to_bios)); + machine_real_restart(MRR_BIOS); reboot_type = BOOT_KBD; break; @@ -635,7 +597,7 @@ void native_machine_shutdown(void) /* O.K Now that I'm on the appropriate processor, * stop all of the others. */ - smp_send_stop(); + stop_other_cpus(); #endif lapic_shutdown(); @@ -747,7 +709,7 @@ static int crash_nmi_callback(struct notifier_block *self, { int cpu; - if (val != DIE_NMI_IPI) + if (val != DIE_NMI) return NOTIFY_OK; cpu = raw_smp_processor_id(); @@ -778,6 +740,8 @@ static void smp_send_nmi_allbutself(void) static struct notifier_block crash_nmi_nb = { .notifier_call = crash_nmi_callback, + /* we want to be the first one called */ + .priority = NMI_LOCAL_HIGH_PRIOR+1, }; /* Halt all other CPUs, calling the specified function on each of them diff --git a/arch/x86/kernel/reboot_32.S b/arch/x86/kernel/reboot_32.S new file mode 100644 index 00000000000..1d5c46df0d7 --- /dev/null +++ b/arch/x86/kernel/reboot_32.S @@ -0,0 +1,135 @@ +#include <linux/linkage.h> +#include <linux/init.h> +#include <asm/segment.h> +#include <asm/page_types.h> + +/* + * The following code and data reboots the machine by switching to real + * mode and jumping to the BIOS reset entry point, as if the CPU has + * really been reset. The previous version asked the keyboard + * controller to pulse the CPU reset line, which is more thorough, but + * doesn't work with at least one type of 486 motherboard. It is easy + * to stop this code working; hence the copious comments. + * + * This code is called with the restart type (0 = BIOS, 1 = APM) in %eax. + */ + .section ".x86_trampoline","a" + .balign 16 + .code32 +ENTRY(machine_real_restart_asm) +r_base = . + /* Get our own relocated address */ + call 1f +1: popl %ebx + subl $(1b - r_base), %ebx + + /* Compute the equivalent real-mode segment */ + movl %ebx, %ecx + shrl $4, %ecx + + /* Patch post-real-mode segment jump */ + movw (dispatch_table - r_base)(%ebx,%eax,2),%ax + movw %ax, (101f - r_base)(%ebx) + movw %cx, (102f - r_base)(%ebx) + + /* Set up the IDT for real mode. */ + lidtl (machine_real_restart_idt - r_base)(%ebx) + + /* + * Set up a GDT from which we can load segment descriptors for real + * mode. The GDT is not used in real mode; it is just needed here to + * prepare the descriptors. + */ + lgdtl (machine_real_restart_gdt - r_base)(%ebx) + + /* + * Load the data segment registers with 16-bit compatible values + */ + movl $16, %ecx + movl %ecx, %ds + movl %ecx, %es + movl %ecx, %fs + movl %ecx, %gs + movl %ecx, %ss + ljmpl $8, $1f - r_base + +/* + * This is 16-bit protected mode code to disable paging and the cache, + * switch to real mode and jump to the BIOS reset code. + * + * The instruction that switches to real mode by writing to CR0 must be + * followed immediately by a far jump instruction, which set CS to a + * valid value for real mode, and flushes the prefetch queue to avoid + * running instructions that have already been decoded in protected + * mode. + * + * Clears all the flags except ET, especially PG (paging), PE + * (protected-mode enable) and TS (task switch for coprocessor state + * save). Flushes the TLB after paging has been disabled. Sets CD and + * NW, to disable the cache on a 486, and invalidates the cache. This + * is more like the state of a 486 after reset. I don't know if + * something else should be done for other chips. + * + * More could be done here to set up the registers as if a CPU reset had + * occurred; hopefully real BIOSs don't assume much. This is not the + * actual BIOS entry point, anyway (that is at 0xfffffff0). + * + * Most of this work is probably excessive, but it is what is tested. + */ + .code16 +1: + xorl %ecx, %ecx + movl %cr0, %eax + andl $0x00000011, %eax + orl $0x60000000, %eax + movl %eax, %cr0 + movl %ecx, %cr3 + movl %cr0, %edx + andl $0x60000000, %edx /* If no cache bits -> no wbinvd */ + jz 2f + wbinvd +2: + andb $0x10, %al + movl %eax, %cr0 + .byte 0xea /* ljmpw */ +101: .word 0 /* Offset */ +102: .word 0 /* Segment */ + +bios: + ljmpw $0xf000, $0xfff0 + +apm: + movw $0x1000, %ax + movw %ax, %ss + movw $0xf000, %sp + movw $0x5307, %ax + movw $0x0001, %bx + movw $0x0003, %cx + int $0x15 + +END(machine_real_restart_asm) + + .balign 16 + /* These must match <asm/reboot.h */ +dispatch_table: + .word bios - r_base + .word apm - r_base +END(dispatch_table) + + .balign 16 +machine_real_restart_idt: + .word 0xffff /* Length - real mode default value */ + .long 0 /* Base - real mode default value */ +END(machine_real_restart_idt) + + .balign 16 +ENTRY(machine_real_restart_gdt) + .quad 0 /* Self-pointer, filled in by PM code */ + .quad 0 /* 16-bit code segment, filled in by PM code */ + /* + * 16-bit data segment with the selector value 16 = 0x10 and + * base value 0x100; since this is consistent with real mode + * semantics we don't have to reload the segments once CR0.PE = 0. + */ + .quad GDT_ENTRY(0x0093, 0x100, 0xffff) +END(machine_real_restart_gdt) diff --git a/arch/x86/kernel/reboot_fixups_32.c b/arch/x86/kernel/reboot_fixups_32.c index fda313ebbb0..c8e41e90f59 100644 --- a/arch/x86/kernel/reboot_fixups_32.c +++ b/arch/x86/kernel/reboot_fixups_32.c @@ -43,17 +43,33 @@ static void rdc321x_reset(struct pci_dev *dev) outb(1, 0x92); } +static void ce4100_reset(struct pci_dev *dev) +{ + int i; + + for (i = 0; i < 10; i++) { + outb(0x2, 0xcf9); + udelay(50); + } +} + struct device_fixup { unsigned int vendor; unsigned int device; void (*reboot_fixup)(struct pci_dev *); }; +/* + * PCI ids solely used for fixups_table go here + */ +#define PCI_DEVICE_ID_INTEL_CE4100 0x0708 + static const struct device_fixup fixups_table[] = { { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, cs5530a_warm_reset }, { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, cs5536_warm_reset }, { PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE, cs5530a_warm_reset }, { PCI_VENDOR_ID_RDC, PCI_DEVICE_ID_RDC_R6030, rdc321x_reset }, +{ PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CE4100, ce4100_reset }, }; /* diff --git a/arch/x86/kernel/resource.c b/arch/x86/kernel/resource.c new file mode 100644 index 00000000000..2a26819bb6a --- /dev/null +++ b/arch/x86/kernel/resource.c @@ -0,0 +1,48 @@ +#include <linux/ioport.h> +#include <asm/e820.h> + +static void resource_clip(struct resource *res, resource_size_t start, + resource_size_t end) +{ + resource_size_t low = 0, high = 0; + + if (res->end < start || res->start > end) + return; /* no conflict */ + + if (res->start < start) + low = start - res->start; + + if (res->end > end) + high = res->end - end; + + /* Keep the area above or below the conflict, whichever is larger */ + if (low > high) + res->end = start - 1; + else + res->start = end + 1; +} + +static void remove_e820_regions(struct resource *avail) +{ + int i; + struct e820entry *entry; + + for (i = 0; i < e820.nr_map; i++) { + entry = &e820.map[i]; + + resource_clip(avail, entry->addr, + entry->addr + entry->size - 1); + } +} + +void arch_remove_reservations(struct resource *avail) +{ + /* Trim out BIOS areas (low 1MB and high 2MB) and E820 regions */ + if (avail->flags & IORESOURCE_MEM) { + if (avail->start < BIOS_END) + avail->start = BIOS_END; + resource_clip(avail, BIOS_ROM_BASE, BIOS_ROM_END); + + remove_e820_regions(avail); + } +} diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 1cfbbfc3ae2..3f2ad2640d8 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -6,6 +6,7 @@ #include <linux/acpi.h> #include <linux/bcd.h> #include <linux/pnp.h> +#include <linux/of.h> #include <asm/vsyscall.h> #include <asm/x86_init.h> @@ -76,7 +77,7 @@ int mach_set_rtc_mmss(unsigned long nowtime) CMOS_WRITE(real_seconds, RTC_SECONDS); CMOS_WRITE(real_minutes, RTC_MINUTES); } else { - printk(KERN_WARNING + printk_once(KERN_NOTICE "set_rtc_mmss: can't update from %d to %d\n", cmos_minutes, real_minutes); retval = -1; @@ -236,6 +237,8 @@ static __init int add_rtc_cmos(void) } } #endif + if (of_have_populated_dt()) + return 0; platform_device_register(&rtc_device); dev_info(&rtc_device.dev, diff --git a/arch/x86/kernel/scx200_32.c b/arch/x86/kernel/scx200_32.c deleted file mode 100644 index 7e004acbe52..00000000000 --- a/arch/x86/kernel/scx200_32.c +++ /dev/null @@ -1,131 +0,0 @@ -/* - * Copyright (c) 2001,2002 Christer Weinigel <wingel@nano-system.com> - * - * National Semiconductor SCx200 support. - */ - -#include <linux/module.h> -#include <linux/errno.h> -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/mutex.h> -#include <linux/pci.h> - -#include <linux/scx200.h> -#include <linux/scx200_gpio.h> - -/* Verify that the configuration block really is there */ -#define scx200_cb_probe(base) (inw((base) + SCx200_CBA) == (base)) - -#define NAME "scx200" - -MODULE_AUTHOR("Christer Weinigel <wingel@nano-system.com>"); -MODULE_DESCRIPTION("NatSemi SCx200 Driver"); -MODULE_LICENSE("GPL"); - -unsigned scx200_gpio_base = 0; -unsigned long scx200_gpio_shadow[2]; - -unsigned scx200_cb_base = 0; - -static struct pci_device_id scx200_tbl[] = { - { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SCx200_BRIDGE) }, - { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE) }, - { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SCx200_XBUS) }, - { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_XBUS) }, - { }, -}; -MODULE_DEVICE_TABLE(pci,scx200_tbl); - -static int __devinit scx200_probe(struct pci_dev *, const struct pci_device_id *); - -static struct pci_driver scx200_pci_driver = { - .name = "scx200", - .id_table = scx200_tbl, - .probe = scx200_probe, -}; - -static DEFINE_MUTEX(scx200_gpio_config_lock); - -static void __devinit scx200_init_shadow(void) -{ - int bank; - - /* read the current values driven on the GPIO signals */ - for (bank = 0; bank < 2; ++bank) - scx200_gpio_shadow[bank] = inl(scx200_gpio_base + 0x10 * bank); -} - -static int __devinit scx200_probe(struct pci_dev *pdev, const struct pci_device_id *ent) -{ - unsigned base; - - if (pdev->device == PCI_DEVICE_ID_NS_SCx200_BRIDGE || - pdev->device == PCI_DEVICE_ID_NS_SC1100_BRIDGE) { - base = pci_resource_start(pdev, 0); - printk(KERN_INFO NAME ": GPIO base 0x%x\n", base); - - if (!request_region(base, SCx200_GPIO_SIZE, "NatSemi SCx200 GPIO")) { - printk(KERN_ERR NAME ": can't allocate I/O for GPIOs\n"); - return -EBUSY; - } - - scx200_gpio_base = base; - scx200_init_shadow(); - - } else { - /* find the base of the Configuration Block */ - if (scx200_cb_probe(SCx200_CB_BASE_FIXED)) { - scx200_cb_base = SCx200_CB_BASE_FIXED; - } else { - pci_read_config_dword(pdev, SCx200_CBA_SCRATCH, &base); - if (scx200_cb_probe(base)) { - scx200_cb_base = base; - } else { - printk(KERN_WARNING NAME ": Configuration Block not found\n"); - return -ENODEV; - } - } - printk(KERN_INFO NAME ": Configuration Block base 0x%x\n", scx200_cb_base); - } - - return 0; -} - -u32 scx200_gpio_configure(unsigned index, u32 mask, u32 bits) -{ - u32 config, new_config; - - mutex_lock(&scx200_gpio_config_lock); - - outl(index, scx200_gpio_base + 0x20); - config = inl(scx200_gpio_base + 0x24); - - new_config = (config & mask) | bits; - outl(new_config, scx200_gpio_base + 0x24); - - mutex_unlock(&scx200_gpio_config_lock); - - return config; -} - -static int __init scx200_init(void) -{ - printk(KERN_INFO NAME ": NatSemi SCx200 Driver\n"); - - return pci_register_driver(&scx200_pci_driver); -} - -static void __exit scx200_cleanup(void) -{ - pci_unregister_driver(&scx200_pci_driver); - release_region(scx200_gpio_base, SCx200_GPIO_SIZE); -} - -module_init(scx200_init); -module_exit(scx200_cleanup); - -EXPORT_SYMBOL(scx200_gpio_base); -EXPORT_SYMBOL(scx200_gpio_shadow); -EXPORT_SYMBOL(scx200_gpio_configure); -EXPORT_SYMBOL(scx200_cb_base); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 95a32746fbf..c3050af9306 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -113,6 +113,7 @@ #endif #include <asm/mce.h> #include <asm/alternative.h> +#include <asm/prom.h> /* * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. @@ -297,6 +298,9 @@ static void __init init_gbpages(void) static inline void init_gbpages(void) { } +static void __init cleanup_highmap(void) +{ +} #endif static void __init reserve_brk(void) @@ -429,16 +433,30 @@ static void __init parse_setup_data(void) return; pa_data = boot_params.hdr.setup_data; while (pa_data) { - data = early_memremap(pa_data, PAGE_SIZE); + u32 data_len, map_len; + + map_len = max(PAGE_SIZE - (pa_data & ~PAGE_MASK), + (u64)sizeof(struct setup_data)); + data = early_memremap(pa_data, map_len); + data_len = data->len + sizeof(struct setup_data); + if (data_len > map_len) { + early_iounmap(data, map_len); + data = early_memremap(pa_data, data_len); + map_len = data_len; + } + switch (data->type) { case SETUP_E820_EXT: - parse_e820_ext(data, pa_data); + parse_e820_ext(data); + break; + case SETUP_DTB: + add_dtb(pa_data); break; default: break; } pa_data = data->next; - early_iounmap(data, PAGE_SIZE); + early_iounmap(data, map_len); } } @@ -501,7 +519,18 @@ static inline unsigned long long get_total_mem(void) return total << PAGE_SHIFT; } -#define DEFAULT_BZIMAGE_ADDR_MAX 0x37FFFFFF +/* + * Keep the crash kernel below this limit. On 32 bits earlier kernels + * would limit the kernel to the low 512 MiB due to mapping restrictions. + * On 64 bits, kexec-tools currently limits us to 896 MiB; increase this + * limit once kexec-tools are fixed. + */ +#ifdef CONFIG_X86_32 +# define CRASH_KERNEL_ADDR_MAX (512 << 20) +#else +# define CRASH_KERNEL_ADDR_MAX (896 << 20) +#endif + static void __init reserve_crashkernel(void) { unsigned long long total_mem; @@ -520,10 +549,10 @@ static void __init reserve_crashkernel(void) const unsigned long long alignment = 16<<20; /* 16M */ /* - * kexec want bzImage is below DEFAULT_BZIMAGE_ADDR_MAX + * kexec want bzImage is below CRASH_KERNEL_ADDR_MAX */ crash_base = memblock_find_in_range(alignment, - DEFAULT_BZIMAGE_ADDR_MAX, crash_size, alignment); + CRASH_KERNEL_ADDR_MAX, crash_size, alignment); if (crash_base == MEMBLOCK_ERROR) { pr_info("crashkernel reservation failed - No suitable area found.\n"); @@ -590,28 +619,6 @@ void __init reserve_standard_io_resources(void) } -/* - * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by - * is_kdump_kernel() to determine if we are booting after a panic. Hence - * ifdef it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE. - */ - -#ifdef CONFIG_CRASH_DUMP -/* elfcorehdr= specifies the location of elf core header - * stored by the crashed kernel. This option will be passed - * by kexec loader to the capture kernel. - */ -static int __init setup_elfcorehdr(char *arg) -{ - char *end; - if (!arg) - return -EINVAL; - elfcorehdr_addr = memparse(arg, &end); - return end > arg ? 0 : -EINVAL; -} -early_param("elfcorehdr", setup_elfcorehdr); -#endif - static __init void reserve_ibft_region(void) { unsigned long addr, size = 0; @@ -669,15 +676,6 @@ static int __init parse_reservelow(char *p) early_param("reservelow", parse_reservelow); -static u64 __init get_max_mapped(void) -{ - u64 end = max_pfn_mapped; - - end <<= PAGE_SHIFT; - - return end; -} - /* * Determine if we were loaded by an EFI loader. If so, then we have also been * passed the efi memmap, systab, etc., so we should use these data structures @@ -693,10 +691,6 @@ static u64 __init get_max_mapped(void) void __init setup_arch(char **cmdline_p) { - int acpi = 0; - int k8 = 0; - unsigned long flags; - #ifdef CONFIG_X86_32 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); visws_early_detect(); @@ -769,6 +763,7 @@ void __init setup_arch(char **cmdline_p) x86_init.oem.arch_setup(); + iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1; setup_memory_map(); parse_setup_data(); /* update the e820_saved too */ @@ -910,6 +905,8 @@ void __init setup_arch(char **cmdline_p) */ reserve_brk(); + cleanup_highmap(); + memblock.current_limit = get_max_mapped(); memblock_x86_fill(); @@ -923,15 +920,8 @@ void __init setup_arch(char **cmdline_p) printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n", max_pfn_mapped<<PAGE_SHIFT); - reserve_trampoline_memory(); + setup_trampolines(); -#ifdef CONFIG_ACPI_SLEEP - /* - * Reserve low memory region for sleep support. - * even before init_memory_mapping - */ - acpi_reserve_wakeup_memory(); -#endif init_gbpages(); /* max_pfn_mapped is updated here */ @@ -972,19 +962,7 @@ void __init setup_arch(char **cmdline_p) early_acpi_boot_init(); -#ifdef CONFIG_ACPI_NUMA - /* - * Parse SRAT to discover nodes. - */ - acpi = acpi_numa_init(); -#endif - -#ifdef CONFIG_K8_NUMA - if (!acpi) - k8 = !k8_numa_init(0, max_pfn); -#endif - - initmem_init(0, max_pfn, acpi, k8); + initmem_init(); memblock_find_dma_reserve(); dma32_reserve_bootmem(); @@ -996,6 +974,11 @@ void __init setup_arch(char **cmdline_p) paging_init(); x86_init.paging.pagetable_setup_done(swapper_pg_dir); + if (boot_cpu_data.cpuid_level >= 0) { + /* A CPU has %cr4 if and only if it has CPUID */ + mmu_cr4_features = read_cr4(); + } + #ifdef CONFIG_X86_32 /* sync back kernel address range */ clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY, @@ -1017,8 +1000,8 @@ void __init setup_arch(char **cmdline_p) * Read APIC and some other early information from ACPI tables. */ acpi_boot_init(); - sfi_init(); + x86_dtb_init(); /* * get boot-time SMP configuration: @@ -1028,15 +1011,10 @@ void __init setup_arch(char **cmdline_p) prefill_possible_map(); -#ifdef CONFIG_X86_64 init_cpu_to_node(); -#endif init_apic_mappings(); - ioapic_init_mappings(); - - /* need to wait for io_apic is mapped */ - probe_nr_irqs_gsi(); + ioapic_and_gsi_init(); kvm_guest_init(); @@ -1057,11 +1035,11 @@ void __init setup_arch(char **cmdline_p) #endif x86_init.oem.banner(); + x86_init.timers.wallclock_init(); + mcheck_init(); - local_irq_save(flags); - arch_init_ideal_nop5(); - local_irq_restore(flags); + arch_init_ideal_nops(); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 002b79685f7..71f4727da37 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -225,10 +225,15 @@ void __init setup_per_cpu_areas(void) per_cpu(x86_bios_cpu_apicid, cpu) = early_per_cpu_map(x86_bios_cpu_apicid, cpu); #endif +#ifdef CONFIG_X86_32 + per_cpu(x86_cpu_to_logical_apicid, cpu) = + early_per_cpu_map(x86_cpu_to_logical_apicid, cpu); +#endif #ifdef CONFIG_X86_64 per_cpu(irq_stack_ptr, cpu) = per_cpu(irq_stack_union.irq_stack, cpu) + IRQ_STACK_SIZE - 64; +#endif #ifdef CONFIG_NUMA per_cpu(x86_cpu_to_node_map, cpu) = early_per_cpu_map(x86_cpu_to_node_map, cpu); @@ -242,7 +247,6 @@ void __init setup_per_cpu_areas(void) */ set_cpu_numa_node(cpu, early_cpu_to_node(cpu)); #endif -#endif /* * Up to this point, the boot CPU has been using .init.data * area. Reload any changed state for the boot CPU. @@ -256,7 +260,10 @@ void __init setup_per_cpu_areas(void) early_per_cpu_ptr(x86_cpu_to_apicid) = NULL; early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL; #endif -#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA) +#ifdef CONFIG_X86_32 + early_per_cpu_ptr(x86_cpu_to_logical_apicid) = NULL; +#endif +#ifdef CONFIG_NUMA early_per_cpu_ptr(x86_cpu_to_node_map) = NULL; #endif diff --git a/arch/x86/kernel/sfi.c b/arch/x86/kernel/sfi.c deleted file mode 100644 index dd4c281ffe5..00000000000 --- a/arch/x86/kernel/sfi.c +++ /dev/null @@ -1,120 +0,0 @@ -/* - * sfi.c - x86 architecture SFI support. - * - * Copyright (c) 2009, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. - * - */ - -#define KMSG_COMPONENT "SFI" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt - -#include <linux/acpi.h> -#include <linux/init.h> -#include <linux/sfi.h> -#include <linux/io.h> - -#include <asm/io_apic.h> -#include <asm/mpspec.h> -#include <asm/setup.h> -#include <asm/apic.h> - -#ifdef CONFIG_X86_LOCAL_APIC -static unsigned long sfi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; - -static void __init mp_sfi_register_lapic_address(unsigned long address) -{ - mp_lapic_addr = address; - - set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); - if (boot_cpu_physical_apicid == -1U) - boot_cpu_physical_apicid = read_apic_id(); - - pr_info("Boot CPU = %d\n", boot_cpu_physical_apicid); -} - -/* All CPUs enumerated by SFI must be present and enabled */ -static void __cpuinit mp_sfi_register_lapic(u8 id) -{ - if (MAX_APICS - id <= 0) { - pr_warning("Processor #%d invalid (max %d)\n", - id, MAX_APICS); - return; - } - - pr_info("registering lapic[%d]\n", id); - - generic_processor_info(id, GET_APIC_VERSION(apic_read(APIC_LVR))); -} - -static int __init sfi_parse_cpus(struct sfi_table_header *table) -{ - struct sfi_table_simple *sb; - struct sfi_cpu_table_entry *pentry; - int i; - int cpu_num; - - sb = (struct sfi_table_simple *)table; - cpu_num = SFI_GET_NUM_ENTRIES(sb, struct sfi_cpu_table_entry); - pentry = (struct sfi_cpu_table_entry *)sb->pentry; - - for (i = 0; i < cpu_num; i++) { - mp_sfi_register_lapic(pentry->apic_id); - pentry++; - } - - smp_found_config = 1; - return 0; -} -#endif /* CONFIG_X86_LOCAL_APIC */ - -#ifdef CONFIG_X86_IO_APIC - -static int __init sfi_parse_ioapic(struct sfi_table_header *table) -{ - struct sfi_table_simple *sb; - struct sfi_apic_table_entry *pentry; - int i, num; - - sb = (struct sfi_table_simple *)table; - num = SFI_GET_NUM_ENTRIES(sb, struct sfi_apic_table_entry); - pentry = (struct sfi_apic_table_entry *)sb->pentry; - - for (i = 0; i < num; i++) { - mp_register_ioapic(i, pentry->phys_addr, gsi_top); - pentry++; - } - - WARN(pic_mode, KERN_WARNING - "SFI: pic_mod shouldn't be 1 when IOAPIC table is present\n"); - pic_mode = 0; - return 0; -} -#endif /* CONFIG_X86_IO_APIC */ - -/* - * sfi_platform_init(): register lapics & io-apics - */ -int __init sfi_platform_init(void) -{ -#ifdef CONFIG_X86_LOCAL_APIC - mp_sfi_register_lapic_address(sfi_lapic_addr); - sfi_table_parse(SFI_SIG_CPUS, NULL, NULL, sfi_parse_cpus); -#endif -#ifdef CONFIG_X86_IO_APIC - sfi_table_parse(SFI_SIG_APIC, NULL, NULL, sfi_parse_ioapic); -#endif - return 0; -} diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 4fd173cd8e5..40a24932a8a 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -601,10 +601,7 @@ long sys_rt_sigreturn(struct pt_regs *regs) goto badframe; sigdelsetmask(&set, ~_BLOCKABLE); - spin_lock_irq(¤t->sighand->siglock); - current->blocked = set; - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); + set_current_blocked(&set); if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) goto badframe; @@ -682,6 +679,7 @@ static int handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, sigset_t *oldset, struct pt_regs *regs) { + sigset_t blocked; int ret; /* Are we from a system call? */ @@ -741,12 +739,10 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, */ regs->flags &= ~X86_EFLAGS_TF; - spin_lock_irq(¤t->sighand->siglock); - sigorsets(¤t->blocked, ¤t->blocked, &ka->sa.sa_mask); + sigorsets(&blocked, ¤t->blocked, &ka->sa.sa_mask); if (!(ka->sa.sa_flags & SA_NODEFER)) - sigaddset(¤t->blocked, sig); - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); + sigaddset(&blocked, sig); + set_current_blocked(&blocked); tracehook_signal_handler(sig, info, ka, regs, test_thread_flag(TIF_SINGLESTEP)); diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index d801210945d..013e7eba83b 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -159,10 +159,10 @@ asmlinkage void smp_reboot_interrupt(void) irq_exit(); } -static void native_smp_send_stop(void) +static void native_stop_other_cpus(int wait) { unsigned long flags; - unsigned long wait; + unsigned long timeout; if (reboot_force) return; @@ -179,9 +179,12 @@ static void native_smp_send_stop(void) if (num_online_cpus() > 1) { apic->send_IPI_allbutself(REBOOT_VECTOR); - /* Don't wait longer than a second */ - wait = USEC_PER_SEC; - while (num_online_cpus() > 1 && wait--) + /* + * Don't wait longer than a second if the caller + * didn't ask us to wait. + */ + timeout = USEC_PER_SEC; + while (num_online_cpus() > 1 && (wait || timeout--)) udelay(1); } @@ -191,14 +194,13 @@ static void native_smp_send_stop(void) } /* - * Reschedule call back. Nothing to do, - * all the work is done automatically when - * we return from the interrupt. + * Reschedule call back. */ void smp_reschedule_interrupt(struct pt_regs *regs) { ack_APIC_irq(); inc_irq_stat(irq_resched_count); + scheduler_ipi(); /* * KVM uses this interrupt to force a cpu out of guest mode */ @@ -227,7 +229,7 @@ struct smp_ops smp_ops = { .smp_prepare_cpus = native_smp_prepare_cpus, .smp_cpus_done = native_smp_cpus_done, - .smp_send_stop = native_smp_send_stop, + .stop_other_cpus = native_stop_other_cpus, .smp_send_reschedule = native_smp_send_reschedule, .cpu_up = native_cpu_up, diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 6af118511b4..a3c430bdfb6 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -64,6 +64,7 @@ #include <asm/mtrr.h> #include <asm/mwait.h> #include <asm/apic.h> +#include <asm/io_apic.h> #include <asm/setup.h> #include <asm/uv/uv.h> #include <linux/mc146818rtc.h> @@ -71,10 +72,6 @@ #include <asm/smpboot_hooks.h> #include <asm/i8259.h> -#ifdef CONFIG_X86_32 -u8 apicid_2_node[MAX_APICID]; -#endif - /* State of each CPU */ DEFINE_PER_CPU(int, cpu_state) = { 0 }; @@ -97,12 +94,12 @@ static DEFINE_PER_CPU(struct task_struct *, idle_thread_array); */ static DEFINE_MUTEX(x86_cpu_hotplug_driver_mutex); -void cpu_hotplug_driver_lock() +void cpu_hotplug_driver_lock(void) { mutex_lock(&x86_cpu_hotplug_driver_mutex); } -void cpu_hotplug_driver_unlock() +void cpu_hotplug_driver_unlock(void) { mutex_unlock(&x86_cpu_hotplug_driver_mutex); } @@ -130,68 +127,14 @@ EXPORT_PER_CPU_SYMBOL(cpu_sibling_map); DEFINE_PER_CPU(cpumask_var_t, cpu_core_map); EXPORT_PER_CPU_SYMBOL(cpu_core_map); +DEFINE_PER_CPU(cpumask_var_t, cpu_llc_shared_map); + /* Per CPU bogomips and other parameters */ DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info); EXPORT_PER_CPU_SYMBOL(cpu_info); atomic_t init_deasserted; -#if defined(CONFIG_NUMA) && defined(CONFIG_X86_32) -/* which node each logical CPU is on */ -int cpu_to_node_map[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0 }; -EXPORT_SYMBOL(cpu_to_node_map); - -/* set up a mapping between cpu and node. */ -static void map_cpu_to_node(int cpu, int node) -{ - printk(KERN_INFO "Mapping cpu %d to node %d\n", cpu, node); - cpumask_set_cpu(cpu, node_to_cpumask_map[node]); - cpu_to_node_map[cpu] = node; -} - -/* undo a mapping between cpu and node. */ -static void unmap_cpu_to_node(int cpu) -{ - int node; - - printk(KERN_INFO "Unmapping cpu %d from all nodes\n", cpu); - for (node = 0; node < MAX_NUMNODES; node++) - cpumask_clear_cpu(cpu, node_to_cpumask_map[node]); - cpu_to_node_map[cpu] = 0; -} -#else /* !(CONFIG_NUMA && CONFIG_X86_32) */ -#define map_cpu_to_node(cpu, node) ({}) -#define unmap_cpu_to_node(cpu) ({}) -#endif - -#ifdef CONFIG_X86_32 -static int boot_cpu_logical_apicid; - -u8 cpu_2_logical_apicid[NR_CPUS] __read_mostly = - { [0 ... NR_CPUS-1] = BAD_APICID }; - -static void map_cpu_to_logical_apicid(void) -{ - int cpu = smp_processor_id(); - int apicid = logical_smp_processor_id(); - int node = apic->apicid_to_node(apicid); - - if (!node_online(node)) - node = first_online_node; - - cpu_2_logical_apicid[cpu] = apicid; - map_cpu_to_node(cpu, node); -} - -void numa_remove_cpu(int cpu) -{ - cpu_2_logical_apicid[cpu] = BAD_APICID; - unmap_cpu_to_node(cpu); -} -#else -#define map_cpu_to_logical_apicid() do {} while (0) -#endif - /* * Report back to the Boot Processor. * Running on AP. @@ -259,7 +202,6 @@ static void __cpuinit smp_callin(void) apic->smp_callin_clear_local_apic(); setup_local_APIC(); end_local_APIC_setup(); - map_cpu_to_logical_apicid(); /* * Need to setup vector mappings before we enable interrupts. @@ -281,6 +223,13 @@ static void __cpuinit smp_callin(void) */ smp_store_cpu_info(cpuid); + /* + * This must be done before setting cpu_online_mask + * or calling notify_cpu_starting. + */ + set_cpu_sibling_map(raw_smp_processor_id()); + wmb(); + notify_cpu_starting(cpuid); /* @@ -316,16 +265,6 @@ notrace static void __cpuinit start_secondary(void *unused) */ check_tsc_sync_target(); - if (nmi_watchdog == NMI_IO_APIC) { - legacy_pic->mask(0); - enable_NMI_through_LVT0(); - legacy_pic->unmask(0); - } - - /* This must be done before setting cpu_online_mask */ - set_cpu_sibling_map(raw_smp_processor_id()); - wmb(); - /* * We need to hold call_lock, so there is no inconsistency * between the time smp_call_function() determines number of @@ -358,23 +297,6 @@ notrace static void __cpuinit start_secondary(void *unused) cpu_idle(); } -#ifdef CONFIG_CPUMASK_OFFSTACK -/* In this case, llc_shared_map is a pointer to a cpumask. */ -static inline void copy_cpuinfo_x86(struct cpuinfo_x86 *dst, - const struct cpuinfo_x86 *src) -{ - struct cpumask *llc = dst->llc_shared_map; - *dst = *src; - dst->llc_shared_map = llc; -} -#else -static inline void copy_cpuinfo_x86(struct cpuinfo_x86 *dst, - const struct cpuinfo_x86 *src) -{ - *dst = *src; -} -#endif /* CONFIG_CPUMASK_OFFSTACK */ - /* * The bootstrap kernel entry code has set these up. Save them for * a given CPU @@ -384,7 +306,7 @@ void __cpuinit smp_store_cpu_info(int id) { struct cpuinfo_x86 *c = &cpu_data(id); - copy_cpuinfo_x86(c, &boot_cpu_data); + *c = boot_cpu_data; c->cpu_index = id; if (id != 0) identify_secondary_cpu(c); @@ -392,15 +314,12 @@ void __cpuinit smp_store_cpu_info(int id) static void __cpuinit link_thread_siblings(int cpu1, int cpu2) { - struct cpuinfo_x86 *c1 = &cpu_data(cpu1); - struct cpuinfo_x86 *c2 = &cpu_data(cpu2); - cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2)); cpumask_set_cpu(cpu2, cpu_sibling_mask(cpu1)); cpumask_set_cpu(cpu1, cpu_core_mask(cpu2)); cpumask_set_cpu(cpu2, cpu_core_mask(cpu1)); - cpumask_set_cpu(cpu1, c2->llc_shared_map); - cpumask_set_cpu(cpu2, c1->llc_shared_map); + cpumask_set_cpu(cpu1, cpu_llc_shared_mask(cpu2)); + cpumask_set_cpu(cpu2, cpu_llc_shared_mask(cpu1)); } @@ -417,6 +336,7 @@ void __cpuinit set_cpu_sibling_map(int cpu) if (cpu_has(c, X86_FEATURE_TOPOEXT)) { if (c->phys_proc_id == o->phys_proc_id && + per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i) && c->compute_unit_id == o->compute_unit_id) link_thread_siblings(cpu, i); } else if (c->phys_proc_id == o->phys_proc_id && @@ -428,9 +348,9 @@ void __cpuinit set_cpu_sibling_map(int cpu) cpumask_set_cpu(cpu, cpu_sibling_mask(cpu)); } - cpumask_set_cpu(cpu, c->llc_shared_map); + cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu)); - if (current_cpu_data.x86_max_cores == 1) { + if (__this_cpu_read(cpu_info.x86_max_cores) == 1) { cpumask_copy(cpu_core_mask(cpu), cpu_sibling_mask(cpu)); c->booted_cores = 1; return; @@ -439,8 +359,8 @@ void __cpuinit set_cpu_sibling_map(int cpu) for_each_cpu(i, cpu_sibling_setup_mask) { if (per_cpu(cpu_llc_id, cpu) != BAD_APICID && per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) { - cpumask_set_cpu(i, c->llc_shared_map); - cpumask_set_cpu(cpu, cpu_data(i).llc_shared_map); + cpumask_set_cpu(i, cpu_llc_shared_mask(cpu)); + cpumask_set_cpu(cpu, cpu_llc_shared_mask(i)); } if (c->phys_proc_id == cpu_data(i).phys_proc_id) { cpumask_set_cpu(i, cpu_core_mask(cpu)); @@ -479,7 +399,7 @@ const struct cpumask *cpu_coregroup_mask(int cpu) !(cpu_has(c, X86_FEATURE_AMD_DCM))) return cpu_core_mask(cpu); else - return c->llc_shared_map; + return cpu_llc_shared_mask(cpu); } static void impress_friends(void) @@ -641,7 +561,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) * target processor state. */ startup_ipi_hook(phys_apicid, (unsigned long) start_secondary, - (unsigned long)stack_start.sp); + stack_start); /* * Run STARTUP IPI loop. @@ -747,7 +667,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu) .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done), }; - INIT_WORK_ON_STACK(&c_idle.work, do_fork_idle); + INIT_WORK_ONSTACK(&c_idle.work, do_fork_idle); alternatives_smp_switch(1); @@ -788,10 +708,10 @@ do_rest: #endif early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); initial_code = (unsigned long)start_secondary; - stack_start.sp = (void *) c_idle.idle->thread.sp; + stack_start = c_idle.idle->thread.sp; /* start_ip had better be page-aligned! */ - start_ip = setup_trampoline(); + start_ip = trampoline_address(); /* So we see what's up */ announce_cpu(cpu, apicid); @@ -801,6 +721,8 @@ do_rest: * the targeted processor. */ + printk(KERN_DEBUG "smpboot cpu %d: start_ip = %lx\n", cpu, start_ip); + atomic_set(&init_deasserted, 0); if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { @@ -854,8 +776,8 @@ do_rest: pr_debug("CPU%d: has booted.\n", cpu); else { boot_error = 1; - if (*((volatile unsigned char *)trampoline_base) - == 0xA5) + if (*(volatile u32 *)TRAMPOLINE_SYM(trampoline_status) + == 0xA5A5A5A5) /* trampoline started but...? */ pr_err("CPU%d: Stuck ??\n", cpu); else @@ -881,7 +803,7 @@ do_rest: } /* mark "stuck" area as not stuck */ - *((volatile unsigned long *)trampoline_base) = 0; + *(volatile u32 *)TRAMPOLINE_SYM(trampoline_status) = 0; if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { /* @@ -948,6 +870,14 @@ int __cpuinit native_cpu_up(unsigned int cpu) return 0; } +/** + * arch_disable_smp_support() - disables SMP support for x86 at runtime + */ +void arch_disable_smp_support(void) +{ + disable_ioapic_support(); +} + /* * Fall back to non SMP mode after errors. * @@ -963,7 +893,6 @@ static __init void disable_smp(void) physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); else physid_set_mask_of_physid(0, &phys_cpu_present_map); - map_cpu_to_logical_apicid(); cpumask_set_cpu(0, cpu_sibling_mask(0)); cpumask_set_cpu(0, cpu_core_mask(0)); } @@ -1048,7 +977,7 @@ static int __init smp_sanity_check(unsigned max_cpus) "(tell your hw vendor)\n"); } smpboot_clear_io_apic(); - arch_disable_smp_support(); + disable_ioapic_support(); return -1; } @@ -1061,11 +990,9 @@ static int __init smp_sanity_check(unsigned max_cpus) printk(KERN_INFO "SMP mode deactivated.\n"); smpboot_clear_io_apic(); - localise_nmi_watchdog(); - connect_bsp_APIC(); setup_local_APIC(); - end_local_APIC_setup(); + bsp_end_local_APIC_setup(); return -1; } @@ -1094,21 +1021,19 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) preempt_disable(); smp_cpu_index_default(); - current_cpu_data = boot_cpu_data; - cpumask_copy(cpu_callin_mask, cpumask_of(0)); - mb(); + /* * Setup boot CPU information */ smp_store_cpu_info(0); /* Final full version of the data */ -#ifdef CONFIG_X86_32 - boot_cpu_logical_apicid = logical_smp_processor_id(); -#endif + cpumask_copy(cpu_callin_mask, cpumask_of(0)); + mb(); + current_thread_info()->cpu = 0; /* needed? */ for_each_possible_cpu(i) { zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); - zalloc_cpumask_var(&cpu_data(i).llc_shared_map, GFP_KERNEL); + zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL); } set_cpu_sibling_map(0); @@ -1142,9 +1067,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) if (!skip_ioapic_setup && nr_ioapics) enable_IO_APIC(); - end_local_APIC_setup(); - - map_cpu_to_logical_apicid(); + bsp_end_local_APIC_setup(); if (apic->setup_portio_remap) apic->setup_portio_remap(); @@ -1166,6 +1089,20 @@ out: preempt_enable(); } +void arch_disable_nonboot_cpus_begin(void) +{ + /* + * Avoid the smp alternatives switch during the disable_nonboot_cpus(). + * In the suspend path, we will be back in the SMP mode shortly anyways. + */ + skip_smp_alternatives = true; +} + +void arch_disable_nonboot_cpus_end(void) +{ + skip_smp_alternatives = false; +} + void arch_enable_nonboot_cpus_begin(void) { set_mtrr_aps_delayed_init(); @@ -1196,7 +1133,6 @@ void __init native_smp_cpus_done(unsigned int max_cpus) #ifdef CONFIG_X86_IO_APIC setup_ioapic_dest(); #endif - check_nmi_watchdog(); mtrr_aps_init(); } @@ -1341,8 +1277,6 @@ int native_cpu_disable(void) if (cpu == 0) return -EBUSY; - if (nmi_watchdog == NMI_LOCAL_APIC) - stop_apic_nmi_watchdog(NULL); clear_local_APIC(); cpu_disable_common(); @@ -1373,12 +1307,11 @@ void play_dead_common(void) { idle_task_exit(); reset_lazy_tlbstate(); - irq_ctx_exit(raw_smp_processor_id()); c1e_remove_cpu(raw_smp_processor_id()); mb(); /* Ack it */ - __get_cpu_var(cpu_state) = CPU_DEAD; + __this_cpu_write(cpu_state, CPU_DEAD); /* * With physical CPU hotplug, we should halt the cpu @@ -1397,12 +1330,13 @@ static inline void mwait_play_dead(void) unsigned int highest_subcstate = 0; int i; void *mwait_ptr; + struct cpuinfo_x86 *c = __this_cpu_ptr(&cpu_info); - if (!cpu_has(¤t_cpu_data, X86_FEATURE_MWAIT)) + if (!this_cpu_has(X86_FEATURE_MWAIT) && mwait_usable(c)) return; - if (!cpu_has(¤t_cpu_data, X86_FEATURE_CLFLSH)) + if (!this_cpu_has(X86_FEATURE_CLFLSH)) return; - if (current_cpu_data.cpuid_level < CPUID_MWAIT_LEAF) + if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF) return; eax = CPUID_MWAIT_LEAF; @@ -1453,7 +1387,7 @@ static inline void mwait_play_dead(void) static inline void hlt_play_dead(void) { - if (current_cpu_data.x86 >= 4) + if (__this_cpu_read(cpu_info.x86) >= 4) wbinvd(); while (1) { diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index b53c525368a..55d9bc03f69 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -9,15 +9,6 @@ #include <linux/uaccess.h> #include <asm/stacktrace.h> -static void save_stack_warning(void *data, char *msg) -{ -} - -static void -save_stack_warning_symbol(void *data, char *msg, unsigned long symbol) -{ -} - static int save_stack_stack(void *data, char *name) { return 0; @@ -53,16 +44,12 @@ save_stack_address_nosched(void *data, unsigned long addr, int reliable) } static const struct stacktrace_ops save_stack_ops = { - .warning = save_stack_warning, - .warning_symbol = save_stack_warning_symbol, .stack = save_stack_stack, .address = save_stack_address, .walk_stack = print_context_stack, }; static const struct stacktrace_ops save_stack_ops_nosched = { - .warning = save_stack_warning, - .warning_symbol = save_stack_warning_symbol, .stack = save_stack_stack, .address = save_stack_address_nosched, .walk_stack = print_context_stack, @@ -79,9 +66,9 @@ void save_stack_trace(struct stack_trace *trace) } EXPORT_SYMBOL_GPL(save_stack_trace); -void save_stack_trace_bp(struct stack_trace *trace, unsigned long bp) +void save_stack_trace_regs(struct stack_trace *trace, struct pt_regs *regs) { - dump_trace(current, NULL, NULL, bp, &save_stack_ops, trace); + dump_trace(current, regs, NULL, 0, &save_stack_ops, trace); if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = ULONG_MAX; } diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index 58de45ee08b..7977f0cfe33 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -166,7 +166,7 @@ static void enable_step(struct task_struct *child, bool block) * Make sure block stepping (BTF) is not enabled unless it should be. * Note that we don't try to worry about any is_setting_trap_flag() * instructions after the first when using block stepping. - * So noone should try to use debugger block stepping in a program + * So no one should try to use debugger block stepping in a program * that uses user-mode single stepping itself. */ if (enable_single_step(child) && block) { diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index b35786dc9b8..32cbffb0c49 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -340,3 +340,8 @@ ENTRY(sys_call_table) .long sys_fanotify_init .long sys_fanotify_mark .long sys_prlimit64 /* 340 */ + .long sys_name_to_handle_at + .long sys_open_by_handle_at + .long sys_clock_adjtime + .long sys_syncfs + .long sys_sendmmsg /* 345 */ diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index c2f1b26141e..998e972f3b1 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -133,7 +133,7 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn, pmd = pmd_alloc(&tboot_mm, pud, vaddr); if (!pmd) return -1; - pte = pte_alloc_map(&tboot_mm, pmd, vaddr); + pte = pte_alloc_map(&tboot_mm, NULL, pmd, vaddr); if (!pte) return -1; set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot)); diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index fb5cc5e14cf..25a28a24593 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -22,10 +22,6 @@ #include <asm/hpet.h> #include <asm/time.h> -#if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC) -int timer_ack; -#endif - #ifdef CONFIG_X86_64 volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; #endif @@ -63,20 +59,6 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id) /* Keep nmi watchdog up to date */ inc_irq_stat(irq0_irqs); - /* Optimized out for !IO_APIC and x86_64 */ - if (timer_ack) { - /* - * Subtle, when I/O APICs are used we have to ack timer IRQ - * manually to deassert NMI lines for the watchdog if run - * on an 82489DX-based system. - */ - raw_spin_lock(&i8259A_lock); - outb(0x0c, PIC_MASTER_OCW3); - /* Ack the IRQ; AEOI will end it automatically. */ - inb(PIC_MASTER_POLL); - raw_spin_unlock(&i8259A_lock); - } - global_clock_event->event_handler(global_clock_event); /* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */ diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c deleted file mode 100644 index 20ea20a39e2..00000000000 --- a/arch/x86/kernel/tlb_uv.c +++ /dev/null @@ -1,1661 +0,0 @@ -/* - * SGI UltraViolet TLB flush routines. - * - * (c) 2008-2010 Cliff Wickman <cpw@sgi.com>, SGI. - * - * This code is released under the GNU General Public License version 2 or - * later. - */ -#include <linux/seq_file.h> -#include <linux/proc_fs.h> -#include <linux/debugfs.h> -#include <linux/kernel.h> -#include <linux/slab.h> - -#include <asm/mmu_context.h> -#include <asm/uv/uv.h> -#include <asm/uv/uv_mmrs.h> -#include <asm/uv/uv_hub.h> -#include <asm/uv/uv_bau.h> -#include <asm/apic.h> -#include <asm/idle.h> -#include <asm/tsc.h> -#include <asm/irq_vectors.h> -#include <asm/timer.h> - -/* timeouts in nanoseconds (indexed by UVH_AGING_PRESCALE_SEL urgency7 30:28) */ -static int timeout_base_ns[] = { - 20, - 160, - 1280, - 10240, - 81920, - 655360, - 5242880, - 167772160 -}; -static int timeout_us; -static int nobau; -static int baudisabled; -static spinlock_t disable_lock; -static cycles_t congested_cycles; - -/* tunables: */ -static int max_bau_concurrent = MAX_BAU_CONCURRENT; -static int max_bau_concurrent_constant = MAX_BAU_CONCURRENT; -static int plugged_delay = PLUGGED_DELAY; -static int plugsb4reset = PLUGSB4RESET; -static int timeoutsb4reset = TIMEOUTSB4RESET; -static int ipi_reset_limit = IPI_RESET_LIMIT; -static int complete_threshold = COMPLETE_THRESHOLD; -static int congested_response_us = CONGESTED_RESPONSE_US; -static int congested_reps = CONGESTED_REPS; -static int congested_period = CONGESTED_PERIOD; -static struct dentry *tunables_dir; -static struct dentry *tunables_file; - -static int __init setup_nobau(char *arg) -{ - nobau = 1; - return 0; -} -early_param("nobau", setup_nobau); - -/* base pnode in this partition */ -static int uv_partition_base_pnode __read_mostly; -/* position of pnode (which is nasid>>1): */ -static int uv_nshift __read_mostly; -static unsigned long uv_mmask __read_mostly; - -static DEFINE_PER_CPU(struct ptc_stats, ptcstats); -static DEFINE_PER_CPU(struct bau_control, bau_control); -static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask); - -/* - * Determine the first node on a uvhub. 'Nodes' are used for kernel - * memory allocation. - */ -static int __init uvhub_to_first_node(int uvhub) -{ - int node, b; - - for_each_online_node(node) { - b = uv_node_to_blade_id(node); - if (uvhub == b) - return node; - } - return -1; -} - -/* - * Determine the apicid of the first cpu on a uvhub. - */ -static int __init uvhub_to_first_apicid(int uvhub) -{ - int cpu; - - for_each_present_cpu(cpu) - if (uvhub == uv_cpu_to_blade_id(cpu)) - return per_cpu(x86_cpu_to_apicid, cpu); - return -1; -} - -/* - * Free a software acknowledge hardware resource by clearing its Pending - * bit. This will return a reply to the sender. - * If the message has timed out, a reply has already been sent by the - * hardware but the resource has not been released. In that case our - * clear of the Timeout bit (as well) will free the resource. No reply will - * be sent (the hardware will only do one reply per message). - */ -static inline void uv_reply_to_message(struct msg_desc *mdp, - struct bau_control *bcp) -{ - unsigned long dw; - struct bau_payload_queue_entry *msg; - - msg = mdp->msg; - if (!msg->canceled) { - dw = (msg->sw_ack_vector << UV_SW_ACK_NPENDING) | - msg->sw_ack_vector; - uv_write_local_mmr( - UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, dw); - } - msg->replied_to = 1; - msg->sw_ack_vector = 0; -} - -/* - * Process the receipt of a RETRY message - */ -static inline void uv_bau_process_retry_msg(struct msg_desc *mdp, - struct bau_control *bcp) -{ - int i; - int cancel_count = 0; - int slot2; - unsigned long msg_res; - unsigned long mmr = 0; - struct bau_payload_queue_entry *msg; - struct bau_payload_queue_entry *msg2; - struct ptc_stats *stat; - - msg = mdp->msg; - stat = bcp->statp; - stat->d_retries++; - /* - * cancel any message from msg+1 to the retry itself - */ - for (msg2 = msg+1, i = 0; i < DEST_Q_SIZE; msg2++, i++) { - if (msg2 > mdp->va_queue_last) - msg2 = mdp->va_queue_first; - if (msg2 == msg) - break; - - /* same conditions for cancellation as uv_do_reset */ - if ((msg2->replied_to == 0) && (msg2->canceled == 0) && - (msg2->sw_ack_vector) && ((msg2->sw_ack_vector & - msg->sw_ack_vector) == 0) && - (msg2->sending_cpu == msg->sending_cpu) && - (msg2->msg_type != MSG_NOOP)) { - slot2 = msg2 - mdp->va_queue_first; - mmr = uv_read_local_mmr - (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE); - msg_res = msg2->sw_ack_vector; - /* - * This is a message retry; clear the resources held - * by the previous message only if they timed out. - * If it has not timed out we have an unexpected - * situation to report. - */ - if (mmr & (msg_res << UV_SW_ACK_NPENDING)) { - /* - * is the resource timed out? - * make everyone ignore the cancelled message. - */ - msg2->canceled = 1; - stat->d_canceled++; - cancel_count++; - uv_write_local_mmr( - UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, - (msg_res << UV_SW_ACK_NPENDING) | - msg_res); - } - } - } - if (!cancel_count) - stat->d_nocanceled++; -} - -/* - * Do all the things a cpu should do for a TLB shootdown message. - * Other cpu's may come here at the same time for this message. - */ -static void uv_bau_process_message(struct msg_desc *mdp, - struct bau_control *bcp) -{ - int msg_ack_count; - short socket_ack_count = 0; - struct ptc_stats *stat; - struct bau_payload_queue_entry *msg; - struct bau_control *smaster = bcp->socket_master; - - /* - * This must be a normal message, or retry of a normal message - */ - msg = mdp->msg; - stat = bcp->statp; - if (msg->address == TLB_FLUSH_ALL) { - local_flush_tlb(); - stat->d_alltlb++; - } else { - __flush_tlb_one(msg->address); - stat->d_onetlb++; - } - stat->d_requestee++; - - /* - * One cpu on each uvhub has the additional job on a RETRY - * of releasing the resource held by the message that is - * being retried. That message is identified by sending - * cpu number. - */ - if (msg->msg_type == MSG_RETRY && bcp == bcp->uvhub_master) - uv_bau_process_retry_msg(mdp, bcp); - - /* - * This is a sw_ack message, so we have to reply to it. - * Count each responding cpu on the socket. This avoids - * pinging the count's cache line back and forth between - * the sockets. - */ - socket_ack_count = atomic_add_short_return(1, (struct atomic_short *) - &smaster->socket_acknowledge_count[mdp->msg_slot]); - if (socket_ack_count == bcp->cpus_in_socket) { - /* - * Both sockets dump their completed count total into - * the message's count. - */ - smaster->socket_acknowledge_count[mdp->msg_slot] = 0; - msg_ack_count = atomic_add_short_return(socket_ack_count, - (struct atomic_short *)&msg->acknowledge_count); - - if (msg_ack_count == bcp->cpus_in_uvhub) { - /* - * All cpus in uvhub saw it; reply - */ - uv_reply_to_message(mdp, bcp); - } - } - - return; -} - -/* - * Determine the first cpu on a uvhub. - */ -static int uvhub_to_first_cpu(int uvhub) -{ - int cpu; - for_each_present_cpu(cpu) - if (uvhub == uv_cpu_to_blade_id(cpu)) - return cpu; - return -1; -} - -/* - * Last resort when we get a large number of destination timeouts is - * to clear resources held by a given cpu. - * Do this with IPI so that all messages in the BAU message queue - * can be identified by their nonzero sw_ack_vector field. - * - * This is entered for a single cpu on the uvhub. - * The sender want's this uvhub to free a specific message's - * sw_ack resources. - */ -static void -uv_do_reset(void *ptr) -{ - int i; - int slot; - int count = 0; - unsigned long mmr; - unsigned long msg_res; - struct bau_control *bcp; - struct reset_args *rap; - struct bau_payload_queue_entry *msg; - struct ptc_stats *stat; - - bcp = &per_cpu(bau_control, smp_processor_id()); - rap = (struct reset_args *)ptr; - stat = bcp->statp; - stat->d_resets++; - - /* - * We're looking for the given sender, and - * will free its sw_ack resource. - * If all cpu's finally responded after the timeout, its - * message 'replied_to' was set. - */ - for (msg = bcp->va_queue_first, i = 0; i < DEST_Q_SIZE; msg++, i++) { - /* uv_do_reset: same conditions for cancellation as - uv_bau_process_retry_msg() */ - if ((msg->replied_to == 0) && - (msg->canceled == 0) && - (msg->sending_cpu == rap->sender) && - (msg->sw_ack_vector) && - (msg->msg_type != MSG_NOOP)) { - /* - * make everyone else ignore this message - */ - msg->canceled = 1; - slot = msg - bcp->va_queue_first; - count++; - /* - * only reset the resource if it is still pending - */ - mmr = uv_read_local_mmr - (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE); - msg_res = msg->sw_ack_vector; - if (mmr & msg_res) { - stat->d_rcanceled++; - uv_write_local_mmr( - UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, - (msg_res << UV_SW_ACK_NPENDING) | - msg_res); - } - } - } - return; -} - -/* - * Use IPI to get all target uvhubs to release resources held by - * a given sending cpu number. - */ -static void uv_reset_with_ipi(struct bau_target_uvhubmask *distribution, - int sender) -{ - int uvhub; - int cpu; - cpumask_t mask; - struct reset_args reset_args; - - reset_args.sender = sender; - - cpus_clear(mask); - /* find a single cpu for each uvhub in this distribution mask */ - for (uvhub = 0; - uvhub < sizeof(struct bau_target_uvhubmask) * BITSPERBYTE; - uvhub++) { - if (!bau_uvhub_isset(uvhub, distribution)) - continue; - /* find a cpu for this uvhub */ - cpu = uvhub_to_first_cpu(uvhub); - cpu_set(cpu, mask); - } - /* IPI all cpus; Preemption is already disabled */ - smp_call_function_many(&mask, uv_do_reset, (void *)&reset_args, 1); - return; -} - -static inline unsigned long -cycles_2_us(unsigned long long cyc) -{ - unsigned long long ns; - unsigned long us; - ns = (cyc * per_cpu(cyc2ns, smp_processor_id())) - >> CYC2NS_SCALE_FACTOR; - us = ns / 1000; - return us; -} - -/* - * wait for all cpus on this hub to finish their sends and go quiet - * leaves uvhub_quiesce set so that no new broadcasts are started by - * bau_flush_send_and_wait() - */ -static inline void -quiesce_local_uvhub(struct bau_control *hmaster) -{ - atomic_add_short_return(1, (struct atomic_short *) - &hmaster->uvhub_quiesce); -} - -/* - * mark this quiet-requestor as done - */ -static inline void -end_uvhub_quiesce(struct bau_control *hmaster) -{ - atomic_add_short_return(-1, (struct atomic_short *) - &hmaster->uvhub_quiesce); -} - -/* - * Wait for completion of a broadcast software ack message - * return COMPLETE, RETRY(PLUGGED or TIMEOUT) or GIVEUP - */ -static int uv_wait_completion(struct bau_desc *bau_desc, - unsigned long mmr_offset, int right_shift, int this_cpu, - struct bau_control *bcp, struct bau_control *smaster, long try) -{ - unsigned long descriptor_status; - cycles_t ttime; - struct ptc_stats *stat = bcp->statp; - struct bau_control *hmaster; - - hmaster = bcp->uvhub_master; - - /* spin on the status MMR, waiting for it to go idle */ - while ((descriptor_status = (((unsigned long) - uv_read_local_mmr(mmr_offset) >> - right_shift) & UV_ACT_STATUS_MASK)) != - DESC_STATUS_IDLE) { - /* - * Our software ack messages may be blocked because there are - * no swack resources available. As long as none of them - * has timed out hardware will NACK our message and its - * state will stay IDLE. - */ - if (descriptor_status == DESC_STATUS_SOURCE_TIMEOUT) { - stat->s_stimeout++; - return FLUSH_GIVEUP; - } else if (descriptor_status == - DESC_STATUS_DESTINATION_TIMEOUT) { - stat->s_dtimeout++; - ttime = get_cycles(); - - /* - * Our retries may be blocked by all destination - * swack resources being consumed, and a timeout - * pending. In that case hardware returns the - * ERROR that looks like a destination timeout. - */ - if (cycles_2_us(ttime - bcp->send_message) < - timeout_us) { - bcp->conseccompletes = 0; - return FLUSH_RETRY_PLUGGED; - } - - bcp->conseccompletes = 0; - return FLUSH_RETRY_TIMEOUT; - } else { - /* - * descriptor_status is still BUSY - */ - cpu_relax(); - } - } - bcp->conseccompletes++; - return FLUSH_COMPLETE; -} - -static inline cycles_t -sec_2_cycles(unsigned long sec) -{ - unsigned long ns; - cycles_t cyc; - - ns = sec * 1000000000; - cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id())); - return cyc; -} - -/* - * conditionally add 1 to *v, unless *v is >= u - * return 0 if we cannot add 1 to *v because it is >= u - * return 1 if we can add 1 to *v because it is < u - * the add is atomic - * - * This is close to atomic_add_unless(), but this allows the 'u' value - * to be lowered below the current 'v'. atomic_add_unless can only stop - * on equal. - */ -static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u) -{ - spin_lock(lock); - if (atomic_read(v) >= u) { - spin_unlock(lock); - return 0; - } - atomic_inc(v); - spin_unlock(lock); - return 1; -} - -/* - * Our retries are blocked by all destination swack resources being - * in use, and a timeout is pending. In that case hardware immediately - * returns the ERROR that looks like a destination timeout. - */ -static void -destination_plugged(struct bau_desc *bau_desc, struct bau_control *bcp, - struct bau_control *hmaster, struct ptc_stats *stat) -{ - udelay(bcp->plugged_delay); - bcp->plugged_tries++; - if (bcp->plugged_tries >= bcp->plugsb4reset) { - bcp->plugged_tries = 0; - quiesce_local_uvhub(hmaster); - spin_lock(&hmaster->queue_lock); - uv_reset_with_ipi(&bau_desc->distribution, bcp->cpu); - spin_unlock(&hmaster->queue_lock); - end_uvhub_quiesce(hmaster); - bcp->ipi_attempts++; - stat->s_resets_plug++; - } -} - -static void -destination_timeout(struct bau_desc *bau_desc, struct bau_control *bcp, - struct bau_control *hmaster, struct ptc_stats *stat) -{ - hmaster->max_bau_concurrent = 1; - bcp->timeout_tries++; - if (bcp->timeout_tries >= bcp->timeoutsb4reset) { - bcp->timeout_tries = 0; - quiesce_local_uvhub(hmaster); - spin_lock(&hmaster->queue_lock); - uv_reset_with_ipi(&bau_desc->distribution, bcp->cpu); - spin_unlock(&hmaster->queue_lock); - end_uvhub_quiesce(hmaster); - bcp->ipi_attempts++; - stat->s_resets_timeout++; - } -} - -/* - * Completions are taking a very long time due to a congested numalink - * network. - */ -static void -disable_for_congestion(struct bau_control *bcp, struct ptc_stats *stat) -{ - int tcpu; - struct bau_control *tbcp; - - /* let only one cpu do this disabling */ - spin_lock(&disable_lock); - if (!baudisabled && bcp->period_requests && - ((bcp->period_time / bcp->period_requests) > congested_cycles)) { - /* it becomes this cpu's job to turn on the use of the - BAU again */ - baudisabled = 1; - bcp->set_bau_off = 1; - bcp->set_bau_on_time = get_cycles() + - sec_2_cycles(bcp->congested_period); - stat->s_bau_disabled++; - for_each_present_cpu(tcpu) { - tbcp = &per_cpu(bau_control, tcpu); - tbcp->baudisabled = 1; - } - } - spin_unlock(&disable_lock); -} - -/** - * uv_flush_send_and_wait - * - * Send a broadcast and wait for it to complete. - * - * The flush_mask contains the cpus the broadcast is to be sent to including - * cpus that are on the local uvhub. - * - * Returns 0 if all flushing represented in the mask was done. - * Returns 1 if it gives up entirely and the original cpu mask is to be - * returned to the kernel. - */ -int uv_flush_send_and_wait(struct bau_desc *bau_desc, - struct cpumask *flush_mask, struct bau_control *bcp) -{ - int right_shift; - int completion_status = 0; - int seq_number = 0; - long try = 0; - int cpu = bcp->uvhub_cpu; - int this_cpu = bcp->cpu; - unsigned long mmr_offset; - unsigned long index; - cycles_t time1; - cycles_t time2; - cycles_t elapsed; - struct ptc_stats *stat = bcp->statp; - struct bau_control *smaster = bcp->socket_master; - struct bau_control *hmaster = bcp->uvhub_master; - - if (!atomic_inc_unless_ge(&hmaster->uvhub_lock, - &hmaster->active_descriptor_count, - hmaster->max_bau_concurrent)) { - stat->s_throttles++; - do { - cpu_relax(); - } while (!atomic_inc_unless_ge(&hmaster->uvhub_lock, - &hmaster->active_descriptor_count, - hmaster->max_bau_concurrent)); - } - while (hmaster->uvhub_quiesce) - cpu_relax(); - - if (cpu < UV_CPUS_PER_ACT_STATUS) { - mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0; - right_shift = cpu * UV_ACT_STATUS_SIZE; - } else { - mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1; - right_shift = - ((cpu - UV_CPUS_PER_ACT_STATUS) * UV_ACT_STATUS_SIZE); - } - time1 = get_cycles(); - do { - if (try == 0) { - bau_desc->header.msg_type = MSG_REGULAR; - seq_number = bcp->message_number++; - } else { - bau_desc->header.msg_type = MSG_RETRY; - stat->s_retry_messages++; - } - bau_desc->header.sequence = seq_number; - index = (1UL << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) | - bcp->uvhub_cpu; - bcp->send_message = get_cycles(); - uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index); - try++; - completion_status = uv_wait_completion(bau_desc, mmr_offset, - right_shift, this_cpu, bcp, smaster, try); - - if (completion_status == FLUSH_RETRY_PLUGGED) { - destination_plugged(bau_desc, bcp, hmaster, stat); - } else if (completion_status == FLUSH_RETRY_TIMEOUT) { - destination_timeout(bau_desc, bcp, hmaster, stat); - } - if (bcp->ipi_attempts >= bcp->ipi_reset_limit) { - bcp->ipi_attempts = 0; - completion_status = FLUSH_GIVEUP; - break; - } - cpu_relax(); - } while ((completion_status == FLUSH_RETRY_PLUGGED) || - (completion_status == FLUSH_RETRY_TIMEOUT)); - time2 = get_cycles(); - bcp->plugged_tries = 0; - bcp->timeout_tries = 0; - if ((completion_status == FLUSH_COMPLETE) && - (bcp->conseccompletes > bcp->complete_threshold) && - (hmaster->max_bau_concurrent < - hmaster->max_bau_concurrent_constant)) - hmaster->max_bau_concurrent++; - while (hmaster->uvhub_quiesce) - cpu_relax(); - atomic_dec(&hmaster->active_descriptor_count); - if (time2 > time1) { - elapsed = time2 - time1; - stat->s_time += elapsed; - if ((completion_status == FLUSH_COMPLETE) && (try == 1)) { - bcp->period_requests++; - bcp->period_time += elapsed; - if ((elapsed > congested_cycles) && - (bcp->period_requests > bcp->congested_reps)) { - disable_for_congestion(bcp, stat); - } - } - } else - stat->s_requestor--; - if (completion_status == FLUSH_COMPLETE && try > 1) - stat->s_retriesok++; - else if (completion_status == FLUSH_GIVEUP) { - stat->s_giveup++; - return 1; - } - return 0; -} - -/** - * uv_flush_tlb_others - globally purge translation cache of a virtual - * address or all TLB's - * @cpumask: mask of all cpu's in which the address is to be removed - * @mm: mm_struct containing virtual address range - * @va: virtual address to be removed (or TLB_FLUSH_ALL for all TLB's on cpu) - * @cpu: the current cpu - * - * This is the entry point for initiating any UV global TLB shootdown. - * - * Purges the translation caches of all specified processors of the given - * virtual address, or purges all TLB's on specified processors. - * - * The caller has derived the cpumask from the mm_struct. This function - * is called only if there are bits set in the mask. (e.g. flush_tlb_page()) - * - * The cpumask is converted into a uvhubmask of the uvhubs containing - * those cpus. - * - * Note that this function should be called with preemption disabled. - * - * Returns NULL if all remote flushing was done. - * Returns pointer to cpumask if some remote flushing remains to be - * done. The returned pointer is valid till preemption is re-enabled. - */ -const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, - struct mm_struct *mm, - unsigned long va, unsigned int cpu) -{ - int tcpu; - int uvhub; - int locals = 0; - int remotes = 0; - int hubs = 0; - struct bau_desc *bau_desc; - struct cpumask *flush_mask; - struct ptc_stats *stat; - struct bau_control *bcp; - struct bau_control *tbcp; - - /* kernel was booted 'nobau' */ - if (nobau) - return cpumask; - - bcp = &per_cpu(bau_control, cpu); - stat = bcp->statp; - - /* bau was disabled due to slow response */ - if (bcp->baudisabled) { - /* the cpu that disabled it must re-enable it */ - if (bcp->set_bau_off) { - if (get_cycles() >= bcp->set_bau_on_time) { - stat->s_bau_reenabled++; - baudisabled = 0; - for_each_present_cpu(tcpu) { - tbcp = &per_cpu(bau_control, tcpu); - tbcp->baudisabled = 0; - tbcp->period_requests = 0; - tbcp->period_time = 0; - } - } - } - return cpumask; - } - - /* - * Each sending cpu has a per-cpu mask which it fills from the caller's - * cpu mask. All cpus are converted to uvhubs and copied to the - * activation descriptor. - */ - flush_mask = (struct cpumask *)per_cpu(uv_flush_tlb_mask, cpu); - /* don't actually do a shootdown of the local cpu */ - cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu)); - if (cpu_isset(cpu, *cpumask)) - stat->s_ntargself++; - - bau_desc = bcp->descriptor_base; - bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu; - bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); - - /* cpu statistics */ - for_each_cpu(tcpu, flush_mask) { - uvhub = uv_cpu_to_blade_id(tcpu); - bau_uvhub_set(uvhub, &bau_desc->distribution); - if (uvhub == bcp->uvhub) - locals++; - else - remotes++; - } - if ((locals + remotes) == 0) - return NULL; - stat->s_requestor++; - stat->s_ntargcpu += remotes + locals; - stat->s_ntargremotes += remotes; - stat->s_ntarglocals += locals; - remotes = bau_uvhub_weight(&bau_desc->distribution); - - /* uvhub statistics */ - hubs = bau_uvhub_weight(&bau_desc->distribution); - if (locals) { - stat->s_ntarglocaluvhub++; - stat->s_ntargremoteuvhub += (hubs - 1); - } else - stat->s_ntargremoteuvhub += hubs; - stat->s_ntarguvhub += hubs; - if (hubs >= 16) - stat->s_ntarguvhub16++; - else if (hubs >= 8) - stat->s_ntarguvhub8++; - else if (hubs >= 4) - stat->s_ntarguvhub4++; - else if (hubs >= 2) - stat->s_ntarguvhub2++; - else - stat->s_ntarguvhub1++; - - bau_desc->payload.address = va; - bau_desc->payload.sending_cpu = cpu; - - /* - * uv_flush_send_and_wait returns 0 if all cpu's were messaged, - * or 1 if it gave up and the original cpumask should be returned. - */ - if (!uv_flush_send_and_wait(bau_desc, flush_mask, bcp)) - return NULL; - else - return cpumask; -} - -/* - * The BAU message interrupt comes here. (registered by set_intr_gate) - * See entry_64.S - * - * We received a broadcast assist message. - * - * Interrupts are disabled; this interrupt could represent - * the receipt of several messages. - * - * All cores/threads on this hub get this interrupt. - * The last one to see it does the software ack. - * (the resource will not be freed until noninterruptable cpus see this - * interrupt; hardware may timeout the s/w ack and reply ERROR) - */ -void uv_bau_message_interrupt(struct pt_regs *regs) -{ - int count = 0; - cycles_t time_start; - struct bau_payload_queue_entry *msg; - struct bau_control *bcp; - struct ptc_stats *stat; - struct msg_desc msgdesc; - - time_start = get_cycles(); - bcp = &per_cpu(bau_control, smp_processor_id()); - stat = bcp->statp; - msgdesc.va_queue_first = bcp->va_queue_first; - msgdesc.va_queue_last = bcp->va_queue_last; - msg = bcp->bau_msg_head; - while (msg->sw_ack_vector) { - count++; - msgdesc.msg_slot = msg - msgdesc.va_queue_first; - msgdesc.sw_ack_slot = ffs(msg->sw_ack_vector) - 1; - msgdesc.msg = msg; - uv_bau_process_message(&msgdesc, bcp); - msg++; - if (msg > msgdesc.va_queue_last) - msg = msgdesc.va_queue_first; - bcp->bau_msg_head = msg; - } - stat->d_time += (get_cycles() - time_start); - if (!count) - stat->d_nomsg++; - else if (count > 1) - stat->d_multmsg++; - ack_APIC_irq(); -} - -/* - * uv_enable_timeouts - * - * Each target uvhub (i.e. a uvhub that has no cpu's) needs to have - * shootdown message timeouts enabled. The timeout does not cause - * an interrupt, but causes an error message to be returned to - * the sender. - */ -static void uv_enable_timeouts(void) -{ - int uvhub; - int nuvhubs; - int pnode; - unsigned long mmr_image; - - nuvhubs = uv_num_possible_blades(); - - for (uvhub = 0; uvhub < nuvhubs; uvhub++) { - if (!uv_blade_nr_possible_cpus(uvhub)) - continue; - - pnode = uv_blade_to_pnode(uvhub); - mmr_image = - uv_read_global_mmr64(pnode, UVH_LB_BAU_MISC_CONTROL); - /* - * Set the timeout period and then lock it in, in three - * steps; captures and locks in the period. - * - * To program the period, the SOFT_ACK_MODE must be off. - */ - mmr_image &= ~((unsigned long)1 << - UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT); - uv_write_global_mmr64 - (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image); - /* - * Set the 4-bit period. - */ - mmr_image &= ~((unsigned long)0xf << - UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT); - mmr_image |= (UV_INTD_SOFT_ACK_TIMEOUT_PERIOD << - UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT); - uv_write_global_mmr64 - (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image); - /* - * Subsequent reversals of the timebase bit (3) cause an - * immediate timeout of one or all INTD resources as - * indicated in bits 2:0 (7 causes all of them to timeout). - */ - mmr_image |= ((unsigned long)1 << - UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT); - uv_write_global_mmr64 - (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image); - } -} - -static void *uv_ptc_seq_start(struct seq_file *file, loff_t *offset) -{ - if (*offset < num_possible_cpus()) - return offset; - return NULL; -} - -static void *uv_ptc_seq_next(struct seq_file *file, void *data, loff_t *offset) -{ - (*offset)++; - if (*offset < num_possible_cpus()) - return offset; - return NULL; -} - -static void uv_ptc_seq_stop(struct seq_file *file, void *data) -{ -} - -static inline unsigned long long -microsec_2_cycles(unsigned long microsec) -{ - unsigned long ns; - unsigned long long cyc; - - ns = microsec * 1000; - cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id())); - return cyc; -} - -/* - * Display the statistics thru /proc. - * 'data' points to the cpu number - */ -static int uv_ptc_seq_show(struct seq_file *file, void *data) -{ - struct ptc_stats *stat; - int cpu; - - cpu = *(loff_t *)data; - - if (!cpu) { - seq_printf(file, - "# cpu sent stime self locals remotes ncpus localhub "); - seq_printf(file, - "remotehub numuvhubs numuvhubs16 numuvhubs8 "); - seq_printf(file, - "numuvhubs4 numuvhubs2 numuvhubs1 dto "); - seq_printf(file, - "retries rok resetp resett giveup sto bz throt "); - seq_printf(file, - "sw_ack recv rtime all "); - seq_printf(file, - "one mult none retry canc nocan reset rcan "); - seq_printf(file, - "disable enable\n"); - } - if (cpu < num_possible_cpus() && cpu_online(cpu)) { - stat = &per_cpu(ptcstats, cpu); - /* source side statistics */ - seq_printf(file, - "cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", - cpu, stat->s_requestor, cycles_2_us(stat->s_time), - stat->s_ntargself, stat->s_ntarglocals, - stat->s_ntargremotes, stat->s_ntargcpu, - stat->s_ntarglocaluvhub, stat->s_ntargremoteuvhub, - stat->s_ntarguvhub, stat->s_ntarguvhub16); - seq_printf(file, "%ld %ld %ld %ld %ld ", - stat->s_ntarguvhub8, stat->s_ntarguvhub4, - stat->s_ntarguvhub2, stat->s_ntarguvhub1, - stat->s_dtimeout); - seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld ", - stat->s_retry_messages, stat->s_retriesok, - stat->s_resets_plug, stat->s_resets_timeout, - stat->s_giveup, stat->s_stimeout, - stat->s_busy, stat->s_throttles); - - /* destination side statistics */ - seq_printf(file, - "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", - uv_read_global_mmr64(uv_cpu_to_pnode(cpu), - UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE), - stat->d_requestee, cycles_2_us(stat->d_time), - stat->d_alltlb, stat->d_onetlb, stat->d_multmsg, - stat->d_nomsg, stat->d_retries, stat->d_canceled, - stat->d_nocanceled, stat->d_resets, - stat->d_rcanceled); - seq_printf(file, "%ld %ld\n", - stat->s_bau_disabled, stat->s_bau_reenabled); - } - - return 0; -} - -/* - * Display the tunables thru debugfs - */ -static ssize_t tunables_read(struct file *file, char __user *userbuf, - size_t count, loff_t *ppos) -{ - char *buf; - int ret; - - buf = kasprintf(GFP_KERNEL, "%s %s %s\n%d %d %d %d %d %d %d %d %d\n", - "max_bau_concurrent plugged_delay plugsb4reset", - "timeoutsb4reset ipi_reset_limit complete_threshold", - "congested_response_us congested_reps congested_period", - max_bau_concurrent, plugged_delay, plugsb4reset, - timeoutsb4reset, ipi_reset_limit, complete_threshold, - congested_response_us, congested_reps, congested_period); - - if (!buf) - return -ENOMEM; - - ret = simple_read_from_buffer(userbuf, count, ppos, buf, strlen(buf)); - kfree(buf); - return ret; -} - -/* - * -1: resetf the statistics - * 0: display meaning of the statistics - */ -static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user, - size_t count, loff_t *data) -{ - int cpu; - long input_arg; - char optstr[64]; - struct ptc_stats *stat; - - if (count == 0 || count > sizeof(optstr)) - return -EINVAL; - if (copy_from_user(optstr, user, count)) - return -EFAULT; - optstr[count - 1] = '\0'; - if (strict_strtol(optstr, 10, &input_arg) < 0) { - printk(KERN_DEBUG "%s is invalid\n", optstr); - return -EINVAL; - } - - if (input_arg == 0) { - printk(KERN_DEBUG "# cpu: cpu number\n"); - printk(KERN_DEBUG "Sender statistics:\n"); - printk(KERN_DEBUG - "sent: number of shootdown messages sent\n"); - printk(KERN_DEBUG - "stime: time spent sending messages\n"); - printk(KERN_DEBUG - "numuvhubs: number of hubs targeted with shootdown\n"); - printk(KERN_DEBUG - "numuvhubs16: number times 16 or more hubs targeted\n"); - printk(KERN_DEBUG - "numuvhubs8: number times 8 or more hubs targeted\n"); - printk(KERN_DEBUG - "numuvhubs4: number times 4 or more hubs targeted\n"); - printk(KERN_DEBUG - "numuvhubs2: number times 2 or more hubs targeted\n"); - printk(KERN_DEBUG - "numuvhubs1: number times 1 hub targeted\n"); - printk(KERN_DEBUG - "numcpus: number of cpus targeted with shootdown\n"); - printk(KERN_DEBUG - "dto: number of destination timeouts\n"); - printk(KERN_DEBUG - "retries: destination timeout retries sent\n"); - printk(KERN_DEBUG - "rok: : destination timeouts successfully retried\n"); - printk(KERN_DEBUG - "resetp: ipi-style resource resets for plugs\n"); - printk(KERN_DEBUG - "resett: ipi-style resource resets for timeouts\n"); - printk(KERN_DEBUG - "giveup: fall-backs to ipi-style shootdowns\n"); - printk(KERN_DEBUG - "sto: number of source timeouts\n"); - printk(KERN_DEBUG - "bz: number of stay-busy's\n"); - printk(KERN_DEBUG - "throt: number times spun in throttle\n"); - printk(KERN_DEBUG "Destination side statistics:\n"); - printk(KERN_DEBUG - "sw_ack: image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE\n"); - printk(KERN_DEBUG - "recv: shootdown messages received\n"); - printk(KERN_DEBUG - "rtime: time spent processing messages\n"); - printk(KERN_DEBUG - "all: shootdown all-tlb messages\n"); - printk(KERN_DEBUG - "one: shootdown one-tlb messages\n"); - printk(KERN_DEBUG - "mult: interrupts that found multiple messages\n"); - printk(KERN_DEBUG - "none: interrupts that found no messages\n"); - printk(KERN_DEBUG - "retry: number of retry messages processed\n"); - printk(KERN_DEBUG - "canc: number messages canceled by retries\n"); - printk(KERN_DEBUG - "nocan: number retries that found nothing to cancel\n"); - printk(KERN_DEBUG - "reset: number of ipi-style reset requests processed\n"); - printk(KERN_DEBUG - "rcan: number messages canceled by reset requests\n"); - printk(KERN_DEBUG - "disable: number times use of the BAU was disabled\n"); - printk(KERN_DEBUG - "enable: number times use of the BAU was re-enabled\n"); - } else if (input_arg == -1) { - for_each_present_cpu(cpu) { - stat = &per_cpu(ptcstats, cpu); - memset(stat, 0, sizeof(struct ptc_stats)); - } - } - - return count; -} - -static int local_atoi(const char *name) -{ - int val = 0; - - for (;; name++) { - switch (*name) { - case '0' ... '9': - val = 10*val+(*name-'0'); - break; - default: - return val; - } - } -} - -/* - * set the tunables - * 0 values reset them to defaults - */ -static ssize_t tunables_write(struct file *file, const char __user *user, - size_t count, loff_t *data) -{ - int cpu; - int cnt = 0; - int val; - char *p; - char *q; - char instr[64]; - struct bau_control *bcp; - - if (count == 0 || count > sizeof(instr)-1) - return -EINVAL; - if (copy_from_user(instr, user, count)) - return -EFAULT; - - instr[count] = '\0'; - /* count the fields */ - p = instr + strspn(instr, WHITESPACE); - q = p; - for (; *p; p = q + strspn(q, WHITESPACE)) { - q = p + strcspn(p, WHITESPACE); - cnt++; - if (q == p) - break; - } - if (cnt != 9) { - printk(KERN_INFO "bau tunable error: should be 9 numbers\n"); - return -EINVAL; - } - - p = instr + strspn(instr, WHITESPACE); - q = p; - for (cnt = 0; *p; p = q + strspn(q, WHITESPACE), cnt++) { - q = p + strcspn(p, WHITESPACE); - val = local_atoi(p); - switch (cnt) { - case 0: - if (val == 0) { - max_bau_concurrent = MAX_BAU_CONCURRENT; - max_bau_concurrent_constant = - MAX_BAU_CONCURRENT; - continue; - } - bcp = &per_cpu(bau_control, smp_processor_id()); - if (val < 1 || val > bcp->cpus_in_uvhub) { - printk(KERN_DEBUG - "Error: BAU max concurrent %d is invalid\n", - val); - return -EINVAL; - } - max_bau_concurrent = val; - max_bau_concurrent_constant = val; - continue; - case 1: - if (val == 0) - plugged_delay = PLUGGED_DELAY; - else - plugged_delay = val; - continue; - case 2: - if (val == 0) - plugsb4reset = PLUGSB4RESET; - else - plugsb4reset = val; - continue; - case 3: - if (val == 0) - timeoutsb4reset = TIMEOUTSB4RESET; - else - timeoutsb4reset = val; - continue; - case 4: - if (val == 0) - ipi_reset_limit = IPI_RESET_LIMIT; - else - ipi_reset_limit = val; - continue; - case 5: - if (val == 0) - complete_threshold = COMPLETE_THRESHOLD; - else - complete_threshold = val; - continue; - case 6: - if (val == 0) - congested_response_us = CONGESTED_RESPONSE_US; - else - congested_response_us = val; - continue; - case 7: - if (val == 0) - congested_reps = CONGESTED_REPS; - else - congested_reps = val; - continue; - case 8: - if (val == 0) - congested_period = CONGESTED_PERIOD; - else - congested_period = val; - continue; - } - if (q == p) - break; - } - for_each_present_cpu(cpu) { - bcp = &per_cpu(bau_control, cpu); - bcp->max_bau_concurrent = max_bau_concurrent; - bcp->max_bau_concurrent_constant = max_bau_concurrent; - bcp->plugged_delay = plugged_delay; - bcp->plugsb4reset = plugsb4reset; - bcp->timeoutsb4reset = timeoutsb4reset; - bcp->ipi_reset_limit = ipi_reset_limit; - bcp->complete_threshold = complete_threshold; - bcp->congested_response_us = congested_response_us; - bcp->congested_reps = congested_reps; - bcp->congested_period = congested_period; - } - return count; -} - -static const struct seq_operations uv_ptc_seq_ops = { - .start = uv_ptc_seq_start, - .next = uv_ptc_seq_next, - .stop = uv_ptc_seq_stop, - .show = uv_ptc_seq_show -}; - -static int uv_ptc_proc_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &uv_ptc_seq_ops); -} - -static int tunables_open(struct inode *inode, struct file *file) -{ - return 0; -} - -static const struct file_operations proc_uv_ptc_operations = { - .open = uv_ptc_proc_open, - .read = seq_read, - .write = uv_ptc_proc_write, - .llseek = seq_lseek, - .release = seq_release, -}; - -static const struct file_operations tunables_fops = { - .open = tunables_open, - .read = tunables_read, - .write = tunables_write, - .llseek = default_llseek, -}; - -static int __init uv_ptc_init(void) -{ - struct proc_dir_entry *proc_uv_ptc; - - if (!is_uv_system()) - return 0; - - proc_uv_ptc = proc_create(UV_PTC_BASENAME, 0444, NULL, - &proc_uv_ptc_operations); - if (!proc_uv_ptc) { - printk(KERN_ERR "unable to create %s proc entry\n", - UV_PTC_BASENAME); - return -EINVAL; - } - - tunables_dir = debugfs_create_dir(UV_BAU_TUNABLES_DIR, NULL); - if (!tunables_dir) { - printk(KERN_ERR "unable to create debugfs directory %s\n", - UV_BAU_TUNABLES_DIR); - return -EINVAL; - } - tunables_file = debugfs_create_file(UV_BAU_TUNABLES_FILE, 0600, - tunables_dir, NULL, &tunables_fops); - if (!tunables_file) { - printk(KERN_ERR "unable to create debugfs file %s\n", - UV_BAU_TUNABLES_FILE); - return -EINVAL; - } - return 0; -} - -/* - * initialize the sending side's sending buffers - */ -static void -uv_activation_descriptor_init(int node, int pnode) -{ - int i; - int cpu; - unsigned long pa; - unsigned long m; - unsigned long n; - struct bau_desc *bau_desc; - struct bau_desc *bd2; - struct bau_control *bcp; - - /* - * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR) - * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per uvhub - */ - bau_desc = (struct bau_desc *)kmalloc_node(sizeof(struct bau_desc)* - UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node); - BUG_ON(!bau_desc); - - pa = uv_gpa(bau_desc); /* need the real nasid*/ - n = pa >> uv_nshift; - m = pa & uv_mmask; - - uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE, - (n << UV_DESC_BASE_PNODE_SHIFT | m)); - - /* - * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each - * cpu even though we only use the first one; one descriptor can - * describe a broadcast to 256 uv hubs. - */ - for (i = 0, bd2 = bau_desc; i < (UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR); - i++, bd2++) { - memset(bd2, 0, sizeof(struct bau_desc)); - bd2->header.sw_ack_flag = 1; - /* - * base_dest_nodeid is the nasid (pnode<<1) of the first uvhub - * in the partition. The bit map will indicate uvhub numbers, - * which are 0-N in a partition. Pnodes are unique system-wide. - */ - bd2->header.base_dest_nodeid = uv_partition_base_pnode << 1; - bd2->header.dest_subnodeid = 0x10; /* the LB */ - bd2->header.command = UV_NET_ENDPOINT_INTD; - bd2->header.int_both = 1; - /* - * all others need to be set to zero: - * fairness chaining multilevel count replied_to - */ - } - for_each_present_cpu(cpu) { - if (pnode != uv_blade_to_pnode(uv_cpu_to_blade_id(cpu))) - continue; - bcp = &per_cpu(bau_control, cpu); - bcp->descriptor_base = bau_desc; - } -} - -/* - * initialize the destination side's receiving buffers - * entered for each uvhub in the partition - * - node is first node (kernel memory notion) on the uvhub - * - pnode is the uvhub's physical identifier - */ -static void -uv_payload_queue_init(int node, int pnode) -{ - int pn; - int cpu; - char *cp; - unsigned long pa; - struct bau_payload_queue_entry *pqp; - struct bau_payload_queue_entry *pqp_malloc; - struct bau_control *bcp; - - pqp = (struct bau_payload_queue_entry *) kmalloc_node( - (DEST_Q_SIZE + 1) * sizeof(struct bau_payload_queue_entry), - GFP_KERNEL, node); - BUG_ON(!pqp); - pqp_malloc = pqp; - - cp = (char *)pqp + 31; - pqp = (struct bau_payload_queue_entry *)(((unsigned long)cp >> 5) << 5); - - for_each_present_cpu(cpu) { - if (pnode != uv_cpu_to_pnode(cpu)) - continue; - /* for every cpu on this pnode: */ - bcp = &per_cpu(bau_control, cpu); - bcp->va_queue_first = pqp; - bcp->bau_msg_head = pqp; - bcp->va_queue_last = pqp + (DEST_Q_SIZE - 1); - } - /* - * need the pnode of where the memory was really allocated - */ - pa = uv_gpa(pqp); - pn = pa >> uv_nshift; - uv_write_global_mmr64(pnode, - UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST, - ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) | - uv_physnodeaddr(pqp)); - uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL, - uv_physnodeaddr(pqp)); - uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST, - (unsigned long) - uv_physnodeaddr(pqp + (DEST_Q_SIZE - 1))); - /* in effect, all msg_type's are set to MSG_NOOP */ - memset(pqp, 0, sizeof(struct bau_payload_queue_entry) * DEST_Q_SIZE); -} - -/* - * Initialization of each UV hub's structures - */ -static void __init uv_init_uvhub(int uvhub, int vector) -{ - int node; - int pnode; - unsigned long apicid; - - node = uvhub_to_first_node(uvhub); - pnode = uv_blade_to_pnode(uvhub); - uv_activation_descriptor_init(node, pnode); - uv_payload_queue_init(node, pnode); - /* - * the below initialization can't be in firmware because the - * messaging IRQ will be determined by the OS - */ - apicid = uvhub_to_first_apicid(uvhub); - uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG, - ((apicid << 32) | vector)); -} - -/* - * We will set BAU_MISC_CONTROL with a timeout period. - * But the BIOS has set UVH_AGING_PRESCALE_SEL and UVH_TRANSACTION_TIMEOUT. - * So the destination timeout period has be be calculated from them. - */ -static int -calculate_destination_timeout(void) -{ - unsigned long mmr_image; - int mult1; - int mult2; - int index; - int base; - int ret; - unsigned long ts_ns; - - mult1 = UV_INTD_SOFT_ACK_TIMEOUT_PERIOD & BAU_MISC_CONTROL_MULT_MASK; - mmr_image = uv_read_local_mmr(UVH_AGING_PRESCALE_SEL); - index = (mmr_image >> BAU_URGENCY_7_SHIFT) & BAU_URGENCY_7_MASK; - mmr_image = uv_read_local_mmr(UVH_TRANSACTION_TIMEOUT); - mult2 = (mmr_image >> BAU_TRANS_SHIFT) & BAU_TRANS_MASK; - base = timeout_base_ns[index]; - ts_ns = base * mult1 * mult2; - ret = ts_ns / 1000; - return ret; -} - -/* - * initialize the bau_control structure for each cpu - */ -static void __init uv_init_per_cpu(int nuvhubs) -{ - int i; - int cpu; - int pnode; - int uvhub; - int have_hmaster; - short socket = 0; - unsigned short socket_mask; - unsigned char *uvhub_mask; - struct bau_control *bcp; - struct uvhub_desc *bdp; - struct socket_desc *sdp; - struct bau_control *hmaster = NULL; - struct bau_control *smaster = NULL; - struct socket_desc { - short num_cpus; - short cpu_number[16]; - }; - struct uvhub_desc { - unsigned short socket_mask; - short num_cpus; - short uvhub; - short pnode; - struct socket_desc socket[2]; - }; - struct uvhub_desc *uvhub_descs; - - timeout_us = calculate_destination_timeout(); - - uvhub_descs = (struct uvhub_desc *) - kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL); - memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc)); - uvhub_mask = kzalloc((nuvhubs+7)/8, GFP_KERNEL); - for_each_present_cpu(cpu) { - bcp = &per_cpu(bau_control, cpu); - memset(bcp, 0, sizeof(struct bau_control)); - pnode = uv_cpu_hub_info(cpu)->pnode; - uvhub = uv_cpu_hub_info(cpu)->numa_blade_id; - *(uvhub_mask + (uvhub/8)) |= (1 << (uvhub%8)); - bdp = &uvhub_descs[uvhub]; - bdp->num_cpus++; - bdp->uvhub = uvhub; - bdp->pnode = pnode; - /* kludge: 'assuming' one node per socket, and assuming that - disabling a socket just leaves a gap in node numbers */ - socket = (cpu_to_node(cpu) & 1); - bdp->socket_mask |= (1 << socket); - sdp = &bdp->socket[socket]; - sdp->cpu_number[sdp->num_cpus] = cpu; - sdp->num_cpus++; - } - for (uvhub = 0; uvhub < nuvhubs; uvhub++) { - if (!(*(uvhub_mask + (uvhub/8)) & (1 << (uvhub%8)))) - continue; - have_hmaster = 0; - bdp = &uvhub_descs[uvhub]; - socket_mask = bdp->socket_mask; - socket = 0; - while (socket_mask) { - if (!(socket_mask & 1)) - goto nextsocket; - sdp = &bdp->socket[socket]; - for (i = 0; i < sdp->num_cpus; i++) { - cpu = sdp->cpu_number[i]; - bcp = &per_cpu(bau_control, cpu); - bcp->cpu = cpu; - if (i == 0) { - smaster = bcp; - if (!have_hmaster) { - have_hmaster++; - hmaster = bcp; - } - } - bcp->cpus_in_uvhub = bdp->num_cpus; - bcp->cpus_in_socket = sdp->num_cpus; - bcp->socket_master = smaster; - bcp->uvhub = bdp->uvhub; - bcp->uvhub_master = hmaster; - bcp->uvhub_cpu = uv_cpu_hub_info(cpu)-> - blade_processor_id; - } -nextsocket: - socket++; - socket_mask = (socket_mask >> 1); - } - } - kfree(uvhub_descs); - kfree(uvhub_mask); - for_each_present_cpu(cpu) { - bcp = &per_cpu(bau_control, cpu); - bcp->baudisabled = 0; - bcp->statp = &per_cpu(ptcstats, cpu); - /* time interval to catch a hardware stay-busy bug */ - bcp->timeout_interval = microsec_2_cycles(2*timeout_us); - bcp->max_bau_concurrent = max_bau_concurrent; - bcp->max_bau_concurrent_constant = max_bau_concurrent; - bcp->plugged_delay = plugged_delay; - bcp->plugsb4reset = plugsb4reset; - bcp->timeoutsb4reset = timeoutsb4reset; - bcp->ipi_reset_limit = ipi_reset_limit; - bcp->complete_threshold = complete_threshold; - bcp->congested_response_us = congested_response_us; - bcp->congested_reps = congested_reps; - bcp->congested_period = congested_period; - } -} - -/* - * Initialization of BAU-related structures - */ -static int __init uv_bau_init(void) -{ - int uvhub; - int pnode; - int nuvhubs; - int cur_cpu; - int vector; - unsigned long mmr; - - if (!is_uv_system()) - return 0; - - if (nobau) - return 0; - - for_each_possible_cpu(cur_cpu) - zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu), - GFP_KERNEL, cpu_to_node(cur_cpu)); - - uv_nshift = uv_hub_info->m_val; - uv_mmask = (1UL << uv_hub_info->m_val) - 1; - nuvhubs = uv_num_possible_blades(); - spin_lock_init(&disable_lock); - congested_cycles = microsec_2_cycles(congested_response_us); - - uv_init_per_cpu(nuvhubs); - - uv_partition_base_pnode = 0x7fffffff; - for (uvhub = 0; uvhub < nuvhubs; uvhub++) - if (uv_blade_nr_possible_cpus(uvhub) && - (uv_blade_to_pnode(uvhub) < uv_partition_base_pnode)) - uv_partition_base_pnode = uv_blade_to_pnode(uvhub); - - vector = UV_BAU_MESSAGE; - for_each_possible_blade(uvhub) - if (uv_blade_nr_possible_cpus(uvhub)) - uv_init_uvhub(uvhub, vector); - - uv_enable_timeouts(); - alloc_intr_gate(vector, uv_bau_message_intr1); - - for_each_possible_blade(uvhub) { - if (uv_blade_nr_possible_cpus(uvhub)) { - pnode = uv_blade_to_pnode(uvhub); - /* INIT the bau */ - uv_write_global_mmr64(pnode, - UVH_LB_BAU_SB_ACTIVATION_CONTROL, - ((unsigned long)1 << 63)); - mmr = 1; /* should be 1 to broadcast to both sockets */ - uv_write_global_mmr64(pnode, UVH_BAU_DATA_BROADCAST, - mmr); - } - } - - return 0; -} -core_initcall(uv_bau_init); -fs_initcall(uv_ptc_init); diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c index 7e4515957a1..8927486a464 100644 --- a/arch/x86/kernel/topology.c +++ b/arch/x86/kernel/topology.c @@ -39,7 +39,7 @@ int __ref arch_register_cpu(int num) /* * CPU0 cannot be offlined due to several * restrictions and assumptions in kernel. This basically - * doesnt add a control file, one cannot attempt to offline + * doesn't add a control file, one cannot attempt to offline * BSP. * * Also certain PCI quirks require not to enable hotplug control diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c index a375616d77f..a91ae7709b4 100644 --- a/arch/x86/kernel/trampoline.c +++ b/arch/x86/kernel/trampoline.c @@ -2,39 +2,41 @@ #include <linux/memblock.h> #include <asm/trampoline.h> +#include <asm/cacheflush.h> #include <asm/pgtable.h> -#if defined(CONFIG_X86_64) && defined(CONFIG_ACPI_SLEEP) -#define __trampinit -#define __trampinitdata -#else -#define __trampinit __cpuinit -#define __trampinitdata __cpuinitdata -#endif +unsigned char *x86_trampoline_base; -/* ready for x86_64 and x86 */ -unsigned char *__trampinitdata trampoline_base; - -void __init reserve_trampoline_memory(void) +void __init setup_trampolines(void) { phys_addr_t mem; + size_t size = PAGE_ALIGN(x86_trampoline_end - x86_trampoline_start); /* Has to be in very low memory so we can execute real-mode AP code. */ - mem = memblock_find_in_range(0, 1<<20, TRAMPOLINE_SIZE, PAGE_SIZE); + mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE); if (mem == MEMBLOCK_ERROR) panic("Cannot allocate trampoline\n"); - trampoline_base = __va(mem); - memblock_x86_reserve_range(mem, mem + TRAMPOLINE_SIZE, "TRAMPOLINE"); + x86_trampoline_base = __va(mem); + memblock_x86_reserve_range(mem, mem + size, "TRAMPOLINE"); + + printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n", + x86_trampoline_base, (unsigned long long)mem, size); + + memcpy(x86_trampoline_base, x86_trampoline_start, size); } /* - * Currently trivial. Write the real->protected mode - * bootstrap into the page concerned. The caller - * has made sure it's suitably aligned. + * setup_trampolines() gets called very early, to guarantee the + * availability of low memory. This is before the proper kernel page + * tables are set up, so we cannot set page permissions in that + * function. Thus, we use an arch_initcall instead. */ -unsigned long __trampinit setup_trampoline(void) +static int __init configure_trampolines(void) { - memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE); - return virt_to_phys(trampoline_base); + size_t size = PAGE_ALIGN(x86_trampoline_end - x86_trampoline_start); + + set_memory_x((unsigned long)x86_trampoline_base, size >> PAGE_SHIFT); + return 0; } +arch_initcall(configure_trampolines); diff --git a/arch/x86/kernel/trampoline_32.S b/arch/x86/kernel/trampoline_32.S index 8508237e8e4..451c0a7ef7f 100644 --- a/arch/x86/kernel/trampoline_32.S +++ b/arch/x86/kernel/trampoline_32.S @@ -32,9 +32,11 @@ #include <asm/segment.h> #include <asm/page_types.h> -/* We can free up trampoline after bootup if cpu hotplug is not supported. */ -__CPUINITRODATA -.code16 +#ifdef CONFIG_SMP + + .section ".x86_trampoline","a" + .balign PAGE_SIZE + .code16 ENTRY(trampoline_data) r_base = . @@ -44,7 +46,7 @@ r_base = . cli # We should be safe anyway - movl $0xA5A5A5A5, trampoline_data - r_base + movl $0xA5A5A5A5, trampoline_status - r_base # write marker for master knows we're running /* GDT tables in non default location kernel can be beyond 16MB and @@ -72,5 +74,10 @@ boot_idt_descr: .word 0 # idt limit = 0 .long 0 # idt base = 0L +ENTRY(trampoline_status) + .long 0 + .globl trampoline_end trampoline_end: + +#endif /* CONFIG_SMP */ diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S index 3af2dff58b2..09ff51799e9 100644 --- a/arch/x86/kernel/trampoline_64.S +++ b/arch/x86/kernel/trampoline_64.S @@ -32,13 +32,9 @@ #include <asm/segment.h> #include <asm/processor-flags.h> -#ifdef CONFIG_ACPI_SLEEP -.section .rodata, "a", @progbits -#else -/* We can free up the trampoline after bootup if cpu hotplug is not supported. */ -__CPUINITRODATA -#endif -.code16 + .section ".x86_trampoline","a" + .balign PAGE_SIZE + .code16 ENTRY(trampoline_data) r_base = . @@ -50,7 +46,7 @@ r_base = . mov %ax, %ss - movl $0xA5A5A5A5, trampoline_data - r_base + movl $0xA5A5A5A5, trampoline_status - r_base # write marker for master knows we're running # Setup stack @@ -64,10 +60,13 @@ r_base = . movzx %ax, %esi # Find the 32bit trampoline location shll $4, %esi - # Fixup the vectors - addl %esi, startup_32_vector - r_base - addl %esi, startup_64_vector - r_base - addl %esi, tgdt + 2 - r_base # Fixup the gdt pointer + # Fixup the absolute vectors + leal (startup_32 - r_base)(%esi), %eax + movl %eax, startup_32_vector - r_base + leal (startup_64 - r_base)(%esi), %eax + movl %eax, startup_64_vector - r_base + leal (tgdt - r_base)(%esi), %eax + movl %eax, (tgdt + 2 - r_base) /* * GDT tables in non default location kernel can be beyond 16MB and @@ -127,8 +126,9 @@ startup_64: no_longmode: hlt jmp no_longmode -#include "verify_cpu_64.S" +#include "verify_cpu.S" + .balign 4 # Careful these need to be in the same 64K segment as the above; tidt: .word 0 # idt limit = 0 @@ -156,6 +156,10 @@ startup_64_vector: .long startup_64 - r_base .word __KERNEL_CS, 0 + .balign 4 +ENTRY(trampoline_status) + .long 0 + trampoline_stack: .org 0x1000 trampoline_stack_end: diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index cb838ca42c9..b9b67166f9d 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -83,6 +83,13 @@ EXPORT_SYMBOL_GPL(used_vectors); static int ignore_nmis; +int unknown_nmi_panic; +/* + * Prevent NMI reason port (0x61) being accessed simultaneously, can + * only be used in NMI handler. + */ +static DEFINE_RAW_SPINLOCK(nmi_reason_lock); + static inline void conditional_sti(struct pt_regs *regs) { if (regs->flags & X86_EFLAGS_IF) @@ -300,16 +307,23 @@ gp_in_kernel: die("general protection fault", regs, error_code); } -static notrace __kprobes void -mem_parity_error(unsigned char reason, struct pt_regs *regs) +static int __init setup_unknown_nmi_panic(char *str) { - printk(KERN_EMERG - "Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", - reason, smp_processor_id()); + unknown_nmi_panic = 1; + return 1; +} +__setup("unknown_nmi_panic", setup_unknown_nmi_panic); - printk(KERN_EMERG - "You have some hardware problem, likely on the PCI bus.\n"); +static notrace __kprobes void +pci_serr_error(unsigned char reason, struct pt_regs *regs) +{ + pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n", + reason, smp_processor_id()); + /* + * On some machines, PCI SERR line is used to report memory + * errors. EDAC makes use of it. + */ #if defined(CONFIG_EDAC) if (edac_handler_set()) { edac_atomic_assert_error(); @@ -320,11 +334,11 @@ mem_parity_error(unsigned char reason, struct pt_regs *regs) if (panic_on_unrecovered_nmi) panic("NMI: Not continuing"); - printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); + pr_emerg("Dazed and confused, but trying to continue\n"); - /* Clear and disable the memory parity error line. */ - reason = (reason & 0xf) | 4; - outb(reason, 0x61); + /* Clear and disable the PCI SERR error line. */ + reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_SERR; + outb(reason, NMI_REASON_PORT); } static notrace __kprobes void @@ -332,22 +346,26 @@ io_check_error(unsigned char reason, struct pt_regs *regs) { unsigned long i; - printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n"); + pr_emerg( + "NMI: IOCK error (debug interrupt?) for reason %02x on CPU %d.\n", + reason, smp_processor_id()); show_registers(regs); if (panic_on_io_nmi) panic("NMI IOCK error: Not continuing"); /* Re-enable the IOCK line, wait for a few seconds */ - reason = (reason & 0xf) | 8; - outb(reason, 0x61); + reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_IOCHK; + outb(reason, NMI_REASON_PORT); - i = 2000; - while (--i) - udelay(1000); + i = 20000; + while (--i) { + touch_nmi_watchdog(); + udelay(100); + } - reason &= ~8; - outb(reason, 0x61); + reason &= ~NMI_REASON_CLEAR_IOCHK; + outb(reason, NMI_REASON_PORT); } static notrace __kprobes void @@ -366,69 +384,50 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs) return; } #endif - printk(KERN_EMERG - "Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", - reason, smp_processor_id()); + pr_emerg("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", + reason, smp_processor_id()); - printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n"); - if (panic_on_unrecovered_nmi) + pr_emerg("Do you have a strange power saving mode enabled?\n"); + if (unknown_nmi_panic || panic_on_unrecovered_nmi) panic("NMI: Not continuing"); - printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); + pr_emerg("Dazed and confused, but trying to continue\n"); } static notrace __kprobes void default_do_nmi(struct pt_regs *regs) { unsigned char reason = 0; - int cpu; - cpu = smp_processor_id(); - - /* Only the BSP gets external NMIs from the system. */ - if (!cpu) - reason = get_nmi_reason(); - - if (!(reason & 0xc0)) { - if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) - == NOTIFY_STOP) - return; + /* + * CPU-specific NMI must be processed before non-CPU-specific + * NMI, otherwise we may lose it, because the CPU-specific + * NMI can not be detected/processed on other CPUs. + */ + if (notify_die(DIE_NMI, "nmi", regs, 0, 2, SIGINT) == NOTIFY_STOP) + return; -#ifdef CONFIG_X86_LOCAL_APIC - if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) - == NOTIFY_STOP) - return; + /* Non-CPU-specific NMI: NMI sources can be processed on any CPU */ + raw_spin_lock(&nmi_reason_lock); + reason = get_nmi_reason(); -#ifndef CONFIG_LOCKUP_DETECTOR + if (reason & NMI_REASON_MASK) { + if (reason & NMI_REASON_SERR) + pci_serr_error(reason, regs); + else if (reason & NMI_REASON_IOCHK) + io_check_error(reason, regs); +#ifdef CONFIG_X86_32 /* - * Ok, so this is none of the documented NMI sources, - * so it must be the NMI watchdog. + * Reassert NMI in case it became active + * meanwhile as it's edge-triggered: */ - if (nmi_watchdog_tick(regs, reason)) - return; - if (!do_nmi_callback(regs, cpu)) -#endif /* !CONFIG_LOCKUP_DETECTOR */ - unknown_nmi_error(reason, regs); -#else - unknown_nmi_error(reason, regs); + reassert_nmi(); #endif - + raw_spin_unlock(&nmi_reason_lock); return; } - if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) - return; + raw_spin_unlock(&nmi_reason_lock); - /* AK: following checks seem to be broken on modern chipsets. FIXME */ - if (reason & 0x80) - mem_parity_error(reason, regs); - if (reason & 0x40) - io_check_error(reason, regs); -#ifdef CONFIG_X86_32 - /* - * Reassert NMI in case it became active meanwhile - * as it's edge-triggered: - */ - reassert_nmi(); -#endif + unknown_nmi_error(reason, regs); } dotraplinkage notrace __kprobes void @@ -446,14 +445,12 @@ do_nmi(struct pt_regs *regs, long error_code) void stop_nmi(void) { - acpi_nmi_disable(); ignore_nmis++; } void restart_nmi(void) { ignore_nmis--; - acpi_nmi_enable(); } /* May run on IST stack. */ diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 0c40d8b7241..9335bf7dd2e 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -427,7 +427,7 @@ unsigned long native_calibrate_tsc(void) * the delta to the previous read. We keep track of the min * and max values of that delta. The delta is mostly defined * by the IO time of the PIT access, so we can detect when a - * SMI/SMM disturbance happend between the two reads. If the + * SMI/SMM disturbance happened between the two reads. If the * maximum time is significantly larger than the minimum time, * then we discard the result and have another try. * @@ -464,7 +464,7 @@ unsigned long native_calibrate_tsc(void) tsc_pit_min = min(tsc_pit_min, tsc_pit_khz); /* hpet or pmtimer available ? */ - if (!hpet && !ref1 && !ref2) + if (ref1 == ref2) continue; /* Check, whether the sampling was disturbed by an SMI */ @@ -659,7 +659,7 @@ void restore_sched_clock_state(void) local_irq_save(flags); - __get_cpu_var(cyc2ns_offset) = 0; + __this_cpu_write(cyc2ns_offset, 0); offset = cyc2ns_suspend - sched_clock(); for_each_possible_cpu(cpu) @@ -872,6 +872,9 @@ __cpuinit int unsynchronized_tsc(void) if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) return 0; + + if (tsc_clocksource_reliable) + return 0; /* * Intel systems are normally all synchronized. * Exceptions must mark TSC as unstable: @@ -879,14 +882,92 @@ __cpuinit int unsynchronized_tsc(void) if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { /* assume multi socket systems are not synchronized: */ if (num_possible_cpus() > 1) - tsc_unstable = 1; + return 1; } - return tsc_unstable; + return 0; +} + + +static void tsc_refine_calibration_work(struct work_struct *work); +static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work); +/** + * tsc_refine_calibration_work - Further refine tsc freq calibration + * @work - ignored. + * + * This functions uses delayed work over a period of a + * second to further refine the TSC freq value. Since this is + * timer based, instead of loop based, we don't block the boot + * process while this longer calibration is done. + * + * If there are any calibration anomalies (too many SMIs, etc), + * or the refined calibration is off by 1% of the fast early + * calibration, we throw out the new calibration and use the + * early calibration. + */ +static void tsc_refine_calibration_work(struct work_struct *work) +{ + static u64 tsc_start = -1, ref_start; + static int hpet; + u64 tsc_stop, ref_stop, delta; + unsigned long freq; + + /* Don't bother refining TSC on unstable systems */ + if (check_tsc_unstable()) + goto out; + + /* + * Since the work is started early in boot, we may be + * delayed the first time we expire. So set the workqueue + * again once we know timers are working. + */ + if (tsc_start == -1) { + /* + * Only set hpet once, to avoid mixing hardware + * if the hpet becomes enabled later. + */ + hpet = is_hpet_enabled(); + schedule_delayed_work(&tsc_irqwork, HZ); + tsc_start = tsc_read_refs(&ref_start, hpet); + return; + } + + tsc_stop = tsc_read_refs(&ref_stop, hpet); + + /* hpet or pmtimer available ? */ + if (ref_start == ref_stop) + goto out; + + /* Check, whether the sampling was disturbed by an SMI */ + if (tsc_start == ULLONG_MAX || tsc_stop == ULLONG_MAX) + goto out; + + delta = tsc_stop - tsc_start; + delta *= 1000000LL; + if (hpet) + freq = calc_hpet_ref(delta, ref_start, ref_stop); + else + freq = calc_pmtimer_ref(delta, ref_start, ref_stop); + + /* Make sure we're within 1% */ + if (abs(tsc_khz - freq) > tsc_khz/100) + goto out; + + tsc_khz = freq; + printk(KERN_INFO "Refined TSC clocksource calibration: " + "%lu.%03lu MHz.\n", (unsigned long)tsc_khz / 1000, + (unsigned long)tsc_khz % 1000); + +out: + clocksource_register_khz(&clocksource_tsc, tsc_khz); } -static void __init init_tsc_clocksource(void) + +static int __init init_tsc_clocksource(void) { + if (!cpu_has_tsc || tsc_disabled > 0 || !tsc_khz) + return 0; + if (tsc_clocksource_reliable) clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; /* lower the rating if we already know its unstable: */ @@ -894,8 +975,14 @@ static void __init init_tsc_clocksource(void) clocksource_tsc.rating = 0; clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS; } - clocksource_register_khz(&clocksource_tsc, tsc_khz); + schedule_delayed_work(&tsc_irqwork, 0); + return 0; } +/* + * We use device_initcall here, to ensure we run after the hpet + * is fully initialized, which may occur at fs_initcall time. + */ +device_initcall(init_tsc_clocksource); void __init tsc_init(void) { @@ -949,6 +1036,5 @@ void __init tsc_init(void) mark_tsc_unstable("TSCs unsynchronized"); check_system_tsc_reliable(); - init_tsc_clocksource(); } diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c deleted file mode 100644 index 7b24460917d..00000000000 --- a/arch/x86/kernel/uv_irq.c +++ /dev/null @@ -1,285 +0,0 @@ -/* - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * SGI UV IRQ functions - * - * Copyright (C) 2008 Silicon Graphics, Inc. All rights reserved. - */ - -#include <linux/module.h> -#include <linux/rbtree.h> -#include <linux/slab.h> -#include <linux/irq.h> - -#include <asm/apic.h> -#include <asm/uv/uv_irq.h> -#include <asm/uv/uv_hub.h> - -/* MMR offset and pnode of hub sourcing interrupts for a given irq */ -struct uv_irq_2_mmr_pnode{ - struct rb_node list; - unsigned long offset; - int pnode; - int irq; -}; - -static spinlock_t uv_irq_lock; -static struct rb_root uv_irq_root; - -static int uv_set_irq_affinity(struct irq_data *, const struct cpumask *, bool); - -static void uv_noop(struct irq_data *data) { } - -static void uv_ack_apic(struct irq_data *data) -{ - ack_APIC_irq(); -} - -static struct irq_chip uv_irq_chip = { - .name = "UV-CORE", - .irq_mask = uv_noop, - .irq_unmask = uv_noop, - .irq_eoi = uv_ack_apic, - .irq_set_affinity = uv_set_irq_affinity, -}; - -/* - * Add offset and pnode information of the hub sourcing interrupts to the - * rb tree for a specific irq. - */ -static int uv_set_irq_2_mmr_info(int irq, unsigned long offset, unsigned blade) -{ - struct rb_node **link = &uv_irq_root.rb_node; - struct rb_node *parent = NULL; - struct uv_irq_2_mmr_pnode *n; - struct uv_irq_2_mmr_pnode *e; - unsigned long irqflags; - - n = kmalloc_node(sizeof(struct uv_irq_2_mmr_pnode), GFP_KERNEL, - uv_blade_to_memory_nid(blade)); - if (!n) - return -ENOMEM; - - n->irq = irq; - n->offset = offset; - n->pnode = uv_blade_to_pnode(blade); - spin_lock_irqsave(&uv_irq_lock, irqflags); - /* Find the right place in the rbtree: */ - while (*link) { - parent = *link; - e = rb_entry(parent, struct uv_irq_2_mmr_pnode, list); - - if (unlikely(irq == e->irq)) { - /* irq entry exists */ - e->pnode = uv_blade_to_pnode(blade); - e->offset = offset; - spin_unlock_irqrestore(&uv_irq_lock, irqflags); - kfree(n); - return 0; - } - - if (irq < e->irq) - link = &(*link)->rb_left; - else - link = &(*link)->rb_right; - } - - /* Insert the node into the rbtree. */ - rb_link_node(&n->list, parent, link); - rb_insert_color(&n->list, &uv_irq_root); - - spin_unlock_irqrestore(&uv_irq_lock, irqflags); - return 0; -} - -/* Retrieve offset and pnode information from the rb tree for a specific irq */ -int uv_irq_2_mmr_info(int irq, unsigned long *offset, int *pnode) -{ - struct uv_irq_2_mmr_pnode *e; - struct rb_node *n; - unsigned long irqflags; - - spin_lock_irqsave(&uv_irq_lock, irqflags); - n = uv_irq_root.rb_node; - while (n) { - e = rb_entry(n, struct uv_irq_2_mmr_pnode, list); - - if (e->irq == irq) { - *offset = e->offset; - *pnode = e->pnode; - spin_unlock_irqrestore(&uv_irq_lock, irqflags); - return 0; - } - - if (irq < e->irq) - n = n->rb_left; - else - n = n->rb_right; - } - spin_unlock_irqrestore(&uv_irq_lock, irqflags); - return -1; -} - -/* - * Re-target the irq to the specified CPU and enable the specified MMR located - * on the specified blade to allow the sending of MSIs to the specified CPU. - */ -static int -arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, - unsigned long mmr_offset, int limit) -{ - const struct cpumask *eligible_cpu = cpumask_of(cpu); - struct irq_cfg *cfg = get_irq_chip_data(irq); - unsigned long mmr_value; - struct uv_IO_APIC_route_entry *entry; - int mmr_pnode, err; - - BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != - sizeof(unsigned long)); - - err = assign_irq_vector(irq, cfg, eligible_cpu); - if (err != 0) - return err; - - if (limit == UV_AFFINITY_CPU) - irq_set_status_flags(irq, IRQ_NO_BALANCING); - else - irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); - - set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq, - irq_name); - - mmr_value = 0; - entry = (struct uv_IO_APIC_route_entry *)&mmr_value; - entry->vector = cfg->vector; - entry->delivery_mode = apic->irq_delivery_mode; - entry->dest_mode = apic->irq_dest_mode; - entry->polarity = 0; - entry->trigger = 0; - entry->mask = 0; - entry->dest = apic->cpu_mask_to_apicid(eligible_cpu); - - mmr_pnode = uv_blade_to_pnode(mmr_blade); - uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); - - if (cfg->move_in_progress) - send_cleanup_vector(cfg); - - return irq; -} - -/* - * Disable the specified MMR located on the specified blade so that MSIs are - * longer allowed to be sent. - */ -static void arch_disable_uv_irq(int mmr_pnode, unsigned long mmr_offset) -{ - unsigned long mmr_value; - struct uv_IO_APIC_route_entry *entry; - - BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != - sizeof(unsigned long)); - - mmr_value = 0; - entry = (struct uv_IO_APIC_route_entry *)&mmr_value; - entry->mask = 1; - - uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); -} - -static int -uv_set_irq_affinity(struct irq_data *data, const struct cpumask *mask, - bool force) -{ - struct irq_cfg *cfg = data->chip_data; - unsigned int dest; - unsigned long mmr_value, mmr_offset; - struct uv_IO_APIC_route_entry *entry; - int mmr_pnode; - - if (__ioapic_set_affinity(data, mask, &dest)) - return -1; - - mmr_value = 0; - entry = (struct uv_IO_APIC_route_entry *)&mmr_value; - - entry->vector = cfg->vector; - entry->delivery_mode = apic->irq_delivery_mode; - entry->dest_mode = apic->irq_dest_mode; - entry->polarity = 0; - entry->trigger = 0; - entry->mask = 0; - entry->dest = dest; - - /* Get previously stored MMR and pnode of hub sourcing interrupts */ - if (uv_irq_2_mmr_info(data->irq, &mmr_offset, &mmr_pnode)) - return -1; - - uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); - - if (cfg->move_in_progress) - send_cleanup_vector(cfg); - - return 0; -} - -/* - * Set up a mapping of an available irq and vector, and enable the specified - * MMR that defines the MSI that is to be sent to the specified CPU when an - * interrupt is raised. - */ -int uv_setup_irq(char *irq_name, int cpu, int mmr_blade, - unsigned long mmr_offset, int limit) -{ - int irq, ret; - - irq = create_irq_nr(NR_IRQS_LEGACY, uv_blade_to_memory_nid(mmr_blade)); - - if (irq <= 0) - return -EBUSY; - - ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset, - limit); - if (ret == irq) - uv_set_irq_2_mmr_info(irq, mmr_offset, mmr_blade); - else - destroy_irq(irq); - - return ret; -} -EXPORT_SYMBOL_GPL(uv_setup_irq); - -/* - * Tear down a mapping of an irq and vector, and disable the specified MMR that - * defined the MSI that was to be sent to the specified CPU when an interrupt - * was raised. - * - * Set mmr_blade and mmr_offset to what was passed in on uv_setup_irq(). - */ -void uv_teardown_irq(unsigned int irq) -{ - struct uv_irq_2_mmr_pnode *e; - struct rb_node *n; - unsigned long irqflags; - - spin_lock_irqsave(&uv_irq_lock, irqflags); - n = uv_irq_root.rb_node; - while (n) { - e = rb_entry(n, struct uv_irq_2_mmr_pnode, list); - if (e->irq == irq) { - arch_disable_uv_irq(e->pnode, e->offset); - rb_erase(n, &uv_irq_root); - kfree(e); - break; - } - if (irq < e->irq) - n = n->rb_left; - else - n = n->rb_right; - } - spin_unlock_irqrestore(&uv_irq_lock, irqflags); - destroy_irq(irq); -} -EXPORT_SYMBOL_GPL(uv_teardown_irq); diff --git a/arch/x86/kernel/uv_sysfs.c b/arch/x86/kernel/uv_sysfs.c deleted file mode 100644 index 309c70fb775..00000000000 --- a/arch/x86/kernel/uv_sysfs.c +++ /dev/null @@ -1,76 +0,0 @@ -/* - * This file supports the /sys/firmware/sgi_uv interfaces for SGI UV. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved. - * Copyright (c) Russ Anderson - */ - -#include <linux/sysdev.h> -#include <asm/uv/bios.h> -#include <asm/uv/uv.h> - -struct kobject *sgi_uv_kobj; - -static ssize_t partition_id_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return snprintf(buf, PAGE_SIZE, "%ld\n", sn_partition_id); -} - -static ssize_t coherence_id_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - return snprintf(buf, PAGE_SIZE, "%ld\n", partition_coherence_id()); -} - -static struct kobj_attribute partition_id_attr = - __ATTR(partition_id, S_IRUGO, partition_id_show, NULL); - -static struct kobj_attribute coherence_id_attr = - __ATTR(coherence_id, S_IRUGO, coherence_id_show, NULL); - - -static int __init sgi_uv_sysfs_init(void) -{ - unsigned long ret; - - if (!is_uv_system()) - return -ENODEV; - - if (!sgi_uv_kobj) - sgi_uv_kobj = kobject_create_and_add("sgi_uv", firmware_kobj); - if (!sgi_uv_kobj) { - printk(KERN_WARNING "kobject_create_and_add sgi_uv failed\n"); - return -EINVAL; - } - - ret = sysfs_create_file(sgi_uv_kobj, &partition_id_attr.attr); - if (ret) { - printk(KERN_WARNING "sysfs_create_file partition_id failed\n"); - return ret; - } - - ret = sysfs_create_file(sgi_uv_kobj, &coherence_id_attr.attr); - if (ret) { - printk(KERN_WARNING "sysfs_create_file coherence_id failed\n"); - return ret; - } - - return 0; -} - -device_initcall(sgi_uv_sysfs_init); diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c deleted file mode 100644 index 56e421bc379..00000000000 --- a/arch/x86/kernel/uv_time.c +++ /dev/null @@ -1,423 +0,0 @@ -/* - * SGI RTC clock/timer routines. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - * Copyright (c) 2009 Silicon Graphics, Inc. All Rights Reserved. - * Copyright (c) Dimitri Sivanich - */ -#include <linux/clockchips.h> -#include <linux/slab.h> - -#include <asm/uv/uv_mmrs.h> -#include <asm/uv/uv_hub.h> -#include <asm/uv/bios.h> -#include <asm/uv/uv.h> -#include <asm/apic.h> -#include <asm/cpu.h> - -#define RTC_NAME "sgi_rtc" - -static cycle_t uv_read_rtc(struct clocksource *cs); -static int uv_rtc_next_event(unsigned long, struct clock_event_device *); -static void uv_rtc_timer_setup(enum clock_event_mode, - struct clock_event_device *); - -static struct clocksource clocksource_uv = { - .name = RTC_NAME, - .rating = 400, - .read = uv_read_rtc, - .mask = (cycle_t)UVH_RTC_REAL_TIME_CLOCK_MASK, - .shift = 10, - .flags = CLOCK_SOURCE_IS_CONTINUOUS, -}; - -static struct clock_event_device clock_event_device_uv = { - .name = RTC_NAME, - .features = CLOCK_EVT_FEAT_ONESHOT, - .shift = 20, - .rating = 400, - .irq = -1, - .set_next_event = uv_rtc_next_event, - .set_mode = uv_rtc_timer_setup, - .event_handler = NULL, -}; - -static DEFINE_PER_CPU(struct clock_event_device, cpu_ced); - -/* There is one of these allocated per node */ -struct uv_rtc_timer_head { - spinlock_t lock; - /* next cpu waiting for timer, local node relative: */ - int next_cpu; - /* number of cpus on this node: */ - int ncpus; - struct { - int lcpu; /* systemwide logical cpu number */ - u64 expires; /* next timer expiration for this cpu */ - } cpu[1]; -}; - -/* - * Access to uv_rtc_timer_head via blade id. - */ -static struct uv_rtc_timer_head **blade_info __read_mostly; - -static int uv_rtc_evt_enable; - -/* - * Hardware interface routines - */ - -/* Send IPIs to another node */ -static void uv_rtc_send_IPI(int cpu) -{ - unsigned long apicid, val; - int pnode; - - apicid = cpu_physical_id(cpu); - pnode = uv_apicid_to_pnode(apicid); - val = (1UL << UVH_IPI_INT_SEND_SHFT) | - (apicid << UVH_IPI_INT_APIC_ID_SHFT) | - (X86_PLATFORM_IPI_VECTOR << UVH_IPI_INT_VECTOR_SHFT); - - uv_write_global_mmr64(pnode, UVH_IPI_INT, val); -} - -/* Check for an RTC interrupt pending */ -static int uv_intr_pending(int pnode) -{ - return uv_read_global_mmr64(pnode, UVH_EVENT_OCCURRED0) & - UVH_EVENT_OCCURRED0_RTC1_MASK; -} - -/* Setup interrupt and return non-zero if early expiration occurred. */ -static int uv_setup_intr(int cpu, u64 expires) -{ - u64 val; - int pnode = uv_cpu_to_pnode(cpu); - - uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, - UVH_RTC1_INT_CONFIG_M_MASK); - uv_write_global_mmr64(pnode, UVH_INT_CMPB, -1L); - - uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS, - UVH_EVENT_OCCURRED0_RTC1_MASK); - - val = (X86_PLATFORM_IPI_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) | - ((u64)cpu_physical_id(cpu) << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT); - - /* Set configuration */ - uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, val); - /* Initialize comparator value */ - uv_write_global_mmr64(pnode, UVH_INT_CMPB, expires); - - if (uv_read_rtc(NULL) <= expires) - return 0; - - return !uv_intr_pending(pnode); -} - -/* - * Per-cpu timer tracking routines - */ - -static __init void uv_rtc_deallocate_timers(void) -{ - int bid; - - for_each_possible_blade(bid) { - kfree(blade_info[bid]); - } - kfree(blade_info); -} - -/* Allocate per-node list of cpu timer expiration times. */ -static __init int uv_rtc_allocate_timers(void) -{ - int cpu; - - blade_info = kmalloc(uv_possible_blades * sizeof(void *), GFP_KERNEL); - if (!blade_info) - return -ENOMEM; - memset(blade_info, 0, uv_possible_blades * sizeof(void *)); - - for_each_present_cpu(cpu) { - int nid = cpu_to_node(cpu); - int bid = uv_cpu_to_blade_id(cpu); - int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id; - struct uv_rtc_timer_head *head = blade_info[bid]; - - if (!head) { - head = kmalloc_node(sizeof(struct uv_rtc_timer_head) + - (uv_blade_nr_possible_cpus(bid) * - 2 * sizeof(u64)), - GFP_KERNEL, nid); - if (!head) { - uv_rtc_deallocate_timers(); - return -ENOMEM; - } - spin_lock_init(&head->lock); - head->ncpus = uv_blade_nr_possible_cpus(bid); - head->next_cpu = -1; - blade_info[bid] = head; - } - - head->cpu[bcpu].lcpu = cpu; - head->cpu[bcpu].expires = ULLONG_MAX; - } - - return 0; -} - -/* Find and set the next expiring timer. */ -static void uv_rtc_find_next_timer(struct uv_rtc_timer_head *head, int pnode) -{ - u64 lowest = ULLONG_MAX; - int c, bcpu = -1; - - head->next_cpu = -1; - for (c = 0; c < head->ncpus; c++) { - u64 exp = head->cpu[c].expires; - if (exp < lowest) { - bcpu = c; - lowest = exp; - } - } - if (bcpu >= 0) { - head->next_cpu = bcpu; - c = head->cpu[bcpu].lcpu; - if (uv_setup_intr(c, lowest)) - /* If we didn't set it up in time, trigger */ - uv_rtc_send_IPI(c); - } else { - uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, - UVH_RTC1_INT_CONFIG_M_MASK); - } -} - -/* - * Set expiration time for current cpu. - * - * Returns 1 if we missed the expiration time. - */ -static int uv_rtc_set_timer(int cpu, u64 expires) -{ - int pnode = uv_cpu_to_pnode(cpu); - int bid = uv_cpu_to_blade_id(cpu); - struct uv_rtc_timer_head *head = blade_info[bid]; - int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id; - u64 *t = &head->cpu[bcpu].expires; - unsigned long flags; - int next_cpu; - - spin_lock_irqsave(&head->lock, flags); - - next_cpu = head->next_cpu; - *t = expires; - - /* Will this one be next to go off? */ - if (next_cpu < 0 || bcpu == next_cpu || - expires < head->cpu[next_cpu].expires) { - head->next_cpu = bcpu; - if (uv_setup_intr(cpu, expires)) { - *t = ULLONG_MAX; - uv_rtc_find_next_timer(head, pnode); - spin_unlock_irqrestore(&head->lock, flags); - return -ETIME; - } - } - - spin_unlock_irqrestore(&head->lock, flags); - return 0; -} - -/* - * Unset expiration time for current cpu. - * - * Returns 1 if this timer was pending. - */ -static int uv_rtc_unset_timer(int cpu, int force) -{ - int pnode = uv_cpu_to_pnode(cpu); - int bid = uv_cpu_to_blade_id(cpu); - struct uv_rtc_timer_head *head = blade_info[bid]; - int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id; - u64 *t = &head->cpu[bcpu].expires; - unsigned long flags; - int rc = 0; - - spin_lock_irqsave(&head->lock, flags); - - if ((head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t) || force) - rc = 1; - - if (rc) { - *t = ULLONG_MAX; - /* Was the hardware setup for this timer? */ - if (head->next_cpu == bcpu) - uv_rtc_find_next_timer(head, pnode); - } - - spin_unlock_irqrestore(&head->lock, flags); - - return rc; -} - - -/* - * Kernel interface routines. - */ - -/* - * Read the RTC. - * - * Starting with HUB rev 2.0, the UV RTC register is replicated across all - * cachelines of it's own page. This allows faster simultaneous reads - * from a given socket. - */ -static cycle_t uv_read_rtc(struct clocksource *cs) -{ - unsigned long offset; - - if (uv_get_min_hub_revision_id() == 1) - offset = 0; - else - offset = (uv_blade_processor_id() * L1_CACHE_BYTES) % PAGE_SIZE; - - return (cycle_t)uv_read_local_mmr(UVH_RTC | offset); -} - -/* - * Program the next event, relative to now - */ -static int uv_rtc_next_event(unsigned long delta, - struct clock_event_device *ced) -{ - int ced_cpu = cpumask_first(ced->cpumask); - - return uv_rtc_set_timer(ced_cpu, delta + uv_read_rtc(NULL)); -} - -/* - * Setup the RTC timer in oneshot mode - */ -static void uv_rtc_timer_setup(enum clock_event_mode mode, - struct clock_event_device *evt) -{ - int ced_cpu = cpumask_first(evt->cpumask); - - switch (mode) { - case CLOCK_EVT_MODE_PERIODIC: - case CLOCK_EVT_MODE_ONESHOT: - case CLOCK_EVT_MODE_RESUME: - /* Nothing to do here yet */ - break; - case CLOCK_EVT_MODE_UNUSED: - case CLOCK_EVT_MODE_SHUTDOWN: - uv_rtc_unset_timer(ced_cpu, 1); - break; - } -} - -static void uv_rtc_interrupt(void) -{ - int cpu = smp_processor_id(); - struct clock_event_device *ced = &per_cpu(cpu_ced, cpu); - - if (!ced || !ced->event_handler) - return; - - if (uv_rtc_unset_timer(cpu, 0) != 1) - return; - - ced->event_handler(ced); -} - -static int __init uv_enable_evt_rtc(char *str) -{ - uv_rtc_evt_enable = 1; - - return 1; -} -__setup("uvrtcevt", uv_enable_evt_rtc); - -static __init void uv_rtc_register_clockevents(struct work_struct *dummy) -{ - struct clock_event_device *ced = &__get_cpu_var(cpu_ced); - - *ced = clock_event_device_uv; - ced->cpumask = cpumask_of(smp_processor_id()); - clockevents_register_device(ced); -} - -static __init int uv_rtc_setup_clock(void) -{ - int rc; - - if (!is_uv_system()) - return -ENODEV; - - clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second, - clocksource_uv.shift); - - /* If single blade, prefer tsc */ - if (uv_num_possible_blades() == 1) - clocksource_uv.rating = 250; - - rc = clocksource_register(&clocksource_uv); - if (rc) - printk(KERN_INFO "UV RTC clocksource failed rc %d\n", rc); - else - printk(KERN_INFO "UV RTC clocksource registered freq %lu MHz\n", - sn_rtc_cycles_per_second/(unsigned long)1E6); - - if (rc || !uv_rtc_evt_enable || x86_platform_ipi_callback) - return rc; - - /* Setup and register clockevents */ - rc = uv_rtc_allocate_timers(); - if (rc) - goto error; - - x86_platform_ipi_callback = uv_rtc_interrupt; - - clock_event_device_uv.mult = div_sc(sn_rtc_cycles_per_second, - NSEC_PER_SEC, clock_event_device_uv.shift); - - clock_event_device_uv.min_delta_ns = NSEC_PER_SEC / - sn_rtc_cycles_per_second; - - clock_event_device_uv.max_delta_ns = clocksource_uv.mask * - (NSEC_PER_SEC / sn_rtc_cycles_per_second); - - rc = schedule_on_each_cpu(uv_rtc_register_clockevents); - if (rc) { - x86_platform_ipi_callback = NULL; - uv_rtc_deallocate_timers(); - goto error; - } - - printk(KERN_INFO "UV RTC clockevents registered\n"); - - return 0; - -error: - clocksource_unregister(&clocksource_uv); - printk(KERN_INFO "UV RTC clockevents failed rc %d\n", rc); - - return rc; -} -arch_initcall(uv_rtc_setup_clock); diff --git a/arch/x86/kernel/verify_cpu_64.S b/arch/x86/kernel/verify_cpu.S index 56a8c2a867d..b9242bacbe5 100644 --- a/arch/x86/kernel/verify_cpu_64.S +++ b/arch/x86/kernel/verify_cpu.S @@ -7,6 +7,7 @@ * Copyright (c) 2007 Andi Kleen (ak@suse.de) * Copyright (c) 2007 Eric Biederman (ebiederm@xmission.com) * Copyright (c) 2007 Vivek Goyal (vgoyal@in.ibm.com) + * Copyright (c) 2010 Kees Cook (kees.cook@canonical.com) * * This source code is licensed under the GNU General Public License, * Version 2. See the file COPYING for more details. @@ -14,18 +15,17 @@ * This is a common code for verification whether CPU supports * long mode and SSE or not. It is not called directly instead this * file is included at various places and compiled in that context. - * Following are the current usage. + * This file is expected to run in 32bit code. Currently: * - * This file is included by both 16bit and 32bit code. + * arch/x86/boot/compressed/head_64.S: Boot cpu verification + * arch/x86/kernel/trampoline_64.S: secondary processor verification + * arch/x86/kernel/head_32.S: processor startup * - * arch/x86_64/boot/setup.S : Boot cpu verification (16bit) - * arch/x86_64/boot/compressed/head.S: Boot cpu verification (32bit) - * arch/x86_64/kernel/trampoline.S: secondary processor verfication (16bit) - * arch/x86_64/kernel/acpi/wakeup.S:Verfication at resume (16bit) - * - * verify_cpu, returns the status of cpu check in register %eax. + * verify_cpu, returns the status of longmode and SSE in register %eax. * 0: Success 1: Failure * + * On Intel, the XD_DISABLE flag will be cleared as a side-effect. + * * The caller needs to check for the error code and take the action * appropriately. Either display a message or halt. */ @@ -62,8 +62,41 @@ verify_cpu: cmpl $0x444d4163,%ecx jnz verify_cpu_noamd mov $1,%di # cpu is from AMD + jmp verify_cpu_check verify_cpu_noamd: + cmpl $0x756e6547,%ebx # GenuineIntel? + jnz verify_cpu_check + cmpl $0x49656e69,%edx + jnz verify_cpu_check + cmpl $0x6c65746e,%ecx + jnz verify_cpu_check + + # only call IA32_MISC_ENABLE when: + # family > 6 || (family == 6 && model >= 0xd) + movl $0x1, %eax # check CPU family and model + cpuid + movl %eax, %ecx + + andl $0x0ff00f00, %eax # mask family and extended family + shrl $8, %eax + cmpl $6, %eax + ja verify_cpu_clear_xd # family > 6, ok + jb verify_cpu_check # family < 6, skip + + andl $0x000f00f0, %ecx # mask model and extended model + shrl $4, %ecx + cmpl $0xd, %ecx + jb verify_cpu_check # family == 6, model < 0xd, skip + +verify_cpu_clear_xd: + movl $MSR_IA32_MISC_ENABLE, %ecx + rdmsr + btrl $2, %edx # clear MSR_IA32_MISC_ENABLE_XD_DISABLE + jnc verify_cpu_check # only write MSR if bit was changed + wrmsr + +verify_cpu_check: movl $0x1,%eax # Does the cpu have what it takes cpuid andl $REQUIRED_MASK0,%edx diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c deleted file mode 100644 index 3371bd053b8..00000000000 --- a/arch/x86/kernel/visws_quirks.c +++ /dev/null @@ -1,614 +0,0 @@ -/* - * SGI Visual Workstation support and quirks, unmaintained. - * - * Split out from setup.c by davej@suse.de - * - * Copyright (C) 1999 Bent Hagemark, Ingo Molnar - * - * SGI Visual Workstation interrupt controller - * - * The Cobalt system ASIC in the Visual Workstation contains a "Cobalt" APIC - * which serves as the main interrupt controller in the system. Non-legacy - * hardware in the system uses this controller directly. Legacy devices - * are connected to the PIIX4 which in turn has its 8259(s) connected to - * a of the Cobalt APIC entry. - * - * 09/02/2000 - Updated for 2.4 by jbarnes@sgi.com - * - * 25/11/2002 - Updated for 2.5 by Andrey Panin <pazke@orbita1.ru> - */ -#include <linux/interrupt.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/smp.h> - -#include <asm/visws/cobalt.h> -#include <asm/visws/piix4.h> -#include <asm/io_apic.h> -#include <asm/fixmap.h> -#include <asm/reboot.h> -#include <asm/setup.h> -#include <asm/apic.h> -#include <asm/e820.h> -#include <asm/time.h> -#include <asm/io.h> - -#include <linux/kernel_stat.h> - -#include <asm/i8259.h> -#include <asm/irq_vectors.h> -#include <asm/visws/lithium.h> - -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/pci.h> -#include <linux/pci_ids.h> - -extern int no_broadcast; - -char visws_board_type = -1; -char visws_board_rev = -1; - -static void __init visws_time_init(void) -{ - printk(KERN_INFO "Starting Cobalt Timer system clock\n"); - - /* Set the countdown value */ - co_cpu_write(CO_CPU_TIMEVAL, CO_TIME_HZ/HZ); - - /* Start the timer */ - co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) | CO_CTRL_TIMERUN); - - /* Enable (unmask) the timer interrupt */ - co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) & ~CO_CTRL_TIMEMASK); - - setup_default_timer_irq(); -} - -/* Replaces the default init_ISA_irqs in the generic setup */ -static void __init visws_pre_intr_init(void); - -/* Quirk for machine specific memory setup. */ - -#define MB (1024 * 1024) - -unsigned long sgivwfb_mem_phys; -unsigned long sgivwfb_mem_size; -EXPORT_SYMBOL(sgivwfb_mem_phys); -EXPORT_SYMBOL(sgivwfb_mem_size); - -long long mem_size __initdata = 0; - -static char * __init visws_memory_setup(void) -{ - long long gfx_mem_size = 8 * MB; - - mem_size = boot_params.alt_mem_k; - - if (!mem_size) { - printk(KERN_WARNING "Bootloader didn't set memory size, upgrade it !\n"); - mem_size = 128 * MB; - } - - /* - * this hardcodes the graphics memory to 8 MB - * it really should be sized dynamically (or at least - * set as a boot param) - */ - if (!sgivwfb_mem_size) { - printk(KERN_WARNING "Defaulting to 8 MB framebuffer size\n"); - sgivwfb_mem_size = 8 * MB; - } - - /* - * Trim to nearest MB - */ - sgivwfb_mem_size &= ~((1 << 20) - 1); - sgivwfb_mem_phys = mem_size - gfx_mem_size; - - e820_add_region(0, LOWMEMSIZE(), E820_RAM); - e820_add_region(HIGH_MEMORY, mem_size - sgivwfb_mem_size - HIGH_MEMORY, E820_RAM); - e820_add_region(sgivwfb_mem_phys, sgivwfb_mem_size, E820_RESERVED); - - return "PROM"; -} - -static void visws_machine_emergency_restart(void) -{ - /* - * Visual Workstations restart after this - * register is poked on the PIIX4 - */ - outb(PIIX4_RESET_VAL, PIIX4_RESET_PORT); -} - -static void visws_machine_power_off(void) -{ - unsigned short pm_status; -/* extern unsigned int pci_bus0; */ - - while ((pm_status = inw(PMSTS_PORT)) & 0x100) - outw(pm_status, PMSTS_PORT); - - outw(PM_SUSPEND_ENABLE, PMCNTRL_PORT); - - mdelay(10); - -#define PCI_CONF1_ADDRESS(bus, devfn, reg) \ - (0x80000000 | (bus << 16) | (devfn << 8) | (reg & ~3)) - -/* outl(PCI_CONF1_ADDRESS(pci_bus0, SPECIAL_DEV, SPECIAL_REG), 0xCF8); */ - outl(PIIX_SPECIAL_STOP, 0xCFC); -} - -static void __init visws_get_smp_config(unsigned int early) -{ -} - -/* - * The Visual Workstation is Intel MP compliant in the hardware - * sense, but it doesn't have a BIOS(-configuration table). - * No problem for Linux. - */ - -static void __init MP_processor_info(struct mpc_cpu *m) -{ - int ver, logical_apicid; - physid_mask_t apic_cpus; - - if (!(m->cpuflag & CPU_ENABLED)) - return; - - logical_apicid = m->apicid; - printk(KERN_INFO "%sCPU #%d %u:%u APIC version %d\n", - m->cpuflag & CPU_BOOTPROCESSOR ? "Bootup " : "", - m->apicid, (m->cpufeature & CPU_FAMILY_MASK) >> 8, - (m->cpufeature & CPU_MODEL_MASK) >> 4, m->apicver); - - if (m->cpuflag & CPU_BOOTPROCESSOR) - boot_cpu_physical_apicid = m->apicid; - - ver = m->apicver; - if ((ver >= 0x14 && m->apicid >= 0xff) || m->apicid >= 0xf) { - printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n", - m->apicid, MAX_APICS); - return; - } - - apic->apicid_to_cpu_present(m->apicid, &apic_cpus); - physids_or(phys_cpu_present_map, phys_cpu_present_map, apic_cpus); - /* - * Validate version - */ - if (ver == 0x0) { - printk(KERN_ERR "BIOS bug, APIC version is 0 for CPU#%d! " - "fixing up to 0x10. (tell your hw vendor)\n", - m->apicid); - ver = 0x10; - } - apic_version[m->apicid] = ver; -} - -static void __init visws_find_smp_config(void) -{ - struct mpc_cpu *mp = phys_to_virt(CO_CPU_TAB_PHYS); - unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS)); - - if (ncpus > CO_CPU_MAX) { - printk(KERN_WARNING "find_visws_smp: got cpu count of %d at %p\n", - ncpus, mp); - - ncpus = CO_CPU_MAX; - } - - if (ncpus > setup_max_cpus) - ncpus = setup_max_cpus; - -#ifdef CONFIG_X86_LOCAL_APIC - smp_found_config = 1; -#endif - while (ncpus--) - MP_processor_info(mp++); - - mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; -} - -static void visws_trap_init(void); - -void __init visws_early_detect(void) -{ - int raw; - - visws_board_type = (char)(inb_p(PIIX_GPI_BD_REG) & PIIX_GPI_BD_REG) - >> PIIX_GPI_BD_SHIFT; - - if (visws_board_type < 0) - return; - - /* - * Override the default platform setup functions - */ - x86_init.resources.memory_setup = visws_memory_setup; - x86_init.mpparse.get_smp_config = visws_get_smp_config; - x86_init.mpparse.find_smp_config = visws_find_smp_config; - x86_init.irqs.pre_vector_init = visws_pre_intr_init; - x86_init.irqs.trap_init = visws_trap_init; - x86_init.timers.timer_init = visws_time_init; - x86_init.pci.init = pci_visws_init; - x86_init.pci.init_irq = x86_init_noop; - - /* - * Install reboot quirks: - */ - pm_power_off = visws_machine_power_off; - machine_ops.emergency_restart = visws_machine_emergency_restart; - - /* - * Do not use broadcast IPIs: - */ - no_broadcast = 0; - -#ifdef CONFIG_X86_IO_APIC - /* - * Turn off IO-APIC detection and initialization: - */ - skip_ioapic_setup = 1; -#endif - - /* - * Get Board rev. - * First, we have to initialize the 307 part to allow us access - * to the GPIO registers. Let's map them at 0x0fc0 which is right - * after the PIIX4 PM section. - */ - outb_p(SIO_DEV_SEL, SIO_INDEX); - outb_p(SIO_GP_DEV, SIO_DATA); /* Talk to GPIO regs. */ - - outb_p(SIO_DEV_MSB, SIO_INDEX); - outb_p(SIO_GP_MSB, SIO_DATA); /* MSB of GPIO base address */ - - outb_p(SIO_DEV_LSB, SIO_INDEX); - outb_p(SIO_GP_LSB, SIO_DATA); /* LSB of GPIO base address */ - - outb_p(SIO_DEV_ENB, SIO_INDEX); - outb_p(1, SIO_DATA); /* Enable GPIO registers. */ - - /* - * Now, we have to map the power management section to write - * a bit which enables access to the GPIO registers. - * What lunatic came up with this shit? - */ - outb_p(SIO_DEV_SEL, SIO_INDEX); - outb_p(SIO_PM_DEV, SIO_DATA); /* Talk to GPIO regs. */ - - outb_p(SIO_DEV_MSB, SIO_INDEX); - outb_p(SIO_PM_MSB, SIO_DATA); /* MSB of PM base address */ - - outb_p(SIO_DEV_LSB, SIO_INDEX); - outb_p(SIO_PM_LSB, SIO_DATA); /* LSB of PM base address */ - - outb_p(SIO_DEV_ENB, SIO_INDEX); - outb_p(1, SIO_DATA); /* Enable PM registers. */ - - /* - * Now, write the PM register which enables the GPIO registers. - */ - outb_p(SIO_PM_FER2, SIO_PM_INDEX); - outb_p(SIO_PM_GP_EN, SIO_PM_DATA); - - /* - * Now, initialize the GPIO registers. - * We want them all to be inputs which is the - * power on default, so let's leave them alone. - * So, let's just read the board rev! - */ - raw = inb_p(SIO_GP_DATA1); - raw &= 0x7f; /* 7 bits of valid board revision ID. */ - - if (visws_board_type == VISWS_320) { - if (raw < 0x6) { - visws_board_rev = 4; - } else if (raw < 0xc) { - visws_board_rev = 5; - } else { - visws_board_rev = 6; - } - } else if (visws_board_type == VISWS_540) { - visws_board_rev = 2; - } else { - visws_board_rev = raw; - } - - printk(KERN_INFO "Silicon Graphics Visual Workstation %s (rev %d) detected\n", - (visws_board_type == VISWS_320 ? "320" : - (visws_board_type == VISWS_540 ? "540" : - "unknown")), visws_board_rev); -} - -#define A01234 (LI_INTA_0 | LI_INTA_1 | LI_INTA_2 | LI_INTA_3 | LI_INTA_4) -#define BCD (LI_INTB | LI_INTC | LI_INTD) -#define ALLDEVS (A01234 | BCD) - -static __init void lithium_init(void) -{ - set_fixmap(FIX_LI_PCIA, LI_PCI_A_PHYS); - set_fixmap(FIX_LI_PCIB, LI_PCI_B_PHYS); - - if ((li_pcia_read16(PCI_VENDOR_ID) != PCI_VENDOR_ID_SGI) || - (li_pcia_read16(PCI_DEVICE_ID) != PCI_DEVICE_ID_SGI_LITHIUM)) { - printk(KERN_EMERG "Lithium hostbridge %c not found\n", 'A'); -/* panic("This machine is not SGI Visual Workstation 320/540"); */ - } - - if ((li_pcib_read16(PCI_VENDOR_ID) != PCI_VENDOR_ID_SGI) || - (li_pcib_read16(PCI_DEVICE_ID) != PCI_DEVICE_ID_SGI_LITHIUM)) { - printk(KERN_EMERG "Lithium hostbridge %c not found\n", 'B'); -/* panic("This machine is not SGI Visual Workstation 320/540"); */ - } - - li_pcia_write16(LI_PCI_INTEN, ALLDEVS); - li_pcib_write16(LI_PCI_INTEN, ALLDEVS); -} - -static __init void cobalt_init(void) -{ - /* - * On normal SMP PC this is used only with SMP, but we have to - * use it and set it up here to start the Cobalt clock - */ - set_fixmap(FIX_APIC_BASE, APIC_DEFAULT_PHYS_BASE); - setup_local_APIC(); - printk(KERN_INFO "Local APIC Version %#x, ID %#x\n", - (unsigned int)apic_read(APIC_LVR), - (unsigned int)apic_read(APIC_ID)); - - set_fixmap(FIX_CO_CPU, CO_CPU_PHYS); - set_fixmap(FIX_CO_APIC, CO_APIC_PHYS); - printk(KERN_INFO "Cobalt Revision %#lx, APIC ID %#lx\n", - co_cpu_read(CO_CPU_REV), co_apic_read(CO_APIC_ID)); - - /* Enable Cobalt APIC being careful to NOT change the ID! */ - co_apic_write(CO_APIC_ID, co_apic_read(CO_APIC_ID) | CO_APIC_ENABLE); - - printk(KERN_INFO "Cobalt APIC enabled: ID reg %#lx\n", - co_apic_read(CO_APIC_ID)); -} - -static void __init visws_trap_init(void) -{ - lithium_init(); - cobalt_init(); -} - -/* - * IRQ controller / APIC support: - */ - -static DEFINE_SPINLOCK(cobalt_lock); - -/* - * Set the given Cobalt APIC Redirection Table entry to point - * to the given IDT vector/index. - */ -static inline void co_apic_set(int entry, int irq) -{ - co_apic_write(CO_APIC_LO(entry), CO_APIC_LEVEL | (irq + FIRST_EXTERNAL_VECTOR)); - co_apic_write(CO_APIC_HI(entry), 0); -} - -/* - * Cobalt (IO)-APIC functions to handle PCI devices. - */ -static inline int co_apic_ide0_hack(void) -{ - extern char visws_board_type; - extern char visws_board_rev; - - if (visws_board_type == VISWS_320 && visws_board_rev == 5) - return 5; - return CO_APIC_IDE0; -} - -static int is_co_apic(unsigned int irq) -{ - if (IS_CO_APIC(irq)) - return CO_APIC(irq); - - switch (irq) { - case 0: return CO_APIC_CPU; - case CO_IRQ_IDE0: return co_apic_ide0_hack(); - case CO_IRQ_IDE1: return CO_APIC_IDE1; - default: return -1; - } -} - - -/* - * This is the SGI Cobalt (IO-)APIC: - */ -static void enable_cobalt_irq(struct irq_data *data) -{ - co_apic_set(is_co_apic(data->irq), data->irq); -} - -static void disable_cobalt_irq(struct irq_data *data) -{ - int entry = is_co_apic(data->irq); - - co_apic_write(CO_APIC_LO(entry), CO_APIC_MASK); - co_apic_read(CO_APIC_LO(entry)); -} - -static void ack_cobalt_irq(struct irq_data *data) -{ - unsigned long flags; - - spin_lock_irqsave(&cobalt_lock, flags); - disable_cobalt_irq(data); - apic_write(APIC_EOI, APIC_EIO_ACK); - spin_unlock_irqrestore(&cobalt_lock, flags); -} - -static struct irq_chip cobalt_irq_type = { - .name = "Cobalt-APIC", - .irq_enable = enable_cobalt_irq, - .irq_disable = disable_cobalt_irq, - .irq_ack = ack_cobalt_irq, -}; - - -/* - * This is the PIIX4-based 8259 that is wired up indirectly to Cobalt - * -- not the manner expected by the code in i8259.c. - * - * there is a 'master' physical interrupt source that gets sent to - * the CPU. But in the chipset there are various 'virtual' interrupts - * waiting to be handled. We represent this to Linux through a 'master' - * interrupt controller type, and through a special virtual interrupt- - * controller. Device drivers only see the virtual interrupt sources. - */ -static unsigned int startup_piix4_master_irq(struct irq_data *data) -{ - legacy_pic->init(0); - enable_cobalt_irq(data); -} - -static void end_piix4_master_irq(struct irq_data *data) -{ - unsigned long flags; - - spin_lock_irqsave(&cobalt_lock, flags); - enable_cobalt_irq(data); - spin_unlock_irqrestore(&cobalt_lock, flags); -} - -static struct irq_chip piix4_master_irq_type = { - .name = "PIIX4-master", - .irq_startup = startup_piix4_master_irq, - .irq_ack = ack_cobalt_irq, -}; - -static void pii4_mask(struct irq_data *data) { } - -static struct irq_chip piix4_virtual_irq_type = { - .name = "PIIX4-virtual", - .mask = pii4_mask, -}; - -/* - * PIIX4-8259 master/virtual functions to handle interrupt requests - * from legacy devices: floppy, parallel, serial, rtc. - * - * None of these get Cobalt APIC entries, neither do they have IDT - * entries. These interrupts are purely virtual and distributed from - * the 'master' interrupt source: CO_IRQ_8259. - * - * When the 8259 interrupts its handler figures out which of these - * devices is interrupting and dispatches to its handler. - * - * CAREFUL: devices see the 'virtual' interrupt only. Thus disable/ - * enable_irq gets the right irq. This 'master' irq is never directly - * manipulated by any driver. - */ -static irqreturn_t piix4_master_intr(int irq, void *dev_id) -{ - unsigned long flags; - int realirq; - - raw_spin_lock_irqsave(&i8259A_lock, flags); - - /* Find out what's interrupting in the PIIX4 master 8259 */ - outb(0x0c, 0x20); /* OCW3 Poll command */ - realirq = inb(0x20); - - /* - * Bit 7 == 0 means invalid/spurious - */ - if (unlikely(!(realirq & 0x80))) - goto out_unlock; - - realirq &= 7; - - if (unlikely(realirq == 2)) { - outb(0x0c, 0xa0); - realirq = inb(0xa0); - - if (unlikely(!(realirq & 0x80))) - goto out_unlock; - - realirq = (realirq & 7) + 8; - } - - /* mask and ack interrupt */ - cached_irq_mask |= 1 << realirq; - if (unlikely(realirq > 7)) { - inb(0xa1); - outb(cached_slave_mask, 0xa1); - outb(0x60 + (realirq & 7), 0xa0); - outb(0x60 + 2, 0x20); - } else { - inb(0x21); - outb(cached_master_mask, 0x21); - outb(0x60 + realirq, 0x20); - } - - raw_spin_unlock_irqrestore(&i8259A_lock, flags); - - /* - * handle this 'virtual interrupt' as a Cobalt one now. - */ - generic_handle_irq(realirq); - - return IRQ_HANDLED; - -out_unlock: - raw_spin_unlock_irqrestore(&i8259A_lock, flags); - return IRQ_NONE; -} - -static struct irqaction master_action = { - .handler = piix4_master_intr, - .name = "PIIX4-8259", -}; - -static struct irqaction cascade_action = { - .handler = no_action, - .name = "cascade", -}; - -static inline void set_piix4_virtual_irq_type(void) -{ - piix4_virtual_irq_type.enable = i8259A_chip.unmask; - piix4_virtual_irq_type.disable = i8259A_chip.mask; - piix4_virtual_irq_type.unmask = i8259A_chip.unmask; -} - -static void __init visws_pre_intr_init(void) -{ - int i; - - set_piix4_virtual_irq_type(); - - for (i = 0; i < CO_IRQ_APIC0 + CO_APIC_LAST + 1; i++) { - struct irq_chip *chip = NULL; - - if (i == 0) - chip = &cobalt_irq_type; - else if (i == CO_IRQ_IDE0) - chip = &cobalt_irq_type; - else if (i == CO_IRQ_IDE1) - >chip = &cobalt_irq_type; - else if (i == CO_IRQ_8259) - chip = &piix4_master_irq_type; - else if (i < CO_IRQ_APIC0) - chip = &piix4_virtual_irq_type; - else if (IS_CO_APIC(i)) - chip = &cobalt_irq_type; - - if (chip) - set_irq_chip(i, chip); - } - - setup_irq(CO_IRQ_8259, &master_action); - setup_irq(2, &cascade_action); -} diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 61fb9851962..863f8753ab0 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -179,6 +179,7 @@ static void mark_screen_rdonly(struct mm_struct *mm) if (pud_none_or_clear_bad(pud)) goto out; pmd = pmd_offset(pud, 0xA0000); + split_huge_page_pmd(mm, pmd); if (pmd_none_or_clear_bad(pmd)) goto out; pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl); diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index e03530aebfd..624a2016198 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -69,7 +69,7 @@ jiffies_64 = jiffies; PHDRS { text PT_LOAD FLAGS(5); /* R_E */ - data PT_LOAD FLAGS(7); /* RWE */ + data PT_LOAD FLAGS(6); /* RW_ */ #ifdef CONFIG_X86_64 user PT_LOAD FLAGS(5); /* R_E */ #ifdef CONFIG_SMP @@ -105,6 +105,7 @@ SECTIONS SCHED_TEXT LOCK_TEXT KPROBES_TEXT + ENTRY_TEXT IRQENTRY_TEXT *(.fixup) *(.gnu.warning) @@ -116,6 +117,10 @@ SECTIONS EXCEPTION_TABLE(16) :text = 0x9090 +#if defined(CONFIG_DEBUG_RODATA) + /* .text should occupy whole number of pages */ + . = ALIGN(PAGE_SIZE); +#endif X64_ALIGN_DEBUG_RODATA_BEGIN RO_DATA(PAGE_SIZE) X64_ALIGN_DEBUG_RODATA_END @@ -226,7 +231,7 @@ SECTIONS * output PHDR, so the next output section - .init.text - should * start another segment - init. */ - PERCPU_VADDR(0, :percpu) + PERCPU_VADDR(INTERNODE_CACHE_BYTES, 0, :percpu) #endif INIT_TEXT_SECTION(PAGE_SIZE) @@ -236,6 +241,18 @@ SECTIONS INIT_DATA_SECTION(16) + /* + * Code and data for a variety of lowlevel trampolines, to be + * copied into base memory (< 1 MiB) during initialization. + * Since it is copied early, the main copy can be discarded + * afterwards. + */ + .x86_trampoline : AT(ADDR(.x86_trampoline) - LOAD_OFFSET) { + x86_trampoline_start = .; + *(.x86_trampoline) + x86_trampoline_end = .; + } + .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { __x86_cpu_dev_start = .; *(.x86_cpu_dev.init) @@ -287,6 +304,7 @@ SECTIONS *(.iommu_table) __iommu_table_end = .; } + . = ALIGN(8); /* * .exit.text is discard at runtime, not link time, to deal with @@ -301,7 +319,7 @@ SECTIONS } #if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP) - PERCPU(THREAD_SIZE) + PERCPU(INTERNODE_CACHE_BYTES, PAGE_SIZE) #endif . = ALIGN(PAGE_SIZE); @@ -335,7 +353,7 @@ SECTIONS __bss_start = .; *(.bss..page_aligned) *(.bss) - . = ALIGN(4); + . = ALIGN(PAGE_SIZE); __bss_stop = .; } diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index 1b950d151e5..9796c2f3d07 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -52,6 +52,7 @@ extern void *__memcpy(void *, const void *, __kernel_size_t); EXPORT_SYMBOL(memset); EXPORT_SYMBOL(memcpy); EXPORT_SYMBOL(__memcpy); +EXPORT_SYMBOL(memmove); EXPORT_SYMBOL(empty_zero_page); #ifndef CONFIG_PARAVIRT diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index cd6da6bf3ec..6f164bd5e14 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -6,10 +6,12 @@ #include <linux/init.h> #include <linux/ioport.h> #include <linux/module.h> +#include <linux/pci.h> #include <asm/bios_ebda.h> #include <asm/paravirt.h> #include <asm/pci_x86.h> +#include <asm/pci.h> #include <asm/mpspec.h> #include <asm/setup.h> #include <asm/apic.h> @@ -33,7 +35,7 @@ void iommu_shutdown_noop(void) { } struct x86_init_ops x86_init __initdata = { .resources = { - .probe_roms = x86_init_noop, + .probe_roms = probe_roms, .reserve_resources = reserve_standard_io_resources, .memory_setup = default_machine_specific_memory_setup, }, @@ -59,6 +61,10 @@ struct x86_init_ops x86_init __initdata = { .banner = default_banner, }, + .mapping = { + .pagetable_reserve = native_pagetable_reserve, + }, + .paging = { .pagetable_setup_start = native_pagetable_setup_start, .pagetable_setup_done = native_pagetable_setup_done, @@ -68,6 +74,7 @@ struct x86_init_ops x86_init __initdata = { .setup_percpu_clockev = setup_boot_APIC_clock, .tsc_pre_init = x86_init_noop, .timer_init = hpet_time_init, + .wallclock_init = x86_init_noop, }, .iommu = { @@ -99,3 +106,8 @@ struct x86_platform_ops x86_platform = { }; EXPORT_SYMBOL_GPL(x86_platform); +struct x86_msi_ops x86_msi = { + .setup_msi_irqs = native_setup_msi_irqs, + .teardown_msi_irq = native_teardown_msi_irq, + .teardown_msi_irqs = default_teardown_msi_irqs, +}; diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 9c253bd65e2..a3911343976 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -53,7 +53,7 @@ void __sanitize_i387_state(struct task_struct *tsk) /* * None of the feature bits are in init state. So nothing else - * to do for us, as the memory layout is upto date. + * to do for us, as the memory layout is up to date. */ if ((xstate_bv & pcntxt_mask) == pcntxt_mask) return; @@ -394,7 +394,8 @@ static void __init setup_xstate_init(void) * Setup init_xstate_buf to represent the init state of * all the features managed by the xsave */ - init_xstate_buf = alloc_bootmem(xstate_size); + init_xstate_buf = alloc_bootmem_align(xstate_size, + __alignof__(struct xsave_struct)); init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT; clts(); |
