Diffstat (limited to 'arch/x86')
120 files changed, 838 insertions, 574 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 93936de6779..9458685902b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -662,7 +662,7 @@ config GART_IOMMU bool "GART IOMMU support" if EMBEDDED default y select SWIOTLB - depends on X86_64 && PCI + depends on X86_64 && PCI && K8_NB ---help--- Support for full DMA access of devices with 32bit memory access only on systems with more than 3GB. This is usually needed for USB, @@ -1216,8 +1216,8 @@ config NUMA_EMU config NODES_SHIFT int "Maximum NUMA Nodes (as a power of 2)" if !MAXSMP - range 1 9 - default "9" if MAXSMP + range 1 10 + default "10" if MAXSMP default "6" if X86_64 default "4" if X86_NUMAQ default "3" @@ -2061,7 +2061,7 @@ endif # X86_32 config K8_NB def_bool y - depends on AGP_AMD64 || (X86_64 && (GART_IOMMU || (PCI && NUMA))) + depends on CPU_SUP_AMD && PCI source "drivers/pcmcia/Kconfig" diff --git a/arch/x86/crypto/fpu.c b/arch/x86/crypto/fpu.c index daef6cd2b45..1a8f8649c03 100644 --- a/arch/x86/crypto/fpu.c +++ b/arch/x86/crypto/fpu.c @@ -16,6 +16,7 @@ #include <linux/init.h> #include <linux/kernel.h> #include <linux/module.h> +#include <linux/slab.h> #include <asm/i387.h> struct crypto_fpu_ctx { diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index 280c019cfad..0350311906a 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -21,7 +21,6 @@ #include <linux/fcntl.h> #include <linux/ptrace.h> #include <linux/user.h> -#include <linux/slab.h> #include <linux/binfmts.h> #include <linux/personality.h> #include <linux/init.h> diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 59b4556a5b9..e790bc1fbfa 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -626,7 +626,7 @@ ia32_sys_call_table: .quad stub32_sigreturn .quad stub32_clone /* 120 */ .quad sys_setdomainname - .quad sys_uname + .quad sys_newuname .quad sys_modify_ldt .quad compat_sys_adjtimex .quad sys32_mprotect /* 125 */ diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index 74c35431b7d..626be156d88 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -40,6 +40,7 @@ #include <linux/ptrace.h> #include <linux/highuid.h> #include <linux/sysctl.h> +#include <linux/slab.h> #include <asm/mman.h> #include <asm/types.h> #include <asm/uaccess.h> diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h index ba19ad4c47d..86a0ff0aeac 100644 --- a/arch/x86/include/asm/amd_iommu_types.h +++ b/arch/x86/include/asm/amd_iommu_types.h @@ -21,6 +21,7 @@ #define _ASM_X86_AMD_IOMMU_TYPES_H #include <linux/types.h> +#include <linux/mutex.h> #include <linux/list.h> #include <linux/spinlock.h> @@ -140,6 +141,7 @@ /* constants to configure the command buffer */ #define CMD_BUFFER_SIZE 8192 +#define CMD_BUFFER_UNINITIALIZED 1 #define CMD_BUFFER_ENTRIES 512 #define MMIO_CMD_SIZE_SHIFT 56 #define MMIO_CMD_SIZE_512 (0x9ULL << MMIO_CMD_SIZE_SHIFT) @@ -237,6 +239,7 @@ struct protection_domain { struct list_head list; /* for list of all protection domains */ struct list_head dev_list; /* List of all devices in this domain */ spinlock_t lock; /* mostly used to lock the page table*/ + struct mutex api_lock; /* protect page tables in the iommu-api path */ u16 id; /* the domain id written to the device table */ int mode; /* paging mode (0-6 levels) */ u64 *pt_root; /* page table root pointer */ diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 635f03bb499..d07b44f7d1d 100644 --- a/arch/x86/include/asm/fixmap.h 
+++ b/arch/x86/include/asm/fixmap.h @@ -82,6 +82,9 @@ enum fixed_addresses { #endif FIX_DBGP_BASE, FIX_EARLYCON_MEM_BASE, +#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT + FIX_OHCI1394_BASE, +#endif #ifdef CONFIG_X86_LOCAL_APIC FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */ #endif @@ -132,9 +135,6 @@ enum fixed_addresses { (__end_of_permanent_fixed_addresses & (TOTAL_FIX_BTMAPS - 1)) : __end_of_permanent_fixed_addresses, FIX_BTMAP_BEGIN = FIX_BTMAP_END + TOTAL_FIX_BTMAPS - 1, -#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT - FIX_OHCI1394_BASE, -#endif #ifdef CONFIG_X86_32 FIX_WP_TEST, #endif diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h index 0675a7c4c20..2a1bd8f4f23 100644 --- a/arch/x86/include/asm/hw_breakpoint.h +++ b/arch/x86/include/asm/hw_breakpoint.h @@ -10,7 +10,6 @@ * (display/resolving) */ struct arch_hw_breakpoint { - char *name; /* Contains name of the symbol to set bkpt */ unsigned long address; u8 len; u8 type; diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index a929c9ede33..46c0fe05f23 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -133,6 +133,7 @@ extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void); typedef int vector_irq_t[NR_VECTORS]; DECLARE_PER_CPU(vector_irq_t, vector_irq); +extern void setup_vector_irq(int cpu); #ifdef CONFIG_X86_IO_APIC extern void lock_vector_lock(void); diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h index ba0eed8aa1a..b60f2924c41 100644 --- a/arch/x86/include/asm/lguest_hcall.h +++ b/arch/x86/include/asm/lguest_hcall.h @@ -28,22 +28,39 @@ #ifndef __ASSEMBLY__ #include <asm/hw_irq.h> -#include <asm/kvm_para.h> /*G:030 * But first, how does our Guest contact the Host to ask for privileged * operations? There are two ways: the direct way is to make a "hypercall", * to make requests of the Host Itself. * - * We use the KVM hypercall mechanism, though completely different hypercall - * numbers. Seventeen hypercalls are available: the hypercall number is put in - * the %eax register, and the arguments (when required) are placed in %ebx, - * %ecx, %edx and %esi. If a return value makes sense, it's returned in %eax. + * Our hypercall mechanism uses the highest unused trap code (traps 32 and + * above are used by real hardware interrupts). Seventeen hypercalls are + * available: the hypercall number is put in the %eax register, and the + * arguments (when required) are placed in %ebx, %ecx, %edx and %esi. + * If a return value makes sense, it's returned in %eax. * * Grossly invalid calls result in Sudden Death at the hands of the vengeful * Host, rather than returning failure. This reflects Winston Churchill's * definition of a gentleman: "someone who is only rude intentionally". -:*/ + */ +static inline unsigned long +hcall(unsigned long call, + unsigned long arg1, unsigned long arg2, unsigned long arg3, + unsigned long arg4) +{ + /* "int" is the Intel instruction to trigger a trap. */ + asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY) + /* The call in %eax (aka "a") might be overwritten */ + : "=a"(call) + /* The arguments are in %eax, %ebx, %ecx, %edx & %esi */ + : "a"(call), "b"(arg1), "c"(arg2), "d"(arg3), "S"(arg4) + /* "memory" means this might write somewhere in memory. + * This isn't true for all calls, but it's safe to tell + * gcc that it might happen so it doesn't get clever. 
*/ + : "memory"); + return call; +} /* Can't use our min() macro here: needs to be a constant */ #define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 1cd58cdbc03..4604e6a54d3 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -105,6 +105,8 @@ #define MSR_AMD64_PATCH_LEVEL 0x0000008b #define MSR_AMD64_NB_CFG 0xc001001f #define MSR_AMD64_PATCH_LOADER 0xc0010020 +#define MSR_AMD64_OSVW_ID_LENGTH 0xc0010140 +#define MSR_AMD64_OSVW_STATUS 0xc0010141 #define MSR_AMD64_IBSFETCHCTL 0xc0011030 #define MSR_AMD64_IBSFETCHLINAD 0xc0011031 #define MSR_AMD64_IBSFETCHPHYSAD 0xc0011032 diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index befd172c82a..db6109a885a 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -18,7 +18,7 @@ #define MSR_ARCH_PERFMON_EVENTSEL0 0x186 #define MSR_ARCH_PERFMON_EVENTSEL1 0x187 -#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22) +#define ARCH_PERFMON_EVENTSEL_ENABLE (1 << 22) #define ARCH_PERFMON_EVENTSEL_ANY (1 << 21) #define ARCH_PERFMON_EVENTSEL_INT (1 << 20) #define ARCH_PERFMON_EVENTSEL_OS (1 << 17) @@ -50,7 +50,7 @@ INTEL_ARCH_INV_MASK| \ INTEL_ARCH_EDGE_MASK|\ INTEL_ARCH_UNIT_MASK|\ - INTEL_ARCH_EVTSEL_MASK) + INTEL_ARCH_EVENT_MASK) #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8) @@ -117,6 +117,18 @@ union cpuid10_edx { */ #define X86_PMC_IDX_FIXED_BTS (X86_PMC_IDX_FIXED + 16) +/* IbsFetchCtl bits/masks */ +#define IBS_FETCH_RAND_EN (1ULL<<57) +#define IBS_FETCH_VAL (1ULL<<49) +#define IBS_FETCH_ENABLE (1ULL<<48) +#define IBS_FETCH_CNT 0xFFFF0000ULL +#define IBS_FETCH_MAX_CNT 0x0000FFFFULL + +/* IbsOpCtl bits */ +#define IBS_OP_CNT_CTL (1ULL<<19) +#define IBS_OP_VAL (1ULL<<18) +#define IBS_OP_ENABLE (1ULL<<17) +#define IBS_OP_MAX_CNT 0x0000FFFFULL #ifdef CONFIG_PERF_EVENTS extern void init_hw_perf_events(void); diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index 47339a1ac7b..2984a25ff38 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h @@ -19,7 +19,6 @@ #include <asm/paravirt.h> #include <linux/bitops.h> -#include <linux/slab.h> #include <linux/list.h> #include <linux/spinlock.h> diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index a54d714545f..cd40aba6aa9 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -31,6 +31,7 @@ #include <linux/module.h> #include <linux/dmi.h> #include <linux/irq.h> +#include <linux/slab.h> #include <linux/bootmem.h> #include <linux/ioport.h> #include <linux/pci.h> @@ -490,6 +491,7 @@ int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) * ACPI based hotplug support for CPU */ #ifdef CONFIG_ACPI_HOTPLUG_CPU +#include <acpi/processor.h> static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) { @@ -567,6 +569,8 @@ static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu) goto free_new_map; } + acpi_processor_set_pdc(handle); + cpu = cpumask_first(new_map); acpi_map_cpu2node(handle, cpu, physid); @@ -1293,23 +1297,6 @@ static int __init dmi_disable_acpi(const struct dmi_system_id *d) } /* - * Limit ACPI to CPU enumeration for HT - */ -static int __init force_acpi_ht(const struct dmi_system_id *d) -{ - if (!acpi_force) { - printk(KERN_NOTICE "%s detected: force use of acpi=ht\n", - d->ident); - disable_acpi(); - acpi_ht = 1; 
- } else { - printk(KERN_NOTICE - "Warning: acpi=force overrules DMI blacklist: acpi=ht\n"); - } - return 0; -} - -/* * Force ignoring BIOS IRQ0 pin2 override */ static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d) @@ -1345,82 +1332,6 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = { }, /* - * Boxes that need acpi=ht - */ - { - .callback = force_acpi_ht, - .ident = "FSC Primergy T850", - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU SIEMENS"), - DMI_MATCH(DMI_PRODUCT_NAME, "PRIMERGY T850"), - }, - }, - { - .callback = force_acpi_ht, - .ident = "HP VISUALIZE NT Workstation", - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "Hewlett-Packard"), - DMI_MATCH(DMI_PRODUCT_NAME, "HP VISUALIZE NT Workstation"), - }, - }, - { - .callback = force_acpi_ht, - .ident = "Compaq Workstation W8000", - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "Compaq"), - DMI_MATCH(DMI_PRODUCT_NAME, "Workstation W8000"), - }, - }, - { - .callback = force_acpi_ht, - .ident = "ASUS CUR-DLS", - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), - DMI_MATCH(DMI_BOARD_NAME, "CUR-DLS"), - }, - }, - { - .callback = force_acpi_ht, - .ident = "ABIT i440BX-W83977", - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "ABIT <http://www.abit.com>"), - DMI_MATCH(DMI_BOARD_NAME, "i440BX-W83977 (BP6)"), - }, - }, - { - .callback = force_acpi_ht, - .ident = "IBM Bladecenter", - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), - DMI_MATCH(DMI_BOARD_NAME, "IBM eServer BladeCenter HS20"), - }, - }, - { - .callback = force_acpi_ht, - .ident = "IBM eServer xSeries 360", - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), - DMI_MATCH(DMI_BOARD_NAME, "eServer xSeries 360"), - }, - }, - { - .callback = force_acpi_ht, - .ident = "IBM eserver xSeries 330", - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), - DMI_MATCH(DMI_BOARD_NAME, "eserver xSeries 330"), - }, - }, - { - .callback = force_acpi_ht, - .ident = "IBM eserver xSeries 440", - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), - DMI_MATCH(DMI_PRODUCT_NAME, "eserver xSeries 440"), - }, - }, - - /* * Boxes that need ACPI PCI IRQ routing disabled */ { @@ -1652,8 +1563,10 @@ static int __init parse_acpi(char *arg) } /* Limit ACPI just to boot-time to enable HT */ else if (strcmp(arg, "ht") == 0) { - if (!acpi_force) + if (!acpi_force) { + printk(KERN_WARNING "acpi=ht will be removed in Linux-2.6.35\n"); disable_acpi(); + } acpi_ht = 1; } /* acpi=rsdt use RSDT instead of XSDT */ diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 3a4bf35c179..1a160d5d44d 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -8,6 +8,7 @@ #include <linux/vmalloc.h> #include <linux/memory.h> #include <linux/stop_machine.h> +#include <linux/slab.h> #include <asm/alternative.h> #include <asm/sections.h> #include <asm/pgtable.h> diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index adb0ba02570..f854d89b7ed 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -18,8 +18,8 @@ */ #include <linux/pci.h> -#include <linux/gfp.h> #include <linux/bitmap.h> +#include <linux/slab.h> #include <linux/debugfs.h> #include <linux/scatterlist.h> #include <linux/dma-mapping.h> @@ -118,7 +118,7 @@ static bool check_device(struct device *dev) return false; /* No device or no PCI device */ - if (!dev || dev->bus != &pci_bus_type) + if (dev->bus != &pci_bus_type) return false; devid = get_device_id(dev); @@ -392,6 +392,7 @@ static int __iommu_queue_command(struct amd_iommu 
*iommu, struct iommu_cmd *cmd) u32 tail, head; u8 *target; + WARN_ON(iommu->cmd_buf_size & CMD_BUFFER_UNINITIALIZED); tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); target = iommu->cmd_buf + tail; memcpy_toio(target, cmd, sizeof(*cmd)); @@ -2186,7 +2187,7 @@ static void prealloc_protection_domains(void) struct dma_ops_domain *dma_dom; u16 devid; - while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { + for_each_pci_dev(dev) { /* Do we handle this device? */ if (!check_device(&dev->dev)) @@ -2298,7 +2299,7 @@ static void cleanup_domain(struct protection_domain *domain) list_for_each_entry_safe(dev_data, next, &domain->dev_list, list) { struct device *dev = dev_data->dev; - do_detach(dev); + __detach_device(dev); atomic_set(&dev_data->bind, 0); } @@ -2327,6 +2328,7 @@ static struct protection_domain *protection_domain_alloc(void) return NULL; spin_lock_init(&domain->lock); + mutex_init(&domain->api_lock); domain->id = domain_id_alloc(); if (!domain->id) goto out_err; @@ -2379,9 +2381,7 @@ static void amd_iommu_domain_destroy(struct iommu_domain *dom) free_pagetable(domain); - domain_id_free(domain->id); - - kfree(domain); + protection_domain_free(domain); dom->priv = NULL; } @@ -2456,6 +2456,8 @@ static int amd_iommu_map_range(struct iommu_domain *dom, iova &= PAGE_MASK; paddr &= PAGE_MASK; + mutex_lock(&domain->api_lock); + for (i = 0; i < npages; ++i) { ret = iommu_map_page(domain, iova, paddr, prot, PM_MAP_4k); if (ret) @@ -2465,6 +2467,8 @@ static int amd_iommu_map_range(struct iommu_domain *dom, paddr += PAGE_SIZE; } + mutex_unlock(&domain->api_lock); + return 0; } @@ -2477,12 +2481,16 @@ static void amd_iommu_unmap_range(struct iommu_domain *dom, iova &= PAGE_MASK; + mutex_lock(&domain->api_lock); + for (i = 0; i < npages; ++i) { iommu_unmap_page(domain, iova, PM_MAP_4k); iova += PAGE_SIZE; } iommu_flush_tlb_pde(domain); + + mutex_unlock(&domain->api_lock); } static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 9dc91b43147..6360abf993d 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -19,8 +19,8 @@ #include <linux/pci.h> #include <linux/acpi.h> -#include <linux/gfp.h> #include <linux/list.h> +#include <linux/slab.h> #include <linux/sysdev.h> #include <linux/interrupt.h> #include <linux/msi.h> @@ -138,9 +138,9 @@ int amd_iommus_present; bool amd_iommu_np_cache __read_mostly; /* - * Set to true if ACPI table parsing and hardware intialization went properly + * The ACPI table parsing functions set this variable on an error */ -static bool amd_iommu_initialized; +static int __initdata amd_iommu_init_err; /* * List of protection domains - used during resume @@ -391,9 +391,11 @@ static int __init find_last_devid_acpi(struct acpi_table_header *table) */ for (i = 0; i < table->length; ++i) checksum += p[i]; - if (checksum != 0) + if (checksum != 0) { /* ACPI table corrupt */ - return -ENODEV; + amd_iommu_init_err = -ENODEV; + return 0; + } p += IVRS_HEADER_LENGTH; @@ -436,7 +438,7 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu) if (cmd_buf == NULL) return NULL; - iommu->cmd_buf_size = CMD_BUFFER_SIZE; + iommu->cmd_buf_size = CMD_BUFFER_SIZE | CMD_BUFFER_UNINITIALIZED; return cmd_buf; } @@ -472,12 +474,13 @@ static void iommu_enable_command_buffer(struct amd_iommu *iommu) &entry, sizeof(entry)); amd_iommu_reset_cmd_buffer(iommu); + iommu->cmd_buf_size &= ~(CMD_BUFFER_UNINITIALIZED); } static void __init 
free_command_buffer(struct amd_iommu *iommu) { free_pages((unsigned long)iommu->cmd_buf, - get_order(iommu->cmd_buf_size)); + get_order(iommu->cmd_buf_size & ~(CMD_BUFFER_UNINITIALIZED))); } /* allocates the memory where the IOMMU will log its events to */ @@ -920,11 +923,16 @@ static int __init init_iommu_all(struct acpi_table_header *table) h->mmio_phys); iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL); - if (iommu == NULL) - return -ENOMEM; + if (iommu == NULL) { + amd_iommu_init_err = -ENOMEM; + return 0; + } + ret = init_iommu_one(iommu, h); - if (ret) - return ret; + if (ret) { + amd_iommu_init_err = ret; + return 0; + } break; default: break; @@ -934,8 +942,6 @@ static int __init init_iommu_all(struct acpi_table_header *table) } WARN_ON(p != end); - amd_iommu_initialized = true; - return 0; } @@ -1211,6 +1217,10 @@ static int __init amd_iommu_init(void) if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0) return -ENODEV; + ret = amd_iommu_init_err; + if (ret) + goto out; + dev_table_size = tbl_size(DEV_TABLE_ENTRY_SIZE); alias_table_size = tbl_size(ALIAS_TABLE_ENTRY_SIZE); rlookup_table_size = tbl_size(RLOOKUP_TABLE_ENTRY_SIZE); @@ -1270,12 +1280,19 @@ static int __init amd_iommu_init(void) if (acpi_table_parse("IVRS", init_iommu_all) != 0) goto free; - if (!amd_iommu_initialized) + if (amd_iommu_init_err) { + ret = amd_iommu_init_err; goto free; + } if (acpi_table_parse("IVRS", init_memory_definitions) != 0) goto free; + if (amd_iommu_init_err) { + ret = amd_iommu_init_err; + goto free; + } + ret = sysdev_class_register(&amd_iommu_sysdev_class); if (ret) goto free; @@ -1288,6 +1305,8 @@ static int __init amd_iommu_init(void) if (ret) goto free; + enable_iommus(); + if (iommu_pass_through) ret = amd_iommu_init_passthrough(); else @@ -1300,8 +1319,6 @@ static int __init amd_iommu_init(void) amd_iommu_init_notifier(); - enable_iommus(); - if (iommu_pass_through) goto out; @@ -1315,6 +1332,7 @@ out: return ret; free: + disable_iommus(); amd_iommu_uninit_devices(); diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index 4b7099526d2..ff469e47005 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -33,6 +33,7 @@ #include <linux/errno.h> #include <linux/init.h> #include <linux/sysdev.h> +#include <linux/slab.h> #include <linux/pm.h> #include <linux/pci.h> #include <linux/sfi.h> diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index f147a95fd84..b5d8b0bcf23 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -31,7 +31,6 @@ #include <asm/x86_init.h> int gart_iommu_aperture; -EXPORT_SYMBOL_GPL(gart_iommu_aperture); int gart_iommu_aperture_disabled __initdata; int gart_iommu_aperture_allowed __initdata; @@ -394,6 +393,7 @@ void __init gart_iommu_hole_init(void) for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { int bus; int dev_base, dev_limit; + u32 ctl; bus = bus_dev_ranges[i].bus; dev_base = bus_dev_ranges[i].dev_base; @@ -407,7 +407,19 @@ void __init gart_iommu_hole_init(void) gart_iommu_aperture = 1; x86_init.iommu.iommu_init = gart_iommu_init; - aper_order = (read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL) >> 1) & 7; + ctl = read_pci_config(bus, slot, 3, + AMD64_GARTAPERTURECTL); + + /* + * Before we do anything else disable the GART. It may + * still be enabled if we boot into a crash-kernel here. + * Reconfiguring the GART while it is enabled could have + * unknown side-effects. 
+ */ + ctl &= ~GARTEN; + write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl); + + aper_order = (ctl >> 1) & 7; aper_size = (32 * 1024 * 1024) << aper_order; aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff; aper_base <<= 25; diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 00187f1fcfb..e5a4a1e0161 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1640,8 +1640,10 @@ int __init APIC_init_uniprocessor(void) } #endif +#ifndef CONFIG_SMP enable_IR_x2apic(); default_setup_apic_routing(); +#endif verify_local_APIC(); connect_bsp_APIC(); diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index dd2b5f26464..03ba1b895f5 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -42,6 +42,7 @@ #include <linux/errno.h> #include <linux/acpi.h> #include <linux/init.h> +#include <linux/gfp.h> #include <linux/nmi.h> #include <linux/smp.h> #include <linux/io.h> diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index e4e0ddcb154..127b8718abf 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -36,6 +36,7 @@ #include <linux/freezer.h> #include <linux/kthread.h> #include <linux/jiffies.h> /* time_after() */ +#include <linux/slab.h> #ifdef CONFIG_ACPI #include <acpi/acpi_bus.h> #endif @@ -1268,6 +1269,14 @@ void __setup_vector_irq(int cpu) /* Mark the inuse vectors */ for_each_irq_desc(irq, desc) { cfg = desc->chip_data; + + /* + * If it is a legacy IRQ handled by the legacy PIC, this cpu + * will be part of the irq_cfg's domain. + */ + if (irq < legacy_pic->nr_legacy_irqs && !IO_APIC_IRQ(irq)) + cpumask_set_cpu(cpu, cfg->domain); + if (!cpumask_test_cpu(cpu, cfg->domain)) continue; vector = cfg->vector; diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index 8aa65adbd25..1edaf15c0b8 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -18,6 +18,7 @@ #include <linux/delay.h> #include <linux/interrupt.h> #include <linux/module.h> +#include <linux/slab.h> #include <linux/sysdev.h> #include <linux/sysctl.h> #include <linux/percpu.h> diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 3740c8a4eae..c085d52dbaf 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -17,6 +17,7 @@ #include <linux/ctype.h> #include <linux/sched.h> #include <linux/timer.h> +#include <linux/slab.h> #include <linux/cpu.h> #include <linux/init.h> #include <linux/io.h> @@ -120,11 +121,9 @@ EXPORT_SYMBOL_GPL(uv_possible_blades); unsigned long sn_rtc_cycles_per_second; EXPORT_SYMBOL(sn_rtc_cycles_per_second); -/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. 
*/ - static const struct cpumask *uv_target_cpus(void) { - return cpumask_of(0); + return cpu_online_mask; } static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask) diff --git a/arch/x86/kernel/bootflag.c b/arch/x86/kernel/bootflag.c index 30f25a75fe2..5de7f4c5697 100644 --- a/arch/x86/kernel/bootflag.c +++ b/arch/x86/kernel/bootflag.c @@ -5,7 +5,6 @@ #include <linux/kernel.h> #include <linux/init.h> #include <linux/string.h> -#include <linux/slab.h> #include <linux/spinlock.h> #include <linux/acpi.h> #include <asm/io.h> diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 1b1920fa7c8..459168083b7 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -33,6 +33,7 @@ #include <linux/cpufreq.h> #include <linux/compiler.h> #include <linux/dmi.h> +#include <linux/slab.h> #include <trace/events/power.h> #include <linux/acpi.h> diff --git a/arch/x86/kernel/cpu/cpufreq/elanfreq.c b/arch/x86/kernel/cpu/cpufreq/elanfreq.c index 006b278b0d5..c587db472a7 100644 --- a/arch/x86/kernel/cpu/cpufreq/elanfreq.c +++ b/arch/x86/kernel/cpu/cpufreq/elanfreq.c @@ -20,7 +20,6 @@ #include <linux/module.h> #include <linux/init.h> -#include <linux/slab.h> #include <linux/delay.h> #include <linux/cpufreq.h> diff --git a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c index ac27ec2264d..16e3483be9e 100644 --- a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c +++ b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c @@ -80,6 +80,7 @@ #include <linux/cpufreq.h> #include <linux/pci.h> #include <linux/errno.h> +#include <linux/slab.h> #include <asm/processor-cyrix.h> diff --git a/arch/x86/kernel/cpu/cpufreq/longrun.c b/arch/x86/kernel/cpu/cpufreq/longrun.c index da5f70fcb76..e7b559d74c5 100644 --- a/arch/x86/kernel/cpu/cpufreq/longrun.c +++ b/arch/x86/kernel/cpu/cpufreq/longrun.c @@ -9,7 +9,6 @@ #include <linux/kernel.h> #include <linux/module.h> #include <linux/init.h> -#include <linux/slab.h> #include <linux/cpufreq.h> #include <linux/timex.h> diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c index 86961519372..7b8a8ba67b0 100644 --- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c +++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c @@ -25,7 +25,6 @@ #include <linux/init.h> #include <linux/smp.h> #include <linux/cpufreq.h> -#include <linux/slab.h> #include <linux/cpumask.h> #include <linux/timex.h> diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c index ff36d2979a9..ce7cde713e7 100644 --- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c @@ -30,6 +30,7 @@ #include <linux/sched.h> #include <linux/cpufreq.h> #include <linux/compiler.h> +#include <linux/slab.h> #include <linux/acpi.h> #include <linux/io.h> diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c index cb01dac267d..b3379d6a5c5 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c @@ -13,7 +13,6 @@ #include <linux/init.h> #include <linux/cpufreq.h> #include <linux/ioport.h> -#include <linux/slab.h> #include <linux/timex.h> #include <linux/io.h> diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c index 8d672ef162c..9b1ff37de46 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c +++ 
b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c @@ -20,6 +20,7 @@ #include <linux/sched.h> /* current */ #include <linux/delay.h> #include <linux/compiler.h> +#include <linux/gfp.h> #include <asm/msr.h> #include <asm/processor.h> diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c index 2ce8e0b5cc5..561758e9518 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c @@ -23,7 +23,6 @@ #include <linux/init.h> #include <linux/cpufreq.h> #include <linux/pci.h> -#include <linux/slab.h> #include <linux/sched.h> #include "speedstep-lib.h" diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c index ad0083abfa2..a94ec6be69f 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c @@ -13,7 +13,6 @@ #include <linux/moduleparam.h> #include <linux/init.h> #include <linux/cpufreq.h> -#include <linux/slab.h> #include <asm/msr.h> #include <asm/tsc.h> diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c index 04d73c114e4..8abd869baab 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c @@ -17,7 +17,6 @@ #include <linux/moduleparam.h> #include <linux/init.h> #include <linux/cpufreq.h> -#include <linux/slab.h> #include <linux/delay.h> #include <linux/io.h> #include <asm/ist.h> diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 879666f4d87..7e1cca13af3 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -70,7 +70,8 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) if (c->x86_power & (1 << 8)) { set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); - sched_clock_stable = 1; + if (!check_tsc_unstable()) + sched_clock_stable = 1; } /* diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index 73734baa50f..e7dbde7bfed 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -22,6 +22,7 @@ #include <linux/kdebug.h> #include <linux/cpu.h> #include <linux/sched.h> +#include <linux/gfp.h> #include <asm/mce.h> #include <asm/apic.h> diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 28cba46bf32..8a6f0afa767 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -26,6 +26,7 @@ #include <linux/sched.h> #include <linux/sysfs.h> #include <linux/types.h> +#include <linux/slab.h> #include <linux/init.h> #include <linux/kmod.h> #include <linux/poll.h> @@ -46,6 +47,13 @@ #include "mce-internal.h" +static DEFINE_MUTEX(mce_read_mutex); + +#define rcu_dereference_check_mce(p) \ + rcu_dereference_check((p), \ + rcu_read_lock_sched_held() || \ + lockdep_is_held(&mce_read_mutex)) + #define CREATE_TRACE_POINTS #include <trace/events/mce.h> @@ -158,7 +166,7 @@ void mce_log(struct mce *mce) mce->finished = 0; wmb(); for (;;) { - entry = rcu_dereference(mcelog.next); + entry = rcu_dereference_check_mce(mcelog.next); for (;;) { /* * When the buffer fills up discard new entries. 
@@ -1485,8 +1493,6 @@ static void collect_tscs(void *data) rdtscll(cpu_tsc[smp_processor_id()]); } -static DEFINE_MUTEX(mce_read_mutex); - static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off) { @@ -1500,7 +1506,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, return -ENOMEM; mutex_lock(&mce_read_mutex); - next = rcu_dereference(mcelog.next); + next = rcu_dereference_check_mce(mcelog.next); /* Only supports full reads right now */ if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { @@ -1565,7 +1571,7 @@ timeout: static unsigned int mce_poll(struct file *file, poll_table *wait) { poll_wait(file, &mce_wait, wait); - if (rcu_dereference(mcelog.next)) + if (rcu_dereference_check_mce(mcelog.next)) return POLLIN | POLLRDNORM; return 0; } diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index cda932ca3ad..224392d8fe8 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -21,6 +21,7 @@ #include <linux/errno.h> #include <linux/sched.h> #include <linux/sysfs.h> +#include <linux/slab.h> #include <linux/init.h> #include <linux/cpu.h> #include <linux/smp.h> diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 7c785634af2..62b48e40920 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -5,6 +5,7 @@ * Author: Andi Kleen */ +#include <linux/gfp.h> #include <linux/init.h> #include <linux/interrupt.h> #include <linux/percpu.h> @@ -95,7 +96,7 @@ static void cmci_discover(int banks, int boot) /* Already owned by someone else? */ if (val & CMCI_EN) { - if (test_and_clear_bit(i, owned) || boot) + if (test_and_clear_bit(i, owned) && !boot) print_update("SHD", &hdr, i); __clear_bit(i, __get_cpu_var(mce_poll_banks)); continue; @@ -107,7 +108,7 @@ static void cmci_discover(int banks, int boot) /* Did the enable bit stick? 
-- the bank supports CMCI */ if (val & CMCI_EN) { - if (!test_and_set_bit(i, owned) || boot) + if (!test_and_set_bit(i, owned) && !boot) print_update("CMCI", &hdr, i); __clear_bit(i, __get_cpu_var(mce_poll_banks)); } else { diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 9aa5dc76ff4..fd31a441c61 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -6,7 +6,6 @@ #include <linux/module.h> #include <linux/init.h> -#include <linux/slab.h> #include <linux/io.h> #include <linux/mm.h> diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c index e006e56f699..79289632cb2 100644 --- a/arch/x86/kernel/cpu/mtrr/if.c +++ b/arch/x86/kernel/cpu/mtrr/if.c @@ -5,6 +5,7 @@ #include <linux/module.h> #include <linux/ctype.h> #include <linux/string.h> +#include <linux/slab.h> #include <linux/init.h> #define LINE_SIZE 80 diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index b1fbdeecf6c..db5bdc8addf 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -21,6 +21,7 @@ #include <linux/kdebug.h> #include <linux/sched.h> #include <linux/uaccess.h> +#include <linux/slab.h> #include <linux/highmem.h> #include <linux/cpu.h> #include <linux/bitops.h> @@ -28,6 +29,7 @@ #include <asm/apic.h> #include <asm/stacktrace.h> #include <asm/nmi.h> +#include <asm/compat.h> static u64 perf_event_mask __read_mostly; @@ -73,10 +75,10 @@ struct debug_store { struct event_constraint { union { unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; - u64 idxmsk64[1]; + u64 idxmsk64; }; - int code; - int cmask; + u64 code; + u64 cmask; int weight; }; @@ -103,7 +105,7 @@ struct cpu_hw_events { }; #define __EVENT_CONSTRAINT(c, n, m, w) {\ - { .idxmsk64[0] = (n) }, \ + { .idxmsk64 = (n) }, \ .code = (c), \ .cmask = (m), \ .weight = (w), \ @@ -116,7 +118,7 @@ struct cpu_hw_events { EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVTSEL_MASK) #define FIXED_EVENT_CONSTRAINT(c, n) \ - EVENT_CONSTRAINT(c, n, INTEL_ARCH_FIXED_MASK) + EVENT_CONSTRAINT(c, (1ULL << (32+n)), INTEL_ARCH_FIXED_MASK) #define EVENT_CONSTRAINT_END \ EVENT_CONSTRAINT(0, 0, 0) @@ -133,8 +135,8 @@ struct x86_pmu { int (*handle_irq)(struct pt_regs *); void (*disable_all)(void); void (*enable_all)(void); - void (*enable)(struct hw_perf_event *, int); - void (*disable)(struct hw_perf_event *, int); + void (*enable)(struct perf_event *); + void (*disable)(struct perf_event *); unsigned eventsel; unsigned perfctr; u64 (*event_map)(int); @@ -157,6 +159,11 @@ struct x86_pmu { void (*put_event_constraints)(struct cpu_hw_events *cpuc, struct perf_event *event); struct event_constraint *event_constraints; + + int (*cpu_prepare)(int cpu); + void (*cpu_starting)(int cpu); + void (*cpu_dying)(int cpu); + void (*cpu_dead)(int cpu); }; static struct x86_pmu x86_pmu __read_mostly; @@ -165,8 +172,7 @@ static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { .enabled = 1, }; -static int x86_perf_event_set_period(struct perf_event *event, - struct hw_perf_event *hwc, int idx); +static int x86_perf_event_set_period(struct perf_event *event); /* * Generalized hw caching related hw_event table, filled @@ -189,11 +195,12 @@ static u64 __read_mostly hw_cache_event_ids * Returns the delta events processed. 
*/ static u64 -x86_perf_event_update(struct perf_event *event, - struct hw_perf_event *hwc, int idx) +x86_perf_event_update(struct perf_event *event) { + struct hw_perf_event *hwc = &event->hw; int shift = 64 - x86_pmu.event_bits; u64 prev_raw_count, new_raw_count; + int idx = hwc->idx; s64 delta; if (idx == X86_PMC_IDX_FIXED_BTS) @@ -293,7 +300,7 @@ static inline bool bts_available(void) return x86_pmu.enable_bts != NULL; } -static inline void init_debug_store_on_cpu(int cpu) +static void init_debug_store_on_cpu(int cpu) { struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; @@ -305,7 +312,7 @@ static inline void init_debug_store_on_cpu(int cpu) (u32)((u64)(unsigned long)ds >> 32)); } -static inline void fini_debug_store_on_cpu(int cpu) +static void fini_debug_store_on_cpu(int cpu) { if (!per_cpu(cpu_hw_events, cpu).ds) return; @@ -503,6 +510,9 @@ static int __hw_perf_event_init(struct perf_event *event) */ if (attr->type == PERF_TYPE_RAW) { hwc->config |= x86_pmu.raw_event(attr->config); + if ((hwc->config & ARCH_PERFMON_EVENTSEL_ANY) && + perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) + return -EACCES; return 0; } @@ -553,9 +563,9 @@ static void x86_pmu_disable_all(void) if (!test_bit(idx, cpuc->active_mask)) continue; rdmsrl(x86_pmu.eventsel + idx, val); - if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE)) + if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE)) continue; - val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; + val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; wrmsrl(x86_pmu.eventsel + idx, val); } } @@ -590,7 +600,7 @@ static void x86_pmu_enable_all(void) continue; val = event->hw.config; - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + val |= ARCH_PERFMON_EVENTSEL_ENABLE; wrmsrl(x86_pmu.eventsel + idx, val); } } @@ -612,8 +622,8 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) bitmap_zero(used_mask, X86_PMC_IDX_MAX); for (i = 0; i < n; i++) { - constraints[i] = - x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]); + c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]); + constraints[i] = c; } /* @@ -635,7 +645,7 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) if (test_bit(hwc->idx, used_mask)) break; - set_bit(hwc->idx, used_mask); + __set_bit(hwc->idx, used_mask); if (assign) assign[i] = hwc->idx; } @@ -684,7 +694,7 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) if (j == X86_PMC_IDX_MAX) break; - set_bit(j, used_mask); + __set_bit(j, used_mask); if (assign) assign[i] = j; @@ -777,6 +787,7 @@ static inline int match_prev_assignment(struct hw_perf_event *hwc, hwc->last_tag == cpuc->tags[i]; } +static int x86_pmu_start(struct perf_event *event); static void x86_pmu_stop(struct perf_event *event); void hw_perf_enable(void) @@ -793,6 +804,7 @@ void hw_perf_enable(void) return; if (cpuc->n_added) { + int n_running = cpuc->n_events - cpuc->n_added; /* * apply assignment obtained either from * hw_perf_group_sched_in() or x86_pmu_enable() @@ -800,8 +812,7 @@ void hw_perf_enable(void) * step1: save events moving to new counters * step2: reprogram moved events into new counters */ - for (i = 0; i < cpuc->n_events; i++) { - + for (i = 0; i < n_running; i++) { event = cpuc->event_list[i]; hwc = &event->hw; @@ -816,29 +827,18 @@ void hw_perf_enable(void) continue; x86_pmu_stop(event); - - hwc->idx = -1; } for (i = 0; i < cpuc->n_events; i++) { - event = cpuc->event_list[i]; hwc = &event->hw; - if (hwc->idx == -1) { + if (!match_prev_assignment(hwc, cpuc, i)) x86_assign_hw_event(event, cpuc, i); - 
x86_perf_event_set_period(event, hwc, hwc->idx); - } - /* - * need to mark as active because x86_pmu_disable() - * clear active_mask and events[] yet it preserves - * idx - */ - set_bit(hwc->idx, cpuc->active_mask); - cpuc->events[hwc->idx] = event; + else if (i < n_running) + continue; - x86_pmu.enable(hwc, hwc->idx); - perf_event_update_userpage(event); + x86_pmu_start(event); } cpuc->n_added = 0; perf_events_lapic_init(); @@ -850,15 +850,16 @@ void hw_perf_enable(void) x86_pmu.enable_all(); } -static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, int idx) +static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc) { - (void)checking_wrmsrl(hwc->config_base + idx, - hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE); + (void)checking_wrmsrl(hwc->config_base + hwc->idx, + hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE); } -static inline void x86_pmu_disable_event(struct hw_perf_event *hwc, int idx) +static inline void x86_pmu_disable_event(struct perf_event *event) { - (void)checking_wrmsrl(hwc->config_base + idx, hwc->config); + struct hw_perf_event *hwc = &event->hw; + (void)checking_wrmsrl(hwc->config_base + hwc->idx, hwc->config); } static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); @@ -868,12 +869,12 @@ static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); * To be called with the event disabled in hw: */ static int -x86_perf_event_set_period(struct perf_event *event, - struct hw_perf_event *hwc, int idx) +x86_perf_event_set_period(struct perf_event *event) { + struct hw_perf_event *hwc = &event->hw; s64 left = atomic64_read(&hwc->period_left); s64 period = hwc->sample_period; - int err, ret = 0; + int err, ret = 0, idx = hwc->idx; if (idx == X86_PMC_IDX_FIXED_BTS) return 0; @@ -919,11 +920,11 @@ x86_perf_event_set_period(struct perf_event *event, return ret; } -static void x86_pmu_enable_event(struct hw_perf_event *hwc, int idx) +static void x86_pmu_enable_event(struct perf_event *event) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); if (cpuc->enabled) - __x86_pmu_enable_event(hwc, idx); + __x86_pmu_enable_event(&event->hw); } /* @@ -959,34 +960,32 @@ static int x86_pmu_enable(struct perf_event *event) memcpy(cpuc->assign, assign, n*sizeof(int)); cpuc->n_events = n; - cpuc->n_added = n - n0; + cpuc->n_added += n - n0; return 0; } static int x86_pmu_start(struct perf_event *event) { - struct hw_perf_event *hwc = &event->hw; + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + int idx = event->hw.idx; - if (hwc->idx == -1) + if (idx == -1) return -EAGAIN; - x86_perf_event_set_period(event, hwc, hwc->idx); - x86_pmu.enable(hwc, hwc->idx); + x86_perf_event_set_period(event); + cpuc->events[idx] = event; + __set_bit(idx, cpuc->active_mask); + x86_pmu.enable(event); + perf_event_update_userpage(event); return 0; } static void x86_pmu_unthrottle(struct perf_event *event) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - struct hw_perf_event *hwc = &event->hw; - - if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX || - cpuc->events[hwc->idx] != event)) - return; - - x86_pmu.enable(hwc, hwc->idx); + int ret = x86_pmu_start(event); + WARN_ON_ONCE(ret); } void perf_event_print_debug(void) @@ -1046,18 +1045,16 @@ static void x86_pmu_stop(struct perf_event *event) struct hw_perf_event *hwc = &event->hw; int idx = hwc->idx; - /* - * Must be done before we disable, otherwise the nmi handler - * could reenable again: - */ - clear_bit(idx, cpuc->active_mask); - x86_pmu.disable(hwc, idx); + if (!__test_and_clear_bit(idx, 
cpuc->active_mask)) + return; + + x86_pmu.disable(event); /* * Drain the remaining delta count out of a event * that we are disabling: */ - x86_perf_event_update(event, hwc, idx); + x86_perf_event_update(event); cpuc->events[idx] = NULL; } @@ -1094,8 +1091,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs) int idx, handled = 0; u64 val; - data.addr = 0; - data.raw = NULL; + perf_sample_data_init(&data, 0); cpuc = &__get_cpu_var(cpu_hw_events); @@ -1106,7 +1102,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs) event = cpuc->events[idx]; hwc = &event->hw; - val = x86_perf_event_update(event, hwc, idx); + val = x86_perf_event_update(event); if (val & (1ULL << (x86_pmu.event_bits - 1))) continue; @@ -1116,11 +1112,11 @@ static int x86_pmu_handle_irq(struct pt_regs *regs) handled = 1; data.period = event->hw.last_period; - if (!x86_perf_event_set_period(event, hwc, idx)) + if (!x86_perf_event_set_period(event)) continue; if (perf_event_overflow(event, 1, &data, regs)) - x86_pmu.disable(hwc, idx); + x86_pmu_stop(event); } if (handled) @@ -1307,7 +1303,7 @@ int hw_perf_group_sched_in(struct perf_event *leader, memcpy(cpuc->assign, assign, n0*sizeof(int)); cpuc->n_events = n0; - cpuc->n_added = n1; + cpuc->n_added += n1; ctx->nr_active += n1; /* @@ -1335,6 +1331,41 @@ undo: #include "perf_event_p6.c" #include "perf_event_intel.c" +static int __cpuinit +x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) +{ + unsigned int cpu = (long)hcpu; + int ret = NOTIFY_OK; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_UP_PREPARE: + if (x86_pmu.cpu_prepare) + ret = x86_pmu.cpu_prepare(cpu); + break; + + case CPU_STARTING: + if (x86_pmu.cpu_starting) + x86_pmu.cpu_starting(cpu); + break; + + case CPU_DYING: + if (x86_pmu.cpu_dying) + x86_pmu.cpu_dying(cpu); + break; + + case CPU_UP_CANCELED: + case CPU_DEAD: + if (x86_pmu.cpu_dead) + x86_pmu.cpu_dead(cpu); + break; + + default: + break; + } + + return ret; +} + static void __init pmu_check_apic(void) { if (cpu_has_apic) @@ -1347,6 +1378,7 @@ static void __init pmu_check_apic(void) void __init init_hw_perf_events(void) { + struct event_constraint *c; int err; pr_info("Performance Events: "); @@ -1395,6 +1427,16 @@ void __init init_hw_perf_events(void) __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_events) - 1, 0, x86_pmu.num_events); + if (x86_pmu.event_constraints) { + for_each_event_constraint(c, x86_pmu.event_constraints) { + if (c->cmask != INTEL_ARCH_FIXED_MASK) + continue; + + c->idxmsk64 |= (1ULL << x86_pmu.num_events) - 1; + c->weight += x86_pmu.num_events; + } + } + pr_info("... version: %d\n", x86_pmu.version); pr_info("... bit width: %d\n", x86_pmu.event_bits); pr_info("... generic registers: %d\n", x86_pmu.num_events); @@ -1402,11 +1444,13 @@ void __init init_hw_perf_events(void) pr_info("... max period: %016Lx\n", x86_pmu.max_period); pr_info("... fixed-purpose events: %d\n", x86_pmu.num_events_fixed); pr_info("... 
event mask: %016Lx\n", perf_event_mask); + + perf_cpu_notifier(x86_pmu_notifier); } static inline void x86_pmu_read(struct perf_event *event) { - x86_perf_event_update(event, &event->hw, event->hw.idx); + x86_perf_event_update(event); } static const struct pmu pmu = { @@ -1588,14 +1632,42 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n) return len; } -static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) +#ifdef CONFIG_COMPAT +static inline int +perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) { - unsigned long bytes; + /* 32-bit process in 64-bit kernel. */ + struct stack_frame_ia32 frame; + const void __user *fp; - bytes = copy_from_user_nmi(frame, fp, sizeof(*frame)); + if (!test_thread_flag(TIF_IA32)) + return 0; - return bytes == sizeof(*frame); + fp = compat_ptr(regs->bp); + while (entry->nr < PERF_MAX_STACK_DEPTH) { + unsigned long bytes; + frame.next_frame = 0; + frame.return_address = 0; + + bytes = copy_from_user_nmi(&frame, fp, sizeof(frame)); + if (bytes != sizeof(frame)) + break; + + if (fp < compat_ptr(regs->sp)) + break; + + callchain_store(entry, frame.return_address); + fp = compat_ptr(frame.next_frame); + } + return 1; +} +#else +static inline int +perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) +{ + return 0; } +#endif static void perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) @@ -1611,11 +1683,16 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) callchain_store(entry, PERF_CONTEXT_USER); callchain_store(entry, regs->ip); + if (perf_callchain_user32(regs, entry)) + return; + while (entry->nr < PERF_MAX_STACK_DEPTH) { + unsigned long bytes; frame.next_frame = NULL; frame.return_address = 0; - if (!copy_stack_frame(fp, &frame)) + bytes = copy_from_user_nmi(&frame, fp, sizeof(frame)); + if (bytes != sizeof(frame)) break; if ((unsigned long)fp < regs->sp) @@ -1662,28 +1739,14 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) return entry; } -void hw_perf_event_setup_online(int cpu) -{ - init_debug_store_on_cpu(cpu); - - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_AMD: - amd_pmu_cpu_online(cpu); - break; - default: - return; - } -} - -void hw_perf_event_setup_offline(int cpu) +void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip) { - init_debug_store_on_cpu(cpu); - - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_AMD: - amd_pmu_cpu_offline(cpu); - break; - default: - return; - } + regs->ip = ip; + /* + * perf_arch_fetch_caller_regs adds another call, we need to increment + * the skip level + */ + regs->bp = rewind_frame_pointer(skip + 1); + regs->cs = __KERNEL_CS; + local_save_flags(regs->flags); } diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 8f3dbfda3c4..db6f7d4056e 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -137,6 +137,13 @@ static inline int amd_is_nb_event(struct hw_perf_event *hwc) return (hwc->config & 0xe0) == 0xe0; } +static inline int amd_has_nb(struct cpu_hw_events *cpuc) +{ + struct amd_nb *nb = cpuc->amd_nb; + + return nb && nb->nb_id != -1; +} + static void amd_put_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) { @@ -147,7 +154,7 @@ static void amd_put_event_constraints(struct cpu_hw_events *cpuc, /* * only care about NB events */ - if (!(nb && amd_is_nb_event(hwc))) + if (!(amd_has_nb(cpuc) && 
amd_is_nb_event(hwc))) return; /* @@ -214,7 +221,7 @@ amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) /* * if not NB event or no NB, then no constraints */ - if (!(nb && amd_is_nb_event(hwc))) + if (!(amd_has_nb(cpuc) && amd_is_nb_event(hwc))) return &unconstrained; /* @@ -271,28 +278,6 @@ done: return &emptyconstraint; } -static __initconst struct x86_pmu amd_pmu = { - .name = "AMD", - .handle_irq = x86_pmu_handle_irq, - .disable_all = x86_pmu_disable_all, - .enable_all = x86_pmu_enable_all, - .enable = x86_pmu_enable_event, - .disable = x86_pmu_disable_event, - .eventsel = MSR_K7_EVNTSEL0, - .perfctr = MSR_K7_PERFCTR0, - .event_map = amd_pmu_event_map, - .raw_event = amd_pmu_raw_event, - .max_events = ARRAY_SIZE(amd_perfmon_event_map), - .num_events = 4, - .event_bits = 48, - .event_mask = (1ULL << 48) - 1, - .apic = 1, - /* use highest bit to detect overflow */ - .max_period = (1ULL << 47) - 1, - .get_event_constraints = amd_get_event_constraints, - .put_event_constraints = amd_put_event_constraints -}; - static struct amd_nb *amd_alloc_nb(int cpu, int nb_id) { struct amd_nb *nb; @@ -309,57 +294,61 @@ static struct amd_nb *amd_alloc_nb(int cpu, int nb_id) * initialize all possible NB constraints */ for (i = 0; i < x86_pmu.num_events; i++) { - set_bit(i, nb->event_constraints[i].idxmsk); + __set_bit(i, nb->event_constraints[i].idxmsk); nb->event_constraints[i].weight = 1; } return nb; } -static void amd_pmu_cpu_online(int cpu) +static int amd_pmu_cpu_prepare(int cpu) { - struct cpu_hw_events *cpu1, *cpu2; - struct amd_nb *nb = NULL; + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + + WARN_ON_ONCE(cpuc->amd_nb); + + if (boot_cpu_data.x86_max_cores < 2) + return NOTIFY_OK; + + cpuc->amd_nb = amd_alloc_nb(cpu, -1); + if (!cpuc->amd_nb) + return NOTIFY_BAD; + + return NOTIFY_OK; +} + +static void amd_pmu_cpu_starting(int cpu) +{ + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + struct amd_nb *nb; int i, nb_id; if (boot_cpu_data.x86_max_cores < 2) return; - /* - * function may be called too early in the - * boot process, in which case nb_id is bogus - */ nb_id = amd_get_nb_id(cpu); - if (nb_id == BAD_APICID) - return; - - cpu1 = &per_cpu(cpu_hw_events, cpu); - cpu1->amd_nb = NULL; + WARN_ON_ONCE(nb_id == BAD_APICID); raw_spin_lock(&amd_nb_lock); for_each_online_cpu(i) { - cpu2 = &per_cpu(cpu_hw_events, i); - nb = cpu2->amd_nb; - if (!nb) + nb = per_cpu(cpu_hw_events, i).amd_nb; + if (WARN_ON_ONCE(!nb)) continue; - if (nb->nb_id == nb_id) - goto found; - } - nb = amd_alloc_nb(cpu, nb_id); - if (!nb) { - pr_err("perf_events: failed NB allocation for CPU%d\n", cpu); - raw_spin_unlock(&amd_nb_lock); - return; + if (nb->nb_id == nb_id) { + kfree(cpuc->amd_nb); + cpuc->amd_nb = nb; + break; + } } -found: - nb->refcnt++; - cpu1->amd_nb = nb; + + cpuc->amd_nb->nb_id = nb_id; + cpuc->amd_nb->refcnt++; raw_spin_unlock(&amd_nb_lock); } -static void amd_pmu_cpu_offline(int cpu) +static void amd_pmu_cpu_dead(int cpu) { struct cpu_hw_events *cpuhw; @@ -370,14 +359,44 @@ static void amd_pmu_cpu_offline(int cpu) raw_spin_lock(&amd_nb_lock); - if (--cpuhw->amd_nb->refcnt == 0) - kfree(cpuhw->amd_nb); + if (cpuhw->amd_nb) { + struct amd_nb *nb = cpuhw->amd_nb; + + if (nb->nb_id == -1 || --nb->refcnt == 0) + kfree(nb); - cpuhw->amd_nb = NULL; + cpuhw->amd_nb = NULL; + } raw_spin_unlock(&amd_nb_lock); } +static __initconst struct x86_pmu amd_pmu = { + .name = "AMD", + .handle_irq = x86_pmu_handle_irq, + .disable_all = x86_pmu_disable_all, + .enable_all = 
x86_pmu_enable_all, + .enable = x86_pmu_enable_event, + .disable = x86_pmu_disable_event, + .eventsel = MSR_K7_EVNTSEL0, + .perfctr = MSR_K7_PERFCTR0, + .event_map = amd_pmu_event_map, + .raw_event = amd_pmu_raw_event, + .max_events = ARRAY_SIZE(amd_perfmon_event_map), + .num_events = 4, + .event_bits = 48, + .event_mask = (1ULL << 48) - 1, + .apic = 1, + /* use highest bit to detect overflow */ + .max_period = (1ULL << 47) - 1, + .get_event_constraints = amd_get_event_constraints, + .put_event_constraints = amd_put_event_constraints, + + .cpu_prepare = amd_pmu_cpu_prepare, + .cpu_starting = amd_pmu_cpu_starting, + .cpu_dead = amd_pmu_cpu_dead, +}; + static __init int amd_pmu_init(void) { /* Performance-monitoring supported from K7 and later: */ @@ -390,11 +409,6 @@ static __init int amd_pmu_init(void) memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, sizeof(hw_cache_event_ids)); - /* - * explicitly initialize the boot cpu, other cpus will get - * the cpu hotplug callbacks from smp_init() - */ - amd_pmu_cpu_online(smp_processor_id()); return 0; } @@ -405,12 +419,4 @@ static int amd_pmu_init(void) return 0; } -static void amd_pmu_cpu_online(int cpu) -{ -} - -static void amd_pmu_cpu_offline(int cpu) -{ -} - #endif diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 977e7544738..9c794ac8783 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1,7 +1,7 @@ #ifdef CONFIG_CPU_SUP_INTEL /* - * Intel PerfMon v3. Used on Core2 and later. + * Intel PerfMon, used on Core and later. */ static const u64 intel_perfmon_event_map[] = { @@ -27,8 +27,14 @@ static struct event_constraint intel_core_event_constraints[] = static struct event_constraint intel_core2_event_constraints[] = { - FIXED_EVENT_CONSTRAINT(0xc0, (0x3|(1ULL<<32))), /* INSTRUCTIONS_RETIRED */ - FIXED_EVENT_CONSTRAINT(0x3c, (0x3|(1ULL<<33))), /* UNHALTED_CORE_CYCLES */ + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + /* + * Core2 has Fixed Counter 2 listed as CPU_CLK_UNHALTED.REF and event + * 0x013c as CPU_CLK_UNHALTED.BUS and specifies there is a fixed + * ratio between these counters. 
+ */ + /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ @@ -37,14 +43,16 @@ static struct event_constraint intel_core2_event_constraints[] = INTEL_EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */ INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */ INTEL_EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */ + INTEL_EVENT_CONSTRAINT(0xc9, 0x1), /* ITLB_MISS_RETIRED (T30-9) */ INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */ EVENT_CONSTRAINT_END }; static struct event_constraint intel_nehalem_event_constraints[] = { - FIXED_EVENT_CONSTRAINT(0xc0, (0xf|(1ULL<<32))), /* INSTRUCTIONS_RETIRED */ - FIXED_EVENT_CONSTRAINT(0x3c, (0xf|(1ULL<<33))), /* UNHALTED_CORE_CYCLES */ + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ INTEL_EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */ INTEL_EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */ INTEL_EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */ @@ -58,8 +66,9 @@ static struct event_constraint intel_nehalem_event_constraints[] = static struct event_constraint intel_westmere_event_constraints[] = { - FIXED_EVENT_CONSTRAINT(0xc0, (0xf|(1ULL<<32))), /* INSTRUCTIONS_RETIRED */ - FIXED_EVENT_CONSTRAINT(0x3c, (0xf|(1ULL<<33))), /* UNHALTED_CORE_CYCLES */ + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */ INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */ INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */ @@ -68,8 +77,9 @@ static struct event_constraint intel_westmere_event_constraints[] = static struct event_constraint intel_gen_event_constraints[] = { - FIXED_EVENT_CONSTRAINT(0xc0, (0x3|(1ULL<<32))), /* INSTRUCTIONS_RETIRED */ - FIXED_EVENT_CONSTRAINT(0x3c, (0x3|(1ULL<<33))), /* UNHALTED_CORE_CYCLES */ + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ EVENT_CONSTRAINT_END }; @@ -538,9 +548,9 @@ static inline void intel_pmu_ack_status(u64 ack) } static inline void -intel_pmu_disable_fixed(struct hw_perf_event *hwc, int __idx) +intel_pmu_disable_fixed(struct hw_perf_event *hwc) { - int idx = __idx - X86_PMC_IDX_FIXED; + int idx = hwc->idx - X86_PMC_IDX_FIXED; u64 ctrl_val, mask; mask = 0xfULL << (idx * 4); @@ -580,10 +590,9 @@ static void intel_pmu_drain_bts_buffer(void) ds->bts_index = ds->bts_buffer_base; + perf_sample_data_init(&data, 0); data.period = event->hw.last_period; - data.addr = 0; - data.raw = NULL; regs.ip = 0; /* @@ -612,26 +621,28 @@ static void intel_pmu_drain_bts_buffer(void) } static inline void -intel_pmu_disable_event(struct hw_perf_event *hwc, int idx) +intel_pmu_disable_event(struct perf_event *event) { - if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { + struct hw_perf_event *hwc = &event->hw; + + if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) { intel_pmu_disable_bts(); intel_pmu_drain_bts_buffer(); return; } if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { - intel_pmu_disable_fixed(hwc, idx); + intel_pmu_disable_fixed(hwc); return; } - x86_pmu_disable_event(hwc, 
idx); + x86_pmu_disable_event(event); } static inline void -intel_pmu_enable_fixed(struct hw_perf_event *hwc, int __idx) +intel_pmu_enable_fixed(struct hw_perf_event *hwc) { - int idx = __idx - X86_PMC_IDX_FIXED; + int idx = hwc->idx - X86_PMC_IDX_FIXED; u64 ctrl_val, bits, mask; int err; @@ -661,9 +672,11 @@ intel_pmu_enable_fixed(struct hw_perf_event *hwc, int __idx) err = checking_wrmsrl(hwc->config_base, ctrl_val); } -static void intel_pmu_enable_event(struct hw_perf_event *hwc, int idx) +static void intel_pmu_enable_event(struct perf_event *event) { - if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { + struct hw_perf_event *hwc = &event->hw; + + if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) { if (!__get_cpu_var(cpu_hw_events).enabled) return; @@ -672,11 +685,11 @@ static void intel_pmu_enable_event(struct hw_perf_event *hwc, int idx) } if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { - intel_pmu_enable_fixed(hwc, idx); + intel_pmu_enable_fixed(hwc); return; } - __x86_pmu_enable_event(hwc, idx); + __x86_pmu_enable_event(hwc); } /* @@ -685,14 +698,8 @@ static void intel_pmu_enable_event(struct hw_perf_event *hwc, int idx) */ static int intel_pmu_save_and_restart(struct perf_event *event) { - struct hw_perf_event *hwc = &event->hw; - int idx = hwc->idx; - int ret; - - x86_perf_event_update(event, hwc, idx); - ret = x86_perf_event_set_period(event, hwc, idx); - - return ret; + x86_perf_event_update(event); + return x86_perf_event_set_period(event); } static void intel_pmu_reset(void) @@ -732,16 +739,15 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) int bit, loops; u64 ack, status; - data.addr = 0; - data.raw = NULL; + perf_sample_data_init(&data, 0); cpuc = &__get_cpu_var(cpu_hw_events); - perf_disable(); + intel_pmu_disable_all(); intel_pmu_drain_bts_buffer(); status = intel_pmu_get_status(); if (!status) { - perf_enable(); + intel_pmu_enable_all(); return 0; } @@ -751,8 +757,7 @@ again: WARN_ONCE(1, "perfevents: irq loop stuck!\n"); perf_event_print_debug(); intel_pmu_reset(); - perf_enable(); - return 1; + goto done; } inc_irq_stat(apic_perf_irqs); @@ -760,7 +765,6 @@ again: for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { struct perf_event *event = cpuc->events[bit]; - clear_bit(bit, (unsigned long *) &status); if (!test_bit(bit, cpuc->active_mask)) continue; @@ -770,7 +774,7 @@ again: data.period = event->hw.last_period; if (perf_event_overflow(event, 1, &data, regs)) - intel_pmu_disable_event(&event->hw, bit); + x86_pmu_stop(event); } intel_pmu_ack_status(ack); @@ -782,8 +786,8 @@ again: if (status) goto again; - perf_enable(); - +done: + intel_pmu_enable_all(); return 1; } @@ -862,7 +866,10 @@ static __initconst struct x86_pmu intel_pmu = { .max_period = (1ULL << 31) - 1, .enable_bts = intel_pmu_enable_bts, .disable_bts = intel_pmu_disable_bts, - .get_event_constraints = intel_get_event_constraints + .get_event_constraints = intel_get_event_constraints, + + .cpu_starting = init_debug_store_on_cpu, + .cpu_dying = fini_debug_store_on_cpu, }; static __init int intel_pmu_init(void) @@ -929,13 +936,14 @@ static __init int intel_pmu_init(void) case 26: /* 45 nm nehalem, "Bloomfield" */ case 30: /* 45 nm nehalem, "Lynnfield" */ + case 46: /* 45 nm nehalem-ex, "Beckton" */ memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, sizeof(hw_cache_event_ids)); x86_pmu.event_constraints = intel_nehalem_event_constraints; pr_cont("Nehalem/Corei7 events, "); break; - case 28: + case 28: /* Atom */ memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, 
sizeof(hw_cache_event_ids)); @@ -951,6 +959,7 @@ static __init int intel_pmu_init(void) x86_pmu.event_constraints = intel_westmere_event_constraints; pr_cont("Westmere events, "); break; + default: /* * default constraints for v2 and up diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c index 1ca5ba078af..a330485d14d 100644 --- a/arch/x86/kernel/cpu/perf_event_p6.c +++ b/arch/x86/kernel/cpu/perf_event_p6.c @@ -62,7 +62,7 @@ static void p6_pmu_disable_all(void) /* p6 only has one enable register */ rdmsrl(MSR_P6_EVNTSEL0, val); - val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; + val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; wrmsrl(MSR_P6_EVNTSEL0, val); } @@ -72,32 +72,34 @@ static void p6_pmu_enable_all(void) /* p6 only has one enable register */ rdmsrl(MSR_P6_EVNTSEL0, val); - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + val |= ARCH_PERFMON_EVENTSEL_ENABLE; wrmsrl(MSR_P6_EVNTSEL0, val); } static inline void -p6_pmu_disable_event(struct hw_perf_event *hwc, int idx) +p6_pmu_disable_event(struct perf_event *event) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; u64 val = P6_NOP_EVENT; if (cpuc->enabled) - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + val |= ARCH_PERFMON_EVENTSEL_ENABLE; - (void)checking_wrmsrl(hwc->config_base + idx, val); + (void)checking_wrmsrl(hwc->config_base + hwc->idx, val); } -static void p6_pmu_enable_event(struct hw_perf_event *hwc, int idx) +static void p6_pmu_enable_event(struct perf_event *event) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; u64 val; val = hwc->config; if (cpuc->enabled) - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + val |= ARCH_PERFMON_EVENTSEL_ENABLE; - (void)checking_wrmsrl(hwc->config_base + idx, val); + (void)checking_wrmsrl(hwc->config_base + hwc->idx, val); } static __initconst struct x86_pmu p6_pmu = { diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index 74f4e85a572..fb329e9f849 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -680,7 +680,7 @@ static int setup_intel_arch_watchdog(unsigned nmi_hz) cpu_nmi_set_wd_enabled(); apic_write(APIC_LVTPC, APIC_DM_NMI); - evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE; + evntsel |= ARCH_PERFMON_EVENTSEL_ENABLE; wrmsr(evntsel_msr, evntsel, 0); intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1); return 1; diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 83e5e628de7..8b862d5900f 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -40,6 +40,7 @@ #include <linux/cpu.h> #include <linux/notifier.h> #include <linux/uaccess.h> +#include <linux/gfp.h> #include <asm/processor.h> #include <asm/msr.h> diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index a4849c10a77..ebd4c51d096 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -27,7 +27,6 @@ #include <asm/cpu.h> #include <asm/reboot.h> #include <asm/virtext.h> -#include <asm/x86_init.h> #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) @@ -103,10 +102,5 @@ void native_machine_crash_shutdown(struct pt_regs *regs) #ifdef CONFIG_HPET_TIMER hpet_disable(); #endif - -#ifdef CONFIG_X86_64 - x86_platform.iommu_shutdown(); -#endif - crash_save_cpu(regs, safe_smp_processor_id()); } diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c index cd97ce18c29..67414550c3c 100644 --- a/arch/x86/kernel/crash_dump_32.c +++ b/arch/x86/kernel/crash_dump_32.c @@ 
-5,6 +5,7 @@ * Copyright (C) IBM Corporation, 2004. All rights reserved */ +#include <linux/slab.h> #include <linux/errno.h> #include <linux/highmem.h> #include <linux/crash_dump.h> diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h index 4fd1420faff..e1a93be4fd4 100644 --- a/arch/x86/kernel/dumpstack.h +++ b/arch/x86/kernel/dumpstack.h @@ -14,6 +14,8 @@ #define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :) #endif +#include <linux/uaccess.h> + extern void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, unsigned long bp, char *log_lvl); @@ -29,4 +31,26 @@ struct stack_frame { struct stack_frame *next_frame; unsigned long return_address; }; + +struct stack_frame_ia32 { + u32 next_frame; + u32 return_address; +}; + +static inline unsigned long rewind_frame_pointer(int n) +{ + struct stack_frame *frame; + + get_bp(frame); + +#ifdef CONFIG_FRAME_POINTER + while (n--) { + if (probe_kernel_address(&frame->next_frame, frame)) + break; + } #endif + + return (unsigned long)frame; +} + +#endif /* DUMPSTACK_H */ diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index dce99abb449..272c9f1f05f 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -120,9 +120,15 @@ fixup_bp_irq_link(unsigned long bp, unsigned long *stack, { #ifdef CONFIG_FRAME_POINTER struct stack_frame *frame = (struct stack_frame *)bp; + unsigned long next; - if (!in_irq_stack(stack, irq_stack, irq_stack_end)) - return (unsigned long)frame->next_frame; + if (!in_irq_stack(stack, irq_stack, irq_stack_end)) { + if (!probe_kernel_address(&frame->next_frame, next)) + return next; + else + WARN_ONCE(1, "Perf: bad frame pointer = %p in " + "callchain\n", &frame->next_frame); + } #endif return bp; } @@ -202,7 +208,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, if (in_irq_stack(stack, irq_stack, irq_stack_end)) { if (ops->stack(data, "IRQ") < 0) break; - bp = print_context_stack(tinfo, stack, bp, + bp = ops->walk_stack(tinfo, stack, bp, ops, data, irq_stack_end, &graph); /* * We link to the next stack (which would be @@ -223,7 +229,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, /* * This handles the process stack: */ - bp = print_context_stack(tinfo, stack, bp, ops, data, NULL, &graph); + bp = ops->walk_stack(tinfo, stack, bp, ops, data, NULL, &graph); put_cpu(); } EXPORT_SYMBOL(dump_trace); diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 740b440fbd7..7bca3c6a02f 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -519,29 +519,45 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type, printk(KERN_DEBUG "e820 remove range: %016Lx - %016Lx ", (unsigned long long) start, (unsigned long long) end); - e820_print_type(old_type); + if (checktype) + e820_print_type(old_type); printk(KERN_CONT "\n"); for (i = 0; i < e820.nr_map; i++) { struct e820entry *ei = &e820.map[i]; u64 final_start, final_end; + u64 ei_end; if (checktype && ei->type != old_type) continue; + + ei_end = ei->addr + ei->size; /* totally covered? */ - if (ei->addr >= start && - (ei->addr + ei->size) <= (start + size)) { + if (ei->addr >= start && ei_end <= end) { real_removed_size += ei->size; memset(ei, 0, sizeof(struct e820entry)); continue; } + + /* new range is totally covered? 
*/ + if (ei->addr < start && ei_end > end) { + e820_add_region(end, ei_end - end, ei->type); + ei->size = start - ei->addr; + real_removed_size += size; + continue; + } + /* partially covered */ final_start = max(start, ei->addr); - final_end = min(start + size, ei->addr + ei->size); + final_end = min(end, ei_end); if (final_start >= final_end) continue; real_removed_size += final_end - final_start; + /* + * left range could be head or tail, so need to update + * size at first. + */ ei->size -= final_end - final_start; if (ei->addr < final_start) continue; diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index adedeef1ded..b2e24603739 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -7,6 +7,7 @@ #include <linux/init.h> #include <linux/start_kernel.h> +#include <linux/mm.h> #include <asm/setup.h> #include <asm/sections.h> @@ -44,9 +45,10 @@ void __init i386_start_kernel(void) #ifdef CONFIG_BLK_DEV_INITRD /* Reserve INITRD */ if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { + /* Assume only end is not page aligned */ u64 ramdisk_image = boot_params.hdr.ramdisk_image; u64 ramdisk_size = boot_params.hdr.ramdisk_size; - u64 ramdisk_end = ramdisk_image + ramdisk_size; + u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); } #endif diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index b5a9896ca1e..7147143fd61 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -103,9 +103,10 @@ void __init x86_64_start_reservations(char *real_mode_data) #ifdef CONFIG_BLK_DEV_INITRD /* Reserve INITRD */ if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { + /* Assume only end is not page aligned */ unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; unsigned long ramdisk_size = boot_params.hdr.ramdisk_size; - unsigned long ramdisk_end = ramdisk_image + ramdisk_size; + unsigned long ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); } #endif diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index ee4fa1bfcb3..23b4ecdffa9 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -4,6 +4,7 @@ #include <linux/sysdev.h> #include <linux/delay.h> #include <linux/errno.h> +#include <linux/slab.h> #include <linux/hpet.h> #include <linux/init.h> #include <linux/cpu.h> @@ -399,9 +400,15 @@ static int hpet_next_event(unsigned long delta, * then we might have a real hardware problem. We can not do * much about it here, but at least alert the user/admin with * a prominent warning. + * An erratum on some chipsets (ICH9,..), results in comparator read + * immediately following a write returning old value. Workaround + * for this is to read this value second time, when first + * read returns old value. */ - WARN_ONCE(hpet_readl(HPET_Tn_CMP(timer)) != cnt, + if (unlikely((u32)hpet_readl(HPET_Tn_CMP(timer)) != cnt)) { + WARN_ONCE(hpet_readl(HPET_Tn_CMP(timer)) != cnt, KERN_WARNING "hpet: compare register read back failed.\n"); + } return (s32)(hpet_readl(HPET_COUNTER) - cnt) >= 0 ? 
-ETIME : 0; } @@ -1143,6 +1150,7 @@ int hpet_set_periodic_freq(unsigned long freq) do_div(clc, freq); clc >>= hpet_clockevent.shift; hpet_pie_delta = clc; + hpet_pie_limit = 0; } return 1; } diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index dca2802c666..d6cc065f519 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -344,13 +344,6 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp, } /* - * For kernel-addresses, either the address or symbol name can be - * specified. - */ - if (info->name) - info->address = (unsigned long) - kallsyms_lookup_name(info->name); - /* * Check that the low-order bits of the address are appropriate * for the alignment implied by len. */ @@ -535,8 +528,3 @@ void hw_breakpoint_pmu_read(struct perf_event *bp) { /* TODO */ } - -void hw_breakpoint_pmu_unthrottle(struct perf_event *bp) -{ - /* TODO */ -} diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index c01a2b846d4..54c31c28548 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -8,6 +8,7 @@ #include <linux/module.h> #include <linux/regset.h> #include <linux/sched.h> +#include <linux/slab.h> #include <asm/sigcontext.h> #include <asm/processor.h> diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index fb725ee15f5..7c9f02c130f 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -5,7 +5,6 @@ #include <linux/ioport.h> #include <linux/interrupt.h> #include <linux/timex.h> -#include <linux/slab.h> #include <linux/random.h> #include <linux/init.h> #include <linux/kernel_stat.h> diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index ef257fc2921..0ed2d300cd4 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -5,7 +5,6 @@ #include <linux/ioport.h> #include <linux/interrupt.h> #include <linux/timex.h> -#include <linux/slab.h> #include <linux/random.h> #include <linux/kprobes.h> #include <linux/init.h> @@ -141,6 +140,28 @@ void __init init_IRQ(void) x86_init.irqs.intr_init(); } +/* + * Setup the vector to irq mappings. + */ +void setup_vector_irq(int cpu) +{ +#ifndef CONFIG_X86_IO_APIC + int irq; + + /* + * On most of the platforms, legacy PIC delivers the interrupts on the + * boot cpu. But there are certain platforms where PIC interrupts are + * delivered to multiple cpu's. If the legacy IRQ is handled by the + * legacy PIC, for the new cpu that is coming online, setup the static + * legacy vector to irq mapping: + */ + for (irq = 0; irq < legacy_pic->nr_legacy_irqs; irq++) + per_cpu(vector_irq, cpu)[IRQ0_VECTOR + irq] = irq; +#endif + + __setup_vector_irq(cpu); +} + static void __init smp_intr_init(void) { #ifdef CONFIG_SMP diff --git a/arch/x86/kernel/k8.c b/arch/x86/kernel/k8.c index cbc4332a77b..0f7bc20cfcd 100644 --- a/arch/x86/kernel/k8.c +++ b/arch/x86/kernel/k8.c @@ -2,8 +2,8 @@ * Shared support code for AMD K8 northbridges and derivates. * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2. 
*/ -#include <linux/gfp.h> #include <linux/types.h> +#include <linux/slab.h> #include <linux/init.h> #include <linux/errno.h> #include <linux/module.h> @@ -121,3 +121,17 @@ void k8_flush_garts(void) } EXPORT_SYMBOL_GPL(k8_flush_garts); +static __init int init_k8_nbs(void) +{ + int err = 0; + + err = cache_k8_northbridges(); + + if (err < 0) + printk(KERN_NOTICE "K8 NB: Cannot enumerate AMD northbridges.\n"); + + return err; +} + +/* This has to go after the PCI subsystem */ +fs_initcall(init_k8_nbs); diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c index e444357375c..8afd9f321f1 100644 --- a/arch/x86/kernel/kdebugfs.c +++ b/arch/x86/kernel/kdebugfs.c @@ -9,6 +9,7 @@ #include <linux/debugfs.h> #include <linux/uaccess.h> #include <linux/module.h> +#include <linux/slab.h> #include <linux/init.h> #include <linux/stat.h> #include <linux/io.h> diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index bfba6019d76..b2258ca9100 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -618,8 +618,8 @@ int kgdb_arch_init(void) * portion of kgdb because this operation requires mutexs to * complete. */ + hw_breakpoint_init(&attr); attr.bp_addr = (unsigned long)kgdb_arch_init; - attr.type = PERF_TYPE_BREAKPOINT; attr.bp_len = HW_BREAKPOINT_LEN_1; attr.bp_type = HW_BREAKPOINT_W; attr.disabled = 1; diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index ec6ef60cbd1..ea697263b37 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -7,6 +7,7 @@ */ #include <linux/errno.h> +#include <linux/gfp.h> #include <linux/sched.h> #include <linux/string.h> #include <linux/mm.h> diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 4a8bb82248a..035c8c52918 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -9,6 +9,7 @@ #include <linux/mm.h> #include <linux/kexec.h> #include <linux/string.h> +#include <linux/gfp.h> #include <linux/reboot.h> #include <linux/numa.h> #include <linux/ftrace.h> diff --git a/arch/x86/kernel/mca_32.c b/arch/x86/kernel/mca_32.c index 845d80ce1ef..63eaf659623 100644 --- a/arch/x86/kernel/mca_32.c +++ b/arch/x86/kernel/mca_32.c @@ -42,6 +42,7 @@ #include <linux/kernel.h> #include <linux/mca.h> #include <linux/kprobes.h> +#include <linux/slab.h> #include <asm/system.h> #include <asm/io.h> #include <linux/proc_fs.h> diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 89f386f044e..e0bc186d750 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -23,6 +23,7 @@ #include <linux/kernel.h> #include <linux/bug.h> #include <linux/mm.h> +#include <linux/gfp.h> #include <asm/system.h> #include <asm/page.h> diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index a2c1edd2d3a..e81030f71a8 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -664,7 +664,7 @@ static void __init smp_reserve_memory(struct mpf_intel *mpf) { unsigned long size = get_mpc_size(mpf->physptr); - reserve_early(mpf->physptr, mpf->physptr+size, "MP-table mpc"); + reserve_early_overlap_ok(mpf->physptr, mpf->physptr+size, "MP-table mpc"); } static int __init smp_scan_config(unsigned long base, unsigned long length) @@ -693,7 +693,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length) mpf, (u64)virt_to_phys(mpf)); mem = virt_to_phys(mpf); - reserve_early(mem, mem + sizeof(*mpf), "MP-table mpf"); + reserve_early_overlap_ok(mem, mem + sizeof(*mpf), "MP-table mpf"); if (mpf->physptr) smp_reserve_memory(mpf); 
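The k8.c hunk above turns AMD northbridge enumeration into a one-time fs_initcall (init_k8_nbs()), run after the PCI subsystem is up; later code, such as the GART setup further down, can then simply test num_k8_northbridges instead of calling cache_k8_northbridges() itself. A minimal sketch of such a consumer, assuming the declarations from <asm/k8.h> and the usual initcall ordering (fs_initcall before device_initcall); the function name here is illustrative only:

#include <linux/init.h>
#include <linux/errno.h>
#include <asm/k8.h>

/* Hypothetical late consumer of the northbridge cache filled by init_k8_nbs() */
static int __init example_k8_nb_user_init(void)
{
	/* init_k8_nbs() already ran at fs_initcall time, so just test the cache */
	if (num_k8_northbridges == 0)
		return -ENODEV;		/* no AMD northbridge on this system */

	/* The cached devices are now safe to use, e.g. to flush the GART TLBs */
	k8_flush_garts();
	return 0;
}
/* device_initcall() runs after fs_initcall(), which guarantees the ordering */
device_initcall(example_k8_nb_user_init);

The same ordering argument is what lets gart_iommu_init() in the pci-gart_64.c hunk below drop its explicit cache_k8_northbridges() call and rely on num_k8_northbridges alone.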
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 206735ac8cb..4d4468e9f47 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -37,6 +37,7 @@ #include <linux/cpu.h> #include <linux/notifier.h> #include <linux/uaccess.h> +#include <linux/gfp.h> #include <asm/processor.h> #include <asm/msr.h> diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index a4ac764a688..4b7e3d8b01d 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -2,6 +2,7 @@ #include <linux/dma-debug.h> #include <linux/dmar.h> #include <linux/bootmem.h> +#include <linux/gfp.h> #include <linux/pci.h> #include <linux/kmemleak.h> diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 34de53b46f8..0f7f130caa6 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -29,6 +29,7 @@ #include <linux/iommu-helper.h> #include <linux/sysdev.h> #include <linux/io.h> +#include <linux/gfp.h> #include <asm/atomic.h> #include <asm/mtrr.h> #include <asm/pgtable.h> @@ -564,6 +565,9 @@ static void enable_gart_translations(void) enable_gart_translation(dev, __pa(agp_gatt_table)); } + + /* Flush the GART-TLB to remove stale entries */ + k8_flush_garts(); } /* @@ -735,7 +739,7 @@ int __init gart_iommu_init(void) unsigned long scratch; long i; - if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0) + if (num_k8_northbridges == 0) return 0; #ifndef CONFIG_AGP_AMD64 diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index 22be12b60a8..3af4af810c0 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -4,6 +4,7 @@ #include <linux/scatterlist.h> #include <linux/string.h> #include <linux/init.h> +#include <linux/gfp.h> #include <linux/pci.h> #include <linux/mm.h> diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 02d678065d7..28ad9f4d8b9 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -526,21 +526,37 @@ static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c) } /* - * Check for AMD CPUs, which have potentially C1E support + * Check for AMD CPUs, where APIC timer interrupt does not wake up CPU from C1e. 
+ * For more information see + * - Erratum #400 for NPT family 0xf and family 0x10 CPUs + * - Erratum #365 for family 0x11 (not affected because C1e not in use) */ static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c) { + u64 val; if (c->x86_vendor != X86_VENDOR_AMD) - return 0; - - if (c->x86 < 0x0F) - return 0; + goto no_c1e_idle; /* Family 0x0f models < rev F do not have C1E */ - if (c->x86 == 0x0f && c->x86_model < 0x40) - return 0; + if (c->x86 == 0x0F && c->x86_model >= 0x40) + return 1; - return 1; + if (c->x86 == 0x10) { + /* + * check OSVW bit for CPUs that are not affected + * by erratum #400 + */ + rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, val); + if (val >= 2) { + rdmsrl(MSR_AMD64_OSVW_STATUS, val); + if (!(val & BIT(1))) + goto no_c1e_idle; + } + return 1; + } + +no_c1e_idle: + return 0; } static cpumask_var_t c1e_mask; @@ -607,7 +623,7 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP if (pm_idle == poll_idle && smp_num_siblings > 1) { - printk(KERN_WARNING "WARNING: polling idle and HT enabled," + printk_once(KERN_WARNING "WARNING: polling idle and HT enabled," " performance may degrade.\n"); } #endif diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index a503b1fd04e..2e9b55027b7 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -12,6 +12,7 @@ #include <linux/mm.h> #include <linux/smp.h> #include <linux/errno.h> +#include <linux/slab.h> #include <linux/ptrace.h> #include <linux/regset.h> #include <linux/tracehook.h> diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 5d7ba1a449b..c4851eff57b 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -55,7 +55,6 @@ #include <linux/stddef.h> #include <linux/unistd.h> #include <linux/ptrace.h> -#include <linux/slab.h> #include <linux/user.h> #include <linux/delay.h> @@ -314,16 +313,17 @@ static void __init reserve_brk(void) #define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) static void __init relocate_initrd(void) { - + /* Assume only end is not page aligned */ u64 ramdisk_image = boot_params.hdr.ramdisk_image; u64 ramdisk_size = boot_params.hdr.ramdisk_size; + u64 area_size = PAGE_ALIGN(ramdisk_size); u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT; u64 ramdisk_here; unsigned long slop, clen, mapaddr; char *p, *q; /* We need to move the initrd down into lowmem */ - ramdisk_here = find_e820_area(0, end_of_lowmem, ramdisk_size, + ramdisk_here = find_e820_area(0, end_of_lowmem, area_size, PAGE_SIZE); if (ramdisk_here == -1ULL) @@ -332,7 +332,7 @@ static void __init relocate_initrd(void) /* Note: this includes all the lowmem currently occupied by the initrd, we rely on that fact to keep the data intact. 
*/ - reserve_early(ramdisk_here, ramdisk_here + ramdisk_size, + reserve_early(ramdisk_here, ramdisk_here + area_size, "NEW RAMDISK"); initrd_start = ramdisk_here + PAGE_OFFSET; initrd_end = initrd_start + ramdisk_size; @@ -376,9 +376,10 @@ static void __init relocate_initrd(void) static void __init reserve_initrd(void) { + /* Assume only end is not page aligned */ u64 ramdisk_image = boot_params.hdr.ramdisk_image; u64 ramdisk_size = boot_params.hdr.ramdisk_size; - u64 ramdisk_end = ramdisk_image + ramdisk_size; + u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT; if (!boot_params.hdr.type_of_loader || @@ -606,6 +607,16 @@ static int __init setup_elfcorehdr(char *arg) early_param("elfcorehdr", setup_elfcorehdr); #endif +static __init void reserve_ibft_region(void) +{ + unsigned long addr, size = 0; + + addr = find_ibft_region(&size); + + if (size) + reserve_early_overlap_ok(addr, addr + size, "ibft"); +} + #ifdef CONFIG_X86_RESERVE_LOW_64K static int __init dmi_low_memory_corruption(const struct dmi_system_id *d) { @@ -908,6 +919,8 @@ void __init setup_arch(char **cmdline_p) */ find_smp_config(); + reserve_ibft_region(); + reserve_trampoline_memory(); #ifdef CONFIG_ACPI_SLEEP @@ -975,8 +988,6 @@ void __init setup_arch(char **cmdline_p) dma32_reserve_bootmem(); - reserve_ibft_region(); - #ifdef CONFIG_KVM_CLOCK kvmclock_init(); #endif diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index ec1de97600e..d801210945d 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -21,6 +21,7 @@ #include <linux/cache.h> #include <linux/interrupt.h> #include <linux/cpu.h> +#include <linux/gfp.h> #include <asm/mtrr.h> #include <asm/tlbflush.h> diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index a02e80c3c54..763d815e27a 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -49,6 +49,7 @@ #include <linux/nmi.h> #include <linux/tboot.h> #include <linux/stackprotector.h> +#include <linux/gfp.h> #include <asm/acpi.h> #include <asm/desc.h> @@ -242,12 +243,10 @@ static void __cpuinit smp_callin(void) end_local_APIC_setup(); map_cpu_to_logical_apicid(); - notify_cpu_starting(cpuid); - /* * Need to setup vector mappings before we enable interrupts. */ - __setup_vector_irq(smp_processor_id()); + setup_vector_irq(smp_processor_id()); /* * Get our bogomips. * @@ -264,6 +263,8 @@ static void __cpuinit smp_callin(void) */ smp_store_cpu_info(cpuid); + notify_cpu_starting(cpuid); + /* * Allow the master to continue. 
*/ diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 364d015efeb..17b03dd3a6b 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -9,6 +9,7 @@ #include <linux/seq_file.h> #include <linux/proc_fs.h> #include <linux/kernel.h> +#include <linux/slab.h> #include <asm/mmu_context.h> #include <asm/uv/uv.h> diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c index ece73d8e324..1d40336b030 100644 --- a/arch/x86/kernel/uv_irq.c +++ b/arch/x86/kernel/uv_irq.c @@ -10,6 +10,7 @@ #include <linux/module.h> #include <linux/rbtree.h> +#include <linux/slab.h> #include <linux/irq.h> #include <asm/apic.h> diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c index 2b75ef638db..56e421bc379 100644 --- a/arch/x86/kernel/uv_time.c +++ b/arch/x86/kernel/uv_time.c @@ -19,6 +19,7 @@ * Copyright (c) Dimitri Sivanich */ #include <linux/clockchips.h> +#include <linux/slab.h> #include <asm/uv/uv_mmrs.h> #include <asm/uv/uv_hub.h> diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 7dd599deca4..ce9fbacb752 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -28,6 +28,7 @@ #include <linux/mm.h> #include <linux/highmem.h> #include <linux/sched.h> +#include <linux/gfp.h> #include <asm/vmi.h> #include <asm/io.h> #include <asm/fixmap.h> diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 44879df5569..2cc249718c4 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -291,8 +291,8 @@ SECTIONS .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { __smp_locks = .; *(.smp_locks) - __smp_locks_end = .; . = ALIGN(PAGE_SIZE); + __smp_locks_end = .; } #ifdef CONFIG_X86_64 diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 294698b6daf..0150affad25 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -32,6 +32,7 @@ #define pr_fmt(fmt) "pit: " fmt #include <linux/kvm_host.h> +#include <linux/slab.h> #include "irq.h" #include "i8254.h" diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 07771da85de..a790fa128a9 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -26,6 +26,7 @@ * Port from Qemu. 
*/ #include <linux/mm.h> +#include <linux/slab.h> #include <linux/bitops.h> #include "irq.h" diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 4b224f90087..1eb7a4ae0c9 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -26,6 +26,7 @@ #include <linux/io.h> #include <linux/module.h> #include <linux/math64.h> +#include <linux/slab.h> #include <asm/processor.h> #include <asm/msr.h> #include <asm/page.h> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 741373e8ca7..19a8906bcaa 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -31,6 +31,7 @@ #include <linux/hugetlb.h> #include <linux/compiler.h> #include <linux/srcu.h> +#include <linux/slab.h> #include <asm/page.h> #include <asm/cmpxchg.h> @@ -1489,8 +1490,8 @@ static int mmu_zap_unsync_children(struct kvm *kvm, for_each_sp(pages, sp, parents, i) { kvm_mmu_zap_page(kvm, sp); mmu_pages_clear_parents(&parents); + zapped++; } - zapped += pages.nr; kvm_mmu_pages_init(parent, &parents, &pages); } @@ -1541,14 +1542,16 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) */ if (used_pages > kvm_nr_mmu_pages) { - while (used_pages > kvm_nr_mmu_pages) { + while (used_pages > kvm_nr_mmu_pages && + !list_empty(&kvm->arch.active_mmu_pages)) { struct kvm_mmu_page *page; page = container_of(kvm->arch.active_mmu_pages.prev, struct kvm_mmu_page, link); - kvm_mmu_zap_page(kvm, page); + used_pages -= kvm_mmu_zap_page(kvm, page); used_pages--; } + kvm_nr_mmu_pages = used_pages; kvm->arch.n_free_mmu_pages = 0; } else @@ -1595,7 +1598,8 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) && !sp->role.invalid) { pgprintk("%s: zap %lx %x\n", __func__, gfn, sp->role.word); - kvm_mmu_zap_page(kvm, sp); + if (kvm_mmu_zap_page(kvm, sp)) + nn = bucket->first; } } } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 52f78dd0301..2ba58206812 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -26,6 +26,7 @@ #include <linux/highmem.h> #include <linux/sched.h> #include <linux/ftrace_event.h> +#include <linux/slab.h> #include <asm/desc.h> @@ -705,29 +706,28 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) if (err) goto free_svm; + err = -ENOMEM; page = alloc_page(GFP_KERNEL); - if (!page) { - err = -ENOMEM; + if (!page) goto uninit; - } - err = -ENOMEM; msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); if (!msrpm_pages) - goto uninit; + goto free_page1; nested_msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); if (!nested_msrpm_pages) - goto uninit; - - svm->msrpm = page_address(msrpm_pages); - svm_vcpu_init_msrpm(svm->msrpm); + goto free_page2; hsave_page = alloc_page(GFP_KERNEL); if (!hsave_page) - goto uninit; + goto free_page3; + svm->nested.hsave = page_address(hsave_page); + svm->msrpm = page_address(msrpm_pages); + svm_vcpu_init_msrpm(svm->msrpm); + svm->nested.msrpm = page_address(nested_msrpm_pages); svm->vmcb = page_address(page); @@ -743,6 +743,12 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) return &svm->vcpu; +free_page3: + __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER); +free_page2: + __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER); +free_page1: + __free_page(page); uninit: kvm_vcpu_uninit(&svm->vcpu); free_svm: diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 14873b9f843..bc933cfb4e6 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -26,6 +26,7 @@ #include <linux/sched.h> #include <linux/moduleparam.h> #include <linux/ftrace_event.h> +#include <linux/slab.h> #include 
"kvm_cache_regs.h" #include "x86.h" @@ -76,6 +77,8 @@ module_param(emulate_invalid_guest_state, bool, S_IRUGO); #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) #define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) +#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM)) + /* * These 2 parameters are used to config the controls for Pause-Loop Exiting: * ple_gap: upper bound on the amount of time between two successive @@ -130,7 +133,7 @@ struct vcpu_vmx { } host_state; struct { int vm86_active; - u8 save_iopl; + ulong save_rflags; struct kvm_save_segment { u16 selector; unsigned long base; @@ -817,18 +820,23 @@ static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) { - unsigned long rflags; + unsigned long rflags, save_rflags; rflags = vmcs_readl(GUEST_RFLAGS); - if (to_vmx(vcpu)->rmode.vm86_active) - rflags &= ~(unsigned long)(X86_EFLAGS_IOPL | X86_EFLAGS_VM); + if (to_vmx(vcpu)->rmode.vm86_active) { + rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS; + save_rflags = to_vmx(vcpu)->rmode.save_rflags; + rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; + } return rflags; } static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { - if (to_vmx(vcpu)->rmode.vm86_active) + if (to_vmx(vcpu)->rmode.vm86_active) { + to_vmx(vcpu)->rmode.save_rflags = rflags; rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; + } vmcs_writel(GUEST_RFLAGS, rflags); } @@ -1482,8 +1490,8 @@ static void enter_pmode(struct kvm_vcpu *vcpu) vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar); flags = vmcs_readl(GUEST_RFLAGS); - flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM); - flags |= (vmx->rmode.save_iopl << IOPL_SHIFT); + flags &= RMODE_GUEST_OWNED_EFLAGS_BITS; + flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; vmcs_writel(GUEST_RFLAGS, flags); vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | @@ -1556,8 +1564,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu) vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); flags = vmcs_readl(GUEST_RFLAGS); - vmx->rmode.save_iopl - = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; + vmx->rmode.save_rflags = flags; flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e46282a5656..3c4ca98ad27 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -39,6 +39,7 @@ #include <linux/cpufreq.h> #include <linux/user-return-notifier.h> #include <linux/srcu.h> +#include <linux/slab.h> #include <trace/events/kvm.h> #undef TRACE_INCLUDE_FILE #define CREATE_TRACE_POINTS @@ -432,8 +433,6 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) #ifdef CONFIG_X86_64 if (cr0 & 0xffffffff00000000UL) { - printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", - cr0, kvm_read_cr0(vcpu)); kvm_inject_gp(vcpu, 0); return; } @@ -442,14 +441,11 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) cr0 &= ~CR0_RESERVED_BITS; if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { - printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n"); kvm_inject_gp(vcpu, 0); return; } if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) { - printk(KERN_DEBUG "set_cr0: #GP, set PG flag " - "and a clear PE flag\n"); kvm_inject_gp(vcpu, 0); return; } @@ -460,15 +456,11 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) int cs_db, cs_l; if (!is_pae(vcpu)) { - printk(KERN_DEBUG "set_cr0: #GP, start paging " - "in long mode while PAE is disabled\n"); kvm_inject_gp(vcpu, 0); return; } kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, 
&cs_l); if (cs_l) { - printk(KERN_DEBUG "set_cr0: #GP, start paging " - "in long mode while CS.L == 1\n"); kvm_inject_gp(vcpu, 0); return; @@ -476,8 +468,6 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) } else #endif if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) { - printk(KERN_DEBUG "set_cr0: #GP, pdptrs " - "reserved bits\n"); kvm_inject_gp(vcpu, 0); return; } @@ -504,28 +494,23 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE; if (cr4 & CR4_RESERVED_BITS) { - printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n"); kvm_inject_gp(vcpu, 0); return; } if (is_long_mode(vcpu)) { if (!(cr4 & X86_CR4_PAE)) { - printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while " - "in long mode\n"); kvm_inject_gp(vcpu, 0); return; } } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) && ((cr4 ^ old_cr4) & pdptr_bits) && !load_pdptrs(vcpu, vcpu->arch.cr3)) { - printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n"); kvm_inject_gp(vcpu, 0); return; } if (cr4 & X86_CR4_VMXE) { - printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n"); kvm_inject_gp(vcpu, 0); return; } @@ -546,21 +531,16 @@ void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) if (is_long_mode(vcpu)) { if (cr3 & CR3_L_MODE_RESERVED_BITS) { - printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n"); kvm_inject_gp(vcpu, 0); return; } } else { if (is_pae(vcpu)) { if (cr3 & CR3_PAE_RESERVED_BITS) { - printk(KERN_DEBUG - "set_cr3: #GP, reserved bits\n"); kvm_inject_gp(vcpu, 0); return; } if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) { - printk(KERN_DEBUG "set_cr3: #GP, pdptrs " - "reserved bits\n"); kvm_inject_gp(vcpu, 0); return; } @@ -592,7 +572,6 @@ EXPORT_SYMBOL_GPL(kvm_set_cr3); void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) { if (cr8 & CR8_RESERVED_BITS) { - printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8); kvm_inject_gp(vcpu, 0); return; } @@ -648,15 +627,12 @@ static u32 emulated_msrs[] = { static void set_efer(struct kvm_vcpu *vcpu, u64 efer) { if (efer & efer_reserved_bits) { - printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n", - efer); kvm_inject_gp(vcpu, 0); return; } if (is_paging(vcpu) && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) { - printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n"); kvm_inject_gp(vcpu, 0); return; } @@ -666,7 +642,6 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer) feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) { - printk(KERN_DEBUG "set_efer: #GP, enable FFXSR w/o CPUID capability\n"); kvm_inject_gp(vcpu, 0); return; } @@ -677,7 +652,6 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer) feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) { - printk(KERN_DEBUG "set_efer: #GP, enable SVM w/o SVM\n"); kvm_inject_gp(vcpu, 0); return; } @@ -966,9 +940,13 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data) if (msr >= MSR_IA32_MC0_CTL && msr < MSR_IA32_MC0_CTL + 4 * bank_num) { u32 offset = msr - MSR_IA32_MC0_CTL; - /* only 0 or all 1s can be written to IA32_MCi_CTL */ + /* only 0 or all 1s can be written to IA32_MCi_CTL + * some Linux kernels though clear bit 10 in bank 4 to + * workaround a BIOS/GART TBL issue on AMD K8s, ignore + * this to avoid an uncatched #GP in the guest + */ if ((offset & 0x3) == 0 && - data != 0 && data != ~(u64)0) + data != 0 && (data | (1 << 10)) != ~(u64)0) return -1; vcpu->arch.mce_banks[offset] = data; break; @@ 
-2634,8 +2612,9 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm, int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { - int r, n, i; + int r, i; struct kvm_memory_slot *memslot; + unsigned long n; unsigned long is_dirty = 0; unsigned long *dirty_bitmap = NULL; @@ -2650,7 +2629,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, if (!memslot->dirty_bitmap) goto out; - n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; + n = kvm_dirty_bitmap_bytes(memslot); r = -ENOMEM; dirty_bitmap = vmalloc(n); @@ -4482,7 +4461,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) kvm_set_cr8(vcpu, kvm_run->cr8); if (vcpu->arch.pio.cur_count) { + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); r = complete_pio(vcpu); + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); if (r) goto out; } @@ -5145,6 +5126,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) int ret = 0; u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR); u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR); + u32 desc_limit; old_tss_base = kvm_mmu_gva_to_gpa_write(vcpu, old_tss_base, NULL); @@ -5167,7 +5149,10 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) } } - if (!nseg_desc.p || get_desc_limit(&nseg_desc) < 0x67) { + desc_limit = get_desc_limit(&nseg_desc); + if (!nseg_desc.p || + ((desc_limit < 0x67 && (nseg_desc.type & 8)) || + desc_limit < 0x2b)) { kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc); return 1; } diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 7e59dc1d3fc..2bdf628066b 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -115,7 +115,7 @@ static void async_hcall(unsigned long call, unsigned long arg1, local_irq_save(flags); if (lguest_data.hcall_status[next_call] != 0xFF) { /* Table full, so do normal hcall which will flush table. */ - kvm_hypercall4(call, arg1, arg2, arg3, arg4); + hcall(call, arg1, arg2, arg3, arg4); } else { lguest_data.hcalls[next_call].arg0 = call; lguest_data.hcalls[next_call].arg1 = arg1; @@ -145,46 +145,45 @@ static void async_hcall(unsigned long call, unsigned long arg1, * So, when we're in lazy mode, we call async_hcall() to store the call for * future processing: */ -static void lazy_hcall1(unsigned long call, - unsigned long arg1) +static void lazy_hcall1(unsigned long call, unsigned long arg1) { if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) - kvm_hypercall1(call, arg1); + hcall(call, arg1, 0, 0, 0); else async_hcall(call, arg1, 0, 0, 0); } /* You can imagine what lazy_hcall2, 3 and 4 look like. 
:*/ static void lazy_hcall2(unsigned long call, - unsigned long arg1, - unsigned long arg2) + unsigned long arg1, + unsigned long arg2) { if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) - kvm_hypercall2(call, arg1, arg2); + hcall(call, arg1, arg2, 0, 0); else async_hcall(call, arg1, arg2, 0, 0); } static void lazy_hcall3(unsigned long call, - unsigned long arg1, - unsigned long arg2, - unsigned long arg3) + unsigned long arg1, + unsigned long arg2, + unsigned long arg3) { if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) - kvm_hypercall3(call, arg1, arg2, arg3); + hcall(call, arg1, arg2, arg3, 0); else async_hcall(call, arg1, arg2, arg3, 0); } #ifdef CONFIG_X86_PAE static void lazy_hcall4(unsigned long call, - unsigned long arg1, - unsigned long arg2, - unsigned long arg3, - unsigned long arg4) + unsigned long arg1, + unsigned long arg2, + unsigned long arg3, + unsigned long arg4) { if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) - kvm_hypercall4(call, arg1, arg2, arg3, arg4); + hcall(call, arg1, arg2, arg3, arg4); else async_hcall(call, arg1, arg2, arg3, arg4); } @@ -196,13 +195,13 @@ static void lazy_hcall4(unsigned long call, :*/ static void lguest_leave_lazy_mmu_mode(void) { - kvm_hypercall0(LHCALL_FLUSH_ASYNC); + hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0); paravirt_leave_lazy_mmu(); } static void lguest_end_context_switch(struct task_struct *next) { - kvm_hypercall0(LHCALL_FLUSH_ASYNC); + hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0); paravirt_end_context_switch(next); } @@ -286,7 +285,7 @@ static void lguest_write_idt_entry(gate_desc *dt, /* Keep the local copy up to date. */ native_write_idt_entry(dt, entrynum, g); /* Tell Host about this new entry. */ - kvm_hypercall3(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1]); + hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1], 0); } /* @@ -300,7 +299,7 @@ static void lguest_load_idt(const struct desc_ptr *desc) struct desc_struct *idt = (void *)desc->address; for (i = 0; i < (desc->size+1)/8; i++) - kvm_hypercall3(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b); + hcall(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b, 0); } /* @@ -321,7 +320,7 @@ static void lguest_load_gdt(const struct desc_ptr *desc) struct desc_struct *gdt = (void *)desc->address; for (i = 0; i < (desc->size+1)/8; i++) - kvm_hypercall3(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b); + hcall(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b, 0); } /* @@ -334,8 +333,8 @@ static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum, { native_write_gdt_entry(dt, entrynum, desc, type); /* Tell Host about this new entry. */ - kvm_hypercall3(LHCALL_LOAD_GDT_ENTRY, entrynum, - dt[entrynum].a, dt[entrynum].b); + hcall(LHCALL_LOAD_GDT_ENTRY, entrynum, + dt[entrynum].a, dt[entrynum].b, 0); } /* @@ -931,7 +930,7 @@ static int lguest_clockevent_set_next_event(unsigned long delta, } /* Please wake us this far in the future. */ - kvm_hypercall1(LHCALL_SET_CLOCKEVENT, delta); + hcall(LHCALL_SET_CLOCKEVENT, delta, 0, 0, 0); return 0; } @@ -942,7 +941,7 @@ static void lguest_clockevent_set_mode(enum clock_event_mode mode, case CLOCK_EVT_MODE_UNUSED: case CLOCK_EVT_MODE_SHUTDOWN: /* A 0 argument shuts the clock down. */ - kvm_hypercall0(LHCALL_SET_CLOCKEVENT); + hcall(LHCALL_SET_CLOCKEVENT, 0, 0, 0, 0); break; case CLOCK_EVT_MODE_ONESHOT: /* This is what we expect. */ @@ -1100,7 +1099,7 @@ static void set_lguest_basic_apic_ops(void) /* STOP! Until an interrupt comes in. 
*/ static void lguest_safe_halt(void) { - kvm_hypercall0(LHCALL_HALT); + hcall(LHCALL_HALT, 0, 0, 0, 0); } /* @@ -1112,8 +1111,8 @@ static void lguest_safe_halt(void) */ static void lguest_power_off(void) { - kvm_hypercall2(LHCALL_SHUTDOWN, __pa("Power down"), - LGUEST_SHUTDOWN_POWEROFF); + hcall(LHCALL_SHUTDOWN, __pa("Power down"), + LGUEST_SHUTDOWN_POWEROFF, 0, 0); } /* @@ -1123,7 +1122,7 @@ static void lguest_power_off(void) */ static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p) { - kvm_hypercall2(LHCALL_SHUTDOWN, __pa(p), LGUEST_SHUTDOWN_POWEROFF); + hcall(LHCALL_SHUTDOWN, __pa(p), LGUEST_SHUTDOWN_POWEROFF, 0, 0); /* The hcall won't return, but to keep gcc happy, we're "done". */ return NOTIFY_DONE; } @@ -1162,7 +1161,7 @@ static __init int early_put_chars(u32 vtermno, const char *buf, int count) len = sizeof(scratch) - 1; scratch[len] = '\0'; memcpy(scratch, buf, len); - kvm_hypercall1(LHCALL_NOTIFY, __pa(scratch)); + hcall(LHCALL_NOTIFY, __pa(scratch), 0, 0, 0); /* This routine returns the number of bytes actually written. */ return len; @@ -1174,7 +1173,7 @@ static __init int early_put_chars(u32 vtermno, const char *buf, int count) */ static void lguest_restart(char *reason) { - kvm_hypercall2(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART); + hcall(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART, 0, 0); } /*G:050 diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S index 27eac0faee4..4f420c2f2d5 100644 --- a/arch/x86/lguest/i386_head.S +++ b/arch/x86/lguest/i386_head.S @@ -32,7 +32,7 @@ ENTRY(lguest_entry) */ movl $LHCALL_LGUEST_INIT, %eax movl $lguest_data - __PAGE_OFFSET, %ebx - .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */ + int $LGUEST_TRAP_ENTRY /* Set up the initial stack so we can run C code. 
*/ movl $(init_thread_union+THREAD_SIZE),%esp diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index f46c340727b..069ce7c37c0 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -9,7 +9,6 @@ #include <linux/mm.h> #include <linux/hugetlb.h> #include <linux/pagemap.h> -#include <linux/slab.h> #include <linux/err.h> #include <linux/sysctl.h> #include <asm/mman.h> diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index e71c5cbc8f3..b278535b14a 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -1,3 +1,4 @@ +#include <linux/gfp.h> #include <linux/initrd.h> #include <linux/ioport.h> #include <linux/swap.h> @@ -331,11 +332,23 @@ int devmem_is_allowed(unsigned long pagenr) void free_init_pages(char *what, unsigned long begin, unsigned long end) { - unsigned long addr = begin; + unsigned long addr; + unsigned long begin_aligned, end_aligned; - if (addr >= end) + /* Make sure boundaries are page aligned */ + begin_aligned = PAGE_ALIGN(begin); + end_aligned = end & PAGE_MASK; + + if (WARN_ON(begin_aligned != begin || end_aligned != end)) { + begin = begin_aligned; + end = end_aligned; + } + + if (begin >= end) return; + addr = begin; + /* * If debugging page accesses then do not free this memory but * mark them not present - any buggy init-section access will @@ -343,7 +356,7 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) */ #ifdef CONFIG_DEBUG_PAGEALLOC printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n", - begin, PAGE_ALIGN(end)); + begin, end); set_memory_np(begin, (end - begin) >> PAGE_SHIFT); #else /* @@ -358,8 +371,7 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) for (; addr < end; addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); init_page_count(virt_to_page(addr)); - memset((void *)(addr & ~(PAGE_SIZE-1)), - POISON_FREE_INITMEM, PAGE_SIZE); + memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE); free_page(addr); totalram_pages++; } @@ -376,6 +388,15 @@ void free_initmem(void) #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - free_init_pages("initrd memory", start, end); + /* + * end may not be page aligned, and we cannot align it here: the + * decompressor could be confused by an aligned initrd_end. + * The trailing partial page was already reserved in + * - i386_start_kernel() + * - x86_64_start_kernel() + * - relocate_initrd() + * so it is safe to PAGE_ALIGN(end) here and free that partial page. + */ + free_init_pages("initrd memory", start, PAGE_ALIGN(end)); } #endif diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 5cb3f0f54f4..bca79091b9d 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -25,11 +25,11 @@ #include <linux/pfn.h> #include <linux/poison.h> #include <linux/bootmem.h> -#include <linux/slab.h> #include <linux/proc_fs.h> #include <linux/memory_hotplug.h> #include <linux/initrd.h> #include <linux/cpumask.h> +#include <linux/gfp.h> #include <asm/asm.h> #include <asm/bios_ebda.h> diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index e9b040e1cde..ee41bba315d 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -29,6 +29,7 @@ #include <linux/module.h> #include <linux/memory_hotplug.h> #include <linux/nmi.h> +#include <linux/gfp.h> #include <asm/processor.h> #include <asm/bios_ebda.h> diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index 536fb682336..5d0e67fff1a 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -21,6 +21,7 @@ #include
<linux/kdebug.h> #include <linux/mutex.h> #include <linux/io.h> +#include <linux/slab.h> #include <asm/cacheflush.h> #include <asm/tlbflush.h> #include <linux/errno.h> diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index 34a3291ca10..3adff7dcc14 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -26,6 +26,7 @@ #include <linux/module.h> #include <linux/debugfs.h> +#include <linux/slab.h> #include <linux/uaccess.h> #include <linux/io.h> #include <linux/version.h> diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 1d4eb93d333..28195c350b9 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -6,13 +6,13 @@ #include <linux/bootmem.h> #include <linux/module.h> #include <linux/sched.h> -#include <linux/slab.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/seq_file.h> #include <linux/debugfs.h> #include <linux/pfn.h> #include <linux/percpu.h> +#include <linux/gfp.h> #include <asm/e820.h> #include <asm/processor.h> @@ -291,8 +291,29 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, */ if (kernel_set_to_readonly && within(address, (unsigned long)_text, - (unsigned long)__end_rodata_hpage_align)) - pgprot_val(forbidden) |= _PAGE_RW; + (unsigned long)__end_rodata_hpage_align)) { + unsigned int level; + + /* + * Don't enforce the !RW mapping for the kernel text mapping, + * if the current mapping is already using small page mapping. + * No need to work hard to preserve large page mappings in this + * case. + * + * This also fixes the Linux Xen paravirt guest boot failure + * (because of unexpected read-only mappings for kernel identity + * mappings). In this paravirt guest case, the kernel text + * mapping and the kernel identity mapping share the same + * page-table pages. Thus we can't really use different + * protections for the kernel text and identity mappings. Also, + * these shared mappings are made of small page mappings. + * Thus this "don't enforce !RW mapping for small page kernel + * text mappings" logic also helps the Linux Xen paravirt + * guest boot.
+ */ + if (lookup_address(address, &level) && (level != PG_LEVEL_4K)) + pgprot_val(forbidden) |= _PAGE_RW; + } #endif prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden)); diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index ae9648eb1c7..edc8b95afc1 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -12,7 +12,7 @@ #include <linux/debugfs.h> #include <linux/kernel.h> #include <linux/module.h> -#include <linux/gfp.h> +#include <linux/slab.h> #include <linux/mm.h> #include <linux/fs.h> #include <linux/rbtree.h> diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index c9ba9deafe8..5c4ee422590 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -1,4 +1,5 @@ #include <linux/mm.h> +#include <linux/gfp.h> #include <asm/pgalloc.h> #include <asm/pgtable.h> #include <asm/tlb.h> diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index 46c8834aedc..1a8faf09afe 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -6,7 +6,6 @@ #include <linux/swap.h> #include <linux/smp.h> #include <linux/highmem.h> -#include <linux/slab.h> #include <linux/pagemap.h> #include <linux/spinlock.h> #include <linux/module.h> diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 6a58256dce9..090cbbec7db 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -46,17 +46,6 @@ static unsigned long reset_value[NUM_VIRT_COUNTERS]; -/* IbsFetchCtl bits/masks */ -#define IBS_FETCH_RAND_EN (1ULL<<57) -#define IBS_FETCH_VAL (1ULL<<49) -#define IBS_FETCH_ENABLE (1ULL<<48) -#define IBS_FETCH_CNT_MASK 0xFFFF0000ULL - -/* IbsOpCtl bits */ -#define IBS_OP_CNT_CTL (1ULL<<19) -#define IBS_OP_VAL (1ULL<<18) -#define IBS_OP_ENABLE (1ULL<<17) - #define IBS_FETCH_SIZE 6 #define IBS_OP_SIZE 12 @@ -182,7 +171,7 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, continue; } rdmsrl(msrs->controls[i].addr, val); - if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) + if (val & ARCH_PERFMON_EVENTSEL_ENABLE) op_x86_warn_in_use(i); val &= model->reserved; wrmsrl(msrs->controls[i].addr, val); @@ -290,7 +279,7 @@ op_amd_handle_ibs(struct pt_regs * const regs, oprofile_write_commit(&entry); /* reenable the IRQ */ - ctl &= ~(IBS_FETCH_VAL | IBS_FETCH_CNT_MASK); + ctl &= ~(IBS_FETCH_VAL | IBS_FETCH_CNT); ctl |= IBS_FETCH_ENABLE; wrmsrl(MSR_AMD64_IBSFETCHCTL, ctl); } @@ -330,7 +319,7 @@ static inline void op_amd_start_ibs(void) return; if (ibs_config.fetch_enabled) { - val = (ibs_config.max_cnt_fetch >> 4) & 0xFFFF; + val = (ibs_config.max_cnt_fetch >> 4) & IBS_FETCH_MAX_CNT; val |= ibs_config.rand_en ? IBS_FETCH_RAND_EN : 0; val |= IBS_FETCH_ENABLE; wrmsrl(MSR_AMD64_IBSFETCHCTL, val); @@ -352,7 +341,7 @@ static inline void op_amd_start_ibs(void) * avoid underflows. 
*/ ibs_op_ctl = min(ibs_op_ctl + IBS_RANDOM_MAXCNT_OFFSET, - 0xFFFFULL); + IBS_OP_MAX_CNT); } if (ibs_caps & IBS_CAPS_OPCNT && ibs_config.dispatched_ops) ibs_op_ctl |= IBS_OP_CNT_CTL; @@ -409,7 +398,7 @@ static void op_amd_start(struct op_msrs const * const msrs) if (!reset_value[op_x86_phys_to_virt(i)]) continue; rdmsrl(msrs->controls[i].addr, val); - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + val |= ARCH_PERFMON_EVENTSEL_ENABLE; wrmsrl(msrs->controls[i].addr, val); } @@ -429,7 +418,7 @@ static void op_amd_stop(struct op_msrs const * const msrs) if (!reset_value[op_x86_phys_to_virt(i)]) continue; rdmsrl(msrs->controls[i].addr, val); - val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; + val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; wrmsrl(msrs->controls[i].addr, val); } diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index 5d1727ba409..2bf90fafa7b 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -88,7 +88,7 @@ static void ppro_setup_ctrs(struct op_x86_model_spec const *model, continue; } rdmsrl(msrs->controls[i].addr, val); - if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) + if (val & ARCH_PERFMON_EVENTSEL_ENABLE) op_x86_warn_in_use(i); val &= model->reserved; wrmsrl(msrs->controls[i].addr, val); @@ -166,7 +166,7 @@ static void ppro_start(struct op_msrs const * const msrs) for (i = 0; i < num_counters; ++i) { if (reset_value[i]) { rdmsrl(msrs->controls[i].addr, val); - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + val |= ARCH_PERFMON_EVENTSEL_ENABLE; wrmsrl(msrs->controls[i].addr, val); } } @@ -184,7 +184,7 @@ static void ppro_stop(struct op_msrs const * const msrs) if (!reset_value[i]) continue; rdmsrl(msrs->controls[i].addr, val); - val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; + val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; wrmsrl(msrs->controls[i].addr, val); } } diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 6e22454bfaa..c7b1ebfb7da 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -3,6 +3,7 @@ #include <linux/init.h> #include <linux/irq.h> #include <linux/dmi.h> +#include <linux/slab.h> #include <asm/numa.h> #include <asm/pci_x86.h> @@ -122,8 +123,8 @@ setup_resource(struct acpi_resource *acpi_res, void *data) struct acpi_resource_address64 addr; acpi_status status; unsigned long flags; - struct resource *root; - u64 start, end; + struct resource *root, *conflict; + u64 start, end, max_len; status = resource_to_addr(acpi_res, &addr); if (!ACPI_SUCCESS(status)) @@ -140,6 +141,17 @@ setup_resource(struct acpi_resource *acpi_res, void *data) } else return AE_OK; + max_len = addr.maximum - addr.minimum + 1; + if (addr.address_length > max_len) { + dev_printk(KERN_DEBUG, &info->bridge->dev, + "host bridge window length %#llx doesn't fit in " + "%#llx-%#llx, trimming\n", + (unsigned long long) addr.address_length, + (unsigned long long) addr.minimum, + (unsigned long long) addr.maximum); + addr.address_length = max_len; + } + start = addr.minimum + addr.translation_offset; end = start + addr.address_length - 1; @@ -157,9 +169,12 @@ setup_resource(struct acpi_resource *acpi_res, void *data) return AE_OK; } - if (insert_resource(root, res)) { + conflict = insert_resource_conflict(root, res); + if (conflict) { dev_err(&info->bridge->dev, - "can't allocate host bridge window %pR\n", res); + "address space collision: host bridge window %pR " + "conflicts with %s %pR\n", + res, conflict->name, conflict); } else { pci_bus_add_resource(info->bus, res, 0); info->res_num++; diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 
294e10cb11e..cf2e93869c4 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -9,6 +9,7 @@ #include <linux/ioport.h> #include <linux/init.h> #include <linux/dmi.h> +#include <linux/slab.h> #include <asm/acpi.h> #include <asm/segment.h> diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index dece3eb9c90..46fd43f7910 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -127,9 +127,6 @@ static void __init pcibios_allocate_bus_resources(struct list_head *bus_list) continue; if (!r->start || pci_claim_resource(dev, idx) < 0) { - dev_info(&dev->dev, - "can't reserve window %pR\n", - r); /* * Something is wrong with the region. * Invalidate the resource to prevent @@ -181,8 +178,6 @@ static void __init pcibios_allocate_resources(int pass) "BAR %d: reserving %pr (d=%d, p=%d)\n", idx, r, disabled, pass); if (pci_claim_resource(dev, idx) < 0) { - dev_info(&dev->dev, - "can't reserve %pR\n", r); /* We'll assign a new address later */ r->end -= r->start; r->start = 0; diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index 8b107521d24..5d362b5ba06 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -8,7 +8,6 @@ #include <linux/kernel.h> #include <linux/pci.h> #include <linux/init.h> -#include <linux/slab.h> #include <linux/interrupt.h> #include <linux/dmi.h> #include <linux/io.h> diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 8f3f9a50b1e..39b9ebe8f88 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -16,6 +16,7 @@ #include <linux/sfi_acpi.h> #include <linux/bitmap.h> #include <linux/dmi.h> +#include <linux/slab.h> #include <asm/e820.h> #include <asm/pci_x86.h> #include <asm/acpi.h> diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c index 1c975cc9839..59a225c17b8 100644 --- a/arch/x86/pci/pcbios.c +++ b/arch/x86/pci/pcbios.c @@ -4,6 +4,7 @@ #include <linux/pci.h> #include <linux/init.h> +#include <linux/slab.h> #include <linux/module.h> #include <linux/uaccess.h> #include <asm/pci_x86.h> diff --git a/arch/x86/power/hibernate_32.c b/arch/x86/power/hibernate_32.c index 81197c62d5b..3769079874d 100644 --- a/arch/x86/power/hibernate_32.c +++ b/arch/x86/power/hibernate_32.c @@ -6,6 +6,7 @@ * Copyright (c) 2006 Rafael J. 
Wysocki <rjw@sisk.pl> */ +#include <linux/gfp.h> #include <linux/suspend.h> #include <linux/bootmem.h> diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index 65fdc86e923..d24f983ba1e 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -8,6 +8,7 @@ * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org> */ +#include <linux/gfp.h> #include <linux/smp.h> #include <linux/suspend.h> #include <asm/proto.h> diff --git a/arch/x86/power/hibernate_asm_32.S b/arch/x86/power/hibernate_asm_32.S index b641388d828..ad47daeafa4 100644 --- a/arch/x86/power/hibernate_asm_32.S +++ b/arch/x86/power/hibernate_asm_32.S @@ -27,10 +27,17 @@ ENTRY(swsusp_arch_suspend) ret ENTRY(restore_image) + movl mmu_cr4_features, %ecx movl resume_pg_dir, %eax subl $__PAGE_OFFSET, %eax movl %eax, %cr3 + jecxz 1f # cr4 Pentium and higher, skip if zero + andl $~(X86_CR4_PGE), %ecx + movl %ecx, %cr4; # turn off PGE + movl %cr3, %eax; # flush TLB + movl %eax, %cr3 +1: movl restore_pblist, %edx .p2align 4,,7 @@ -54,16 +61,8 @@ done: movl $swapper_pg_dir, %eax subl $__PAGE_OFFSET, %eax movl %eax, %cr3 - /* Flush TLB, including "global" things (vmalloc) */ movl mmu_cr4_features, %ecx jecxz 1f # cr4 Pentium and higher, skip if zero - movl %ecx, %edx - andl $~(X86_CR4_PGE), %edx - movl %edx, %cr4; # turn off PGE -1: - movl %cr3, %eax; # flush TLB - movl %eax, %cr3 - jecxz 1f # cr4 Pentium and higher, skip if zero movl %ecx, %cr4; # turn PGE back on 1: diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index 21e1aeb9f3e..ac74869b814 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -6,6 +6,7 @@ #include <linux/mm.h> #include <linux/err.h> #include <linux/sched.h> +#include <linux/slab.h> #include <linux/init.h> #include <linux/random.h> #include <linux/elf.h> diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c index e133ce25e29..1304bcec8ee 100644 --- a/arch/x86/xen/debugfs.c +++ b/arch/x86/xen/debugfs.c @@ -1,5 +1,6 @@ #include <linux/init.h> #include <linux/debugfs.h> +#include <linux/slab.h> #include <linux/module.h> #include "debugfs.h" diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index b607239c1ba..65d8d79b46a 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -28,6 +28,7 @@ #include <linux/highmem.h> #include <linux/console.h> #include <linux/pci.h> +#include <linux/gfp.h> #include <xen/xen.h> #include <xen/interface/xen.h> diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index f9eb7de74f4..914f04695ce 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -43,6 +43,7 @@ #include <linux/debugfs.h> #include <linux/bug.h> #include <linux/module.h> +#include <linux/gfp.h> #include <asm/pgtable.h> #include <asm/tlbflush.h> diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index deafb65ef44..a29693fd313 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -14,6 +14,7 @@ */ #include <linux/sched.h> #include <linux/err.h> +#include <linux/slab.h> #include <linux/smp.h> #include <asm/paravirt.h> diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 24ded31b5ae..e0500646585 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -6,6 +6,7 @@ #include <linux/spinlock.h> #include <linux/debugfs.h> #include <linux/log2.h> +#include <linux/gfp.h> #include <asm/paravirt.h> diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 0d3f07cd1b5..32764b8880b 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -13,6 +13,7 @@ #include <linux/clockchips.h> #include 
<linux/kernel_stat.h> #include <linux/math64.h> +#include <linux/gfp.h> #include <asm/pvclock.h> #include <asm/xen/hypervisor.h>
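
Note on the arch/x86/pci/acpi.c hunk above: the added code clamps a host bridge window's reported length so the window cannot overrun its own maximum address before the resource is inserted. The following is a minimal, self-contained user-space sketch of that clamp, not kernel code; the helper name clamp_window_length() and the sample values are illustrative only, and in the kernel the minimum/maximum/length fields come from a struct acpi_resource_address64.

/*
 * Illustrative sketch of the window-length trimming added to
 * setup_resource() in arch/x86/pci/acpi.c.  Helper name and values
 * are hypothetical; only the arithmetic mirrors the kernel change.
 */
#include <stdio.h>
#include <stdint.h>

/*
 * Clamp a window length so that [minimum, minimum + length - 1]
 * stays inside [minimum, maximum], reporting when trimming occurs.
 */
static uint64_t clamp_window_length(uint64_t minimum, uint64_t maximum,
				    uint64_t length)
{
	uint64_t max_len = maximum - minimum + 1;

	if (length > max_len) {
		printf("window length %#llx doesn't fit in %#llx-%#llx, trimming\n",
		       (unsigned long long)length,
		       (unsigned long long)minimum,
		       (unsigned long long)maximum);
		length = max_len;
	}
	return length;
}

int main(void)
{
	/* A window whose reported length overruns its maximum address. */
	uint64_t start = 0xc0000000ULL;
	uint64_t end   = 0xdfffffffULL;
	uint64_t len   = 0x40000000ULL;	/* larger than end - start + 1 */

	len = clamp_window_length(start, end, len);
	printf("resource: %#llx-%#llx\n",
	       (unsigned long long)start,
	       (unsigned long long)(start + len - 1));
	return 0;
}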