aboutsummaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig33
-rw-r--r--arch/x86/Kconfig.debug20
-rw-r--r--arch/x86/boot/a20.c3
-rw-r--r--arch/x86/boot/printf.c2
-rw-r--r--arch/x86/kernel/Makefile1
-rw-r--r--arch/x86/kernel/acpi/boot.c16
-rw-r--r--arch/x86/kernel/acpi/realmode/wakeup.S38
-rw-r--r--arch/x86/kernel/acpi/realmode/wakeup.h5
-rw-r--r--arch/x86/kernel/acpi/sleep.c16
-rw-r--r--arch/x86/kernel/apic_64.c2
-rw-r--r--arch/x86/kernel/cpu/cpufreq/longrun.c2
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.c15
-rw-r--r--arch/x86/kernel/efi_32.c8
-rw-r--r--arch/x86/kernel/entry_32.S1
-rw-r--r--arch/x86/kernel/geode_32.c5
-rw-r--r--arch/x86/kernel/head_32.S2
-rw-r--r--arch/x86/kernel/head_64.S2
-rw-r--r--arch/x86/kernel/i387.c48
-rw-r--r--arch/x86/kernel/io_apic_32.c12
-rw-r--r--arch/x86/kernel/kvmclock.c93
-rw-r--r--arch/x86/kernel/mfgpt_32.c2
-rw-r--r--arch/x86/kernel/nmi_32.c9
-rw-r--r--arch/x86/kernel/pci-dma.c14
-rw-r--r--arch/x86/kernel/pci-gart_64.c31
-rw-r--r--arch/x86/kernel/process_32.c6
-rw-r--r--arch/x86/kernel/process_64.c6
-rw-r--r--arch/x86/kernel/pvclock.c141
-rw-r--r--arch/x86/kernel/rtc.c34
-rw-r--r--arch/x86/kernel/setup_32.c10
-rw-r--r--arch/x86/kernel/smpboot.c6
-rw-r--r--arch/x86/kernel/traps_32.c1
-rw-r--r--arch/x86/kernel/tsc_32.c23
-rw-r--r--arch/x86/kernel/tsc_64.c5
-rw-r--r--arch/x86/kvm/i8254.c23
-rw-r--r--arch/x86/kvm/irq.c6
-rw-r--r--arch/x86/kvm/irq.h2
-rw-r--r--arch/x86/kvm/lapic.c1
-rw-r--r--arch/x86/kvm/mmu.c24
-rw-r--r--arch/x86/kvm/paging_tmpl.h2
-rw-r--r--arch/x86/kvm/svm.c2
-rw-r--r--arch/x86/kvm/vmx.c22
-rw-r--r--arch/x86/kvm/x86.c93
-rw-r--r--arch/x86/kvm/x86_emulate.c3
-rw-r--r--arch/x86/lguest/boot.c5
-rw-r--r--arch/x86/lib/copy_user_64.S25
-rw-r--r--arch/x86/lib/copy_user_nocache_64.S25
-rw-r--r--arch/x86/lib/delay_32.c31
-rw-r--r--arch/x86/lib/delay_64.c30
-rw-r--r--arch/x86/math-emu/fpu_entry.c13
-rw-r--r--arch/x86/mm/fault.c5
-rw-r--r--arch/x86/mm/init_64.c13
-rw-r--r--arch/x86/mm/ioremap.c5
-rw-r--r--arch/x86/mm/pat.c51
-rw-r--r--arch/x86/mm/srat_64.c27
-rw-r--r--arch/x86/pci/common.c8
-rw-r--r--arch/x86/pci/init.c3
-rw-r--r--arch/x86/pci/irq.c7
-rw-r--r--arch/x86/pci/olpc.c5
-rw-r--r--arch/x86/pci/pci.h2
-rw-r--r--arch/x86/vdso/vclock_gettime.c6
-rw-r--r--arch/x86/xen/Kconfig3
-rw-r--r--arch/x86/xen/enlighten.c56
-rw-r--r--arch/x86/xen/mmu.c77
-rw-r--r--arch/x86/xen/mmu.h24
-rw-r--r--arch/x86/xen/time.c145
-rw-r--r--arch/x86/xen/xen-head.S6
66 files changed, 802 insertions, 560 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index fe361ae7ef2..bf07b6f50fa 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -26,17 +26,10 @@ config X86
select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
select HAVE_ARCH_KGDB if !X86_VOYAGER
-config DEFCONFIG_LIST
+config ARCH_DEFCONFIG
string
- depends on X86_32
- option defconfig_list
- default "arch/x86/configs/i386_defconfig"
-
-config DEFCONFIG_LIST
- string
- depends on X86_64
- option defconfig_list
- default "arch/x86/configs/x86_64_defconfig"
+ default "arch/x86/configs/i386_defconfig" if X86_32
+ default "arch/x86/configs/x86_64_defconfig" if X86_64
config GENERIC_LOCKBREAK
@@ -390,6 +383,7 @@ config VMI
config KVM_CLOCK
bool "KVM paravirtualized clock"
select PARAVIRT
+ select PARAVIRT_CLOCK
depends on !(X86_VISWS || X86_VOYAGER)
help
Turning on this option will allow you to run a paravirtualized clock
@@ -417,6 +411,10 @@ config PARAVIRT
over full virtualization. However, when run without a hypervisor
the kernel is theoretically slower and slightly larger.
+config PARAVIRT_CLOCK
+ bool
+ default n
+
endif
config MEMTEST_BOOTPARAM
@@ -968,8 +966,8 @@ config NUMA_EMU
number of nodes. This is only useful for debugging.
config NODES_SHIFT
- int "Max num nodes shift(1-15)"
- range 1 15 if X86_64
+ int "Max num nodes shift(1-9)"
+ range 1 9 if X86_64
default "6" if X86_64
default "4" if X86_NUMAQ
default "3"
@@ -1515,13 +1513,13 @@ config PCI_GOMMCONFIG
config PCI_GODIRECT
bool "Direct"
-config PCI_GOANY
- bool "Any"
-
config PCI_GOOLPC
bool "OLPC"
depends on OLPC
+config PCI_GOANY
+ bool "Any"
+
endchoice
config PCI_BIOS
@@ -1538,9 +1536,8 @@ config PCI_MMCONFIG
depends on X86_32 && PCI && ACPI && (PCI_GOMMCONFIG || PCI_GOANY)
config PCI_OLPC
- bool
- depends on PCI && PCI_GOOLPC
- default y
+ def_bool y
+ depends on PCI && OLPC && (PCI_GOOLPC || PCI_GOANY)
config PCI_DOMAINS
def_bool y
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index ac1e31ba479..18363374d51 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -6,15 +6,19 @@ config TRACE_IRQFLAGS_SUPPORT
source "lib/Kconfig.debug"
config NONPROMISC_DEVMEM
- bool "Disable promiscuous /dev/mem"
+ bool "Filter access to /dev/mem"
help
- The /dev/mem file by default only allows userspace access to PCI
- space and the BIOS code and data regions. This is sufficient for
- dosemu and X and all common users of /dev/mem. With this config
- option, you allow userspace access to all of memory, including
- kernel and userspace memory. Accidental access to this is
- obviously disasterous, but specific access can be used by people
- debugging the kernel.
+ If this option is left off, you allow userspace access to all
+ of memory, including kernel and userspace memory. Accidental
+ access to this is obviously disastrous, but specific access can
+ be used by people debugging the kernel.
+
+ If this option is switched on, the /dev/mem file only allows
+ userspace access to PCI space and the BIOS code and data regions.
+ This is sufficient for dosemu and X and all common users of
+ /dev/mem.
+
+ If in doubt, say Y.
config EARLY_PRINTK
bool "Early printk" if EMBEDDED
diff --git a/arch/x86/boot/a20.c b/arch/x86/boot/a20.c
index 90943f83e84..e01aafd03bd 100644
--- a/arch/x86/boot/a20.c
+++ b/arch/x86/boot/a20.c
@@ -115,8 +115,6 @@ static void enable_a20_fast(void)
int enable_a20(void)
{
- int loops = A20_ENABLE_LOOPS;
-
#if defined(CONFIG_X86_ELAN)
/* Elan croaks if we try to touch the KBC */
enable_a20_fast();
@@ -128,6 +126,7 @@ int enable_a20(void)
enable_a20_kbc();
return 0;
#else
+ int loops = A20_ENABLE_LOOPS;
while (loops--) {
/* First, check to see if A20 is already enabled
(legacy free, etc.) */
diff --git a/arch/x86/boot/printf.c b/arch/x86/boot/printf.c
index c1d00c0274c..50e47cdbddd 100644
--- a/arch/x86/boot/printf.c
+++ b/arch/x86/boot/printf.c
@@ -56,7 +56,7 @@ static char *number(char *str, long num, int base, int size, int precision,
if (type & LEFT)
type &= ~ZEROPAD;
if (base < 2 || base > 36)
- return 0;
+ return NULL;
c = (type & ZEROPAD) ? '0' : ' ';
sign = 0;
if (type & SIGN) {
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 5e618c3b472..77807d4769c 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -82,6 +82,7 @@ obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o
obj-$(CONFIG_KVM_GUEST) += kvm.o
obj-$(CONFIG_KVM_CLOCK) += kvmclock.o
obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
+obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index c49ebcc6c41..33c5216fd3e 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -242,12 +242,19 @@ static int __init acpi_parse_madt(struct acpi_table_header *table)
static void __cpuinit acpi_register_lapic(int id, u8 enabled)
{
+ unsigned int ver = 0;
+
if (!enabled) {
++disabled_cpus;
return;
}
- generic_processor_info(id, 0);
+#ifdef CONFIG_X86_32
+ if (boot_cpu_physical_apicid != -1U)
+ ver = apic_version[boot_cpu_physical_apicid];
+#endif
+
+ generic_processor_info(id, ver);
}
static int __init
@@ -767,8 +774,13 @@ static void __init acpi_register_lapic_address(unsigned long address)
mp_lapic_addr = address;
set_fixmap_nocache(FIX_APIC_BASE, address);
- if (boot_cpu_physical_apicid == -1U)
+ if (boot_cpu_physical_apicid == -1U) {
boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id());
+#ifdef CONFIG_X86_32
+ apic_version[boot_cpu_physical_apicid] =
+ GET_APIC_VERSION(apic_read(APIC_LVR));
+#endif
+ }
}
static int __init early_acpi_parse_madt_lapic_addr_ovr(void)
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.S b/arch/x86/kernel/acpi/realmode/wakeup.S
index f9b77fb37e5..3355973b12a 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.S
+++ b/arch/x86/kernel/acpi/realmode/wakeup.S
@@ -5,6 +5,7 @@
#include <asm/msr-index.h>
#include <asm/page.h>
#include <asm/pgtable.h>
+#include <asm/processor-flags.h>
.code16
.section ".header", "a"
@@ -24,6 +25,11 @@ pmode_gdt: .quad 0
realmode_flags: .long 0
real_magic: .long 0
trampoline_segment: .word 0
+_pad1: .byte 0
+wakeup_jmp: .byte 0xea /* ljmpw */
+wakeup_jmp_off: .word 3f
+wakeup_jmp_seg: .word 0
+wakeup_gdt: .quad 0, 0, 0
signature: .long 0x51ee1111
.text
@@ -34,11 +40,34 @@ _start:
cli
cld
+ /* Apparently some dimwit BIOS programmers don't know how to
+ program a PM to RM transition, and we might end up here with
+ junk in the data segment descriptor registers. The only way
+ to repair that is to go into PM and fix it ourselves... */
+ movw $16, %cx
+ lgdtl %cs:wakeup_gdt
+ movl %cr0, %eax
+ orb $X86_CR0_PE, %al
+ movl %eax, %cr0
+ jmp 1f
+1: ljmpw $8, $2f
+2:
+ movw %cx, %ds
+ movw %cx, %es
+ movw %cx, %ss
+ movw %cx, %fs
+ movw %cx, %gs
+
+ andb $~X86_CR0_PE, %al
+ movl %eax, %cr0
+ jmp wakeup_jmp
+3:
/* Set up segments */
movw %cs, %ax
movw %ax, %ds
movw %ax, %es
movw %ax, %ss
+ lidtl wakeup_idt
movl $wakeup_stack_end, %esp
@@ -98,7 +127,14 @@ bogus_real_magic:
jmp 1b
.data
- .balign 4
+ .balign 8
+
+ /* This is the standard real-mode IDT */
+wakeup_idt:
+ .word 0xffff /* limit */
+ .long 0 /* address */
+ .word 0
+
.globl HEAP, heap_end
HEAP:
.long wakeup_heap
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.h b/arch/x86/kernel/acpi/realmode/wakeup.h
index ef8166fe802..69d38d0b2b6 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.h
+++ b/arch/x86/kernel/acpi/realmode/wakeup.h
@@ -24,6 +24,11 @@ struct wakeup_header {
u32 realmode_flags;
u32 real_magic;
u16 trampoline_segment; /* segment with trampoline code, 64-bit only */
+ u8 _pad1;
+ u8 wakeup_jmp;
+ u16 wakeup_jmp_off;
+ u16 wakeup_jmp_seg;
+ u64 wakeup_gdt[3];
u32 signature; /* To check we have correct structure */
} __attribute__((__packed__));
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index afc25ee9964..36af01f029e 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -50,6 +50,20 @@ int acpi_save_state_mem(void)
header->video_mode = saved_video_mode;
+ header->wakeup_jmp_seg = acpi_wakeup_address >> 4;
+ /* GDT[0]: GDT self-pointer */
+ header->wakeup_gdt[0] =
+ (u64)(sizeof(header->wakeup_gdt) - 1) +
+ ((u64)(acpi_wakeup_address +
+ ((char *)&header->wakeup_gdt - (char *)acpi_realmode))
+ << 16);
+ /* GDT[1]: real-mode-like code segment */
+ header->wakeup_gdt[1] = (0x009bULL << 40) +
+ ((u64)acpi_wakeup_address << 16) + 0xffff;
+ /* GDT[2]: real-mode-like data segment */
+ header->wakeup_gdt[2] = (0x0093ULL << 40) +
+ ((u64)acpi_wakeup_address << 16) + 0xffff;
+
#ifndef CONFIG_64BIT
store_gdt((struct desc_ptr *)&header->pmode_gdt);
@@ -111,7 +125,7 @@ void __init acpi_reserve_bootmem(void)
return;
}
- acpi_wakeup_address = acpi_realmode;
+ acpi_wakeup_address = virt_to_phys((void *)acpi_realmode);
}
diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c
index 5910020c3f2..0633cfd0dc2 100644
--- a/arch/x86/kernel/apic_64.c
+++ b/arch/x86/kernel/apic_64.c
@@ -534,7 +534,7 @@ int setup_profiling_timer(unsigned int multiplier)
*/
void clear_local_APIC(void)
{
- int maxlvt = lapic_get_maxlvt();
+ int maxlvt;
u32 v;
/* APIC hasn't been mapped yet */
diff --git a/arch/x86/kernel/cpu/cpufreq/longrun.c b/arch/x86/kernel/cpu/cpufreq/longrun.c
index af4a867a097..777a7ff075d 100644
--- a/arch/x86/kernel/cpu/cpufreq/longrun.c
+++ b/arch/x86/kernel/cpu/cpufreq/longrun.c
@@ -245,7 +245,7 @@ static unsigned int __init longrun_determine_freqs(unsigned int *low_freq,
if ((ecx > 95) || (ecx == 0) || (eax < ebx))
return -EIO;
- edx = (eax - ebx) / (100 - ecx);
+ edx = ((eax - ebx) * 100) / (100 - ecx);
*low_freq = edx * 1000; /* back to kHz */
dprintk("low frequency is %u kHz\n", *low_freq);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 06d6eea5e07..c45ca6d4dce 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -1127,12 +1127,23 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
* an UP version, and is deprecated by AMD.
*/
if (num_online_cpus() != 1) {
- printk(KERN_ERR PFX "MP systems not supported by PSB BIOS structure\n");
+#ifndef CONFIG_ACPI_PROCESSOR
+ printk(KERN_ERR PFX "ACPI Processor support is required "
+ "for SMP systems but is absent. Please load the "
+ "ACPI Processor module before starting this "
+ "driver.\n");
+#else
+ printk(KERN_ERR PFX "Your BIOS does not provide ACPI "
+ "_PSS objects in a way that Linux understands. "
+ "Please report this to the Linux ACPI maintainers"
+ " and complain to your BIOS vendor.\n");
+#endif
kfree(data);
return -ENODEV;
}
if (pol->cpu != 0) {
- printk(KERN_ERR PFX "No _PSS objects for CPU other than CPU0\n");
+ printk(KERN_ERR PFX "No ACPI _PSS objects for CPU other than "
+ "CPU0. Complain to your BIOS vendor.\n");
kfree(data);
return -ENODEV;
}
diff --git a/arch/x86/kernel/efi_32.c b/arch/x86/kernel/efi_32.c
index 5d23d85624d..4b63c8e1f13 100644
--- a/arch/x86/kernel/efi_32.c
+++ b/arch/x86/kernel/efi_32.c
@@ -49,13 +49,13 @@ void efi_call_phys_prelog(void)
local_irq_save(efi_rt_eflags);
/*
- * If I don't have PSE, I should just duplicate two entries in page
- * directory. If I have PSE, I just need to duplicate one entry in
+ * If I don't have PAE, I should just duplicate two entries in page
+ * directory. If I have PAE, I just need to duplicate one entry in
* page directory.
*/
cr4 = read_cr4();
- if (cr4 & X86_CR4_PSE) {
+ if (cr4 & X86_CR4_PAE) {
efi_bak_pg_dir_pointer[0].pgd =
swapper_pg_dir[pgd_index(0)].pgd;
swapper_pg_dir[0].pgd =
@@ -93,7 +93,7 @@ void efi_call_phys_epilog(void)
cr4 = read_cr4();
- if (cr4 & X86_CR4_PSE) {
+ if (cr4 & X86_CR4_PAE) {
swapper_pg_dir[pgd_index(0)].pgd =
efi_bak_pg_dir_pointer[0].pgd;
} else {
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 2a609dc3271..c778e4fa55a 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -248,6 +248,7 @@ ENTRY(resume_userspace)
DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
# setting need_resched or sigpending
# between sampling and the iret
+ TRACE_IRQS_OFF
movl TI_flags(%ebp), %ecx
andl $_TIF_WORK_MASK, %ecx # is there any work to be done on
# int/exception return?
diff --git a/arch/x86/kernel/geode_32.c b/arch/x86/kernel/geode_32.c
index e8edd63ab00..9b08e852fd1 100644
--- a/arch/x86/kernel/geode_32.c
+++ b/arch/x86/kernel/geode_32.c
@@ -166,6 +166,8 @@ int geode_has_vsa2(void)
static int has_vsa2 = -1;
if (has_vsa2 == -1) {
+ u16 val;
+
/*
* The VSA has virtual registers that we can query for a
* signature.
@@ -173,7 +175,8 @@ int geode_has_vsa2(void)
outw(VSA_VR_UNLOCK, VSA_VRC_INDEX);
outw(VSA_VR_SIGNATURE, VSA_VRC_INDEX);
- has_vsa2 = (inw(VSA_VRC_DATA) == VSA_SIG);
+ val = inw(VSA_VRC_DATA);
+ has_vsa2 = (val == AMD_VSA_SIG || val == GSW_VSA_SIG);
}
return has_vsa2;
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index b2cc73768a9..f7357cc0162 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -189,7 +189,7 @@ default_entry:
* this stage.
*/
-#define KPMDS ((0x100000000-__PAGE_OFFSET) >> 30) /* Number of kernel PMDs */
+#define KPMDS (((-__PAGE_OFFSET) >> 30) & 3) /* Number of kernel PMDs */
xorl %ebx,%ebx /* %ebx is kept at zero */
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 10a1955bb1d..b817974ef94 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -128,7 +128,7 @@ ident_complete:
/* Fixup phys_base */
addq %rbp, phys_base(%rip)
-#ifdef CONFIG_SMP
+#ifdef CONFIG_X86_TRAMPOLINE
addq %rbp, trampoline_level4_pgt + 0(%rip)
addq %rbp, trampoline_level4_pgt + (511*8)(%rip)
#endif
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index e03cc952f23..95e80e5033c 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -56,6 +56,11 @@ void __cpuinit mxcsr_feature_mask_init(void)
void __init init_thread_xstate(void)
{
+ if (!HAVE_HWFP) {
+ xstate_size = sizeof(struct i387_soft_struct);
+ return;
+ }
+
if (cpu_has_fxsr)
xstate_size = sizeof(struct i387_fxsave_struct);
#ifdef CONFIG_X86_32
@@ -94,7 +99,7 @@ void __cpuinit fpu_init(void)
int init_fpu(struct task_struct *tsk)
{
if (tsk_used_math(tsk)) {
- if (tsk == current)
+ if (HAVE_HWFP && tsk == current)
unlazy_fpu(tsk);
return 0;
}
@@ -109,6 +114,15 @@ int init_fpu(struct task_struct *tsk)
return -ENOMEM;
}
+#ifdef CONFIG_X86_32
+ if (!HAVE_HWFP) {
+ memset(tsk->thread.xstate, 0, xstate_size);
+ finit();
+ set_stopped_child_used_math(tsk);
+ return 0;
+ }
+#endif
+
if (cpu_has_fxsr) {
struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave;
@@ -148,7 +162,7 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset,
int ret;
if (!cpu_has_fxsr)
- return -ENODEV;
+ return -EIO;
ret = init_fpu(target);
if (ret)
@@ -165,7 +179,7 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
int ret;
if (!cpu_has_fxsr)
- return -ENODEV;
+ return -EIO;
ret = init_fpu(target);
if (ret)
@@ -330,13 +344,13 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset,
struct user_i387_ia32_struct env;
int ret;
- if (!HAVE_HWFP)
- return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf);
-
ret = init_fpu(target);
if (ret)
return ret;
+ if (!HAVE_HWFP)
+ return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf);
+
if (!cpu_has_fxsr) {
return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
&target->thread.xstate->fsave, 0,
@@ -360,15 +374,15 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
struct user_i387_ia32_struct env;
int ret;
- if (!HAVE_HWFP)
- return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf);
-
ret = init_fpu(target);
if (ret)
return ret;
set_stopped_child_used_math(target);
+ if (!HAVE_HWFP)
+ return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf);
+
if (!cpu_has_fxsr) {
return user_regset_copyin(&pos, &count, &kbuf, &ubuf,
&target->thread.xstate->fsave, 0, -1);
@@ -474,18 +488,18 @@ static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf)
int restore_i387_ia32(struct _fpstate_ia32 __user *buf)
{
int err;
+ struct task_struct *tsk = current;
- if (HAVE_HWFP) {
- struct task_struct *tsk = current;
-
+ if (HAVE_HWFP)
clear_fpu(tsk);
- if (!used_math()) {
- err = init_fpu(tsk);
- if (err)
- return err;
- }
+ if (!used_math()) {
+ err = init_fpu(tsk);
+ if (err)
+ return err;
+ }
+ if (HAVE_HWFP) {
if (cpu_has_fxsr)
err = restore_i387_fxsave(buf);
else
diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
index a40d54fc1fd..4dc8600d9d2 100644
--- a/arch/x86/kernel/io_apic_32.c
+++ b/arch/x86/kernel/io_apic_32.c
@@ -2130,14 +2130,10 @@ static inline void __init check_timer(void)
{
int apic1, pin1, apic2, pin2;
int vector;
- unsigned int ver;
unsigned long flags;
local_irq_save(flags);
- ver = apic_read(APIC_LVR);
- ver = GET_APIC_VERSION(ver);
-
/*
* get/set the timer IRQ vector:
*/
@@ -2150,15 +2146,11 @@ static inline void __init check_timer(void)
* mode for the 8259A whenever interrupts are routed
* through I/O APICs. Also IRQ0 has to be enabled in
* the 8259A which implies the virtual wire has to be
- * disabled in the local APIC. Finally timer interrupts
- * need to be acknowledged manually in the 8259A for
- * timer_interrupt() and for the i82489DX when using
- * the NMI watchdog.
+ * disabled in the local APIC.
*/
apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
init_8259A(1);
- timer_ack = !cpu_has_tsc;
- timer_ack |= (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
+ timer_ack = 1;
if (timer_over_8254 > 0)
enable_8259A_irq(0);
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 4bc1be5d547..87edf1ceb1d 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -18,6 +18,7 @@
#include <linux/clocksource.h>
#include <linux/kvm_para.h>
+#include <asm/pvclock.h>
#include <asm/arch_hooks.h>
#include <asm/msr.h>
#include <asm/apic.h>
@@ -36,83 +37,47 @@ static int parse_no_kvmclock(char *arg)
early_param("no-kvmclock", parse_no_kvmclock);
/* The hypervisor will put information about time periodically here */
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct kvm_vcpu_time_info, hv_clock);
-#define get_clock(cpu, field) per_cpu(hv_clock, cpu).field
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct pvclock_vcpu_time_info, hv_clock);
+static struct pvclock_wall_clock wall_clock;
-static inline u64 kvm_get_delta(u64 last_tsc)
-{
- int cpu = smp_processor_id();
- u64 delta = native_read_tsc() - last_tsc;
- return (delta * get_clock(cpu, tsc_to_system_mul)) >> KVM_SCALE;
-}
-
-static struct kvm_wall_clock wall_clock;
-static cycle_t kvm_clock_read(void);
/*
* The wallclock is the time of day when we booted. Since then, some time may
* have elapsed since the hypervisor wrote the data. So we try to account for
* that with system time
*/
-unsigned long kvm_get_wallclock(void)
+static unsigned long kvm_get_wallclock(void)
{
- u32 wc_sec, wc_nsec;
- u64 delta;
+ struct pvclock_vcpu_time_info *vcpu_time;
struct timespec ts;
- int version, nsec;
int low, high;
low = (int)__pa(&wall_clock);
high = ((u64)__pa(&wall_clock) >> 32);
+ native_write_msr(MSR_KVM_WALL_CLOCK, low, high);
- delta = kvm_clock_read();
+ vcpu_time = &get_cpu_var(hv_clock);
+ pvclock_read_wallclock(&wall_clock, vcpu_time, &ts);
+ put_cpu_var(hv_clock);
- native_write_msr(MSR_KVM_WALL_CLOCK, low, high);
- do {
- version = wall_clock.wc_version;
- rmb();
- wc_sec = wall_clock.wc_sec;
- wc_nsec = wall_clock.wc_nsec;
- rmb();
- } while ((wall_clock.wc_version != version) || (version & 1));
-
- delta = kvm_clock_read() - delta;
- delta += wc_nsec;
- nsec = do_div(delta, NSEC_PER_SEC);
- set_normalized_timespec(&ts, wc_sec + delta, nsec);
- /*
- * Of all mechanisms of time adjustment I've tested, this one
- * was the champion!
- */
- return ts.tv_sec + 1;
+ return ts.tv_sec;
}
-int kvm_set_wallclock(unsigned long now)
+static int kvm_set_wallclock(unsigned long now)
{
- return 0;
+ return -1;
}
-/*
- * This is our read_clock function. The host puts an tsc timestamp each time
- * it updates a new time. Without the tsc adjustment, we can have a situation
- * in which a vcpu starts to run earlier (smaller system_time), but probes
- * time later (compared to another vcpu), leading to backwards time
- */
static cycle_t kvm_clock_read(void)
{
- u64 last_tsc, now;
- int cpu;
+ struct pvclock_vcpu_time_info *src;
+ cycle_t ret;
- preempt_disable();
- cpu = smp_processor_id();
-
- last_tsc = get_clock(cpu, tsc_timestamp);
- now = get_clock(cpu, system_time);
-
- now += kvm_get_delta(last_tsc);
- preempt_enable();
-
- return now;
+ src = &get_cpu_var(hv_clock);
+ ret = pvclock_clocksource_read(src);
+ put_cpu_var(hv_clock);
+ return ret;
}
+
static struct clocksource kvm_clock = {
.name = "kvm-clock",
.read = kvm_clock_read,
@@ -123,13 +88,14 @@ static struct clocksource kvm_clock = {
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
-static int kvm_register_clock(void)
+static int kvm_register_clock(char *txt)
{
int cpu = smp_processor_id();
int low, high;
low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1;
high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32);
-
+ printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n",
+ cpu, high, low, txt);
return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high);
}
@@ -140,12 +106,20 @@ static void kvm_setup_secondary_clock(void)
* Now that the first cpu already had this clocksource initialized,
* we shouldn't fail.
*/
- WARN_ON(kvm_register_clock());
+ WARN_ON(kvm_register_clock("secondary cpu clock"));
/* ok, done with our trickery, call native */
setup_secondary_APIC_clock();
}
#endif
+#ifdef CONFIG_SMP
+void __init kvm_smp_prepare_boot_cpu(void)
+{
+ WARN_ON(kvm_register_clock("primary cpu clock"));
+ native_smp_prepare_boot_cpu();
+}
+#endif
+
/*
* After the clock is registered, the host will keep writing to the
* registered memory location. If the guest happens to shutdown, this memory
@@ -174,7 +148,7 @@ void __init kvmclock_init(void)
return;
if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) {
- if (kvm_register_clock())
+ if (kvm_register_clock("boot clock"))
return;
pv_time_ops.get_wallclock = kvm_get_wallclock;
pv_time_ops.set_wallclock = kvm_set_wallclock;
@@ -182,6 +156,9 @@ void __init kvmclock_init(void)
#ifdef CONFIG_X86_LOCAL_APIC
pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock;
#endif
+#ifdef CONFIG_SMP
+ smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
+#endif
machine_ops.shutdown = kvm_shutdown;
#ifdef CONFIG_KEXEC
machine_ops.crash_shutdown = kvm_crash_shutdown;
diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c
index 3cad17fe026..07c0f828f48 100644
--- a/arch/x86/kernel/mfgpt_32.c
+++ b/arch/x86/kernel/mfgpt_32.c
@@ -155,6 +155,7 @@ int geode_mfgpt_toggle_event(int timer, int cmp, int event, int enable)
wrmsr(msr, value, dummy);
return 0;
}
+EXPORT_SYMBOL_GPL(geode_mfgpt_toggle_event);
int geode_mfgpt_set_irq(int timer, int cmp, int irq, int enable)
{
@@ -222,6 +223,7 @@ int geode_mfgpt_alloc_timer(int timer, int domain)
/* No timers available - too bad */
return -1;
}
+EXPORT_SYMBOL_GPL(geode_mfgpt_alloc_timer);
#ifdef CONFIG_GEODE_MFGPT_TIMER
diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c
index 11b14bbaa61..84160f74eeb 100644
--- a/arch/x86/kernel/nmi_32.c
+++ b/arch/x86/kernel/nmi_32.c
@@ -26,7 +26,6 @@
#include <asm/smp.h>
#include <asm/nmi.h>
-#include <asm/timer.h>
#include "mach_traps.h"
@@ -82,7 +81,7 @@ int __init check_nmi_watchdog(void)
prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL);
if (!prev_nmi_count)
- goto error;
+ return -1;
printk(KERN_INFO "Testing NMI watchdog ... ");
@@ -119,7 +118,7 @@ int __init check_nmi_watchdog(void)
if (!atomic_read(&nmi_active)) {
kfree(prev_nmi_count);
atomic_set(&nmi_active, -1);
- goto error;
+ return -1;
}
printk("OK.\n");
@@ -130,10 +129,6 @@ int __init check_nmi_watchdog(void)
kfree(prev_nmi_count);
return 0;
-error:
- timer_ack = !cpu_has_tsc;
-
- return -1;
}
static int __init setup_nmi_watchdog(char *str)
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index c5ef1af8e79..dc00a1331ac 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -378,6 +378,7 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
struct page *page;
unsigned long dma_mask = 0;
dma_addr_t bus;
+ int noretry = 0;
/* ignore region specifiers */
gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
@@ -397,20 +398,25 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
if (dev->dma_mask == NULL)
return NULL;
- /* Don't invoke OOM killer */
- gfp |= __GFP_NORETRY;
+ /* Don't invoke OOM killer or retry in lower 16MB DMA zone */
+ if (gfp & __GFP_DMA)
+ noretry = 1;
#ifdef CONFIG_X86_64
/* Why <=? Even when the mask is smaller than 4GB it is often
larger than 16MB and in this case we have a chance of
finding fitting memory in the next higher zone first. If
not retry with true GFP_DMA. -AK */
- if (dma_mask <= DMA_32BIT_MASK && !(gfp & GFP_DMA))
+ if (dma_mask <= DMA_32BIT_MASK && !(gfp & GFP_DMA)) {
gfp |= GFP_DMA32;
+ if (dma_mask < DMA_32BIT_MASK)
+ noretry = 1;
+ }
#endif
again:
- page = dma_alloc_pages(dev, gfp, get_order(size));
+ page = dma_alloc_pages(dev,
+ noretry ? gfp | __GFP_NORETRY : gfp, get_order(size));
if (page == NULL)
return NULL;
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index c07455d1695..aa8ec928caa 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -26,6 +26,7 @@
#include <linux/kdebug.h>
#include <linux/scatterlist.h>
#include <linux/iommu-helper.h>
+#include <linux/sysdev.h>
#include <asm/atomic.h>
#include <asm/io.h>
#include <asm/mtrr.h>
@@ -548,6 +549,28 @@ static __init unsigned read_aperture(struct pci_dev *dev, u32 *size)
return aper_base;
}
+static int gart_resume(struct sys_device *dev)
+{
+ return 0;
+}
+
+static int gart_suspend(struct sys_device *dev, pm_message_t state)
+{
+ return -EINVAL;
+}
+
+static struct sysdev_class gart_sysdev_class = {
+ .name = "gart",
+ .suspend = gart_suspend,
+ .resume = gart_resume,
+
+};
+
+static struct sys_device device_gart = {
+ .id = 0,
+ .cls = &gart_sysdev_class,
+};
+
/*
* Private Northbridge GATT initialization in case we cannot use the
* AGP driver for some reason.
@@ -558,7 +581,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
unsigned aper_base, new_aper_base;
struct pci_dev *dev;
void *gatt;
- int i;
+ int i, error;
printk(KERN_INFO "PCI-DMA: Disabling AGP.\n");
aper_size = aper_base = info->aper_size = 0;
@@ -606,6 +629,12 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
pci_write_config_dword(dev, 0x90, ctl);
}
+
+ error = sysdev_class_register(&gart_sysdev_class);
+ if (!error)
+ error = sysdev_register(&device_gart);
+ if (error)
+ panic("Could not register gart_sysdev -- would corrupt data on next suspend");
flush_gart();
printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n",
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index f8476dfbb60..e2db9ac5c61 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -333,6 +333,7 @@ void flush_thread(void)
/*
* Forget coprocessor state..
*/
+ tsk->fpu_counter = 0;
clear_fpu(tsk);
clear_used_math();
}
@@ -649,8 +650,11 @@ struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct
/* If the task has used fpu the last 5 timeslices, just do a full
* restore of the math state immediately to avoid the trap; the
* chances of needing FPU soon are obviously high now
+ *
+ * tsk_used_math() checks prevent calling math_state_restore(),
+ * which can sleep in the case of !tsk_used_math()
*/
- if (next_p->fpu_counter > 5)
+ if (tsk_used_math(next_p) && next_p->fpu_counter > 5)
math_state_restore();
/*
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index e2319f39988..c6eb5c91e5f 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -294,6 +294,7 @@ void flush_thread(void)
/*
* Forget coprocessor state..
*/
+ tsk->fpu_counter = 0;
clear_fpu(tsk);
clear_used_math();
}
@@ -658,8 +659,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
/* If the task has used fpu the last 5 timeslices, just do a full
* restore of the math state immediately to avoid the trap; the
* chances of needing FPU soon are obviously high now
+ *
+ * tsk_used_math() checks prevent calling math_state_restore(),
+ * which can sleep in the case of !tsk_used_math()
*/
- if (next_p->fpu_counter>5)
+ if (tsk_used_math(next_p) && next_p->fpu_counter > 5)
math_state_restore();
return prev_p;
}
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
new file mode 100644
index 00000000000..05fbe9a0325
--- /dev/null
+++ b/arch/x86/kernel/pvclock.c
@@ -0,0 +1,141 @@
+/* paravirtual clock -- common code used by kvm/xen
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <asm/pvclock.h>
+
+/*
+ * These are perodically updated
+ * xen: magic shared_info page
+ * kvm: gpa registered via msr
+ * and then copied here.
+ */
+struct pvclock_shadow_time {
+ u64 tsc_timestamp; /* TSC at last update of time vals. */
+ u64 system_timestamp; /* Time, in nanosecs, since boot. */
+ u32 tsc_to_nsec_mul;
+ int tsc_shift;
+ u32 version;
+};
+
+/*
+ * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
+ * yielding a 64-bit result.
+ */
+static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
+{
+ u64 product;
+#ifdef __i386__
+ u32 tmp1, tmp2;
+#endif
+
+ if (shift < 0)
+ delta >>= -shift;
+ else
+ delta <<= shift;
+
+#ifdef __i386__
+ __asm__ (
+ "mul %5 ; "
+ "mov %4,%%eax ; "
+ "mov %%edx,%4 ; "
+ "mul %5 ; "
+ "xor %5,%5 ; "
+ "add %4,%%eax ; "
+ "adc %5,%%edx ; "
+ : "=A" (product), "=r" (tmp1), "=r" (tmp2)
+ : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
+#elif __x86_64__
+ __asm__ (
+ "mul %%rdx ; shrd $32,%%rdx,%%rax"
+ : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
+#else
+#error implement me!
+#endif
+
+ return product;
+}
+
+static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow)
+{
+ u64 delta = native_read_tsc() - shadow->tsc_timestamp;
+ return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
+}
+
+/*
+ * Reads a consistent set of time-base values from hypervisor,
+ * into a shadow data area.
+ */
+static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst,
+ struct pvclock_vcpu_time_info *src)
+{
+ do {
+ dst->version = src->version;
+ rmb(); /* fetch version before data */
+ dst->tsc_timestamp = src->tsc_timestamp;
+ dst->system_timestamp = src->system_time;
+ dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
+ dst->tsc_shift = src->tsc_shift;
+ rmb(); /* test version after fetching data */
+ } while ((src->version & 1) || (dst->version != src->version));
+
+ return dst->version;
+}
+
+cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
+{
+ struct pvclock_shadow_time shadow;
+ unsigned version;
+ cycle_t ret, offset;
+
+ do {
+ version = pvclock_get_time_values(&shadow, src);
+ barrier();
+ offset = pvclock_get_nsec_offset(&shadow);
+ ret = shadow.system_timestamp + offset;
+ barrier();
+ } while (version != src->version);
+
+ return ret;
+}
+
+void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
+ struct pvclock_vcpu_time_info *vcpu_time,
+ struct timespec *ts)
+{
+ u32 version;
+ u64 delta;
+ struct timespec now;
+
+ /* get wallclock at system boot */
+ do {
+ version = wall_clock->version;
+ rmb(); /* fetch version before time */
+ now.tv_sec = wall_clock->sec;
+ now.tv_nsec = wall_clock->nsec;
+ rmb(); /* fetch time before checking version */
+ } while ((wall_clock->version & 1) || (version != wall_clock->version));
+
+ delta = pvclock_clocksource_read(vcpu_time); /* time since system boot */
+ delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;
+
+ now.tv_nsec = do_div(delta, NSEC_PER_SEC);
+ now.tv_sec = delta;
+
+ set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
+}
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index 9615eee9b77..05191bbc68b 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -4,6 +4,8 @@
#include <linux/acpi.h>
#include <linux/bcd.h>
#include <linux/mc146818rtc.h>
+#include <linux/platform_device.h>
+#include <linux/pnp.h>
#include <asm/time.h>
#include <asm/vsyscall.h>
@@ -197,3 +199,35 @@ unsigned long long native_read_tsc(void)
}
EXPORT_SYMBOL(native_read_tsc);
+
+static struct resource rtc_resources[] = {
+ [0] = {
+ .start = RTC_PORT(0),
+ .end = RTC_PORT(1),
+ .flags = IORESOURCE_IO,
+ },
+ [1] = {
+ .start = RTC_IRQ,
+ .end = RTC_IRQ,
+ .flags = IORESOURCE_IRQ,
+ }
+};
+
+static struct platform_device rtc_device = {
+ .name = "rtc_cmos",
+ .id = -1,
+ .resource = rtc_resources,
+ .num_resources = ARRAY_SIZE(rtc_resources),
+};
+
+static __init int add_rtc_cmos(void)
+{
+#ifdef CONFIG_PNP
+ if (!pnp_platform_devices)
+ platform_device_register(&rtc_device);
+#else
+ platform_device_register(&rtc_device);
+#endif /* CONFIG_PNP */
+ return 0;
+}
+device_initcall(add_rtc_cmos);
diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index 2c5f8b213e8..5a2f8e06388 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -532,10 +532,16 @@ static void __init reserve_crashkernel(void)
(unsigned long)(crash_size >> 20),
(unsigned long)(crash_base >> 20),
(unsigned long)(total_mem >> 20));
+
+ if (reserve_bootmem(crash_base, crash_size,
+ BOOTMEM_EXCLUSIVE) < 0) {
+ printk(KERN_INFO "crashkernel reservation "
+ "failed - memory is in use\n");
+ return;
+ }
+
crashk_res.start = crash_base;
crashk_res.end = crash_base + crash_size - 1;
- reserve_bootmem(crash_base, crash_size,
- BOOTMEM_DEFAULT);
} else
printk(KERN_INFO "crashkernel reservation failed - "
"you have to specify a base address\n");
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index fff8ebaa554..197300bfebd 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -996,7 +996,6 @@ do_rest:
#endif
cpu_clear(cpu, cpu_callout_map); /* was set by do_boot_cpu() */
cpu_clear(cpu, cpu_initialized); /* was set by cpu_init() */
- cpu_clear(cpu, cpu_possible_map);
cpu_clear(cpu, cpu_present_map);
per_cpu(x86_cpu_to_apicid, cpu) = BAD_APICID;
}
@@ -1190,6 +1189,7 @@ static void __init smp_cpu_index_default(void)
*/
void __init native_smp_prepare_cpus(unsigned int max_cpus)
{
+ preempt_disable();
nmi_watchdog_default();
smp_cpu_index_default();
current_cpu_data = boot_cpu_data;
@@ -1206,7 +1206,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
if (smp_sanity_check(max_cpus) < 0) {
printk(KERN_INFO "SMP disabled\n");
disable_smp();
- return;
+ goto out;
}
preempt_disable();
@@ -1246,6 +1246,8 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
printk(KERN_INFO "CPU%d: ", 0);
print_cpu_info(&cpu_data(0));
setup_boot_clock();
+out:
+ preempt_enable();
}
/*
* Early setup to make printk work.
diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c
index bde6f63e15d..08d752de4ee 100644
--- a/arch/x86/kernel/traps_32.c
+++ b/arch/x86/kernel/traps_32.c
@@ -544,6 +544,7 @@ vm86_trap:
#define DO_ERROR(trapnr, signr, str, name) \
void do_##name(struct pt_regs *regs, long error_code) \
{ \
+ trace_hardirqs_fixup(); \
if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
== NOTIFY_STOP) \
return; \
diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c
index e4790728b22..65b70637ad9 100644
--- a/arch/x86/kernel/tsc_32.c
+++ b/arch/x86/kernel/tsc_32.c
@@ -14,7 +14,10 @@
#include "mach_timer.h"
-static int tsc_enabled;
+/* native_sched_clock() is called before tsc_init(), so
+ we must start with the TSC soft disabled to prevent
+ erroneous rdtsc usage on !cpu_has_tsc processors */
+static int tsc_disabled = -1;
/*
* On some systems the TSC frequency does not
@@ -28,8 +31,8 @@ EXPORT_SYMBOL_GPL(tsc_khz);
static int __init tsc_setup(char *str)
{
printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, "
- "cannot disable TSC completely.\n");
- mark_tsc_unstable("user disabled TSC");
+ "cannot disable TSC completely.\n");
+ tsc_disabled = 1;
return 1;
}
#else
@@ -120,7 +123,7 @@ unsigned long long native_sched_clock(void)
* very important for it to be as fast as the platform
* can achive it. )
*/
- if (unlikely(!tsc_enabled && !tsc_unstable))
+ if (unlikely(tsc_disabled))
/* No locking but a rare wrong value is not a big deal: */
return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
@@ -322,7 +325,6 @@ void mark_tsc_unstable(char *reason)
{
if (!tsc_unstable) {
tsc_unstable = 1;
- tsc_enabled = 0;
printk("Marking TSC unstable due to: %s.\n", reason);
/* Can be called before registration */
if (clocksource_tsc.mult)
@@ -336,7 +338,7 @@ EXPORT_SYMBOL_GPL(mark_tsc_unstable);
static int __init dmi_mark_tsc_unstable(const struct dmi_system_id *d)
{
printk(KERN_NOTICE "%s detected: marking TSC unstable.\n",
- d->ident);
+ d->ident);
tsc_unstable = 1;
return 0;
}
@@ -403,7 +405,7 @@ void __init tsc_init(void)
{
int cpu;
- if (!cpu_has_tsc)
+ if (!cpu_has_tsc || tsc_disabled > 0)
return;
cpu_khz = calculate_cpu_khz();
@@ -414,6 +416,9 @@ void __init tsc_init(void)
return;
}
+ /* now allow native_sched_clock() to use rdtsc */
+ tsc_disabled = 0;
+
printk("Detected %lu.%03lu MHz processor.\n",
(unsigned long)cpu_khz / 1000,
(unsigned long)cpu_khz % 1000);
@@ -441,8 +446,6 @@ void __init tsc_init(void)
if (check_tsc_unstable()) {
clocksource_tsc.rating = 0;
clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS;
- } else
- tsc_enabled = 1;
-
+ }
clocksource_register(&clocksource_tsc);
}
diff --git a/arch/x86/kernel/tsc_64.c b/arch/x86/kernel/tsc_64.c
index fcc16e58609..1784b8077a1 100644
--- a/arch/x86/kernel/tsc_64.c
+++ b/arch/x86/kernel/tsc_64.c
@@ -227,14 +227,14 @@ void __init tsc_calibrate(void)
/* hpet or pmtimer available ? */
if (!hpet && !pm1 && !pm2) {
printk(KERN_INFO "TSC calibrated against PIT\n");
- return;
+ goto out;
}
/* Check, whether the sampling was disturbed by an SMI */
if (tsc1 == ULONG_MAX || tsc2 == ULONG_MAX) {
printk(KERN_WARNING "TSC calibration disturbed by SMI, "
"using PIT calibration result\n");
- return;
+ goto out;
}
tsc2 = (tsc2 - tsc1) * 1000000L;
@@ -255,6 +255,7 @@ void __init tsc_calibrate(void)
tsc_khz = tsc2 / tsc1;
+out:
for_each_possible_cpu(cpu)
set_cyc2ns_scale(tsc_khz, cpu);
}
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 7c077a9d977..3829aa7b663 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -200,10 +200,12 @@ int __pit_timer_fn(struct kvm_kpit_state *ps)
atomic_inc(&pt->pending);
smp_mb__after_atomic_inc();
- /* FIXME: handle case where the guest is in guest mode */
- if (vcpu0 && waitqueue_active(&vcpu0->wq)) {
- vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE;
- wake_up_interruptible(&vcpu0->wq);
+ if (vcpu0) {
+ set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests);
+ if (waitqueue_active(&vcpu0->wq)) {
+ vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+ wake_up_interruptible(&vcpu0->wq);
+ }
}
pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period);
@@ -237,6 +239,19 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
return HRTIMER_NORESTART;
}
+void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pit *pit = vcpu->kvm->arch.vpit;
+ struct hrtimer *timer;
+
+ if (vcpu->vcpu_id != 0 || !pit)
+ return;
+
+ timer = &pit->pit_state.pit_timer.timer;
+ if (hrtimer_cancel(timer))
+ hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS);
+}
+
static void destroy_pit_timer(struct kvm_kpit_timer *pt)
{
pr_debug("pit: execute del timer!\n");
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index ce1f583459b..76d736b5f66 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -94,3 +94,9 @@ void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
/* TODO: PIT, RTC etc. */
}
EXPORT_SYMBOL_GPL(kvm_timer_intr_post);
+
+void __kvm_migrate_timers(struct kvm_vcpu *vcpu)
+{
+ __kvm_migrate_apic_timer(vcpu);
+ __kvm_migrate_pit_timer(vcpu);
+}
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 1802134b836..2a15be2275c 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -84,6 +84,8 @@ void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
+void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu);
+void __kvm_migrate_timers(struct kvm_vcpu *vcpu);
int pit_has_pending_timer(struct kvm_vcpu *vcpu);
int apic_has_pending_timer(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index c297c50eba6..ebc03f5ae16 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -940,6 +940,7 @@ static int __apic_timer_fn(struct kvm_lapic *apic)
wait_queue_head_t *q = &apic->vcpu->wq;
atomic_inc(&apic->timer.pending);
+ set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests);
if (waitqueue_active(q)) {
apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
wake_up_interruptible(q);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 36c5406b181..7e7c3969f7a 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -640,6 +640,7 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
rmap_remove(kvm, spte);
--kvm->stat.lpages;
set_shadow_pte(spte, shadow_trap_nonpresent_pte);
+ spte = NULL;
write_protected = 1;
}
spte = rmap_next(kvm, rmapp, spte);
@@ -658,7 +659,7 @@ static int is_empty_shadow_page(u64 *spt)
u64 *end;
for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
- if (*pos != shadow_trap_nonpresent_pte) {
+ if (is_shadow_present_pte(*pos)) {
printk(KERN_ERR "%s: %p %llx\n", __func__,
pos, *pos);
return 0;
@@ -1082,10 +1083,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
struct kvm_mmu_page *shadow;
spte |= PT_WRITABLE_MASK;
- if (user_fault) {
- mmu_unshadow(vcpu->kvm, gfn);
- goto unshadowed;
- }
shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
if (shadow ||
@@ -1102,8 +1099,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
}
}
-unshadowed:
-
if (pte_access & ACC_WRITE_MASK)
mark_page_dirty(vcpu->kvm, gfn);
@@ -1580,11 +1575,13 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
u64 *spte,
const void *new)
{
- if ((sp->role.level != PT_PAGE_TABLE_LEVEL)
- && !vcpu->arch.update_pte.largepage) {
- ++vcpu->kvm->stat.mmu_pde_zapped;
- return;
- }
+ if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
+ if (!vcpu->arch.update_pte.largepage ||
+ sp->role.glevels == PT32_ROOT_LEVEL) {
+ ++vcpu->kvm->stat.mmu_pde_zapped;
+ return;
+ }
+ }
++vcpu->kvm->stat.mmu_pte_updated;
if (sp->role.glevels == PT32_ROOT_LEVEL)
@@ -1858,6 +1855,7 @@ static void free_mmu_pages(struct kvm_vcpu *vcpu)
sp = container_of(vcpu->kvm->arch.active_mmu_pages.next,
struct kvm_mmu_page, link);
kvm_mmu_zap_page(vcpu->kvm, sp);
+ cond_resched();
}
free_page((unsigned long)vcpu->arch.mmu.pae_root);
}
@@ -1996,7 +1994,7 @@ static struct shrinker mmu_shrinker = {
.seeks = DEFAULT_SEEKS * 10,
};
-void mmu_destroy_caches(void)
+static void mmu_destroy_caches(void)
{
if (pte_chain_cache)
kmem_cache_destroy(pte_chain_cache);
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 156fe10288a..934c7b61939 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -418,7 +418,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
/* mmio */
if (is_error_pfn(pfn)) {
- pgprintk("gfn %x is mmio\n", walker.gfn);
+ pgprintk("gfn %lx is mmio\n", walker.gfn);
kvm_release_pfn_clean(pfn);
return 1;
}
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index ab22615eee8..6b0d5fa5bab 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -688,7 +688,7 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
delta = vcpu->arch.host_tsc - tsc_this;
svm->vmcb->control.tsc_offset += delta;
vcpu->cpu = cpu;
- kvm_migrate_apic_timer(vcpu);
+ kvm_migrate_timers(vcpu);
}
for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index bfe4db11989..540e9517907 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -566,7 +566,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
load_transition_efer(vmx);
}
-static void vmx_load_host_state(struct vcpu_vmx *vmx)
+static void __vmx_load_host_state(struct vcpu_vmx *vmx)
{
unsigned long flags;
@@ -596,6 +596,13 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx)
reload_host_efer(vmx);
}
+static void vmx_load_host_state(struct vcpu_vmx *vmx)
+{
+ preempt_disable();
+ __vmx_load_host_state(vmx);
+ preempt_enable();
+}
+
/*
* Switches to specified vcpu, until a matching vcpu_put(), but assumes
* vcpu mutex is already taken.
@@ -608,7 +615,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (vcpu->cpu != cpu) {
vcpu_clear(vmx);
- kvm_migrate_apic_timer(vcpu);
+ kvm_migrate_timers(vcpu);
vpid_sync_vcpu_all(vmx);
}
@@ -654,7 +661,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
{
- vmx_load_host_state(to_vmx(vcpu));
+ __vmx_load_host_state(to_vmx(vcpu));
}
static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
@@ -884,11 +891,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
switch (msr_index) {
#ifdef CONFIG_X86_64
case MSR_EFER:
+ vmx_load_host_state(vmx);
ret = kvm_set_msr_common(vcpu, msr_index, data);
- if (vmx->host_state.loaded) {
- reload_host_efer(vmx);
- load_transition_efer(vmx);
- }
break;
case MSR_FS_BASE:
vmcs_writel(GUEST_FS_BASE, data);
@@ -910,11 +914,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
guest_write_tsc(data);
break;
default:
+ vmx_load_host_state(vmx);
msr = find_msr_entry(vmx, msr_index);
if (msr) {
msr->data = data;
- if (vmx->host_state.loaded)
- load_msrs(vmx->guest_msrs, vmx->save_nmsrs);
break;
}
ret = kvm_set_msr_common(vcpu, msr_index, data);
@@ -1036,6 +1039,7 @@ static void hardware_enable(void *garbage)
static void hardware_disable(void *garbage)
{
asm volatile (ASM_VMX_VMXOFF : : : "cc");
+ write_cr4(read_cr4() & ~X86_CR4_VMXE);
}
static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 21338bdb28f..63a77caa59f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -492,8 +492,8 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
{
static int version;
- struct kvm_wall_clock wc;
- struct timespec wc_ts;
+ struct pvclock_wall_clock wc;
+ struct timespec now, sys, boot;
if (!wall_clock)
return;
@@ -502,10 +502,19 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
- wc_ts = current_kernel_time();
- wc.wc_sec = wc_ts.tv_sec;
- wc.wc_nsec = wc_ts.tv_nsec;
- wc.wc_version = version;
+ /*
+ * The guest calculates current wall clock time by adding
+ * system time (updated by kvm_write_guest_time below) to the
+ * wall clock specified here. guest system time equals host
+ * system time for us, thus we must fill in host boot time here.
+ */
+ now = current_kernel_time();
+ ktime_get_ts(&sys);
+ boot = ns_to_timespec(timespec_to_ns(&now) - timespec_to_ns(&sys));
+
+ wc.sec = boot.tv_sec;
+ wc.nsec = boot.tv_nsec;
+ wc.version = version;
kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
@@ -513,6 +522,45 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
}
+static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
+{
+ uint32_t quotient, remainder;
+
+ /* Don't try to replace with do_div(), this one calculates
+ * "(dividend << 32) / divisor" */
+ __asm__ ( "divl %4"
+ : "=a" (quotient), "=d" (remainder)
+ : "0" (0), "1" (dividend), "r" (divisor) );
+ return quotient;
+}
+
+static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock)
+{
+ uint64_t nsecs = 1000000000LL;
+ int32_t shift = 0;
+ uint64_t tps64;
+ uint32_t tps32;
+
+ tps64 = tsc_khz * 1000LL;
+ while (tps64 > nsecs*2) {
+ tps64 >>= 1;
+ shift--;
+ }
+
+ tps32 = (uint32_t)tps64;
+ while (tps32 <= (uint32_t)nsecs) {
+ tps32 <<= 1;
+ shift++;
+ }
+
+ hv_clock->tsc_shift = shift;
+ hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32);
+
+ pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n",
+ __FUNCTION__, tsc_khz, hv_clock->tsc_shift,
+ hv_clock->tsc_to_system_mul);
+}
+
static void kvm_write_guest_time(struct kvm_vcpu *v)
{
struct timespec ts;
@@ -523,6 +571,11 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
if ((!vcpu->time_page))
return;
+ if (unlikely(vcpu->hv_clock_tsc_khz != tsc_khz)) {
+ kvm_set_time_scale(tsc_khz, &vcpu->hv_clock);
+ vcpu->hv_clock_tsc_khz = tsc_khz;
+ }
+
/* Keep irq disabled to prevent changes to the clock */
local_irq_save(flags);
kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER,
@@ -537,14 +590,14 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
/*
* The interface expects us to write an even number signaling that the
* update is finished. Since the guest won't see the intermediate
- * state, we just write "2" at the end
+ * state, we just increase by 2 at the end.
*/
- vcpu->hv_clock.version = 2;
+ vcpu->hv_clock.version += 2;
shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);
memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
- sizeof(vcpu->hv_clock));
+ sizeof(vcpu->hv_clock));
kunmap_atomic(shared_kaddr, KM_USER0);
@@ -599,10 +652,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
/* ...but clean it before doing the actual write */
vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);
- vcpu->arch.hv_clock.tsc_to_system_mul =
- clocksource_khz2mult(tsc_khz, 22);
- vcpu->arch.hv_clock.tsc_shift = 22;
-
down_read(&current->mm->mmap_sem);
vcpu->arch.time_page =
gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
@@ -2758,7 +2807,9 @@ again:
if (vcpu->requests) {
if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
- __kvm_migrate_apic_timer(vcpu);
+ __kvm_migrate_timers(vcpu);
+ if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
+ kvm_x86_ops->tlb_flush(vcpu);
if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
&vcpu->requests)) {
kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS;
@@ -2772,6 +2823,7 @@ again:
}
}
+ clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
kvm_inject_pending_timer_irqs(vcpu);
preempt_disable();
@@ -2781,21 +2833,13 @@ again:
local_irq_disable();
- if (need_resched()) {
+ if (vcpu->requests || need_resched()) {
local_irq_enable();
preempt_enable();
r = 1;
goto out;
}
- if (vcpu->requests)
- if (test_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) {
- local_irq_enable();
- preempt_enable();
- r = 1;
- goto out;
- }
-
if (signal_pending(current)) {
local_irq_enable();
preempt_enable();
@@ -2825,9 +2869,6 @@ again:
kvm_guest_enter();
- if (vcpu->requests)
- if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
- kvm_x86_ops->tlb_flush(vcpu);
KVMTRACE_0D(VMENTRY, vcpu, entryexit);
kvm_x86_ops->run(vcpu, kvm_run);
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index 8a96320ab07..932f216d890 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -1727,7 +1727,8 @@ twobyte_insn:
if (rc)
goto done;
- kvm_emulate_hypercall(ctxt->vcpu);
+ /* Let the processor re-execute the fixed hypercall */
+ c->eip = ctxt->vcpu->arch.rip;
/* Disable writeback. */
c->dst.type = OP_NONE;
break;
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index af65b2da3ba..5c7e2fd5207 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -582,8 +582,9 @@ static void __init lguest_init_IRQ(void)
int vector = FIRST_EXTERNAL_VECTOR + i;
if (vector != SYSCALL_VECTOR) {
set_intr_gate(vector, interrupt[i]);
- set_irq_chip_and_handler(i, &lguest_irq_controller,
- handle_level_irq);
+ set_irq_chip_and_handler_name(i, &lguest_irq_controller,
+ handle_level_irq,
+ "level");
}
}
/* This call is required to set up for 4k stacks, where we have
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index 70bebd31040..ee1c3f63515 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -217,19 +217,19 @@ ENTRY(copy_user_generic_unrolled)
/* table sorted by exception address */
.section __ex_table,"a"
.align 8
- .quad .Ls1,.Ls1e
- .quad .Ls2,.Ls2e
- .quad .Ls3,.Ls3e
- .quad .Ls4,.Ls4e
- .quad .Ld1,.Ls1e
+ .quad .Ls1,.Ls1e /* Ls1-Ls4 have copied zero bytes */
+ .quad .Ls2,.Ls1e
+ .quad .Ls3,.Ls1e
+ .quad .Ls4,.Ls1e
+ .quad .Ld1,.Ls1e /* Ld1-Ld4 have copied 0-24 bytes */
.quad .Ld2,.Ls2e
.quad .Ld3,.Ls3e
.quad .Ld4,.Ls4e
- .quad .Ls5,.Ls5e
- .quad .Ls6,.Ls6e
- .quad .Ls7,.Ls7e
- .quad .Ls8,.Ls8e
- .quad .Ld5,.Ls5e
+ .quad .Ls5,.Ls5e /* Ls5-Ls8 have copied 32 bytes */
+ .quad .Ls6,.Ls5e
+ .quad .Ls7,.Ls5e
+ .quad .Ls8,.Ls5e
+ .quad .Ld5,.Ls5e /* Ld5-Ld8 have copied 32-56 bytes */
.quad .Ld6,.Ls6e
.quad .Ld7,.Ls7e
.quad .Ld8,.Ls8e
@@ -244,11 +244,8 @@ ENTRY(copy_user_generic_unrolled)
.quad .Le5,.Le_zero
.previous
- /* compute 64-offset for main loop. 8 bytes accuracy with error on the
- pessimistic side. this is gross. it would be better to fix the
- interface. */
/* eax: zero, ebx: 64 */
-.Ls1e: addl $8,%eax
+.Ls1e: addl $8,%eax /* eax is bytes left uncopied within the loop (Ls1e: 64 .. Ls8e: 8) */
.Ls2e: addl $8,%eax
.Ls3e: addl $8,%eax
.Ls4e: addl $8,%eax
diff --git a/arch/x86/lib/copy_user_nocache_64.S b/arch/x86/lib/copy_user_nocache_64.S
index 5196762b3b0..9d3d1ab8376 100644
--- a/arch/x86/lib/copy_user_nocache_64.S
+++ b/arch/x86/lib/copy_user_nocache_64.S
@@ -145,19 +145,19 @@ ENTRY(__copy_user_nocache)
/* table sorted by exception address */
.section __ex_table,"a"
.align 8
- .quad .Ls1,.Ls1e
- .quad .Ls2,.Ls2e
- .quad .Ls3,.Ls3e
- .quad .Ls4,.Ls4e
- .quad .Ld1,.Ls1e
+ .quad .Ls1,.Ls1e /* .Ls[1-4] - 0 bytes copied */
+ .quad .Ls2,.Ls1e
+ .quad .Ls3,.Ls1e
+ .quad .Ls4,.Ls1e
+ .quad .Ld1,.Ls1e /* .Ld[1-4] - 0..24 bytes coped */
.quad .Ld2,.Ls2e
.quad .Ld3,.Ls3e
.quad .Ld4,.Ls4e
- .quad .Ls5,.Ls5e
- .quad .Ls6,.Ls6e
- .quad .Ls7,.Ls7e
- .quad .Ls8,.Ls8e
- .quad .Ld5,.Ls5e
+ .quad .Ls5,.Ls5e /* .Ls[5-8] - 32 bytes copied */
+ .quad .Ls6,.Ls5e
+ .quad .Ls7,.Ls5e
+ .quad .Ls8,.Ls5e
+ .quad .Ld5,.Ls5e /* .Ld[5-8] - 32..56 bytes copied */
.quad .Ld6,.Ls6e
.quad .Ld7,.Ls7e
.quad .Ld8,.Ls8e
@@ -172,11 +172,8 @@ ENTRY(__copy_user_nocache)
.quad .Le5,.Le_zero
.previous
- /* compute 64-offset for main loop. 8 bytes accuracy with error on the
- pessimistic side. this is gross. it would be better to fix the
- interface. */
/* eax: zero, ebx: 64 */
-.Ls1e: addl $8,%eax
+.Ls1e: addl $8,%eax /* eax: bytes left uncopied: Ls1e: 64 .. Ls8e: 8 */
.Ls2e: addl $8,%eax
.Ls3e: addl $8,%eax
.Ls4e: addl $8,%eax
diff --git a/arch/x86/lib/delay_32.c b/arch/x86/lib/delay_32.c
index 4535e6d147a..d710f2d167b 100644
--- a/arch/x86/lib/delay_32.c
+++ b/arch/x86/lib/delay_32.c
@@ -44,13 +44,36 @@ static void delay_loop(unsigned long loops)
static void delay_tsc(unsigned long loops)
{
unsigned long bclock, now;
+ int cpu;
- preempt_disable(); /* TSC's are per-cpu */
+ preempt_disable();
+ cpu = smp_processor_id();
rdtscl(bclock);
- do {
- rep_nop();
+ for (;;) {
rdtscl(now);
- } while ((now-bclock) < loops);
+ if ((now - bclock) >= loops)
+ break;
+
+ /* Allow RT tasks to run */
+ preempt_enable();
+ rep_nop();
+ preempt_disable();
+
+ /*
+ * It is possible that we moved to another CPU, and
+ * since TSC's are per-cpu we need to calculate
+ * that. The delay must guarantee that we wait "at
+ * least" the amount of time. Being moved to another
+ * CPU could make the wait longer but we just need to
+ * make sure we waited long enough. Rebalance the
+ * counter for this CPU.
+ */
+ if (unlikely(cpu != smp_processor_id())) {
+ loops -= (now - bclock);
+ cpu = smp_processor_id();
+ rdtscl(bclock);
+ }
+ }
preempt_enable();
}
diff --git a/arch/x86/lib/delay_64.c b/arch/x86/lib/delay_64.c
index bbc61051851..4c441be9264 100644
--- a/arch/x86/lib/delay_64.c
+++ b/arch/x86/lib/delay_64.c
@@ -31,14 +31,36 @@ int __devinit read_current_timer(unsigned long *timer_value)
void __delay(unsigned long loops)
{
unsigned bclock, now;
+ int cpu;
- preempt_disable(); /* TSC's are pre-cpu */
+ preempt_disable();
+ cpu = smp_processor_id();
rdtscl(bclock);
- do {
- rep_nop();
+ for (;;) {
rdtscl(now);
+ if ((now - bclock) >= loops)
+ break;
+
+ /* Allow RT tasks to run */
+ preempt_enable();
+ rep_nop();
+ preempt_disable();
+
+ /*
+ * It is possible that we moved to another CPU, and
+ * since TSC's are per-cpu we need to calculate
+ * that. The delay must guarantee that we wait "at
+ * least" the amount of time. Being moved to another
+ * CPU could make the wait longer but we just need to
+ * make sure we waited long enough. Rebalance the
+ * counter for this CPU.
+ */
+ if (unlikely(cpu != smp_processor_id())) {
+ loops -= (now - bclock);
+ cpu = smp_processor_id();
+ rdtscl(bclock);
+ }
}
- while ((now-bclock) < loops);
preempt_enable();
}
EXPORT_SYMBOL(__delay);
diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c
index 6e38d877ea7..c7b06feb139 100644
--- a/arch/x86/math-emu/fpu_entry.c
+++ b/arch/x86/math-emu/fpu_entry.c
@@ -30,6 +30,7 @@
#include <asm/uaccess.h>
#include <asm/desc.h>
#include <asm/user.h>
+#include <asm/i387.h>
#include "fpu_system.h"
#include "fpu_emu.h"
@@ -146,6 +147,13 @@ asmlinkage void math_emulate(long arg)
unsigned long code_limit = 0; /* Initialized to stop compiler warnings */
struct desc_struct code_descriptor;
+ if (!used_math()) {
+ if (init_fpu(current)) {
+ do_group_exit(SIGKILL);
+ return;
+ }
+ }
+
#ifdef RE_ENTRANT_CHECKING
if (emulating) {
printk("ERROR: wm-FPU-emu is not RE-ENTRANT!\n");
@@ -153,11 +161,6 @@ asmlinkage void math_emulate(long arg)
RE_ENTRANT_CHECK_ON;
#endif /* RE_ENTRANT_CHECKING */
- if (!used_math()) {
- finit();
- set_used_math();
- }
-
SETUP_DATA_AREA(arg);
FPU_ORIG_EIP = FPU_EIP;
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index fd7e1798c75..8bcb6f40ccb 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -497,6 +497,11 @@ static int vmalloc_fault(unsigned long address)
unsigned long pgd_paddr;
pmd_t *pmd_k;
pte_t *pte_k;
+
+ /* Make sure we are in vmalloc area */
+ if (!(address >= VMALLOC_START && address < VMALLOC_END))
+ return -1;
+
/*
* Synchronize this task's top level page-table
* with the 'reference' page table.
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 32ba13b0f81..819dad973b1 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -135,7 +135,7 @@ static __init void *spp_getpage(void)
return ptr;
}
-static void
+static __init void
set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot)
{
pgd_t *pgd;
@@ -206,7 +206,7 @@ void __init cleanup_highmap(void)
pmd_t *last_pmd = pmd + PTRS_PER_PMD;
for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) {
- if (!pmd_present(*pmd))
+ if (pmd_none(*pmd))
continue;
if (vaddr < (unsigned long) _text || vaddr > end)
set_pmd(pmd, __pmd(0));
@@ -214,7 +214,7 @@ void __init cleanup_highmap(void)
}
/* NOTE: this is meant to be run only at boot */
-void __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
+void __init __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
{
unsigned long address = __fix_to_virt(idx);
@@ -506,7 +506,7 @@ early_param("memtest", parse_memtest);
static void __init early_memtest(unsigned long start, unsigned long end)
{
- unsigned long t_start, t_size;
+ u64 t_start, t_size;
unsigned pattern;
if (!memtest_pattern)
@@ -525,8 +525,9 @@ static void __init early_memtest(unsigned long start, unsigned long end)
if (t_start + t_size > end)
t_size = end - t_start;
- printk(KERN_CONT "\n %016lx - %016lx pattern %d",
- t_start, t_start + t_size, pattern);
+ printk(KERN_CONT "\n %016llx - %016llx pattern %d",
+ (unsigned long long)t_start,
+ (unsigned long long)t_start + t_size, pattern);
memtest(t_start, t_size, pattern);
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 71bb3159031..2b2bb3f9b68 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -593,10 +593,11 @@ void __init early_iounmap(void *addr, unsigned long size)
unsigned long offset;
unsigned int nrpages;
enum fixed_addresses idx;
- unsigned int nesting;
+ int nesting;
nesting = --early_ioremap_nested;
- WARN_ON(nesting < 0);
+ if (WARN_ON(nesting < 0))
+ return;
if (early_ioremap_debug) {
printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr,
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index de3a9981245..06b7a1c90fb 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -34,7 +34,7 @@ void __cpuinit pat_disable(char *reason)
printk(KERN_INFO "%s\n", reason);
}
-static int nopat(char *str)
+static int __init nopat(char *str)
{
pat_disable("PAT support disabled.");
return 0;
@@ -151,32 +151,33 @@ static int pat_x_mtrr_type(u64 start, u64 end, unsigned long prot,
unsigned long pat_type;
u8 mtrr_type;
- mtrr_type = mtrr_type_lookup(start, end);
- if (mtrr_type == 0xFF) { /* MTRR not enabled */
- *ret_prot = prot;
- return 0;
- }
- if (mtrr_type == 0xFE) { /* MTRR match error */
- *ret_prot = _PAGE_CACHE_UC;
- return -1;
- }
- if (mtrr_type != MTRR_TYPE_UNCACHABLE &&
- mtrr_type != MTRR_TYPE_WRBACK &&
- mtrr_type != MTRR_TYPE_WRCOMB) { /* MTRR type unhandled */
- *ret_prot = _PAGE_CACHE_UC;
- return -1;
- }
-
pat_type = prot & _PAGE_CACHE_MASK;
prot &= (~_PAGE_CACHE_MASK);
- /* Currently doing intersection by hand. Optimize it later. */
+ /*
+ * We return the PAT request directly for types where PAT takes
+ * precedence with respect to MTRR and for UC_MINUS.
+ * Consistency checks with other PAT requests is done later
+ * while going through memtype list.
+ */
if (pat_type == _PAGE_CACHE_WC) {
*ret_prot = prot | _PAGE_CACHE_WC;
+ return 0;
} else if (pat_type == _PAGE_CACHE_UC_MINUS) {
*ret_prot = prot | _PAGE_CACHE_UC_MINUS;
- } else if (pat_type == _PAGE_CACHE_UC ||
- mtrr_type == MTRR_TYPE_UNCACHABLE) {
+ return 0;
+ } else if (pat_type == _PAGE_CACHE_UC) {
+ *ret_prot = prot | _PAGE_CACHE_UC;
+ return 0;
+ }
+
+ /*
+ * Look for MTRR hint to get the effective type in case where PAT
+ * request is for WB.
+ */
+ mtrr_type = mtrr_type_lookup(start, end);
+
+ if (mtrr_type == MTRR_TYPE_UNCACHABLE) {
*ret_prot = prot | _PAGE_CACHE_UC;
} else if (mtrr_type == MTRR_TYPE_WRCOMB) {
*ret_prot = prot | _PAGE_CACHE_WC;
@@ -233,14 +234,12 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
if (req_type == -1) {
/*
- * Special case where caller wants to inherit from mtrr or
- * existing pat mapping, defaulting to UC_MINUS in case of
- * no match.
+ * Call mtrr_lookup to get the type hint. This is an
+ * optimization for /dev/mem mmap'ers into WB memory (BIOS
+ * tools and ACPI tools). Use WB request for WB memory and use
+ * UC_MINUS otherwise.
*/
u8 mtrr_type = mtrr_type_lookup(start, end);
- if (mtrr_type == 0xFE) { /* MTRR match error */
- err = -1;
- }
if (mtrr_type == MTRR_TYPE_WRBACK) {
req_type = _PAGE_CACHE_WB;
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 3890234e5b2..99649dccad2 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -97,36 +97,9 @@ static __init inline int srat_disabled(void)
return numa_off || acpi_numa < 0;
}
-/*
- * A lot of BIOS fill in 10 (= no distance) everywhere. This messes
- * up the NUMA heuristics which wants the local node to have a smaller
- * distance than the others.
- * Do some quick checks here and only use the SLIT if it passes.
- */
-static __init int slit_valid(struct acpi_table_slit *slit)
-{
- int i, j;
- int d = slit->locality_count;
- for (i = 0; i < d; i++) {
- for (j = 0; j < d; j++) {
- u8 val = slit->entry[d*i + j];
- if (i == j) {
- if (val != LOCAL_DISTANCE)
- return 0;
- } else if (val <= LOCAL_DISTANCE)
- return 0;
- }
- }
- return 1;
-}
-
/* Callback for SLIT parsing */
void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
{
- if (!slit_valid(slit)) {
- printk(KERN_INFO "ACPI: SLIT table looks invalid. Not used.\n");
- return;
- }
acpi_slit = slit;
}
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 6e64aaf00d1..940185ecaed 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -328,18 +328,18 @@ static struct dmi_system_id __devinitdata pciprobe_dmi_table[] = {
#endif
{
.callback = set_bf_sort,
- .ident = "HP ProLiant DL385 G2",
+ .ident = "HP ProLiant DL360",
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "HP"),
- DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant DL385 G2"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant DL360"),
},
},
{
.callback = set_bf_sort,
- .ident = "HP ProLiant DL585 G2",
+ .ident = "HP ProLiant DL380",
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "HP"),
- DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant DL585 G2"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "ProLiant DL380"),
},
},
{}
diff --git a/arch/x86/pci/init.c b/arch/x86/pci/init.c
index e70b9c57b88..b821f4462d9 100644
--- a/arch/x86/pci/init.c
+++ b/arch/x86/pci/init.c
@@ -15,7 +15,8 @@ static __init int pci_access_init(void)
pci_mmcfg_early_init();
#ifdef CONFIG_PCI_OLPC
- pci_olpc_init();
+ if (!pci_olpc_init())
+ return 0; /* skip additional checks if it's an XO */
#endif
#ifdef CONFIG_PCI_BIOS
pci_pcbios_init();
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index 0908fca901b..ca8df9c260b 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -621,6 +621,13 @@ static __init int via_router_probe(struct irq_router *r,
*/
device = PCI_DEVICE_ID_VIA_8235;
break;
+ case PCI_DEVICE_ID_VIA_8237:
+ /**
+ * Asus a7v600 bios wrongly reports 8237
+ * as 586-compatible
+ */
+ device = PCI_DEVICE_ID_VIA_8237;
+ break;
}
}
diff --git a/arch/x86/pci/olpc.c b/arch/x86/pci/olpc.c
index 5e7636558c0..e11e9e803d5 100644
--- a/arch/x86/pci/olpc.c
+++ b/arch/x86/pci/olpc.c
@@ -302,12 +302,13 @@ static struct pci_raw_ops pci_olpc_conf = {
.write = pci_olpc_write,
};
-void __init pci_olpc_init(void)
+int __init pci_olpc_init(void)
{
if (!machine_is_olpc() || olpc_has_vsa())
- return;
+ return -ENODEV;
printk(KERN_INFO "PCI: Using configuration type OLPC\n");
raw_pci_ops = &pci_olpc_conf;
is_lx = is_geode_lx();
+ return 0;
}
diff --git a/arch/x86/pci/pci.h b/arch/x86/pci/pci.h
index f3972b12c60..720c4c55453 100644
--- a/arch/x86/pci/pci.h
+++ b/arch/x86/pci/pci.h
@@ -101,7 +101,7 @@ extern struct pci_raw_ops pci_direct_conf1;
extern int pci_direct_probe(void);
extern void pci_direct_init(int type);
extern void pci_pcbios_init(void);
-extern void pci_olpc_init(void);
+extern int pci_olpc_init(void);
/* pci-mmconfig.c */
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index 23476c2ebfc..efa2ba7c600 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -106,9 +106,9 @@ int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
do_realtime((struct timespec *)tv);
tv->tv_usec /= 1000;
if (unlikely(tz != NULL)) {
- /* This relies on gcc inlining the memcpy. We'll notice
- if it ever fails to do so. */
- memcpy(tz, &gtod->sys_tz, sizeof(struct timezone));
+ /* Avoid memcpy. Some old compilers fail to inline it */
+ tz->tz_minuteswest = gtod->sys_tz.tz_minuteswest;
+ tz->tz_dsttime = gtod->sys_tz.tz_dsttime;
}
return 0;
}
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 2e641be2737..6c388e593bc 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -5,8 +5,9 @@
config XEN
bool "Xen guest support"
select PARAVIRT
+ select PARAVIRT_CLOCK
depends on X86_32
- depends on X86_CMPXCHG && X86_TSC && !(X86_VISWS || X86_VOYAGER)
+ depends on X86_CMPXCHG && X86_TSC && X86_PAE && !(X86_VISWS || X86_VOYAGER)
help
This is the Linux Xen port. Enabling this will allow the
kernel to boot in a paravirtualized environment under the
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index c8a56e457d6..f09c1c69c37 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -785,38 +785,35 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
static __init void xen_pagetable_setup_start(pgd_t *base)
{
pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base;
+ int i;
/* special set_pte for pagetable initialization */
pv_mmu_ops.set_pte = xen_set_pte_init;
init_mm.pgd = base;
/*
- * copy top-level of Xen-supplied pagetable into place. For
- * !PAE we can use this as-is, but for PAE it is a stand-in
- * while we copy the pmd pages.
+ * copy top-level of Xen-supplied pagetable into place. This
+ * is a stand-in while we copy the pmd pages.
*/
memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t));
- if (PTRS_PER_PMD > 1) {
- int i;
- /*
- * For PAE, need to allocate new pmds, rather than
- * share Xen's, since Xen doesn't like pmd's being
- * shared between address spaces.
- */
- for (i = 0; i < PTRS_PER_PGD; i++) {
- if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) {
- pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
+ /*
+ * For PAE, need to allocate new pmds, rather than
+ * share Xen's, since Xen doesn't like pmd's being
+ * shared between address spaces.
+ */
+ for (i = 0; i < PTRS_PER_PGD; i++) {
+ if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) {
+ pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
- memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]),
- PAGE_SIZE);
+ memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]),
+ PAGE_SIZE);
- make_lowmem_page_readonly(pmd);
+ make_lowmem_page_readonly(pmd);
- set_pgd(&base[i], __pgd(1 + __pa(pmd)));
- } else
- pgd_clear(&base[i]);
- }
+ set_pgd(&base[i], __pgd(1 + __pa(pmd)));
+ } else
+ pgd_clear(&base[i]);
}
/* make sure zero_page is mapped RO so we can use it in pagetables */
@@ -873,17 +870,7 @@ static __init void xen_pagetable_setup_done(pgd_t *base)
/* Actually pin the pagetable down, but we can't set PG_pinned
yet because the page structures don't exist yet. */
- {
- unsigned level;
-
-#ifdef CONFIG_X86_PAE
- level = MMUEXT_PIN_L3_TABLE;
-#else
- level = MMUEXT_PIN_L2_TABLE;
-#endif
-
- pin_pagetable_pfn(level, PFN_DOWN(__pa(base)));
- }
+ pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(base)));
}
/* This is called once we have the cpu_possible_map */
@@ -1093,7 +1080,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
.make_pte = xen_make_pte,
.make_pgd = xen_make_pgd,
-#ifdef CONFIG_X86_PAE
.set_pte_atomic = xen_set_pte_atomic,
.set_pte_present = xen_set_pte_at,
.set_pud = xen_set_pud,
@@ -1102,7 +1088,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
.make_pmd = xen_make_pmd,
.pmd_val = xen_pmd_val,
-#endif /* PAE */
.activate_mm = xen_activate_mm,
.dup_mmap = xen_dup_mmap,
@@ -1228,6 +1213,11 @@ asmlinkage void __init xen_start_kernel(void)
if (xen_feature(XENFEAT_supervisor_mode_kernel))
pv_info.kernel_rpl = 0;
+ /* Prevent unwanted bits from being set in PTEs. */
+ __supported_pte_mask &= ~_PAGE_GLOBAL;
+ if (!is_initial_xendomain())
+ __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
+
/* set the limit of our address space */
xen_reserve_top();
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 126766d43ae..4e527e7893a 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -60,7 +60,7 @@ xmaddr_t arbitrary_virt_to_machine(unsigned long address)
{
unsigned int level;
pte_t *pte = lookup_address(address, &level);
- unsigned offset = address & PAGE_MASK;
+ unsigned offset = address & ~PAGE_MASK;
BUG_ON(pte == NULL);
@@ -179,50 +179,56 @@ out:
preempt_enable();
}
-pteval_t xen_pte_val(pte_t pte)
+/* Assume pteval_t is equivalent to all the other *val_t types. */
+static pteval_t pte_mfn_to_pfn(pteval_t val)
+{
+ if (val & _PAGE_PRESENT) {
+ unsigned long mfn = (val & PTE_MASK) >> PAGE_SHIFT;
+ pteval_t flags = val & ~PTE_MASK;
+ val = ((pteval_t)mfn_to_pfn(mfn) << PAGE_SHIFT) | flags;
+ }
+
+ return val;
+}
+
+static pteval_t pte_pfn_to_mfn(pteval_t val)
{
- pteval_t ret = pte.pte;
+ if (val & _PAGE_PRESENT) {
+ unsigned long pfn = (val & PTE_MASK) >> PAGE_SHIFT;
+ pteval_t flags = val & ~PTE_MASK;
+ val = ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags;
+ }
- if (ret & _PAGE_PRESENT)
- ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;
+ return val;
+}
- return ret;
+pteval_t xen_pte_val(pte_t pte)
+{
+ return pte_mfn_to_pfn(pte.pte);
}
pgdval_t xen_pgd_val(pgd_t pgd)
{
- pgdval_t ret = pgd.pgd;
- if (ret & _PAGE_PRESENT)
- ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;
- return ret;
+ return pte_mfn_to_pfn(pgd.pgd);
}
pte_t xen_make_pte(pteval_t pte)
{
- if (pte & _PAGE_PRESENT) {
- pte = phys_to_machine(XPADDR(pte)).maddr;
- pte &= ~(_PAGE_PCD | _PAGE_PWT);
- }
-
- return (pte_t){ .pte = pte };
+ pte = pte_pfn_to_mfn(pte);
+ return native_make_pte(pte);
}
pgd_t xen_make_pgd(pgdval_t pgd)
{
- if (pgd & _PAGE_PRESENT)
- pgd = phys_to_machine(XPADDR(pgd)).maddr;
-
- return (pgd_t){ pgd };
+ pgd = pte_pfn_to_mfn(pgd);
+ return native_make_pgd(pgd);
}
pmdval_t xen_pmd_val(pmd_t pmd)
{
- pmdval_t ret = native_pmd_val(pmd);
- if (ret & _PAGE_PRESENT)
- ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;
- return ret;
+ return pte_mfn_to_pfn(pmd.pmd);
}
-#ifdef CONFIG_X86_PAE
+
void xen_set_pud(pud_t *ptr, pud_t val)
{
struct multicall_space mcs;
@@ -267,17 +273,9 @@ void xen_pmd_clear(pmd_t *pmdp)
pmd_t xen_make_pmd(pmdval_t pmd)
{
- if (pmd & _PAGE_PRESENT)
- pmd = phys_to_machine(XPADDR(pmd)).maddr;
-
+ pmd = pte_pfn_to_mfn(pmd);
return native_make_pmd(pmd);
}
-#else /* !PAE */
-void xen_set_pte(pte_t *ptep, pte_t pte)
-{
- *ptep = pte;
-}
-#endif /* CONFIG_X86_PAE */
/*
(Yet another) pagetable walker. This one is intended for pinning a
@@ -430,8 +428,6 @@ static int pin_page(struct page *page, enum pt_level level)
read-only, and can be pinned. */
void xen_pgd_pin(pgd_t *pgd)
{
- unsigned level;
-
xen_mc_batch();
if (pgd_walk(pgd, pin_page, TASK_SIZE)) {
@@ -441,14 +437,7 @@ void xen_pgd_pin(pgd_t *pgd)
xen_mc_batch();
}
-#ifdef CONFIG_X86_PAE
- level = MMUEXT_PIN_L3_TABLE;
-#else
- level = MMUEXT_PIN_L2_TABLE;
-#endif
-
- xen_do_pin(level, PFN_DOWN(__pa(pgd)));
-
+ xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
xen_mc_issue(0);
}
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index b5e189b1519..5fe961caffd 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -37,14 +37,13 @@ void xen_exit_mmap(struct mm_struct *mm);
void xen_pgd_pin(pgd_t *pgd);
//void xen_pgd_unpin(pgd_t *pgd);
-#ifdef CONFIG_X86_PAE
-unsigned long long xen_pte_val(pte_t);
-unsigned long long xen_pmd_val(pmd_t);
-unsigned long long xen_pgd_val(pgd_t);
+pteval_t xen_pte_val(pte_t);
+pmdval_t xen_pmd_val(pmd_t);
+pgdval_t xen_pgd_val(pgd_t);
-pte_t xen_make_pte(unsigned long long);
-pmd_t xen_make_pmd(unsigned long long);
-pgd_t xen_make_pgd(unsigned long long);
+pte_t xen_make_pte(pteval_t);
+pmd_t xen_make_pmd(pmdval_t);
+pgd_t xen_make_pgd(pgdval_t);
void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pteval);
@@ -53,15 +52,4 @@ void xen_set_pud(pud_t *ptr, pud_t val);
void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
void xen_pmd_clear(pmd_t *pmdp);
-
-#else
-unsigned long xen_pte_val(pte_t);
-unsigned long xen_pmd_val(pmd_t);
-unsigned long xen_pgd_val(pgd_t);
-
-pte_t xen_make_pte(unsigned long);
-pmd_t xen_make_pmd(unsigned long);
-pgd_t xen_make_pgd(unsigned long);
-#endif
-
#endif /* _XEN_MMU_H */
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index c39e1a5aa24..41e217503c9 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -12,7 +12,9 @@
#include <linux/clocksource.h>
#include <linux/clockchips.h>
#include <linux/kernel_stat.h>
+#include <linux/math64.h>
+#include <asm/pvclock.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>
@@ -30,17 +32,6 @@
static cycle_t xen_clocksource_read(void);
-/* These are perodically updated in shared_info, and then copied here. */
-struct shadow_time_info {
- u64 tsc_timestamp; /* TSC at last update of time vals. */
- u64 system_timestamp; /* Time, in nanosecs, since boot. */
- u32 tsc_to_nsec_mul;
- int tsc_shift;
- u32 version;
-};
-
-static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
-
/* runstate info updated by Xen */
static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
@@ -150,11 +141,7 @@ static void do_stolen_accounting(void)
if (stolen < 0)
stolen = 0;
- ticks = 0;
- while (stolen >= NS_PER_TICK) {
- ticks++;
- stolen -= NS_PER_TICK;
- }
+ ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen);
__get_cpu_var(residual_stolen) = stolen;
account_steal_time(NULL, ticks);
@@ -166,11 +153,7 @@ static void do_stolen_accounting(void)
if (blocked < 0)
blocked = 0;
- ticks = 0;
- while (blocked >= NS_PER_TICK) {
- ticks++;
- blocked -= NS_PER_TICK;
- }
+ ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked);
__get_cpu_var(residual_blocked) = blocked;
account_steal_time(idle_task(smp_processor_id()), ticks);
}
@@ -218,7 +201,7 @@ unsigned long long xen_sched_clock(void)
unsigned long xen_cpu_khz(void)
{
u64 xen_khz = 1000000ULL << 32;
- const struct vcpu_time_info *info =
+ const struct pvclock_vcpu_time_info *info =
&HYPERVISOR_shared_info->vcpu_info[0].time;
do_div(xen_khz, info->tsc_to_system_mul);
@@ -230,121 +213,26 @@ unsigned long xen_cpu_khz(void)
return xen_khz;
}
-/*
- * Reads a consistent set of time-base values from Xen, into a shadow data
- * area.
- */
-static unsigned get_time_values_from_xen(void)
-{
- struct vcpu_time_info *src;
- struct shadow_time_info *dst;
-
- /* src is shared memory with the hypervisor, so we need to
- make sure we get a consistent snapshot, even in the face of
- being preempted. */
- src = &__get_cpu_var(xen_vcpu)->time;
- dst = &__get_cpu_var(shadow_time);
-
- do {
- dst->version = src->version;
- rmb(); /* fetch version before data */
- dst->tsc_timestamp = src->tsc_timestamp;
- dst->system_timestamp = src->system_time;
- dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
- dst->tsc_shift = src->tsc_shift;
- rmb(); /* test version after fetching data */
- } while ((src->version & 1) | (dst->version ^ src->version));
-
- return dst->version;
-}
-
-/*
- * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
- * yielding a 64-bit result.
- */
-static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
-{
- u64 product;
-#ifdef __i386__
- u32 tmp1, tmp2;
-#endif
-
- if (shift < 0)
- delta >>= -shift;
- else
- delta <<= shift;
-
-#ifdef __i386__
- __asm__ (
- "mul %5 ; "
- "mov %4,%%eax ; "
- "mov %%edx,%4 ; "
- "mul %5 ; "
- "xor %5,%5 ; "
- "add %4,%%eax ; "
- "adc %5,%%edx ; "
- : "=A" (product), "=r" (tmp1), "=r" (tmp2)
- : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
-#elif __x86_64__
- __asm__ (
- "mul %%rdx ; shrd $32,%%rdx,%%rax"
- : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
-#else
-#error implement me!
-#endif
-
- return product;
-}
-
-static u64 get_nsec_offset(struct shadow_time_info *shadow)
-{
- u64 now, delta;
- now = native_read_tsc();
- delta = now - shadow->tsc_timestamp;
- return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
-}
-
static cycle_t xen_clocksource_read(void)
{
- struct shadow_time_info *shadow = &get_cpu_var(shadow_time);
+ struct pvclock_vcpu_time_info *src;
cycle_t ret;
- unsigned version;
-
- do {
- version = get_time_values_from_xen();
- barrier();
- ret = shadow->system_timestamp + get_nsec_offset(shadow);
- barrier();
- } while (version != __get_cpu_var(xen_vcpu)->time.version);
-
- put_cpu_var(shadow_time);
+ src = &get_cpu_var(xen_vcpu)->time;
+ ret = pvclock_clocksource_read(src);
+ put_cpu_var(xen_vcpu);
return ret;
}
static void xen_read_wallclock(struct timespec *ts)
{
- const struct shared_info *s = HYPERVISOR_shared_info;
- u32 version;
- u64 delta;
- struct timespec now;
-
- /* get wallclock at system boot */
- do {
- version = s->wc_version;
- rmb(); /* fetch version before time */
- now.tv_sec = s->wc_sec;
- now.tv_nsec = s->wc_nsec;
- rmb(); /* fetch time before checking version */
- } while ((s->wc_version & 1) | (version ^ s->wc_version));
+ struct shared_info *s = HYPERVISOR_shared_info;
+ struct pvclock_wall_clock *wall_clock = &(s->wc);
+ struct pvclock_vcpu_time_info *vcpu_time;
- delta = xen_clocksource_read(); /* time since system boot */
- delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;
-
- now.tv_nsec = do_div(delta, NSEC_PER_SEC);
- now.tv_sec = delta;
-
- set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
+ vcpu_time = &get_cpu_var(xen_vcpu)->time;
+ pvclock_read_wallclock(wall_clock, vcpu_time, ts);
+ put_cpu_var(xen_vcpu);
}
unsigned long xen_get_wallclock(void)
@@ -352,7 +240,6 @@ unsigned long xen_get_wallclock(void)
struct timespec ts;
xen_read_wallclock(&ts);
-
return ts.tv_sec;
}
@@ -576,8 +463,6 @@ __init void xen_time_init(void)
{
int cpu = smp_processor_id();
- get_time_values_from_xen();
-
clocksource_register(&xen_clocksource);
if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 288d587ce73..6ec3b4f7719 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -17,7 +17,7 @@ ENTRY(startup_xen)
__FINIT
-.pushsection .bss.page_aligned
+.pushsection .text
.align PAGE_SIZE_asm
ENTRY(hypercall_page)
.skip 0x1000
@@ -30,11 +30,7 @@ ENTRY(hypercall_page)
ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_xen)
ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page)
ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb")
-#ifdef CONFIG_X86_PAE
ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes")
-#else
- ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "no")
-#endif
ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
#endif /*CONFIG_XEN */